Pārlūkot izejas kodu

[fix] when two results are merged, really use the content with more text

Dalf 10 gadus atpakaļ
vecāks
revīzija
6b058962e1
1 mainīts fails ar 12 papildinājumiem un 1 dzēšanu
  1. 12
    1
      searx/search.py

+ 12
- 1
searx/search.py Parādīt failu

@@ -16,6 +16,7 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
16 16
 '''
17 17
 
18 18
 import grequests
19
+import re
19 20
 from itertools import izip_longest, chain
20 21
 from datetime import datetime
21 22
 from operator import itemgetter
@@ -76,6 +77,13 @@ def make_callback(engine_name, results, suggestions, callback, params):
76 77
 
77 78
     return process_callback
78 79
 
80
+# return the meaningful length of the content for a result
81
def content_result_len(result):
    """Return the "meaningful" length of a result's content.

    The meaningful length is the number of characters left in
    ``result['content']`` after removing spaces and the punctuation
    characters ``, ; : ! ? . / \\ ( ) - _``.

    :param result: mapping that may hold a ``'content'`` entry
    :returns: the stripped length, or 0 when ``'content'`` is missing or is
        not a string
    """
    # ``basestring`` only exists on Python 2 (it covers str and unicode);
    # fall back to ``str`` so the helper also works on Python 3.
    try:
        string_types = basestring  # noqa: F821
    except NameError:
        string_types = str

    content = result.get('content')
    if not isinstance(content, string_types):
        return 0

    # The hyphen must be escaped: an unescaped ")-_" inside the character
    # class is a *range* (0x29-0x5F) that would also strip digits and
    # uppercase letters, making e.g. "ABC 123" count as empty.
    return len(re.sub(r'[,;:!?\./\\ ()\-_]', '', content))
79 87
 
80 88
 # score results and remove duplications
81 89
 def score_results(results):
@@ -110,6 +118,9 @@ def score_results(results):
110 118
         duplicated = False
111 119
 
112 120
         # check for duplicates
121
+        if 'content' in res:
122
+            res['content'] = re.sub(' +', ' ', res['content'].strip().replace('\n', ''))
123
+
113 124
         for new_res in results:
114 125
             # remove / from the end of the url if required
115 126
             p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path  # noqa
@@ -126,7 +137,7 @@ def score_results(results):
126 137
         # merge duplicates together
127 138
         if duplicated:
128 139
             # using content with more text
129
-            if res.get('content') > duplicated.get('content'):
140
+            if content_result_len(res) > content_result_len(duplicated):
130 141
                 duplicated['content'] = res['content']
131 142
 
132 143
             # increase result-score