|
@@ -16,6 +16,7 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
|
16
|
16
|
'''
|
17
|
17
|
|
18
|
18
|
import grequests
|
|
19
|
+import re
|
19
|
20
|
from itertools import izip_longest, chain
|
20
|
21
|
from datetime import datetime
|
21
|
22
|
from operator import itemgetter
|
|
@@ -76,6 +77,13 @@ def make_callback(engine_name, results, suggestions, callback, params):
|
76
|
77
|
|
77
|
78
|
return process_callback
|
78
|
79
|
|
|
80
|
# return the meaningful length of the content for a result
def content_result_len(result):
    """Return the number of meaningful characters in a result's content.

    Punctuation, separators and whitespace are stripped first, so that
    duplicate merging keeps the result whose content carries more actual
    text.  Returns 0 when 'content' is missing or not a string.
    """
    content = result.get('content')
    if isinstance(content, basestring):
        # BUGFIX: the previous class '[,;:!?\./\\\\ ()-_]' contained an
        # accidental ')-_' range (0x29-0x5F) that also stripped digits and
        # uppercase letters.  The hyphen is placed last so it matches a
        # literal '-'; '.' needs no escaping inside a character class.
        meaningful = re.sub(r'[,;:!?/.\\ ()_-]', '', content)
        return len(meaningful)
    else:
        return 0
|
79
|
87
|
|
80
|
88
|
# score results and remove duplications
|
81
|
89
|
def score_results(results):
|
|
@@ -110,6 +118,9 @@ def score_results(results):
|
110
|
118
|
duplicated = False
|
111
|
119
|
|
112
|
120
|
# check for duplicates
|
|
121
|
+ if 'content' in res:
|
|
122
|
+ res['content'] = re.sub(' +', ' ', res['content'].strip().replace('\n', ''))
|
|
123
|
+
|
113
|
124
|
for new_res in results:
|
114
|
125
|
# remove / from the end of the url if required
|
115
|
126
|
p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path # noqa
|
|
@@ -126,7 +137,7 @@ def score_results(results):
|
126
|
137
|
# merge duplicates together
|
127
|
138
|
if duplicated:
|
128
|
139
|
# using content with more text
|
129
|
|
- if res.get('content') > duplicated.get('content'):
|
|
140
|
+ if content_result_len(res) > content_result_len(duplicated):
|
130
|
141
|
duplicated['content'] = res['content']
|
131
|
142
|
|
132
|
143
|
# increase result-score
|