Browse Source

[fix] when two results are merged, really use the content with more text

Dalf 10 years ago
parent
commit
6b058962e1
1 changed file with 12 additions and 1 deletion
  1. 12
    1
      searx/search.py

+ 12
- 1
searx/search.py View File

16
 '''
16
 '''
17
 
17
 
18
 import grequests
18
 import grequests
19
+import re
19
 from itertools import izip_longest, chain
20
 from itertools import izip_longest, chain
20
 from datetime import datetime
21
 from datetime import datetime
21
 from operator import itemgetter
22
 from operator import itemgetter
76
 
77
 
77
     return process_callback
78
     return process_callback
78
 
79
 
80
# return the meaningful length of the content for a result
def content_result_len(result):
    """Return the number of meaningful characters in a result's content.

    Punctuation, separators and spaces (``, ; : ! ? . / \\ ( ) - _`` and
    the space character) are ignored, so a content made only of those
    characters counts as empty.

    :param result: mapping read through ``result.get('content')``.
    :returns: count of the remaining characters, or 0 when ``'content'`` is
        missing or is not a string.
    """
    content = result.get('content')
    # (str, type(u'')) is (str, unicode) on Python 2 and plain str on
    # Python 3, i.e. the same types ``basestring`` accepted.
    if not isinstance(content, (str, type(u''))):
        return 0
    # '-' is placed last in the class so it is a literal hyphen; written as
    # ')-_' it formed a range that also removed digits and upper-case
    # letters, making e.g. 'HELLO 123' look empty.
    return len(re.sub(r'[,;:!?./\\ ()_-]', '', content))
79
 
87
 
80
 # score results and remove duplications
88
 # score results and remove duplications
81
 def score_results(results):
89
 def score_results(results):
110
         duplicated = False
118
         duplicated = False
111
 
119
 
112
         # check for duplicates
120
         # check for duplicates
121
+        if 'content' in res:
122
+            res['content'] = re.sub(' +', ' ', res['content'].strip().replace('\n', ''))
123
+
113
         for new_res in results:
124
         for new_res in results:
114
             # remove / from the end of the url if required
125
             # remove / from the end of the url if required
115
             p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path  # noqa
126
             p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path  # noqa
126
         # merge duplicates together
137
         # merge duplicates together
127
         if duplicated:
138
         if duplicated:
128
             # using content with more text
139
             # using content with more text
129
-            if res.get('content') > duplicated.get('content'):
140
+            if content_result_len(res) > content_result_len(duplicated):
130
                 duplicated['content'] = res['content']
141
                 duplicated['content'] = res['content']
131
 
142
 
132
             # increase result-score
143
             # increase result-score