Bläddra i källkod

[fix] www. domain duplications

Adam Tauber 11 år sedan
förälder
incheckning
b226e6462b
1 ändrad fil med 9 tillägg och 1 borttagning
  1. 9
    1
      searx/engines/__init__.py

+ 9
- 1
searx/engines/__init__.py Visa fil

@@ -154,16 +154,24 @@ def score_results(results):
154 154
     # deduplication + scoring
155 155
     for i, res in enumerate(flat_res):
156 156
         res['parsed_url'] = urlparse(res['url'])
157
+        res['host'] = res['parsed_url'].netloc
158
+
159
+        if res['host'].startswith('www.'):
160
+            res['host'] = res['host'].replace('www.', '', 1)
161
+
157 162
         res['engines'] = [res['engine']]
158 163
         weight = 1.0
164
+
159 165
         if hasattr(engines[res['engine']], 'weight'):
160 166
             weight = float(engines[res['engine']].weight)
167
+
161 168
         score = int((flat_len - i) / engines_len) * weight + 1
162 169
         duplicated = False
170
+
163 171
         for new_res in results:
164 172
             p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path  # noqa
165 173
             p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path  # noqa
166
-            if res['parsed_url'].netloc == new_res['parsed_url'].netloc and\
174
+            if res['host'] == new_res['host'] and\
167 175
                p1 == p2 and\
168 176
                res['parsed_url'].query == new_res['parsed_url'].query and\
169 177
                res.get('template') == new_res.get('template'):