Parcourir la source

[enh] better url comparison

asciimoo il y a 11 ans
Parent
révision
70cbc09e93
1 fichier modifié avec 9 ajouts et 1 suppression
  1. 9
    1
      searx/engines/__init__.py

+ 9
- 1
searx/engines/__init__.py Voir le fichier

@@ -22,6 +22,7 @@ from imp import load_source
22 22
 import grequests
23 23
 from itertools import izip_longest, chain
24 24
 from operator import itemgetter
25
+from urlparse import urlparse
25 26
 
26 27
 engine_dir = dirname(realpath(__file__))
27 28
 
@@ -87,16 +88,23 @@ def search(query, request, selected_engines):
87 88
     results = []
88 89
     # deduplication + scoring
89 90
     for i,res in enumerate(flat_res):
91
+        res['parsed_url'] = urlparse(res['url'])
90 92
         score = flat_len - i
91 93
         duplicated = False
92 94
         for new_res in results:
93
-            if res['url'] == new_res['url']:
95
+            if res['parsed_url'].netloc == new_res['parsed_url'].netloc and\
96
+               res['parsed_url'].path == new_res['parsed_url'].path:
94 97
                 duplicated = new_res
95 98
                 break
96 99
         if duplicated:
97 100
             if len(res.get('content', '')) > len(duplicated.get('content', '')):
98 101
                 duplicated['content'] = res['content']
99 102
             duplicated['score'] += score
103
+            if duplicated['parsed_url'].scheme == 'https':
104
+                continue
105
+            elif res['parsed_url'].scheme == 'https':
106
+                duplicated['parsed_url'].scheme == 'https'
107
+                duplicated['url'] = duplicated['parsed_url'].geturl()
100 108
         else:
101 109
             res['score'] = score
102 110
             results.append(res)