Quellcode durchsuchen

[enh] better url comparison

asciimoo vor 11 Jahren
Ursprung
Commit
70cbc09e93
1 geänderte Datei mit 9 neuen und 1 gelöschten Zeilen
  1. searx/engines/__init__.py (+9, −1)

+ 9
- 1
searx/engines/__init__.py Datei anzeigen

22
 import grequests
22
 import grequests
23
 from itertools import izip_longest, chain
23
 from itertools import izip_longest, chain
24
 from operator import itemgetter
24
 from operator import itemgetter
25
+from urlparse import urlparse
25
 
26
 
26
 engine_dir = dirname(realpath(__file__))
27
 engine_dir = dirname(realpath(__file__))
27
 
28
 
87
     results = []
88
     results = []
88
     # deduplication + scoring
89
     # deduplication + scoring
89
     for i,res in enumerate(flat_res):
90
     for i,res in enumerate(flat_res):
91
+        res['parsed_url'] = urlparse(res['url'])
90
         score = flat_len - i
92
         score = flat_len - i
91
         duplicated = False
93
         duplicated = False
92
         for new_res in results:
94
         for new_res in results:
93
-            if res['url'] == new_res['url']:
95
+            if res['parsed_url'].netloc == new_res['parsed_url'].netloc and\
96
+               res['parsed_url'].path == new_res['parsed_url'].path:
94
                 duplicated = new_res
97
                 duplicated = new_res
95
                 break
98
                 break
96
         if duplicated:
99
         if duplicated:
97
             if len(res.get('content', '')) > len(duplicated.get('content', '')):
100
             if len(res.get('content', '')) > len(duplicated.get('content', '')):
98
                 duplicated['content'] = res['content']
101
                 duplicated['content'] = res['content']
99
             duplicated['score'] += score
102
             duplicated['score'] += score
103
+            if duplicated['parsed_url'].scheme == 'https':
104
+                continue
105
+            elif res['parsed_url'].scheme == 'https':
106
+                duplicated['parsed_url'].scheme == 'https'
107
+                duplicated['url'] = duplicated['parsed_url'].geturl()
100
         else:
108
         else:
101
             res['score'] = score
109
             res['score'] = score
102
             results.append(res)
110
             results.append(res)