|
@@ -22,6 +22,7 @@ from imp import load_source
|
22
|
22
|
import grequests
|
23
|
23
|
from itertools import izip_longest, chain
|
24
|
24
|
from operator import itemgetter
|
|
25
|
+from urlparse import urlparse
|
25
|
26
|
|
26
|
27
|
engine_dir = dirname(realpath(__file__))
|
27
|
28
|
|
|
@@ -87,16 +88,23 @@ def search(query, request, selected_engines):
|
87
|
88
|
results = []
|
88
|
89
|
# deduplication + scoring
|
89
|
90
|
for i,res in enumerate(flat_res):
|
|
91
|
+ res['parsed_url'] = urlparse(res['url'])
|
90
|
92
|
score = flat_len - i
|
91
|
93
|
duplicated = False
|
92
|
94
|
for new_res in results:
|
93
|
|
- if res['url'] == new_res['url']:
|
|
95
|
+ if res['parsed_url'].netloc == new_res['parsed_url'].netloc and\
|
|
96
|
+ res['parsed_url'].path == new_res['parsed_url'].path:
|
94
|
97
|
duplicated = new_res
|
95
|
98
|
break
|
96
|
99
|
if duplicated:
|
97
|
100
|
if len(res.get('content', '')) > len(duplicated.get('content', '')):
|
98
|
101
|
duplicated['content'] = res['content']
|
99
|
102
|
duplicated['score'] += score
|
|
103
|
+ if duplicated['parsed_url'].scheme == 'https':
|
|
104
|
+ continue
|
|
105
|
+ elif res['parsed_url'].scheme == 'https':
|
|
106
|
+ duplicated['parsed_url'].scheme == 'https'
|
|
107
|
+ duplicated['url'] = duplicated['parsed_url'].geturl()
|
100
|
108
|
else:
|
101
|
109
|
res['score'] = score
|
102
|
110
|
results.append(res)
|