|
@@ -18,7 +18,17 @@ def result_content_len(content):
|
18
|
18
|
|
19
|
19
|
|
20
|
20
|
def compare_urls(url_a, url_b):
|
21
|
|
- if url_a.netloc != url_b.netloc or url_a.query != url_b.query:
|
|
21
|
+ # ignore www. in comparison
|
|
22
|
+ if url_a.netloc.startswith('www.'):
|
|
23
|
+ host_a = url_a.netloc.replace('www.', '', 1)
|
|
24
|
+ else:
|
|
25
|
+ host_a = url_a.netloc
|
|
26
|
+ if url_b.netloc.startswith('www.'):
|
|
27
|
+ host_b = url_b.netloc.replace('www.', '', 1)
|
|
28
|
+ else:
|
|
29
|
+ host_b = url_b.netloc
|
|
30
|
+
|
|
31
|
+ if host_a != host_b or url_a.query != url_b.query:
|
22
|
32
|
return False
|
23
|
33
|
|
24
|
34
|
# remove / from the end of the url if required
|
|
@@ -37,15 +47,17 @@ def merge_two_infoboxes(infobox1, infobox2):
|
37
|
47
|
urls1 = infobox1.get('urls', None)
|
38
|
48
|
if urls1 is None:
|
39
|
49
|
urls1 = []
|
40
|
|
- infobox1['urls'] = urls1
|
41
|
50
|
|
42
|
|
- urlSet = set()
|
43
|
|
- for url in infobox1.get('urls', []):
|
44
|
|
- urlSet.add(url.get('url', None))
|
|
51
|
+ for url2 in infobox2.get('urls', []):
|
|
52
|
+ unique_url = True
|
|
53
|
+ for url1 in infobox1.get('urls', []):
|
|
54
|
+ if compare_urls(urlparse(url1.get('url', '')), urlparse(url2.get('url', ''))):
|
|
55
|
+ unique_url = False
|
|
56
|
+ break
|
|
57
|
+ if unique_url:
|
|
58
|
+ urls1.append(url2)
|
45
|
59
|
|
46
|
|
- for url in infobox2.get('urls', []):
|
47
|
|
- if url.get('url', None) not in urlSet:
|
48
|
|
- urls1.append(url)
|
|
60
|
+ infobox1['urls'] = urls1
|
49
|
61
|
|
50
|
62
|
if 'img_src' in infobox2:
|
51
|
63
|
img1 = infobox1.get('img_src', None)
|
|
@@ -97,7 +109,6 @@ class ResultContainer(object):
|
97
|
109
|
self.results = defaultdict(list)
|
98
|
110
|
self._merged_results = []
|
99
|
111
|
self.infoboxes = []
|
100
|
|
- self._infobox_ids = {}
|
101
|
112
|
self.suggestions = set()
|
102
|
113
|
self.answers = set()
|
103
|
114
|
self._number_of_results = []
|
|
@@ -138,14 +149,13 @@ class ResultContainer(object):
|
138
|
149
|
add_infobox = True
|
139
|
150
|
infobox_id = infobox.get('id', None)
|
140
|
151
|
if infobox_id is not None:
|
141
|
|
- existingIndex = self._infobox_ids.get(infobox_id, None)
|
142
|
|
- if existingIndex is not None:
|
143
|
|
- merge_two_infoboxes(self.infoboxes[existingIndex], infobox)
|
144
|
|
- add_infobox = False
|
|
152
|
+ for existingIndex in self.infoboxes:
|
|
153
|
+ if compare_urls(urlparse(existingIndex.get('id', '')), urlparse(infobox_id)):
|
|
154
|
+ merge_two_infoboxes(existingIndex, infobox)
|
|
155
|
+ add_infobox = False
|
145
|
156
|
|
146
|
157
|
if add_infobox:
|
147
|
158
|
self.infoboxes.append(infobox)
|
148
|
|
- self._infobox_ids[infobox_id] = len(self.infoboxes) - 1
|
149
|
159
|
|
150
|
160
|
def _merge_result(self, result, position):
|
151
|
161
|
result['parsed_url'] = urlparse(result['url'])
|
|
@@ -155,11 +165,6 @@ class ResultContainer(object):
|
155
|
165
|
result['parsed_url'] = result['parsed_url']._replace(scheme="http")
|
156
|
166
|
result['url'] = result['parsed_url'].geturl()
|
157
|
167
|
|
158
|
|
- result['host'] = result['parsed_url'].netloc
|
159
|
|
-
|
160
|
|
- if result['host'].startswith('www.'):
|
161
|
|
- result['host'] = result['host'].replace('www.', '', 1)
|
162
|
|
-
|
163
|
168
|
result['engines'] = [result['engine']]
|
164
|
169
|
|
165
|
170
|
# strip multiple spaces and carriage returns from content
|