Pārlūkot izejas kodu

[fix] urls merge in infobox (#593)

TODO:
    merge attributes
marc 8 gadus atpakaļ
vecāks
revīzija
c2e4014287
2 mainīti faili ar 25 papildinājumiem un 21 dzēšanām
  1. 1
    2
      searx/engines/wikipedia.py
  2. 24
    19
      searx/results.py

+ 1
- 2
searx/engines/wikipedia.py Parādīt failu

99
         return []
99
         return []
100
 
100
 
101
     # link to wikipedia article
101
     # link to wikipedia article
102
-    # parenthesis are not quoted to make infobox mergeable with wikidata's
103
     wikipedia_link = url_lang(resp.search_params['language']) \
102
     wikipedia_link = url_lang(resp.search_params['language']) \
104
-        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8')).replace('%28', '(').replace('%29', ')')
103
+        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8'))
105
 
104
 
106
     results.append({'url': wikipedia_link, 'title': title})
105
     results.append({'url': wikipedia_link, 'title': title})
107
 
106
 

+ 24
- 19
searx/results.py Parādīt failu

18
 
18
 
19
 
19
 
20
 def compare_urls(url_a, url_b):
20
 def compare_urls(url_a, url_b):
21
-    if url_a.netloc != url_b.netloc or url_a.query != url_b.query:
21
+    # ignore www. in comparison
22
+    if url_a.netloc.startswith('www.'):
23
+        host_a = url_a.netloc.replace('www.', '', 1)
24
+    else:
25
+        host_a = url_a.netloc
26
+    if url_b.netloc.startswith('www.'):
27
+        host_b = url_b.netloc.replace('www.', '', 1)
28
+    else:
29
+        host_b = url_b.netloc
30
+
31
+    if host_a != host_b or url_a.query != url_b.query:
22
         return False
32
         return False
23
 
33
 
24
     # remove / from the end of the url if required
34
     # remove / from the end of the url if required
37
         urls1 = infobox1.get('urls', None)
47
         urls1 = infobox1.get('urls', None)
38
         if urls1 is None:
48
         if urls1 is None:
39
             urls1 = []
49
             urls1 = []
40
-            infobox1['urls'] = urls1
41
 
50
 
42
-        urlSet = set()
43
-        for url in infobox1.get('urls', []):
44
-            urlSet.add(url.get('url', None))
51
+        for url2 in infobox2.get('urls', []):
52
+            unique_url = True
53
+            for url1 in infobox1.get('urls', []):
54
+                if compare_urls(urlparse(url1.get('url', '')), urlparse(url2.get('url', ''))):
55
+                    unique_url = False
56
+                    break
57
+            if unique_url:
58
+                urls1.append(url2)
45
 
59
 
46
-        for url in infobox2.get('urls', []):
47
-            if url.get('url', None) not in urlSet:
48
-                urls1.append(url)
60
+        infobox1['urls'] = urls1
49
 
61
 
50
     if 'img_src' in infobox2:
62
     if 'img_src' in infobox2:
51
         img1 = infobox1.get('img_src', None)
63
         img1 = infobox1.get('img_src', None)
97
         self.results = defaultdict(list)
109
         self.results = defaultdict(list)
98
         self._merged_results = []
110
         self._merged_results = []
99
         self.infoboxes = []
111
         self.infoboxes = []
100
-        self._infobox_ids = {}
101
         self.suggestions = set()
112
         self.suggestions = set()
102
         self.answers = set()
113
         self.answers = set()
103
         self._number_of_results = []
114
         self._number_of_results = []
138
         add_infobox = True
149
         add_infobox = True
139
         infobox_id = infobox.get('id', None)
150
         infobox_id = infobox.get('id', None)
140
         if infobox_id is not None:
151
         if infobox_id is not None:
141
-            existingIndex = self._infobox_ids.get(infobox_id, None)
142
-            if existingIndex is not None:
143
-                merge_two_infoboxes(self.infoboxes[existingIndex], infobox)
144
-                add_infobox = False
152
+            for existingIndex in self.infoboxes:
153
+                if compare_urls(urlparse(existingIndex.get('id', '')), urlparse(infobox_id)):
154
+                    merge_two_infoboxes(existingIndex, infobox)
155
+                    add_infobox = False
145
 
156
 
146
         if add_infobox:
157
         if add_infobox:
147
             self.infoboxes.append(infobox)
158
             self.infoboxes.append(infobox)
148
-            self._infobox_ids[infobox_id] = len(self.infoboxes) - 1
149
 
159
 
150
     def _merge_result(self, result, position):
160
     def _merge_result(self, result, position):
151
         result['parsed_url'] = urlparse(result['url'])
161
         result['parsed_url'] = urlparse(result['url'])
155
             result['parsed_url'] = result['parsed_url']._replace(scheme="http")
165
             result['parsed_url'] = result['parsed_url']._replace(scheme="http")
156
             result['url'] = result['parsed_url'].geturl()
166
             result['url'] = result['parsed_url'].geturl()
157
 
167
 
158
-        result['host'] = result['parsed_url'].netloc
159
-
160
-        if result['host'].startswith('www.'):
161
-            result['host'] = result['host'].replace('www.', '', 1)
162
-
163
         result['engines'] = [result['engine']]
168
         result['engines'] = [result['engine']]
164
 
169
 
165
         # strip multiple spaces and cariage returns from content
170
         # strip multiple spaces and cariage returns from content