浏览代码

[fix] urls merge in infobox (#593)

TODO:
    merge attributes
marc 8 年前
父节点
当前提交
c2e4014287
共有 2 个文件被更改，包括 25 次插入和 21 次删除
  1. 1
    2
      searx/engines/wikipedia.py
  2. 24
    19
      searx/results.py

+ 1
- 2
searx/engines/wikipedia.py 查看文件

@@ -99,9 +99,8 @@ def response(resp):
99 99
         return []
100 100
 
101 101
     # link to wikipedia article
102
-    # parenthesis are not quoted to make infobox mergeable with wikidata's
103 102
     wikipedia_link = url_lang(resp.search_params['language']) \
104
-        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8')).replace('%28', '(').replace('%29', ')')
103
+        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8'))
105 104
 
106 105
     results.append({'url': wikipedia_link, 'title': title})
107 106
 

+ 24
- 19
searx/results.py 查看文件

@@ -18,7 +18,17 @@ def result_content_len(content):
18 18
 
19 19
 
20 20
 def compare_urls(url_a, url_b):
21
-    if url_a.netloc != url_b.netloc or url_a.query != url_b.query:
21
+    # ignore www. in comparison
22
+    if url_a.netloc.startswith('www.'):
23
+        host_a = url_a.netloc.replace('www.', '', 1)
24
+    else:
25
+        host_a = url_a.netloc
26
+    if url_b.netloc.startswith('www.'):
27
+        host_b = url_b.netloc.replace('www.', '', 1)
28
+    else:
29
+        host_b = url_b.netloc
30
+
31
+    if host_a != host_b or url_a.query != url_b.query:
22 32
         return False
23 33
 
24 34
     # remove / from the end of the url if required
@@ -37,15 +47,17 @@ def merge_two_infoboxes(infobox1, infobox2):
37 47
         urls1 = infobox1.get('urls', None)
38 48
         if urls1 is None:
39 49
             urls1 = []
40
-            infobox1['urls'] = urls1
41 50
 
42
-        urlSet = set()
43
-        for url in infobox1.get('urls', []):
44
-            urlSet.add(url.get('url', None))
51
+        for url2 in infobox2.get('urls', []):
52
+            unique_url = True
53
+            for url1 in infobox1.get('urls', []):
54
+                if compare_urls(urlparse(url1.get('url', '')), urlparse(url2.get('url', ''))):
55
+                    unique_url = False
56
+                    break
57
+            if unique_url:
58
+                urls1.append(url2)
45 59
 
46
-        for url in infobox2.get('urls', []):
47
-            if url.get('url', None) not in urlSet:
48
-                urls1.append(url)
60
+        infobox1['urls'] = urls1
49 61
 
50 62
     if 'img_src' in infobox2:
51 63
         img1 = infobox1.get('img_src', None)
@@ -97,7 +109,6 @@ class ResultContainer(object):
97 109
         self.results = defaultdict(list)
98 110
         self._merged_results = []
99 111
         self.infoboxes = []
100
-        self._infobox_ids = {}
101 112
         self.suggestions = set()
102 113
         self.answers = set()
103 114
         self._number_of_results = []
@@ -138,14 +149,13 @@ class ResultContainer(object):
138 149
         add_infobox = True
139 150
         infobox_id = infobox.get('id', None)
140 151
         if infobox_id is not None:
141
-            existingIndex = self._infobox_ids.get(infobox_id, None)
142
-            if existingIndex is not None:
143
-                merge_two_infoboxes(self.infoboxes[existingIndex], infobox)
144
-                add_infobox = False
152
+            for existingIndex in self.infoboxes:
153
+                if compare_urls(urlparse(existingIndex.get('id', '')), urlparse(infobox_id)):
154
+                    merge_two_infoboxes(existingIndex, infobox)
155
+                    add_infobox = False
145 156
 
146 157
         if add_infobox:
147 158
             self.infoboxes.append(infobox)
148
-            self._infobox_ids[infobox_id] = len(self.infoboxes) - 1
149 159
 
150 160
     def _merge_result(self, result, position):
151 161
         result['parsed_url'] = urlparse(result['url'])
@@ -155,11 +165,6 @@ class ResultContainer(object):
155 165
             result['parsed_url'] = result['parsed_url']._replace(scheme="http")
156 166
             result['url'] = result['parsed_url'].geturl()
157 167
 
158
-        result['host'] = result['parsed_url'].netloc
159
-
160
-        if result['host'].startswith('www.'):
161
-            result['host'] = result['host'].replace('www.', '', 1)
162
-
163 168
         result['engines'] = [result['engine']]
164 169
 
165 170
         # strip multiple spaces and cariage returns from content