浏览代码

[fix] merge infoboxes based on weight

marc 8 年前
父节点
当前提交
ad58b14be7
共有 4 个文件被更改,包括 50 次插入16 次删除
  1. 19
    5
      searx/engines/wikidata.py
  2. 17
    1
      searx/results.py
  3. 2
    0
      searx/settings.yml
  4. 12
    10
      tests/unit/engines/test_wikidata.py

+ 19
- 5
searx/engines/wikidata.py 查看文件

35
 
35
 
36
 url_map = 'https://www.openstreetmap.org/'\
36
 url_map = 'https://www.openstreetmap.org/'\
37
     + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M'
37
     + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M'
38
-url_image = 'https://commons.wikimedia.org/wiki/Special:FilePath/{filename}?width=500'
38
+url_image = 'https://commons.wikimedia.org/wiki/Special:FilePath/{filename}?width=500&height=400'
39
 
39
 
40
 # xpaths
40
 # xpaths
41
 wikidata_ids_xpath = '//div/ul[@class="wikibase-disambiguation"]/li/a/@title'
41
 wikidata_ids_xpath = '//div/ul[@class="wikibase-disambiguation"]/li/a/@title'
162
 
162
 
163
     # INFOBOX ATTRIBUTES (ROWS)
163
     # INFOBOX ATTRIBUTES (ROWS)
164
 
164
 
165
+    # DATES
165
     # inception date
166
     # inception date
166
     add_attribute(attributes, result, 'P571', date=True)
167
     add_attribute(attributes, result, 'P571', date=True)
167
     # dissolution date
168
     # dissolution date
170
     add_attribute(attributes, result, 'P580', date=True)
171
     add_attribute(attributes, result, 'P580', date=True)
171
     # end date
172
     # end date
172
     add_attribute(attributes, result, 'P582', date=True)
173
     add_attribute(attributes, result, 'P582', date=True)
173
-
174
     # date of birth
174
     # date of birth
175
     add_attribute(attributes, result, 'P569', date=True)
175
     add_attribute(attributes, result, 'P569', date=True)
176
     # date of death
176
     # date of death
177
     add_attribute(attributes, result, 'P570', date=True)
177
     add_attribute(attributes, result, 'P570', date=True)
178
+    # date of spacecraft launch
179
+    add_attribute(attributes, result, 'P619', date=True)
180
+    # date of spacecraft landing
181
+    add_attribute(attributes, result, 'P620', date=True)
178
 
182
 
179
     # nationality
183
     # nationality
180
     add_attribute(attributes, result, 'P27')
184
     add_attribute(attributes, result, 'P27')
201
     # area
205
     # area
202
     add_attribute(attributes, result, 'P2046')
206
     add_attribute(attributes, result, 'P2046')
203
     # currency
207
     # currency
204
-    add_attribute(attributes, result, 'P38')
208
+    add_attribute(attributes, result, 'P38', trim=True)
205
     # height (building)
209
     # height (building)
206
     add_attribute(attributes, result, 'P2048')
210
     add_attribute(attributes, result, 'P2048')
207
 
211
 
230
     add_attribute(attributes, result, 'P264')
234
     add_attribute(attributes, result, 'P264')
231
     # publisher
235
     # publisher
232
     add_attribute(attributes, result, 'P123')
236
     add_attribute(attributes, result, 'P123')
237
+    # original network
238
+    add_attribute(attributes, result, 'P449')
239
+    # distributor
240
+    add_attribute(attributes, result, 'P750')
233
     # composer
241
     # composer
234
     add_attribute(attributes, result, 'P86')
242
     add_attribute(attributes, result, 'P86')
235
     # publication date
243
     # publication date
266
     add_attribute(attributes, result, 'P112')
274
     add_attribute(attributes, result, 'P112')
267
     # legal form (company/organization)
275
     # legal form (company/organization)
268
     add_attribute(attributes, result, 'P1454')
276
     add_attribute(attributes, result, 'P1454')
277
+    # operator
278
+    add_attribute(attributes, result, 'P137')
279
+    # crew members (tripulation)
280
+    add_attribute(attributes, result, 'P1029')
269
     # taxon
281
     # taxon
270
     add_attribute(attributes, result, 'P225')
282
     add_attribute(attributes, result, 'P225')
271
     # chemical formula
283
     # chemical formula
300
 
312
 
301
 # only returns first match
313
 # only returns first match
302
 def add_image(result):
314
 def add_image(result):
303
-    # P18: image, P154: logo, P242: map, P41: flag, P2716: collage, P2910: icon
304
-    property_ids = ['P18', 'P154', 'P242', 'P41', 'P2716', 'P2910']
315
+    # P15: route map, P242: locator map, P154: logo, P18: image, P242: map, P41: flag, P2716: collage, P2910: icon
316
+    property_ids = ['P15', 'P242', 'P154', 'P18', 'P242', 'P41', 'P2716', 'P2910']
305
 
317
 
306
     for property_id in property_ids:
318
     for property_id in property_ids:
307
         image = result.xpath(property_xpath.replace('{propertyid}', property_id))
319
         image = result.xpath(property_xpath.replace('{propertyid}', property_id))
320
             label = default_label
332
             label = default_label
321
         else:
333
         else:
322
             label = extract_text(attribute[0].xpath(label_xpath))
334
             label = extract_text(attribute[0].xpath(label_xpath))
335
+            label = label[0].upper() + label[1:]
323
 
336
 
324
         if date:
337
         if date:
325
             trim = True
338
             trim = True
369
             dom_element = dom_element[0]
382
             dom_element = dom_element[0]
370
             if not default_label:
383
             if not default_label:
371
                 label = extract_text(dom_element.xpath(label_xpath))
384
                 label = extract_text(dom_element.xpath(label_xpath))
385
+                label = label[0].upper() + label[1:]
372
 
386
 
373
             if link_type == 'geo':
387
             if link_type == 'geo':
374
                 links.append(get_geolink(dom_element))
388
                 links.append(get_geolink(dom_element))

+ 17
- 1
searx/results.py 查看文件

43
 
43
 
44
 
44
 
45
 def merge_two_infoboxes(infobox1, infobox2):
45
 def merge_two_infoboxes(infobox1, infobox2):
46
+    # get engines weights
47
+    if hasattr(engines[infobox1['engine']], 'weight'):
48
+        weight1 = engines[infobox1['engine']].weight
49
+    else:
50
+        weight1 = 1
51
+    if hasattr(engines[infobox2['engine']], 'weight'):
52
+        weight2 = engines[infobox2['engine']].weight
53
+    else:
54
+        weight2 = 1
55
+
56
+    if weight2 > weight1:
57
+        infobox1['engine'] = infobox2['engine']
58
+
46
     if 'urls' in infobox2:
59
     if 'urls' in infobox2:
47
         urls1 = infobox1.get('urls', None)
60
         urls1 = infobox1.get('urls', None)
48
         if urls1 is None:
61
         if urls1 is None:
64
         img2 = infobox2.get('img_src')
77
         img2 = infobox2.get('img_src')
65
         if img1 is None:
78
         if img1 is None:
66
             infobox1['img_src'] = img2
79
             infobox1['img_src'] = img2
80
+        elif weight2 > weight1:
81
+            infobox1['img_src'] = img2
67
 
82
 
68
     if 'attributes' in infobox2:
83
     if 'attributes' in infobox2:
69
         attributes1 = infobox1.get('attributes', None)
84
         attributes1 = infobox1.get('attributes', None)
77
                 attributeSet.add(attribute.get('label', None))
92
                 attributeSet.add(attribute.get('label', None))
78
 
93
 
79
         for attribute in infobox2.get('attributes', []):
94
         for attribute in infobox2.get('attributes', []):
80
-            attributes1.append(attribute)
95
+            if attribute.get('label', None) not in attributeSet:
96
+                attributes1.append(attribute)
81
 
97
 
82
     if 'content' in infobox2:
98
     if 'content' in infobox2:
83
         content1 = infobox1.get('content', None)
99
         content1 = infobox1.get('content', None)

+ 2
- 0
searx/settings.yml 查看文件

105
   - name : ddg definitions
105
   - name : ddg definitions
106
     engine : duckduckgo_definitions
106
     engine : duckduckgo_definitions
107
     shortcut : ddd
107
     shortcut : ddd
108
+    weight : 2
108
     disabled : True
109
     disabled : True
109
 
110
 
110
   - name : digg
111
   - name : digg
127
   - name : wikidata
128
   - name : wikidata
128
     engine : wikidata
129
     engine : wikidata
129
     shortcut : wd
130
     shortcut : wd
131
+    weight : 2
130
 
132
 
131
   - name : duckduckgo
133
   - name : duckduckgo
132
     engine : duckduckgo
134
     engine : duckduckgo

+ 12
- 10
tests/unit/engines/test_wikidata.py 查看文件

95
 
95
 
96
         results = wikidata.getDetail(response, "Q123", "yua", "yua_MX")
96
         results = wikidata.getDetail(response, "Q123", "yua", "yua_MX")
97
         self.assertEqual(len(results), 2)
97
         self.assertEqual(len(results), 2)
98
-        self.assertEqual(results[0]['title'], 'official website')
98
+        self.assertEqual(results[0]['title'], 'Official website')
99
         self.assertEqual(results[0]['url'], 'https://officialsite.com')
99
         self.assertEqual(results[0]['url'], 'https://officialsite.com')
100
 
100
 
101
         self.assertEqual(results[1]['infobox'], 'Test')
101
         self.assertEqual(results[1]['infobox'], 'Test')
102
         self.assertEqual(results[1]['id'], None)
102
         self.assertEqual(results[1]['id'], None)
103
         self.assertEqual(results[1]['content'], 'Description')
103
         self.assertEqual(results[1]['content'], 'Description')
104
         self.assertEqual(results[1]['attributes'], [])
104
         self.assertEqual(results[1]['attributes'], [])
105
-        self.assertEqual(results[1]['urls'][0]['title'], 'official website')
105
+        self.assertEqual(results[1]['urls'][0]['title'], 'Official website')
106
         self.assertEqual(results[1]['urls'][0]['url'], 'https://officialsite.com')
106
         self.assertEqual(results[1]['urls'][0]['url'], 'https://officialsite.com')
107
         self.assertEqual(results[1]['urls'][1]['title'], 'Wikipedia (en)')
107
         self.assertEqual(results[1]['urls'][1]['title'], 'Wikipedia (en)')
108
         self.assertEqual(results[1]['urls'][1]['url'], 'https://en.wikipedia.org/wiki/Test')
108
         self.assertEqual(results[1]['urls'][1]['url'], 'https://en.wikipedia.org/wiki/Test')
141
         html_etree = fromstring(html)
141
         html_etree = fromstring(html)
142
 
142
 
143
         image_src = wikidata.add_image(html_etree)
143
         image_src = wikidata.add_image(html_etree)
144
-        self.assertEqual(image_src, "https://commons.wikimedia.org/wiki/Special:FilePath/image.png?width=500")
144
+        self.assertEqual(image_src,
145
+                         "https://commons.wikimedia.org/wiki/Special:FilePath/image.png?width=500&height=400")
145
 
146
 
146
         html = u"""
147
         html = u"""
147
         <div>
148
         <div>
196
         html_etree = fromstring(html)
197
         html_etree = fromstring(html)
197
 
198
 
198
         image_src = wikidata.add_image(html_etree)
199
         image_src = wikidata.add_image(html_etree)
199
-        self.assertEqual(image_src, "https://commons.wikimedia.org/wiki/Special:FilePath/logo.png?width=500")
200
+        self.assertEqual(image_src,
201
+                         "https://commons.wikimedia.org/wiki/Special:FilePath/logo.png?width=500&height=400")
200
 
202
 
201
     def test_add_attribute(self):
203
     def test_add_attribute(self):
202
         html = u"""
204
         html = u"""
234
 
236
 
235
         wikidata.add_attribute(attributes, html_etree, "P27")
237
         wikidata.add_attribute(attributes, html_etree, "P27")
236
         self.assertEqual(len(attributes), 1)
238
         self.assertEqual(len(attributes), 1)
237
-        self.assertEqual(attributes[0]["label"], "country of citizenship")
239
+        self.assertEqual(attributes[0]["label"], "Country of citizenship")
238
         self.assertEqual(attributes[0]["value"], "United Kingdom")
240
         self.assertEqual(attributes[0]["value"], "United Kingdom")
239
 
241
 
240
         html = u"""
242
         html = u"""
269
         html_etree = fromstring(html)
271
         html_etree = fromstring(html)
270
         wikidata.add_attribute(attributes, html_etree, "P569", date=True)
272
         wikidata.add_attribute(attributes, html_etree, "P569", date=True)
271
         self.assertEqual(len(attributes), 1)
273
         self.assertEqual(len(attributes), 1)
272
-        self.assertEqual(attributes[0]["label"], "date of birth")
274
+        self.assertEqual(attributes[0]["label"], "Date of birth")
273
         self.assertEqual(attributes[0]["value"], "27 January 1832")
275
         self.assertEqual(attributes[0]["value"], "27 January 1832")
274
 
276
 
275
         html = u"""
277
         html = u"""
317
         html_etree = fromstring(html)
319
         html_etree = fromstring(html)
318
         wikidata.add_attribute(attributes, html_etree, "P6")
320
         wikidata.add_attribute(attributes, html_etree, "P6")
319
         self.assertEqual(len(attributes), 1)
321
         self.assertEqual(len(attributes), 1)
320
-        self.assertEqual(attributes[0]["label"], "head of government")
322
+        self.assertEqual(attributes[0]["label"], "Head of government")
321
         self.assertEqual(attributes[0]["value"], "Old Prime Minister, Actual Prime Minister")
323
         self.assertEqual(attributes[0]["value"], "Old Prime Minister, Actual Prime Minister")
322
 
324
 
323
         attributes = []
325
         attributes = []
355
         html_etree = fromstring(html)
357
         html_etree = fromstring(html)
356
         wikidata.add_url(urls, html_etree, 'P856')
358
         wikidata.add_url(urls, html_etree, 'P856')
357
         self.assertEquals(len(urls), 1)
359
         self.assertEquals(len(urls), 1)
358
-        self.assertIn({'title': 'official website', 'url': 'https://searx.me/'}, urls)
360
+        self.assertIn({'title': 'Official website', 'url': 'https://searx.me/'}, urls)
359
         urls = []
361
         urls = []
360
         results = []
362
         results = []
361
         wikidata.add_url(urls, html_etree, 'P856', 'custom label', results=results)
363
         wikidata.add_url(urls, html_etree, 'P856', 'custom label', results=results)
403
         html_etree = fromstring(html)
405
         html_etree = fromstring(html)
404
         wikidata.add_url(urls, html_etree, 'P856')
406
         wikidata.add_url(urls, html_etree, 'P856')
405
         self.assertEquals(len(urls), 2)
407
         self.assertEquals(len(urls), 2)
406
-        self.assertIn({'title': 'official website', 'url': 'http://www.worldofwarcraft.com'}, urls)
407
-        self.assertIn({'title': 'official website', 'url': 'http://eu.battle.net/wow/en/'}, urls)
408
+        self.assertIn({'title': 'Official website', 'url': 'http://www.worldofwarcraft.com'}, urls)
409
+        self.assertIn({'title': 'Official website', 'url': 'http://eu.battle.net/wow/en/'}, urls)
408
 
410
 
409
     def test_get_imdblink(self):
411
     def test_get_imdblink(self):
410
         html = u"""
412
         html = u"""