
[enh] multilingual wikidata

disambiguation and tags are shown in the local language

TOFIX:
    needs to query the API every time to resolve each label's name (see the caching sketch after the diff)
a01200356 8 years ago
commit 93ef11adc0
1 changed file with 36 additions and 16 deletions

searx/engines/wikidata.py  +36 −16

@@ -7,15 +7,16 @@ from searx.utils import format_date_by_locale
 from datetime import datetime
 from dateutil.parser import parse as dateutil_parse
 from urllib import urlencode
+from lxml.html import fromstring
 
 
 logger = logger.getChild('wikidata')
 result_count = 1
 wikidata_host = 'https://www.wikidata.org'
+url_search = wikidata_host \
+    + '/wiki/Special:ItemDisambiguation?{query}'
+
 wikidata_api = wikidata_host + '/w/api.php'
-url_search = wikidata_api \
-    + '?action=query&list=search&format=json'\
-    + '&srnamespace=0&srprop=sectiontitle&{query}'
 url_detail = wikidata_api\
     + '?action=wbgetentities&format=json'\
     + '&props=labels%7Cinfo%7Csitelinks'\
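Note: the search now goes through the Special:ItemDisambiguation HTML page rather than the fulltext search API. As a rough illustration, the URL request() builds for a hypothetical query 'Paris' with a French locale would look like this (parameter order may vary, since urlencode serializes a dict):

    from urllib import urlencode  # Python 2, matching this module

    url_search = 'https://www.wikidata.org' + '/wiki/Special:ItemDisambiguation?{query}'
    print(url_search.format(query=urlencode({'label': 'Paris', 'language': 'fr'})))
    # -> https://www.wikidata.org/wiki/Special:ItemDisambiguation?label=Paris&language=fr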
@@ -23,22 +24,27 @@ url_detail = wikidata_api\
     + '&{query}'
 url_map = 'https://www.openstreetmap.org/'\
     + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M'
+url_entity_label = wikidata_api\
+    + '?action=wbgetentities&format=json&props=labels&{query}'
+
+wikidata_ids_xpath = '//div/ul[@class="wikibase-disambiguation"]/li/a/@title'
 
 
 def request(query, params):
+    language = params['language'].split('_')[0]
+    if language == 'all':
+        language = 'en'
+
     params['url'] = url_search.format(
-        query=urlencode({'srsearch': query,
-                        'srlimit': result_count}))
+        query=urlencode({'label': query,
+                        'language': language}))
     return params
 
 
 def response(resp):
     results = []
-    search_res = json.loads(resp.text)
-
-    wikidata_ids = set()
-    for r in search_res.get('query', {}).get('search', {}):
-        wikidata_ids.add(r.get('title', ''))
+    html = fromstring(resp.content)
+    wikidata_ids = html.xpath(wikidata_ids_xpath)
 
     language = resp.search_params['language'].split('_')[0]
     if language == 'all':
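The new response() scrapes item ids straight out of the disambiguation page markup via wikidata_ids_xpath. A minimal sketch against a hand-made fragment; the markup below is an assumption inferred from the XPath itself, not captured from Wikidata:

    from lxml.html import fromstring

    wikidata_ids_xpath = '//div/ul[@class="wikibase-disambiguation"]/li/a/@title'
    sample = ('<div><ul class="wikibase-disambiguation">'
              '<li><a title="Q90">Paris</a></li>'
              '<li><a title="Q167646">Paris (mythology)</a></li>'
              '</ul></div>')
    print(fromstring(sample).xpath(wikidata_ids_xpath))  # ['Q90', 'Q167646']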
@@ -49,7 +55,7 @@ def response(resp):
 
     htmlresponse = get(url)
     jsonresponse = json.loads(htmlresponse.content)
-    for wikidata_id in wikidata_ids:
+    for wikidata_id in wikidata_ids[:result_count]:
         results = results + getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'])
 
     return results
@@ -82,7 +88,7 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
     claims = result.get('claims', {})
     official_website = get_string(claims, 'P856', None)
     if official_website is not None:
-        urls.append({'title': 'Official site', 'url': official_website})
+        urls.append({'title': get_label('P856', language), 'url': official_website})
         results.append({'title': title, 'url': official_website})
 
     wikipedia_link_count = 0
@@ -124,8 +130,9 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
             'Commons wiki',
             get_wikilink(result, 'commonswiki'))
 
+    # Location
     add_url(urls,
-            'Location',
+            get_label('P625', language),
             get_geolink(claims, 'P625', None))
 
     add_url(urls,
@@ -169,15 +176,15 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
 
     postal_code = get_string(claims, 'P281', None)
     if postal_code is not None:
-        attributes.append({'label': 'Postal code(s)', 'value': postal_code})
+        attributes.append({'label': get_label('P281', language), 'value': postal_code})
 
     date_of_birth = get_time(claims, 'P569', locale, None)
     if date_of_birth is not None:
-        attributes.append({'label': 'Date of birth', 'value': date_of_birth})
+        attributes.append({'label': get_label('P569', language), 'value': date_of_birth})
 
     date_of_death = get_time(claims, 'P570', locale, None)
     if date_of_death is not None:
-        attributes.append({'label': 'Date of death', 'value': date_of_death})
+        attributes.append({'label': get_label('P570', language), 'value': date_of_death})
 
     if len(attributes) == 0 and len(urls) == 2 and len(description) == 0:
         results.append({
@@ -321,3 +328,16 @@ def get_wiki_firstlanguage(result, wikipatternid):
         if k.endswith(wikipatternid) and len(k) == (2 + len(wikipatternid)):
             return k[0:2]
     return None
+
+
+def get_label(entity_id, language):
+    url = url_entity_label.format(query=urlencode({'ids': entity_id,
+                                                   'languages': language + '|en'}))
+
+    response = get(url)
+    jsonresponse = json.loads(response.text)
+    label = jsonresponse.get('entities', {}).get(entity_id, {}).get('labels', {}).get(language, {}).get('value', None)
+    if label is None:
+        label = jsonresponse['entities'][entity_id]['labels']['en']['value']
+
+    return label
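One way to address the TOFIX in the commit message (a sketch, not part of this commit): memoize get_label() per (entity id, language), so property labels such as P856 or P569 hit the API at most once per language:

    label_cache = {}  # hypothetical module-level cache

    def get_label_cached(entity_id, language):
        # wraps the get_label() added in this commit
        key = (entity_id, language)
        if key not in label_cache:
            label_cache[key] = get_label(entity_id, language)
        return label_cache[key]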