Browse Source

[enh] multilingual wikidata

disambiguation and tags are in the local language

FIXME:
    needs to query the API every time to look up each label's name
a01200356 9 years ago
parent
commit
93ef11adc0
1 changed file with 36 additions and 16 deletions
  1. 36
    16
      searx/engines/wikidata.py

+ 36
- 16
searx/engines/wikidata.py View File

7
 from datetime import datetime
7
 from datetime import datetime
8
 from dateutil.parser import parse as dateutil_parse
8
 from dateutil.parser import parse as dateutil_parse
9
 from urllib import urlencode
9
 from urllib import urlencode
10
+from lxml.html import fromstring
10
 
11
 
11
 
12
 
12
 logger = logger.getChild('wikidata')
13
 logger = logger.getChild('wikidata')
13
 result_count = 1
14
 result_count = 1
14
 wikidata_host = 'https://www.wikidata.org'
15
 wikidata_host = 'https://www.wikidata.org'
16
+url_search = wikidata_host \
17
+    + '/wiki/Special:ItemDisambiguation?{query}'
18
+
15
 wikidata_api = wikidata_host + '/w/api.php'
19
 wikidata_api = wikidata_host + '/w/api.php'
16
-url_search = wikidata_api \
17
-    + '?action=query&list=search&format=json'\
18
-    + '&srnamespace=0&srprop=sectiontitle&{query}'
19
 url_detail = wikidata_api\
20
 url_detail = wikidata_api\
20
     + '?action=wbgetentities&format=json'\
21
     + '?action=wbgetentities&format=json'\
21
     + '&props=labels%7Cinfo%7Csitelinks'\
22
     + '&props=labels%7Cinfo%7Csitelinks'\
23
     + '&{query}'
24
     + '&{query}'
24
 url_map = 'https://www.openstreetmap.org/'\
25
 url_map = 'https://www.openstreetmap.org/'\
25
     + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M'
26
     + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M'
27
+url_entity_label = wikidata_api\
28
+    + '?action=wbgetentities&format=json&props=labels&{query}'
29
+
30
+wikidata_ids_xpath = '//div/ul[@class="wikibase-disambiguation"]/li/a/@title'
26
 
31
 
27
 
32
 
28
 def request(query, params):
33
 def request(query, params):
34
+    language = params['language'].split('_')[0]
35
+    if language == 'all':
36
+        language = 'en'
37
+
29
     params['url'] = url_search.format(
38
     params['url'] = url_search.format(
30
-        query=urlencode({'srsearch': query,
31
-                        'srlimit': result_count}))
39
+        query=urlencode({'label': query,
40
+                        'language': language}))
32
     return params
41
     return params
33
 
42
 
34
 
43
 
35
 def response(resp):
44
 def response(resp):
36
     results = []
45
     results = []
37
-    search_res = json.loads(resp.text)
38
-
39
-    wikidata_ids = set()
40
-    for r in search_res.get('query', {}).get('search', {}):
41
-        wikidata_ids.add(r.get('title', ''))
46
+    html = fromstring(resp.content)
47
+    wikidata_ids = html.xpath(wikidata_ids_xpath)
42
 
48
 
43
     language = resp.search_params['language'].split('_')[0]
49
     language = resp.search_params['language'].split('_')[0]
44
     if language == 'all':
50
     if language == 'all':
49
 
55
 
50
     htmlresponse = get(url)
56
     htmlresponse = get(url)
51
     jsonresponse = json.loads(htmlresponse.content)
57
     jsonresponse = json.loads(htmlresponse.content)
52
-    for wikidata_id in wikidata_ids:
58
+    for wikidata_id in wikidata_ids[:result_count]:
53
         results = results + getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'])
59
         results = results + getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'])
54
 
60
 
55
     return results
61
     return results
82
     claims = result.get('claims', {})
88
     claims = result.get('claims', {})
83
     official_website = get_string(claims, 'P856', None)
89
     official_website = get_string(claims, 'P856', None)
84
     if official_website is not None:
90
     if official_website is not None:
85
-        urls.append({'title': 'Official site', 'url': official_website})
91
+        urls.append({'title': get_label('P856', language), 'url': official_website})
86
         results.append({'title': title, 'url': official_website})
92
         results.append({'title': title, 'url': official_website})
87
 
93
 
88
     wikipedia_link_count = 0
94
     wikipedia_link_count = 0
124
             'Commons wiki',
130
             'Commons wiki',
125
             get_wikilink(result, 'commonswiki'))
131
             get_wikilink(result, 'commonswiki'))
126
 
132
 
133
+    # Location
127
     add_url(urls,
134
     add_url(urls,
128
-            'Location',
135
+            get_label('P625', language),
129
             get_geolink(claims, 'P625', None))
136
             get_geolink(claims, 'P625', None))
130
 
137
 
131
     add_url(urls,
138
     add_url(urls,
169
 
176
 
170
     postal_code = get_string(claims, 'P281', None)
177
     postal_code = get_string(claims, 'P281', None)
171
     if postal_code is not None:
178
     if postal_code is not None:
172
-        attributes.append({'label': 'Postal code(s)', 'value': postal_code})
179
+        attributes.append({'label': get_label('P281', language), 'value': postal_code})
173
 
180
 
174
     date_of_birth = get_time(claims, 'P569', locale, None)
181
     date_of_birth = get_time(claims, 'P569', locale, None)
175
     if date_of_birth is not None:
182
     if date_of_birth is not None:
176
-        attributes.append({'label': 'Date of birth', 'value': date_of_birth})
183
+        attributes.append({'label': get_label('P569', language), 'value': date_of_birth})
177
 
184
 
178
     date_of_death = get_time(claims, 'P570', locale, None)
185
     date_of_death = get_time(claims, 'P570', locale, None)
179
     if date_of_death is not None:
186
     if date_of_death is not None:
180
-        attributes.append({'label': 'Date of death', 'value': date_of_death})
187
+        attributes.append({'label': get_label('P570', language), 'value': date_of_death})
181
 
188
 
182
     if len(attributes) == 0 and len(urls) == 2 and len(description) == 0:
189
     if len(attributes) == 0 and len(urls) == 2 and len(description) == 0:
183
         results.append({
190
         results.append({
321
         if k.endswith(wikipatternid) and len(k) == (2 + len(wikipatternid)):
328
         if k.endswith(wikipatternid) and len(k) == (2 + len(wikipatternid)):
322
             return k[0:2]
329
             return k[0:2]
323
     return None
330
     return None
331
+
332
+
333
+def get_label(entity_id, language):
334
+    url = url_entity_label.format(query=urlencode({'ids': entity_id,
335
+                                                   'languages': language + '|en'}))
336
+
337
+    response = get(url)
338
+    jsonresponse = json.loads(response.text)
339
+    label = jsonresponse.get('entities', {}).get(entity_id, {}).get('labels', {}).get(language, {}).get('value', None)
340
+    if label is None:
341
+        label = jsonresponse['entities'][entity_id]['labels']['en']['value']
342
+
343
+    return label