
wikidata refactor and more attributes (see issue #560)

marc, 8 years ago · commit a0a1284998
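
At its core, this commit swaps the engine's data source: instead of one wbgetentities JSON call covering every entity id, each id is now fetched as a rendered entity page via MediaWiki's action=parse and scraped with XPath. A rough standalone sketch of the new fetch, simplified from the diff below (plain requests stands in for searx's pooled get; the real engine requests a longer prop list):

# Sketch of the new per-entity fetch (simplified; the engine uses
# searx.poolrequests.get rather than requests).
from json import loads
from urllib import urlencode  # Python 2, as in the engine

import requests

wikidata_api = 'https://www.wikidata.org/w/api.php'
url_detail = wikidata_api \
    + '?action=parse&format=json&{query}' \
    + '&redirects=1&prop=text%7Cdisplaytitle&formatversion=2'

url = url_detail.format(query=urlencode({'page': 'Q42', 'uselang': 'en'}))
page = loads(requests.get(url).content)
title_html = page.get('parse', {}).get('displaytitle', {})
result_html = page.get('parse', {}).get('text', {})  # HTML that getDetail() scrapes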

.gitignore (+1 −0)

@@ -1,4 +1,5 @@
 .coverage
+coverage/
 .installed.cfg
 engines.cfg
 env

searx/engines/wikidata.py (+367 −235)

@@ -1,33 +1,57 @@
-import json
+# -*- coding: utf-8 -*-
+"""
+ Wikidata
+
+ @website     https://wikidata.org
+ @provide-api yes (https://wikidata.org/w/api.php)
+
+ @using-api   partially (most things require scraping)
+ @results     JSON, HTML
+ @stable      no (html can change)
+ @parse       url, infobox
+"""
 
 from searx import logger
 from searx.poolrequests import get
-from searx.utils import format_date_by_locale
+from searx.engines.xpath import extract_text
 
-from datetime import datetime
-from dateutil.parser import parse as dateutil_parse
-from urllib import urlencode
+from json import loads
 from lxml.html import fromstring
-
+from urllib import urlencode
 
 logger = logger.getChild('wikidata')
 result_count = 1
+
+# urls
 wikidata_host = 'https://www.wikidata.org'
 url_search = wikidata_host \
     + '/wiki/Special:ItemDisambiguation?{query}'
 
 wikidata_api = wikidata_host + '/w/api.php'
 url_detail = wikidata_api\
-    + '?action=wbgetentities&format=json'\
-    + '&props=labels%7Cinfo%7Csitelinks'\
-    + '%7Csitelinks%2Furls%7Cdescriptions%7Cclaims'\
-    + '&{query}'
+    + '?action=parse&format=json&{query}'\
+    + '&redirects=1&prop=text%7Cdisplaytitle%7Clanglinks%7Crevid'\
+    + '&disableeditsection=1&disabletidy=1&preview=1&sectionpreview=1&disabletoc=1&utf8=1&formatversion=2'
+
 url_map = 'https://www.openstreetmap.org/'\
     + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M'
-url_entity_label = wikidata_api\
-    + '?action=wbgetentities&format=json&props=labels&{query}'
+url_image = 'https://commons.wikimedia.org/wiki/Special:FilePath/{filename}?width=500'
 
+# xpaths
 wikidata_ids_xpath = '//div/ul[@class="wikibase-disambiguation"]/li/a/@title'
+title_xpath = '//*[contains(@class,"wikibase-title-label")]'
+description_xpath = '//div[contains(@class,"wikibase-entitytermsview-heading-description")]'
+property_xpath = '//div[@id="{propertyid}"]'
+label_xpath = './/div[contains(@class,"wikibase-statementgroupview-property-label")]/a'
+url_xpath = './/a[contains(@class,"external free") or contains(@class, "wb-external-id")]'
+wikilink_xpath = './/ul[contains(@class,"wikibase-sitelinklistview-listview")]'\
+    + '/li[contains(@data-wb-siteid,"{wikiid}")]//a/@href'
+property_row_xpath = './/div[contains(@class,"wikibase-statementview")]'
+preferred_rank_xpath = './/span[contains(@class,"wikibase-rankselector-preferred")]'
+value_xpath = './/div[contains(@class,"wikibase-statementview-mainsnak")]'\
+    + '/*/div[contains(@class,"wikibase-snakview-value")]'
+language_fallback_xpath = '//sup[contains(@class,"wb-language-fallback-indicator")]'
+calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]'
 
 
 def request(query, params):
@@ -50,13 +74,13 @@ def response(resp):
     if language == 'all':
         language = 'en'
 
-    url = url_detail.format(query=urlencode({'ids': '|'.join(wikidata_ids),
-                                            'languages': language + '|en'}))
-
-    htmlresponse = get(url)
-    jsonresponse = json.loads(htmlresponse.content)
+    # TODO: make requests asynchronous to avoid timeout when result_count > 1
     for wikidata_id in wikidata_ids[:result_count]:
-        results = results + getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'])
+        url = url_detail.format(query=urlencode({'page': wikidata_id,
+                                                'uselang': language}))
+        htmlresponse = get(url)
+        jsonresponse = loads(htmlresponse.content)
+        results += getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'])
 
     return results
 
@@ -66,125 +90,194 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
     urls = []
     attributes = []
 
-    result = jsonresponse.get('entities', {}).get(wikidata_id, {})
+    title = jsonresponse.get('parse', {}).get('displaytitle', {})
+    result = jsonresponse.get('parse', {}).get('text', {})
 
-    title = result.get('labels', {}).get(language, {}).get('value', None)
-    if title is None:
-        title = result.get('labels', {}).get('en', {}).get('value', None)
-    if title is None:
+    if not title or not result:
         return results
 
-    description = result\
-        .get('descriptions', {})\
-        .get(language, {})\
-        .get('value', None)
+    title = fromstring(title)
+    for elem in title.xpath(language_fallback_xpath):
+        elem.getparent().remove(elem)
+    title = extract_text(title.xpath(title_xpath))
 
-    if description is None:
-        description = result\
-            .get('descriptions', {})\
-            .get('en', {})\
-            .get('value', '')
+    result = fromstring(result)
+    for elem in result.xpath(language_fallback_xpath):
+        elem.getparent().remove(elem)
 
-    claims = result.get('claims', {})
-    official_website = get_string(claims, 'P856', None)
-    if official_website is not None:
-        urls.append({'title': get_label('P856', language), 'url': official_website})
-        results.append({'title': title, 'url': official_website})
+    description = extract_text(result.xpath(description_xpath))
 
-    wikipedia_link_count = 0
-    wikipedia_link = get_wikilink(result, language + 'wiki')
-    wikipedia_link_count += add_url(urls,
-                                    'Wikipedia (' + language + ')',
-                                    wikipedia_link)
-    if language != 'en':
-        wikipedia_en_link = get_wikilink(result, 'enwiki')
-        wikipedia_link_count += add_url(urls,
-                                        'Wikipedia (en)',
-                                        wikipedia_en_link)
-    if wikipedia_link_count == 0:
-        misc_language = get_wiki_firstlanguage(result, 'wiki')
-        if misc_language is not None:
-            add_url(urls,
-                    'Wikipedia (' + misc_language + ')',
-                    get_wikilink(result, misc_language + 'wiki'))
+    # URLS
 
-    if language != 'en':
-        add_url(urls,
-                'Wiki voyage (' + language + ')',
-                get_wikilink(result, language + 'wikivoyage'))
+    # official website
+    add_url(urls, result, 'P856', results=results)
 
-    add_url(urls,
-            'Wiki voyage (en)',
-            get_wikilink(result, 'enwikivoyage'))
+    # wikipedia
+    wikipedia_link_count = 0
+    wikipedia_link = get_wikilink(result, language + 'wiki')
+    if wikipedia_link:
+        wikipedia_link_count += 1
+        urls.append({'title': 'Wikipedia (' + language + ')',
+                     'url': wikipedia_link})
 
     if language != 'en':
-        add_url(urls,
-                'Wikiquote (' + language + ')',
-                get_wikilink(result, language + 'wikiquote'))
-
-    add_url(urls,
-            'Wikiquote (en)',
-            get_wikilink(result, 'enwikiquote'))
-
-    add_url(urls,
-            'Commons wiki',
-            get_wikilink(result, 'commonswiki'))
-
-    # Location
-    add_url(urls,
-            get_label('P625', language),
-            get_geolink(claims, 'P625', None))
-
-    add_url(urls,
-            'Wikidata',
-            'https://www.wikidata.org/wiki/'
-            + wikidata_id + '?uselang=' + language)
-
-    musicbrainz_work_id = get_string(claims, 'P435')
-    if musicbrainz_work_id is not None:
-        add_url(urls,
-                'MusicBrainz',
-                'http://musicbrainz.org/work/'
-                + musicbrainz_work_id)
-
-    musicbrainz_artist_id = get_string(claims, 'P434')
-    if musicbrainz_artist_id is not None:
-        add_url(urls,
-                'MusicBrainz',
-                'http://musicbrainz.org/artist/'
-                + musicbrainz_artist_id)
-
-    musicbrainz_release_group_id = get_string(claims, 'P436')
-    if musicbrainz_release_group_id is not None:
-        add_url(urls,
-                'MusicBrainz',
-                'http://musicbrainz.org/release-group/'
-                + musicbrainz_release_group_id)
-
-    musicbrainz_label_id = get_string(claims, 'P966')
-    if musicbrainz_label_id is not None:
-        add_url(urls,
-                'MusicBrainz',
-                'http://musicbrainz.org/label/'
-                + musicbrainz_label_id)
-
-    # musicbrainz_area_id = get_string(claims, 'P982')
-    # P1407 MusicBrainz series ID
-    # P1004 MusicBrainz place ID
-    # P1330 MusicBrainz instrument ID
-    # P1407 MusicBrainz series ID
-
-    postal_code = get_string(claims, 'P281', None)
-    if postal_code is not None:
-        attributes.append({'label': get_label('P281', language), 'value': postal_code})
-
-    date_of_birth = get_time(claims, 'P569', locale, None)
-    if date_of_birth is not None:
-        attributes.append({'label': get_label('P569', language), 'value': date_of_birth})
-
-    date_of_death = get_time(claims, 'P570', locale, None)
-    if date_of_death is not None:
-        attributes.append({'label': get_label('P570', language), 'value': date_of_death})
+        wikipedia_en_link = get_wikilink(result, 'enwiki')
+        if wikipedia_en_link:
+            wikipedia_link_count += 1
+            urls.append({'title': 'Wikipedia (en)',
+                         'url': wikipedia_en_link})
+
+    # TODO: get_wiki_firstlanguage
+    # if wikipedia_link_count == 0:
+
+    # more wikis
+    add_url(urls, result, default_label='Wikivoyage (' + language + ')', link_type=language + 'wikivoyage')
+    add_url(urls, result, default_label='Wikiquote (' + language + ')', link_type=language + 'wikiquote')
+    add_url(urls, result, default_label='Wikimedia Commons', link_type='commonswiki')
+
+    add_url(urls, result, 'P625', 'OpenStreetMap', link_type='geo')
+
+    # musicbrainz
+    add_url(urls, result, 'P434', 'MusicBrainz', 'http://musicbrainz.org/artist/')
+    add_url(urls, result, 'P435', 'MusicBrainz', 'http://musicbrainz.org/work/')
+    add_url(urls, result, 'P436', 'MusicBrainz', 'http://musicbrainz.org/release-group/')
+    add_url(urls, result, 'P966', 'MusicBrainz', 'http://musicbrainz.org/label/')
+
+    # IMDb
+    add_url(urls, result, 'P345', 'IMDb', 'https://www.imdb.com/', link_type='imdb')
+    # source code repository
+    add_url(urls, result, 'P1324')
+    # blog
+    add_url(urls, result, 'P1581')
+    # social media links
+    add_url(urls, result, 'P2397', 'YouTube', 'https://www.youtube.com/channel/')
+    add_url(urls, result, 'P1651', 'YouTube', 'https://www.youtube.com/watch?v=')
+    add_url(urls, result, 'P2002', 'Twitter', 'https://twitter.com/')
+    add_url(urls, result, 'P2013', 'Facebook', 'https://facebook.com/')
+    add_url(urls, result, 'P2003', 'Instagram', 'https://instagram.com/')
+
+    urls.append({'title': 'Wikidata',
+                 'url': 'https://www.wikidata.org/wiki/'
+                 + wikidata_id + '?uselang=' + language})
+
+    # INFOBOX ATTRIBUTES (ROWS)
+
+    # inception date
+    add_attribute(attributes, result, 'P571', date=True)
+    # dissolution date
+    add_attribute(attributes, result, 'P576', date=True)
+    # start date
+    add_attribute(attributes, result, 'P580', date=True)
+    # end date
+    add_attribute(attributes, result, 'P582', date=True)
+
+    # date of birth
+    add_attribute(attributes, result, 'P569', date=True)
+    # date of death
+    add_attribute(attributes, result, 'P570', date=True)
+
+    # nationality
+    add_attribute(attributes, result, 'P27')
+    # country of origin
+    add_attribute(attributes, result, 'P495')
+    # country
+    add_attribute(attributes, result, 'P17')
+    # headquarters
+    add_attribute(attributes, result, 'Q180')
+
+    # PLACES
+    # capital
+    add_attribute(attributes, result, 'P36', trim=True)
+    # head of state
+    add_attribute(attributes, result, 'P35', trim=True)
+    # head of government
+    add_attribute(attributes, result, 'P6', trim=True)
+    # type of government
+    add_attribute(attributes, result, 'P122')
+    # official language
+    add_attribute(attributes, result, 'P37')
+    # population
+    add_attribute(attributes, result, 'P1082', trim=True)
+    # area
+    add_attribute(attributes, result, 'P2046')
+    # currency
+    add_attribute(attributes, result, 'P38')
+    # height (building)
+    add_attribute(attributes, result, 'P2048')
+
+    # MEDIA
+    # platform (videogames)
+    add_attribute(attributes, result, 'P400')
+    # author
+    add_attribute(attributes, result, 'P50')
+    # creator
+    add_attribute(attributes, result, 'P170')
+    # director
+    add_attribute(attributes, result, 'P57')
+    # performer
+    add_attribute(attributes, result, 'P175')
+    # developer
+    add_attribute(attributes, result, 'P178')
+    # producer
+    add_attribute(attributes, result, 'P162')
+    # manufacturer
+    add_attribute(attributes, result, 'P176')
+    # screenwriter
+    add_attribute(attributes, result, 'P58')
+    # production company
+    add_attribute(attributes, result, 'P272')
+    # record label
+    add_attribute(attributes, result, 'P264')
+    # publisher
+    add_attribute(attributes, result, 'P123')
+    # composer
+    add_attribute(attributes, result, 'P86')
+    # publication date
+    add_attribute(attributes, result, 'P577', date=True)
+    # genre
+    add_attribute(attributes, result, 'P136')
+    # original language
+    add_attribute(attributes, result, 'P364')
+    # isbn
+    add_attribute(attributes, result, 'Q33057')
+    # software license
+    add_attribute(attributes, result, 'P275')
+    # programming language
+    add_attribute(attributes, result, 'P277')
+    # version
+    add_attribute(attributes, result, 'P348', trim=True)
+    # narrative location
+    add_attribute(attributes, result, 'P840')
+
+    # LANGUAGES
+    # number of speakers
+    add_attribute(attributes, result, 'P1098')
+    # writing system
+    add_attribute(attributes, result, 'P282')
+    # regulatory body
+    add_attribute(attributes, result, 'P1018')
+    # language code
+    add_attribute(attributes, result, 'P218')
+
+    # OTHER
+    # ceo
+    add_attribute(attributes, result, 'P169', trim=True)
+    # founder
+    add_attribute(attributes, result, 'P112')
+    # legal form (company/organization)
+    add_attribute(attributes, result, 'P1454')
+    # taxon
+    add_attribute(attributes, result, 'P225')
+    # chemical formula
+    add_attribute(attributes, result, 'P274')
+    # winner (sports/contests)
+    add_attribute(attributes, result, 'P1346')
+    # number of deaths
+    add_attribute(attributes, result, 'P1120')
+    # currency code
+    add_attribute(attributes, result, 'P498')
+
+    image = add_image(result)
 
     if len(attributes) == 0 and len(urls) == 2 and len(description) == 0:
         results.append({
@@ -197,6 +290,7 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
                        'infobox': title,
                        'id': wikipedia_link,
                        'content': description,
+                       'img_src': image,
                        'attributes': attributes,
                        'urls': urls
                        })
@@ -204,92 +298,149 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
     return results
 
 
-def add_url(urls, title, url):
-    if url is not None:
-        urls.append({'title': title, 'url': url})
-        return 1
+# only returns first match
+def add_image(result):
+    # P18: image, P154: logo, P242: map, P41: flag, P2716: collage, P2910: icon
+    property_ids = ['P18', 'P154', 'P242', 'P41', 'P2716', 'P2910']
+
+    for property_id in property_ids:
+        image = result.xpath(property_xpath.replace('{propertyid}', property_id))
+        if image:
+            image_name = image[0].xpath(value_xpath)
+            image_src = url_image.replace('{filename}', extract_text(image_name[0]))
+            return image_src
+
+
+# setting trim will only return high-ranked rows OR the first row
+def add_attribute(attributes, result, property_id, default_label=None, date=False, trim=False):
+    attribute = result.xpath(property_xpath.replace('{propertyid}', property_id))
+    if attribute:
+
+        if default_label:
+            label = default_label
+        else:
+            label = extract_text(attribute[0].xpath(label_xpath))
+
+        if date:
+            trim = True
+            # remove calendar name
+            calendar_name = attribute[0].xpath(calendar_name_xpath)
+            for calendar in calendar_name:
+                calendar.getparent().remove(calendar)
+
+        concat_values = ""
+        values = []
+        first_value = None
+        for row in attribute[0].xpath(property_row_xpath):
+            if not first_value or not trim or row.xpath(preferred_rank_xpath):
+
+                value = row.xpath(value_xpath)
+                if not value:
+                    continue
+                value = extract_text(value)
+
+                # save first value in case no ranked row is found
+                if trim and not first_value:
+                    first_value = value
+                else:
+                    # to avoid duplicate values
+                    if value not in values:
+                        concat_values += value + ", "
+                        values.append(value)
+
+        if trim and not values:
+            attributes.append({'label': label,
+                               'value': first_value})
+        else:
+            attributes.append({'label': label,
+                               'value': concat_values[:-2]})
+
+
+# requires property_id unless it's a wiki link (defined in link_type)
+def add_url(urls, result, property_id=None, default_label=None, url_prefix=None, results=None, link_type=None):
+    links = []
+
+    # wiki links don't have a property on the wikidata page
+    if link_type and 'wiki' in link_type:
+        links.append(get_wikilink(result, link_type))
     else:
-        return 0
+        dom_element = result.xpath(property_xpath.replace('{propertyid}', property_id))
+        if dom_element:
+            dom_element = dom_element[0]
+            if not default_label:
+                label = extract_text(dom_element.xpath(label_xpath))
+
+            if link_type == 'geo':
+                links.append(get_geolink(dom_element))
+
+            elif link_type == 'imdb':
+                links.append(get_imdblink(dom_element, url_prefix))
+
+            else:
+                url_results = dom_element.xpath(url_xpath)
+                for link in url_results:
+                    if link is not None:
+                        if url_prefix:
+                            link = url_prefix + extract_text(link)
+                        else:
+                            link = extract_text(link)
+                        links.append(link)
+
+    # append urls
+    for url in links:
+        if url is not None:
+            urls.append({'title': default_label or label,
+                         'url': url})
+            if results is not None:
+                results.append({'title': default_label or label,
+                                'url': url})
+
+
+def get_imdblink(result, url_prefix):
+    imdb_id = result.xpath(value_xpath)
+    if imdb_id:
+        imdb_id = extract_text(imdb_id)
+        id_prefix = imdb_id[:2]
+        if id_prefix == 'tt':
+            url = url_prefix + 'title/' + imdb_id
+        elif id_prefix == 'nm':
+            url = url_prefix + 'name/' + imdb_id
+        elif id_prefix == 'ch':
+            url = url_prefix + 'character/' + imdb_id
+        elif id_prefix == 'co':
+            url = url_prefix + 'company/' + imdb_id
+        elif id_prefix == 'ev':
+            url = url_prefix + 'event/' + imdb_id
+        else:
+            url = None
+        return url
 
 
-def get_mainsnak(claims, propertyName):
-    propValue = claims.get(propertyName, {})
-    if len(propValue) == 0:
+def get_geolink(result):
+    coordinates = result.xpath(value_xpath)
+    if not coordinates:
         return None
-
-    propValue = propValue[0].get('mainsnak', None)
-    return propValue
-
-
-def get_string(claims, propertyName, defaultValue=None):
-    propValue = claims.get(propertyName, {})
-    if len(propValue) == 0:
-        return defaultValue
-
-    result = []
-    for e in propValue:
-        mainsnak = e.get('mainsnak', {})
-
-        datavalue = mainsnak.get('datavalue', {})
-        if datavalue is not None:
-            result.append(datavalue.get('value', ''))
-
-    if len(result) == 0:
-        return defaultValue
-    else:
-        # TODO handle multiple urls
-        return result[0]
-
-
-def get_time(claims, propertyName, locale, defaultValue=None):
-    propValue = claims.get(propertyName, {})
-    if len(propValue) == 0:
-        return defaultValue
-
-    result = []
-    for e in propValue:
-        mainsnak = e.get('mainsnak', {})
-
-        datavalue = mainsnak.get('datavalue', {})
-        if datavalue is not None:
-            value = datavalue.get('value', '')
-            result.append(value.get('time', ''))
-
-    if len(result) == 0:
-        date_string = defaultValue
-    else:
-        date_string = ', '.join(result)
-
-    try:
-        parsed_date = datetime.strptime(date_string, "+%Y-%m-%dT%H:%M:%SZ")
-    except:
-        if date_string.startswith('-'):
-            return date_string.split('T')[0]
-        try:
-            parsed_date = dateutil_parse(date_string, fuzzy=False, default=False)
-        except:
-            logger.debug('could not parse date %s', date_string)
-            return date_string.split('T')[0]
-
-    return format_date_by_locale(parsed_date, locale)
-
-
-def get_geolink(claims, propertyName, defaultValue=''):
-    mainsnak = get_mainsnak(claims, propertyName)
-
-    if mainsnak is None:
-        return defaultValue
-
-    datatype = mainsnak.get('datatype', '')
-    datavalue = mainsnak.get('datavalue', {})
-
-    if datatype != 'globe-coordinate':
-        return defaultValue
-
-    value = datavalue.get('value', {})
-
-    precision = value.get('precision', 0.0002)
-
+    coordinates = extract_text(coordinates[0])
+    latitude, longitude = coordinates.split(',')
+
+    # convert to decimal
+    lat = int(latitude[:latitude.find(u'°')])
+    if latitude.find('\'') >= 0:
+        lat += int(latitude[latitude.find(u'°') + 1:latitude.find('\'')] or 0) / 60.0
+    if latitude.find('"') >= 0:
+        lat += float(latitude[latitude.find('\'') + 1:latitude.find('"')] or 0) / 3600.0
+    if latitude.find('S') >= 0:
+        lat *= -1
+    lon = int(longitude[:longitude.find(u'°')])
+    if longitude.find('\'') >= 0:
+        lon += int(longitude[longitude.find(u'°') + 1:longitude.find('\'')] or 0) / 60.0
+    if longitude.find('"') >= 0:
+        lon += float(longitude[longitude.find('\'') + 1:longitude.find('"')] or 0) / 3600.0
+    if longitude.find('W') >= 0:
+        lon *= -1
+
+    # TODO: get precision
+    precision = 0.0002
     # there is no zoom information, deduce from precision (error prone)
     # samples :
     # 13 --> 5
@@ -305,39 +456,20 @@ def get_geolink(claims, propertyName, defaultValue=''):
         zoom = int(15 - precision * 8.8322 + precision * precision * 0.625447)
 
     url = url_map\
-        .replace('{latitude}', str(value.get('latitude', 0)))\
-        .replace('{longitude}', str(value.get('longitude', 0)))\
+        .replace('{latitude}', str(lat))\
+        .replace('{longitude}', str(lon))\
         .replace('{zoom}', str(zoom))
 
     return url
 
 
 def get_wikilink(result, wikiid):
-    url = result.get('sitelinks', {}).get(wikiid, {}).get('url', None)
-    if url is None:
-        return url
-    elif url.startswith('http://'):
+    url = result.xpath(wikilink_xpath.replace('{wikiid}', wikiid))
+    if not url:
+        return None
+    url = url[0]
+    if url.startswith('http://'):
         url = url.replace('http://', 'https://')
     elif url.startswith('//'):
         url = 'https:' + url
     return url
-
-
-def get_wiki_firstlanguage(result, wikipatternid):
-    for k in result.get('sitelinks', {}).keys():
-        if k.endswith(wikipatternid) and len(k) == (2 + len(wikipatternid)):
-            return k[0:2]
-    return None
-
-
-def get_label(entity_id, language):
-    url = url_entity_label.format(query=urlencode({'ids': entity_id,
-                                                   'languages': language + '|en'}))
-
-    response = get(url)
-    jsonresponse = json.loads(response.text)
-    label = jsonresponse.get('entities', {}).get(entity_id, {}).get('labels', {}).get(language, {}).get('value', None)
-    if label is None:
-        label = jsonresponse['entities'][entity_id]['labels']['en']['value']
-
-    return label
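
The trickiest part of the new scraping code is get_geolink(), which has to turn a human-readable coordinate string such as 34°35'59"S, 58°22'55"W back into the decimal degrees that the OpenStreetMap URL expects. A minimal standalone sketch of that degrees/minutes/seconds conversion (dms_to_decimal is a hypothetical helper written for illustration, not part of the commit):

# -*- coding: utf-8 -*-
# Sketch of the DMS -> decimal conversion performed inline by get_geolink().
# dms_to_decimal is a hypothetical helper, shown here for clarity only.


def dms_to_decimal(dms):
    # degrees are mandatory, minutes and seconds are optional
    degrees = int(dms[:dms.find(u'°')])
    minutes = int(dms[dms.find(u'°') + 1:dms.find("'")] or 0) if "'" in dms else 0
    seconds = float(dms[dms.find("'") + 1:dms.find('"')] or 0) if '"' in dms else 0.0
    decimal = degrees + minutes / 60.0 + seconds / 3600.0
    # southern and western hemispheres are negative
    if 'S' in dms or 'W' in dms:
        decimal = -decimal
    return decimal


print(dms_to_decimal(u'34°35\'59"S'))  # -34.5997..., matching the test's lat=-34.59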

searx/templates/default/infobox.html (+7 −7)

@@ -1,18 +1,18 @@
 <div class="infobox">
-    <h2>{{ infobox.infobox }}</h2>
+    <h2><bdi>{{ infobox.infobox }}</bdi></h2>
     {% if infobox.img_src %}<img src="{{ image_proxify(infobox.img_src) }}" title="{{ infobox.infobox|striptags }}" alt="{{ infobox.infobox|striptags }}" />{% endif %}
-    <p>{{ infobox.entity }}</p>
-    <p>{{ infobox.content | safe }}</p>
+    <p><bdi>{{ infobox.entity }}</bdi></p>
+    <p><bdi>{{ infobox.content | safe }}</bdi></p>
     {% if infobox.attributes %}
     <div class="attributes">
         <table>
             {% for attribute in infobox.attributes %}
             <tr>
-                <td>{{ attribute.label }}</td>
+                <td><bdi>{{ attribute.label }}</bdi></td>
                 {% if attribute.image %}
                 <td><img src="{{ image_proxify(attribute.image.src) }}" alt="{{ attribute.image.alt }}" /></td>
                 {% else %}
-                <td>{{ attribute.value }}</td>
+                <td><bdi>{{ attribute.value }}</bdi></td>
                 {% endif %}
             </tr>
             {% endfor %}
@@ -24,7 +24,7 @@
     <div class="urls">
         <ul>
             {% for url in infobox.urls %}
-            <li class="url"><a href="{{ url.url }}" rel="noreferrer">{{ url.title }}</a></li>
+            <li class="url"><bdi><a href="{{ url.url }}" rel="noreferrer">{{ url.title }}</a></bdi></li>
             {% endfor %}
         </ul>
     </div>
@@ -34,7 +34,7 @@
     <div class="relatedTopics">
         {% for topic in infobox.relatedTopics %}
         <div>
-            <h3>{{ topic.name }}</h3>
+            <h3><bdi>{{ topic.name }}</bdi></h3>
             {% for suggestion in topic.suggestions %}
             <form method="{{ method or 'POST' }}" action="{{ url_for('index') }}">
                 <input type="hidden" name="q" value="{{ suggestion }}">

searx/templates/oscar/infobox.html (+6 −6)

@@ -1,21 +1,20 @@
 <div class="panel panel-default infobox">
     <div class="panel-heading">
-        <bdi><h4 class="panel-title infobox_part">{{ infobox.infobox }}</h4></bdi>
+        <h4 class="panel-title infobox_part"><bdi>{{ infobox.infobox }}</bdi></h4>
     </div>
     <div class="panel-body">
-        <bdi>
         {% if infobox.img_src %}<img class="img-responsive center-block infobox_part" src="{{ image_proxify(infobox.img_src) }}" alt="{{ infobox.infobox }}" />{% endif %}
-        {% if infobox.content %}<p class="infobox_part">{{ infobox.content }}</p>{% endif %}
+        {% if infobox.content %}<p class="infobox_part"><bdi>{{ infobox.content }}</bdi></p>{% endif %}
 
         {% if infobox.attributes %}
         <table class="table table-striped infobox_part">
             {% for attribute in infobox.attributes %}
             <tr>
-                <td>{{ attribute.label }}</td>
+                <td><bdi>{{ attribute.label }}</bdi></td>
                 {% if attribute.image %}
                 <td><img class="img-responsive" src="{{ image_proxify(attribute.image.src) }}" alt="{{ attribute.image.alt }}" /></td>
                 {% else %}
-                <td>{{ attribute.value }}</td>
+                <td><bdi>{{ attribute.value }}</bdi></td>
                 {% endif %}
             </tr>
             {% endfor %}
@@ -24,11 +23,12 @@
 
         {% if infobox.urls %}
         <div class="infobox_part">
+            <bdi>
             {% for url in infobox.urls %}
             <p class="btn btn-default btn-xs"><a href="{{ url.url }}" rel="noreferrer">{{ url.title }}</a></p>
             {% endfor %}
+            </bdi>
         </div>
         {% endif %}
-        </bdi>
     </div>
 </div>

tests/unit/engines/test_wikidata.py (+502 −0)

@@ -0,0 +1,502 @@
+# -*- coding: utf-8 -*-
+from json import loads
+from lxml.html import fromstring
+from collections import defaultdict
+import mock
+from searx.engines import wikidata
+from searx.testing import SearxTestCase
+
+
+class TestWikidataEngine(SearxTestCase):
+
+    def test_request(self):
+        query = 'test_query'
+        dicto = defaultdict(dict)
+        dicto['language'] = 'all'
+        params = wikidata.request(query, dicto)
+        self.assertIn('url', params)
+        self.assertIn(query, params['url'])
+        self.assertIn('wikidata.org', params['url'])
+        self.assertIn('en', params['url'])
+
+        dicto['language'] = 'es_ES'
+        params = wikidata.request(query, dicto)
+        self.assertIn(query, params['url'])
+        self.assertIn('es', params['url'])
+
+    # successful cases are not tested here to avoid sending additional requests
+    def test_response(self):
+        self.assertRaises(AttributeError, wikidata.response, None)
+        self.assertRaises(AttributeError, wikidata.response, [])
+        self.assertRaises(AttributeError, wikidata.response, '')
+        self.assertRaises(AttributeError, wikidata.response, '[]')
+
+        response = mock.Mock(content='<html></html>', search_params={"language": "all"})
+        self.assertEqual(wikidata.response(response), [])
+
+    def test_getDetail(self):
+        response = {}
+        results = wikidata.getDetail(response, "Q123", "en", "en-US")
+        self.assertEqual(results, [])
+
+        title_html = '<div><div class="wikibase-title-label">Test</div></div>'
+        html = """
+        <div>
+            <div class="wikibase-entitytermsview-heading-description">
+            </div>
+            <div>
+                <ul class="wikibase-sitelinklistview-listview">
+                    <li data-wb-siteid="enwiki"><a href="http://en.wikipedia.org/wiki/Test">Test</a></li>
+                </ul>
+            </div>
+        </div>
+        """
+        response = {"parse": {"displaytitle": title_html, "text": html}}
+
+        results = wikidata.getDetail(response, "Q123", "en", "en-US")
+        self.assertEqual(len(results), 1)
+        self.assertEqual(results[0]['url'], 'https://en.wikipedia.org/wiki/Test')
+
+        title_html = """
+        <div>
+            <div class="wikibase-title-label">
+                <span lang="en">Test</span>
+                <sup class="wb-language-fallback-indicator">English</sup>
+            </div>
+        </div>
+        """
+        html = """
+        <div>
+            <div class="wikibase-entitytermsview-heading-description">
+                <span lang="en">Description</span>
+                <sup class="wb-language-fallback-indicator">English</sup>
+            </div>
+            <div id="P856">
+                <div class="wikibase-statementgroupview-property-label">
+                    <a href="/wiki/Property:P856">
+                        <span lang="en">official website</span>
+                        <sup class="wb-language-fallback-indicator">English</sup>
+                    </a>
+                </div>
+                <div class="wikibase-statementview-mainsnak">
+                    <a class="external free" href="https://officialsite.com">
+                        https://officialsite.com
+                    </a>
+                </div>
+            </div>
+            <div>
+                <ul class="wikibase-sitelinklistview-listview">
+                    <li data-wb-siteid="enwiki"><a href="http://en.wikipedia.org/wiki/Test">Test</a></li>
+                </ul>
+            </div>
+        </div>
+        """
+        response = {"parse": {"displaytitle": title_html, "text": html}}
+
+        results = wikidata.getDetail(response, "Q123", "yua", "yua_MX")
+        self.assertEqual(len(results), 2)
+        self.assertEqual(results[0]['title'], 'official website')
+        self.assertEqual(results[0]['url'], 'https://officialsite.com')
+
+        self.assertEqual(results[1]['infobox'], 'Test')
+        self.assertEqual(results[1]['id'], None)
+        self.assertEqual(results[1]['content'], 'Description')
+        self.assertEqual(results[1]['attributes'], [])
+        self.assertEqual(results[1]['urls'][0]['title'], 'official website')
+        self.assertEqual(results[1]['urls'][0]['url'], 'https://officialsite.com')
+        self.assertEqual(results[1]['urls'][1]['title'], 'Wikipedia (en)')
+        self.assertEqual(results[1]['urls'][1]['url'], 'https://en.wikipedia.org/wiki/Test')
+
+    def test_add_image(self):
+        image_src = wikidata.add_image(fromstring("<div></div>"))
+        self.assertEqual(image_src, None)
+
+        html = u"""
+        <div>
+            <div id="P18">
+                <div class="wikibase-statementgroupview-property-label">
+                    <a href="/wiki/Property:P18">
+                        image
+                    </a>
+                </div>
+                <div class="wikibase-statementlistview">
+                    <div class="wikibase-statementview listview-item">
+                        <div class="wikibase-statementview-rankselector">
+                            <span class="wikibase-rankselector-normal"></span>
+                        </div>
+                        <div class="wikibase-statementview-mainsnak">
+                            <div>
+                                <div class="wikibase-snakview-value">
+                                    <a href="https://commons.wikimedia.org/wiki/File:image.png">
+                                        image.png
+                                    </a>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </div>
+        """
+        html_etree = fromstring(html)
+
+        image_src = wikidata.add_image(html_etree)
+        self.assertEqual(image_src, "https://commons.wikimedia.org/wiki/Special:FilePath/image.png?width=500")
+
+        html = u"""
+        <div>
+            <div id="P2910">
+                <div class="wikibase-statementgroupview-property-label">
+                    <a href="/wiki/Property:P2910">
+                        icon
+                    </a>
+                </div>
+                <div class="wikibase-statementlistview">
+                    <div class="wikibase-statementview listview-item">
+                        <div class="wikibase-statementview-rankselector">
+                            <span class="wikibase-rankselector-normal"></span>
+                        </div>
+                        <div class="wikibase-statementview-mainsnak">
+                            <div>
+                                <div class="wikibase-snakview-value">
+                                    <a href="https://commons.wikimedia.org/wiki/File:icon.png">
+                                        icon.png
+                                    </a>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+            </div>
+            <div id="P154">
+                <div class="wikibase-statementgroupview-property-label">
+                    <a href="/wiki/Property:P154">
+                        logo
+                    </a>
+                </div>
+                <div class="wikibase-statementlistview">
+                    <div class="wikibase-statementview listview-item">
+                        <div class="wikibase-statementview-rankselector">
+                            <span class="wikibase-rankselector-normal"></span>
+                        </div>
+                        <div class="wikibase-statementview-mainsnak">
+                            <div>
+                                <div class="wikibase-snakview-value">
+                                    <a href="https://commons.wikimedia.org/wiki/File:logo.png">
+                                        logo.png
+                                    </a>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </div>
+        """
+        html_etree = fromstring(html)
+
+        image_src = wikidata.add_image(html_etree)
+        self.assertEqual(image_src, "https://commons.wikimedia.org/wiki/Special:FilePath/logo.png?width=500")
+
+    def test_add_attribute(self):
+        html = u"""
+        <div>
+            <div id="P27">
+                <div class="wikibase-statementgroupview-property-label">
+                    <a href="/wiki/Property:P27">
+                        country of citizenship
+                    </a>
+                </div>
+                <div class="wikibase-statementlistview">
+                    <div class="wikibase-statementview listview-item">
+                        <div class="wikibase-statementview-rankselector">
+                            <span class="wikibase-rankselector-normal"></span>
+                        </div>
+                        <div class="wikibase-statementview-mainsnak">
+                            <div>
+                                <div class="wikibase-snakview-value">
+                                    <a href="/wiki/Q145">
+                                        United Kingdom
+                                    </a>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </div>
+        """
+        attributes = []
+        html_etree = fromstring(html)
+
+        wikidata.add_attribute(attributes, html_etree, "Fail")
+        self.assertEqual(attributes, [])
+
+        wikidata.add_attribute(attributes, html_etree, "P27")
+        self.assertEqual(len(attributes), 1)
+        self.assertEqual(attributes[0]["label"], "country of citizenship")
+        self.assertEqual(attributes[0]["value"], "United Kingdom")
+
+        html = u"""
+        <div>
+            <div id="P569">
+                <div class="wikibase-statementgroupview-property-label">
+                    <a href="/wiki/Property:P569">
+                        date of birth
+                    </a>
+                </div>
+                <div class="wikibase-statementlistview">
+                    <div class="wikibase-statementview listview-item">
+                        <div class="wikibase-statementview-rankselector">
+                            <span class="wikibase-rankselector-normal"></span>
+                        </div>
+                        <div class="wikibase-statementview-mainsnak">
+                            <div>
+                                <div class="wikibase-snakview-value">
+                                    27 January 1832
+                                    <sup class="wb-calendar-name">
+                                        Gregorian
+                                    </sup>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </div>
+        """
+        attributes = []
+        html_etree = fromstring(html)
+        wikidata.add_attribute(attributes, html_etree, "P569", date=True)
+        self.assertEqual(len(attributes), 1)
+        self.assertEqual(attributes[0]["label"], "date of birth")
+        self.assertEqual(attributes[0]["value"], "27 January 1832")
+
+        html = u"""
+        <div>
+            <div id="P6">
+                <div class="wikibase-statementgroupview-property-label">
+                    <a href="/wiki/Property:P27">
+                        head of government
+                    </a>
+                </div>
+                <div class="wikibase-statementlistview">
+                    <div class="wikibase-statementview listview-item">
+                        <div class="wikibase-statementview-rankselector">
+                            <span class="wikibase-rankselector-normal"></span>
+                        </div>
+                        <div class="wikibase-statementview-mainsnak">
+                            <div>
+                                <div class="wikibase-snakview-value">
+                                    <a href="/wiki/Q206">
+                                        Old Prime Minister
+                                    </a>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+                    <div class="wikibase-statementview listview-item">
+                        <div class="wikibase-statementview-rankselector">
+                            <span class="wikibase-rankselector-preferred"></span>
+                        </div>
+                        <div class="wikibase-statementview-mainsnak">
+                            <div>
+                                <div class="wikibase-snakview-value">
+                                    <a href="/wiki/Q3099714">
+                                        Actual Prime Minister
+                                    </a>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </div>
+        """
+        attributes = []
+        html_etree = fromstring(html)
+        wikidata.add_attribute(attributes, html_etree, "P6")
+        self.assertEqual(len(attributes), 1)
+        self.assertEqual(attributes[0]["label"], "head of government")
+        self.assertEqual(attributes[0]["value"], "Old Prime Minister, Actual Prime Minister")
+
+        attributes = []
+        html_etree = fromstring(html)
+        wikidata.add_attribute(attributes, html_etree, "P6", trim=True)
+        self.assertEqual(len(attributes), 1)
+        self.assertEqual(attributes[0]["value"], "Actual Prime Minister")
+
+    def test_add_url(self):
+        html = u"""
+        <div>
+            <div id="P856">
+                <div class="wikibase-statementgroupview-property-label">
+                    <a href="/wiki/Property:P856">
+                        official website
+                    </a>
+                </div>
+                <div class="wikibase-statementlistview">
+                    <div class="wikibase-statementview listview-item">
+                        <div class="wikibase-statementview-mainsnak">
+                            <div>
+                                <div class="wikibase-snakview-value">
+                                    <a class="external free" href="https://searx.me">
+                                        https://searx.me/
+                                    </a>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </div>
+        """
+        urls = []
+        html_etree = fromstring(html)
+        wikidata.add_url(urls, html_etree, 'P856')
+        self.assertEquals(len(urls), 1)
+        self.assertIn({'title': 'official website', 'url': 'https://searx.me/'}, urls)
+        urls = []
+        results = []
+        wikidata.add_url(urls, html_etree, 'P856', 'custom label', results=results)
+        self.assertEquals(len(urls), 1)
+        self.assertEquals(len(results), 1)
+        self.assertIn({'title': 'custom label', 'url': 'https://searx.me/'}, urls)
+        self.assertIn({'title': 'custom label', 'url': 'https://searx.me/'}, results)
+
+        html = u"""
+        <div>
+            <div id="P856">
+                <div class="wikibase-statementgroupview-property-label">
+                    <a href="/wiki/Property:P856">
+                        official website
+                    </a>
+                </div>
+                <div class="wikibase-statementlistview">
+                    <div class="wikibase-statementview listview-item">
+                        <div class="wikibase-statementview-mainsnak">
+                            <div>
+                                <div class="wikibase-snakview-value">
+                                    <a class="external free" href="http://www.worldofwarcraft.com">
+                                        http://www.worldofwarcraft.com
+                                    </a>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+                    <div class="wikibase-statementview listview-item">
+                        <div class="wikibase-statementview-mainsnak">
+                            <div>
+                                <div class="wikibase-snakview-value">
+                                    <a class="external free" href="http://eu.battle.net/wow/en/">
+                                        http://eu.battle.net/wow/en/
+                                    </a>
+                                </div>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </div>
+        """
+        urls = []
+        html_etree = fromstring(html)
+        wikidata.add_url(urls, html_etree, 'P856')
+        self.assertEquals(len(urls), 2)
+        self.assertIn({'title': 'official website', 'url': 'http://www.worldofwarcraft.com'}, urls)
+        self.assertIn({'title': 'official website', 'url': 'http://eu.battle.net/wow/en/'}, urls)
+
+    def test_get_imdblink(self):
+        html = u"""
+        <div>
+            <div class="wikibase-statementview-mainsnak">
+                <div>
+                    <div class="wikibase-snakview-value">
+                        <a class="wb-external-id" href="http://www.imdb.com/tt0433664">
+                            tt0433664
+                        </a>
+                    </div>
+                </div>
+            </div>
+        </div>
+        """
+        html_etree = fromstring(html)
+        imdblink = wikidata.get_imdblink(html_etree, 'https://www.imdb.com/')
+        self.assertIn('https://www.imdb.com/title/tt0433664', imdblink)
+
+        html = u"""
+        <div>
+            <div class="wikibase-statementview-mainsnak">
+                <div>
+                    <div class="wikibase-snakview-value">
+                        <a class="wb-external-id"
+                           href="http://tools.wmflabs.org/...http://www.imdb.com/&id=nm4915994">
+                            nm4915994
+                        </a>
+                    </div>
+                </div>
+            </div>
+        </div>
+        """
+        html_etree = fromstring(html)
+        imdblink = wikidata.get_imdblink(html_etree, 'https://www.imdb.com/')
+        self.assertIn('https://www.imdb.com/name/nm4915994', imdblink)
+
+    def test_get_geolink(self):
+        html = u"""
+        <div>
+            <div class="wikibase-statementview-mainsnak">
+                <div>
+                    <div class="wikibase-snakview-value">
+                        60°N, 40°E
+                    </div>
+                </div>
+            </div>
+        </div>
+        """
+        html_etree = fromstring(html)
+        geolink = wikidata.get_geolink(html_etree)
+        self.assertIn('https://www.openstreetmap.org/', geolink)
+        self.assertIn('lat=60&lon=40', geolink)
+
+        html = u"""
+        <div>
+            <div class="wikibase-statementview-mainsnak">
+                <div>
+                    <div class="wikibase-snakview-value">
+                        34°35'59"S, 58°22'55"W
+                    </div>
+                </div>
+            </div>
+        </div>
+        """
+        html_etree = fromstring(html)
+        geolink = wikidata.get_geolink(html_etree)
+        self.assertIn('https://www.openstreetmap.org/', geolink)
+        self.assertIn('lat=-34.59', geolink)
+        self.assertIn('lon=-58.38', geolink)
+
+    def test_get_wikilink(self):
+        html = """
+        <div>
+            <div>
+                <ul class="wikibase-sitelinklistview-listview">
+                    <li data-wb-siteid="arwiki"><a href="http://ar.wikipedia.org/wiki/Test">Test</a></li>
+                    <li data-wb-siteid="enwiki"><a href="http://en.wikipedia.org/wiki/Test">Test</a></li>
+                </ul>
+            </div>
+            <div>
+                <ul class="wikibase-sitelinklistview-listview">
+                    <li data-wb-siteid="enwikiquote"><a href="https://en.wikiquote.org/wiki/Test">Test</a></li>
+                </ul>
+            </div>
+        </div>
+        """
+        html_etree = fromstring(html)
+        wikilink = wikidata.get_wikilink(html_etree, 'nowiki')
+        self.assertEqual(wikilink, None)
+        wikilink = wikidata.get_wikilink(html_etree, 'enwiki')
+        self.assertEqual(wikilink, 'https://en.wikipedia.org/wiki/Test')
+        wikilink = wikidata.get_wikilink(html_etree, 'arwiki')
+        self.assertEqual(wikilink, 'https://ar.wikipedia.org/wiki/Test')
+        wikilink = wikidata.get_wikilink(html_etree, 'enwikiquote')
+        self.assertEqual(wikilink, 'https://en.wikiquote.org/wiki/Test')
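
The new test module can also be exercised on its own; a minimal sketch, assuming the repository root is on sys.path and the test dependencies (e.g. mock) are installed:

# Run only the new wikidata engine tests (assumption: the repo's standard
# unittest layout, with tests importable as tests.unit.engines.test_wikidata).
import unittest

from tests.unit.engines import test_wikidata

suite = unittest.TestLoader().loadTestsFromModule(test_wikidata)
unittest.TextTestRunner(verbosity=2).run(suite)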