|
@@ -0,0 +1,161 @@
|
|
1
|
+# -*- coding: utf-8 -*-
|
|
2
|
+import json
|
|
3
|
+import re
|
|
4
|
+import unicodedata
|
|
5
|
+import string
|
|
6
|
+from urllib import urlencode
|
|
7
|
+from requests import get
|
|
8
|
+
|
|
9
|
# Languages for which labels and aliases are requested from Wikidata.
# Japanese is 'ja' ('jp' is a country code, not a language code, so no
# Japanese label was ever returned for it).
languages = {'de', 'en', 'es', 'fr', 'hu', 'it', 'nl', 'ja'}

# Entity endpoint; '{query}' receives the url-encoded 'ids' parameter.
url_template = 'https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&{query}&props=labels%7Cdatatype%7Cclaims%7Caliases&languages=' + '|'.join(languages)
# WDQ endpoint; the query text is appended as-is.
url_wmflabs_template = 'http://wdq.wmflabs.org/api?q='
# Search endpoint; '{query}' receives the url-encoded search parameters.
url_wikidata_search_template = 'http://www.wikidata.org/w/api.php?action=query&list=search&format=json&srnamespace=0&srprop=sectiontitle&{query}'

# WDQ queries whose resulting items are fetched and parsed.
wmflabs_queries = [
    'CLAIM[31:8142]',  # all currencies
]

# Result database, dumped to currencies.json at the end of the script:
#   'iso4217': currency code -> {language: label}
#   'names':   normalized name -> list of currency codes
db = {
    'iso4217': {
    },
    'names': {
    }
}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
def remove_accents(data):
    """Return ``data`` NFKD-normalized and lower-cased.

    Despite the name, the combining marks produced by the decomposition are
    kept in the result; only the normalization form and the case change.
    """
    decomposed = unicodedata.normalize('NFKD', data)
    return decomposed.lower()
|
|
29
|
+
|
|
30
|
+
|
|
31
|
def normalize_name(name):
    """Return the canonical form of a currency name used as lookup key.

    The name is lower-cased, passed through ``remove_accents``, has every
    '-' replaced by a space, and runs of spaces collapsed to a single space.
    """
    lowered = remove_accents(name.lower())
    without_hyphens = lowered.replace('-', ' ')
    return re.sub(' +', ' ', without_hyphens)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
def add_currency_name(name, iso4217):
    """Register ``name`` as a name of the currency code ``iso4217``.

    The name is normalized with ``normalize_name`` and the code is added to
    the list stored under that name in ``db['names']``. Several codes may
    share one name, so codes already stored for the name are always kept.

    Nothing is stored (a diagnostic is printed instead) when ``iso4217`` is
    not a string or when the normalized name is empty.
    """
    if not isinstance(iso4217, basestring):
        print("problem %s %s" % (name, iso4217))
        return

    name = normalize_name(name)

    if name == '':
        print("name empty %s" % (iso4217,))
        return

    # Create the list the first time the name is seen. A code that is
    # already registered must leave the list untouched: resetting it would
    # drop the other codes sharing this name.
    codes = db['names'].setdefault(name, [])
    if iso4217 not in codes:
        codes.append(iso4217)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
def add_currency_label(label, iso4217, language):
    """Store ``label`` as the ``language`` label of the code ``iso4217``.

    The per-code mapping in ``db['iso4217']`` is created on first use; an
    existing label for the same language is overwritten.
    """
    labels_by_language = db['iso4217'].setdefault(iso4217, {})
    labels_by_language[language] = label
|
|
63
|
+
|
|
64
|
+
|
|
65
|
def get_property_value(data, name):
    """Return the value of the first claim ``name`` of an entity, or None.

    ``data`` is an entity dict; the value is read from
    ``data['claims'][name][0]['mainsnak']['datavalue']['value']``. None is
    returned when the claim is absent or empty, or when the value is missing
    or the empty string.
    """
    statements = data.get('claims', {}).get(name, {})
    if not statements:
        return None

    datavalue = statements[0].get('mainsnak', {}).get('datavalue', {})
    found = datavalue.get('value', '')
    return None if found == '' else found
|
|
75
|
+
|
|
76
|
+
|
|
77
|
def parse_currency(data):
    """Add the names and labels of one Wikidata entity to ``db``.

    ``data`` is one entity dict. Entities without a 'P498' claim value (used
    here as the ISO 4217 code) are ignored. For the others, the 'P558' claim
    value (if any), the labels in the configured ``languages`` and every
    alias are registered as names of the code; labels are additionally
    stored per language.
    """
    iso4217 = get_property_value(data, 'P498')
    if iso4217 is None:
        return

    unit = get_property_value(data, 'P558')
    if unit is not None:
        add_currency_name(unit, iso4217)

    labels = data.get('labels', {})
    for language in languages:
        name = labels.get(language, {}).get('value')
        if name is not None:
            add_currency_name(name, iso4217)
            add_currency_label(name, iso4217, language)

    for language_aliases in data.get('aliases', {}).values():
        for entry in language_aliases:
            alias = entry.get('value')
            # An entry without 'value' would make name normalization fail
            # on None, so it is skipped.
            if alias is not None:
                add_currency_name(alias, iso4217)
|
|
97
|
+
|
|
98
|
+
|
|
99
|
def fetch_data(wikidata_ids):
    """Download the given Wikidata entities and parse each as a currency.

    ``wikidata_ids`` is a sequence of entity id strings; they are joined
    with '|' into the 'ids' parameter of a single request built from
    ``url_template``. Every entry under 'entities' in the JSON response is
    passed to ``parse_currency``.
    """
    ids_param = urlencode({'ids': '|'.join(wikidata_ids)})
    response = get(url_template.format(query=ids_param))
    payload = json.loads(response.content)

    for entity in payload.get('entities', {}).values():
        parse_currency(entity)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
def add_q(i):
    """Return ``i`` converted to text and prefixed with 'Q'."""
    return "Q{0}".format(i)
|
|
112
|
+
|
|
113
|
+
|
|
114
|
def fetch_data_batch(wikidata_ids, batch_size=50):
    """Fetch and parse ``wikidata_ids`` in consecutive batches.

    The ids are split into slices of at most ``batch_size`` items and each
    slice is passed to ``fetch_data``. Every id belongs to exactly one
    slice; nothing is requested for an empty input.

    Raises ValueError when ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Materialize once so that any iterable can be sliced.
    wikidata_ids = list(wikidata_ids)
    # NOTE(review): the default of 50 presumably matches the per-request id
    # limit of the entity API - confirm before raising it.
    for start in range(0, len(wikidata_ids), batch_size):
        fetch_data(wikidata_ids[start:start + batch_size])
|
|
122
|
+
|
|
123
|
+
|
|
124
|
def wdq_query(query):
    """Run a WDQ query and fetch every item it returns.

    ``query`` is appended to ``url_wmflabs_template``. Each entry under
    'items' in the JSON response is turned into a "Q" id with ``add_q`` and
    the ids are handed to ``fetch_data_batch``. A status error other than
    'OK' is printed; the items are still fetched afterwards.
    """
    response = get(url_wmflabs_template + query)
    result = json.loads(response.content)
    item_ids = [add_q(item) for item in result.get('items', {})]

    status_error = result.get('status', {}).get('error', None)
    if status_error is not None and status_error != 'OK':
        print("error for query '" + query + "' :" + status_error)

    fetch_data_batch(item_ids)
|
|
134
|
+
|
|
135
|
+
|
|
136
|
def wd_query(query, offset=0):
    """Search Wikidata for ``query`` and fetch the matching entities.

    A single search request is made with 'srlimit' 50 and 'sroffset'
    ``offset``; the 'title' of every search hit (or '' when absent) is
    passed to ``fetch_data_batch``.
    """
    params = urlencode({'srsearch': query, 'srlimit': 50, 'sroffset': offset})
    response = get(url_wikidata_search_template.format(query=params))
    hits = json.loads(response.content).get('query', {}).get('search', {})
    fetch_data_batch([hit.get('title', '') for hit in hits])
|
|
145
|
+
|
|
146
|
# Fetch every item returned by the configured WDQ queries.
for q in wmflabs_queries:
    wdq_query(q)

# Static names added on top of the fetched data.
add_currency_name(u"euro", 'EUR')
add_currency_name(u"euros", 'EUR')
add_currency_name(u"dollar", 'USD')
add_currency_name(u"dollars", 'USD')
add_currency_name(u"peso", 'MXN')
add_currency_name(u"pesos", 'MXN')

# Write the database. The context manager closes the file even when
# serialization raises.
with open("currencies.json", "wb") as f:
    json.dump(db, f, indent=4, encoding="utf-8")
|