|
@@ -32,23 +32,26 @@ languages = {}
|
32
|
32
|
def valid_code(lang_code):
    """Tell whether ``lang_code`` should be kept in the language list.

    A code is rejected when any of the following holds:

    * it starts with ``xx``;
    * it is one of the explicitly listed bad codes;
    * its last two characters match one of the rejected country codes;
    * ``is_dialect`` classifies it as a dialect.

    :param lang_code: language code string such as ``en`` or ``en-US``.
    :returns: ``False`` for rejected codes, ``True`` otherwise.
    """
    # sl-SL is technically not invalid, but still a mistake
    invalid_codes = ('sl-SL', 'wt-WT', 'jw')
    invalid_countries = ('UK', 'XA', 'XL')

    # filter invalid codes
    if lang_code[:2] == 'xx':
        return False
    if lang_code in invalid_codes:
        return False
    # The comparison is case-sensitive, so a lowercase language code such as
    # 'uk' is not caught by the country check.
    if lang_code[-2:] in invalid_countries:
        return False

    # filter dialects
    return not is_dialect(lang_code)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+# Language codes with any additional tags other than language and country.
|
|
47
|
def is_dialect(lang_code):
    """Tell whether ``lang_code`` carries more than a language and a country.

    The code is split on ``-`` and treated as a dialect when:

    * it has more than two parts;
    * its first part is longer than three characters;
    * it has exactly two parts and the second is longer than two characters.

    :param lang_code: language code string such as ``en`` or ``zh-Hant``.
    :returns: ``True`` if the code is considered a dialect, else ``False``.
    """
    parts = lang_code.split('-')

    # too many subtags, or a first subtag longer than three characters
    if len(parts) > 2 or len(parts[0]) > 3:
        return True

    # a second subtag longer than the two characters allowed for a country
    return len(parts) == 2 and len(parts[1]) > 2
|
52
|
55
|
|
53
|
56
|
|
54
|
57
|
# Get country name in specified language.
|
|
@@ -83,19 +86,17 @@ def fetch_supported_languages():
|
83
|
86
|
print e
|
84
|
87
|
|
85
|
88
|
# write json file
|
86
|
|
- f = io.open(engines_languages_file, "w", encoding="utf-8")
|
87
|
|
- f.write(unicode(dumps(engines_languages, ensure_ascii=False, encoding="utf-8")))
|
88
|
|
- f.close()
|
|
89
|
+ with io.open(engines_languages_file, "w", encoding="utf-8") as f:
|
|
90
|
+ f.write(unicode(dumps(engines_languages, ensure_ascii=False, encoding="utf-8")))
|
89
|
91
|
|
90
|
92
|
|
91
|
93
|
# Join all language lists.
|
92
|
94
|
# Iterate all languages supported by each engine.
|
93
|
95
|
def join_language_lists():
|
94
|
96
|
# include wikipedia first for more accurate language names
|
95
|
|
- # exclude languages with too few articles
|
96
|
97
|
languages.update({code: lang for code, lang
|
97
|
98
|
in engines_languages['wikipedia'].iteritems()
|
98
|
|
- if valid_code(code) and lang['articles'] >= 100000})
|
|
99
|
+ if valid_code(code)})
|
99
|
100
|
|
100
|
101
|
for engine_name in engines_languages:
|
101
|
102
|
for locale in engines_languages[engine_name]:
|
|
@@ -104,25 +105,27 @@ def join_language_lists():
|
104
|
105
|
|
105
|
106
|
# if language is not on list or if it has no name yet
|
106
|
107
|
if locale not in languages or not languages[locale].get('name'):
|
107
|
|
- if isinstance(engines_languages[engine_name], dict) \
|
108
|
|
- and engines_languages[engine_name][locale].get('articles', float('inf')) >= 100000:
|
|
108
|
+ if isinstance(engines_languages[engine_name], dict):
|
109
|
109
|
languages[locale] = engines_languages[engine_name][locale]
|
110
|
110
|
else:
|
111
|
111
|
languages[locale] = {}
|
112
|
112
|
|
113
|
113
|
# get locales that have no name or country yet
|
114
|
114
|
for locale in languages.keys():
|
|
115
|
+ # try to get language names
|
115
|
116
|
if not languages[locale].get('name'):
|
116
|
|
- # try to get language names
|
117
|
117
|
name = languages.get(locale.split('-')[0], {}).get('name', None)
|
118
|
118
|
if name:
|
119
|
119
|
languages[locale]['name'] = name
|
120
|
|
- languages[locale]['english_name'] = languages.get(locale.split('-')[0], {}).get('english_name', '')
|
121
|
120
|
else:
|
122
|
121
|
# filter out locales with no name
|
123
|
122
|
del languages[locale]
|
124
|
123
|
continue
|
125
|
124
|
|
|
125
|
+ # try to get language name in english
|
|
126
|
+ if not languages[locale].get('english_name'):
|
|
127
|
+ languages[locale]['english_name'] = languages.get(locale.split('-')[0], {}).get('english_name', '')
|
|
128
|
+
|
126
|
129
|
# try to get country name
|
127
|
130
|
if locale.find('-') > 0 and not languages[locale].get('country'):
|
128
|
131
|
languages[locale]['country'] = get_country_name(locale) or ''
|
|
@@ -145,10 +148,10 @@ def filter_single_country_languages():
|
145
|
148
|
# Write languages.py.
|
146
|
149
|
def write_languages_file():
|
147
|
150
|
new_file = open(languages_file, 'w')
|
148
|
|
- file_content = '# -*- coding: utf-8 -*-\n'
|
149
|
|
- file_content += '# list of language codes\n'
|
150
|
|
- file_content += '# this file is generated automatically by utils/update_search_languages.py\n'
|
151
|
|
- file_content += '\nlanguage_codes = ('
|
|
151
|
+ file_content = '# -*- coding: utf-8 -*-\n'\
|
|
152
|
+ + '# list of language codes\n'\
|
|
153
|
+ + '# this file is generated automatically by utils/update_search_languages.py\n'\
|
|
154
|
+ + '\nlanguage_codes = ('
|
152
|
155
|
for code in sorted(languages):
|
153
|
156
|
file_content += '\n (u"' + code + '"'\
|
154
|
157
|
+ ', u"' + languages[code]['name'].split(' (')[0] + '"'\
|