Browse Source

minor fixes in utils/fetch_languages.py

marc 8 years ago
parent
commit
4a1ff56389

+ 1
- 1
searx/data/engines_languages.json
File diff suppressed because it is too large
View File


+ 2
- 1
searx/engines/wikipedia.py View File

131
             name = td[2].xpath('./a')[0].text
131
             name = td[2].xpath('./a')[0].text
132
             english_name = td[1].xpath('./a')[0].text
132
             english_name = td[1].xpath('./a')[0].text
133
             articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
133
             articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
134
-            if articles >= 10000:
134
+            # exclude languages with too few articles
135
+            if articles >= 100000:
135
                 supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles}
136
                 supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles}
136
 
137
 
137
     return supported_languages
138
     return supported_languages

+ 2
- 2
searx/languages.py View File

124
     (u"war", u"Winaray", u"", u"Waray-Waray"),
124
     (u"war", u"Winaray", u"", u"Waray-Waray"),
125
     (u"xh", u"Xhosa", u"", u"Xhosa"),
125
     (u"xh", u"Xhosa", u"", u"Xhosa"),
126
     (u"zh", u"中文", u"", u"Chinese"),
126
     (u"zh", u"中文", u"", u"Chinese"),
127
-    (u"zh-CN", u"中文", u"中国", u""),
127
+    (u"zh-CN", u"中文", u"中国", u"Chinese"),
128
     (u"zh-HK", u"中文", u"香港", u"Chinese"),
128
     (u"zh-HK", u"中文", u"香港", u"Chinese"),
129
-    (u"zh-TW", u"中文", u"台湾", u""),
129
+    (u"zh-TW", u"中文", u"台湾", u"Chinese"),
130
     (u"zu", u"Isi-Zulu", u"", u"Zulu")
130
     (u"zu", u"Isi-Zulu", u"", u"Zulu")
131
 )
131
 )

+ 2
- 2
searx/templates/oscar/preferences.html View File

172
                                     </td>
172
                                     </td>
173
                                     <th>{{ search_engine.name }}</th>
173
                                     <th>{{ search_engine.name }}</th>
174
 				    <td>{{ shortcuts[search_engine.name] }}</td>
174
 				    <td>{{ shortcuts[search_engine.name] }}</td>
175
-				    <td><input type="checkbox" {{ "checked" if current_language in search_engine.supported_languages or current_language.split('-')[0] in search_engine.supported_languages else ""}} readonly="readonly" disabled="disabled"></td>
175
+				    <td><input type="checkbox" {{ "checked" if current_language == 'all' or current_language in search_engine.supported_languages or current_language.split('-')[0] in search_engine.supported_languages else ""}} readonly="readonly" disabled="disabled"></td>
176
 				    <td><input type="checkbox" {{ "checked" if search_engine.safesearch==True else ""}} readonly="readonly" disabled="disabled"></td>
176
 				    <td><input type="checkbox" {{ "checked" if search_engine.safesearch==True else ""}} readonly="readonly" disabled="disabled"></td>
177
 				    <td><input type="checkbox" {{ "checked" if search_engine.time_range_support==True else ""}} readonly="readonly" disabled="disabled"></td>
177
 				    <td><input type="checkbox" {{ "checked" if search_engine.time_range_support==True else ""}} readonly="readonly" disabled="disabled"></td>
178
 				    <td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>
178
 				    <td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>
181
 				    <td class="{{ 'danger' if stats[search_engine.name]['warn_timeout'] else '' }}">{{ search_engine.timeout }}</td>
181
 				    <td class="{{ 'danger' if stats[search_engine.name]['warn_timeout'] else '' }}">{{ search_engine.timeout }}</td>
182
 				    <td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>
182
 				    <td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>
183
 				    <td><input type="checkbox" {{ "checked" if search_engine.safesearch==True else ""}} readonly="readonly" disabled="disabled"></td>
183
 				    <td><input type="checkbox" {{ "checked" if search_engine.safesearch==True else ""}} readonly="readonly" disabled="disabled"></td>
184
-				    <td><input type="checkbox" {{ "checked" if current_language in search_engine.supported_languages or current_language.split('-')[0] in search_engine.supported_languages else ""}} readonly="readonly" disabled="disabled"></td>
184
+				    <td><input type="checkbox" {{ "checked" if current_language == 'all' or current_language in search_engine.supported_languages or current_language.split('-')[0] in search_engine.supported_languages else ""}} readonly="readonly" disabled="disabled"></td>
185
 				    <td>{{ shortcuts[search_engine.name] }}</td>
185
 				    <td>{{ shortcuts[search_engine.name] }}</td>
186
                                     <th>{{ search_engine.name }}</th>
186
                                     <th>{{ search_engine.name }}</th>
187
                                     <td class="onoff-checkbox">
187
                                     <td class="onoff-checkbox">

+ 26
- 23
utils/fetch_languages.py View File

32
 def valid_code(lang_code):
32
 def valid_code(lang_code):
33
     # filter invalid codes
33
     # filter invalid codes
34
     # sl-SL is technically not invalid, but still a mistake
34
     # sl-SL is technically not invalid, but still a mistake
35
+    invalid_codes = ['sl-SL', 'wt-WT', 'jw']
36
+    invalid_countries = ['UK', 'XA', 'XL']
35
     if lang_code[:2] == 'xx'\
37
     if lang_code[:2] == 'xx'\
36
-       or lang_code == 'sl-SL'\
37
-       or lang_code == 'wt-WT'\
38
-       or lang_code == 'jw'\
39
-       or lang_code[-2:] == 'UK'\
40
-       or lang_code[-2:] == 'XA'\
41
-       or lang_code[-2:] == 'XL':
38
+       or lang_code in invalid_codes\
39
+       or lang_code[-2:] in invalid_countries\
40
+       or is_dialect(lang_code):
42
         return False
41
         return False
43
 
42
 
44
-    # filter dialects
43
+    return True
44
+
45
+
46
+# Language codes with any additional tags other than language and country.
47
+def is_dialect(lang_code):
45
     lang_code = lang_code.split('-')
48
     lang_code = lang_code.split('-')
46
     if len(lang_code) > 2 or len(lang_code[0]) > 3:
49
     if len(lang_code) > 2 or len(lang_code[0]) > 3:
47
-        return False
50
+        return True
48
     if len(lang_code) == 2 and len(lang_code[1]) > 2:
51
     if len(lang_code) == 2 and len(lang_code[1]) > 2:
49
-        return False
52
+        return True
50
 
53
 
51
-    return True
54
+    return False
52
 
55
 
53
 
56
 
54
 # Get country name in specified language.
57
 # Get country name in specified language.
83
                 print e
86
                 print e
84
 
87
 
85
     # write json file
88
     # write json file
86
-    f = io.open(engines_languages_file, "w", encoding="utf-8")
87
-    f.write(unicode(dumps(engines_languages, ensure_ascii=False, encoding="utf-8")))
88
-    f.close()
89
+    with io.open(engines_languages_file, "w", encoding="utf-8") as f:
90
+        f.write(unicode(dumps(engines_languages, ensure_ascii=False, encoding="utf-8")))
89
 
91
 
90
 
92
 
91
 # Join all language lists.
93
 # Join all language lists.
92
 # Iterate all languages supported by each engine.
94
 # Iterate all languages supported by each engine.
93
 def join_language_lists():
95
 def join_language_lists():
94
     # include wikipedia first for more accurate language names
96
     # include wikipedia first for more accurate language names
95
-    # exclude languages with too few articles
96
     languages.update({code: lang for code, lang
97
     languages.update({code: lang for code, lang
97
                       in engines_languages['wikipedia'].iteritems()
98
                       in engines_languages['wikipedia'].iteritems()
98
-                      if valid_code(code) and lang['articles'] >= 100000})
99
+                      if valid_code(code)})
99
 
100
 
100
     for engine_name in engines_languages:
101
     for engine_name in engines_languages:
101
         for locale in engines_languages[engine_name]:
102
         for locale in engines_languages[engine_name]:
104
 
105
 
105
             # if language is not on list or if it has no name yet
106
             # if language is not on list or if it has no name yet
106
             if locale not in languages or not languages[locale].get('name'):
107
             if locale not in languages or not languages[locale].get('name'):
107
-                if isinstance(engines_languages[engine_name], dict) \
108
-                  and engines_languages[engine_name][locale].get('articles', float('inf')) >= 100000:
108
+                if isinstance(engines_languages[engine_name], dict):
109
                     languages[locale] = engines_languages[engine_name][locale]
109
                     languages[locale] = engines_languages[engine_name][locale]
110
                 else:
110
                 else:
111
                     languages[locale] = {}
111
                     languages[locale] = {}
112
 
112
 
113
     # get locales that have no name or country yet
113
     # get locales that have no name or country yet
114
     for locale in languages.keys():
114
     for locale in languages.keys():
115
+        # try to get language names
115
         if not languages[locale].get('name'):
116
         if not languages[locale].get('name'):
116
-            # try to get language names
117
             name = languages.get(locale.split('-')[0], {}).get('name', None)
117
             name = languages.get(locale.split('-')[0], {}).get('name', None)
118
             if name:
118
             if name:
119
                 languages[locale]['name'] = name
119
                 languages[locale]['name'] = name
120
-                languages[locale]['english_name'] = languages.get(locale.split('-')[0], {}).get('english_name', '')
121
             else:
120
             else:
122
                 # filter out locales with no name
121
                 # filter out locales with no name
123
                 del languages[locale]
122
                 del languages[locale]
124
                 continue
123
                 continue
125
 
124
 
125
+        # try to get language name in english
126
+        if not languages[locale].get('english_name'):
127
+            languages[locale]['english_name'] = languages.get(locale.split('-')[0], {}).get('english_name', '')
128
+
126
         # try to get country name
129
         # try to get country name
127
         if locale.find('-') > 0 and not languages[locale].get('country'):
130
         if locale.find('-') > 0 and not languages[locale].get('country'):
128
             languages[locale]['country'] = get_country_name(locale) or ''
131
             languages[locale]['country'] = get_country_name(locale) or ''
145
 # Write languages.py.
148
 # Write languages.py.
146
 def write_languages_file():
149
 def write_languages_file():
147
     new_file = open(languages_file, 'w')
150
     new_file = open(languages_file, 'w')
148
-    file_content = '# -*- coding: utf-8 -*-\n'
149
-    file_content += '# list of language codes\n'
150
-    file_content += '# this file is generated automatically by utils/update_search_languages.py\n'
151
-    file_content += '\nlanguage_codes = ('
151
+    file_content = '# -*- coding: utf-8 -*-\n'\
152
+                   + '# list of language codes\n'\
153
+                   + '# this file is generated automatically by utils/update_search_languages.py\n'\
154
+                   + '\nlanguage_codes = ('
152
     for code in sorted(languages):
155
     for code in sorted(languages):
153
         file_content += '\n    (u"' + code + '"'\
156
         file_content += '\n    (u"' + code + '"'\
154
                         + ', u"' + languages[code]['name'].split(' (')[0] + '"'\
157
                         + ', u"' + languages[code]['name'].split(' (')[0] + '"'\