瀏覽代碼

Merge pull request #794 from a01200356/languages

Changes in search language list (continuation of #748)
Adam Tauber 8 年之前
父節點
當前提交
d08108be62

+ 1
- 1
searx/data/engines_languages.json
文件差異過大導致無法顯示
查看文件


+ 7
- 2
searx/engines/gigablast.py 查看文件

@@ -95,8 +95,13 @@ def _fetch_supported_languages(resp):
95 95
     dom = fromstring(resp.text)
96 96
     links = dom.xpath('//span[@id="menu2"]/a')
97 97
     for link in links:
98
-        code = link.xpath('./@href')[0][-2:]
99
-        if code != 'xx' and code not in supported_languages:
98
+        href = link.xpath('./@href')[0].split('lang%3A')
99
+        if len(href) == 2:
100
+            code = href[1].split('_')
101
+            if len(code) == 2:
102
+                code = code[0] + '-' + code[1].upper()
103
+            else:
104
+                code = code[0]
100 105
             supported_languages.append(code)
101 106
 
102 107
     return supported_languages

+ 1
- 1
searx/engines/wikipedia.py 查看文件

@@ -132,7 +132,7 @@ def _fetch_supported_languages(resp):
132 132
             english_name = td[1].xpath('./a')[0].text
133 133
             articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
134 134
             # exclude languages with too few articles
135
-            if articles >= 100000:
135
+            if articles >= 100:
136 136
                 supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles}
137 137
 
138 138
     return supported_languages

+ 26
- 79
searx/languages.py 查看文件

@@ -3,28 +3,18 @@
3 3
 # this file is generated automatically by utils/update_search_languages.py
4 4
 
5 5
 language_codes = (
6
-    (u"af", u"Afrikaans", u"", u""),
7
-    (u"am", u"አማርኛ", u"", u"Amharic"),
8
-    (u"ar-SA", u"العربية", u"المملكة العربية السعودية", u"Arabic"),
9
-    (u"az", u"Azərbaycanca", u"", u"Azerbaijani"),
10
-    (u"be", u"Беларуская", u"", u"Belarusian"),
11
-    (u"bg-BG", u"Български", u"България", u"Bulgarian"),
12
-    (u"bn", u"বাংলা", u"", u"Bengali"),
13
-    (u"br", u"Brezhoneg", u"", u"Breton"),
14
-    (u"bs", u"Bosnian", u"", u"Bosnian"),
6
+    (u"ar-SA", u"العربية", u"", u"Arabic"),
7
+    (u"bg-BG", u"Български", u"", u"Bulgarian"),
15 8
     (u"ca", u"Català", u"", u"Catalan"),
16 9
     (u"ca-CT", u"Català", u"", u"Catalan"),
17 10
     (u"ca-ES", u"Català", u"Espanya", u"Catalan"),
18
-    (u"ce", u"Нохчийн", u"", u"Chechen"),
19
-    (u"ceb", u"Sinugboanong Binisaya", u"", u"Cebuano"),
20
-    (u"cs-CZ", u"Čeština", u"Česko", u"Czech"),
21
-    (u"cy", u"Cymraeg", u"", u"Welsh"),
22
-    (u"da-DK", u"Dansk", u"Danmark", u"Danish"),
11
+    (u"cs-CZ", u"Čeština", u"", u"Czech"),
12
+    (u"da-DK", u"Dansk", u"", u"Danish"),
23 13
     (u"de", u"Deutsch", u"", u"German"),
24 14
     (u"de-AT", u"Deutsch", u"Österreich", u"German"),
25 15
     (u"de-CH", u"Deutsch", u"Schweiz", u"German"),
26 16
     (u"de-DE", u"Deutsch", u"Deutschland", u"German"),
27
-    (u"el-GR", u"Ελληνικά", u"Ελλάδα", u"Greek"),
17
+    (u"el-GR", u"Ελληνικά", u"", u"Greek"),
28 18
     (u"en", u"English", u"", u"English"),
29 19
     (u"en-AU", u"English", u"Australia", u"English"),
30 20
     (u"en-CA", u"English", u"Canada", u"English"),
@@ -38,7 +28,6 @@ language_codes = (
38 28
     (u"en-SG", u"English", u"Singapore", u"English"),
39 29
     (u"en-US", u"English", u"United States", u"English"),
40 30
     (u"en-ZA", u"English", u"South Africa", u"English"),
41
-    (u"eo", u"Esperanto", u"", u"Esperanto"),
42 31
     (u"es", u"Español", u"", u"Spanish"),
43 32
     (u"es-AR", u"Español", u"Argentina", u"Spanish"),
44 33
     (u"es-CL", u"Español", u"Chile", u"Spanish"),
@@ -47,85 +36,43 @@ language_codes = (
47 36
     (u"es-MX", u"Español", u"México", u"Spanish"),
48 37
     (u"es-PE", u"Español", u"Perú", u"Spanish"),
49 38
     (u"es-US", u"Español", u"Estados Unidos", u"Spanish"),
50
-    (u"et-EE", u"Eesti", u"Eesti", u"Estonian"),
51
-    (u"eu", u"Euskara", u"", u"Basque"),
52
-    (u"fa", u"فارسی", u"", u"Persian"),
53
-    (u"fi-FI", u"Suomi", u"Suomi", u"Finnish"),
39
+    (u"et-EE", u"Eesti", u"", u"Estonian"),
40
+    (u"fi-FI", u"Suomi", u"", u"Finnish"),
54 41
     (u"fr", u"Français", u"", u"French"),
55 42
     (u"fr-BE", u"Français", u"Belgique", u"French"),
56 43
     (u"fr-CA", u"Français", u"Canada", u"French"),
57 44
     (u"fr-CH", u"Français", u"Suisse", u"French"),
58 45
     (u"fr-FR", u"Français", u"France", u"French"),
59
-    (u"ga", u"Gaeilge", u"", u"Irish"),
60
-    (u"gl", u"Galego", u"", u"Galician"),
61
-    (u"gu", u"ગુજરાતી", u"", u"Gujarati"),
62
-    (u"he-IL", u"עברית", u"ישראל", u"Hebrew"),
63
-    (u"hi", u"हिन्दी", u"", u"Hindi"),
64
-    (u"hr-HR", u"Hrvatski", u"Hrvatska", u"Croatian"),
65
-    (u"hu-HU", u"Magyar", u"Magyarország", u"Hungarian"),
66
-    (u"hy", u"Հայերեն", u"", u"Armenian"),
67
-    (u"id-ID", u"Bahasa Indonesia", u"Indonesia", u"Indonesian"),
68
-    (u"is", u"Íslenska", u"", u""),
46
+    (u"he-IL", u"עברית", u"", u"Hebrew"),
47
+    (u"hr-HR", u"Hrvatski", u"", u"Croatian"),
48
+    (u"hu-HU", u"Magyar", u"", u"Hungarian"),
49
+    (u"id-ID", u"Bahasa Indonesia", u"", u"Indonesian"),
69 50
     (u"it", u"Italiano", u"", u"Italian"),
70 51
     (u"it-CH", u"Italiano", u"Svizzera", u"Italian"),
71 52
     (u"it-IT", u"Italiano", u"Italia", u"Italian"),
72
-    (u"iw", u"עברית", u"", u""),
73
-    (u"ja-JP", u"日本語", u"日本", u"Japanese"),
74
-    (u"ka", u"ქართული", u"", u"Georgian"),
75
-    (u"kk", u"Қазақша", u"", u"Kazakh"),
76
-    (u"kn", u"ಕನ್ನಡ", u"", u"Kannada"),
77
-    (u"ko-KR", u"한국어", u"대한민국", u"Korean"),
78
-    (u"la", u"Latina", u"", u"Latin"),
79
-    (u"lt-LT", u"Lietuvių", u"Lietuva", u"Lithuanian"),
80
-    (u"lv-LV", u"Latviešu", u"Latvijas Republika", u""),
81
-    (u"mi", u"Reo Māori", u"", u"Maori"),
82
-    (u"min", u"Minangkabau", u"", u"Minangkabau"),
83
-    (u"mk", u"Македонски", u"", u"Macedonian"),
84
-    (u"mn", u"Монгол", u"", u"Mongolian"),
85
-    (u"mr", u"मराठी", u"", u"Marathi"),
86
-    (u"ms-MY", u"Bahasa Melayu", u"Malaysia", u"Malay"),
87
-    (u"mt", u"Malti", u"", u"Maltese"),
88
-    (u"nb-NO", u"Norwegian Bokmål", u"Norge", u"Norwegian Bokmål"),
53
+    (u"ja-JP", u"日本語", u"", u"Japanese"),
54
+    (u"ko-KR", u"한국어", u"", u"Korean"),
55
+    (u"lt-LT", u"Lietuvių", u"", u"Lithuanian"),
56
+    (u"lv-LV", u"Latviešu", u"", u"Latvian"),
89 57
     (u"nl", u"Nederlands", u"", u"Dutch"),
90 58
     (u"nl-BE", u"Nederlands", u"België", u"Dutch"),
91 59
     (u"nl-NL", u"Nederlands", u"Nederland", u"Dutch"),
92
-    (u"nn", u"Nynorsk", u"", u"Norwegian"),
93
-    (u"no-NO", u"Norsk", u"Norge", u"Norwegian"),
94
-    (u"oc", u"Occitan", u"", u"Occitan"),
95
-    (u"or", u"Oriya", u"", u"Oriya"),
96
-    (u"pa", u"ਪੰਜਾਬੀ", u"", u"Panjabi"),
97
-    (u"pl-PL", u"Polski", u"Rzeczpospolita Polska", u"Polish"),
98
-    (u"ps", u"Pushto", u"", u"Pushto"),
60
+    (u"no-NO", u"Norsk", u"", u"Norwegian"),
61
+    (u"pl-PL", u"Polski", u"", u"Polish"),
99 62
     (u"pt", u"Português", u"", u"Portuguese"),
100 63
     (u"pt-BR", u"Português", u"Brasil", u"Portuguese"),
101 64
     (u"pt-PT", u"Português", u"Portugal", u"Portuguese"),
102
-    (u"ro-RO", u"Română", u"România", u"Romanian"),
103
-    (u"ru-RU", u"Русский", u"Россия", u"Russian"),
104
-    (u"rw", u"Ikinyarwanda", u"", u"Kinyarwanda"),
105
-    (u"sh", u"Srpskohrvatski / Српскохрватски", u"", u"Serbo-Croatian"),
106
-    (u"sk-SK", u"Slovenčina", u"Slovenská republika", u"Slovak"),
65
+    (u"ro-RO", u"Română", u"", u"Romanian"),
66
+    (u"ru-RU", u"Русский", u"", u"Russian"),
67
+    (u"sk-SK", u"Slovenčina", u"", u"Slovak"),
107 68
     (u"sl", u"Slovenščina", u"", u"Slovenian"),
108
-    (u"sr", u"Српски / Srpski", u"", u"Serbian"),
109
-    (u"sv-SE", u"Svenska", u"Sverige", u"Swedish"),
110
-    (u"sw", u"Kiswahili", u"", u""),
111
-    (u"ta", u"தமிழ்", u"", u"Tamil"),
112
-    (u"th-TH", u"ไทย", u"ไทย", u"Thai"),
113
-    (u"ti", u"ትግርኛ", u"", u"Tigrinya"),
114
-    (u"tl-PH", u"Filipino", u"Pilipinas", u""),
115
-    (u"tr-TR", u"Türkçe", u"Türkiye", u"Turkish"),
116
-    (u"tt", u"Татарча", u"", u"Tatar"),
117
-    (u"uk-UA", u"Українська", u"Україна", u"Ukrainian"),
118
-    (u"ur", u"اردو", u"", u"Urdu"),
119
-    (u"uz", u"O‘zbek", u"", u"Uzbek"),
120
-    (u"ve", u"Venda", u"", u"Venda"),
121
-    (u"vi-VN", u"Tiếng Việt", u"Công Hòa Xã Hội Chủ Nghĩa Việt Nam", u"Vietnamese"),
122
-    (u"vo", u"Volapük", u"", u"Volapük"),
123
-    (u"wa", u"Walon", u"", u"Walloon"),
124
-    (u"war", u"Winaray", u"", u"Waray-Waray"),
125
-    (u"xh", u"Xhosa", u"", u"Xhosa"),
69
+    (u"sv-SE", u"Svenska", u"", u"Swedish"),
70
+    (u"th-TH", u"ไทย", u"", u"Thai"),
71
+    (u"tr-TR", u"Türkçe", u"", u"Turkish"),
72
+    (u"uk-UA", u"Українська", u"", u"Ukrainian"),
73
+    (u"vi-VN", u"Tiếng Việt", u"", u"Vietnamese"),
126 74
     (u"zh", u"中文", u"", u"Chinese"),
127 75
     (u"zh-CN", u"中文", u"中国", u"Chinese"),
128 76
     (u"zh-HK", u"中文", u"香港", u"Chinese"),
129
-    (u"zh-TW", u"中文", u"台湾", u"Chinese"),
130
-    (u"zu", u"Isi-Zulu", u"", u"Zulu")
77
+    (u"zh-TW", u"中文", u"台湾", u"Chinese")
131 78
 )

+ 7
- 0
searx/query.py 查看文件

@@ -24,6 +24,8 @@ from searx.engines import (
24 24
 import string
25 25
 import re
26 26
 
27
+VALID_LANGUAGE_CODE = re.compile(r'^[a-z]{2,3}(\-[A-Z]{2})?$')
28
+
27 29
 
28 30
 class RawTextQuery(object):
29 31
     """parse raw text query (the value from the html input)"""
@@ -68,6 +70,11 @@ class RawTextQuery(object):
68 70
             if query_part[0] == ':':
69 71
                 lang = query_part[1:].lower()
70 72
 
73
+                # user may set a valid, yet not selectable language
74
+                if VALID_LANGUAGE_CODE.match(lang):
75
+                    self.languages.append(lang)
76
+                    parse_next = True
77
+
71 78
                 # check if any language-code is equal with
72 79
                 # declared language-codes
73 80
                 for lc in language_codes:

+ 3
- 3
searx/templates/oscar/preferences.html 查看文件

@@ -148,7 +148,7 @@
148 148
 				    <th>{{ _("Allow") }}</th>
149 149
 				    <th>{{ _("Engine name") }}</th>
150 150
 				    <th>{{ _("Shortcut") }}</th>
151
-				    <th>{{ _("Language support") }}</th>
151
+				    <th>{{ _("Supports selected language") }}</th>
152 152
 				    <th>{{ _("SafeSearch") }}</th>
153 153
 				    <th>{{ _("Time range") }}</th>
154 154
 				    <th>{{ _("Avg. time") }}</th>
@@ -157,7 +157,7 @@
157 157
 				    <th>{{ _("Max time") }}</th>
158 158
 				    <th>{{ _("Avg. time") }}</th>
159 159
 				    <th>{{ _("SafeSearch") }}</th>
160
-				    <th>{{ _("Language support") }}</th>
160
+				    <th>{{ _("Supports selected language") }}</th>
161 161
 				    <th>{{ _("Shortcut") }}</th>
162 162
 				    <th>{{ _("Engine name") }}</th>
163 163
 				    <th>{{ _("Allow") }}</th>
@@ -172,7 +172,7 @@
172 172
                                     </td>
173 173
                                     <th>{{ search_engine.name }}</th>
174 174
 				    <td>{{ shortcuts[search_engine.name] }}</td>
175
-				    <td><input type="checkbox" {{ "checked" if search_engine.supported_languages else ""}} readonly="readonly" disabled="disabled"></td>
175
+				    <td><input type="checkbox" {{ "checked" if current_language == 'all' or current_language in search_engine.supported_languages or current_language.split('-')[0] in search_engine.supported_languages else ""}} readonly="readonly" disabled="disabled"></td>
176 176
 				    <td><input type="checkbox" {{ "checked" if search_engine.safesearch==True else ""}} readonly="readonly" disabled="disabled"></td>
177 177
 				    <td><input type="checkbox" {{ "checked" if search_engine.time_range_support==True else ""}} readonly="readonly" disabled="disabled"></td>
178 178
 				    <td class="{{ 'danger' if stats[search_engine.name]['warn_time'] else '' }}">{{ 'N/A' if stats[search_engine.name].time==None else stats[search_engine.name].time }}</td>

+ 2
- 2
tests/robot/test_basic.robot 查看文件

@@ -102,10 +102,10 @@ Change search language
102 102
     Page Should Contain  preferences
103 103
     Go To  http://localhost:11111/preferences
104 104
     List Selection Should Be  language  Default language
105
-    Select From List  language  Türkçe (Türkiye) - tr-TR
105
+    Select From List  language  Türkçe - tr-TR
106 106
     Submit Preferences
107 107
     Go To  http://localhost:11111/preferences
108
-    List Selection Should Be  language  Türkçe (Türkiye) - tr-TR
108
+    List Selection Should Be  language  Türkçe - tr-TR
109 109
 
110 110
 Change autocomplete
111 111
     Page Should Contain  about

+ 4
- 2
tests/unit/engines/test_gigablast.py 查看文件

@@ -103,7 +103,9 @@ class TestGigablastEngine(SearxTestCase):
103 103
                 <span id="menu2">
104 104
                     <a href="/search?&rxikd=1&qlang=xx"></a>
105 105
                     <a href="/search?&rxikd=1&qlang=en"></a>
106
-                    <a href="/search?&rxikd=1&qlang=fr"></a>
106
+                    <a href="/search?&rxikd=1&prepend=gblang%3Aen"></a>
107
+                    <a href="/search?&rxikd=1&qlang=zh_"></a>
108
+                    <a href="/search?&rxikd=1&prepend=gblang%3Azh_tw"></a>
107 109
                 </span>
108 110
             </body>
109 111
         </html>
@@ -113,4 +115,4 @@ class TestGigablastEngine(SearxTestCase):
113 115
         self.assertEqual(type(languages), list)
114 116
         self.assertEqual(len(languages), 2)
115 117
         self.assertIn('en', languages)
116
-        self.assertIn('fr', languages)
118
+        self.assertIn('zh-TW', languages)

+ 29
- 13
utils/fetch_languages.py 查看文件

@@ -25,7 +25,6 @@ engines_languages_file = 'engines_languages.json'
25 25
 languages_file = 'languages.py'
26 26
 
27 27
 engines_languages = {}
28
-languages = {}
29 28
 
30 29
 
31 30
 # To filter out invalid codes and dialects.
@@ -93,22 +92,36 @@ def fetch_supported_languages():
93 92
 # Join all language lists.
94 93
 # Iterate all languages supported by each engine.
95 94
 def join_language_lists():
95
+    global languages
96 96
     # include wikipedia first for more accurate language names
97
-    languages.update({code: lang for code, lang
98
-                      in engines_languages['wikipedia'].iteritems()
99
-                      if valid_code(code)})
97
+    languages = {code: lang for code, lang
98
+                 in engines_languages['wikipedia'].iteritems()
99
+                 if valid_code(code)}
100 100
 
101 101
     for engine_name in engines_languages:
102 102
         for locale in engines_languages[engine_name]:
103
-            if not valid_code(locale):
104
-                continue
105
-
106
-            # if language is not on list or if it has no name yet
107
-            if locale not in languages or not languages[locale].get('name'):
108
-                if isinstance(engines_languages[engine_name], dict):
109
-                    languages[locale] = engines_languages[engine_name][locale]
110
-                else:
111
-                    languages[locale] = {}
103
+            if valid_code(locale):
104
+                # if language is not on list or if it has no name yet
105
+                if locale not in languages or not languages[locale].get('name'):
106
+                    if isinstance(engines_languages[engine_name], dict):
107
+                        languages[locale] = engines_languages[engine_name][locale]
108
+                    else:
109
+                        languages[locale] = {}
110
+
111
+            # add to counter of engines that support given language
112
+            lang = locale.split('-')[0]
113
+            if lang in languages:
114
+                if 'counter' not in languages[lang]:
115
+                    languages[lang]['counter'] = [engine_name]
116
+                elif engine_name not in languages[lang]['counter']:
117
+                    languages[lang]['counter'].append(engine_name)
118
+
119
+    # filter list to include only languages supported by most engines
120
+    min_supported_engines = int(0.75 * len(engines_languages))
121
+    languages = {code: lang for code, lang
122
+                 in languages.iteritems()
123
+                 if len(lang.get('counter', [])) >= min_supported_engines or
124
+                 len(languages.get(code.split('-')[0], {}).get('counter', [])) >= min_supported_engines}
112 125
 
113 126
     # get locales that have no name or country yet
114 127
     for locale in languages.keys():
@@ -134,6 +147,7 @@ def join_language_lists():
134 147
 # Remove countryless language if language is featured in only one country.
135 148
 def filter_single_country_languages():
136 149
     prev_lang = None
150
+    prev_code = None
137 151
     for code in sorted(languages):
138 152
         lang = code.split('-')[0]
139 153
         if lang == prev_lang:
@@ -141,8 +155,10 @@ def filter_single_country_languages():
141 155
         else:
142 156
             if prev_lang is not None and countries == 1:
143 157
                 del languages[prev_lang]
158
+                languages[prev_code]['country'] = ''
144 159
             countries = 0
145 160
             prev_lang = lang
161
+        prev_code = code
146 162
 
147 163
 
148 164
 # Write languages.py.