Bladeren bron

Add language support for more engines.

marc 8 jaren geleden
bovenliggende
commit
a11948c71b

+ 18
- 0
searx/engines/dailymotion.py Bestand weergeven

20
 categories = ['videos']
20
 categories = ['videos']
21
 paging = True
21
 paging = True
22
 language_support = True
22
 language_support = True
23
+supported_languages = ["af", "ak", "am", "ar", "an", "as", "av", "ae", "ay", "az",
24
+                       "ba", "bm", "be", "bn", "bi", "bo", "bs", "br", "bg", "ca",
25
+                       "cs", "ch", "ce", "cu", "cv", "kw", "co", "cr", "cy", "da",
26
+                       "de", "dv", "dz", "el", "en", "eo", "et", "eu", "ee", "fo",
27
+                       "fa", "fj", "fi", "fr", "fy", "ff", "gd", "ga", "gl", "gv",
28
+                       "gn", "gu", "ht", "ha", "sh", "he", "hz", "hi", "ho", "hr",
29
+                       "hu", "hy", "ig", "io", "ii", "iu", "ie", "ia", "id", "ik",
30
+                       "is", "it", "jv", "ja", "kl", "kn", "ks", "ka", "kr", "kk",
31
+                       "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo",
32
+                       "la", "lv", "li", "ln", "lt", "lb", "lu", "lg", "mh", "ml",
33
+                       "mr", "mk", "mg", "mt", "mn", "mi", "ms", "my", "na", "nv",
34
+                       "nr", "nd", "ng", "ne", "nl", "nn", "nb", "no", "ny", "oc",
35
+                       "oj", "or", "om", "os", "pa", "pi", "pl", "pt", "ps", "qu",
36
+                       "rm", "ro", "rn", "ru", "sg", "sa", "si", "sk", "sl", "se",
37
+                       "sm", "sn", "sd", "so", "st", "es", "sq", "sc", "sr", "ss",
38
+                       "su", "sw", "sv", "ty", "ta", "tt", "te", "tg", "tl", "th",
39
+                       "ti", "to", "tn", "ts", "tk", "tr", "tw", "ug", "uk", "ur",
40
+                       "uz", "ve", "vi", "vo", "wa", "wo", "xh", "yi", "yo", "za", "zh", "zu"]
23
 
41
 
24
 # search-url
42
 # search-url
25
 # see http://www.dailymotion.com/doc/api/obj-video.html
43
 # see http://www.dailymotion.com/doc/api/obj-video.html

+ 1
- 21
searx/engines/duckduckgo.py Bestand weergeven

16
 from urllib import urlencode
16
 from urllib import urlencode
17
 from lxml.html import fromstring
17
 from lxml.html import fromstring
18
 from searx.engines.xpath import extract_text
18
 from searx.engines.xpath import extract_text
19
-from searx.languages import language_codes
20
 
19
 
21
 # engine dependent config
20
 # engine dependent config
22
 categories = ['general']
21
 categories = ['general']
76
         else:
75
         else:
77
             # tries to get a country code from language
76
             # tries to get a country code from language
78
             locale = locale[0].lower()
77
             locale = locale[0].lower()
79
-            lang_codes = [x[0] for x in language_codes]
80
-            for lc in lang_codes:
81
-                lc = lc.split('-')
82
-                if locale == lc[0] and len(lc) == 2:
83
-                    locale = lc[1].lower() + '-' + lc[0].lower()
84
-                    break
85
-
86
-    if locale:
87
-        params['url'] = url.format(
88
-            query=urlencode({'q': query, 'kl': locale}), offset=offset)
89
-    else:
90
-        locale = params['language'].split('-')
91
-        if len(locale) == 2:
92
-            # country code goes first
93
-            locale = locale[1].lower() + '-' + locale[0].lower()
94
-        else:
95
-            # tries to get a country code from language
96
-            locale = locale[0].lower()
97
-            lang_codes = [x[0] for x in language_codes]
98
-            for lc in lang_codes:
78
+            for lc in supported_languages:
99
                 lc = lc.split('-')
79
                 lc = lc.split('-')
100
                 if locale == lc[0]:
80
                 if locale == lc[0]:
101
                     locale = lc[1].lower() + '-' + lc[0].lower()
81
                     locale = lc[1].lower() + '-' + lc[0].lower()

+ 1
- 1
searx/engines/gigablast.py Bestand weergeven

44
                        "nl", "it", "fi", "sv", "no", "pt", "vi", "ar", "he", "id", "el",
44
                        "nl", "it", "fi", "sv", "no", "pt", "vi", "ar", "he", "id", "el",
45
                        "th", "hi", "bn", "pl", "tl", "la", "eo", "ca", "bg", "tx", "sr",
45
                        "th", "hi", "bn", "pl", "tl", "la", "eo", "ca", "bg", "tx", "sr",
46
                        "hu", "da", "lt", "cs", "gl", "ka", "gd", "go", "ro", "ga", "lv",
46
                        "hu", "da", "lt", "cs", "gl", "ka", "gd", "go", "ro", "ga", "lv",
47
-                       "hy", "is", "ag", "gv", "io", "fa", "te", "vv", "mg", "ku", "lb", "et"] 
47
+                       "hy", "is", "ag", "gv", "io", "fa", "te", "vv", "mg", "ku", "lb", "et"]
48
 
48
 
49
 
49
 
50
 # do search-request
50
 # do search-request

+ 14
- 1
searx/engines/qwant.py Bestand weergeven

20
 categories = None
20
 categories = None
21
 paging = True
21
 paging = True
22
 language_support = True
22
 language_support = True
23
+supported_languages = ["fr-FR", "de-DE", "en-GB", "it-IT", "es-ES", "pt-PT", "de-CH", "fr-CH", "it-CH", "de-AT",
24
+                       "fr-BE", "nl-BE", "nl-NL", "da-DK", "fi-FI", "sv-SE", "en-IE", "no-NO", "pl-PL", "ru-RU",
25
+                       "el-GR", "bg-BG", "cs-CZ", "et-EE", "hu-HU", "ro-RO", "en-US", "en-CA", "fr-CA", "pt-BR",
26
+                       "es-AR", "es-CL", "es-MX", "ja-JP", "en-SG", "en-IN", "en-MY", "ms-MY", "ko-KR", "tl-PH",
27
+                       "th-TH", "he-IL", "tr-TR", "en-AU", "en-NZ"]
23
 
28
 
24
 category_to_keyword = {'general': 'web',
29
 category_to_keyword = {'general': 'web',
25
                        'images': 'images',
30
                        'images': 'images',
46
 
51
 
47
     # add language tag if specified
52
     # add language tag if specified
48
     if params['language'] != 'all':
53
     if params['language'] != 'all':
49
-        params['url'] += '&locale=' + params['language'].lower()
54
+        locale = params['language'].split('-')
55
+        if len(locale) == 2 and params['language'] in supported_languages:
56
+            params['url'] += '&locale=' + params['language'].replace('-', '_').lower()
57
+        else:
58
+            # try to get a country code for language
59
+            for lang in supported_languages:
60
+                if locale[0] == lang.split('-')[0]:
61
+                    params['url'] += '&locale=' + lang.replace('-', '_').lower()
62
+                    break
50
 
63
 
51
     return params
64
     return params
52
 
65
 

+ 5
- 0
searx/engines/startpage.py Bestand weergeven

24
 
24
 
25
 # paging = False
25
 # paging = False
26
 language_support = True
26
 language_support = True
27
+supported_languages = ["af", "de", "ar", "hy", "be", "bg", "ca", "cs", "zh-CN", "zh-TW",
28
+                       "ko", "hr", "da", "sk", "sl", "es", "eo", "et", "fi", "fr",
29
+                       "el", "iw", "hi", "nl", "hu", "id", "en", "is", "it", "ja",
30
+                       "lv", "lt", "no", "fa", "pl", "pt", "ro", "ru", "sr", "sw",
31
+                       "sv", "tl", "th", "tr", "uk", "vi"]
27
 
32
 
28
 # search-url
33
 # search-url
29
 base_url = 'https://startpage.com/'
34
 base_url = 'https://startpage.com/'

+ 8
- 0
searx/engines/swisscows.py Bestand weergeven

18
 categories = ['general', 'images']
18
 categories = ['general', 'images']
19
 paging = True
19
 paging = True
20
 language_support = True
20
 language_support = True
21
+supported_languages = ["ar-SA", "es-AR", "en-AU", "de-AT", "fr-BE", "nl-BE", "pt-BR", "bg-BG", "en-CA", "fr-CA",
22
+                       "es-CL", "zh-CN", "hr-HR", "cs-CZ", "da-DK", "et-EE", "fi-FI", "fr-FR", "de-DE", "el-GR",
23
+                       "zh-HK", "hu-HU", "en-IN", "en-IE", "he-IL", "it-IT", "ja-JP", "ko-KR", "lv-LV", "lt-LT",
24
+                       "en-MY", "es-MX", "nl-NL", "en-NZ", "nb-NO", "en-PH", "pl-PL", "pt-PT", "ro-RO", "ru-RU",
25
+                       "en-SG", "sk-SK", "sl-SI", "en-ZA", "es-ES", "sv-SE", "de-CH", "fr-CH", "zh-TW", "th-TH",
26
+                       "tr-TR", "uk-UA", "en-GB", "en-US", "es-US"]
21
 
27
 
22
 # search-url
28
 # search-url
23
 base_url = 'https://swisscows.ch/'
29
 base_url = 'https://swisscows.ch/'
35
     if params['language'] == 'all':
41
     if params['language'] == 'all':
36
         ui_language = 'browser'
42
         ui_language = 'browser'
37
         region = 'browser'
43
         region = 'browser'
44
+    elif params['language'].split('-')[0] == 'no':
45
+        region = 'nb-NO'
38
     else:
46
     else:
39
         region = params['language']
47
         region = params['language']
40
         ui_language = params['language'].split('-')[0]
48
         ui_language = params['language'].split('-')[0]

+ 3
- 1
searx/engines/yandex.py Bestand weergeven

22
 
22
 
23
 default_tld = 'com'
23
 default_tld = 'com'
24
 language_map = {'ru': 'ru',
24
 language_map = {'ru': 'ru',
25
-                'ua': 'uk',
25
+                'ua': 'ua',
26
+                'be': 'by',
27
+                'kk': 'kz',
26
                 'tr': 'com.tr'}
28
                 'tr': 'com.tr'}
27
 
29
 
28
 # search-url
30
 # search-url

+ 3
- 1
searx/languages.py Bestand weergeven

100
     (u"sa", u"संस्कृतम्", u"", u"Sanskrit"),
100
     (u"sa", u"संस्कृतम्", u"", u"Sanskrit"),
101
     (u"he-IL", u"עברית", u"", u"Hebrew"),
101
     (u"he-IL", u"עברית", u"", u"Hebrew"),
102
     (u"se", u"Sámegiella", u"", u"Northern Sami"),
102
     (u"se", u"Sámegiella", u"", u"Northern Sami"),
103
-    (u"sd", u"سنڌي، سندھی ، सिन्ध", u"", u"Sindhi"),
103
+    (u"sd", u"سنڌي ،सिन्ध", u"", u"Sindhi"),
104
     (u"fr-CH", u"Français", u"", u"French"),
104
     (u"fr-CH", u"Français", u"", u"French"),
105
     (u"zea", u"Zeêuws", u"", u"Zeelandic"),
105
     (u"zea", u"Zeêuws", u"", u"Zeelandic"),
106
     (u"it-CH", u"Italiano", u"", u"Italian"),
106
     (u"it-CH", u"Italiano", u"", u"Italian"),
191
     (u"jam", u"Jamaican Creole English", u"", u"Patois"),
191
     (u"jam", u"Jamaican Creole English", u"", u"Patois"),
192
     (u"udm", u"Удмурт кыл", u"", u"Udmurt"),
192
     (u"udm", u"Удмурт кыл", u"", u"Udmurt"),
193
     (u"ksh", u"Ripoarisch", u"", u"Ripuarian"),
193
     (u"ksh", u"Ripoarisch", u"", u"Ripuarian"),
194
+    (u"sl-SI", u"Slovenščina", u"", u"Slovenian"),
194
     (u"ms-MY", u"Bahasa Melayu", u"", u"Malay"),
195
     (u"ms-MY", u"Bahasa Melayu", u"", u"Malay"),
195
     (u"de", u"Deutsch", u"", u"German"),
196
     (u"de", u"Deutsch", u"", u"German"),
196
     (u"da", u"Dansk", u"", u"Danish"),
197
     (u"da", u"Dansk", u"", u"Danish"),
284
     (u"mhr", u"Олык Марий (Olyk Marij)", u"", u"Meadow Mari"),
285
     (u"mhr", u"Олык Марий (Olyk Marij)", u"", u"Meadow Mari"),
285
     (u"ca-CT", u"Català", u"", u"Catalan"),
286
     (u"ca-CT", u"Català", u"", u"Catalan"),
286
     (u"en-MY", u"English", u"", u"English"),
287
     (u"en-MY", u"English", u"", u"English"),
288
+    (u"olo", u"Livvi-Karelian", u"", u"Livvinkarjala"),
287
     (u"sv-SE", u"Svenska", u"", u"Swedish"),
289
     (u"sv-SE", u"Svenska", u"", u"Swedish"),
288
     (u"de-AT", u"Deutsch", u"", u"German"),
290
     (u"de-AT", u"Deutsch", u"", u"German"),
289
     (u"hsb", u"Hornjoserbsce", u"", u"Upper Sorbian"),
291
     (u"hsb", u"Hornjoserbsce", u"", u"Upper Sorbian"),

+ 1
- 1
tests/unit/engines/test_duckduckgo.py Bestand weergeven

11
         query = 'test_query'
11
         query = 'test_query'
12
         dicto = defaultdict(dict)
12
         dicto = defaultdict(dict)
13
         dicto['pageno'] = 1
13
         dicto['pageno'] = 1
14
-        dicto['language'] = 'de_CH'
14
+        dicto['language'] = 'de-CH'
15
         dicto['time_range'] = ''
15
         dicto['time_range'] = ''
16
         params = duckduckgo.request(query, dicto)
16
         params = duckduckgo.request(query, dicto)
17
         self.assertIn('url', params)
17
         self.assertIn('url', params)

+ 4
- 0
tests/unit/engines/test_duckduckgo_definitions.py Bestand weergeven

21
         query = 'test_query'
21
         query = 'test_query'
22
         dicto = defaultdict(dict)
22
         dicto = defaultdict(dict)
23
         dicto['pageno'] = 1
23
         dicto['pageno'] = 1
24
+        dicto['language'] = 'es'
24
         params = duckduckgo_definitions.request(query, dicto)
25
         params = duckduckgo_definitions.request(query, dicto)
25
         self.assertIn('url', params)
26
         self.assertIn('url', params)
26
         self.assertIn(query, params['url'])
27
         self.assertIn(query, params['url'])
27
         self.assertIn('duckduckgo.com', params['url'])
28
         self.assertIn('duckduckgo.com', params['url'])
29
+        self.assertIn('headers', params)
30
+        self.assertIn('Accept-Language', params['headers'])
31
+        self.assertIn('es', params['headers']['Accept-Language'])
28
 
32
 
29
     def test_response(self):
33
     def test_response(self):
30
         self.assertRaises(AttributeError, duckduckgo_definitions.response, None)
34
         self.assertRaises(AttributeError, duckduckgo_definitions.response, None)

+ 1
- 1
tests/unit/engines/test_google.py Bestand weergeven

18
         query = 'test_query'
18
         query = 'test_query'
19
         dicto = defaultdict(dict)
19
         dicto = defaultdict(dict)
20
         dicto['pageno'] = 1
20
         dicto['pageno'] = 1
21
-        dicto['language'] = 'fr_FR'
21
+        dicto['language'] = 'fr-FR'
22
         dicto['time_range'] = ''
22
         dicto['time_range'] = ''
23
         params = google.request(query, dicto)
23
         params = google.request(query, dicto)
24
         self.assertIn('url', params)
24
         self.assertIn('url', params)

+ 1
- 1
tests/unit/engines/test_qwant.py Bestand weergeven

10
         query = 'test_query'
10
         query = 'test_query'
11
         dicto = defaultdict(dict)
11
         dicto = defaultdict(dict)
12
         dicto['pageno'] = 0
12
         dicto['pageno'] = 0
13
-        dicto['language'] = 'fr_FR'
13
+        dicto['language'] = 'fr-FR'
14
         qwant.categories = ['']
14
         qwant.categories = ['']
15
         params = qwant.request(query, dicto)
15
         params = qwant.request(query, dicto)
16
         self.assertIn('url', params)
16
         self.assertIn('url', params)

+ 1
- 1
tests/unit/engines/test_swisscows.py Bestand weergeven

10
         query = 'test_query'
10
         query = 'test_query'
11
         dicto = defaultdict(dict)
11
         dicto = defaultdict(dict)
12
         dicto['pageno'] = 1
12
         dicto['pageno'] = 1
13
-        dicto['language'] = 'de_DE'
13
+        dicto['language'] = 'de-DE'
14
         params = swisscows.request(query, dicto)
14
         params = swisscows.request(query, dicto)
15
         self.assertTrue('url' in params)
15
         self.assertTrue('url' in params)
16
         self.assertTrue(query in params['url'])
16
         self.assertTrue(query in params['url'])

+ 1
- 1
tests/unit/engines/test_wikipedia.py Bestand weergeven

10
     def test_request(self):
10
     def test_request(self):
11
         query = 'test_query'
11
         query = 'test_query'
12
         dicto = defaultdict(dict)
12
         dicto = defaultdict(dict)
13
-        dicto['language'] = 'fr_FR'
13
+        dicto['language'] = 'fr-FR'
14
         params = wikipedia.request(query, dicto)
14
         params = wikipedia.request(query, dicto)
15
         self.assertIn('url', params)
15
         self.assertIn('url', params)
16
         self.assertIn(query, params['url'])
16
         self.assertIn(query, params['url'])

+ 4
- 9
utils/update_languages.py Bestand weergeven

41
     if len(lang_code) > 2 or len(lang_code[0]) > 3:
41
     if len(lang_code) > 2 or len(lang_code[0]) > 3:
42
         return False
42
         return False
43
     if len(lang_code) == 2 and len(lang_code[1]) > 2:
43
     if len(lang_code) == 2 and len(lang_code[1]) > 2:
44
-        print lang_code
45
         return False
44
         return False
46
         
45
         
47
     return True
46
     return True
62
             english_name = td[1].xpath('./a')[0].text
61
             english_name = td[1].xpath('./a')[0].text
63
             articles = int(td[4].xpath('./a/b')[0].text.replace(',',''))
62
             articles = int(td[4].xpath('./a/b')[0].text.replace(',',''))
64
             
63
             
65
-            # exclude languages with few articles and language variants
66
-            if code not in languages and articles >= 100 and valid_code(code):
64
+            # exclude language variants and languages with few articles
65
+            if code not in languages and articles >= 1000 and valid_code(code):
67
                 languages[code] = (name, '', english_name)
66
                 languages[code] = (name, '', english_name)
68
 
67
 
69
 
68
 
90
                 # try to get language name
89
                 # try to get language name
91
                 language = languages.get(locale.split('-')[0], None)
90
                 language = languages.get(locale.split('-')[0], None)
92
                 if language == None:
91
                 if language == None:
93
-                    # print engine_name + ": " + locale
92
+                    print engine_name + ": " + locale
94
                     continue
93
                     continue
95
 
94
 
96
                 (name, country, english) = language
95
                 (name, country, english) = language
117
     new_file.close()
116
     new_file.close()
118
 
117
 
119
 
118
 
120
-def main():
119
+if __name__ == "__main__":
121
     get_wikipedia_languages()
120
     get_wikipedia_languages()
122
     get_google_languages()
121
     get_google_languages()
123
     join_language_lists()
122
     join_language_lists()
124
     write_languages_file()
123
     write_languages_file()
125
-
126
-
127
-if __name__ == "__main__":
128
-    main()