Kaynağa Gözat

Add language support for more engines.

marc 8 yıl önce
ebeveyn
işleme
a11948c71b

+ 18
- 0
searx/engines/dailymotion.py Dosyayı Görüntüle

@@ -20,6 +20,24 @@ from datetime import datetime
20 20
 categories = ['videos']
21 21
 paging = True
22 22
 language_support = True
23
+supported_languages = ["af", "ak", "am", "ar", "an", "as", "av", "ae", "ay", "az",
24
+                       "ba", "bm", "be", "bn", "bi", "bo", "bs", "br", "bg", "ca",
25
+                       "cs", "ch", "ce", "cu", "cv", "kw", "co", "cr", "cy", "da",
26
+                       "de", "dv", "dz", "el", "en", "eo", "et", "eu", "ee", "fo",
27
+                       "fa", "fj", "fi", "fr", "fy", "ff", "gd", "ga", "gl", "gv",
28
+                       "gn", "gu", "ht", "ha", "sh", "he", "hz", "hi", "ho", "hr",
29
+                       "hu", "hy", "ig", "io", "ii", "iu", "ie", "ia", "id", "ik",
30
+                       "is", "it", "jv", "ja", "kl", "kn", "ks", "ka", "kr", "kk",
31
+                       "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo",
32
+                       "la", "lv", "li", "ln", "lt", "lb", "lu", "lg", "mh", "ml",
33
+                       "mr", "mk", "mg", "mt", "mn", "mi", "ms", "my", "na", "nv",
34
+                       "nr", "nd", "ng", "ne", "nl", "nn", "nb", "no", "ny", "oc",
35
+                       "oj", "or", "om", "os", "pa", "pi", "pl", "pt", "ps", "qu",
36
+                       "rm", "ro", "rn", "ru", "sg", "sa", "si", "sk", "sl", "se",
37
+                       "sm", "sn", "sd", "so", "st", "es", "sq", "sc", "sr", "ss",
38
+                       "su", "sw", "sv", "ty", "ta", "tt", "te", "tg", "tl", "th",
39
+                       "ti", "to", "tn", "ts", "tk", "tr", "tw", "ug", "uk", "ur",
40
+                       "uz", "ve", "vi", "vo", "wa", "wo", "xh", "yi", "yo", "za", "zh", "zu"]
23 41
 
24 42
 # search-url
25 43
 # see http://www.dailymotion.com/doc/api/obj-video.html

+ 1
- 21
searx/engines/duckduckgo.py Dosyayı Görüntüle

@@ -16,7 +16,6 @@
16 16
 from urllib import urlencode
17 17
 from lxml.html import fromstring
18 18
 from searx.engines.xpath import extract_text
19
-from searx.languages import language_codes
20 19
 
21 20
 # engine dependent config
22 21
 categories = ['general']
@@ -76,26 +75,7 @@ def request(query, params):
76 75
         else:
77 76
             # tries to get a country code from language
78 77
             locale = locale[0].lower()
79
-            lang_codes = [x[0] for x in language_codes]
80
-            for lc in lang_codes:
81
-                lc = lc.split('-')
82
-                if locale == lc[0] and len(lc) == 2:
83
-                    locale = lc[1].lower() + '-' + lc[0].lower()
84
-                    break
85
-
86
-    if locale:
87
-        params['url'] = url.format(
88
-            query=urlencode({'q': query, 'kl': locale}), offset=offset)
89
-    else:
90
-        locale = params['language'].split('-')
91
-        if len(locale) == 2:
92
-            # country code goes first
93
-            locale = locale[1].lower() + '-' + locale[0].lower()
94
-        else:
95
-            # tries to get a country code from language
96
-            locale = locale[0].lower()
97
-            lang_codes = [x[0] for x in language_codes]
98
-            for lc in lang_codes:
78
+            for lc in supported_languages:
99 79
                 lc = lc.split('-')
100 80
                 if locale == lc[0]:
101 81
                     locale = lc[1].lower() + '-' + lc[0].lower()

+ 1
- 1
searx/engines/gigablast.py Dosyayı Görüntüle

@@ -44,7 +44,7 @@ supported_languages = ["en", "fr", "es", "ru", "tr", "ja", "zh-CN", "zh-TW", "ko
44 44
                        "nl", "it", "fi", "sv", "no", "pt", "vi", "ar", "he", "id", "el",
45 45
                        "th", "hi", "bn", "pl", "tl", "la", "eo", "ca", "bg", "tx", "sr",
46 46
                        "hu", "da", "lt", "cs", "gl", "ka", "gd", "go", "ro", "ga", "lv",
47
-                       "hy", "is", "ag", "gv", "io", "fa", "te", "vv", "mg", "ku", "lb", "et"] 
47
+                       "hy", "is", "ag", "gv", "io", "fa", "te", "vv", "mg", "ku", "lb", "et"]
48 48
 
49 49
 
50 50
 # do search-request

+ 14
- 1
searx/engines/qwant.py Dosyayı Görüntüle

@@ -20,6 +20,11 @@ from searx.utils import html_to_text
20 20
 categories = None
21 21
 paging = True
22 22
 language_support = True
23
+supported_languages = ["fr-FR", "de-DE", "en-GB", "it-IT", "es-ES", "pt-PT", "de-CH", "fr-CH", "it-CH", "de-AT",
24
+                       "fr-BE", "nl-BE", "nl-NL", "da-DK", "fi-FI", "sv-SE", "en-IE", "no-NO", "pl-PL", "ru-RU",
25
+                       "el-GR", "bg-BG", "cs-CZ", "et-EE", "hu-HU", "ro-RO", "en-US", "en-CA", "fr-CA", "pt-BR",
26
+                       "es-AR", "es-CL", "es-MX", "ja-JP", "en-SG", "en-IN", "en-MY", "ms-MY", "ko-KR", "tl-PH",
27
+                       "th-TH", "he-IL", "tr-TR", "en-AU", "en-NZ"]
23 28
 
24 29
 category_to_keyword = {'general': 'web',
25 30
                        'images': 'images',
@@ -46,7 +51,15 @@ def request(query, params):
46 51
 
47 52
     # add language tag if specified
48 53
     if params['language'] != 'all':
49
-        params['url'] += '&locale=' + params['language'].lower()
54
+        locale = params['language'].split('-')
55
+        if len(locale) == 2 and params['language'] in supported_languages:
56
+            params['url'] += '&locale=' + params['language'].replace('-', '_').lower()
57
+        else:
58
+            # try to get a country code for language
59
+            for lang in supported_languages:
60
+                if locale[0] == lang.split('-')[0]:
61
+                    params['url'] += '&locale=' + lang.replace('-', '_').lower()
62
+                    break
50 63
 
51 64
     return params
52 65
 

+ 5
- 0
searx/engines/startpage.py Dosyayı Görüntüle

@@ -24,6 +24,11 @@ categories = ['general']
24 24
 
25 25
 # paging = False
26 26
 language_support = True
27
+supported_languages = ["af", "de", "ar", "hy", "be", "bg", "ca", "cs", "zh-CN", "zh-TW",
28
+                       "ko", "hr", "da", "sk", "sl", "es", "eo", "et", "fi", "fr",
29
+                       "el", "iw", "hi", "nl", "hu", "id", "en", "is", "it", "ja",
30
+                       "lv", "lt", "no", "fa", "pl", "pt", "ro", "ru", "sr", "sw",
31
+                       "sv", "tl", "th", "tr", "uk", "vi"]
27 32
 
28 33
 # search-url
29 34
 base_url = 'https://startpage.com/'

+ 8
- 0
searx/engines/swisscows.py Dosyayı Görüntüle

@@ -18,6 +18,12 @@ import re
18 18
 categories = ['general', 'images']
19 19
 paging = True
20 20
 language_support = True
21
+supported_languages = ["ar-SA", "es-AR", "en-AU", "de-AT", "fr-BE", "nl-BE", "pt-BR", "bg-BG", "en-CA", "fr-CA",
22
+                       "es-CL", "zh-CN", "hr-HR", "cs-CZ", "da-DK", "et-EE", "fi-FI", "fr-FR", "de-DE", "el-GR",
23
+                       "zh-HK", "hu-HU", "en-IN", "en-IE", "he-IL", "it-IT", "ja-JP", "ko-KR", "lv-LV", "lt-LT",
24
+                       "en-MY", "es-MX", "nl-NL", "en-NZ", "nb-NO", "en-PH", "pl-PL", "pt-PT", "ro-RO", "ru-RU",
25
+                       "en-SG", "sk-SK", "sl-SI", "en-ZA", "es-ES", "sv-SE", "de-CH", "fr-CH", "zh-TW", "th-TH",
26
+                       "tr-TR", "uk-UA", "en-GB", "en-US", "es-US"]
21 27
 
22 28
 # search-url
23 29
 base_url = 'https://swisscows.ch/'
@@ -35,6 +41,8 @@ def request(query, params):
35 41
     if params['language'] == 'all':
36 42
         ui_language = 'browser'
37 43
         region = 'browser'
44
+    elif params['language'].split('-')[0] == 'no':
45
+        region = 'nb-NO'
38 46
     else:
39 47
         region = params['language']
40 48
         ui_language = params['language'].split('-')[0]

+ 3
- 1
searx/engines/yandex.py Dosyayı Görüntüle

@@ -22,7 +22,9 @@ language_support = True  # TODO
22 22
 
23 23
 default_tld = 'com'
24 24
 language_map = {'ru': 'ru',
25
-                'ua': 'uk',
25
+                'uk': 'ua',
26
+                'be': 'by',
27
+                'kk': 'kz',
26 28
                 'tr': 'com.tr'}
27 29
 
28 30
 # search-url

+ 3
- 1
searx/languages.py Dosyayı Görüntüle

@@ -100,7 +100,7 @@ language_codes = (
100 100
     (u"sa", u"संस्कृतम्", u"", u"Sanskrit"),
101 101
     (u"he-IL", u"עברית", u"", u"Hebrew"),
102 102
     (u"se", u"Sámegiella", u"", u"Northern Sami"),
103
-    (u"sd", u"سنڌي، سندھی ، सिन्ध", u"", u"Sindhi"),
103
+    (u"sd", u"سنڌي ،सिन्ध", u"", u"Sindhi"),
104 104
     (u"fr-CH", u"Français", u"", u"French"),
105 105
     (u"zea", u"Zeêuws", u"", u"Zeelandic"),
106 106
     (u"it-CH", u"Italiano", u"", u"Italian"),
@@ -191,6 +191,7 @@ language_codes = (
191 191
     (u"jam", u"Jamaican Creole English", u"", u"Patois"),
192 192
     (u"udm", u"Удмурт кыл", u"", u"Udmurt"),
193 193
     (u"ksh", u"Ripoarisch", u"", u"Ripuarian"),
194
+    (u"sl-SI", u"Slovenščina", u"", u"Slovenian"),
194 195
     (u"ms-MY", u"Bahasa Melayu", u"", u"Malay"),
195 196
     (u"de", u"Deutsch", u"", u"German"),
196 197
     (u"da", u"Dansk", u"", u"Danish"),
@@ -284,6 +285,7 @@ language_codes = (
284 285
     (u"mhr", u"Олык Марий (Olyk Marij)", u"", u"Meadow Mari"),
285 286
     (u"ca-CT", u"Català", u"", u"Catalan"),
286 287
     (u"en-MY", u"English", u"", u"English"),
288
+    (u"olo", u"Livvinkarjala", u"", u"Livvi-Karelian"),
287 289
     (u"sv-SE", u"Svenska", u"", u"Swedish"),
288 290
     (u"de-AT", u"Deutsch", u"", u"German"),
289 291
     (u"hsb", u"Hornjoserbsce", u"", u"Upper Sorbian"),

+ 1
- 1
tests/unit/engines/test_duckduckgo.py Dosyayı Görüntüle

@@ -11,7 +11,7 @@ class TestDuckduckgoEngine(SearxTestCase):
11 11
         query = 'test_query'
12 12
         dicto = defaultdict(dict)
13 13
         dicto['pageno'] = 1
14
-        dicto['language'] = 'de_CH'
14
+        dicto['language'] = 'de-CH'
15 15
         dicto['time_range'] = ''
16 16
         params = duckduckgo.request(query, dicto)
17 17
         self.assertIn('url', params)

+ 4
- 0
tests/unit/engines/test_duckduckgo_definitions.py Dosyayı Görüntüle

@@ -21,10 +21,14 @@ class TestDDGDefinitionsEngine(SearxTestCase):
21 21
         query = 'test_query'
22 22
         dicto = defaultdict(dict)
23 23
         dicto['pageno'] = 1
24
+        dicto['language'] = 'es'
24 25
         params = duckduckgo_definitions.request(query, dicto)
25 26
         self.assertIn('url', params)
26 27
         self.assertIn(query, params['url'])
27 28
         self.assertIn('duckduckgo.com', params['url'])
29
+        self.assertIn('headers', params)
30
+        self.assertIn('Accept-Language', params['headers'])
31
+        self.assertIn('es', params['headers']['Accept-Language'])
28 32
 
29 33
     def test_response(self):
30 34
         self.assertRaises(AttributeError, duckduckgo_definitions.response, None)

+ 1
- 1
tests/unit/engines/test_google.py Dosyayı Görüntüle

@@ -18,7 +18,7 @@ class TestGoogleEngine(SearxTestCase):
18 18
         query = 'test_query'
19 19
         dicto = defaultdict(dict)
20 20
         dicto['pageno'] = 1
21
-        dicto['language'] = 'fr_FR'
21
+        dicto['language'] = 'fr-FR'
22 22
         dicto['time_range'] = ''
23 23
         params = google.request(query, dicto)
24 24
         self.assertIn('url', params)

+ 1
- 1
tests/unit/engines/test_qwant.py Dosyayı Görüntüle

@@ -10,7 +10,7 @@ class TestQwantEngine(SearxTestCase):
10 10
         query = 'test_query'
11 11
         dicto = defaultdict(dict)
12 12
         dicto['pageno'] = 0
13
-        dicto['language'] = 'fr_FR'
13
+        dicto['language'] = 'fr-FR'
14 14
         qwant.categories = ['']
15 15
         params = qwant.request(query, dicto)
16 16
         self.assertIn('url', params)

+ 1
- 1
tests/unit/engines/test_swisscows.py Dosyayı Görüntüle

@@ -10,7 +10,7 @@ class TestSwisscowsEngine(SearxTestCase):
10 10
         query = 'test_query'
11 11
         dicto = defaultdict(dict)
12 12
         dicto['pageno'] = 1
13
-        dicto['language'] = 'de_DE'
13
+        dicto['language'] = 'de-DE'
14 14
         params = swisscows.request(query, dicto)
15 15
         self.assertTrue('url' in params)
16 16
         self.assertTrue(query in params['url'])

+ 1
- 1
tests/unit/engines/test_wikipedia.py Dosyayı Görüntüle

@@ -10,7 +10,7 @@ class TestWikipediaEngine(SearxTestCase):
10 10
     def test_request(self):
11 11
         query = 'test_query'
12 12
         dicto = defaultdict(dict)
13
-        dicto['language'] = 'fr_FR'
13
+        dicto['language'] = 'fr-FR'
14 14
         params = wikipedia.request(query, dicto)
15 15
         self.assertIn('url', params)
16 16
         self.assertIn(query, params['url'])

+ 4
- 9
utils/update_languages.py Dosyayı Görüntüle

@@ -41,7 +41,6 @@ def valid_code(lang_code):
41 41
     if len(lang_code) > 2 or len(lang_code[0]) > 3:
42 42
         return False
43 43
     if len(lang_code) == 2 and len(lang_code[1]) > 2:
44
-        print lang_code
45 44
         return False
46 45
         
47 46
     return True
@@ -62,8 +61,8 @@ def get_wikipedia_languages():
62 61
             english_name = td[1].xpath('./a')[0].text
63 62
             articles = int(td[4].xpath('./a/b')[0].text.replace(',',''))
64 63
             
65
-            # exclude languages with few articles and language variants
66
-            if code not in languages and articles >= 100 and valid_code(code):
64
+            # exclude language variants and languages with few articles
65
+            if code not in languages and articles >= 1000 and valid_code(code):
67 66
                 languages[code] = (name, '', english_name)
68 67
 
69 68
 
@@ -90,7 +89,7 @@ def join_language_lists():
90 89
                 # try to get language name
91 90
                 language = languages.get(locale.split('-')[0], None)
92 91
                 if language == None:
93
-                    # print engine_name + ": " + locale
92
+                    print engine_name + ": " + locale
94 93
                     continue
95 94
 
96 95
                 (name, country, english) = language
@@ -117,12 +116,8 @@ def write_languages_file():
117 116
     new_file.close()
118 117
 
119 118
 
120
-def main():
119
+if __name__ == "__main__":
121 120
     get_wikipedia_languages()
122 121
     get_google_languages()
123 122
     join_language_lists()
124 123
     write_languages_file()
125
-
126
-
127
-if __name__ == "__main__":
128
-    main()