
[mod] fetch supported languages for several engines

utils/fetch_languages.py gets the languages supported by each engine and
generates engines_languages.json with each engine's supported languages.
marc, 8 years ago
parent commit f62ce21f50
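
In outline, the commit splits language handling into an offline fetch step and a load step. A minimal sketch of both, using only names that appear in the diffs below:

    # offline, run by a maintainer (utils/fetch_languages.py):
    for engine_name in engines:
        if hasattr(engines[engine_name], 'fetch_supported_languages'):
            engines_languages[engine_name] = engines[engine_name].fetch_supported_languages()
    # the result is dumped to searx/data/engines_languages.json

    # at import time (searx/engines/__init__.py):
    languages = loads(open(engine_dir + '/../data/engines_languages.json').read())
    # and while loading each engine:
    if engine_data['name'] in languages:
        setattr(engine, 'supported_languages', languages[engine_data['name']])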

+ 3256 - 0  searx/data/engines_languages.json
File diff suppressed because it is too large
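
Judging from the fetch_supported_languages() implementations below, each engine in the suppressed file maps either to a plain list of locale codes or to a dict keyed by code; a rough illustration of the shape (values are illustrative, not taken from the actual file):

    {
        "bing": ["de-AT", "de-CH", "en-GB"],
        "wikipedia": {
            "en": {"name": "English", "english_name": "English", "articles": 5000000}
        }
    }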


+ 6 - 0  searx/engines/__init__.py

@@ -20,6 +20,7 @@
 import sys
 from flask_babel import gettext
 from operator import itemgetter
+from json import loads
 from searx import settings
 from searx import logger
 from searx.utils import load_module
@@ -78,6 +79,9 @@
         if not hasattr(engine, arg_name):
             setattr(engine, arg_name, arg_value)

+    if engine_data['name'] in languages:
+        setattr(engine, 'supported_languages', languages[engine_data['name']])
+
     # checking required variables
     for engine_attr in dir(engine):
         if engine_attr.startswith('_'):
@@ -207,6 +211,8 @@
     logger.error('No engines found. Edit your settings.yml')
     exit(2)

+languages = loads(open(engine_dir + '/../data/engines_languages.json').read())
+
 for engine_data in settings['engines']:
     engine = load_engine(engine_data)
     if engine is not None:

+ 15 - 0  searx/engines/bing.py

@@ -15,12 +15,14 @@

 from urllib import urlencode
 from lxml import html
+from requests import get
 from searx.engines.xpath import extract_text

 # engine dependent config
 categories = ['general']
 paging = True
 language_support = True
+supported_languages_url = 'https://www.bing.com/account/general'

 # search-url
 base_url = 'https://www.bing.com/'
@@ -81,3 +83,16 @@

     # return results
     return results
+
+
+# get supported languages from their site
+def fetch_supported_languages():
+    supported_languages = []
+    response = get(supported_languages_url)
+    dom = html.fromstring(response.text)
+    options = dom.xpath('//div[@id="limit-languages"]//input')
+    for option in options:
+        code = option.xpath('./@id')[0].replace('_', '-')
+        supported_languages.append(code)
+
+    return supported_languages

+ 1 - 1  searx/engines/bing_images.py

@@ -19,7 +19,7 @@
 from lxml import html
 from json import loads
 import re
-from searx.engines.bing import supported_languages
+from searx.engines.bing import fetch_supported_languages

 # engine dependent config
 categories = ['images']

+ 1 - 1  searx/engines/bing_news.py

@@ -17,7 +17,7 @@
 from dateutil import parser
 from lxml import etree
 from searx.utils import list_get
-from searx.engines.bing import supported_languages
+from searx.engines.bing import fetch_supported_languages

 # engine dependent config
 categories = ['news']

+ 23 - 18  searx/engines/dailymotion.py

@@ -15,29 +15,12 @@
 from urllib import urlencode
 from json import loads
 from datetime import datetime
+from requests import get

 # engine dependent config
 categories = ['videos']
 paging = True
 language_support = True
-supported_languages = ["af", "ak", "am", "ar", "an", "as", "av", "ae", "ay", "az",
-                       "ba", "bm", "be", "bn", "bi", "bo", "bs", "br", "bg", "ca",
-                       "cs", "ch", "ce", "cu", "cv", "kw", "co", "cr", "cy", "da",
-                       "de", "dv", "dz", "el", "en", "eo", "et", "eu", "ee", "fo",
-                       "fa", "fj", "fi", "fr", "fy", "ff", "gd", "ga", "gl", "gv",
-                       "gn", "gu", "ht", "ha", "sh", "he", "hz", "hi", "ho", "hr",
-                       "hu", "hy", "ig", "io", "ii", "iu", "ie", "ia", "id", "ik",
-                       "is", "it", "jv", "ja", "kl", "kn", "ks", "ka", "kr", "kk",
-                       "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo",
-                       "la", "lv", "li", "ln", "lt", "lb", "lu", "lg", "mh", "ml",
-                       "mr", "mk", "mg", "mt", "mn", "mi", "ms", "my", "na", "nv",
-                       "nr", "nd", "ng", "ne", "nl", "nn", "nb", "no", "ny", "oc",
-                       "oj", "or", "om", "os", "pa", "pi", "pl", "pt", "ps", "qu",
-                       "rm", "ro", "rn", "ru", "sg", "sa", "si", "sk", "sl", "se",
-                       "sm", "sn", "sd", "so", "st", "es", "sq", "sc", "sr", "ss",
-                       "su", "sw", "sv", "ty", "ta", "tt", "te", "tg", "tl", "th",
-                       "ti", "to", "tn", "ts", "tk", "tr", "tw", "ug", "uk", "ur",
-                       "uz", "ve", "vi", "vo", "wa", "wo", "xh", "yi", "yo", "za", "zh", "zu"]

 # search-url
 # see http://www.dailymotion.com/doc/api/obj-video.html
@@ -45,6 +28,8 @@
 embedded_url = '<iframe frameborder="0" width="540" height="304" ' +\
     'data-src="//www.dailymotion.com/embed/video/{videoid}" allowfullscreen></iframe>'

+supported_languages_url = 'https://api.dailymotion.com/languages'
+

 # do search-request
 def request(query, params):
@@ -92,3 +77,23 @@

     # return results
     return results
+
+
+# get supported languages from their site
+def fetch_supported_languages():
+    supported_languages = {}
+
+    response = get(supported_languages_url)
+    response_json = loads(response.text)
+
+    for language in response_json['list']:
+        supported_languages[language['code']] = {}
+
+        name = language['native_name']
+        if name:
+            supported_languages[language['code']]['name'] = name
+        english_name = language['name']
+        if english_name:
+            supported_languages[language['code']]['english_name'] = english_name
+
+    return supported_languages

+ 18 - 9  searx/engines/duckduckgo.py

@@ -15,19 +15,15 @@

 from urllib import urlencode
 from lxml.html import fromstring
+from requests import get
+from json import loads
 from searx.engines.xpath import extract_text

 # engine dependent config
 categories = ['general']
 paging = True
 language_support = True
-supported_languages = ["es-AR", "en-AU", "de-AT", "fr-BE", "nl-BE", "pt-BR", "bg-BG", "en-CA", "fr-CA", "ca-CT",
-                       "es-CL", "zh-CN", "es-CO", "hr-HR", "cs-CZ", "da-DK", "et-EE", "fi-FI", "fr-FR", "de-DE",
-                       "el-GR", "tzh-HK", "hu-HU", "en-IN", "id-ID", "en-ID", "en-IE", "he-IL", "it-IT", "jp-JP",
-                       "kr-KR", "es-XL", "lv-LV", "lt-LT", "ms-MY", "en-MY", "es-MX", "nl-NL", "en-NZ", "no-NO",
-                       "es-PE", "en-PH", "tl-PH", "pl-PL", "pt-PT", "ro-RO", "ru-RU", "ar-XA", "en-XA", "en-SG",
-                       "sk-SK", "sl-SL", "en-ZA", "es-ES", "ca-ES", "sv-SE", "de-CH", "fr-CH", "it-CH", "tzh-TW",
-                       "th-TH", "tr-TR", "uk-UA", "en-UK", "en-US", "es-US", "vi-VN"]
+supported_languages_url = 'https://duckduckgo.com/d2030.js'
 time_range_support = True

 # search-url
@@ -65,8 +61,6 @@
         locale = 'xa' + params['language'].split('-')[0]
     elif params['language'][-2:] == 'GB':
         locale = 'uk' + params['language'].split('-')[0]
-    elif params['language'] == 'es-419':
-        locale = 'xl-es'
     else:
         locale = params['language'].split('-')
         if len(locale) == 2:
@@ -120,3 +114,18 @@

     # return results
     return results
+
+
+# get supported languages from their site
+def fetch_supported_languages():
+    response = get(supported_languages_url)
+
+    # response is a js file with regions as an embedded object
+    response_page = response.text
+    response_page = response_page[response_page.find('regions:{') + 8:]
+    response_page = response_page[:response_page.find('}') + 1]
+
+    regions_json = loads(response_page)
+    supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys())
+
+    return supported_languages
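
The slicing in this fetch_supported_languages() assumes DuckDuckGo's regions object is keyed country-first; a worked example of the mapping (the 'us-en' key is illustrative):

    key = 'us-en'
    locale = key[3:] + '-' + key[:2].upper()  # 'en' + '-' + 'US' -> 'en-US'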

+ 1 - 1  searx/engines/duckduckgo_definitions.py

@@ -4,7 +4,7 @@
 from lxml import html
 from searx.utils import html_to_text
 from searx.engines.xpath import extract_text
-from searx.engines.duckduckgo import supported_languages
+from searx.engines.duckduckgo import fetch_supported_languages

 url = 'https://api.duckduckgo.com/'\
     + '?{query}&format=json&pretty=0&no_redirect=1&d=1'

+ 17 - 5  searx/engines/gigablast.py

@@ -14,6 +14,8 @@
 from random import randint
 from time import time
 from urllib import urlencode
+from requests import get
+from lxml.html import fromstring

 # engine dependent config
 categories = ['general']
@@ -40,11 +42,7 @@
 title_xpath = './/title'
 content_xpath = './/sum'

-supported_languages = ["en", "fr", "es", "ru", "tr", "ja", "zh-CN", "zh-TW", "ko", "de",
-                       "nl", "it", "fi", "sv", "no", "pt", "vi", "ar", "he", "id", "el",
-                       "th", "hi", "bn", "pl", "tl", "la", "eo", "ca", "bg", "tx", "sr",
-                       "hu", "da", "lt", "cs", "gl", "ka", "gd", "go", "ro", "ga", "lv",
-                       "hy", "is", "ag", "gv", "io", "fa", "te", "vv", "mg", "ku", "lb", "et"]
+supported_languages_url = 'https://gigablast.com/search?&rxikd=1'


 # do search-request
@@ -90,3 +88,17 @@

     # return results
     return results
+
+
+# get supported languages from their site
+def fetch_supported_languages():
+    supported_languages = []
+    response = get(supported_languages_url)
+    dom = fromstring(response.text)
+    links = dom.xpath('//span[@id="menu2"]/a')
+    for link in links:
+        code = link.xpath('./@href')[0][-2:]
+        if code != 'xx' and code not in supported_languages:
+            supported_languages.append(code)
+
+    return supported_languages

+ 16 - 14  searx/engines/google.py

@@ -12,6 +12,7 @@
 from urllib import urlencode
 from urlparse import urlparse, parse_qsl
 from lxml import html, etree
+from requests import get
 from searx.engines.xpath import extract_text, extract_url
 from searx.search import logger

@@ -23,20 +24,6 @@
 paging = True
 language_support = True
 use_locale_domain = True
-supported_languages = ["ach", "af", "ak", "az", "ms", "ban", "xx-bork", "bs", "br", "ca",
-                       "ceb", "ckb", "cs", "sn", "co", "cy", "da", "de", "yo", "et",
-                       "xx-elmer", "en", "es", "es-419", "eo", "eu", "ee", "tl", "fo", "fr",
-                       "gaa", "ga", "gd", "gl", "gn", "xx-hacker", "ht", "ha", "hr", "haw",
-                       "bem", "ig", "rn", "id", "ia", "zu", "is", "it", "jw", "rw", "sw",
-                       "tlh", "kg", "mfe", "kri", "la", "lv", "to", "lt", "ln", "loz",
-                       "lua", "lg", "hu", "mg", "mt", "mi", "nl", "pcm", "no", "nso",
-                       "ny", "nn", "uz", "oc", "om", "xx-pirate", "pl", "pt-BR", "pt-PT",
-                       "ro", "rm", "qu", "nyn", "crs", "sq", "sd", "sk", "sl", "so", "st",
-                       "sr-ME", "sr-Latn", "su", "fi", "sv", "tg", "tt", "vi", "tn", "tum",
-                       "tr", "tk", "tw", "fy", "wo", "xh", "el", "be", "bg", "ky", "kk", "mk",
-                       "mn", "ru", "sr", "uk", "ka", "hy", "yi", "iw", "ug", "ur", "ar", "ps",
-                       "fa", "ti", "am", "ne", "mr", "hi", "bn", "pa", "gu", "or", "ta", "te",
-                       "kn", "ml", "si", "th", "lo", "my", "km", "chr", "ko", "zh-CN", "zh-TW", "ja"]
 time_range_support = True

 # based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests
@@ -117,6 +104,7 @@
 maps_path = '/maps'
 redirect_path = '/url'
 images_path = '/images'
+supported_languages_url = 'https://www.google.com/preferences?#languages'

 # specific xpath variables
 results_xpath = '//div[@class="g"]'
@@ -373,3 +361,17 @@
         retval = retval + '<tr><th>' + a.get('label') + '</th><td>' + value + '</td></tr>'
     retval = retval + '</table>'
     return retval
+
+
+# get supported languages from their site
+def fetch_supported_languages():
+    supported_languages = {}
+    response = get(supported_languages_url)
+    dom = html.fromstring(response.text)
+    options = dom.xpath('//select[@name="hl"]/option')
+    for option in options:
+        code = option.xpath('./@value')[0].split('-')[0]
+        name = option.text[:-1].title()
+        supported_languages[code] = {"name": name}
+
+    return supported_languages

+ 1 - 1  searx/engines/google_news.py

@@ -13,7 +13,7 @@
 from lxml import html
 from urllib import urlencode
 from json import loads
-from searx.engines.google import supported_languages
+from searx.engines.google import fetch_supported_languages

 # search-url
 categories = ['news']

+ 0 - 1  searx/engines/mediawiki.py

@@ -15,7 +15,6 @@
 from json import loads
 from string import Formatter
 from urllib import urlencode, quote
-from searx.engines.wikipedia import supported_languages

 # engine dependent config
 categories = ['general']

+ 1 - 14  searx/engines/qwant.py

@@ -20,11 +20,6 @@
 categories = None
 paging = True
 language_support = True
-supported_languages = ["fr-FR", "de-DE", "en-GB", "it-IT", "es-ES", "pt-PT", "de-CH", "fr-CH", "it-CH", "de-AT",
-                       "fr-BE", "nl-BE", "nl-NL", "da-DK", "fi-FI", "sv-SE", "en-IE", "no-NO", "pl-PL", "ru-RU",
-                       "el-GR", "bg-BG", "cs-CZ", "et-EE", "hu-HU", "ro-RO", "en-US", "en-CA", "fr-CA", "pt-BR",
-                       "es-AR", "es-CL", "es-MX", "ja-JP", "en-SG", "en-IN", "en-MY", "ms-MY", "ko-KR", "tl-PH",
-                       "th-TH", "he-IL", "tr-TR", "en-AU", "en-NZ"]

 category_to_keyword = {'general': 'web',
                        'images': 'images',
@@ -51,15 +46,7 @@

     # add language tag if specified
     if params['language'] != 'all':
-        locale = params['language'].split('-')
-        if len(locale) == 2 and params['language'] in supported_languages:
-            params['url'] += '&locale=' + params['language'].replace('-', '_').lower()
-        else:
-            # try to get a country code for language
-            for lang in supported_languages:
-                if locale[0] == lang.split('-')[0]:
-                    params['url'] += '&locale=' + lang.replace('-', '_').lower()
-                    break
+        params['url'] += '&locale=' + params['language'].replace('-', '_').lower()

     return params


+ 0 - 5  searx/engines/startpage.py

@@ -24,11 +24,6 @@

 # paging = False
 language_support = True
-supported_languages = ["af", "de", "ar", "hy", "be", "bg", "ca", "cs", "zh-CN", "zh-TW",
-                       "ko", "hr", "da", "sk", "sl", "es", "eo", "et", "fi", "fr",
-                       "el", "iw", "hi", "nl", "hu", "id", "en", "is", "it", "ja",
-                       "lv", "lt", "no", "fa", "pl", "pt", "ro", "ru", "sr", "sw",
-                       "sv", "tl", "th", "tr", "uk", "vi"]

 # search-url
 base_url = 'https://startpage.com/'

+ 3 - 2  searx/engines/subtitleseeker.py

@@ -22,7 +22,7 @@

 # search-url
 url = 'http://www.subtitleseeker.com/'
-search_url = url + 'search/TITLES/{query}&p={pageno}'
+search_url = url + 'search/TITLES/{query}?p={pageno}'

 # specific xpath variables
 results_xpath = '//div[@class="boxRows"]'
@@ -51,7 +51,8 @@
     elif resp.search_params['language'] != 'all':
         search_lang = [lc[3]
                        for lc in language_codes
-                       if lc[0][:2] == resp.search_params['language'].split('_')[0]][0]
+                       if lc[0].split('-')[0] == resp.search_params['language'].split('-')[0]]
+        search_lang = search_lang[0].split(' (')[0]

     # parse results
     for result in dom.xpath(results_xpath):

+ 15 - 6  searx/engines/swisscows.py

@@ -13,17 +13,13 @@
 from json import loads
 from urllib import urlencode, unquote
 import re
+from requests import get
+from lxml.html import fromstring

 # engine dependent config
 categories = ['general', 'images']
 paging = True
 language_support = True
-supported_languages = ["ar-SA", "es-AR", "en-AU", "de-AT", "fr-BE", "nl-BE", "pt-BR", "bg-BG", "en-CA", "fr-CA",
-                       "es-CL", "zh-CN", "hr-HR", "cs-CZ", "da-DK", "et-EE", "fi-FI", "fr-FR", "de-DE", "el-GR",
-                       "zh-HK", "hu-HU", "en-IN", "en-IE", "he-IL", "it-IT", "ja-JP", "ko-KR", "lv-LV", "lt-LT",
-                       "en-MY", "es-MX", "nl-NL", "en-NZ", "nb-NO", "en-PH", "pl-PL", "pt-PT", "ro-RO", "ru-RU",
-                       "en-SG", "sk-SK", "sl-SI", "en-ZA", "es-ES", "sv-SE", "de-CH", "fr-CH", "zh-TW", "th-TH",
-                       "tr-TR", "uk-UA", "en-GB", "en-US", "es-US"]

 # search-url
 base_url = 'https://swisscows.ch/'
@@ -114,3 +110,16 @@

     # return results
     return results
+
+
+# get supported languages from their site
+def fetch_supported_languages():
+    supported_languages = []
+    response = get(base_url)
+    dom = fromstring(response.text)
+    options = dom.xpath('//div[@id="regions-popup"]//ul/li/a')
+    for option in options:
+        code = option.xpath('./@data-val')[0]
+        supported_languages.append(code)
+
+    return supported_languages

+ 3 - 3  searx/engines/wikidata.py

@@ -15,7 +15,7 @@
 from searx.poolrequests import get
 from searx.engines.xpath import extract_text
 from searx.utils import format_date_by_locale
-from searx.engines.wikipedia import supported_languages
+from searx.engines.wikipedia import fetch_supported_languages

 from json import loads
 from lxml.html import fromstring
@@ -57,7 +57,7 @@


 def request(query, params):
-    language = params['language'].split('_')[0]
+    language = params['language'].split('-')[0]
     if language == 'all':
         language = 'en'

@@ -72,7 +72,7 @@
     html = fromstring(resp.content)
     wikidata_ids = html.xpath(wikidata_ids_xpath)

-    language = resp.search_params['language'].split('_')[0]
+    language = resp.search_params['language'].split('-')[0]
     if language == 'all':
         language = 'en'


+ 24 - 29  searx/engines/wikipedia.py

@@ -12,36 +12,9 @@

 from json import loads
 from urllib import urlencode, quote
+from requests import get
+from lxml.html import fromstring

-supported_languages = ["en", "sv", "ceb", "de", "nl", "fr", "ru", "it", "es", "war",
-                       "pl", "vi", "ja", "pt", "zh", "uk", "ca", "fa", "no", "sh",
-                       "ar", "fi", "hu", "id", "ro", "cs", "ko", "sr", "ms", "tr",
-                       "eu", "eo", "min", "bg", "da", "kk", "sk", "hy", "he", "zh-min-nan",
-                       "lt", "hr", "sl", "et", "ce", "gl", "nn", "uz", "la", "vo",
-                       "el", "simple", "be", "az", "th", "ur", "ka", "hi", "oc", "ta",
-                       "mk", "mg", "new", "lv", "cy", "bs", "tt", "tl", "te", "pms",
-                       "be-tarask", "br", "sq", "ky", "ht", "jv", "tg", "ast", "zh-yue", "lb",
-                       "mr", "ml", "bn", "pnb", "is", "af", "sco", "ga", "ba", "fy",
-                       "cv", "lmo", "sw", "my", "an", "yo", "ne", "io", "gu", "nds",
-                       "scn", "bpy", "pa", "ku", "als", "kn", "bar", "ia", "qu", "su",
-                       "ckb", "bat-smg", "mn", "arz", "nap", "wa", "bug", "gd", "yi", "map-bms",
-                       "am", "mzn", "fo", "si", "nah", "li", "sah", "vec", "hsb", "or",
-                       "os", "mrj", "sa", "hif", "mhr", "roa-tara", "azb", "pam", "ilo",
-                       "sd", "ps", "se", "mi", "bh", "eml", "bcl", "xmf", "diq", "hak",
-                       "gan", "glk", "vls", "nds-nl", "rue", "bo", "fiu-vro", "co", "sc",
-                       "tk", "csb", "lrc", "vep", "wuu", "km", "szl", "gv", "crh", "kv",
-                       "zh-classical", "frr", "zea", "as", "so", "kw", "nso", "ay", "stq",
-                       "udm", "cdo", "nrm", "ie", "koi", "rm", "pcd", "myv", "mt", "fur",
-                       "ace", "lad", "gn", "lij", "dsb", "dv", "cbk-zam", "ext", "gom",
-                       "kab", "ksh", "ang", "mai", "mwl", "lez", "gag", "ln", "ug", "pi",
-                       "pag", "frp", "sn", "nv", "av", "pfl", "haw", "xal", "krc", "kaa",
-                       "rw", "bxr", "pdc", "to", "kl", "nov", "arc", "kbd", "lo", "bjn",
-                       "pap", "ha", "tet", "ki", "tyv", "tpi", "na", "lbe", "ig", "jbo",
-                       "roa-rup", "ty", "jam", "za", "kg", "mdf", "lg", "wo", "srn", "ab",
-                       "ltg", "zu", "sm", "chr", "om", "tn", "chy", "rmy", "cu", "tw", "tum",
-                       "xh", "bi", "rn", "pih", "got", "ss", "pnt", "bm", "ch", "mo", "ts",
-                       "ady", "iu", "st", "ee", "ny", "fj", "ks", "ak", "ik", "sg", "ve",
-                       "dz", "ff", "ti", "cr", "ng", "cho", "kj", "mh", "ho", "ii", "aa", "mus", "hz", "kr"]

 # search-url
 base_url = 'https://{language}.wikipedia.org/'
@@ -54,6 +27,7 @@
     '&explaintext'\
     '&pithumbsize=300'\
     '&redirects'
+supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'


 # set language in base_url
@@ -142,3 +116,24 @@
                     'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})

     return results
+
+
+# get supported languages from their site
+def fetch_supported_languages():
+    supported_languages = {}
+    response = get(supported_languages_url)
+    dom = fromstring(response.text)
+    tables = dom.xpath('//table[contains(@class,"sortable")]')
+    for table in tables:
+        # exclude header row
+        trs = table.xpath('.//tr')[1:]
+        for tr in trs:
+            td = tr.xpath('./td')
+            code = td[3].xpath('./a')[0].text
+            name = td[2].xpath('./a')[0].text
+            english_name = td[1].xpath('./a')[0].text
+            articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
+            if articles >= 10000:
+                supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles}
+
+    return supported_languages

+ 16 - 4  searx/engines/yahoo.py

@@ -14,16 +14,13 @@
 from urllib import urlencode
 from urlparse import unquote
 from lxml import html
+from requests import get
 from searx.engines.xpath import extract_text, extract_url

 # engine dependent config
 categories = ['general']
 paging = True
 language_support = True
-supported_languages = ["ar", "bg", "ca", "szh", "tzh", "hr", "cs", "da", "nl", "en",
-                       "et", "fi", "fr", "de", "el", "he", "hu", "is", "id", "it", "ja",
-                       "ko", "lv", "lt", "no", "fa", "pl", "pt", "ro", "ru", "sk", "sr",
-                       "sl", "es", "sv", "th", "tr"]
 time_range_support = True

 # search-url
@@ -31,6 +28,8 @@
 search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}'
 search_url_with_time = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}&age={age}&btf={btf}&fr2=time'

+supported_languages_url = 'https://search.yahoo.com/web/advanced'
+
 # specific xpath variables
 results_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' Sr ')]"
 url_xpath = './/h3/a/@href'
@@ -142,3 +141,16 @@

     # return results
     return results
+
+
+# get supported languages from their site
+def fetch_supported_languages():
+    supported_languages = []
+    response = get(supported_languages_url)
+    dom = html.fromstring(response.text)
+    options = dom.xpath('//div[@id="yschlang"]/span/label/input')
+    for option in options:
+        code = option.xpath('./@value')[0][5:]
+        supported_languages.append(code)
+
+    return supported_languages

+ 1 - 1  searx/engines/yahoo_news.py

@@ -12,7 +12,7 @@
 from urllib import urlencode
 from lxml import html
 from searx.engines.xpath import extract_text, extract_url
-from searx.engines.yahoo import parse_url, supported_languages
+from searx.engines.yahoo import parse_url, fetch_supported_languages
 from datetime import datetime, timedelta
 import re
 from dateutil import parser

+ 47 - 76  searx/languages.py

@@ -4,39 +4,29 @@

 language_codes = (
     (u"ach", u"Acoli", u"", u""),
-    (u"af", u"Afrikaans", u"", u"Afrikaans"),
+    (u"af", u"Afrikaans", u"", u""),
     (u"ak", u"Akan", u"", u""),
-    (u"als", u"Alemannisch", u"", u"Alemannic"),
-    (u"am", u"አማርኛ", u"", u"Amharic"),
-    (u"an", u"Aragonés", u"", u"Aragonese"),
+    (u"am", u"አማርኛ", u"", u""),
     (u"ar-SA", u"العربية", u"المملكة العربية السعودية", u"Arabic"),
-    (u"arz", u"مصرى (Maṣri)", u"", u"Egyptian Arabic"),
-    (u"ast", u"Asturianu", u"", u"Asturian"),
     (u"az", u"Azərbaycanca", u"", u"Azerbaijani"),
-    (u"azb", u"تۆرکجه", u"", u"South Azerbaijani"),
-    (u"ba", u"Башҡорт", u"", u"Bashkir"),
     (u"ban", u"Balinese", u"", u""),
-    (u"bar", u"Boarisch", u"", u"Bavarian"),
     (u"be", u"Беларуская", u"", u"Belarusian"),
     (u"bem", u"Ichibemba", u"", u""),
     (u"bg-BG", u"Български", u"България", u"Bulgarian"),
-    (u"bn", u"বাংলা", u"", u"Bengali"),
-    (u"bpy", u"ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী", u"", u"Bishnupriya Manipuri"),
-    (u"br", u"Brezhoneg", u"", u"Breton"),
-    (u"bs", u"Bosanski", u"", u"Bosnian"),
-    (u"bug", u"Basa Ugi", u"", u"Buginese"),
+    (u"bn", u"বাংলা", u"", u""),
+    (u"br", u"Brezhoneg", u"", u""),
+    (u"bs", u"Bosanski", u"", u""),
     (u"ca", u"Català", u"", u"Catalan"),
     (u"ca-CT", u"Català", u"", u"Catalan"),
     (u"ca-ES", u"Català", u"Espanya", u"Catalan"),
     (u"ce", u"Нохчийн", u"", u"Chechen"),
     (u"ceb", u"Sinugboanong Binisaya", u"", u"Cebuano"),
     (u"chr", u"ᏣᎳᎩ", u"", u""),
-    (u"ckb", u"Soranî / کوردی", u"", u"Sorani"),
+    (u"ckb", u"Central Kurdish", u"", u""),
     (u"co", u"Corsican", u"", u""),
     (u"crs", u"Seychellois Creole", u"", u""),
     (u"cs-CZ", u"Čeština", u"Česko", u"Czech"),
-    (u"cv", u"Чăваш", u"", u"Chuvash"),
-    (u"cy", u"Cymraeg", u"", u"Welsh"),
+    (u"cy", u"Cymraeg", u"", u""),
     (u"da-DK", u"Dansk", u"Danmark", u"Danish"),
     (u"de", u"Deutsch", u"", u"German"),
     (u"de-AT", u"Deutsch", u"Österreich", u"German"),
@@ -70,148 +60,129 @@
     (u"eu", u"Euskara", u"", u"Basque"),
     (u"fa", u"فارسی", u"", u"Persian"),
     (u"fi-FI", u"Suomi", u"Suomi", u"Finnish"),
-    (u"fo", u"Føroyskt", u"", u"Faroese"),
+    (u"fo", u"Føroyskt", u"", u""),
     (u"fr", u"Français", u"", u"French"),
     (u"fr-BE", u"Français", u"Belgique", u"French"),
     (u"fr-CA", u"Français", u"Canada", u"French"),
     (u"fr-CH", u"Français", u"Suisse", u"French"),
     (u"fr-FR", u"Français", u"France", u"French"),
-    (u"fy", u"Frysk", u"", u"West Frisian"),
-    (u"ga", u"Gaeilge", u"", u"Irish"),
+    (u"fy", u"West-Frysk", u"", u""),
+    (u"ga", u"Gaeilge", u"", u""),
     (u"gaa", u"Ga", u"", u""),
-    (u"gd", u"Gàidhlig", u"", u"Scottish Gaelic"),
+    (u"gd", u"Gàidhlig", u"", u""),
     (u"gl", u"Galego", u"", u"Galician"),
     (u"gn", u"Guarani", u"", u""),
-    (u"gu", u"ગુજરાતી", u"", u"Gujarati"),
+    (u"gu", u"ગુજરાતી", u"", u""),
     (u"ha", u"Hausa", u"", u""),
     (u"haw", u"ʻŌlelo HawaiʻI", u"", u""),
     (u"he-IL", u"עברית", u"ישראל", u"Hebrew"),
     (u"hi", u"हिन्दी", u"", u"Hindi"),
     (u"hr-HR", u"Hrvatski", u"Hrvatska", u"Croatian"),
-    (u"hsb", u"Hornjoserbsce", u"", u"Upper Sorbian"),
-    (u"ht", u"Krèyol ayisyen", u"", u"Haitian"),
+    (u"ht", u"Haitian Creole", u"", u""),
     (u"hu-HU", u"Magyar", u"Magyarország", u"Hungarian"),
     (u"hy", u"Հայերեն", u"", u"Armenian"),
-    (u"ia", u"Interlingua", u"", u"Interlingua"),
+    (u"ia", u"Interlingua", u"", u""),
     (u"id-ID", u"Bahasa Indonesia", u"Indonesia", u"Indonesian"),
     (u"ig", u"Igbo", u"", u""),
-    (u"io", u"Ido", u"", u"Ido"),
-    (u"is", u"Íslenska", u"", u"Icelandic"),
+    (u"is", u"Íslenska", u"", u""),
     (u"it", u"Italiano", u"", u"Italian"),
     (u"it-CH", u"Italiano", u"Svizzera", u"Italian"),
     (u"it-IT", u"Italiano", u"Italia", u"Italian"),
     (u"iw", u"עברית", u"", u""),
     (u"ja-JP", u"日本語", u"日本", u"Japanese"),
-    (u"jv", u"Basa Jawa", u"", u"Javanese"),
     (u"ka", u"ქართული", u"", u"Georgian"),
     (u"kg", u"Kongo", u"", u""),
     (u"kk", u"Қазақша", u"", u"Kazakh"),
     (u"km", u"ខ្មែរ", u"", u""),
-    (u"kn", u"ಕನ್ನಡ", u"", u"Kannada"),
+    (u"kn", u"ಕನ್ನಡ", u"", u""),
     (u"ko-KR", u"한국어", u"대한민국", u"Korean"),
-    (u"kri", u"Krio (Sierra Leone)", u"", u""),
-    (u"ku", u"Kurdî / كوردی", u"", u"Kurdish"),
-    (u"ky", u"Кыргызча", u"", u"Kirghiz"),
+    (u"kri", u"Krio", u"", u""),
+    (u"ky", u"Кыргызча", u"", u""),
     (u"la", u"Latina", u"", u"Latin"),
-    (u"lb", u"Lëtzebuergesch", u"", u"Luxembourgish"),
     (u"lg", u"Luganda", u"", u""),
-    (u"li", u"Limburgs", u"", u"Limburgish"),
-    (u"lmo", u"Lumbaart", u"", u"Lombard"),
     (u"ln", u"Lingála", u"", u""),
     (u"lo", u"ລາວ", u"", u""),
     (u"loz", u"Lozi", u"", u""),
     (u"lt-LT", u"Lietuvių", u"Lietuva", u"Lithuanian"),
     (u"lua", u"Luba-Lulua", u"", u""),
-    (u"lv-LV", u"Latviešu", u"Latvijas Republika", u"Latvian"),
+    (u"lv-LV", u"Latviešu", u"Latvijas Republika", u""),
     (u"mfe", u"Kreol Morisien", u"", u""),
-    (u"mg", u"Malagasy", u"", u"Malagasy"),
+    (u"mg", u"Malagasy", u"", u""),
     (u"mi", u"Maori", u"", u""),
     (u"min", u"Minangkabau", u"", u"Minangkabau"),
-    (u"mk", u"Македонски", u"", u"Macedonian"),
-    (u"ml", u"മലയാളം", u"", u"Malayalam"),
-    (u"mn", u"Монгол", u"", u"Mongolian"),
-    (u"mr", u"मराठी", u"", u"Marathi"),
-    (u"mrj", u"Кырык Мары (Kyryk Mary)", u"", u"Hill Mari"),
+    (u"mk", u"Македонски", u"", u""),
+    (u"ml", u"മലയാളം", u"", u""),
+    (u"mn", u"Монгол", u"", u""),
+    (u"mr", u"मराठी", u"", u""),
     (u"ms-MY", u"Bahasa Melayu", u"Malaysia", u"Malay"),
     (u"mt", u"Malti", u"", u""),
-    (u"my", u"မြန်မာဘာသာ", u"", u"Burmese"),
-    (u"mzn", u"مَزِروني", u"", u"Mazandarani"),
-    (u"nah", u"Nāhuatl", u"", u"Nahuatl"),
-    (u"nap", u"Nnapulitano", u"", u"Neapolitan"),
-    (u"nds-nl", u"Plattdüütsch", u"Nedderlannen", u"Low Saxon"),
-    (u"ne", u"नेपाली", u"", u"Nepali"),
-    (u"new", u"नेपाल भाषा", u"", u"Newar"),
+    (u"my", u"ဗမာ", u"", u""),
+    (u"nb-NO", u"Norwegian Bokmål", u"Norge", u"Norwegian Bokmål"),
+    (u"ne", u"नेपाली", u"", u""),
     (u"nl", u"Nederlands", u"", u"Dutch"),
     (u"nl-BE", u"Nederlands", u"België", u"Dutch"),
     (u"nl-NL", u"Nederlands", u"Nederland", u"Dutch"),
-    (u"nn", u"Nynorsk", u"", u"Norwegian (Nynorsk)"),
-    (u"no-NO", u"Norsk (Bokmål)", u"Norge", u"Norwegian (Bokmål)"),
+    (u"nn", u"Nynorsk", u"", u"Norwegian"),
+    (u"no-NO", u"Norsk", u"Norge", u"Norwegian"),
     (u"nso", u"Northern Sotho", u"", u""),
     (u"ny", u"Nyanja", u"", u""),
     (u"nyn", u"Runyankore", u"", u""),
-    (u"oc", u"Occitan", u"", u"Occitan"),
+    (u"oc", u"Occitan", u"", u""),
     (u"om", u"Oromoo", u"", u""),
-    (u"or", u"ଓଡ଼ିଆ", u"", u"Oriya"),
-    (u"os", u"Иронау", u"", u"Ossetian"),
-    (u"pa", u"ਪੰਜਾਬੀ", u"", u"Punjabi"),
+    (u"or", u"ଓଡ଼ିଆ", u"", u""),
+    (u"pa", u"ਪੰਜਾਬੀ", u"", u""),
     (u"pcm", u"Nigerian Pidgin", u"", u""),
     (u"pl-PL", u"Polski", u"Rzeczpospolita Polska", u"Polish"),
-    (u"pms", u"Piemontèis", u"", u"Piedmontese"),
-    (u"pnb", u"شاہ مکھی پنجابی (Shāhmukhī Pañjābī)", u"", u"Western Punjabi"),
     (u"ps", u"پښتو", u"", u""),
     (u"pt", u"Português", u"", u"Portuguese"),
     (u"pt-BR", u"Português", u"Brasil", u"Portuguese"),
     (u"pt-PT", u"Português", u"Portugal", u"Portuguese"),
-    (u"qu", u"Runa Simi", u"", u"Quechua"),
+    (u"qu", u"Runasimi", u"", u""),
     (u"rm", u"Rumantsch", u"", u""),
     (u"rn", u"Ikirundi", u"", u""),
     (u"ro-RO", u"Română", u"România", u"Romanian"),
     (u"ru-RU", u"Русский", u"Россия", u"Russian"),
     (u"rw", u"Kinyarwanda", u"", u""),
-    (u"sa", u"संस्कृतम्", u"", u"Sanskrit"),
-    (u"sah", u"Саха тыла (Saxa Tyla)", u"", u"Sakha"),
-    (u"scn", u"Sicilianu", u"", u"Sicilian"),
-    (u"sco", u"Scots", u"", u"Scots"),
     (u"sd", u"Sindhi", u"", u""),
     (u"sh", u"Srpskohrvatski / Српскохрватски", u"", u"Serbo-Croatian"),
-    (u"si", u"සිංහල", u"", u"Sinhalese"),
+    (u"si", u"සිංහල", u"", u""),
     (u"sk-SK", u"Slovenčina", u"Slovenská republika", u"Slovak"),
-    (u"sl-SI", u"Slovenščina", u"Slovenija", u"Slovenian"),
+    (u"sl", u"Slovenščina", u"", u"Slovenian"),
     (u"sn", u"Chishona", u"", u""),
     (u"so", u"Soomaali", u"", u""),
-    (u"sq", u"Shqip", u"", u"Albanian"),
-    (u"sr-ME", u"Српски / Srpski", u"Црна Гора", u"Serbian"),
+    (u"sq", u"Shqip", u"", u""),
+    (u"sr", u"Српски / Srpski", u"", u"Serbian"),
     (u"st", u"Southern Sotho", u"", u""),
-    (u"su", u"Basa Sunda", u"", u"Sundanese"),
+    (u"su", u"Sundanese", u"", u""),
     (u"sv-SE", u"Svenska", u"Sverige", u"Swedish"),
-    (u"sw", u"Kiswahili", u"", u"Swahili"),
-    (u"ta", u"தமிழ்", u"", u"Tamil"),
-    (u"te", u"తెలుగు", u"", u"Telugu"),
-    (u"tg", u"Тоҷикӣ", u"", u"Tajik"),
+    (u"sw", u"Kiswahili", u"", u""),
+    (u"ta", u"தமிழ்", u"", u""),
+    (u"te", u"తెలుగు", u"", u""),
+    (u"tg", u"Tajik", u"", u""),
     (u"th-TH", u"ไทย", u"ไทย", u"Thai"),
     (u"ti", u"ትግርኛ", u"", u""),
     (u"tk", u"Turkmen", u"", u""),
-    (u"tl-PH", u"Tagalog", u"Pilipinas", u"Tagalog"),
+    (u"tl-PH", u"Filipino", u"Pilipinas", u""),
     (u"tlh", u"Klingon", u"", u""),
     (u"tn", u"Tswana", u"", u""),
     (u"to", u"Lea Fakatonga", u"", u""),
     (u"tr-TR", u"Türkçe", u"Türkiye", u"Turkish"),
-    (u"tt", u"Tatarça / Татарча", u"", u"Tatar"),
+    (u"tt", u"Tatar", u"", u""),
     (u"tum", u"Tumbuka", u"", u""),
     (u"tw", u"Twi", u"", u""),
     (u"ug", u"ئۇيغۇرچە", u"", u""),
     (u"uk-UA", u"Українська", u"Україна", u"Ukrainian"),
     (u"ur", u"اردو", u"", u"Urdu"),
     (u"uz", u"O‘zbek", u"", u"Uzbek"),
-    (u"vec", u"Vèneto", u"", u"Venetian"),
+    (u"ve", u"Venda", u"", u"Venda"),
     (u"vi-VN", u"Tiếng Việt", u"Công Hòa Xã Hội Chủ Nghĩa Việt Nam", u"Vietnamese"),
     (u"vo", u"Volapük", u"", u"Volapük"),
     (u"wa", u"Walon", u"", u"Walloon"),
     (u"war", u"Winaray", u"", u"Waray-Waray"),
     (u"wo", u"Wolof", u"", u""),
     (u"xh", u"Xhosa", u"", u""),
-    (u"yi", u"ייִדיש", u"", u"Yiddish"),
-    (u"yo", u"Yorùbá", u"", u"Yoruba"),
+    (u"yi", u"ייִדיש", u"", u""),
+    (u"yo", u"Èdè Yorùbá", u"", u""),
     (u"zh", u"中文", u"", u"Chinese"),
     (u"zh-CN", u"中文", u"中国", u"Chinese"),
     (u"zh-HK", u"中文", u"香港", u"Chinese"),
+ 1 - 1  searx/webapp.py

@@ -514,7 +514,7 @@
         answers=result_container.answers,
         infoboxes=result_container.infoboxes,
         paging=result_container.paging,
-        current_language=search.lang,
+        current_language=search_query.lang,
         base_url=get_base_url(),
         theme=get_current_theme_name(),
         favicons=global_favicons[themes.index(get_current_theme_name())]

+ 1 - 1  tests/unit/engines/test_subtitleseeker.py

@@ -17,7 +17,7 @@

     def test_response(self):
         dicto = defaultdict(dict)
-        dicto['language'] = 'fr_FR'
+        dicto['language'] = 'fr-FR'
         response = mock.Mock(search_params=dicto)

         self.assertRaises(AttributeError, subtitleseeker.response, None)

+ 2 - 0  tests/unit/engines/test_wikipedia.py

@@ -8,6 +8,8 @@
 class TestWikipediaEngine(SearxTestCase):

     def test_request(self):
+        wikipedia.supported_languages = ['fr', 'en']
+
         query = 'test_query'
         dicto = defaultdict(dict)
         dicto['language'] = 'fr-FR'

+ 164 - 0  utils/fetch_languages.py

@@ -0,0 +1,164 @@
+# -*- coding: utf-8 -*-
+
+# This script generates languages.py from intersecting each engine's supported languages.
+#
+# The country names are obtained from http://api.geonames.org which requires registering as a user.
+#
+# Output files (engines_languages.json and languages.py)
+# are written in current directory to avoid overwriting in case something goes wrong.
+
+from requests import get
+from urllib import urlencode
+from lxml.html import fromstring
+from json import loads, dumps
+import io
+from sys import path
+path.append('../searx')  # noqa
+from searx.engines import engines
+
+# Geonames API for country names.
+geonames_user = ''  # ADD USER NAME HERE
+country_names_url = 'http://api.geonames.org/countryInfoJSON?{parameters}'
+
+# Output files.
+engines_languages_file = 'engines_languages.json'
+languages_file = 'languages.py'
+
+engines_languages = {}
+languages = {}
+
+
+# To filter out invalid codes and dialects.
+def valid_code(lang_code):
+    # filter invalid codes
+    # sl-SL is technically not invalid, but still a mistake
+    if lang_code[:2] == 'xx'\
+       or lang_code == 'sl-SL'\
+       or lang_code == 'wt-WT'\
+       or lang_code == 'jw'\
+       or lang_code[-2:] == 'UK'\
+       or lang_code[-2:] == 'XA'\
+       or lang_code[-2:] == 'XL':
+        return False
+
+    # filter dialects
+    lang_code = lang_code.split('-')
+    if len(lang_code) > 2 or len(lang_code[0]) > 3:
+        return False
+    if len(lang_code) == 2 and len(lang_code[1]) > 2:
+        return False
+
+    return True
+
+
+# Get country name in specified language.
+def get_country_name(locale):
+    if geonames_user is '':
+        return ''
+
+    locale = locale.split('-')
+    if len(locale) != 2:
+        return ''
+
+    url = country_names_url.format(parameters=urlencode({'lang': locale[0],
+                                                         'country': locale[1],
+                                                         'username': geonames_user}))
+    response = get(url)
+    json = loads(response.text)
+    content = json.get('geonames', None)
+    if content is None or len(content) != 1:
+        print "No country name found for " + locale[0] + "-" + locale[1]
+        return ''
+
+    return content[0].get('countryName', '')
+
+
+# Fetchs supported languages for each engine and writes json file with those.
+def fetch_supported_languages():
+    for engine_name in engines:
+        if hasattr(engines[engine_name], 'fetch_supported_languages'):
+            try:
+                engines_languages[engine_name] = engines[engine_name].fetch_supported_languages()
+            except Exception as e:
+                print e
+
+    # write json file
+    f = io.open(engines_languages_file, "w", encoding="utf-8")
+    f.write(unicode(dumps(engines_languages, indent=4, ensure_ascii=False, encoding="utf-8")))
+    f.close()
+
+
+# Join all language lists.
+# Iterate all languages supported by each engine.
+def join_language_lists():
+    # include wikipedia first for more accurate language names
+    # exclude languages with too few articles
+    languages.update({code: lang for code, lang
+                      in engines_languages['wikipedia'].iteritems()
+                      if valid_code(code) and lang['articles'] >= 100000})
+
+    for engine_name in engines_languages:
+        for locale in engines_languages[engine_name]:
+            if not valid_code(locale):
+                continue
+
+            # if language is not on list or if it has no name yet
+            if locale not in languages or not languages[locale].get('name'):
+                if isinstance(engines_languages[engine_name], dict) \
+                  and engines_languages[engine_name][locale].get('articles', float('inf')) >= 100000:
+                    languages[locale] = engines_languages[engine_name][locale]
+                else:
+                    languages[locale] = {}
+
+    # get locales that have no name yet
+    for locale in languages.keys():
+        if not languages[locale].get('name'):
+            # try to get language and country names
+            name = languages.get(locale.split('-')[0], {}).get('name', None)
+            if name:
+                languages[locale]['name'] = name
+                languages[locale]['country'] = get_country_name(locale) or ''
+                languages[locale]['english_name'] = languages.get(locale.split('-')[0], {}).get('english_name', '')
+            else:
+                # filter out locales with no name
+                del languages[locale]
+
+
+# Remove countryless language if language is featured in only one country.
+def filter_single_country_languages():
+    prev_lang = None
+    for code in sorted(languages):
+        lang = code.split('-')[0]
+        if lang == prev_lang:
+            countries += 1
+        else:
+            if prev_lang is not None and countries == 1:
+                del languages[prev_lang]
+            countries = 0
+            prev_lang = lang
+
+
+# Write languages.py.
+def write_languages_file():
+    new_file = open(languages_file, 'w')
+    file_content = '# -*- coding: utf-8 -*-\n'
+    file_content += '# list of language codes\n'
+    file_content += '# this file is generated automatically by utils/update_search_languages.py\n'
+    file_content += '\nlanguage_codes = ('
+    for code in sorted(languages):
+        file_content += '\n    (u"' + code + '"'\
+                        + ', u"' + languages[code]['name'].split(' (')[0] + '"'\
+                        + ', u"' + languages[code].get('country', '') + '"'\
+                        + ', u"' + languages[code].get('english_name', '').split(' (')[0] + '"),'
+    # remove last comma
+    file_content = file_content[:-1]
+    file_content += '\n)\n'
+    new_file.write(file_content.encode('utf8'))
+    new_file.close()
+
+
+if __name__ == "__main__":
+    fetch_supported_languages()
+    join_language_lists()
+    filter_single_country_languages()
+    write_languages_file()

+ 0 - 169  utils/update_languages.py

@@ -1,169 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# This script generates languages.py from
-# intersecting each engine's supported languages.
-#
-# The language's native names are obtained from
-# Wikipedia and Google's supported languages.
-#
-# The country names are obtained from http://api.geonames.org
-# which requires registering as a user.
-#
-# Output file (languages.py) is written in current directory
-# to avoid overwriting in case something goes wrong.
-
-from requests import get
-from urllib import urlencode
-from lxml.html import fromstring
-from json import loads
-from sys import path
-path.append('../searx')
-from searx.engines import engines
-
-# list of names
-wiki_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
-google_languages_url = 'https://www.google.com/preferences?#languages'
-country_names_url = 'http://api.geonames.org/countryInfoJSON?{parameters}'
-
-geonames_user = ''  # add user name here
-
-google_json_name = 'google.preferences.langMap'
-
-languages = {}
-
-
-# To filter out invalid codes and dialects.
-def valid_code(lang_code):
-    # filter invalid codes
-    # sl-SL is technically not invalid, but still a mistake
-    if lang_code[:2] == 'xx'\
-       or lang_code == 'sl-SL'\
-       or lang_code == 'jw'\
-       or lang_code[-2:] == 'UK'\
-       or lang_code[-2:] == 'XA'\
-       or lang_code[-2:] == 'XL':
-        return False
-
-    # filter dialects
-    lang_code = lang_code.split('-')
-    if len(lang_code) > 2 or len(lang_code[0]) > 3:
-        return False
-    if len(lang_code) == 2 and len(lang_code[1]) > 2:
-        return False
-
-    return True
-
-
-# Get country name in specified language.
-def get_country_name(locale):
-    if geonames_user is '':
-        return ''
-
-    locale = locale.split('-')
-    if len(locale) != 2:
-        return ''
-
-    url = country_names_url.format(parameters=urlencode({'lang': locale[0],
-                                                         'country': locale[1],
-                                                         'username': geonames_user}))
-    response = get(url)
-    json = loads(response.text)
-    content = json.get('geonames', None)
-    if content is None or len(content) != 1:
-        print "No country name found for " + locale[0] + "-" + locale[1]
-        print json
-        return ''
-
-    return content[0].get('countryName', '')
-
-
-# Get language names from Wikipedia.
-def get_wikipedia_languages():
-    response = get(wiki_languages_url)
-    dom = fromstring(response.text)
-    tables = dom.xpath('//table[contains(@class,"sortable")]')
-    for table in tables:
-        # exclude header row
-        trs = table.xpath('.//tr')[1:]
-        for tr in trs:
-            td = tr.xpath('./td')
-            code = td[3].xpath('./a')[0].text
-            name = td[2].xpath('./a')[0].text
-            english_name = td[1].xpath('./a')[0].text
-            articles = int(td[4].xpath('./a/b')[0].text.replace(',',''))
-
-            # exclude language variants and languages with few articles
-            if code not in languages and articles >= 10000 and valid_code(code):
-                languages[code] = (name, '', english_name)
-
-
-# Get language names from Google.
-def get_google_languages():
-    response = get(google_languages_url)
-    dom = fromstring(response.text)
-    options = dom.xpath('//select[@name="hl"]/option')
-    for option in options:
-        code = option.xpath('./@value')[0].split('-')[0]
-        name = option.text[:-1].title()
-
-        if code not in languages and valid_code(code):
-            languages[code] = (name, '', '')
-
-
-# Join all language lists.
-# iterate all languages supported by each engine
-def join_language_lists():
-    for engine_name in engines:
-        for locale in engines[engine_name].supported_languages:
-            locale = locale.replace('_', '-')
-            if locale not in languages and valid_code(locale):
-                # try to get language name
-                language = languages.get(locale.split('-')[0], None)
-                if language == None:
-                    print engine_name + ": " + locale
-                    continue
-
-                country = get_country_name(locale)
-                languages[locale] = (language[0], country, language[2])
-
-
-# Remove countryless language if language is featured in only one country.
-def filter_single_country_languages():
-    prev_lang = None
-    for code in sorted(languages):
-        lang = code.split('-')[0]
-        if lang == prev_lang:
-            countries += 1
-        else:
-            if prev_lang is not None and countries == 1:
-                del languages[prev_lang]
-            countries = 0
-            prev_lang = lang
-
-
-# Write languages.py.
-def write_languages_file():
-    new_file = open('languages.py', 'w')
-    file_content = '# -*- coding: utf-8 -*-\n'
-    file_content += '# list of language codes\n'
-    file_content += '# this file is generated automatically by utils/update_search_languages.py\n'
-    file_content += '\nlanguage_codes = ('
-    for code in sorted(languages):
-        (name, country, english) = languages[code]
-        file_content += '\n    (u"' + code + '"'\
-                        + ', u"' + name + '"'\
-                        + ', u"' + country + '"'\
-                        + ', u"' + english + '"),'
-    # remove last comma
-    file_content = file_content[:-1]
-    file_content += '\n)\n'
-    new_file.write(file_content.encode('utf8'))
-    new_file.close()
-
-
-if __name__ == "__main__":
-    get_wikipedia_languages()
-    get_google_languages()
-    join_language_lists()
-    filter_single_country_languages()
-    write_languages_file()