소스 검색

[mod] fetch supported languages for several engines

utils/fetch_languages.py gets languages supported by each engine and
generates engines_languages.json with each engine's supported language.
marc 8년 전
부모
커밋
f62ce21f50

+ 3256
- 0
searx/data/engines_languages.json
파일 크기가 너무 크기 때문에 변경 상태를 표시하지 않습니다.
파일 보기


+ 6
- 0
searx/engines/__init__.py 파일 보기

@@ -20,6 +20,7 @@ from os.path import realpath, dirname
20 20
 import sys
21 21
 from flask_babel import gettext
22 22
 from operator import itemgetter
23
+from json import loads
23 24
 from searx import settings
24 25
 from searx import logger
25 26
 from searx.utils import load_module
@@ -78,6 +79,9 @@ def load_engine(engine_data):
78 79
         if not hasattr(engine, arg_name):
79 80
             setattr(engine, arg_name, arg_value)
80 81
 
82
+    if engine_data['name'] in languages:
83
+        setattr(engine, 'supported_languages', languages[engine_data['name']])
84
+
81 85
     # checking required variables
82 86
     for engine_attr in dir(engine):
83 87
         if engine_attr.startswith('_'):
@@ -207,6 +211,8 @@ if 'engines' not in settings or not settings['engines']:
207 211
     logger.error('No engines found. Edit your settings.yml')
208 212
     exit(2)
209 213
 
214
+languages = loads(open(engine_dir + '/../data/engines_languages.json').read())
215
+
210 216
 for engine_data in settings['engines']:
211 217
     engine = load_engine(engine_data)
212 218
     if engine is not None:

+ 15
- 0
searx/engines/bing.py 파일 보기

@@ -15,12 +15,14 @@
15 15
 
16 16
 from urllib import urlencode
17 17
 from lxml import html
18
+from requests import get
18 19
 from searx.engines.xpath import extract_text
19 20
 
20 21
 # engine dependent config
21 22
 categories = ['general']
22 23
 paging = True
23 24
 language_support = True
25
+supported_languages_url = 'https://www.bing.com/account/general'
24 26
 
25 27
 # search-url
26 28
 base_url = 'https://www.bing.com/'
@@ -81,3 +83,16 @@ def response(resp):
81 83
 
82 84
     # return results
83 85
     return results
86
+
87
+
88
+# get supported languages from their site
89
+def fetch_supported_languages():
90
+    supported_languages = []
91
+    response = get(supported_languages_url)
92
+    dom = html.fromstring(response.text)
93
+    options = dom.xpath('//div[@id="limit-languages"]//input')
94
+    for option in options:
95
+        code = option.xpath('./@id')[0].replace('_', '-')
96
+        supported_languages.append(code)
97
+
98
+    return supported_languages

+ 1
- 1
searx/engines/bing_images.py 파일 보기

@@ -19,7 +19,7 @@ from urllib import urlencode
19 19
 from lxml import html
20 20
 from json import loads
21 21
 import re
22
-from searx.engines.bing import supported_languages
22
+from searx.engines.bing import fetch_supported_languages
23 23
 
24 24
 # engine dependent config
25 25
 categories = ['images']

+ 1
- 1
searx/engines/bing_news.py 파일 보기

@@ -17,7 +17,7 @@ from datetime import datetime
17 17
 from dateutil import parser
18 18
 from lxml import etree
19 19
 from searx.utils import list_get
20
-from searx.engines.bing import supported_languages
20
+from searx.engines.bing import fetch_supported_languages
21 21
 
22 22
 # engine dependent config
23 23
 categories = ['news']

+ 23
- 18
searx/engines/dailymotion.py 파일 보기

@@ -15,29 +15,12 @@
15 15
 from urllib import urlencode
16 16
 from json import loads
17 17
 from datetime import datetime
18
+from requests import get
18 19
 
19 20
 # engine dependent config
20 21
 categories = ['videos']
21 22
 paging = True
22 23
 language_support = True
23
-supported_languages = ["af", "ak", "am", "ar", "an", "as", "av", "ae", "ay", "az",
24
-                       "ba", "bm", "be", "bn", "bi", "bo", "bs", "br", "bg", "ca",
25
-                       "cs", "ch", "ce", "cu", "cv", "kw", "co", "cr", "cy", "da",
26
-                       "de", "dv", "dz", "el", "en", "eo", "et", "eu", "ee", "fo",
27
-                       "fa", "fj", "fi", "fr", "fy", "ff", "gd", "ga", "gl", "gv",
28
-                       "gn", "gu", "ht", "ha", "sh", "he", "hz", "hi", "ho", "hr",
29
-                       "hu", "hy", "ig", "io", "ii", "iu", "ie", "ia", "id", "ik",
30
-                       "is", "it", "jv", "ja", "kl", "kn", "ks", "ka", "kr", "kk",
31
-                       "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo",
32
-                       "la", "lv", "li", "ln", "lt", "lb", "lu", "lg", "mh", "ml",
33
-                       "mr", "mk", "mg", "mt", "mn", "mi", "ms", "my", "na", "nv",
34
-                       "nr", "nd", "ng", "ne", "nl", "nn", "nb", "no", "ny", "oc",
35
-                       "oj", "or", "om", "os", "pa", "pi", "pl", "pt", "ps", "qu",
36
-                       "rm", "ro", "rn", "ru", "sg", "sa", "si", "sk", "sl", "se",
37
-                       "sm", "sn", "sd", "so", "st", "es", "sq", "sc", "sr", "ss",
38
-                       "su", "sw", "sv", "ty", "ta", "tt", "te", "tg", "tl", "th",
39
-                       "ti", "to", "tn", "ts", "tk", "tr", "tw", "ug", "uk", "ur",
40
-                       "uz", "ve", "vi", "vo", "wa", "wo", "xh", "yi", "yo", "za", "zh", "zu"]
41 24
 
42 25
 # search-url
43 26
 # see http://www.dailymotion.com/doc/api/obj-video.html
@@ -45,6 +28,8 @@ search_url = 'https://api.dailymotion.com/videos?fields=created_time,title,descr
45 28
 embedded_url = '<iframe frameborder="0" width="540" height="304" ' +\
46 29
     'data-src="//www.dailymotion.com/embed/video/{videoid}" allowfullscreen></iframe>'
47 30
 
31
+supported_languages_url = 'https://api.dailymotion.com/languages'
32
+
48 33
 
49 34
 # do search-request
50 35
 def request(query, params):
@@ -92,3 +77,23 @@ def response(resp):
92 77
 
93 78
     # return results
94 79
     return results
80
+
81
+
82
+# get supported languages from their site
83
+def fetch_supported_languages():
84
+    supported_languages = {}
85
+
86
+    response = get(supported_languages_url)
87
+    response_json = loads(response.text)
88
+
89
+    for language in response_json['list']:
90
+        supported_languages[language['code']] = {}
91
+
92
+        name = language['native_name']
93
+        if name:
94
+            supported_languages[language['code']]['name'] = name
95
+        english_name = language['name']
96
+        if english_name:
97
+            supported_languages[language['code']]['english_name'] = english_name
98
+
99
+    return supported_languages

+ 18
- 9
searx/engines/duckduckgo.py 파일 보기

@@ -15,19 +15,15 @@
15 15
 
16 16
 from urllib import urlencode
17 17
 from lxml.html import fromstring
18
+from requests import get
19
+from json import loads
18 20
 from searx.engines.xpath import extract_text
19 21
 
20 22
 # engine dependent config
21 23
 categories = ['general']
22 24
 paging = True
23 25
 language_support = True
24
-supported_languages = ["es-AR", "en-AU", "de-AT", "fr-BE", "nl-BE", "pt-BR", "bg-BG", "en-CA", "fr-CA", "ca-CT",
25
-                       "es-CL", "zh-CN", "es-CO", "hr-HR", "cs-CZ", "da-DK", "et-EE", "fi-FI", "fr-FR", "de-DE",
26
-                       "el-GR", "tzh-HK", "hu-HU", "en-IN", "id-ID", "en-ID", "en-IE", "he-IL", "it-IT", "jp-JP",
27
-                       "kr-KR", "es-XL", "lv-LV", "lt-LT", "ms-MY", "en-MY", "es-MX", "nl-NL", "en-NZ", "no-NO",
28
-                       "es-PE", "en-PH", "tl-PH", "pl-PL", "pt-PT", "ro-RO", "ru-RU", "ar-XA", "en-XA", "en-SG",
29
-                       "sk-SK", "sl-SL", "en-ZA", "es-ES", "ca-ES", "sv-SE", "de-CH", "fr-CH", "it-CH", "tzh-TW",
30
-                       "th-TH", "tr-TR", "uk-UA", "en-UK", "en-US", "es-US", "vi-VN"]
26
+supported_languages_url = 'https://duckduckgo.com/d2030.js'
31 27
 time_range_support = True
32 28
 
33 29
 # search-url
@@ -65,8 +61,6 @@ def request(query, params):
65 61
         locale = 'xa' + params['language'].split('-')[0]
66 62
     elif params['language'][-2:] == 'GB':
67 63
         locale = 'uk' + params['language'].split('-')[0]
68
-    elif params['language'] == 'es-419':
69
-        locale = 'xl-es'
70 64
     else:
71 65
         locale = params['language'].split('-')
72 66
         if len(locale) == 2:
@@ -120,3 +114,18 @@ def response(resp):
120 114
 
121 115
     # return results
122 116
     return results
117
+
118
+
119
+# get supported languages from their site
120
+def fetch_supported_languages():
121
+    response = get(supported_languages_url)
122
+
123
+    # response is a js file with regions as an embedded object
124
+    response_page = response.text
125
+    response_page = response_page[response_page.find('regions:{') + 8:]
126
+    response_page = response_page[:response_page.find('}') + 1]
127
+
128
+    regions_json = loads(response_page)
129
+    supported_languages = map((lambda x: x[3:] + '-' + x[:2].upper()), regions_json.keys())
130
+
131
+    return supported_languages

+ 1
- 1
searx/engines/duckduckgo_definitions.py 파일 보기

@@ -4,7 +4,7 @@ from re import compile, sub
4 4
 from lxml import html
5 5
 from searx.utils import html_to_text
6 6
 from searx.engines.xpath import extract_text
7
-from searx.engines.duckduckgo import supported_languages
7
+from searx.engines.duckduckgo import fetch_supported_languages
8 8
 
9 9
 url = 'https://api.duckduckgo.com/'\
10 10
     + '?{query}&format=json&pretty=0&no_redirect=1&d=1'

+ 17
- 5
searx/engines/gigablast.py 파일 보기

@@ -14,6 +14,8 @@ from json import loads
14 14
 from random import randint
15 15
 from time import time
16 16
 from urllib import urlencode
17
+from requests import get
18
+from lxml.html import fromstring
17 19
 
18 20
 # engine dependent config
19 21
 categories = ['general']
@@ -40,11 +42,7 @@ url_xpath = './/url'
40 42
 title_xpath = './/title'
41 43
 content_xpath = './/sum'
42 44
 
43
-supported_languages = ["en", "fr", "es", "ru", "tr", "ja", "zh-CN", "zh-TW", "ko", "de",
44
-                       "nl", "it", "fi", "sv", "no", "pt", "vi", "ar", "he", "id", "el",
45
-                       "th", "hi", "bn", "pl", "tl", "la", "eo", "ca", "bg", "tx", "sr",
46
-                       "hu", "da", "lt", "cs", "gl", "ka", "gd", "go", "ro", "ga", "lv",
47
-                       "hy", "is", "ag", "gv", "io", "fa", "te", "vv", "mg", "ku", "lb", "et"]
45
+supported_languages_url = 'https://gigablast.com/search?&rxikd=1'
48 46
 
49 47
 
50 48
 # do search-request
@@ -90,3 +88,17 @@ def response(resp):
90 88
 
91 89
     # return results
92 90
     return results
91
+
92
+
93
+# get supported languages from their site
94
+def fetch_supported_languages():
95
+    supported_languages = []
96
+    response = get(supported_languages_url)
97
+    dom = fromstring(response.text)
98
+    links = dom.xpath('//span[@id="menu2"]/a')
99
+    for link in links:
100
+        code = link.xpath('./@href')[0][-2:]
101
+        if code != 'xx' and code not in supported_languages:
102
+            supported_languages.append(code)
103
+
104
+    return supported_languages

+ 16
- 14
searx/engines/google.py 파일 보기

@@ -12,6 +12,7 @@ import re
12 12
 from urllib import urlencode
13 13
 from urlparse import urlparse, parse_qsl
14 14
 from lxml import html, etree
15
+from requests import get
15 16
 from searx.engines.xpath import extract_text, extract_url
16 17
 from searx.search import logger
17 18
 
@@ -23,20 +24,6 @@ categories = ['general']
23 24
 paging = True
24 25
 language_support = True
25 26
 use_locale_domain = True
26
-supported_languages = ["ach", "af", "ak", "az", "ms", "ban", "xx-bork", "bs", "br", "ca",
27
-                       "ceb", "ckb", "cs", "sn", "co", "cy", "da", "de", "yo", "et",
28
-                       "xx-elmer", "en", "es", "es-419", "eo", "eu", "ee", "tl", "fo", "fr",
29
-                       "gaa", "ga", "gd", "gl", "gn", "xx-hacker", "ht", "ha", "hr", "haw",
30
-                       "bem", "ig", "rn", "id", "ia", "zu", "is", "it", "jw", "rw", "sw",
31
-                       "tlh", "kg", "mfe", "kri", "la", "lv", "to", "lt", "ln", "loz",
32
-                       "lua", "lg", "hu", "mg", "mt", "mi", "nl", "pcm", "no", "nso",
33
-                       "ny", "nn", "uz", "oc", "om", "xx-pirate", "pl", "pt-BR", "pt-PT",
34
-                       "ro", "rm", "qu", "nyn", "crs", "sq", "sd", "sk", "sl", "so", "st",
35
-                       "sr-ME", "sr-Latn", "su", "fi", "sv", "tg", "tt", "vi", "tn", "tum",
36
-                       "tr", "tk", "tw", "fy", "wo", "xh", "el", "be", "bg", "ky", "kk", "mk",
37
-                       "mn", "ru", "sr", "uk", "ka", "hy", "yi", "iw", "ug", "ur", "ar", "ps",
38
-                       "fa", "ti", "am", "ne", "mr", "hi", "bn", "pa", "gu", "or", "ta", "te",
39
-                       "kn", "ml", "si", "th", "lo", "my", "km", "chr", "ko", "zh-CN", "zh-TW", "ja"]
40 27
 time_range_support = True
41 28
 
42 29
 # based on https://en.wikipedia.org/wiki/List_of_Google_domains and tests
@@ -117,6 +104,7 @@ map_hostname_start = 'maps.google.'
117 104
 maps_path = '/maps'
118 105
 redirect_path = '/url'
119 106
 images_path = '/images'
107
+supported_languages_url = 'https://www.google.com/preferences?#languages'
120 108
 
121 109
 # specific xpath variables
122 110
 results_xpath = '//div[@class="g"]'
@@ -373,3 +361,17 @@ def attributes_to_html(attributes):
373 361
         retval = retval + '<tr><th>' + a.get('label') + '</th><td>' + value + '</td></tr>'
374 362
     retval = retval + '</table>'
375 363
     return retval
364
+
365
+
366
+# get supported languages from their site
367
+def fetch_supported_languages():
368
+    supported_languages = {}
369
+    response = get(supported_languages_url)
370
+    dom = html.fromstring(response.text)
371
+    options = dom.xpath('//select[@name="hl"]/option')
372
+    for option in options:
373
+        code = option.xpath('./@value')[0].split('-')[0]
374
+        name = option.text[:-1].title()
375
+        supported_languages[code] = {"name": name}
376
+
377
+    return supported_languages

+ 1
- 1
searx/engines/google_news.py 파일 보기

@@ -13,7 +13,7 @@
13 13
 from lxml import html
14 14
 from urllib import urlencode
15 15
 from json import loads
16
-from searx.engines.google import supported_languages
16
+from searx.engines.google import fetch_supported_languages
17 17
 
18 18
 # search-url
19 19
 categories = ['news']

+ 0
- 1
searx/engines/mediawiki.py 파일 보기

@@ -15,7 +15,6 @@
15 15
 from json import loads
16 16
 from string import Formatter
17 17
 from urllib import urlencode, quote
18
-from searx.engines.wikipedia import supported_languages
19 18
 
20 19
 # engine dependent config
21 20
 categories = ['general']

+ 1
- 14
searx/engines/qwant.py 파일 보기

@@ -20,11 +20,6 @@ from searx.utils import html_to_text
20 20
 categories = None
21 21
 paging = True
22 22
 language_support = True
23
-supported_languages = ["fr-FR", "de-DE", "en-GB", "it-IT", "es-ES", "pt-PT", "de-CH", "fr-CH", "it-CH", "de-AT",
24
-                       "fr-BE", "nl-BE", "nl-NL", "da-DK", "fi-FI", "sv-SE", "en-IE", "no-NO", "pl-PL", "ru-RU",
25
-                       "el-GR", "bg-BG", "cs-CZ", "et-EE", "hu-HU", "ro-RO", "en-US", "en-CA", "fr-CA", "pt-BR",
26
-                       "es-AR", "es-CL", "es-MX", "ja-JP", "en-SG", "en-IN", "en-MY", "ms-MY", "ko-KR", "tl-PH",
27
-                       "th-TH", "he-IL", "tr-TR", "en-AU", "en-NZ"]
28 23
 
29 24
 category_to_keyword = {'general': 'web',
30 25
                        'images': 'images',
@@ -51,15 +46,7 @@ def request(query, params):
51 46
 
52 47
     # add language tag if specified
53 48
     if params['language'] != 'all':
54
-        locale = params['language'].split('-')
55
-        if len(locale) == 2 and params['language'] in supported_languages:
56
-            params['url'] += '&locale=' + params['language'].replace('-', '_').lower()
57
-        else:
58
-            # try to get a country code for language
59
-            for lang in supported_languages:
60
-                if locale[0] == lang.split('-')[0]:
61
-                    params['url'] += '&locale=' + lang.replace('-', '_').lower()
62
-                    break
49
+        params['url'] += '&locale=' + params['language'].replace('-', '_').lower()
63 50
 
64 51
     return params
65 52
 

+ 0
- 5
searx/engines/startpage.py 파일 보기

@@ -24,11 +24,6 @@ categories = ['general']
24 24
 
25 25
 # paging = False
26 26
 language_support = True
27
-supported_languages = ["af", "de", "ar", "hy", "be", "bg", "ca", "cs", "zh-CN", "zh-TW",
28
-                       "ko", "hr", "da", "sk", "sl", "es", "eo", "et", "fi", "fr",
29
-                       "el", "iw", "hi", "nl", "hu", "id", "en", "is", "it", "ja",
30
-                       "lv", "lt", "no", "fa", "pl", "pt", "ro", "ru", "sr", "sw",
31
-                       "sv", "tl", "th", "tr", "uk", "vi"]
32 27
 
33 28
 # search-url
34 29
 base_url = 'https://startpage.com/'

+ 3
- 2
searx/engines/subtitleseeker.py 파일 보기

@@ -22,7 +22,7 @@ language = ""
22 22
 
23 23
 # search-url
24 24
 url = 'http://www.subtitleseeker.com/'
25
-search_url = url + 'search/TITLES/{query}&p={pageno}'
25
+search_url = url + 'search/TITLES/{query}?p={pageno}'
26 26
 
27 27
 # specific xpath variables
28 28
 results_xpath = '//div[@class="boxRows"]'
@@ -51,7 +51,8 @@ def response(resp):
51 51
     elif resp.search_params['language'] != 'all':
52 52
         search_lang = [lc[3]
53 53
                        for lc in language_codes
54
-                       if lc[0][:2] == resp.search_params['language'].split('_')[0]][0]
54
+                       if lc[0].split('-')[0] == resp.search_params['language'].split('-')[0]]
55
+        search_lang = search_lang[0].split(' (')[0]
55 56
 
56 57
     # parse results
57 58
     for result in dom.xpath(results_xpath):

+ 15
- 6
searx/engines/swisscows.py 파일 보기

@@ -13,17 +13,13 @@
13 13
 from json import loads
14 14
 from urllib import urlencode, unquote
15 15
 import re
16
+from requests import get
17
+from lxml.html import fromstring
16 18
 
17 19
 # engine dependent config
18 20
 categories = ['general', 'images']
19 21
 paging = True
20 22
 language_support = True
21
-supported_languages = ["ar-SA", "es-AR", "en-AU", "de-AT", "fr-BE", "nl-BE", "pt-BR", "bg-BG", "en-CA", "fr-CA",
22
-                       "es-CL", "zh-CN", "hr-HR", "cs-CZ", "da-DK", "et-EE", "fi-FI", "fr-FR", "de-DE", "el-GR",
23
-                       "zh-HK", "hu-HU", "en-IN", "en-IE", "he-IL", "it-IT", "ja-JP", "ko-KR", "lv-LV", "lt-LT",
24
-                       "en-MY", "es-MX", "nl-NL", "en-NZ", "nb-NO", "en-PH", "pl-PL", "pt-PT", "ro-RO", "ru-RU",
25
-                       "en-SG", "sk-SK", "sl-SI", "en-ZA", "es-ES", "sv-SE", "de-CH", "fr-CH", "zh-TW", "th-TH",
26
-                       "tr-TR", "uk-UA", "en-GB", "en-US", "es-US"]
27 23
 
28 24
 # search-url
29 25
 base_url = 'https://swisscows.ch/'
@@ -114,3 +110,16 @@ def response(resp):
114 110
 
115 111
     # return results
116 112
     return results
113
+
114
+
115
+# get supported languages from their site
116
+def fetch_supported_languages():
117
+    supported_languages = []
118
+    response = get(base_url)
119
+    dom = fromstring(response.text)
120
+    options = dom.xpath('//div[@id="regions-popup"]//ul/li/a')
121
+    for option in options:
122
+        code = option.xpath('./@data-val')[0]
123
+        supported_languages.append(code)
124
+
125
+    return supported_languages

+ 3
- 3
searx/engines/wikidata.py 파일 보기

@@ -15,7 +15,7 @@ from searx import logger
15 15
 from searx.poolrequests import get
16 16
 from searx.engines.xpath import extract_text
17 17
 from searx.utils import format_date_by_locale
18
-from searx.engines.wikipedia import supported_languages
18
+from searx.engines.wikipedia import fetch_supported_languages
19 19
 
20 20
 from json import loads
21 21
 from lxml.html import fromstring
@@ -57,7 +57,7 @@ calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]'
57 57
 
58 58
 
59 59
 def request(query, params):
60
-    language = params['language'].split('_')[0]
60
+    language = params['language'].split('-')[0]
61 61
     if language == 'all':
62 62
         language = 'en'
63 63
 
@@ -72,7 +72,7 @@ def response(resp):
72 72
     html = fromstring(resp.content)
73 73
     wikidata_ids = html.xpath(wikidata_ids_xpath)
74 74
 
75
-    language = resp.search_params['language'].split('_')[0]
75
+    language = resp.search_params['language'].split('-')[0]
76 76
     if language == 'all':
77 77
         language = 'en'
78 78
 

+ 24
- 29
searx/engines/wikipedia.py 파일 보기

@@ -12,36 +12,9 @@
12 12
 
13 13
 from json import loads
14 14
 from urllib import urlencode, quote
15
+from requests import get
16
+from lxml.html import fromstring
15 17
 
16
-supported_languages = ["en", "sv", "ceb", "de", "nl", "fr", "ru", "it", "es", "war",
17
-                       "pl", "vi", "ja", "pt", "zh", "uk", "ca", "fa", "no", "sh",
18
-                       "ar", "fi", "hu", "id", "ro", "cs", "ko", "sr", "ms", "tr",
19
-                       "eu", "eo", "min", "bg", "da", "kk", "sk", "hy", "he", "zh-min-nan",
20
-                       "lt", "hr", "sl", "et", "ce", "gl", "nn", "uz", "la", "vo",
21
-                       "el", "simple", "be", "az", "th", "ur", "ka", "hi", "oc", "ta",
22
-                       "mk", "mg", "new", "lv", "cy", "bs", "tt", "tl", "te", "pms",
23
-                       "be-tarask", "br", "sq", "ky", "ht", "jv", "tg", "ast", "zh-yue", "lb",
24
-                       "mr", "ml", "bn", "pnb", "is", "af", "sco", "ga", "ba", "fy",
25
-                       "cv", "lmo", "sw", "my", "an", "yo", "ne", "io", "gu", "nds",
26
-                       "scn", "bpy", "pa", "ku", "als", "kn", "bar", "ia", "qu", "su",
27
-                       "ckb", "bat-smg", "mn", "arz", "nap", "wa", "bug", "gd", "yi", "map-bms",
28
-                       "am", "mzn", "fo", "si", "nah", "li", "sah", "vec", "hsb", "or",
29
-                       "os", "mrj", "sa", "hif", "mhr", "roa-tara", "azb", "pam", "ilo",
30
-                       "sd", "ps", "se", "mi", "bh", "eml", "bcl", "xmf", "diq", "hak",
31
-                       "gan", "glk", "vls", "nds-nl", "rue", "bo", "fiu-vro", "co", "sc",
32
-                       "tk", "csb", "lrc", "vep", "wuu", "km", "szl", "gv", "crh", "kv",
33
-                       "zh-classical", "frr", "zea", "as", "so", "kw", "nso", "ay", "stq",
34
-                       "udm", "cdo", "nrm", "ie", "koi", "rm", "pcd", "myv", "mt", "fur",
35
-                       "ace", "lad", "gn", "lij", "dsb", "dv", "cbk-zam", "ext", "gom",
36
-                       "kab", "ksh", "ang", "mai", "mwl", "lez", "gag", "ln", "ug", "pi",
37
-                       "pag", "frp", "sn", "nv", "av", "pfl", "haw", "xal", "krc", "kaa",
38
-                       "rw", "bxr", "pdc", "to", "kl", "nov", "arc", "kbd", "lo", "bjn",
39
-                       "pap", "ha", "tet", "ki", "tyv", "tpi", "na", "lbe", "ig", "jbo",
40
-                       "roa-rup", "ty", "jam", "za", "kg", "mdf", "lg", "wo", "srn", "ab",
41
-                       "ltg", "zu", "sm", "chr", "om", "tn", "chy", "rmy", "cu", "tw", "tum",
42
-                       "xh", "bi", "rn", "pih", "got", "ss", "pnt", "bm", "ch", "mo", "ts",
43
-                       "ady", "iu", "st", "ee", "ny", "fj", "ks", "ak", "ik", "sg", "ve",
44
-                       "dz", "ff", "ti", "cr", "ng", "cho", "kj", "mh", "ho", "ii", "aa", "mus", "hz", "kr"]
45 18
 
46 19
 # search-url
47 20
 base_url = 'https://{language}.wikipedia.org/'
@@ -54,6 +27,7 @@ search_postfix = 'w/api.php?'\
54 27
     '&explaintext'\
55 28
     '&pithumbsize=300'\
56 29
     '&redirects'
30
+supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
57 31
 
58 32
 
59 33
 # set language in base_url
@@ -142,3 +116,24 @@ def response(resp):
142 116
                     'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})
143 117
 
144 118
     return results
119
+
120
+
121
+# get supported languages from their site
122
+def fetch_supported_languages():
123
+    supported_languages = {}
124
+    response = get(supported_languages_url)
125
+    dom = fromstring(response.text)
126
+    tables = dom.xpath('//table[contains(@class,"sortable")]')
127
+    for table in tables:
128
+        # exclude header row
129
+        trs = table.xpath('.//tr')[1:]
130
+        for tr in trs:
131
+            td = tr.xpath('./td')
132
+            code = td[3].xpath('./a')[0].text
133
+            name = td[2].xpath('./a')[0].text
134
+            english_name = td[1].xpath('./a')[0].text
135
+            articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
136
+            if articles >= 10000:
137
+                supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles}
138
+
139
+    return supported_languages

+ 16
- 4
searx/engines/yahoo.py 파일 보기

@@ -14,16 +14,13 @@
14 14
 from urllib import urlencode
15 15
 from urlparse import unquote
16 16
 from lxml import html
17
+from requests import get
17 18
 from searx.engines.xpath import extract_text, extract_url
18 19
 
19 20
 # engine dependent config
20 21
 categories = ['general']
21 22
 paging = True
22 23
 language_support = True
23
-supported_languages = ["ar", "bg", "ca", "szh", "tzh", "hr", "cs", "da", "nl", "en",
24
-                       "et", "fi", "fr", "de", "el", "he", "hu", "is", "id", "it", "ja",
25
-                       "ko", "lv", "lt", "no", "fa", "pl", "pt", "ro", "ru", "sk", "sr",
26
-                       "sl", "es", "sv", "th", "tr"]
27 24
 time_range_support = True
28 25
 
29 26
 # search-url
@@ -31,6 +28,8 @@ base_url = 'https://search.yahoo.com/'
31 28
 search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}'
32 29
 search_url_with_time = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}&age={age}&btf={btf}&fr2=time'
33 30
 
31
+supported_languages_url = 'https://search.yahoo.com/web/advanced'
32
+
34 33
 # specific xpath variables
35 34
 results_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' Sr ')]"
36 35
 url_xpath = './/h3/a/@href'
@@ -142,3 +141,16 @@ def response(resp):
142 141
 
143 142
     # return results
144 143
     return results
144
+
145
+
146
+# get supported languages from their site
147
+def fetch_supported_languages():
148
+    supported_languages = []
149
+    response = get(supported_languages_url)
150
+    dom = html.fromstring(response.text)
151
+    options = dom.xpath('//div[@id="yschlang"]/span/label/input')
152
+    for option in options:
153
+        code = option.xpath('./@value')[0][5:]
154
+        supported_languages.append(code)
155
+
156
+    return supported_languages

+ 1
- 1
searx/engines/yahoo_news.py 파일 보기

@@ -12,7 +12,7 @@
12 12
 from urllib import urlencode
13 13
 from lxml import html
14 14
 from searx.engines.xpath import extract_text, extract_url
15
-from searx.engines.yahoo import parse_url, supported_languages
15
+from searx.engines.yahoo import parse_url, fetch_supported_languages
16 16
 from datetime import datetime, timedelta
17 17
 import re
18 18
 from dateutil import parser

+ 47
- 76
searx/languages.py 파일 보기

@@ -4,39 +4,29 @@
4 4
 
5 5
 language_codes = (
6 6
     (u"ach", u"Acoli", u"", u""),
7
-    (u"af", u"Afrikaans", u"", u"Afrikaans"),
7
+    (u"af", u"Afrikaans", u"", u""),
8 8
     (u"ak", u"Akan", u"", u""),
9
-    (u"als", u"Alemannisch", u"", u"Alemannic"),
10
-    (u"am", u"አማርኛ", u"", u"Amharic"),
11
-    (u"an", u"Aragonés", u"", u"Aragonese"),
9
+    (u"am", u"አማርኛ", u"", u""),
12 10
     (u"ar-SA", u"العربية", u"المملكة العربية السعودية", u"Arabic"),
13
-    (u"arz", u"مصرى (Maṣri)", u"", u"Egyptian Arabic"),
14
-    (u"ast", u"Asturianu", u"", u"Asturian"),
15 11
     (u"az", u"Azərbaycanca", u"", u"Azerbaijani"),
16
-    (u"azb", u"تۆرکجه", u"", u"South Azerbaijani"),
17
-    (u"ba", u"Башҡорт", u"", u"Bashkir"),
18 12
     (u"ban", u"Balinese", u"", u""),
19
-    (u"bar", u"Boarisch", u"", u"Bavarian"),
20 13
     (u"be", u"Беларуская", u"", u"Belarusian"),
21 14
     (u"bem", u"Ichibemba", u"", u""),
22 15
     (u"bg-BG", u"Български", u"България", u"Bulgarian"),
23
-    (u"bn", u"বাংলা", u"", u"Bengali"),
24
-    (u"bpy", u"ইমার ঠার/বিষ্ণুপ্রিয়া মণিপুরী", u"", u"Bishnupriya Manipuri"),
25
-    (u"br", u"Brezhoneg", u"", u"Breton"),
26
-    (u"bs", u"Bosanski", u"", u"Bosnian"),
27
-    (u"bug", u"Basa Ugi", u"", u"Buginese"),
16
+    (u"bn", u"বাংলা", u"", u""),
17
+    (u"br", u"Brezhoneg", u"", u""),
18
+    (u"bs", u"Bosanski", u"", u""),
28 19
     (u"ca", u"Català", u"", u"Catalan"),
29 20
     (u"ca-CT", u"Català", u"", u"Catalan"),
30 21
     (u"ca-ES", u"Català", u"Espanya", u"Catalan"),
31 22
     (u"ce", u"Нохчийн", u"", u"Chechen"),
32 23
     (u"ceb", u"Sinugboanong Binisaya", u"", u"Cebuano"),
33 24
     (u"chr", u"ᏣᎳᎩ", u"", u""),
34
-    (u"ckb", u"Soranî / کوردی", u"", u"Sorani"),
25
+    (u"ckb", u"Central Kurdish", u"", u""),
35 26
     (u"co", u"Corsican", u"", u""),
36 27
     (u"crs", u"Seychellois Creole", u"", u""),
37 28
     (u"cs-CZ", u"Čeština", u"Česko", u"Czech"),
38
-    (u"cv", u"Чăваш", u"", u"Chuvash"),
39
-    (u"cy", u"Cymraeg", u"", u"Welsh"),
29
+    (u"cy", u"Cymraeg", u"", u""),
40 30
     (u"da-DK", u"Dansk", u"Danmark", u"Danish"),
41 31
     (u"de", u"Deutsch", u"", u"German"),
42 32
     (u"de-AT", u"Deutsch", u"Österreich", u"German"),
@@ -70,148 +60,129 @@ language_codes = (
70 60
     (u"eu", u"Euskara", u"", u"Basque"),
71 61
     (u"fa", u"فارسی", u"", u"Persian"),
72 62
     (u"fi-FI", u"Suomi", u"Suomi", u"Finnish"),
73
-    (u"fo", u"Føroyskt", u"", u"Faroese"),
63
+    (u"fo", u"Føroyskt", u"", u""),
74 64
     (u"fr", u"Français", u"", u"French"),
75 65
     (u"fr-BE", u"Français", u"Belgique", u"French"),
76 66
     (u"fr-CA", u"Français", u"Canada", u"French"),
77 67
     (u"fr-CH", u"Français", u"Suisse", u"French"),
78 68
     (u"fr-FR", u"Français", u"France", u"French"),
79
-    (u"fy", u"Frysk", u"", u"West Frisian"),
80
-    (u"ga", u"Gaeilge", u"", u"Irish"),
69
+    (u"fy", u"West-Frysk", u"", u""),
70
+    (u"ga", u"Gaeilge", u"", u""),
81 71
     (u"gaa", u"Ga", u"", u""),
82
-    (u"gd", u"Gàidhlig", u"", u"Scottish Gaelic"),
72
+    (u"gd", u"Gàidhlig", u"", u""),
83 73
     (u"gl", u"Galego", u"", u"Galician"),
84 74
     (u"gn", u"Guarani", u"", u""),
85
-    (u"gu", u"ગુજરાતી", u"", u"Gujarati"),
75
+    (u"gu", u"ગુજરાતી", u"", u""),
86 76
     (u"ha", u"Hausa", u"", u""),
87 77
     (u"haw", u"ʻŌlelo HawaiʻI", u"", u""),
88 78
     (u"he-IL", u"עברית", u"ישראל", u"Hebrew"),
89 79
     (u"hi", u"हिन्दी", u"", u"Hindi"),
90 80
     (u"hr-HR", u"Hrvatski", u"Hrvatska", u"Croatian"),
91
-    (u"hsb", u"Hornjoserbsce", u"", u"Upper Sorbian"),
92
-    (u"ht", u"Krèyol ayisyen", u"", u"Haitian"),
81
+    (u"ht", u"Haitian Creole", u"", u""),
93 82
     (u"hu-HU", u"Magyar", u"Magyarország", u"Hungarian"),
94 83
     (u"hy", u"Հայերեն", u"", u"Armenian"),
95
-    (u"ia", u"Interlingua", u"", u"Interlingua"),
84
+    (u"ia", u"Interlingua", u"", u""),
96 85
     (u"id-ID", u"Bahasa Indonesia", u"Indonesia", u"Indonesian"),
97 86
     (u"ig", u"Igbo", u"", u""),
98
-    (u"io", u"Ido", u"", u"Ido"),
99
-    (u"is", u"Íslenska", u"", u"Icelandic"),
87
+    (u"is", u"Íslenska", u"", u""),
100 88
     (u"it", u"Italiano", u"", u"Italian"),
101 89
     (u"it-CH", u"Italiano", u"Svizzera", u"Italian"),
102 90
     (u"it-IT", u"Italiano", u"Italia", u"Italian"),
103 91
     (u"iw", u"עברית", u"", u""),
104 92
     (u"ja-JP", u"日本語", u"日本", u"Japanese"),
105
-    (u"jv", u"Basa Jawa", u"", u"Javanese"),
106 93
     (u"ka", u"ქართული", u"", u"Georgian"),
107 94
     (u"kg", u"Kongo", u"", u""),
108 95
     (u"kk", u"Қазақша", u"", u"Kazakh"),
109 96
     (u"km", u"ខ្មែរ", u"", u""),
110
-    (u"kn", u"ಕನ್ನಡ", u"", u"Kannada"),
97
+    (u"kn", u"ಕನ್ನಡ", u"", u""),
111 98
     (u"ko-KR", u"한국어", u"대한민국", u"Korean"),
112
-    (u"kri", u"Krio (Sierra Leone)", u"", u""),
113
-    (u"ku", u"Kurdî / كوردی", u"", u"Kurdish"),
114
-    (u"ky", u"Кыргызча", u"", u"Kirghiz"),
99
+    (u"kri", u"Krio", u"", u""),
100
+    (u"ky", u"Кыргызча", u"", u""),
115 101
     (u"la", u"Latina", u"", u"Latin"),
116
-    (u"lb", u"Lëtzebuergesch", u"", u"Luxembourgish"),
117 102
     (u"lg", u"Luganda", u"", u""),
118
-    (u"li", u"Limburgs", u"", u"Limburgish"),
119
-    (u"lmo", u"Lumbaart", u"", u"Lombard"),
120 103
     (u"ln", u"Lingála", u"", u""),
121 104
     (u"lo", u"ລາວ", u"", u""),
122 105
     (u"loz", u"Lozi", u"", u""),
123 106
     (u"lt-LT", u"Lietuvių", u"Lietuva", u"Lithuanian"),
124 107
     (u"lua", u"Luba-Lulua", u"", u""),
125
-    (u"lv-LV", u"Latviešu", u"Latvijas Republika", u"Latvian"),
108
+    (u"lv-LV", u"Latviešu", u"Latvijas Republika", u""),
126 109
     (u"mfe", u"Kreol Morisien", u"", u""),
127
-    (u"mg", u"Malagasy", u"", u"Malagasy"),
110
+    (u"mg", u"Malagasy", u"", u""),
128 111
     (u"mi", u"Maori", u"", u""),
129 112
     (u"min", u"Minangkabau", u"", u"Minangkabau"),
130
-    (u"mk", u"Македонски", u"", u"Macedonian"),
131
-    (u"ml", u"മലയാളം", u"", u"Malayalam"),
132
-    (u"mn", u"Монгол", u"", u"Mongolian"),
133
-    (u"mr", u"मराठी", u"", u"Marathi"),
134
-    (u"mrj", u"Кырык Мары (Kyryk Mary)", u"", u"Hill Mari"),
113
+    (u"mk", u"Македонски", u"", u""),
114
+    (u"ml", u"മലയാളം", u"", u""),
115
+    (u"mn", u"Монгол", u"", u""),
116
+    (u"mr", u"मराठी", u"", u""),
135 117
     (u"ms-MY", u"Bahasa Melayu", u"Malaysia", u"Malay"),
136 118
     (u"mt", u"Malti", u"", u""),
137
-    (u"my", u"မြန်မာဘာသာ", u"", u"Burmese"),
138
-    (u"mzn", u"مَزِروني", u"", u"Mazandarani"),
139
-    (u"nah", u"Nāhuatl", u"", u"Nahuatl"),
140
-    (u"nap", u"Nnapulitano", u"", u"Neapolitan"),
141
-    (u"nds-nl", u"Plattdüütsch", u"Nedderlannen", u"Low Saxon"),
142
-    (u"ne", u"नेपाली", u"", u"Nepali"),
143
-    (u"new", u"नेपाल भाषा", u"", u"Newar"),
119
+    (u"my", u"ဗမာ", u"", u""),
120
+    (u"nb-NO", u"Norwegian Bokmål", u"Norge", u"Norwegian Bokmål"),
121
+    (u"ne", u"नेपाली", u"", u""),
144 122
     (u"nl", u"Nederlands", u"", u"Dutch"),
145 123
     (u"nl-BE", u"Nederlands", u"België", u"Dutch"),
146 124
     (u"nl-NL", u"Nederlands", u"Nederland", u"Dutch"),
147
-    (u"nn", u"Nynorsk", u"", u"Norwegian (Nynorsk)"),
148
-    (u"no-NO", u"Norsk (Bokmål)", u"Norge", u"Norwegian (Bokmål)"),
125
+    (u"nn", u"Nynorsk", u"", u"Norwegian"),
126
+    (u"no-NO", u"Norsk", u"Norge", u"Norwegian"),
149 127
     (u"nso", u"Northern Sotho", u"", u""),
150 128
     (u"ny", u"Nyanja", u"", u""),
151 129
     (u"nyn", u"Runyankore", u"", u""),
152
-    (u"oc", u"Occitan", u"", u"Occitan"),
130
+    (u"oc", u"Occitan", u"", u""),
153 131
     (u"om", u"Oromoo", u"", u""),
154
-    (u"or", u"ଓଡ଼ିଆ", u"", u"Oriya"),
155
-    (u"os", u"Иронау", u"", u"Ossetian"),
156
-    (u"pa", u"ਪੰਜਾਬੀ", u"", u"Punjabi"),
132
+    (u"or", u"ଓଡ଼ିଆ", u"", u""),
133
+    (u"pa", u"ਪੰਜਾਬੀ", u"", u""),
157 134
     (u"pcm", u"Nigerian Pidgin", u"", u""),
158 135
     (u"pl-PL", u"Polski", u"Rzeczpospolita Polska", u"Polish"),
159
-    (u"pms", u"Piemontèis", u"", u"Piedmontese"),
160
-    (u"pnb", u"شاہ مکھی پنجابی (Shāhmukhī Pañjābī)", u"", u"Western Punjabi"),
161 136
     (u"ps", u"پښتو", u"", u""),
162 137
     (u"pt", u"Português", u"", u"Portuguese"),
163 138
     (u"pt-BR", u"Português", u"Brasil", u"Portuguese"),
164 139
     (u"pt-PT", u"Português", u"Portugal", u"Portuguese"),
165
-    (u"qu", u"Runa Simi", u"", u"Quechua"),
140
+    (u"qu", u"Runasimi", u"", u""),
166 141
     (u"rm", u"Rumantsch", u"", u""),
167 142
     (u"rn", u"Ikirundi", u"", u""),
168 143
     (u"ro-RO", u"Română", u"România", u"Romanian"),
169 144
     (u"ru-RU", u"Русский", u"Россия", u"Russian"),
170 145
     (u"rw", u"Kinyarwanda", u"", u""),
171
-    (u"sa", u"संस्कृतम्", u"", u"Sanskrit"),
172
-    (u"sah", u"Саха тыла (Saxa Tyla)", u"", u"Sakha"),
173
-    (u"scn", u"Sicilianu", u"", u"Sicilian"),
174
-    (u"sco", u"Scots", u"", u"Scots"),
175 146
     (u"sd", u"Sindhi", u"", u""),
176 147
     (u"sh", u"Srpskohrvatski / Српскохрватски", u"", u"Serbo-Croatian"),
177
-    (u"si", u"සිංහල", u"", u"Sinhalese"),
148
+    (u"si", u"සිංහල", u"", u""),
178 149
     (u"sk-SK", u"Slovenčina", u"Slovenská republika", u"Slovak"),
179
-    (u"sl-SI", u"Slovenščina", u"Slovenija", u"Slovenian"),
150
+    (u"sl", u"Slovenščina", u"", u"Slovenian"),
180 151
     (u"sn", u"Chishona", u"", u""),
181 152
     (u"so", u"Soomaali", u"", u""),
182
-    (u"sq", u"Shqip", u"", u"Albanian"),
183
-    (u"sr-ME", u"Српски / Srpski", u"Црна Гора", u"Serbian"),
153
+    (u"sq", u"Shqip", u"", u""),
154
+    (u"sr", u"Српски / Srpski", u"", u"Serbian"),
184 155
     (u"st", u"Southern Sotho", u"", u""),
185
-    (u"su", u"Basa Sunda", u"", u"Sundanese"),
156
+    (u"su", u"Sundanese", u"", u""),
186 157
     (u"sv-SE", u"Svenska", u"Sverige", u"Swedish"),
187
-    (u"sw", u"Kiswahili", u"", u"Swahili"),
188
-    (u"ta", u"தமிழ்", u"", u"Tamil"),
189
-    (u"te", u"తెలుగు", u"", u"Telugu"),
190
-    (u"tg", u"Тоҷикӣ", u"", u"Tajik"),
158
+    (u"sw", u"Kiswahili", u"", u""),
159
+    (u"ta", u"தமிழ்", u"", u""),
160
+    (u"te", u"తెలుగు", u"", u""),
161
+    (u"tg", u"Tajik", u"", u""),
191 162
     (u"th-TH", u"ไทย", u"ไทย", u"Thai"),
192 163
     (u"ti", u"ትግርኛ", u"", u""),
193 164
     (u"tk", u"Turkmen", u"", u""),
194
-    (u"tl-PH", u"Tagalog", u"Pilipinas", u"Tagalog"),
165
+    (u"tl-PH", u"Filipino", u"Pilipinas", u""),
195 166
     (u"tlh", u"Klingon", u"", u""),
196 167
     (u"tn", u"Tswana", u"", u""),
197 168
     (u"to", u"Lea Fakatonga", u"", u""),
198 169
     (u"tr-TR", u"Türkçe", u"Türkiye", u"Turkish"),
199
-    (u"tt", u"Tatarça / Татарча", u"", u"Tatar"),
170
+    (u"tt", u"Tatar", u"", u""),
200 171
     (u"tum", u"Tumbuka", u"", u""),
201 172
     (u"tw", u"Twi", u"", u""),
202 173
     (u"ug", u"ئۇيغۇرچە", u"", u""),
203 174
     (u"uk-UA", u"Українська", u"Україна", u"Ukrainian"),
204 175
     (u"ur", u"اردو", u"", u"Urdu"),
205 176
     (u"uz", u"O‘zbek", u"", u"Uzbek"),
206
-    (u"vec", u"Vèneto", u"", u"Venetian"),
177
+    (u"ve", u"Venda", u"", u"Venda"),
207 178
     (u"vi-VN", u"Tiếng Việt", u"Công Hòa Xã Hội Chủ Nghĩa Việt Nam", u"Vietnamese"),
208 179
     (u"vo", u"Volapük", u"", u"Volapük"),
209 180
     (u"wa", u"Walon", u"", u"Walloon"),
210 181
     (u"war", u"Winaray", u"", u"Waray-Waray"),
211 182
     (u"wo", u"Wolof", u"", u""),
212 183
     (u"xh", u"Xhosa", u"", u""),
213
-    (u"yi", u"ייִדיש", u"", u"Yiddish"),
214
-    (u"yo", u"Yorùbá", u"", u"Yoruba"),
184
+    (u"yi", u"ייִדיש", u"", u""),
185
+    (u"yo", u"Èdè Yorùbá", u"", u""),
215 186
     (u"zh", u"中文", u"", u"Chinese"),
216 187
     (u"zh-CN", u"中文", u"中国", u"Chinese"),
217 188
     (u"zh-HK", u"中文", u"香港", u"Chinese"),

+ 1
- 1
searx/webapp.py 파일 보기

@@ -514,7 +514,7 @@ def index():
514 514
         answers=result_container.answers,
515 515
         infoboxes=result_container.infoboxes,
516 516
         paging=result_container.paging,
517
-        current_language=search.lang,
517
+        current_language=search_query.lang,
518 518
         base_url=get_base_url(),
519 519
         theme=get_current_theme_name(),
520 520
         favicons=global_favicons[themes.index(get_current_theme_name())]

+ 1
- 1
tests/unit/engines/test_subtitleseeker.py 파일 보기

@@ -17,7 +17,7 @@ class TestSubtitleseekerEngine(SearxTestCase):
17 17
 
18 18
     def test_response(self):
19 19
         dicto = defaultdict(dict)
20
-        dicto['language'] = 'fr_FR'
20
+        dicto['language'] = 'fr-FR'
21 21
         response = mock.Mock(search_params=dicto)
22 22
 
23 23
         self.assertRaises(AttributeError, subtitleseeker.response, None)

+ 2
- 0
tests/unit/engines/test_wikipedia.py 파일 보기

@@ -8,6 +8,8 @@ from searx.testing import SearxTestCase
8 8
 class TestWikipediaEngine(SearxTestCase):
9 9
 
10 10
     def test_request(self):
11
+        wikipedia.supported_languages = ['fr', 'en']
12
+
11 13
         query = 'test_query'
12 14
         dicto = defaultdict(dict)
13 15
         dicto['language'] = 'fr-FR'

+ 164
- 0
utils/fetch_languages.py 파일 보기

@@ -0,0 +1,164 @@
1
+# -*- coding: utf-8 -*-
2
+
3
+# This script generates languages.py by joining each engine's list of supported languages.
4
+#
5
+# The country names are obtained from http://api.geonames.org which requires registering as a user.
6
+#
7
+# Output files (engines_languages.json and languages.py)
8
+# are written in current directory to avoid overwriting in case something goes wrong.
9
+
10
+from requests import get
11
+from urllib import urlencode
12
+from lxml.html import fromstring
13
+from json import loads, dumps
14
+import io
15
+from sys import path
16
+path.append('../searx')  # noqa
17
+from searx.engines import engines
18
+
19
+# Geonames API for country names.
20
+geonames_user = ''  # ADD USER NAME HERE
21
+country_names_url = 'http://api.geonames.org/countryInfoJSON?{parameters}'
22
+
23
+# Output files.
24
+engines_languages_file = 'engines_languages.json'
25
+languages_file = 'languages.py'
26
+
27
+engines_languages = {}
28
+languages = {}
29
+
30
+
31
+# To filter out invalid codes and dialects.
32
+def valid_code(lang_code):
33
+    # filter invalid codes
34
+    # sl-SL is technically not invalid, but still a mistake
35
+    if lang_code[:2] == 'xx'\
36
+       or lang_code == 'sl-SL'\
37
+       or lang_code == 'wt-WT'\
38
+       or lang_code == 'jw'\
39
+       or lang_code[-2:] == 'UK'\
40
+       or lang_code[-2:] == 'XA'\
41
+       or lang_code[-2:] == 'XL':
42
+        return False
43
+
44
+    # filter dialects
45
+    lang_code = lang_code.split('-')
46
+    if len(lang_code) > 2 or len(lang_code[0]) > 3:
47
+        return False
48
+    if len(lang_code) == 2 and len(lang_code[1]) > 2:
49
+        return False
50
+
51
+    return True
52
+
53
+
54
# Get country name in specified language.
def get_country_name(locale):
    """Return the country name of a "lang-COUNTRY" locale, localized
    in that locale's language, via the geonames countryInfoJSON API.

    Returns '' for bare language codes, when no geonames username is
    configured, or when the lookup returns no (or an ambiguous) result.
    """
    # an empty username means no geonames account was configured;
    # (was `geonames_user is ''` — identity comparison with a literal
    # is an interning accident, not a guarantee)
    if not geonames_user:
        return ''

    locale = locale.split('-')
    if len(locale) != 2:
        return ''

    url = country_names_url.format(parameters=urlencode({'lang': locale[0],
                                                         'country': locale[1],
                                                         'username': geonames_user}))
    response = get(url)
    data = loads(response.text)
    content = data.get('geonames', None)
    # exactly one match expected; anything else is treated as a miss
    if content is None or len(content) != 1:
        print("No country name found for " + locale[0] + "-" + locale[1])
        return ''

    return content[0].get('countryName', '')
74
+
75
+
76
# Fetches the languages supported by each engine and writes them all to
# a json file (engines_languages.json) in the current directory.
def fetch_supported_languages():
    for engine_name in engines:
        # only engines that expose a fetch_supported_languages() hook
        if hasattr(engines[engine_name], 'fetch_supported_languages'):
            try:
                engines_languages[engine_name] = engines[engine_name].fetch_supported_languages()
            except Exception as e:
                # best effort: one failing engine must not abort the whole run
                print(e)

    # write json file; the context manager closes the handle even if the
    # dump fails, and unicode() keeps io.open happy under Python 2
    with io.open(engines_languages_file, "w", encoding="utf-8") as f:
        f.write(unicode(dumps(engines_languages, indent=4, ensure_ascii=False, encoding="utf-8")))
89
+
90
+
91
# Join all language lists.
# Iterate all languages supported by each engine.
def join_language_lists():
    """Merge every engine's language list into the global ``languages`` dict.

    Wikipedia is merged first because its entries carry the most accurate
    native/English language names; languages below 100000 articles are
    considered too small to keep. Locales that end up without a name are
    either named after their bare language code or dropped.
    """
    # include wikipedia first for more accurate language names
    # exclude languages with too few articles
    # (.items() instead of .iteritems(): works on both Python 2 and 3)
    languages.update({code: lang for code, lang
                      in engines_languages['wikipedia'].items()
                      if valid_code(code) and lang['articles'] >= 100000})

    for engine_name in engines_languages:
        for locale in engines_languages[engine_name]:
            if not valid_code(locale):
                continue

            # if language is not on the list, or has no name yet
            if locale not in languages or not languages[locale].get('name'):
                # engines report either a dict (code -> attributes) or a
                # plain list of codes; only dicts carry article counts
                if isinstance(engines_languages[engine_name], dict) \
                   and engines_languages[engine_name][locale].get('articles', float('inf')) >= 100000:
                    languages[locale] = engines_languages[engine_name][locale]
                else:
                    languages[locale] = {}

    # resolve locales that still have no name; iterate over a snapshot of
    # the keys because entries may be deleted from the dict along the way
    for locale in list(languages.keys()):
        if not languages[locale].get('name'):
            # fall back to the bare language code's name ("pt" for "pt-BR")
            name = languages.get(locale.split('-')[0], {}).get('name', None)
            if name:
                languages[locale]['name'] = name
                languages[locale]['country'] = get_country_name(locale) or ''
                languages[locale]['english_name'] = languages.get(locale.split('-')[0], {}).get('english_name', '')
            else:
                # filter out locales with no name
                del languages[locale]
125
+
126
+
127
# Remove countryless language if language is featured in only one country.
def filter_single_country_languages(langs=None):
    """Drop the bare language entry (e.g. "pt") when exactly one country
    variant of it exists (e.g. only "pt-BR"): the variant is enough.

    langs -- dict to filter in place; defaults to the global ``languages``.
    """
    if langs is None:
        langs = languages

    prev_lang = None
    countries = 0
    for code in sorted(langs):
        lang = code.split('-')[0]
        if lang == prev_lang:
            countries += 1
        else:
            # a group is only complete once the next one starts;
            # guard the del: the bare code may not exist (e.g. only
            # "zh-CN"/"zh-TW" present, no "zh") — unguarded this raised KeyError
            if prev_lang is not None and countries == 1 and prev_lang in langs:
                del langs[prev_lang]
            countries = 0
            prev_lang = lang

    # the loop checks a group only when the following one begins, so the
    # final group was previously never filtered — apply the same check here
    if prev_lang is not None and countries == 1 and prev_lang in langs:
        del langs[prev_lang]
139
+
140
+
141
# Write languages.py.
def write_languages_file():
    """Render the global ``languages`` dict as Python source and write it
    to ``languages_file``.

    Each entry becomes a (code, native name, country name, english name)
    tuple inside a ``language_codes`` tuple; names are truncated at the
    first " (" to drop parenthesized qualifiers.
    """
    file_content = '# -*- coding: utf-8 -*-\n'
    file_content += '# list of language codes\n'
    # name the script that actually generates the file (this one);
    # the previously referenced utils/update_search_languages.py does not exist
    file_content += '# this file is generated automatically by utils/fetch_languages.py\n'
    file_content += '\nlanguage_codes = ('
    for code in sorted(languages):
        file_content += '\n    (u"' + code + '"'\
                        + ', u"' + languages[code]['name'].split(' (')[0] + '"'\
                        + ', u"' + languages[code].get('country', '') + '"'\
                        + ', u"' + languages[code].get('english_name', '').split(' (')[0] + '"),'
    # remove last comma (NOTE(review): with an empty dict this strips the
    # opening parenthesis instead — presumably languages is never empty)
    file_content = file_content[:-1]
    file_content += '\n)\n'
    # context manager closes the handle even if the write fails
    with open(languages_file, 'w') as new_file:
        new_file.write(file_content.encode('utf8'))
158
+
159
+
160
if __name__ == "__main__":
    # pipeline: harvest each engine's language list, merge them,
    # prune single-country duplicates, then emit languages.py
    fetch_supported_languages()
    join_language_lists()
    filter_single_country_languages()
    write_languages_file()

+ 0
- 169
utils/update_languages.py 파일 보기

@@ -1,169 +0,0 @@
1
-# -*- coding: utf-8 -*-
2
-
3
-# This script generates languages.py from
4
-# intersecting each engine's supported languages.
5
-#
6
-# The language's native names are obtained from
7
-# Wikipedia and Google's supported languages.
8
-#
9
-# The country names are obtained from http://api.geonames.org
10
-# which requires registering as a user.
11
-#
12
-# Output file (languages.py) is written in current directory
13
-# to avoid overwriting in case something goes wrong.
14
-
15
-from requests import get
16
-from urllib import urlencode
17
-from lxml.html import fromstring
18
-from json import loads
19
-from sys import path
20
-path.append('../searx')
21
-from searx.engines import engines
22
-
23
-# list of names
24
-wiki_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
25
-google_languages_url = 'https://www.google.com/preferences?#languages'
26
-country_names_url = 'http://api.geonames.org/countryInfoJSON?{parameters}'
27
-
28
-geonames_user = ''  # add user name here
29
-
30
-google_json_name = 'google.preferences.langMap'
31
-
32
-languages = {}
33
-
34
-
35
-# To filter out invalid codes and dialects.
36
-def valid_code(lang_code):
37
-    # filter invalid codes
38
-    # sl-SL is technically not invalid, but still a mistake
39
-    if lang_code[:2] == 'xx'\
40
-       or lang_code == 'sl-SL'\
41
-       or lang_code == 'jw'\
42
-       or lang_code[-2:] == 'UK'\
43
-       or lang_code[-2:] == 'XA'\
44
-       or lang_code[-2:] == 'XL':
45
-        return False
46
-
47
-    # filter dialects
48
-    lang_code = lang_code.split('-')
49
-    if len(lang_code) > 2 or len(lang_code[0]) > 3:
50
-        return False
51
-    if len(lang_code) == 2 and len(lang_code[1]) > 2:
52
-        return False
53
-        
54
-    return True
55
-
56
-
57
-# Get country name in specified language.
58
-def get_country_name(locale):
59
-    if geonames_user is '':
60
-        return ''
61
-
62
-    locale = locale.split('-')
63
-    if len(locale) != 2:
64
-        return ''
65
-
66
-    url = country_names_url.format(parameters=urlencode({'lang': locale[0],
67
-                                                         'country': locale[1],
68
-                                                         'username': geonames_user}))
69
-    response = get(url)
70
-    json = loads(response.text)
71
-    content = json.get('geonames', None)
72
-    if content is None or len(content) != 1:
73
-        print "No country name found for " + locale[0] + "-" + locale[1]
74
-        print json
75
-        return ''
76
-
77
-    return content[0].get('countryName', '')
78
-
79
-
80
-# Get language names from Wikipedia.
81
-def get_wikipedia_languages():
82
-    response = get(wiki_languages_url)
83
-    dom = fromstring(response.text)
84
-    tables = dom.xpath('//table[contains(@class,"sortable")]')
85
-    for table in tables:
86
-        # exclude header row
87
-        trs = table.xpath('.//tr')[1:]
88
-        for tr in trs:
89
-            td = tr.xpath('./td')
90
-            code = td[3].xpath('./a')[0].text
91
-            name = td[2].xpath('./a')[0].text
92
-            english_name = td[1].xpath('./a')[0].text
93
-            articles = int(td[4].xpath('./a/b')[0].text.replace(',',''))
94
-            
95
-            # exclude language variants and languages with few articles
96
-            if code not in languages and articles >= 10000 and valid_code(code):
97
-                languages[code] = (name, '', english_name)
98
-
99
-
100
-# Get language names from Google.
101
-def get_google_languages():
102
-    response = get(google_languages_url)
103
-    dom = fromstring(response.text)
104
-    options = dom.xpath('//select[@name="hl"]/option')
105
-    for option in options:
106
-        code = option.xpath('./@value')[0].split('-')[0]
107
-        name = option.text[:-1].title()
108
-
109
-        if code not in languages and valid_code(code):
110
-            languages[code] = (name, '', '')
111
-
112
-
113
-# Join all language lists.
114
-# iterate all languages supported by each engine
115
-def join_language_lists():
116
-    for engine_name in engines:
117
-        for locale in engines[engine_name].supported_languages:
118
-            locale = locale.replace('_', '-')
119
-            if locale not in languages and valid_code(locale):
120
-                # try to get language name
121
-                language = languages.get(locale.split('-')[0], None)
122
-                if language == None:
123
-                    print engine_name + ": " + locale
124
-                    continue
125
-
126
-                country = get_country_name(locale)
127
-                languages[locale] = (language[0], country, language[2])
128
-
129
-
130
-# Remove countryless language if language is featured in only one country.
131
-def filter_single_country_languages():
132
-    prev_lang = None
133
-    for code in sorted(languages):
134
-        lang = code.split('-')[0]
135
-        if lang == prev_lang:
136
-            countries += 1
137
-        else:
138
-            if prev_lang is not None and countries == 1:
139
-                del languages[prev_lang]
140
-            countries = 0
141
-            prev_lang = lang
142
-
143
-
144
-# Write languages.py.
145
-def write_languages_file():
146
-    new_file = open('languages.py', 'w')
147
-    file_content = '# -*- coding: utf-8 -*-\n'
148
-    file_content += '# list of language codes\n'
149
-    file_content += '# this file is generated automatically by utils/update_search_languages.py\n'
150
-    file_content += '\nlanguage_codes = ('
151
-    for code in sorted(languages):
152
-        (name, country, english) = languages[code]
153
-        file_content += '\n    (u"' + code + '"'\
154
-                        + ', u"' + name + '"'\
155
-                        + ', u"' + country + '"'\
156
-                        + ', u"' + english + '"),'
157
-    # remove last comma
158
-    file_content = file_content[:-1]
159
-    file_content += '\n)\n'
160
-    new_file.write(file_content.encode('utf8'))
161
-    new_file.close()
162
-
163
-
164
-if __name__ == "__main__":
165
-    get_wikipedia_languages()
166
-    get_google_languages()
167
-    join_language_lists()
168
-    filter_single_country_languages()
169
-    write_languages_file()