瀏覽代碼

[fix] language support for bing images and videos

marc 7 年之前
父節點
當前提交
a524dbb823

+ 47
- 9
searx/engines/bing_images.py 查看文件

@@ -18,7 +18,6 @@
18 18
 from lxml import html
19 19
 from json import loads
20 20
 import re
21
-from searx.engines.bing import _fetch_supported_languages, supported_languages_url
22 21
 from searx.url_utils import urlencode
23 22
 
24 23
 # engine dependent config
@@ -26,6 +25,8 @@ categories = ['images']
26 25
 paging = True
27 26
 safesearch = True
28 27
 time_range_support = True
28
+language_support = True
29
+supported_languages_url = 'https://www.bing.com/account/general'
29 30
 
30 31
 # search-url
31 32
 base_url = 'https://www.bing.com/'
@@ -45,23 +46,41 @@ safesearch_types = {2: 'STRICT',
45 46
 _quote_keys_regex = re.compile('({|,)([a-z][a-z0-9]*):(")', re.I | re.U)
46 47
 
47 48
 
49
+# get supported region code
50
+def get_region_code(lang, lang_list=None):
51
+    region = None
52
+    if lang in (lang_list or supported_languages):
53
+        region = lang
54
+    elif lang.startswith('no'):
55
+        region = 'nb-NO'
56
+    else:
57
+        # try to get a supported country code with language
58
+        lang = lang.split('-')[0]
59
+        for lc in (lang_list or supported_languages):
60
+            if lang == lc.split('-')[0]:
61
+                region = lc
62
+                break
63
+    if region:
64
+        return region.lower()
65
+    else:
66
+        return 'en-us'
67
+
68
+
48 69
 # do search-request
49 70
 def request(query, params):
50 71
     offset = (params['pageno'] - 1) * 10 + 1
51 72
 
52
-    # required for cookie
53
-    if params['language'] == 'all':
54
-        language = 'en-US'
55
-    else:
56
-        language = params['language']
57
-
58 73
     search_path = search_string.format(
59 74
         query=urlencode({'q': query}),
60 75
         offset=offset)
61 76
 
77
+    language = get_region_code(params['language'])
78
+
62 79
     params['cookies']['SRCHHPGUSR'] = \
63
-        'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0] +\
64
-        '&ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
80
+        'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
81
+
82
+    params['cookies']['_EDGE_S'] = 'mkt=' + language +\
83
+        '&ui=' + language + '&F=1'
65 84
 
66 85
     params['url'] = base_url + search_path
67 86
     if params['time_range'] in time_range_dict:
@@ -106,3 +125,22 @@ def response(resp):
106 125
 
107 126
     # return results
108 127
     return results
128
+
129
+
130
+# get supported languages from their site
131
+def _fetch_supported_languages(resp):
132
+    supported_languages = []
133
+    dom = html.fromstring(resp.text)
134
+
135
+    regions_xpath = '//div[@id="region-section-content"]' \
136
+                    + '//ul[@class="b_vList"]/li/a/@href'
137
+
138
+    regions = dom.xpath(regions_xpath)
139
+    for region in regions:
140
+        code = re.search('setmkt=[^\&]+', region).group()[7:]
141
+        if code == 'nb-NO':
142
+            code = 'no-NO'
143
+
144
+        supported_languages.append(code)
145
+
146
+    return supported_languages

+ 4
- 1
searx/engines/bing_videos.py 查看文件

@@ -12,6 +12,7 @@
12 12
 
13 13
 from json import loads
14 14
 from lxml import html
15
+from searx.engines.bing_images import _fetch_supported_languages, supported_languages_url, get_region_code
15 16
 from searx.engines.xpath import extract_text
16 17
 from searx.url_utils import urlencode
17 18
 
@@ -21,6 +22,7 @@ paging = True
21 22
 safesearch = True
22 23
 time_range_support = True
23 24
 number_of_results = 10
25
+language_support = True
24 26
 
25 27
 search_url = 'https://www.bing.com/videos/asyncv2?{query}&async=content&'\
26 28
              'first={offset}&count={number_of_results}&CW=1366&CH=25&FORM=R5VR5'
@@ -45,7 +47,8 @@ def request(query, params):
45 47
         'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')
46 48
 
47 49
     # language cookie
48
-    params['cookies']['_EDGE_S'] = 'mkt=' + params['language'].lower() + '&F=1'
50
+    region = get_region_code(params['language'], lang_list=supported_languages)
51
+    params['cookies']['_EDGE_S'] = 'mkt=' + region + '&F=1'
49 52
 
50 53
     # query and paging
51 54
     params['url'] = search_url.format(query=urlencode({'q': query}),

+ 8
- 4
tests/unit/engines/test_bing_images.py 查看文件

@@ -8,10 +8,12 @@ from searx.testing import SearxTestCase
8 8
 class TestBingImagesEngine(SearxTestCase):
9 9
 
10 10
     def test_request(self):
11
+        bing_images.supported_languages = ['fr-FR', 'en-US']
12
+
11 13
         query = 'test_query'
12 14
         dicto = defaultdict(dict)
13 15
         dicto['pageno'] = 1
14
-        dicto['language'] = 'fr_FR'
16
+        dicto['language'] = 'fr-FR'
15 17
         dicto['safesearch'] = 1
16 18
         dicto['time_range'] = ''
17 19
         params = bing_images.request(query, dicto)
@@ -19,12 +21,14 @@ class TestBingImagesEngine(SearxTestCase):
19 21
         self.assertTrue(query in params['url'])
20 22
         self.assertTrue('bing.com' in params['url'])
21 23
         self.assertTrue('SRCHHPGUSR' in params['cookies'])
22
-        self.assertTrue('fr' in params['cookies']['SRCHHPGUSR'])
24
+        self.assertTrue('DEMOTE' in params['cookies']['SRCHHPGUSR'])
25
+        self.assertTrue('_EDGE_S' in params['cookies'])
26
+        self.assertTrue('fr-fr' in params['cookies']['_EDGE_S'])
23 27
 
24 28
         dicto['language'] = 'all'
25 29
         params = bing_images.request(query, dicto)
26
-        self.assertIn('SRCHHPGUSR', params['cookies'])
27
-        self.assertIn('en', params['cookies']['SRCHHPGUSR'])
30
+        self.assertTrue('_EDGE_S' in params['cookies'])
31
+        self.assertTrue('en' in params['cookies']['_EDGE_S'])
28 32
 
29 33
     def test_response(self):
30 34
         self.assertRaises(AttributeError, bing_images.response, None)

+ 2
- 0
tests/unit/engines/test_bing_videos.py 查看文件

@@ -8,6 +8,8 @@ from searx.testing import SearxTestCase
8 8
 class TestBingVideosEngine(SearxTestCase):
9 9
 
10 10
     def test_request(self):
11
+        bing_videos.supported_languages = ['fr-FR', 'en-US']
12
+
11 13
         query = 'test_query'
12 14
         dicto = defaultdict(dict)
13 15
         dicto['pageno'] = 1