浏览代码

add bing videos engine

marc 7 年前
父节点
当前提交
3ca9cad927
共有 3 个文件被更改,包括 231 次插入和 0 次删除
  1. 96
    0
      searx/engines/bing_videos.py
  2. 4
    0
      searx/settings.yml
  3. 131
    0
      tests/unit/engines/test_bing_videos.py

+ 96
- 0
searx/engines/bing_videos.py 查看文件

@@ -0,0 +1,96 @@
1
+"""
2
+ Bing (Videos)
3
+
4
+ @website     https://www.bing.com/videos
5
+ @provide-api yes (http://datamarket.azure.com/dataset/bing/search)
6
+
7
+ @using-api   no
8
+ @results     HTML
9
+ @stable      no
10
+ @parse       url, title, content, thumbnail
11
+"""
12
+
13
+from json import loads
14
+from lxml import html
15
+from searx.engines.xpath import extract_text
16
+from searx.url_utils import urlencode
17
+
18
+
19
+categories = ['videos']
20
+paging = True
21
+safesearch = True
22
+time_range_support = True
23
+number_of_results = 10
24
+
25
+search_url = 'https://www.bing.com/videos/asyncv2?{query}&async=content&'\
26
+             'first={offset}&count={number_of_results}&CW=1366&CH=25&FORM=R5VR5'
27
+time_range_string = '&qft=+filterui:videoage-lt{interval}'
28
+time_range_dict = {'day': '1440',
29
+                   'week': '10080',
30
+                   'month': '43200',
31
+                   'year': '525600'}
32
+
33
+# safesearch definitions
34
+safesearch_types = {2: 'STRICT',
35
+                    1: 'DEMOTE',
36
+                    0: 'OFF'}
37
+
38
+
39
+# do search-request
40
def request(query, params):
    """Fill in ``params`` for a Bing video search request and return it.

    Sets the safesearch and market cookies and builds the request url
    (query, paging offset, result count and optional time-range filter).

    :param query: search terms, url-encoded as the ``q`` parameter
    :param params: request parameters; ``pageno``, ``safesearch``,
                   ``language``, ``time_range`` and ``cookies`` are read,
                   ``cookies`` and ``url`` are written
    :returns: the same ``params`` object, updated in place
    """
    # the offset starts at 1 and advances by one page per ``pageno``; use the
    # configured page size (also sent as ``count``) instead of a literal 10 so
    # paging stays consistent if number_of_results is changed
    offset = (params['pageno'] - 1) * number_of_results + 1

    # safesearch cookie (levels missing from safesearch_types fall back to DEMOTE)
    params['cookies']['SRCHHPGUSR'] = \
        'ADLT=' + safesearch_types.get(params['safesearch'], 'DEMOTE')

    # language cookie
    params['cookies']['_EDGE_S'] = 'mkt=' + params['language'].lower() + '&F=1'

    # query and paging
    params['url'] = search_url.format(query=urlencode({'q': query}),
                                      offset=offset,
                                      number_of_results=number_of_results)

    # time range (only appended for the ranges listed in time_range_dict)
    time_range = params['time_range']
    if time_range in time_range_dict:
        params['url'] += time_range_string.format(interval=time_range_dict[time_range])

    return params
60
+
61
+
62
+# get response from search-request
63
def response(resp):
    """Parse a Bing videos result page into a list of result dicts.

    Every ``<div class="dg_u">`` element is treated as one video entry.
    Entries that are malformed (no usable url) are skipped instead of
    aborting the whole page.

    :param resp: response object; only its ``text`` attribute is read
    :returns: list of dicts with the keys ``url``, ``title``, ``content``,
              ``thumbnail`` and ``template``; at most ``number_of_results``
              entries
    :raises AttributeError: if ``resp`` has no ``text`` attribute
    """
    results = []

    dom = html.fromstring(resp.text)

    for result in dom.xpath('//div[@class="dg_u"]'):
        url = None

        # preferred source: the JSON event payload carrying the video's url
        payloads = result.xpath('.//div[@class="sa_wrapper"]/@data-eventpayload')
        if payloads:
            try:
                url = loads(payloads[0]).get('purl')
            except (ValueError, TypeError, AttributeError):
                # invalid JSON or a payload that is not an object:
                # fall through to the anchor's href below
                url = None

        if not url:
            hrefs = result.xpath('./a/@href')
            if not hrefs:
                # nothing to link to, skip this entry
                continue
            url = hrefs[0]

            # discard results that do not return an external url
            # very recent results sometimes don't return the video's url
            if url.startswith('/videos/search?'):
                continue

        title = extract_text(result.xpath('./a//div[@class="tl"]'))
        content = extract_text(result.xpath('.//div[@class="pubInfo"]'))

        # a missing thumbnail should not drop an otherwise valid result
        thumbnails = result.xpath('.//div[@class="vthumb"]/img/@src')
        thumbnail = thumbnails[0] if thumbnails else ''

        results.append({'url': url,
                        'title': title,
                        'content': content,
                        'thumbnail': thumbnail,
                        'template': 'videos.html'})

        # first page ignores requested number of results
        if len(results) >= number_of_results:
            break

    return results

+ 4
- 0
searx/settings.yml 查看文件

@@ -81,6 +81,10 @@ engines:
81 81
     engine : bing_news
82 82
     shortcut : bin
83 83
 
84
+  - name : bing videos
85
+    engine : bing_videos
86
+    shortcut : biv
87
+
84 88
   - name : bitbucket
85 89
     engine : xpath
86 90
     paging : True

+ 131
- 0
tests/unit/engines/test_bing_videos.py 查看文件

@@ -0,0 +1,131 @@
1
+# -*- coding: utf-8 -*-
2
+from collections import defaultdict
3
+import mock
4
+from searx.engines import bing_videos
5
+from searx.testing import SearxTestCase
6
+
7
+
8
class TestBingVideosEngine(SearxTestCase):
    """Unit tests for the ``bing_videos`` engine's request/response functions."""

    def test_request(self):
        """The url and cookies reflect query, paging, safesearch, language and time range."""
        query = 'test_query'
        dicto = defaultdict(dict)
        dicto['pageno'] = 1
        dicto['language'] = 'fr-FR'
        dicto['safesearch'] = 0
        dicto['time_range'] = ''
        params = bing_videos.request(query, dicto)
        self.assertIn('url', params)
        self.assertIn(query, params['url'])
        self.assertIn('bing.com', params['url'])
        self.assertIn('SRCHHPGUSR', params['cookies'])
        self.assertIn('OFF', params['cookies']['SRCHHPGUSR'])
        self.assertIn('_EDGE_S', params['cookies'])
        self.assertIn('fr-fr', params['cookies']['_EDGE_S'])

        # second page, restricted to the last day, strict safesearch
        dicto['pageno'] = 2
        dicto['time_range'] = 'day'
        dicto['safesearch'] = 2
        params = bing_videos.request(query, dicto)
        self.assertIn('first=11', params['url'])
        self.assertIn('1440', params['url'])
        self.assertIn('SRCHHPGUSR', params['cookies'])
        self.assertIn('STRICT', params['cookies']['SRCHHPGUSR'])

    def test_response(self):
        """Results are extracted from the markup; entries without an external url are dropped."""
        # objects without a ``text`` attribute are rejected
        self.assertRaises(AttributeError, bing_videos.response, None)
        self.assertRaises(AttributeError, bing_videos.response, [])
        self.assertRaises(AttributeError, bing_videos.response, '')
        self.assertRaises(AttributeError, bing_videos.response, '[]')

        # a document without result containers yields no results
        response = mock.Mock(text='<html></html>')
        self.assertEqual(bing_videos.response(response), [])

        # url taken from the JSON event payload, even though the anchor's
        # href is an internal search link
        html = """
        <div>
            <div class="dg_u">
                <a class="dv_i" href="/videos/search?abcde">
                    <div class="vthblock">
                        <div class="vthumb">
                            <img src="thumb_1.jpg" />
                        </div>
                        <div>
                            <div class="tl">
                                Title 1
                            </div>
                        </div>
                    </div>
                    <div class="videoInfoPanel">
                        <div class="pubInfo">
                            <div>Content 1</div>
                        </div>
                    </div>
                </a>
                <div class="sa_wrapper"
                    data-eventpayload="{&quot;purl&quot;: &quot;https://url.com/1&quot;}">
                </div>
            </div>
        </div>
        """
        response = mock.Mock(text=html)
        results = bing_videos.response(response)
        self.assertIsInstance(results, list)
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0]['title'], 'Title 1')
        self.assertEqual(results[0]['url'], 'https://url.com/1')
        self.assertEqual(results[0]['content'], 'Content 1')
        self.assertEqual(results[0]['thumbnail'], 'thumb_1.jpg')

        # no payload: the anchor's href is used, and the entry whose href is
        # only an internal search link is discarded
        html = """
        <div>
            <div class="dg_u">
                <a class="dv_i" href="https://url.com/1">
                    <div class="vthblock">
                        <div class="vthumb">
                            <img src="thumb_1.jpg" />
                        </div>
                        <div>
                            <div class="tl">
                                Title 1
                            </div>
                        </div>
                    </div>
                    <div class="videoInfoPanel">
                        <div class="pubInfo">
                            <div>Content 1</div>
                        </div>
                    </div>
                </a>
            </div>
            <div class="dg_u">
                <a class="dv_i" href="/videos/search?abcde">
                    <div class="vthblock">
                        <div class="vthumb">
                            <img src="thumb_2.jpg" />
                        </div>
                        <div>
                            <div class="tl">
                                Title 2
                            </div>
                        </div>
                    </div>
                    <div class="videoInfoPanel">
                        <div class="pubInfo">
                            <div>Content 2</div>
                        </div>
                    </div>
                </a>
            </div>
        </div>
        """
        response = mock.Mock(text=html)
        results = bing_videos.response(response)
        self.assertIsInstance(results, list)
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0]['title'], 'Title 1')
        self.assertEqual(results[0]['url'], 'https://url.com/1')
        self.assertEqual(results[0]['content'], 'Content 1')
        self.assertEqual(results[0]['thumbnail'], 'thumb_1.jpg')