
Merge pull request #88 from pointhi/engines

update and fix search engines
Adam Tauber, 10 years ago
commit f36d1e28fa

searx/engines/bing.py (+39 -6)

@@ -1,48 +1,81 @@
+## Bing (Web)
+#
+# @website     https://www.bing.com
+# @provide-api yes (http://datamarket.azure.com/dataset/bing/search), max. 5000 query/month
+#
+# @using-api   no (because of query limit)
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, content
+#
+# @todo        publishedDate
+
 from urllib import urlencode
 from cgi import escape
 from lxml import html
 
-base_url = 'http://www.bing.com/'
-search_string = 'search?{query}&first={offset}'
+# engine dependent config
+categories = ['general']
 paging = True
 language_support = True
 
+# search-url
+base_url = 'https://www.bing.com/'
+search_string = 'search?{query}&first={offset}'
 
+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
+
     if params['language'] == 'all':
         language = 'en-US'
     else:
         language = params['language'].replace('_', '-')
+
     search_path = search_string.format(
         query=urlencode({'q': query, 'setmkt': language}),
         offset=offset)
 
     params['cookies']['SRCHHPGUSR'] = \
         'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0]
-    #if params['category'] == 'images':
-    #    params['url'] = base_url + 'images/' + search_path
+
     params['url'] = base_url + search_path
     return params
 
 
+# get response from search-request
def response(resp):
     results = []
+
     dom = html.fromstring(resp.content)
+
+    # parse results
     for result in dom.xpath('//div[@class="sa_cc"]'):
         link = result.xpath('.//h3/a')[0]
         url = link.attrib.get('href')
         title = ' '.join(link.xpath('.//text()'))
         content = escape(' '.join(result.xpath('.//p//text()')))
-        results.append({'url': url, 'title': title, 'content': content})
 
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'content': content})
+
+    # return results if something is found
     if results:
         return results
 
+    # parse results again if nothing is found yet
     for result in dom.xpath('//li[@class="b_algo"]'):
         link = result.xpath('.//h2/a')[0]
         url = link.attrib.get('href')
         title = ' '.join(link.xpath('.//text()'))
         content = escape(' '.join(result.xpath('.//p//text()')))
-        results.append({'url': url, 'title': title, 'content': content})
+
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'content': content})
+
+    # return results
     return results
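
Reviewer note (bing.py): `response()` parses the old `sa_cc` markup first and falls back to the newer `b_algo` list only when the first pass finds nothing, since Bing serves both layouts. A minimal, self-contained sketch of that fallback pattern, run against an invented HTML snippet:

```python
from lxml import html

page = html.fromstring(
    '<ul><li class="b_algo"><h2><a href="https://example.com/">Example</a></h2>'
    '<p>snippet text</p></li></ul>')

results = []

# first pass: old result markup (matches nothing in this snippet)
for result in page.xpath('//div[@class="sa_cc"]'):
    results.append({'url': result.xpath('.//h3/a')[0].attrib.get('href')})

# fallback pass: newer result markup
if not results:
    for result in page.xpath('//li[@class="b_algo"]'):
        link = result.xpath('.//h2/a')[0]
        results.append({'url': link.attrib.get('href'),
                        'title': ' '.join(link.xpath('.//text()'))})

print(results)  # [{'url': 'https://example.com/', 'title': 'Example'}]
```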

searx/engines/bing_images.py (+81 -0)

@@ -0,0 +1,81 @@
+## Bing (Images)
+#
+# @website     https://www.bing.com/images
+# @provide-api yes (http://datamarket.azure.com/dataset/bing/search), max. 5000 query/month
+#
+# @using-api   no (because of query limit)
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, img_src
+#
+# @todo        currently up to 35 images are received per page, because Bing ignores count=10; the response is therefore limited to 10 images
+
+from urllib import urlencode
+from cgi import escape
+from lxml import html
+from yaml import load
+import re
+
+# engine dependent config
+categories = ['images']
+paging = True
+
+# search-url
+base_url = 'https://www.bing.com/'
+search_string = 'images/search?{query}&count=10&first={offset}'
+
+# do search-request
+def request(query, params):
+    offset = (params['pageno'] - 1) * 10 + 1
+
+    # required for cookie
+    language = 'en-US'
+
+    search_path = search_string.format(
+        query=urlencode({'q': query}),
+        offset=offset)
+
+    params['cookies']['SRCHHPGUSR'] = \
+        'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0]
+
+    params['url'] = base_url + search_path
+
+    print(params['url'])
+
+    return params
+
+
+# get response from search-request
+def response(resp):
+    results = []
+
+    dom = html.fromstring(resp.content)
+
+    # init regex for yaml-parsing
+    p = re.compile('({|,)([a-z]+):(")')
+
+    # parse results
+    for result in dom.xpath('//div[@class="dg_u"]'):
+        link = result.xpath('./a')[0]
+
+        # parse yaml-data (it is required to add a space, to make it parsable)
+        yaml_data = load(p.sub(r'\1\2: \3', link.attrib.get('m')))
+
+        title = link.attrib.get('t1')
+        #url = 'http://' + link.attrib.get('t3')
+        url = yaml_data.get('surl')
+        img_src = yaml_data.get('imgurl')
+
+        # append result
+        results.append({'template': 'images.html',
+                        'url': url,
+                        'title': title,
+                        'content': '',
+                        'img_src': img_src})
+
+        # stop parsing if 10 images are found
+        if len(results) >= 10:
+            break
+
+    # return results
+    return results
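
Reviewer note (bing_images.py): the `m` attribute on each result holds JSON-ish metadata with unquoted keys and no space after the colon, which PyYAML rejects; the regex rewrites `key:"` to `key: "` so `yaml.load` accepts it as a flow mapping. A standalone demonstration with an invented attribute value (incidentally, the `print(params['url'])` in `request()` looks like leftover debugging):

```python
import re
from yaml import load

p = re.compile('({|,)([a-z]+):(")')

# invented sample of what link.attrib.get('m') contains
raw = '{surl:"https://example.com/page",imgurl:"https://example.com/img.jpg"}'

yaml_data = load(p.sub(r'\1\2: \3', raw))
print(yaml_data['surl'])    # https://example.com/page
print(yaml_data['imgurl'])  # https://example.com/img.jpg
```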

searx/engines/bing_news.py (+53 -17)

@@ -1,50 +1,86 @@
+## Bing (News)
+#
+# @website     https://www.bing.com/news
+# @provide-api yes (http://datamarket.azure.com/dataset/bing/search), max. 5000 query/month
+#
+# @using-api   no (because of query limit)
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, content, publishedDate
+
 from urllib import urlencode
 from cgi import escape
 from lxml import html
+from datetime import datetime, timedelta
+from dateutil import parser
+import re
 
+# engine dependent config
 categories = ['news']
-
-base_url = 'http://www.bing.com/'
-search_string = 'news/search?{query}&first={offset}'
 paging = True
 language_support = True
 
+# search-url
+base_url = 'https://www.bing.com/'
+search_string = 'news/search?{query}&first={offset}'
 
+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
+
     if params['language'] == 'all':
         language = 'en-US'
     else:
         language = params['language'].replace('_', '-')
+
     search_path = search_string.format(
         query=urlencode({'q': query, 'setmkt': language}),
         offset=offset)
 
     params['cookies']['SRCHHPGUSR'] = \
         'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0]
-    #if params['category'] == 'images':
-    # params['url'] = base_url + 'images/' + search_path
+
     params['url'] = base_url + search_path
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.content)
-    for result in dom.xpath('//div[@class="sa_cc"]'):
-        link = result.xpath('.//h3/a')[0]
+
+    # parse results
+    for result in dom.xpath('//div[@class="sn_r"]'):
+        link = result.xpath('.//div[@class="newstitle"]/a')[0]
         url = link.attrib.get('href')
         title = ' '.join(link.xpath('.//text()'))
-        content = escape(' '.join(result.xpath('.//p//text()')))
-        results.append({'url': url, 'title': title, 'content': content})
+        content = escape(' '.join(result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]//text()')))
+
+        # parse publishedDate
+        publishedDate = escape(' '.join(result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_ST"]//span[@class="sn_tm"]//text()')))
 
-    if results:
-        return results
+        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
+            timeNumbers = re.findall(r'\d+', publishedDate)
+            publishedDate = datetime.now()\
+                - timedelta(minutes=int(timeNumbers[0]))
+        elif re.match("^[0-9]+ hour(s|) ago$", publishedDate):
+            timeNumbers = re.findall(r'\d+', publishedDate)
+            publishedDate = datetime.now()\
+                - timedelta(hours=int(timeNumbers[0]))
+        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
+            timeNumbers = re.findall(r'\d+', publishedDate)
+            publishedDate = datetime.now()\
+                - timedelta(hours=int(timeNumbers[0]))\
+                - timedelta(minutes=int(timeNumbers[1]))
+        else:
+            publishedDate = parser.parse(publishedDate)
 
-    for result in dom.xpath('//li[@class="b_algo"]'):
-        link = result.xpath('.//h2/a')[0]
-        url = link.attrib.get('href')
-        title = ' '.join(link.xpath('.//text()'))
-        content = escape(' '.join(result.xpath('.//p//text()')))
-        results.append({'url': url, 'title': title, 'content': content})
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'publishedDate': publishedDate,
+                        'content': content})
+
+    # return results
     return results
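
Reviewer note (bing_news.py): the `publishedDate` branches convert Bing's relative timestamps into absolute datetimes and hand anything else to `dateutil`. The same logic condensed into a standalone helper for easier testing (the function name is ours, not part of the engine):

```python
import re
from datetime import datetime, timedelta
from dateutil import parser

def parse_bing_date(text):
    numbers = [int(n) for n in re.findall(r'\d+', text)]
    if re.match("^[0-9]+ minute(s|) ago$", text):
        return datetime.now() - timedelta(minutes=numbers[0])
    if re.match("^[0-9]+ hour(s|) ago$", text):
        return datetime.now() - timedelta(hours=numbers[0])
    if re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", text):
        return datetime.now() - timedelta(hours=numbers[0], minutes=numbers[1])
    return parser.parse(text)  # absolute dates pass through dateutil

print(parse_bing_date('2 hours, 14 minutes ago'))
```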

searx/engines/dailymotion.py (+35 -19)

@@ -1,45 +1,61 @@
+## Dailymotion (Videos)
+#
+# @website     https://www.dailymotion.com
+# @provide-api yes (http://www.dailymotion.com/developer)
+#
+# @using-api   yes
+# @results     JSON
+# @stable      yes
+# @parse       url, title, thumbnail
+#
+# @todo        set content-parameter with correct data
+
 from urllib import urlencode
 from json import loads
 from lxml import html
 
+# engine dependent config
 categories = ['videos']
 locale = 'en_US'
+paging = True
 
+# search-url
 # see http://www.dailymotion.com/doc/api/obj-video.html
-search_url = 'https://api.dailymotion.com/videos?fields=title,description,duration,url,thumbnail_360_url&sort=relevance&limit=25&page={pageno}&{query}'  # noqa
-
-# TODO use video result template
-content_tpl = '<a href="{0}" title="{0}" ><img src="{1}" /></a><br />'
-
-paging = True
+search_url = 'https://api.dailymotion.com/videos?fields=title,description,duration,url,thumbnail_360_url&sort=relevance&limit=5&page={pageno}&{query}'  # noqa
 
 
+# do search-request
 def request(query, params):
     params['url'] = search_url.format(
         query=urlencode({'search': query, 'localization': locale}),
         pageno=params['pageno'])
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     search_res = loads(resp.text)
+
+    # return empty array if there are no results
     if not 'list' in search_res:
-        return results
+        return []
+
+    # parse results
     for res in search_res['list']:
         title = res['title']
         url = res['url']
-        if res['thumbnail_360_url']:
-            content = content_tpl.format(url, res['thumbnail_360_url'])
-        else:
-            content = ''
-        if res['description']:
-            description = text_content_from_html(res['description'])
-            content += description[:500]
-        results.append({'url': url, 'title': title, 'content': content})
-    return results
+        #content = res['description']
+        content = ''
+        thumbnail = res['thumbnail_360_url']
 
+        results.append({'template': 'videos.html',
+                        'url': url,
+                        'title': title,
+                        'content': content,
+                        'thumbnail': thumbnail})
 
-def text_content_from_html(html_string):
-    desc_html = html.fragment_fromstring(html_string, create_parent=True)
-    return desc_html.text_content()
+    # return results
+    return results
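
Reviewer note (dailymotion.py): the rewrite drops the HTML content template and maps the API's JSON straight onto the `videos.html` result template. The same mapping run against an invented one-item payload instead of a live request:

```python
from json import loads

payload = '''{"list": [{"title": "Test video",
                        "url": "https://www.dailymotion.com/video/x1",
                        "thumbnail_360_url": "https://s1.dmcdn.net/x1/360.jpg"}]}'''

search_res = loads(payload)

results = [{'template': 'videos.html',
            'url': res['url'],
            'title': res['title'],
            'content': '',
            'thumbnail': res['thumbnail_360_url']}
           for res in search_res.get('list', [])]

print(results[0]['thumbnail'])  # https://s1.dmcdn.net/x1/360.jpg
```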

searx/engines/google.py (+24 -4)

@@ -1,37 +1,57 @@
-#!/usr/bin/env python
+## Google (Web)
+#
+# @website     https://www.google.com
+# @provide-api yes (https://developers.google.com/web-search/docs/), deprecated!
+#
+# @using-api   yes
+# @results     JSON
+# @stable      yes (but deprecated)
+# @parse       url, title, content
 
 from urllib import urlencode
 from json import loads
 
+# engine dependent config
 categories = ['general']
+paging = True
+language_support = True
 
+# search-url
 url = 'https://ajax.googleapis.com/'
 search_url = url + 'ajax/services/search/web?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={language}'  # noqa
 
-paging = True
-language_support = True
-
 
+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 8
+
     language = 'en-US'
     if params['language'] != 'all':
         language = params['language'].replace('_', '-')
+
     params['url'] = search_url.format(offset=offset,
                                       query=urlencode({'q': query}),
                                       language=language)
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     search_res = loads(resp.text)
 
+    # return empty array if there are no results
     if not search_res.get('responseData', {}).get('results'):
         return []
 
+    # parse results
     for result in search_res['responseData']['results']:
+        # append result
         results.append({'url': result['unescapedUrl'],
                         'title': result['titleNoFormatting'],
                         'content': result['content']})
+
+    # return results
     return results
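
Reviewer note (google.py): the guard `search_res.get('responseData', {}).get('results')` handles both a missing `responseData` object and a missing or empty `results` list in one expression; note it would still raise `AttributeError` on an explicit `"responseData": null`. Toy payloads:

```python
payloads = (
    {},                                      # no responseData at all
    {'responseData': {}},                    # responseData without results
    {'responseData': {'results': [{'unescapedUrl': 'https://example.com/',
                                   'titleNoFormatting': 'Example',
                                   'content': 'snippet'}]}},
)

for search_res in payloads:
    print(bool(search_res.get('responseData', {}).get('results')))
# False, False, True
```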

searx/engines/google_images.py (+26 -5)

@@ -1,37 +1,58 @@
-#!/usr/bin/env python
+## Google (Images)
+#
+# @website     https://www.google.com
+# @provide-api yes (https://developers.google.com/web-search/docs/), deprecated!
+#
+# @using-api   yes
+# @results     JSON
+# @stable      yes (but deprecated)
+# @parse       url, title, img_src
 
 from urllib import urlencode
 from json import loads
 
+# engine dependent config
 categories = ['images']
+paging = True
 
+# search-url
 url = 'https://ajax.googleapis.com/'
 search_url = url + 'ajax/services/search/images?v=1.0&start={offset}&rsz=large&safe=off&filter=off&{query}'  # noqa
 
-paging = True
 
+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 8
+
     params['url'] = search_url.format(query=urlencode({'q': query}),
                                       offset=offset)
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     search_res = loads(resp.text)
-    if not search_res.get('responseData'):
-        return []
-    if not search_res['responseData'].get('results'):
+
+    # return empty array if there are no results
+    if not search_res.get('responseData', {}).get('results'):
         return []
+
+    # parse results
     for result in search_res['responseData']['results']:
         href = result['originalContextUrl']
         title = result['title']
         if not result['url']:
             continue
+
+        # append result
         results.append({'url': href,
                         'title': title,
                         'content': '',
                         'img_src': result['url'],
                         'template': 'images.html'})
+
+    # return results
     return results

searx/engines/google_news.py (+25 -6)

@@ -1,43 +1,62 @@
-#!/usr/bin/env python
+## Google (News)
+#
+# @website     https://www.google.com
+# @provide-api yes (https://developers.google.com/web-search/docs/), deprecated!
+#
+# @using-api   yes
+# @results     JSON
+# @stable      yes (but deprecated)
+# @parse       url, title, content, publishedDate
 
 from urllib import urlencode
 from json import loads
 from dateutil import parser
 
+# engine dependent config
 categories = ['news']
+paging = True
+language_support = True
 
+# search-url
 url = 'https://ajax.googleapis.com/'
 search_url = url + 'ajax/services/search/news?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={language}'  # noqa
 
-paging = True
-language_support = True
-
 
+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 8
+
     language = 'en-US'
     if params['language'] != 'all':
         language = params['language'].replace('_', '-')
+
     params['url'] = search_url.format(offset=offset,
                                       query=urlencode({'q': query}),
                                       language=language)
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     search_res = loads(resp.text)
 
+    # return empty array if there are no results
     if not search_res.get('responseData', {}).get('results'):
         return []
 
+    # parse results
     for result in search_res['responseData']['results']:
-
-# Mon, 10 Mar 2014 16:26:15 -0700
+        # parse publishedDate
         publishedDate = parser.parse(result['publishedDate'])
 
+        # append result
         results.append({'url': result['unescapedUrl'],
                         'title': result['titleNoFormatting'],
                         'publishedDate': publishedDate,
                         'content': result['content']})
+
+    # return results
     return results

searx/engines/vimeo.py (+40 -22)

@@ -1,43 +1,58 @@
+## Vimeo (Videos)
+#
+# @website     https://vimeo.com/
+# @provide-api yes (http://developer.vimeo.com/api), they have a maximum count of queries/hour
+#
+# @using-api   no (TODO, rewrite to api)
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, publishedDate, thumbnail
+#
+# @todo        rewrite to api
+# @todo        set content-parameter with correct data
+
 from urllib import urlencode
 from HTMLParser import HTMLParser
 from lxml import html
 from searx.engines.xpath import extract_text
 from dateutil import parser
 
-base_url = 'http://vimeo.com'
-search_url = base_url + '/search?{query}'
-url_xpath = None
-content_xpath = None
-title_xpath = None
-results_xpath = ''
-content_tpl = '<a href="{0}">  <img src="{2}"/> </a>'
-publishedDate_xpath = './/p[@class="meta"]//attribute::datetime'
+# engine dependent config
+categories = ['videos']
+paging = True
 
-# the cookie set by vimeo contains all the following values,
-# but only __utma seems to be requiered
-cookie = {
-    #'vuid':'918282893.1027205400'
-    # 'ab_bs':'%7B%223%22%3A279%7D'
-     '__utma': '00000000.000#0000000.0000000000.0000000000.0000000000.0'
-    # '__utmb':'18302654.1.10.1388942090'
-    #, '__utmc':'18302654'
-    #, '__utmz':'18#302654.1388942090.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)'  # noqa
-    #, '__utml':'search'
-}
+# search-url
+base_url = 'https://vimeo.com'
+search_url = base_url + '/search/page:{pageno}?{query}'
+
+# specific xpath variables
+url_xpath = './a/@href'
+content_xpath = './a/img/@src'
+title_xpath = './a/div[@class="data"]/p[@class="title"]/text()'
+results_xpath = '//div[@id="browse_content"]/ol/li'
+publishedDate_xpath = './/p[@class="meta"]//attribute::datetime'
 
 
+# do search-request
 def request(query, params):
-    params['url'] = search_url.format(query=urlencode({'q': query}))
-    params['cookies'] = cookie
+    params['url'] = search_url.format(pageno=params['pageno'],
+                                      query=urlencode({'q': query}))
+
+    # TODO required?
+    params['cookies']['__utma'] = '00000000.000#0000000.0000000000.0000000000.0000000000.0'
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.text)
 
     p = HTMLParser()
 
+    # parse results
     for result in dom.xpath(results_xpath):
         url = base_url + result.xpath(url_xpath)[0]
         title = p.unescape(extract_text(result.xpath(title_xpath)))
@@ -45,10 +60,13 @@ def response(resp):
         publishedDate = parser.parse(extract_text(
             result.xpath(publishedDate_xpath)[0]))
 
+        # append result
         results.append({'url': url,
                         'title': title,
-                        'content': content_tpl.format(url, title, thumbnail),
+                        'content': '',
                         'template': 'videos.html',
                         'publishedDate': publishedDate,
                         'thumbnail': thumbnail})
+
+    # return results
     return results
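
Reviewer note (vimeo.py): paging now goes through Vimeo's `/search/page:N` path segment rather than a query parameter. What the new URL template expands to (query value invented):

```python
from urllib import urlencode  # Python 2, as in the engine code

base_url = 'https://vimeo.com'
search_url = base_url + '/search/page:{pageno}?{query}'

print(search_url.format(pageno=2, query=urlencode({'q': 'searx'})))
# https://vimeo.com/search/page:2?q=searx
```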

searx/engines/yahoo.py (+41 -6)

@@ -1,64 +1,99 @@
-#!/usr/bin/env python
+## Yahoo (Web)
+#
+# @website     https://search.yahoo.com/web
+# @provide-api yes (https://developer.yahoo.com/boss/search/), $0.80/1000 queries
+#
+# @using-api   no (because pricing)
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, content, suggestion
 
 from urllib import urlencode
 from urlparse import unquote
 from lxml import html
 from searx.engines.xpath import extract_text, extract_url
 
+# engine dependent config
 categories = ['general']
-search_url = 'http://search.yahoo.com/search?{query}&b={offset}'
+paging = True
+language_support = True
+
+# search-url
+search_url = 'https://search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}'
+
+# specific xpath variables
 results_xpath = '//div[@class="res"]'
 url_xpath = './/h3/a/@href'
 title_xpath = './/h3/a'
 content_xpath = './/div[@class="abstr"]'
 suggestion_xpath = '//div[@id="satat"]//a'
 
-paging = True
-
 
+# remove yahoo-specific tracking-url
 def parse_url(url_string):
     endings = ['/RS', '/RK']
     endpositions = []
     start = url_string.find('http', url_string.find('/RU=')+1)
+
     for ending in endings:
         endpos = url_string.rfind(ending)
         if endpos > -1:
             endpositions.append(endpos)
 
     end = min(endpositions)
+
     return unquote(url_string[start:end])
 
 
+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
+
     if params['language'] == 'all':
         language = 'en'
     else:
         language = params['language'].split('_')[0]
+
     params['url'] = search_url.format(offset=offset,
-                                      query=urlencode({'p': query}))
+                                      query=urlencode({'p': query}),
+                                      lang=language)
+
+    # TODO required?
     params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
         .format(lang=language)
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.text)
 
+    # parse results
     for result in dom.xpath(results_xpath):
         try:
             url = parse_url(extract_url(result.xpath(url_xpath), search_url))
             title = extract_text(result.xpath(title_xpath)[0])
         except:
             continue
+
         content = extract_text(result.xpath(content_xpath)[0])
-        results.append({'url': url, 'title': title, 'content': content})
 
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'content': content})
+
+    # if no suggestion found, return results
     if not suggestion_xpath:
         return results
 
+    # parse suggestion
     for suggestion in dom.xpath(suggestion_xpath):
+        # append suggestion
         results.append({'suggestion': extract_text(suggestion)})
 
+    # return results
     return results
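
Reviewer note (yahoo.py): `parse_url()` slices the real target out of Yahoo's redirect wrapper — everything between the `http` that follows `/RU=` and the earlier of the `/RS` or `/RK` markers, then percent-decodes it. The function from above applied to an invented redirect URL:

```python
from urlparse import unquote  # Python 2, as in the engine code

def parse_url(url_string):
    endings = ['/RS', '/RK']
    endpositions = []
    start = url_string.find('http', url_string.find('/RU=') + 1)

    for ending in endings:
        endpos = url_string.rfind(ending)
        if endpos > -1:
            endpositions.append(endpos)

    end = min(endpositions)

    return unquote(url_string[start:end])

wrapped = 'https://r.search.yahoo.com/_ylt=abc/RU=https%3a%2f%2fexample.com%2f/RK=0/RS=xyz'
print(parse_url(wrapped))  # https://example.com/
```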

searx/engines/yahoo_news.py (+31 -11)

@@ -1,4 +1,12 @@
-#!/usr/bin/env python
+## Yahoo (News)
+#
+# @website     https://news.yahoo.com
+# @provide-api yes (https://developer.yahoo.com/boss/search/), $0.80/1000 queries
+#
+# @using-api   no (because pricing)
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, content, publishedDate
 
 from urllib import urlencode
 from lxml import html
@@ -8,8 +16,15 @@ from datetime import datetime, timedelta
 import re
 from dateutil import parser
 
+# engine dependent config
 categories = ['news']
-search_url = 'http://news.search.yahoo.com/search?{query}&b={offset}'
+paging = True
+language_support = True
+
+# search-url
+search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}'
+
+# specific xpath variables
 results_xpath = '//div[@class="res"]'
 url_xpath = './/h3/a/@href'
 title_xpath = './/h3/a'
@@ -17,30 +32,39 @@ content_xpath = './/div[@class="abstr"]'
 publishedDate_xpath = './/span[@class="timestamp"]'
 suggestion_xpath = '//div[@id="satat"]//a'
 
-paging = True
-
 
+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
+
     if params['language'] == 'all':
         language = 'en'
     else:
         language = params['language'].split('_')[0]
+
     params['url'] = search_url.format(offset=offset,
-                                      query=urlencode({'p': query}))
+                                      query=urlencode({'p': query}),
+                                      lang=language)
+
+    # TODO required?
     params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
         .format(lang=language)
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.text)
 
+    # parse results
     for result in dom.xpath(results_xpath):
         url = parse_url(extract_url(result.xpath(url_xpath), search_url))
         title = extract_text(result.xpath(title_xpath)[0])
         content = extract_text(result.xpath(content_xpath)[0])
+
+        # parse publishedDate
         publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])
 
         if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
@@ -58,15 +82,11 @@ def response(resp):
         if publishedDate.year == 1900:
             publishedDate = publishedDate.replace(year=datetime.now().year)
 
+        # append result
         results.append({'url': url,
                         'title': title,
                         'content': content,
                         'publishedDate': publishedDate})
 
-    if not suggestion_xpath:
-        return results
-
-    for suggestion in dom.xpath(suggestion_xpath):
-        results.append({'suggestion': extract_text(suggestion)})
-
+    # return results
     return results

searx/settings.yml (+6 -5)

@@ -20,6 +20,11 @@ engines:
     locale : en-US
     shortcut : bi
 
+  - name : bing images
+    engine : bing_images
+    locale : en-US
+    shortcut : bii
+
   - name : bing news
     engine : bing_news
     locale : en-US
@@ -148,11 +153,7 @@ engines:
 
   - name : vimeo
     engine : vimeo
-    categories : videos
-    results_xpath : //div[@id="browse_content"]/ol/li
-    url_xpath : ./a/@href
-    title_xpath : ./a/div[@class="data"]/p[@class="title"]/text()
-    content_xpath : ./a/img/@src
+    locale : en-US
     shortcut : vm
 
 locales:
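
Reviewer note (settings.yml): the vimeo-specific xpaths move out of the config and into the engine module, so the entry shrinks to the generic keys; the new `bing images` engine is registered the same way. A quick check that the new entry parses as expected:

```python
from yaml import load

entry = load('''
- name : bing images
  engine : bing_images
  locale : en-US
  shortcut : bii
''')[0]

print(entry['engine'])    # bing_images -> loaded from searx/engines/bing_images.py
print(entry['shortcut'])  # bii
```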