Merge pull request #88 from pointhi/engines

update and fix search engines

Adam Tauber, 10 years ago
commit f36d1e28fa

searx/engines/bing.py  (+39, -6)

+## Bing (Web)
+#
+# @website     https://www.bing.com
+# @provide-api yes (http://datamarket.azure.com/dataset/bing/search), max. 5000 query/month
+#
+# @using-api   no (because of query limit)
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, content
+#
+# @todo        publishedDate
+
 from urllib import urlencode
 from cgi import escape
 from lxml import html

-base_url = 'http://www.bing.com/'
-search_string = 'search?{query}&first={offset}'
+# engine dependent config
+categories = ['general']
 paging = True
 language_support = True

+# search-url
+base_url = 'https://www.bing.com/'
+search_string = 'search?{query}&first={offset}'

+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
+
     if params['language'] == 'all':
         language = 'en-US'
     else:
         language = params['language'].replace('_', '-')
+
     search_path = search_string.format(
         query=urlencode({'q': query, 'setmkt': language}),
         offset=offset)

     params['cookies']['SRCHHPGUSR'] = \
         'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0]
-    #if params['category'] == 'images':
-    #    params['url'] = base_url + 'images/' + search_path
+
     params['url'] = base_url + search_path
     return params


+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.content)
+
+    # parse results
     for result in dom.xpath('//div[@class="sa_cc"]'):
         link = result.xpath('.//h3/a')[0]
         url = link.attrib.get('href')
         title = ' '.join(link.xpath('.//text()'))
         content = escape(' '.join(result.xpath('.//p//text()')))
-        results.append({'url': url, 'title': title, 'content': content})

+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'content': content})
+
+    # return results if something is found
     if results:
         return results

+    # parse results again if nothing is found yet
     for result in dom.xpath('//li[@class="b_algo"]'):
         link = result.xpath('.//h2/a')[0]
         url = link.attrib.get('href')
         title = ' '.join(link.xpath('.//text()'))
         content = escape(' '.join(result.xpath('.//p//text()')))
-        results.append({'url': url, 'title': title, 'content': content})
+
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'content': content})
+
+    # return results
     return results
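
A quick way to exercise an engine like this outside of searx is to drive its request()/response() pair by hand. The harness below is only a sketch: the requests call and the exact set of params keys are assumptions made for illustration, searx itself builds and dispatches the params dict internally.

# hypothetical stand-alone harness, not part of searx
import requests
from searx.engines import bing

params = {'pageno': 1, 'language': 'de_DE', 'cookies': {}}
params = bing.request('test query', params)

resp = requests.get(params['url'], cookies=params['cookies'])

for r in bing.response(resp):
    print(r['url'] + ' -- ' + r['title'])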

searx/engines/bing_images.py  (+81, -0)

+## Bing (Images)
+#
+# @website     https://www.bing.com/images
+# @provide-api yes (http://datamarket.azure.com/dataset/bing/search), max. 5000 query/month
+#
+# @using-api   no (because of query limit)
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, img_src
+#
+# @todo        currently there are up to 35 images receive per page, because bing does not parse count=10. limited response to 10 images
+
+from urllib import urlencode
+from cgi import escape
+from lxml import html
+from yaml import load
+import re
+
+# engine dependent config
+categories = ['images']
+paging = True
+
+# search-url
+base_url = 'https://www.bing.com/'
+search_string = 'images/search?{query}&count=10&first={offset}'
+
+# do search-request
+def request(query, params):
+    offset = (params['pageno'] - 1) * 10 + 1
+
+    # required for cookie
+    language = 'en-US'
+
+    search_path = search_string.format(
+        query=urlencode({'q': query}),
+        offset=offset)
+
+    params['cookies']['SRCHHPGUSR'] = \
+        'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0]
+
+    params['url'] = base_url + search_path
+
+    print(params['url'])
+
+    return params
+
+
+# get response from search-request
+def response(resp):
+    results = []
+
+    dom = html.fromstring(resp.content)
+
+    # init regex for yaml-parsing
+    p = re.compile( '({|,)([a-z]+):(")')
+
+    # parse results
+    for result in dom.xpath('//div[@class="dg_u"]'):
+        link = result.xpath('./a')[0]
+
+        # parse yaml-data (it is required to add a space, to make it parsable)
+        yaml_data = load(p.sub( r'\1\2: \3', link.attrib.get('m')))
+
+        title = link.attrib.get('t1')
+        #url = 'http://' + link.attrib.get('t3')
+        url = yaml_data.get('surl')
+        img_src = yaml_data.get('imgurl')
+
+        # append result
+        results.append({'template': 'images.html',
+                        'url': url,
+                        'title': title,
+                        'content': '',
+                        'img_src': img_src})
+
+        # TODO stop parsing if 10 images are found
+        if len(results) >= 10:
+            break
+
+    # return results
+    return results
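
The least obvious part of this new engine is the handling of the m attribute: Bing serializes the image metadata as a JSON-like blob without a space after the key colons, which yaml.load() rejects, so the regex inserts one before parsing. A stand-alone illustration, using an invented sample value (only the surl and imgurl keys mirror what the engine reads):

import re
from yaml import load

# same regex as above: turn key:"value" into key: "value"
p = re.compile('({|,)([a-z]+):(")')

m_attr = '{surl:"http://example.com/page",imgurl:"http://example.com/image.jpg"}'
yaml_data = load(p.sub(r'\1\2: \3', m_attr))

print(yaml_data.get('surl'))    # http://example.com/page
print(yaml_data.get('imgurl'))  # http://example.com/image.jpg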

searx/engines/bing_news.py  (+53, -17)

+## Bing (News)
+#
+# @website     https://www.bing.com/news
+# @provide-api yes (http://datamarket.azure.com/dataset/bing/search), max. 5000 query/month
+#
+# @using-api   no (because of query limit)
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, content, publishedDate
+
 from urllib import urlencode
 from cgi import escape
 from lxml import html
+from datetime import datetime, timedelta
+from dateutil import parser
+import re

+# engine dependent config
 categories = ['news']
-
-base_url = 'http://www.bing.com/'
-search_string = 'news/search?{query}&first={offset}'
 paging = True
 language_support = True

+# search-url
+base_url = 'https://www.bing.com/'
+search_string = 'news/search?{query}&first={offset}'

+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
+
     if params['language'] == 'all':
         language = 'en-US'
     else:
         language = params['language'].replace('_', '-')
+
     search_path = search_string.format(
         query=urlencode({'q': query, 'setmkt': language}),
         offset=offset)

     params['cookies']['SRCHHPGUSR'] = \
         'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0]
-    #if params['category'] == 'images':
-    # params['url'] = base_url + 'images/' + search_path
+
     params['url'] = base_url + search_path
     return params


+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.content)
-    for result in dom.xpath('//div[@class="sa_cc"]'):
-        link = result.xpath('.//h3/a')[0]
+
+    # parse results
+    for result in dom.xpath('//div[@class="sn_r"]'):
+        link = result.xpath('.//div[@class="newstitle"]/a')[0]
         url = link.attrib.get('href')
         title = ' '.join(link.xpath('.//text()'))
-        content = escape(' '.join(result.xpath('.//p//text()')))
-        results.append({'url': url, 'title': title, 'content': content})
+        content = escape(' '.join(result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]//text()')))
+
+        # parse publishedDate
+        publishedDate = escape(' '.join(result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_ST"]//span[@class="sn_tm"]//text()')))

-    if results:
-        return results
+        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
+            timeNumbers = re.findall(r'\d+', publishedDate)
+            publishedDate = datetime.now()\
+                - timedelta(minutes=int(timeNumbers[0]))
+        elif re.match("^[0-9]+ hour(s|) ago$", publishedDate):
+            timeNumbers = re.findall(r'\d+', publishedDate)
+            publishedDate = datetime.now()\
+                - timedelta(hours=int(timeNumbers[0]))
+        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
+            timeNumbers = re.findall(r'\d+', publishedDate)
+            publishedDate = datetime.now()\
+                - timedelta(hours=int(timeNumbers[0]))\
+                - timedelta(minutes=int(timeNumbers[1]))
+        else:
+            publishedDate = parser.parse(publishedDate)

-    for result in dom.xpath('//li[@class="b_algo"]'):
-        link = result.xpath('.//h2/a')[0]
-        url = link.attrib.get('href')
-        title = ' '.join(link.xpath('.//text()'))
-        content = escape(' '.join(result.xpath('.//p//text()')))
-        results.append({'url': url, 'title': title, 'content': content})
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'publishedDate': publishedDate,
+                        'content': content})
+
+    # return results
     return results
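
The new publishedDate handling is needed because the news portal reports ages such as "20 minutes ago" or "2 hours, 5 minutes ago", which dateutil cannot parse, so these forms are converted by hand before falling back to parser.parse(). The same logic, pulled out into a stand-alone helper for clarity (the function name is made up for this sketch):

import re
from datetime import datetime, timedelta
from dateutil import parser


def parse_bing_news_date(text):
    numbers = re.findall(r'\d+', text)

    if re.match("^[0-9]+ minute(s|) ago$", text):
        return datetime.now() - timedelta(minutes=int(numbers[0]))
    if re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", text):
        return datetime.now() - timedelta(hours=int(numbers[0]),
                                          minutes=int(numbers[1]))
    if re.match("^[0-9]+ hour(s|) ago$", text):
        return datetime.now() - timedelta(hours=int(numbers[0]))
    # anything else: let dateutil try to parse an absolute date
    return parser.parse(text)


print(parse_bing_news_date('20 minutes ago'))
print(parse_bing_news_date('2 hours, 5 minutes ago'))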

searx/engines/dailymotion.py  (+35, -19)

+## Dailymotion (Videos)
+#
+# @website     https://www.dailymotion.com
+# @provide-api yes (http://www.dailymotion.com/developer)
+#
+# @using-api   yes
+# @results     JSON
+# @stable      yes
+# @parse       url, title, thumbnail
+#
+# @todo        set content-parameter with correct data
+
 from urllib import urlencode
 from json import loads
 from lxml import html

+# engine dependent config
 categories = ['videos']
 locale = 'en_US'
+paging = True

+# search-url
 # see http://www.dailymotion.com/doc/api/obj-video.html
-search_url = 'https://api.dailymotion.com/videos?fields=title,description,duration,url,thumbnail_360_url&sort=relevance&limit=25&page={pageno}&{query}'  # noqa
-
-# TODO use video result template
-content_tpl = '<a href="{0}" title="{0}" ><img src="{1}" /></a><br />'
-
-paging = True
+search_url = 'https://api.dailymotion.com/videos?fields=title,description,duration,url,thumbnail_360_url&sort=relevance&limit=5&page={pageno}&{query}'  # noqa


+# do search-request
 def request(query, params):
     params['url'] = search_url.format(
         query=urlencode({'search': query, 'localization': locale}),
         pageno=params['pageno'])
+
     return params


+# get response from search-request
 def response(resp):
     results = []
+
     search_res = loads(resp.text)
+
+    # return empty array if there are no results
     if not 'list' in search_res:
-        return results
+        return []
+
+    # parse results
     for res in search_res['list']:
         title = res['title']
         url = res['url']
-        if res['thumbnail_360_url']:
-            content = content_tpl.format(url, res['thumbnail_360_url'])
-        else:
-            content = ''
-        if res['description']:
-            description = text_content_from_html(res['description'])
-            content += description[:500]
-        results.append({'url': url, 'title': title, 'content': content})
-    return results
+        #content = res['description']
+        content = ''
+        thumbnail = res['thumbnail_360_url']

+        results.append({'template': 'videos.html',
+                        'url': url,
+                        'title': title,
+                        'content': content,
+                        'thumbnail': thumbnail})

-def text_content_from_html(html_string):
-    desc_html = html.fragment_fromstring(html_string, create_parent=True)
-    return desc_html.text_content()
+    # return results
+    return results
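
Since this engine talks to the documented Dailymotion REST API instead of scraping, the change is easy to verify in isolation. A rough sketch of the request it builds and the JSON fields it consumes, with requests used purely for the demonstration:

from urllib import urlencode
from json import loads
import requests

search_url = 'https://api.dailymotion.com/videos?fields=title,description,duration,url,thumbnail_360_url&sort=relevance&limit=5&page={pageno}&{query}'  # noqa

url = search_url.format(pageno=1,
                        query=urlencode({'search': 'creative commons',
                                         'localization': 'en_US'}))
search_res = loads(requests.get(url).text)

for res in search_res.get('list', []):
    # the engine keeps url, title and thumbnail_360_url; description is unused for now
    print(res['url'])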

searx/engines/google.py  (+24, -4)

-#!/usr/bin/env python
+## Google (Web)
+#
+# @website     https://www.google.com
+# @provide-api yes (https://developers.google.com/web-search/docs/), deprecated!
+#
+# @using-api   yes
+# @results     JSON
+# @stable      yes (but deprecated)
+# @parse       url, title, content

 from urllib import urlencode
 from json import loads

+# engine dependent config
 categories = ['general']
+paging = True
+language_support = True

+# search-url
 url = 'https://ajax.googleapis.com/'
 search_url = url + 'ajax/services/search/web?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={language}'  # noqa

-paging = True
-language_support = True
-

+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 8
+
     language = 'en-US'
     if params['language'] != 'all':
         language = params['language'].replace('_', '-')
+
     params['url'] = search_url.format(offset=offset,
                                       query=urlencode({'q': query}),
                                       language=language)
+
     return params


+# get response from search-request
 def response(resp):
     results = []
+
     search_res = loads(resp.text)

+    # return empty array if there are no results
     if not search_res.get('responseData', {}).get('results'):
         return []

+    # parse results
     for result in search_res['responseData']['results']:
+        # append result
         results.append({'url': result['unescapedUrl'],
                         'title': result['titleNoFormatting'],
                         'content': result['content']})
+
+    # return results
     return results
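
google.py, google_images.py and google_news.py all sit on the same (deprecated) AJAX Search API and now share the same layout; they differ only in the endpoint and in the fields they copy into the result dict. A quick sketch of the call this engine issues, with the requests harness used only for illustration:

from urllib import urlencode
from json import loads
import requests

url = 'https://ajax.googleapis.com/'
search_url = url + 'ajax/services/search/web?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={language}'  # noqa

resp = requests.get(search_url.format(offset=0,
                                      query=urlencode({'q': 'searx'}),
                                      language='en-US'))
search_res = loads(resp.text)

for result in search_res.get('responseData', {}).get('results') or []:
    print(result['unescapedUrl'])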

searx/engines/google_images.py  (+26, -5)

-#!/usr/bin/env python
+## Google (Images)
+#
+# @website     https://www.google.com
+# @provide-api yes (https://developers.google.com/web-search/docs/), deprecated!
+#
+# @using-api   yes
+# @results     JSON
+# @stable      yes (but deprecated)
+# @parse       url, title, img_src

 from urllib import urlencode
 from json import loads

+# engine dependent config
 categories = ['images']
+paging = True

+# search-url
 url = 'https://ajax.googleapis.com/'
 search_url = url + 'ajax/services/search/images?v=1.0&start={offset}&rsz=large&safe=off&filter=off&{query}'  # noqa

-paging = True

+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 8
+
     params['url'] = search_url.format(query=urlencode({'q': query}),
                                       offset=offset)
+
     return params


+# get response from search-request
 def response(resp):
     results = []
+
     search_res = loads(resp.text)
-    if not search_res.get('responseData'):
-        return []
-    if not search_res['responseData'].get('results'):
+
+    # return empty array if there are no results
+    if not search_res.get('responseData', {}).get('results'):
         return []
+
+    # parse results
     for result in search_res['responseData']['results']:
         href = result['originalContextUrl']
         title = result['title']
         if not result['url']:
             continue
+
+        # append result
         results.append({'url': href,
                         'title': title,
                         'content': '',
                         'img_src': result['url'],
                         'template': 'images.html'})
+
+    # return results
     return results

searx/engines/google_news.py  (+25, -6)

-#!/usr/bin/env python
+## Google (News)
+#
+# @website     https://www.google.com
+# @provide-api yes (https://developers.google.com/web-search/docs/), deprecated!
+#
+# @using-api   yes
+# @results     JSON
+# @stable      yes (but deprecated)
+# @parse       url, title, content, publishedDate

 from urllib import urlencode
 from json import loads
 from dateutil import parser

+# search-url
 categories = ['news']
+paging = True
+language_support = True

+# engine dependent config
 url = 'https://ajax.googleapis.com/'
 search_url = url + 'ajax/services/search/news?v=2.0&start={offset}&rsz=large&safe=off&filter=off&{query}&hl={language}'  # noqa

-paging = True
-language_support = True
-

+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 8
+
     language = 'en-US'
     if params['language'] != 'all':
         language = params['language'].replace('_', '-')
+
     params['url'] = search_url.format(offset=offset,
                                       query=urlencode({'q': query}),
                                       language=language)
+
     return params


+# get response from search-request
 def response(resp):
     results = []
+
     search_res = loads(resp.text)

+    # return empty array if there are no results
     if not search_res.get('responseData', {}).get('results'):
         return []

+    # parse results
     for result in search_res['responseData']['results']:
-
-# Mon, 10 Mar 2014 16:26:15 -0700
+        # parse publishedDate
         publishedDate = parser.parse(result['publishedDate'])

+        # append result
         results.append({'url': result['unescapedUrl'],
                         'title': result['titleNoFormatting'],
                         'publishedDate': publishedDate,
                         'content': result['content']})
+
+    # return results
     return results

searx/engines/vimeo.py  (+40, -22)

+## Vimeo (Videos)
+#
+# @website     https://vimeo.com/
+# @provide-api yes (http://developer.vimeo.com/api), they have a maximum count of queries/hour
+#
+# @using-api   no (TODO, rewrite to api)
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, publishedDate,  thumbnail
+#
+# @todo        rewrite to api
+# @todo        set content-parameter with correct data
+
 from urllib import urlencode
 from HTMLParser import HTMLParser
 from lxml import html
 from searx.engines.xpath import extract_text
 from dateutil import parser

-base_url = 'http://vimeo.com'
-search_url = base_url + '/search?{query}'
-url_xpath = None
-content_xpath = None
-title_xpath = None
-results_xpath = ''
-content_tpl = '<a href="{0}">  <img src="{2}"/> </a>'
-publishedDate_xpath = './/p[@class="meta"]//attribute::datetime'
+# engine dependent config
+categories = ['videos']
+paging = True

-# the cookie set by vimeo contains all the following values,
-# but only __utma seems to be requiered
-cookie = {
-    #'vuid':'918282893.1027205400'
-    # 'ab_bs':'%7B%223%22%3A279%7D'
-     '__utma': '00000000.000#0000000.0000000000.0000000000.0000000000.0'
-    # '__utmb':'18302654.1.10.1388942090'
-    #, '__utmc':'18302654'
-    #, '__utmz':'18#302654.1388942090.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)'  # noqa
-    #, '__utml':'search'
-}
+# search-url
+base_url = 'https://vimeo.com'
+search_url = base_url + '/search/page:{pageno}?{query}'
+
+# specific xpath variables
+url_xpath = './a/@href'
+content_xpath = './a/img/@src'
+title_xpath = './a/div[@class="data"]/p[@class="title"]/text()'
+results_xpath = '//div[@id="browse_content"]/ol/li'
+publishedDate_xpath = './/p[@class="meta"]//attribute::datetime'


+# do search-request
 def request(query, params):
-    params['url'] = search_url.format(query=urlencode({'q': query}))
-    params['cookies'] = cookie
+    params['url'] = search_url.format(pageno=params['pageno'] ,
+                                      query=urlencode({'q': query}))
+
+    # TODO required?
+    params['cookies']['__utma'] = '00000000.000#0000000.0000000000.0000000000.0000000000.0'
+
     return params


+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.text)

     p = HTMLParser()

+    # parse results
     for result in dom.xpath(results_xpath):
         url = base_url + result.xpath(url_xpath)[0]
         title = p.unescape(extract_text(result.xpath(title_xpath)))
         publishedDate = parser.parse(extract_text(
             result.xpath(publishedDate_xpath)[0]))

+        # append result
         results.append({'url': url,
                         'title': title,
-                        'content': content_tpl.format(url, title, thumbnail),
+                        'content': '',
                         'template': 'videos.html',
                         'publishedDate': publishedDate,
                         'thumbnail': thumbnail})
+
+    # return results
     return results
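
To make the new xpath variables easier to review, here is how they pick a result apart when run against a tiny, hand-written HTML fragment. The markup below is invented and much simpler than the real search portal:

from lxml import html

results_xpath = '//div[@id="browse_content"]/ol/li'
url_xpath = './a/@href'
title_xpath = './a/div[@class="data"]/p[@class="title"]/text()'
content_xpath = './a/img/@src'
publishedDate_xpath = './/p[@class="meta"]//attribute::datetime'

doc = html.fromstring('''
<div id="browse_content"><ol>
  <li><a href="/12345678">
    <img src="https://i.vimeocdn.com/video/12345_200x150.jpg"/>
    <div class="data">
      <p class="title">Example clip</p>
      <p class="meta"><time datetime="2014-03-10T16:26:15+01:00">3 weeks ago</time></p>
    </div>
  </a></li>
</ol></div>''')

for result in doc.xpath(results_xpath):
    print('https://vimeo.com' + result.xpath(url_xpath)[0])  # url
    print(result.xpath(title_xpath)[0])                      # title
    print(result.xpath(content_xpath)[0])                    # thumbnail
    print(result.xpath(publishedDate_xpath)[0])              # publishedDate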

searx/engines/yahoo.py  (+41, -6)

-#!/usr/bin/env python
+## Yahoo (Web)
+#
+# @website     https://search.yahoo.com/web
+# @provide-api yes (https://developer.yahoo.com/boss/search/), $0.80/1000 queries
+#
+# @using-api   no (because pricing)
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, content, suggestion

 from urllib import urlencode
 from urlparse import unquote
 from lxml import html
 from searx.engines.xpath import extract_text, extract_url

+# engine dependent config
 categories = ['general']
-search_url = 'http://search.yahoo.com/search?{query}&b={offset}'
+paging = True
+language_support = True
+
+# search-url
+search_url = 'https://search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}'
+
+# specific xpath variables
 results_xpath = '//div[@class="res"]'
 url_xpath = './/h3/a/@href'
 title_xpath = './/h3/a'
 content_xpath = './/div[@class="abstr"]'
 suggestion_xpath = '//div[@id="satat"]//a'

-paging = True
-

+# remove yahoo-specific tracking-url
 def parse_url(url_string):
     endings = ['/RS', '/RK']
     endpositions = []
     start = url_string.find('http', url_string.find('/RU=')+1)
+
     for ending in endings:
         endpos = url_string.rfind(ending)
         if endpos > -1:
             endpositions.append(endpos)

     end = min(endpositions)
+
     return unquote(url_string[start:end])


+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
+
     if params['language'] == 'all':
         language = 'en'
     else:
         language = params['language'].split('_')[0]
+
     params['url'] = search_url.format(offset=offset,
-                                      query=urlencode({'p': query}))
+                                      query=urlencode({'p': query}),
+                                      lang=language)
+
+    # TODO required?
     params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
         .format(lang=language)
+
     return params


+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.text)

+    # parse results
     for result in dom.xpath(results_xpath):
         try:
             url = parse_url(extract_url(result.xpath(url_xpath), search_url))
             title = extract_text(result.xpath(title_xpath)[0])
         except:
             continue
+
         content = extract_text(result.xpath(content_xpath)[0])
-        results.append({'url': url, 'title': title, 'content': content})

+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'content': content})
+
+    # if no suggestion found, return results
     if not suggestion_xpath:
         return results

+    # parse suggestion
     for suggestion in dom.xpath(suggestion_xpath):
+        # append suggestion
         results.append({'suggestion': extract_text(suggestion)})

+    # return results
     return results
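
parse_url() is what strips Yahoo's redirect wrapper from result links, and the same unwrapping shows up in yahoo_news.py below. A worked example with an invented wrapped URL:

from urlparse import unquote


# copy of the helper above, reproduced here only for the worked example
def parse_url(url_string):
    endings = ['/RS', '/RK']
    endpositions = []
    start = url_string.find('http', url_string.find('/RU=')+1)

    for ending in endings:
        endpos = url_string.rfind(ending)
        if endpos > -1:
            endpositions.append(endpos)

    end = min(endpositions)

    return unquote(url_string[start:end])


wrapped = ('https://r.search.yahoo.com/_ylt=A0abcdef'
           '/RU=http%3a%2f%2fexample.com%2fsome%2fpage'
           '/RK=0/RS=abcdefghijklmn')

print(parse_url(wrapped))  # http://example.com/some/page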

searx/engines/yahoo_news.py  (+31, -11)

-#!/usr/bin/env python
+## Yahoo (News)
+#
+# @website     https://news.yahoo.com
+# @provide-api yes (https://developer.yahoo.com/boss/search/), $0.80/1000 queries
+#
+# @using-api   no (because pricing)
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, content, publishedDate

 from urllib import urlencode
 from lxml import html
 import re
 from dateutil import parser

+# engine dependent config
 categories = ['news']
-search_url = 'http://news.search.yahoo.com/search?{query}&b={offset}'
+paging = True
+language_support = True
+
+# search-url
+search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}'
+
+# specific xpath variables
 results_xpath = '//div[@class="res"]'
 url_xpath = './/h3/a/@href'
 title_xpath = './/h3/a'
 publishedDate_xpath = './/span[@class="timestamp"]'
 suggestion_xpath = '//div[@id="satat"]//a'

-paging = True
-

+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
+
     if params['language'] == 'all':
         language = 'en'
     else:
         language = params['language'].split('_')[0]
+
     params['url'] = search_url.format(offset=offset,
-                                      query=urlencode({'p': query}))
+                                      query=urlencode({'p': query}),
+                                      lang=language)
+
+    # TODO required?
     params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
         .format(lang=language)
     return params


+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.text)

+    # parse results
     for result in dom.xpath(results_xpath):
         url = parse_url(extract_url(result.xpath(url_xpath), search_url))
         title = extract_text(result.xpath(title_xpath)[0])
         content = extract_text(result.xpath(content_xpath)[0])
+
+        # parse publishedDate
         publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])

         if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
         if publishedDate.year == 1900:
             publishedDate = publishedDate.replace(year=datetime.now().year)

+        # append result
         results.append({'url': url,
                         'title': title,
                         'content': content,
                         'publishedDate': publishedDate})

-    if not suggestion_xpath:
-        return results
-
-    for suggestion in dom.xpath(suggestion_xpath):
-        results.append({'suggestion': extract_text(suggestion)})
-
+    # return results
     return results

searx/settings.yml  (+6, -5)

     locale : en-US
     shortcut : bi

+  - name : bing images
+    engine : bing_images
+    locale : en-US
+    shortcut : bii
+
   - name : bing news
     engine : bing_news
     locale : en-US

   - name : vimeo
     engine : vimeo
-    categories : videos
-    results_xpath : //div[@id="browse_content"]/ol/li
-    url_xpath : ./a/@href
-    title_xpath : ./a/div[@class="data"]/p[@class="title"]/text()
-    content_xpath : ./a/img/@src
+    locale : en-US
     shortcut : vm

 locales: