
fix yahoo engines and add comments

Thomas Pointhuber, 10 years ago
parent commit 03db970e6a
2 changed files with 72 additions and 17 deletions
  1. searx/engines/yahoo.py (+41, -6)
  2. searx/engines/yahoo_news.py (+31, -11)

searx/engines/yahoo.py (+41, -6)

-#!/usr/bin/env python
+## Yahoo (Web)
+#
+# @website     https://search.yahoo.com/web
+# @provide-api yes (https://developer.yahoo.com/boss/search/), $0.80/1000 queries
+#
+# @using-api   no (because pricing)
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, content, suggestion
 
 from urllib import urlencode
 from urlparse import unquote
 from lxml import html
 from searx.engines.xpath import extract_text, extract_url
 
+# engine dependent config
 categories = ['general']
-search_url = 'http://search.yahoo.com/search?{query}&b={offset}'
+paging = True
+language_support = True
+
+# search-url
+search_url = 'https://search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}'
+
+# specific xpath variables
 results_xpath = '//div[@class="res"]'
 url_xpath = './/h3/a/@href'
 title_xpath = './/h3/a'
 content_xpath = './/div[@class="abstr"]'
 suggestion_xpath = '//div[@id="satat"]//a'
 
-paging = True
-
 
+# remove yahoo-specific tracking-url
 def parse_url(url_string):
     endings = ['/RS', '/RK']
     endpositions = []
     start = url_string.find('http', url_string.find('/RU=')+1)
+
     for ending in endings:
         endpos = url_string.rfind(ending)
         if endpos > -1:
             endpositions.append(endpos)
 
     end = min(endpositions)
+
     return unquote(url_string[start:end])
 
 
+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
+
     if params['language'] == 'all':
         language = 'en'
     else:
         language = params['language'].split('_')[0]
+
     params['url'] = search_url.format(offset=offset,
-                                      query=urlencode({'p': query}))
+                                      query=urlencode({'p': query}),
+                                      lang=language)
+
+    # TODO required?
     params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
         .format(lang=language)
+
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.text)
 
+    # parse results
     for result in dom.xpath(results_xpath):
         try:
             url = parse_url(extract_url(result.xpath(url_xpath), search_url))
             title = extract_text(result.xpath(title_xpath)[0])
         except:
             continue
+
         content = extract_text(result.xpath(content_xpath)[0])
-        results.append({'url': url, 'title': title, 'content': content})
 
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'content': content})
+
+    # if no suggestion found, return results
     if not suggestion_xpath:
         return results
 
+    # parse suggestion
     for suggestion in dom.xpath(suggestion_xpath):
+        # append suggestion
         results.append({'suggestion': extract_text(suggestion)})
 
+    # return results
     return results
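
For reference, a short sketch of what the updated yahoo.py logic produces; the query, page number, and the wrapped redirect URL below are made-up examples, not values taken from the commit:

# Illustrative values only; mirrors the formulas in request() and parse_url().
from urllib import urlencode
from urlparse import unquote

# request(): page 2 gives offset (2 - 1) * 10 + 1 = 11, so search_url becomes
# https://search.yahoo.com/search?p=searx&b=11&fl=1&vl=lang_en
offset = (2 - 1) * 10 + 1
url = 'https://search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}'\
    .format(query=urlencode({'p': 'searx'}), offset=offset, lang='en')

# parse_url(): cut between '/RU=' and the first of '/RS' or '/RK', then unquote.
tracked = '/RU=http%3a%2f%2fexample.org%2f/RK=0/RS=abcdef'
start = tracked.find('http', tracked.find('/RU=') + 1)
end = min(p for p in (tracked.rfind('/RS'), tracked.rfind('/RK')) if p > -1)
assert unquote(tracked[start:end]) == 'http://example.org/'

Note that the new fl=1&vl=lang_{lang} URL parameters repeat what the sB cookie already carries, which is presumably why the cookie is now marked with "# TODO required?".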

searx/engines/yahoo_news.py (+31, -11)

-#!/usr/bin/env python
+## Yahoo (News)
+#
+# @website     https://news.yahoo.com
+# @provide-api yes (https://developer.yahoo.com/boss/search/), $0.80/1000 queries
+#
+# @using-api   no (because pricing)
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, content, publishedDate
 
 from urllib import urlencode
 from lxml import html
...
 import re
 from dateutil import parser
 
+# engine dependent config
 categories = ['news']
-search_url = 'http://news.search.yahoo.com/search?{query}&b={offset}'
+paging = True
+language_support = True
+
+# search-url
+search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}'
+
+# specific xpath variables
 results_xpath = '//div[@class="res"]'
 url_xpath = './/h3/a/@href'
 title_xpath = './/h3/a'
...
 publishedDate_xpath = './/span[@class="timestamp"]'
 suggestion_xpath = '//div[@id="satat"]//a'
 
-paging = True
-
 
+# do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * 10 + 1
+
     if params['language'] == 'all':
         language = 'en'
     else:
         language = params['language'].split('_')[0]
+
     params['url'] = search_url.format(offset=offset,
-                                      query=urlencode({'p': query}))
+                                      query=urlencode({'p': query}),
+                                      lang=language)
+
+    # TODO required?
     params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
         .format(lang=language)
     return params
 
 
+# get response from search-request
 def response(resp):
     results = []
+
     dom = html.fromstring(resp.text)
 
+    # parse results
     for result in dom.xpath(results_xpath):
         url = parse_url(extract_url(result.xpath(url_xpath), search_url))
         title = extract_text(result.xpath(title_xpath)[0])
         content = extract_text(result.xpath(content_xpath)[0])
+
+        # parse publishedDate
         publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])
 
         if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
...
         if publishedDate.year == 1900:
             publishedDate = publishedDate.replace(year=datetime.now().year)
 
+        # append result
         results.append({'url': url,
                         'title': title,
                         'content': content,
                         'publishedDate': publishedDate})
 
-    if not suggestion_xpath:
-        return results
-
-    for suggestion in dom.xpath(suggestion_xpath):
-        results.append({'suggestion': extract_text(suggestion)})
-
+    # return results
     return results
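
The hunk that converts the extracted timestamp into a datetime is collapsed in the diff above. As a rough, assumed reconstruction (not the commit's exact code), the visible regex and the year-1900 fix suggest handling along these lines:

# Assumed sketch only: the real conversion hunk is collapsed in the diff above.
import re
from datetime import datetime, timedelta
from dateutil import parser

def to_datetime(published):
    # relative form, e.g. "5 minutes ago" (the regex visible in the diff)
    m = re.match(r'^([0-9]+) minute(s|) ago$', published)
    if m:
        return datetime.now() - timedelta(minutes=int(m.group(1)))
    # absolute form: let dateutil parse it; the visible hunk treats a
    # parsed year of 1900 as a missing year and rewrites it
    date = parser.parse(published)
    if date.year == 1900:
        date = date.replace(year=datetime.now().year)
    return date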