Browse Source

fix yahoo engines and add comments

Thomas Pointhuber 10 years ago
parent
commit
03db970e6a
2 changed files with 72 additions and 17 deletions
  1. 41
    6
      searx/engines/yahoo.py
  2. 31
    11
      searx/engines/yahoo_news.py

+ 41
- 6
searx/engines/yahoo.py View File

@@ -1,64 +1,99 @@
1
-#!/usr/bin/env python
1
+## Yahoo (Web)
2
+# 
3
+# @website     https://search.yahoo.com/web
4
+# @provide-api yes (https://developer.yahoo.com/boss/search/), $0.80/1000 queries
5
+# 
6
+# @using-api   no (because pricing)
7
+# @results     HTML (using search portal)
8
+# @stable      no (HTML can change)
9
+# @parse       url, title, content, suggestion
2 10
 
3 11
 from urllib import urlencode
4 12
 from urlparse import unquote
5 13
 from lxml import html
6 14
 from searx.engines.xpath import extract_text, extract_url
7 15
 
16
# engine dependent config
categories = ['general']
paging = True
language_support = True

# search-url
search_url = 'https://search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}'

# specific xpath variables
results_xpath = '//div[@class="res"]'
url_xpath = './/h3/a/@href'
title_xpath = './/h3/a'
content_xpath = './/div[@class="abstr"]'
suggestion_xpath = '//div[@id="satat"]//a'
32
# remove yahoo-specific tracking-url
def parse_url(url_string):
    """Strip Yahoo's redirect wrapper from a result URL.

    Yahoo result links embed the real target percent-encoded between a
    ``/RU=`` marker and the first of the ``/RK`` / ``/RS`` tracking
    suffixes; return that target, unquoted.
    """
    endings = ['/RS', '/RK']
    endpositions = []
    # the real url starts right after the '/RU=' marker
    start = url_string.find('http', url_string.find('/RU=') + 1)

    for ending in endings:
        endpos = url_string.rfind(ending)
        if endpos > -1:
            endpositions.append(endpos)

    # bug fix: the original called min([]) and raised ValueError when no
    # tracking suffix was present; fall back to the end of the string
    end = min(endpositions) if endpositions else len(url_string)

    return unquote(url_string[start:end])
30 46
 
31 47
 
48
# do search-request
def request(query, params):
    """Fill in the search url and cookies for a Yahoo web search."""
    # Yahoo counts results from 1, ten per page
    page = params['pageno']
    offset = (page - 1) * 10 + 1

    lang = params['language']
    language = 'en' if lang == 'all' else lang.split('_')[0]

    params['url'] = search_url.format(
        offset=offset,
        query=urlencode({'p': query}),
        lang=language)

    # TODO required?
    params['cookies']['sB'] = (
        'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'.format(lang=language))

    return params
43 66
 
44 67
 
68
# get response from search-request
def response(resp):
    """Parse result urls, titles, contents and suggestions out of the
    Yahoo search-portal HTML and return them as a result list."""
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        # bug fix: 'content' was extracted outside the try-block, so a
        # result without an abstract raised an uncaught IndexError and
        # aborted the whole response; the bare 'except:' is also narrowed
        # so KeyboardInterrupt/SystemExit are not swallowed
        try:
            url = parse_url(extract_url(result.xpath(url_xpath), search_url))
            title = extract_text(result.xpath(title_xpath)[0])
            content = extract_text(result.xpath(content_xpath)[0])
        except Exception:
            # skip a malformed result instead of failing the whole page
            continue

        # append result
        results.append({'url': url,
                        'title': title,
                        'content': content})

    # parse suggestions (the original guarded on the xpath *constant*,
    # which is always truthy, so that guard was dead code and is dropped)
    for suggestion in dom.xpath(suggestion_xpath):
        # append suggestion
        results.append({'suggestion': extract_text(suggestion)})

    # return results
    return results

+ 31
- 11
searx/engines/yahoo_news.py View File

@@ -1,4 +1,12 @@
1
-#!/usr/bin/env python
1
+## Yahoo (News)
2
+# 
3
+# @website     https://news.yahoo.com
4
+# @provide-api yes (https://developer.yahoo.com/boss/search/), $0.80/1000 queries
5
+# 
6
+# @using-api   no (because pricing)
7
+# @results     HTML (using search portal)
8
+# @stable      no (HTML can change)
9
+# @parse       url, title, content, publishedDate
2 10
 
3 11
 from urllib import urlencode
4 12
 from lxml import html
@@ -8,8 +16,15 @@ from datetime import datetime, timedelta
8 16
 import re
9 17
 from dateutil import parser
10 18
 
19
# engine dependent config
categories = ['news']
paging = True
language_support = True

# search-url
search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}'

# specific xpath variables
results_xpath = '//div[@class="res"]'
url_xpath = './/h3/a/@href'
title_xpath = './/h3/a'
content_xpath = './/div[@class="abstr"]'
publishedDate_xpath = './/span[@class="timestamp"]'
suggestion_xpath = '//div[@id="satat"]//a'
36
# do search-request
def request(query, params):
    """Fill in the search url and cookies for a Yahoo news search."""
    # result offset is 1-based, ten results per page
    offset = 10 * (params['pageno'] - 1) + 1

    if params['language'] != 'all':
        language = params['language'].split('_')[0]
    else:
        language = 'en'

    params['url'] = search_url.format(offset=offset,
                                      query=urlencode({'p': query}),
                                      lang=language)

    # TODO required?
    cookie = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'.format(lang=language)
    params['cookies']['sB'] = cookie
    return params
34 53
 
35 54
 
55
+# get response from search-request
36 56
 def response(resp):
37 57
     results = []
58
+
38 59
     dom = html.fromstring(resp.text)
39 60
 
61
+    # parse results
40 62
     for result in dom.xpath(results_xpath):
41 63
         url = parse_url(extract_url(result.xpath(url_xpath), search_url))
42 64
         title = extract_text(result.xpath(title_xpath)[0])
43 65
         content = extract_text(result.xpath(content_xpath)[0])
66
+
67
+        # parse publishedDate
44 68
         publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])
45 69
 
46 70
         if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
@@ -58,15 +82,11 @@ def response(resp):
58 82
         if publishedDate.year == 1900:
59 83
             publishedDate = publishedDate.replace(year=datetime.now().year)
60 84
 
85
+        # append result
61 86
         results.append({'url': url,
62 87
                         'title': title,
63 88
                         'content': content,
64 89
                         'publishedDate': publishedDate})
65 90
 
66
-    if not suggestion_xpath:
67
-        return results
68
-
69
-    for suggestion in dom.xpath(suggestion_xpath):
70
-        results.append({'suggestion': extract_text(suggestion)})
71
-
91
+    # return results
72 92
     return results