瀏覽代碼

[fix] yahoo_news engine

Alexandre Flament 10 年之前
父節點
當前提交
44ed4424f6
共有 2 個檔案被更改,包括 89 行新增 和 89 行删除
  1. 18
    7
      searx/engines/yahoo_news.py
  2. 71
    82
      searx/tests/engines/test_yahoo_news.py

+ 18
- 7
searx/engines/yahoo_news.py 查看文件

23
 language_support = True
23
 language_support = True
24
 
24
 
25
 # search-url
25
 # search-url
26
-search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}'  # noqa
26
+search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&{lang}=uh3_news_web_gs_1&pz=10&xargs=0&vl=lang_{lang}'  # noqa
27
 
27
 
28
 # specific xpath variables
28
 # specific xpath variables
29
-results_xpath = '//div[@class="res"]'
29
+results_xpath = '//ol[contains(@class,"searchCenterMiddle")]//li'
30
 url_xpath = './/h3/a/@href'
30
 url_xpath = './/h3/a/@href'
31
 title_xpath = './/h3/a'
31
 title_xpath = './/h3/a'
32
-content_xpath = './/div[@class="abstr"]'
33
-publishedDate_xpath = './/span[@class="timestamp"]'
34
-suggestion_xpath = '//div[@id="satat"]//a'
32
+content_xpath = './/div[@class="compText"]'
33
+publishedDate_xpath = './/span[contains(@class,"tri")]'
34
+suggestion_xpath = '//div[contains(@class,"VerALSOTRY")]//a'
35
 
35
 
36
 
36
 
37
 # do search-request
37
 # do search-request
48
                                       lang=language)
48
                                       lang=language)
49
 
49
 
50
     # TODO required?
50
     # TODO required?
51
-    params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
51
+    params['cookies']['sB'] = '"v=1&vm=p&fl=1&vl=lang_{lang}&sh=1&pn=10&rw=new'\
52
         .format(lang=language)
52
         .format(lang=language)
53
     return params
53
     return params
54
 
54
 
55
 
55
 
56
+def sanitize_url(url):
57
+    if ".yahoo.com/" in url:
58
+        return re.sub(u"\;\_ylt\=.+$", "", url)
59
+    else:
60
+        return url
61
+
62
+
56
 # get response from search-request
63
 # get response from search-request
57
 def response(resp):
64
 def response(resp):
58
     results = []
65
     results = []
61
 
68
 
62
     # parse results
69
     # parse results
63
     for result in dom.xpath(results_xpath):
70
     for result in dom.xpath(results_xpath):
64
-        url = parse_url(extract_url(result.xpath(url_xpath), search_url))
71
+        urls = result.xpath(url_xpath)
72
+        if len(urls) != 1:
73
+            continue
74
+        url = sanitize_url(parse_url(extract_url(urls, search_url)))
65
         title = extract_text(result.xpath(title_xpath)[0])
75
         title = extract_text(result.xpath(title_xpath)[0])
66
         content = extract_text(result.xpath(content_xpath)[0])
76
         content = extract_text(result.xpath(content_xpath)[0])
67
 
77
 
68
         # parse publishedDate
78
         # parse publishedDate
69
         publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])
79
         publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])
70
 
80
 
81
+        # still useful ?
71
         if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
82
         if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
72
             publishedDate = datetime.now() - timedelta(minutes=int(re.match(r'\d+', publishedDate).group()))  # noqa
83
             publishedDate = datetime.now() - timedelta(minutes=int(re.match(r'\d+', publishedDate).group()))  # noqa
73
         else:
84
         else:

+ 71
- 82
searx/tests/engines/test_yahoo_news.py 查看文件

39
         self.assertEqual(yahoo_news.response(response), [])
39
         self.assertEqual(yahoo_news.response(response), [])
40
 
40
 
41
         html = """
41
         html = """
42
-        <div class="res">
43
-            <div>
44
-                <h3>
45
-                    <a class="yschttl spt" href="http://this.is.the.url" target="_blank">
46
-                        This is
47
-                        the <b>title</b>...
48
-                    </a>
49
-                </h3>
50
-            </div>
51
-            <span class="url">Business via Yahoo! Finance</span> &nbsp; <span class="timestamp">Feb 03 09:45am</span>
52
-            <div class="abstr">
53
-                This is the content
54
-            </div>
42
+        <ol class=" reg searchCenterMiddle">
43
+            <li class="first">
44
+                <div class="compTitle">
45
+                    <h3>
46
+                        <a class="yschttl spt" href="http://this.is.the.url" target="_blank">
47
+                           This is
48
+                           the <b>title</b>...
49
+                        </a>
50
+                    </h3>
51
+                </div>
52
+                <div>
53
+                    <span class="cite">Business via Yahoo!</span>
54
+                    <span class="tri fc-2nd ml-10">May 01 10:00 AM</span>
55
+                </div>
56
+                <div class="compText">
57
+                   This is the content
58
+               </div>
59
+            </li>
55
         </div>
60
         </div>
56
         """
61
         """
57
         response = mock.Mock(text=html)
62
         response = mock.Mock(text=html)
63
         self.assertEqual(results[0]['content'], 'This is the content')
68
         self.assertEqual(results[0]['content'], 'This is the content')
64
 
69
 
65
         html = """
70
         html = """
66
-        <div class="res">
67
-            <div>
68
-                <h3>
69
-                    <a class="yschttl spt" href="http://this.is.the.url" target="_blank">
70
-                        This is
71
-                        the <b>title</b>...
72
-                    </a>
73
-                </h3>
74
-            </div>
75
-            <span class="url">Business via Yahoo!</span> &nbsp; <span class="timestamp">2 hours, 22 minutes ago</span>
76
-            <div class="abstr">
77
-                This is the content
78
-            </div>
79
-        </div>
80
-        <div class="res">
81
-            <div>
82
-                <h3>
83
-                    <a class="yschttl spt" href="http://this.is.the.url" target="_blank">
84
-                        This is
85
-                        the <b>title</b>...
86
-                    </a>
87
-                </h3>
88
-            </div>
89
-            <span class="url">Business via Yahoo!</span> &nbsp; <span class="timestamp">22 minutes ago</span>
90
-            <div class="abstr">
91
-                This is the content
92
-            </div>
93
-        </div>
94
-        <div class="res">
95
-            <div>
96
-                <h3>
97
-                    <a class="yschttl spt" href="http://this.is.the.url" target="_blank">
98
-                        This is
99
-                        the <b>title</b>...
100
-                    </a>
101
-                </h3>
102
-            </div>
103
-            <span class="url">Business via Yahoo!</span> &nbsp; <span class="timestamp">Feb 03 09:45am 1900</span>
104
-            <div class="abstr">
105
-                This is the content
106
-            </div>
107
-        </div>
71
+        <ol class=" reg searchCenterMiddle">
72
+            <li class="first">
73
+                <div class="compTitle">
74
+                    <h3>
75
+                        <a class="yschttl spt" href="http://this.is.the.url" target="_blank">
76
+                            This is
77
+                            the <b>title</b>...
78
+                        </a>
79
+                    </h3>
80
+                </div>
81
+                <div>
82
+                    <span class="cite">Business via Yahoo!</span>
83
+                    <span class="tri fc-2nd ml-10">2 hours, 22 minutes ago</span>
84
+                </div>
85
+                <div class="compText">
86
+                    This is the content
87
+                </div>
88
+            </li>
89
+            <li>
90
+                <div class="compTitle">
91
+                    <h3>
92
+                        <a class="yschttl spt" href="http://this.is.the.url" target="_blank">
93
+                            This is
94
+                            the <b>title</b>...
95
+                        </a>
96
+                    </h3>
97
+                </div>
98
+                <div>
99
+                    <span class="cite">Business via Yahoo!</span>
100
+                    <span class="tri fc-2nd ml-10">22 minutes ago</span>
101
+                </div>
102
+                <div class="compText">
103
+                    This is the content
104
+                </div>
105
+            </li>
106
+            <li>
107
+                <div class="compTitle">
108
+                    <h3>
109
+                        <a class="yschttl spt" href="http://this.is.the.url" target="_blank">
110
+                            This is
111
+                            the <b>title</b>...
112
+                        </a>
113
+                    </h3>
114
+                </div>
115
+                <div>
116
+                    <span class="cite">Business via Yahoo!</span>
117
+                    <span class="tri fc-2nd ml-10">Feb 03 09:45AM 1900</span>
118
+                </div>
119
+                <div class="compText">
120
+                    This is the content
121
+                </div>
122
+            </li>
123
+        </ol>
108
         """
124
         """
109
         response = mock.Mock(text=html)
125
         response = mock.Mock(text=html)
110
         results = yahoo_news.response(response)
126
         results = yahoo_news.response(response)
114
         self.assertEqual(results[0]['url'], 'http://this.is.the.url/')
130
         self.assertEqual(results[0]['url'], 'http://this.is.the.url/')
115
         self.assertEqual(results[0]['content'], 'This is the content')
131
         self.assertEqual(results[0]['content'], 'This is the content')
116
         self.assertEqual(results[2]['publishedDate'].year, datetime.now().year)
132
         self.assertEqual(results[2]['publishedDate'].year, datetime.now().year)
117
-
118
-        html = """
119
-        <li class="b_algo" u="0|5109|4755453613245655|UAGjXgIrPH5yh-o5oNHRx_3Zta87f_QO">
120
-            <div Class="sa_mc">
121
-                <div class="sb_tlst">
122
-                    <h2>
123
-                        <a href="http://this.should.be.the.link/" h="ID=SERP,5124.1">
124
-                        <strong>This</strong> should be the title</a>
125
-                    </h2>
126
-                </div>
127
-                <div class="sb_meta">
128
-                <cite>
129
-                <strong>this</strong>.meta.com</cite>
130
-                    <span class="c_tlbxTrg">
131
-                        <span class="c_tlbxH" H="BASE:CACHEDPAGEDEFAULT" K="SERP,5125.1">
132
-                        </span>
133
-                    </span>
134
-                </div>
135
-                <p>
136
-                <strong>This</strong> should be the content.</p>
137
-            </div>
138
-        </li>
139
-        """
140
-        response = mock.Mock(text=html)
141
-        results = yahoo_news.response(response)
142
-        self.assertEqual(type(results), list)
143
-        self.assertEqual(len(results), 0)