Browse Source

[fix] yahoo_news engine

Alexandre Flament 10 years ago
parent
commit
44ed4424f6
2 changed files with 89 additions and 89 deletions
  1. 18
    7
      searx/engines/yahoo_news.py
  2. 71
    82
      searx/tests/engines/test_yahoo_news.py

+ 18
- 7
searx/engines/yahoo_news.py View File

@@ -23,15 +23,15 @@ paging = True
23 23
 language_support = True
24 24
 
25 25
 # search-url
26
-search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}'  # noqa
26
+search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&{lang}=uh3_news_web_gs_1&pz=10&xargs=0&vl=lang_{lang}'  # noqa
27 27
 
28 28
 # specific xpath variables
29
-results_xpath = '//div[@class="res"]'
29
+results_xpath = '//ol[contains(@class,"searchCenterMiddle")]//li'
30 30
 url_xpath = './/h3/a/@href'
31 31
 title_xpath = './/h3/a'
32
-content_xpath = './/div[@class="abstr"]'
33
-publishedDate_xpath = './/span[@class="timestamp"]'
34
-suggestion_xpath = '//div[@id="satat"]//a'
32
+content_xpath = './/div[@class="compText"]'
33
+publishedDate_xpath = './/span[contains(@class,"tri")]'
34
+suggestion_xpath = '//div[contains(@class,"VerALSOTRY")]//a'
35 35
 
36 36
 
37 37
 # do search-request
@@ -48,11 +48,18 @@ def request(query, params):
48 48
                                       lang=language)
49 49
 
50 50
     # TODO required?
51
-    params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
51
+    params['cookies']['sB'] = '"v=1&vm=p&fl=1&vl=lang_{lang}&sh=1&pn=10&rw=new'\
52 52
         .format(lang=language)
53 53
     return params
54 54
 
55 55
 
56
+def sanitize_url(url):
57
+    if ".yahoo.com/" in url:
58
+        return re.sub(u"\;\_ylt\=.+$", "", url)
59
+    else:
60
+        return url
61
+
62
+
56 63
 # get response from search-request
57 64
 def response(resp):
58 65
     results = []
@@ -61,13 +68,17 @@ def response(resp):
61 68
 
62 69
     # parse results
63 70
     for result in dom.xpath(results_xpath):
64
-        url = parse_url(extract_url(result.xpath(url_xpath), search_url))
71
+        urls = result.xpath(url_xpath)
72
+        if len(urls) != 1:
73
+            continue
74
+        url = sanitize_url(parse_url(extract_url(urls, search_url)))
65 75
         title = extract_text(result.xpath(title_xpath)[0])
66 76
         content = extract_text(result.xpath(content_xpath)[0])
67 77
 
68 78
         # parse publishedDate
69 79
         publishedDate = extract_text(result.xpath(publishedDate_xpath)[0])
70 80
 
81
+        # still useful ?
71 82
         if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
72 83
             publishedDate = datetime.now() - timedelta(minutes=int(re.match(r'\d+', publishedDate).group()))  # noqa
73 84
         else:

+ 71
- 82
searx/tests/engines/test_yahoo_news.py View File

@@ -39,19 +39,24 @@ class TestYahooNewsEngine(SearxTestCase):
39 39
         self.assertEqual(yahoo_news.response(response), [])
40 40
 
41 41
         html = """
42
-        <div class="res">
43
-            <div>
44
-                <h3>
45
-                    <a class="yschttl spt" href="http://this.is.the.url" target="_blank">
46
-                        This is
47
-                        the <b>title</b>...
48
-                    </a>
49
-                </h3>
50
-            </div>
51
-            <span class="url">Business via Yahoo! Finance</span> &nbsp; <span class="timestamp">Feb 03 09:45am</span>
52
-            <div class="abstr">
53
-                This is the content
54
-            </div>
42
+        <ol class=" reg searchCenterMiddle">
43
+            <li class="first">
44
+                <div class="compTitle">
45
+                    <h3>
46
+                        <a class="yschttl spt" href="http://this.is.the.url" target="_blank">
47
+                           This is
48
+                           the <b>title</b>...
49
+                        </a>
50
+                    </h3>
51
+                </div>
52
+                <div>
53
+                    <span class="cite">Business via Yahoo!</span>
54
+                    <span class="tri fc-2nd ml-10">May 01 10:00 AM</span>
55
+                </div>
56
+                <div class="compText">
57
+                   This is the content
58
+               </div>
59
+            </li>
55 60
         </div>
56 61
         """
57 62
         response = mock.Mock(text=html)
@@ -63,48 +68,59 @@ class TestYahooNewsEngine(SearxTestCase):
63 68
         self.assertEqual(results[0]['content'], 'This is the content')
64 69
 
65 70
         html = """
66
-        <div class="res">
67
-            <div>
68
-                <h3>
69
-                    <a class="yschttl spt" href="http://this.is.the.url" target="_blank">
70
-                        This is
71
-                        the <b>title</b>...
72
-                    </a>
73
-                </h3>
74
-            </div>
75
-            <span class="url">Business via Yahoo!</span> &nbsp; <span class="timestamp">2 hours, 22 minutes ago</span>
76
-            <div class="abstr">
77
-                This is the content
78
-            </div>
79
-        </div>
80
-        <div class="res">
81
-            <div>
82
-                <h3>
83
-                    <a class="yschttl spt" href="http://this.is.the.url" target="_blank">
84
-                        This is
85
-                        the <b>title</b>...
86
-                    </a>
87
-                </h3>
88
-            </div>
89
-            <span class="url">Business via Yahoo!</span> &nbsp; <span class="timestamp">22 minutes ago</span>
90
-            <div class="abstr">
91
-                This is the content
92
-            </div>
93
-        </div>
94
-        <div class="res">
95
-            <div>
96
-                <h3>
97
-                    <a class="yschttl spt" href="http://this.is.the.url" target="_blank">
98
-                        This is
99
-                        the <b>title</b>...
100
-                    </a>
101
-                </h3>
102
-            </div>
103
-            <span class="url">Business via Yahoo!</span> &nbsp; <span class="timestamp">Feb 03 09:45am 1900</span>
104
-            <div class="abstr">
105
-                This is the content
106
-            </div>
107
-        </div>
71
+        <ol class=" reg searchCenterMiddle">
72
+            <li class="first">
73
+                <div class="compTitle">
74
+                    <h3>
75
+                        <a class="yschttl spt" href="http://this.is.the.url" target="_blank">
76
+                            This is
77
+                            the <b>title</b>...
78
+                        </a>
79
+                    </h3>
80
+                </div>
81
+                <div>
82
+                    <span class="cite">Business via Yahoo!</span>
83
+                    <span class="tri fc-2nd ml-10">2 hours, 22 minutes ago</span>
84
+                </div>
85
+                <div class="compText">
86
+                    This is the content
87
+                </div>
88
+            </li>
89
+            <li>
90
+                <div class="compTitle">
91
+                    <h3>
92
+                        <a class="yschttl spt" href="http://this.is.the.url" target="_blank">
93
+                            This is
94
+                            the <b>title</b>...
95
+                        </a>
96
+                    </h3>
97
+                </div>
98
+                <div>
99
+                    <span class="cite">Business via Yahoo!</span>
100
+                    <span class="tri fc-2nd ml-10">22 minutes ago</span>
101
+                </div>
102
+                <div class="compText">
103
+                    This is the content
104
+                </div>
105
+            </li>
106
+            <li>
107
+                <div class="compTitle">
108
+                    <h3>
109
+                        <a class="yschttl spt" href="http://this.is.the.url" target="_blank">
110
+                            This is
111
+                            the <b>title</b>...
112
+                        </a>
113
+                    </h3>
114
+                </div>
115
+                <div>
116
+                    <span class="cite">Business via Yahoo!</span>
117
+                    <span class="tri fc-2nd ml-10">Feb 03 09:45AM 1900</span>
118
+                </div>
119
+                <div class="compText">
120
+                    This is the content
121
+                </div>
122
+            </li>
123
+        </ol>
108 124
         """
109 125
         response = mock.Mock(text=html)
110 126
         results = yahoo_news.response(response)
@@ -114,30 +130,3 @@ class TestYahooNewsEngine(SearxTestCase):
114 130
         self.assertEqual(results[0]['url'], 'http://this.is.the.url/')
115 131
         self.assertEqual(results[0]['content'], 'This is the content')
116 132
         self.assertEqual(results[2]['publishedDate'].year, datetime.now().year)
117
-
118
-        html = """
119
-        <li class="b_algo" u="0|5109|4755453613245655|UAGjXgIrPH5yh-o5oNHRx_3Zta87f_QO">
120
-            <div Class="sa_mc">
121
-                <div class="sb_tlst">
122
-                    <h2>
123
-                        <a href="http://this.should.be.the.link/" h="ID=SERP,5124.1">
124
-                        <strong>This</strong> should be the title</a>
125
-                    </h2>
126
-                </div>
127
-                <div class="sb_meta">
128
-                <cite>
129
-                <strong>this</strong>.meta.com</cite>
130
-                    <span class="c_tlbxTrg">
131
-                        <span class="c_tlbxH" H="BASE:CACHEDPAGEDEFAULT" K="SERP,5125.1">
132
-                        </span>
133
-                    </span>
134
-                </div>
135
-                <p>
136
-                <strong>This</strong> should be the content.</p>
137
-            </div>
138
-        </li>
139
-        """
140
-        response = mock.Mock(text=html)
141
-        results = yahoo_news.response(response)
142
-        self.assertEqual(type(results), list)
143
-        self.assertEqual(len(results), 0)