Browse Source

[fix] bing_news based on RSS output format

Dalf 10 years ago
parent
commit
62cc2a5658
3 changed files with 156 additions and 237 deletions
  1. 60
    51
      searx/engines/bing_news.py
  2. 88
    186
      searx/tests/engines/test_bing_news.py
  3. 8
    0
      searx/utils.py

+ 60
- 51
searx/engines/bing_news.py View File

@@ -6,18 +6,17 @@
6 6
               max. 5000 query/month
7 7
 
8 8
  @using-api   no (because of query limit)
9
- @results     HTML (using search portal)
10
- @stable      no (HTML can change)
11
- @parse       url, title, content, publishedDate
9
+ @results     RSS (using search portal)
10
+ @stable      yes (except perhaps for the images)
11
+ @parse       url, title, content, publishedDate, thumbnail
12 12
 """
13 13
 
14 14
 from urllib import urlencode
15
-from cgi import escape
16
-from lxml import html
17
-from datetime import datetime, timedelta
15
+from urlparse import urlparse, parse_qsl
16
+from datetime import datetime
18 17
 from dateutil import parser
19
-import re
20
-from searx.engines.xpath import extract_text
18
+from lxml import etree
19
+from searx.utils import list_get
21 20
 
22 21
 # engine dependent config
23 22
 categories = ['news']
@@ -26,7 +25,25 @@ language_support = True
26 25
 
27 26
 # search-url
28 27
 base_url = 'https://www.bing.com/'
29
-search_string = 'news/search?{query}&first={offset}'
28
+search_string = 'news/search?{query}&first={offset}&format=RSS'
29
+
30
+
31
+# strip Bing's click-tracking redirect (/news/apiclick.aspx) and return the article URL from its 'url' parameter
32
+def url_cleanup(url_string):
33
+    parsed_url = urlparse(url_string)
34
+    if parsed_url.netloc == 'www.bing.com' and parsed_url.path == '/news/apiclick.aspx':
35
+        query = dict(parse_qsl(parsed_url.query))
36
+        return query.get('url', None)
37
+    return url_string
38
+
39
+
40
+# rewrite thumbnail URLs of the form http://*bing4.com/th?id=... to https://www.bing.com/th?id=...
41
+def image_url_cleanup(url_string):
42
+    parsed_url = urlparse(url_string)
43
+    if parsed_url.netloc.endswith('bing4.com') and parsed_url.path == '/th':
44
+        query = dict(parse_qsl(parsed_url.query))
45
+        return "https://www.bing.com/th?id=" + query.get('id')
46
+    return url_string
30 47
 
31 48
 
32 49
 # do search-request
@@ -42,8 +59,6 @@ def request(query, params):
42 59
         query=urlencode({'q': query, 'setmkt': language}),
43 60
         offset=offset)
44 61
 
45
-    params['cookies']['_FP'] = "ui=en-US"
46
-
47 62
     params['url'] = base_url + search_path
48 63
 
49 64
     return params
@@ -53,50 +68,44 @@ def request(query, params):
53 68
 def response(resp):
54 69
     results = []
55 70
 
56
-    dom = html.fromstring(resp.content)
71
+    rss = etree.fromstring(resp.content)
72
+
73
+    ns = rss.nsmap
57 74
 
58 75
     # parse results
59
-    for result in dom.xpath('//div[@class="sn_r"]'):
60
-        link = result.xpath('.//div[@class="newstitle"]/a')[0]
61
-        url = link.attrib.get('href')
62
-        title = extract_text(link)
63
-        contentXPath = result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]')
64
-        content = escape(extract_text(contentXPath))
65
-
66
-        # parse publishedDate
67
-        publishedDateXPath = result.xpath('.//div[@class="sn_txt"]/div'
68
-                                          '//div[contains(@class,"sn_ST")]'
69
-                                          '//span[contains(@class,"sn_tm")]')
70
-
71
-        publishedDate = escape(extract_text(publishedDateXPath))
72
-
73
-        if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
74
-            timeNumbers = re.findall(r'\d+', publishedDate)
75
-            publishedDate = datetime.now() - timedelta(minutes=int(timeNumbers[0]))
76
-        elif re.match("^[0-9]+ hour(s|) ago$", publishedDate):
77
-            timeNumbers = re.findall(r'\d+', publishedDate)
78
-            publishedDate = datetime.now() - timedelta(hours=int(timeNumbers[0]))
79
-        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
80
-            timeNumbers = re.findall(r'\d+', publishedDate)
81
-            publishedDate = datetime.now()\
82
-                - timedelta(hours=int(timeNumbers[0]))\
83
-                - timedelta(minutes=int(timeNumbers[1]))
84
-        elif re.match("^[0-9]+ day(s|) ago$", publishedDate):
85
-            timeNumbers = re.findall(r'\d+', publishedDate)
86
-            publishedDate = datetime.now() - timedelta(days=int(timeNumbers[0]))
87
-        else:
88
-            try:
89
-                publishedDate = parser.parse(publishedDate, dayfirst=False)
90
-            except TypeError:
91
-                publishedDate = datetime.now()
92
-            except ValueError:
93
-                publishedDate = datetime.now()
76
+    for item in rss.xpath('./channel/item'):
77
+        # url / title / content
78
+        url = url_cleanup(item.xpath('./link/text()')[0])
79
+        title = list_get(item.xpath('./title/text()'), 0, url)
80
+        content = list_get(item.xpath('./description/text()'), 0, '')
81
+
82
+        # publishedDate
83
+        publishedDate = list_get(item.xpath('./pubDate/text()'), 0)
84
+        try:
85
+            publishedDate = parser.parse(publishedDate, dayfirst=False)
86
+        except TypeError:
87
+            publishedDate = datetime.now()
88
+        except ValueError:
89
+            publishedDate = datetime.now()
90
+
91
+        # thumbnail
92
+        thumbnail = list_get(item.xpath('./News:Image/text()', namespaces=ns), 0)
93
+        if thumbnail is not None:
94
+            thumbnail = image_url_cleanup(thumbnail)
94 95
 
95 96
         # append result
96
-        results.append({'url': url,
97
-                        'title': title,
98
-                        'publishedDate': publishedDate,
99
-                        'content': content})
97
+        if thumbnail is not None:
98
+            results.append({'template': 'videos.html',
99
+                            'url': url,
100
+                            'title': title,
101
+                            'publishedDate': publishedDate,
102
+                            'content': content,
103
+                            'thumbnail': thumbnail})
104
+        else:
105
+            results.append({'url': url,
106
+                            'title': title,
107
+                            'publishedDate': publishedDate,
108
+                            'content': content})
100 109
 
101 110
     # return results
102 111
     return results

+ 88
- 186
searx/tests/engines/test_bing_news.py View File

@@ -2,6 +2,7 @@ from collections import defaultdict
2 2
 import mock
3 3
 from searx.engines import bing_news
4 4
 from searx.testing import SearxTestCase
5
+import lxml
5 6
 
6 7
 
7 8
 class TestBingNewsEngine(SearxTestCase):
@@ -16,14 +17,10 @@ class TestBingNewsEngine(SearxTestCase):
16 17
         self.assertIn(query, params['url'])
17 18
         self.assertIn('bing.com', params['url'])
18 19
         self.assertIn('fr', params['url'])
19
-        self.assertIn('_FP', params['cookies'])
20
-        self.assertIn('en', params['cookies']['_FP'])
21 20
 
22 21
         dicto['language'] = 'all'
23 22
         params = bing_news.request(query, dicto)
24 23
         self.assertIn('en', params['url'])
25
-        self.assertIn('_FP', params['cookies'])
26
-        self.assertIn('en', params['cookies']['_FP'])
27 24
 
28 25
     def test_response(self):
29 26
         self.assertRaises(AttributeError, bing_news.response, None)
@@ -37,200 +34,105 @@ class TestBingNewsEngine(SearxTestCase):
37 34
         response = mock.Mock(content='<html></html>')
38 35
         self.assertEqual(bing_news.response(response), [])
39 36
 
40
-        html = """
41
-        <div class="sn_r">
42
-            <div class="newstitle">
43
-                <a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
44
-                    Title
45
-                </a>
46
-            </div>
47
-            <div class="sn_img">
48
-                <a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
49
-                    <img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
50
-                </a>
51
-            </div>
52
-            <div class="sn_txt">
53
-                <div class="sn_oi">
54
-                    <span class="sn_snip">Article Content</span>
55
-                    <div class="sn_ST">
56
-                        <cite class="sn_src">metronews.fr</cite>
57
-                        &nbsp;&#0183;&#32;
58
-                        <span class="sn_tm">44 minutes ago</span>
59
-                    </div>
60
-                </div>
61
-            </div>
62
-        </div>
63
-        """
37
+        html = """<?xml version="1.0" encoding="utf-8" ?>
38
+<rss version="2.0" xmlns:News="https://www.bing.com:443/news/search?q=python&amp;setmkt=en-US&amp;first=1&amp;format=RSS">
39
+    <channel>
40
+        <title>python - Bing News</title>
41
+        <link>https://www.bing.com:443/news/search?q=python&amp;setmkt=en-US&amp;first=1&amp;format=RSS</link>
42
+        <description>Search results</description>
43
+        <image>
44
+            <url>http://10.53.64.9/rsslogo.gif</url>
45
+            <title>test</title>
46
+            <link>https://www.bing.com:443/news/search?q=test&amp;setmkt=en-US&amp;first=1&amp;format=RSS</link>
47
+        </image>
48
+        <copyright>Copyright</copyright>
49
+        <item>
50
+            <title>Title</title>
51
+            <link>https://www.bing.com/news/apiclick.aspx?ref=FexRss&amp;aid=&amp;tid=c237eccc50bd4758b106a5e3c94fce09&amp;url=http%3a%2f%2furl.of.article%2f&amp;c=xxxxxxxxx&amp;mkt=en-us</link>
52
+            <description>Article Content</description>
53
+            <pubDate>Tue, 02 Jun 2015 13:37:00 GMT</pubDate>
54
+            <News:Source>Infoworld</News:Source>
55
+            <News:Image>http://a1.bing4.com/th?id=ON.13371337133713371337133713371337&amp;pid=News</News:Image>
56
+            <News:ImageSize>w={0}&amp;h={1}&amp;c=7</News:ImageSize>
57
+            <News:ImageKeepOriginalRatio></News:ImageKeepOriginalRatio>
58
+            <News:ImageMaxWidth>620</News:ImageMaxWidth>
59
+            <News:ImageMaxHeight>413</News:ImageMaxHeight>
60
+        </item>
61
+        <item>
62
+            <title>Another Title</title>
63
+            <link>https://www.bing.com/news/apiclick.aspx?ref=FexRss&amp;aid=&amp;tid=c237eccc50bd4758b106a5e3c94fce09&amp;url=http%3a%2f%2fanother.url.of.article%2f&amp;c=xxxxxxxxx&amp;mkt=en-us</link>
64
+            <description>Another Article Content</description>
65
+            <pubDate>Tue, 02 Jun 2015 13:37:00 GMT</pubDate>
66
+        </item>
67
+    </channel>
68
+</rss>"""  # noqa
64 69
         response = mock.Mock(content=html)
65 70
         results = bing_news.response(response)
66 71
         self.assertEqual(type(results), list)
67
-        self.assertEqual(len(results), 1)
72
+        self.assertEqual(len(results), 2)
68 73
         self.assertEqual(results[0]['title'], 'Title')
69 74
         self.assertEqual(results[0]['url'], 'http://url.of.article/')
70 75
         self.assertEqual(results[0]['content'], 'Article Content')
76
+        self.assertEqual(results[0]['thumbnail'], 'https://www.bing.com/th?id=ON.13371337133713371337133713371337')
77
+        self.assertEqual(results[1]['title'], 'Another Title')
78
+        self.assertEqual(results[1]['url'], 'http://another.url.of.article/')
79
+        self.assertEqual(results[1]['content'], 'Another Article Content')
80
+        self.assertNotIn('thumbnail', results[1])
71 81
 
72
-        html = """
73
-        <div class="sn_r">
74
-            <div class="newstitle">
75
-                <a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
76
-                    Title
77
-                </a>
78
-            </div>
79
-            <div class="sn_img">
80
-                <a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
81
-                    <img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
82
-                </a>
83
-            </div>
84
-            <div class="sn_txt">
85
-                <div class="sn_oi">
86
-                    <span class="sn_snip">Article Content</span>
87
-                    <div class="sn_ST">
88
-                        <cite class="sn_src">metronews.fr</cite>
89
-                        &nbsp;&#0183;&#32;
90
-                        <span class="sn_tm">44 minutes ago</span>
91
-                    </div>
92
-                </div>
93
-            </div>
94
-        </div>
95
-        <div class="sn_r">
96
-            <div class="newstitle">
97
-                <a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
98
-                    Title
99
-                </a>
100
-            </div>
101
-            <div class="sn_img">
102
-                <a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
103
-                    <img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
104
-                </a>
105
-            </div>
106
-            <div class="sn_txt">
107
-                <div class="sn_oi">
108
-                    <span class="sn_snip">Article Content</span>
109
-                    <div class="sn_ST">
110
-                        <cite class="sn_src">metronews.fr</cite>
111
-                        &nbsp;&#0183;&#32;
112
-                        <span class="sn_tm">3 hours, 44 minutes ago</span>
113
-                    </div>
114
-                </div>
115
-            </div>
116
-        </div>
117
-        <div class="sn_r">
118
-            <div class="newstitle">
119
-                <a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
120
-                    Title
121
-                </a>
122
-            </div>
123
-            <div class="sn_img">
124
-                <a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
125
-                    <img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
126
-                </a>
127
-            </div>
128
-            <div class="sn_txt">
129
-                <div class="sn_oi">
130
-                    <span class="sn_snip">Article Content</span>
131
-                    <div class="sn_ST">
132
-                        <cite class="sn_src">metronews.fr</cite>
133
-                        &nbsp;&#0183;&#32;
134
-                        <span class="sn_tm">44 hours ago</span>
135
-                    </div>
136
-                </div>
137
-            </div>
138
-        </div>
139
-        <div class="sn_r">
140
-            <div class="newstitle">
141
-                <a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
142
-                    Title
143
-                </a>
144
-            </div>
145
-            <div class="sn_img">
146
-                <a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
147
-                    <img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
148
-                </a>
149
-            </div>
150
-            <div class="sn_txt">
151
-                <div class="sn_oi">
152
-                    <span class="sn_snip">Article Content</span>
153
-                    <div class="sn_ST">
154
-                        <cite class="sn_src">metronews.fr</cite>
155
-                        &nbsp;&#0183;&#32;
156
-                        <span class="sn_tm">2 days ago</span>
157
-                    </div>
158
-                </div>
159
-            </div>
160
-        </div>
161
-        <div class="sn_r">
162
-            <div class="newstitle">
163
-                <a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
164
-                    Title
165
-                </a>
166
-            </div>
167
-            <div class="sn_img">
168
-                <a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
169
-                    <img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
170
-                </a>
171
-            </div>
172
-            <div class="sn_txt">
173
-                <div class="sn_oi">
174
-                    <span class="sn_snip">Article Content</span>
175
-                    <div class="sn_ST">
176
-                        <cite class="sn_src">metronews.fr</cite>
177
-                        &nbsp;&#0183;&#32;
178
-                        <span class="sn_tm">27/01/2015</span>
179
-                    </div>
180
-                </div>
181
-            </div>
182
-        </div>
183
-        <div class="sn_r">
184
-            <div class="newstitle">
185
-                <a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
186
-                    Title
187
-                </a>
188
-            </div>
189
-            <div class="sn_img">
190
-                <a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
191
-                    <img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
192
-                </a>
193
-            </div>
194
-            <div class="sn_txt">
195
-                <div class="sn_oi">
196
-                    <span class="sn_snip">Article Content</span>
197
-                    <div class="sn_ST">
198
-                        <cite class="sn_src">metronews.fr</cite>
199
-                        &nbsp;&#0183;&#32;
200
-                        <span class="sn_tm">Il y a 3 heures</span>
201
-                    </div>
202
-                </div>
203
-            </div>
204
-        </div>
205
-        """
82
+        html = """<?xml version="1.0" encoding="utf-8" ?>
83
+<rss version="2.0" xmlns:News="https://www.bing.com:443/news/search?q=python&amp;setmkt=en-US&amp;first=1&amp;format=RSS">
84
+    <channel>
85
+        <title>python - Bing News</title>
86
+        <link>https://www.bing.com:443/news/search?q=python&amp;setmkt=en-US&amp;first=1&amp;format=RSS</link>
87
+        <description>Search results</description>
88
+        <image>
89
+            <url>http://10.53.64.9/rsslogo.gif</url>
90
+            <title>test</title>
91
+            <link>https://www.bing.com:443/news/search?q=test&amp;setmkt=en-US&amp;first=1&amp;format=RSS</link>
92
+        </image>
93
+        <copyright>Copyright</copyright>
94
+        <item>
95
+            <title>Title</title>
96
+            <link>http://another.url.of.article/</link>
97
+            <description>Article Content</description>
98
+            <pubDate>garbage</pubDate>
99
+            <News:Source>Infoworld</News:Source>
100
+            <News:Image>http://another.bing.com/image</News:Image>
101
+            <News:ImageSize>w={0}&amp;h={1}&amp;c=7</News:ImageSize>
102
+            <News:ImageKeepOriginalRatio></News:ImageKeepOriginalRatio>
103
+            <News:ImageMaxWidth>620</News:ImageMaxWidth>
104
+            <News:ImageMaxHeight>413</News:ImageMaxHeight>
105
+        </item>
106
+    </channel>
107
+</rss>"""  # noqa
206 108
         response = mock.Mock(content=html)
207 109
         results = bing_news.response(response)
208 110
         self.assertEqual(type(results), list)
209
-        self.assertEqual(len(results), 6)
111
+        self.assertEqual(len(results), 1)
112
+        self.assertEqual(results[0]['title'], 'Title')
113
+        self.assertEqual(results[0]['url'], 'http://another.url.of.article/')
114
+        self.assertEqual(results[0]['content'], 'Article Content')
115
+        self.assertEqual(results[0]['thumbnail'], 'http://another.bing.com/image')
116
+
117
+        html = """<?xml version="1.0" encoding="utf-8" ?>
118
+<rss version="2.0" xmlns:News="https://www.bing.com:443/news/search?q=python&amp;setmkt=en-US&amp;first=1&amp;format=RSS">
119
+    <channel>
120
+        <title>python - Bing News</title>
121
+        <link>https://www.bing.com:443/news/search?q=python&amp;setmkt=en-US&amp;first=1&amp;format=RSS</link>
122
+        <description>Search results</description>
123
+        <image>
124
+            <url>http://10.53.64.9/rsslogo.gif</url>
125
+            <title>test</title>
126
+            <link>https://www.bing.com:443/news/search?q=test&amp;setmkt=en-US&amp;first=1&amp;format=RSS</link>
127
+        </image>
128
+    </channel>
129
+</rss>"""  # noqa
210 130
 
211
-        html = """
212
-        <div class="newstitle">
213
-            <a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
214
-                Title
215
-            </a>
216
-        </div>
217
-        <div class="sn_img">
218
-            <a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
219
-                <img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
220
-            </a>
221
-        </div>
222
-        <div class="sn_txt">
223
-            <div class="sn_oi">
224
-                <span class="sn_snip">Article Content</span>
225
-                <div class="sn_ST">
226
-                    <cite class="sn_src">metronews.fr</cite>
227
-                    &nbsp;&#0183;&#32;
228
-                    <span class="sn_tm">44 minutes ago</span>
229
-                </div>
230
-            </div>
231
-        </div>
232
-        """
233 131
         response = mock.Mock(content=html)
234 132
         results = bing_news.response(response)
235 133
         self.assertEqual(type(results), list)
236 134
         self.assertEqual(len(results), 0)
135
+
136
+        html = """<?xml version="1.0" encoding="utf-8" ?>gabarge"""
137
+        response = mock.Mock(content=html)
138
+        self.assertRaises(lxml.etree.XMLSyntaxError, bing_news.response, response)

+ 8
- 0
searx/utils.py View File

@@ -228,6 +228,14 @@ def prettify_url(url):
228 228
         return url
229 229
 
230 230
 
231
+# return a_list[index] when the list is longer than index, otherwise the default value
232
+def list_get(a_list, index, default=None):
233
+    if len(a_list) > index:
234
+        return a_list[index]
235
+    else:
236
+        return default
237
+
238
+
231 239
 def get_blocked_engines(engines, cookies):
232 240
     if 'blocked_engines' not in cookies:
233 241
         return [(engine_name, category) for engine_name in engines