Browse Source

Bing News unit tests

Cqoicebordel 10 years ago
parent
commit
efde2c21c8
3 changed files with 249 additions and 17 deletions
  1. 12
    17
      searx/engines/bing_news.py
  2. 236
    0
      searx/tests/engines/test_bing_news.py
  3. 1
    0
      searx/tests/test_engines.py

+ 12
- 17
searx/engines/bing_news.py View File

15
 from datetime import datetime, timedelta
15
 from datetime import datetime, timedelta
16
 from dateutil import parser
16
 from dateutil import parser
17
 import re
17
 import re
18
+from searx.engines.xpath import extract_text
18
 
19
 
19
 # engine dependent config
20
 # engine dependent config
20
 categories = ['news']
21
 categories = ['news']
42
     params['cookies']['_FP'] = "ui=en-US"
43
     params['cookies']['_FP'] = "ui=en-US"
43
 
44
 
44
     params['url'] = base_url + search_path
45
     params['url'] = base_url + search_path
46
+
45
     return params
47
     return params
46
 
48
 
47
 
49
 
55
     for result in dom.xpath('//div[@class="sn_r"]'):
57
     for result in dom.xpath('//div[@class="sn_r"]'):
56
         link = result.xpath('.//div[@class="newstitle"]/a')[0]
58
         link = result.xpath('.//div[@class="newstitle"]/a')[0]
57
         url = link.attrib.get('href')
59
         url = link.attrib.get('href')
58
-        title = ' '.join(link.xpath('.//text()'))
59
-        contentXPath = result.xpath('.//div[@class="sn_txt"]/div'
60
-                                    '//span[@class="sn_snip"]//text()')
60
+        title = extract_text(link)
61
+        contentXPath = result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]')
61
         if contentXPath is not None:
62
         if contentXPath is not None:
62
-            content = escape(' '.join(contentXPath))
63
+            content = escape(extract_text(contentXPath))
63
 
64
 
64
         # parse publishedDate
65
         # parse publishedDate
65
         publishedDateXPath = result.xpath('.//div[@class="sn_txt"]/div'
66
         publishedDateXPath = result.xpath('.//div[@class="sn_txt"]/div'
66
                                           '//span[contains(@class,"sn_ST")]'
67
                                           '//span[contains(@class,"sn_ST")]'
67
-                                          '//span[contains(@class,"sn_tm")]'
68
-                                          '//text()')
68
+                                          '//span[contains(@class,"sn_tm")]')
69
+
69
         if publishedDateXPath is not None:
70
         if publishedDateXPath is not None:
70
-            publishedDate = escape(' '.join(publishedDateXPath))
71
+            publishedDate = escape(extract_text(publishedDateXPath))
71
 
72
 
72
         if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
73
         if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
73
             timeNumbers = re.findall(r'\d+', publishedDate)
74
             timeNumbers = re.findall(r'\d+', publishedDate)
74
-            publishedDate = datetime.now()\
75
-                - timedelta(minutes=int(timeNumbers[0]))
75
+            publishedDate = datetime.now() - timedelta(minutes=int(timeNumbers[0]))
76
         elif re.match("^[0-9]+ hour(s|) ago$", publishedDate):
76
         elif re.match("^[0-9]+ hour(s|) ago$", publishedDate):
77
             timeNumbers = re.findall(r'\d+', publishedDate)
77
             timeNumbers = re.findall(r'\d+', publishedDate)
78
-            publishedDate = datetime.now()\
79
-                - timedelta(hours=int(timeNumbers[0]))
80
-        elif re.match("^[0-9]+ hour(s|),"
81
-                      " [0-9]+ minute(s|) ago$", publishedDate):
78
+            publishedDate = datetime.now() - timedelta(hours=int(timeNumbers[0]))
79
+        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
82
             timeNumbers = re.findall(r'\d+', publishedDate)
80
             timeNumbers = re.findall(r'\d+', publishedDate)
83
             publishedDate = datetime.now()\
81
             publishedDate = datetime.now()\
84
                 - timedelta(hours=int(timeNumbers[0]))\
82
                 - timedelta(hours=int(timeNumbers[0]))\
85
                 - timedelta(minutes=int(timeNumbers[1]))
83
                 - timedelta(minutes=int(timeNumbers[1]))
86
         elif re.match("^[0-9]+ day(s|) ago$", publishedDate):
84
         elif re.match("^[0-9]+ day(s|) ago$", publishedDate):
87
             timeNumbers = re.findall(r'\d+', publishedDate)
85
             timeNumbers = re.findall(r'\d+', publishedDate)
88
-            publishedDate = datetime.now()\
89
-                - timedelta(days=int(timeNumbers[0]))
86
+            publishedDate = datetime.now() - timedelta(days=int(timeNumbers[0]))
90
         else:
87
         else:
91
             try:
88
             try:
92
-                # FIXME use params['language'] to parse either mm/dd or dd/mm
93
                 publishedDate = parser.parse(publishedDate, dayfirst=False)
89
                 publishedDate = parser.parse(publishedDate, dayfirst=False)
94
             except TypeError:
90
             except TypeError:
95
-                # FIXME
96
                 publishedDate = datetime.now()
91
                 publishedDate = datetime.now()
97
 
92
 
98
         # append result
93
         # append result

+ 236
- 0
searx/tests/engines/test_bing_news.py View File

1
+from collections import defaultdict
2
+import mock
3
+from searx.engines import bing_news
4
+from searx.testing import SearxTestCase
5
+
6
+
7
+class TestBingNewsEngine(SearxTestCase):
8
+
9
+    def test_request(self):
10
+        query = 'test_query'
11
+        dicto = defaultdict(dict)
12
+        dicto['pageno'] = 1
13
+        dicto['language'] = 'fr_FR'
14
+        params = bing_news.request(query, dicto)
15
+        self.assertIn('url', params)
16
+        self.assertIn(query, params['url'])
17
+        self.assertIn('bing.com', params['url'])
18
+        self.assertIn('fr', params['url'])
19
+        self.assertIn('_FP', params['cookies'])
20
+        self.assertIn('en', params['cookies']['_FP'])
21
+
22
+        dicto['language'] = 'all'
23
+        params = bing_news.request(query, dicto)
24
+        self.assertIn('en', params['url'])
25
+        self.assertIn('_FP', params['cookies'])
26
+        self.assertIn('en', params['cookies']['_FP'])
27
+
28
+    def test_response(self):
29
+        self.assertRaises(AttributeError, bing_news.response, None)
30
+        self.assertRaises(AttributeError, bing_news.response, [])
31
+        self.assertRaises(AttributeError, bing_news.response, '')
32
+        self.assertRaises(AttributeError, bing_news.response, '[]')
33
+
34
+        response = mock.Mock(content='<html></html>')
35
+        self.assertEqual(bing_news.response(response), [])
36
+
37
+        response = mock.Mock(content='<html></html>')
38
+        self.assertEqual(bing_news.response(response), [])
39
+
40
+        html = """
41
+        <div class="sn_r">
42
+            <div class="newstitle">
43
+                <a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
44
+                    Title
45
+                </a>
46
+            </div>
47
+            <div class="sn_img">
48
+                <a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
49
+                    <img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
50
+                </a>
51
+            </div>
52
+            <div class="sn_txt">
53
+                <div class="sn_oi">
54
+                    <span class="sn_snip">Article Content</span>
55
+                    <span class="sn_ST">
56
+                        <cite class="sn_src">metronews.fr</cite>
57
+                        &nbsp;&#0183;&#32;
58
+                        <span class="sn_tm">44 minutes ago</span>
59
+                    </span>
60
+                </div>
61
+            </div>
62
+        </div>
63
+        """
64
+        response = mock.Mock(content=html)
65
+        results = bing_news.response(response)
66
+        self.assertEqual(type(results), list)
67
+        self.assertEqual(len(results), 1)
68
+        self.assertEqual(results[0]['title'], 'Title')
69
+        self.assertEqual(results[0]['url'], 'http://url.of.article/')
70
+        self.assertEqual(results[0]['content'], 'Article Content')
71
+
72
+        html = """
73
+        <div class="sn_r">
74
+            <div class="newstitle">
75
+                <a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
76
+                    Title
77
+                </a>
78
+            </div>
79
+            <div class="sn_img">
80
+                <a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
81
+                    <img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
82
+                </a>
83
+            </div>
84
+            <div class="sn_txt">
85
+                <div class="sn_oi">
86
+                    <span class="sn_snip">Article Content</span>
87
+                    <span class="sn_ST">
88
+                        <cite class="sn_src">metronews.fr</cite>
89
+                        &nbsp;&#0183;&#32;
90
+                        <span class="sn_tm">44 minutes ago</span>
91
+                    </span>
92
+                </div>
93
+            </div>
94
+        </div>
95
+        <div class="sn_r">
96
+            <div class="newstitle">
97
+                <a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
98
+                    Title
99
+                </a>
100
+            </div>
101
+            <div class="sn_img">
102
+                <a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
103
+                    <img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
104
+                </a>
105
+            </div>
106
+            <div class="sn_txt">
107
+                <div class="sn_oi">
108
+                    <span class="sn_snip">Article Content</span>
109
+                    <span class="sn_ST">
110
+                        <cite class="sn_src">metronews.fr</cite>
111
+                        &nbsp;&#0183;&#32;
112
+                        <span class="sn_tm">3 hours, 44 minutes ago</span>
113
+                    </span>
114
+                </div>
115
+            </div>
116
+        </div>
117
+        <div class="sn_r">
118
+            <div class="newstitle">
119
+                <a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
120
+                    Title
121
+                </a>
122
+            </div>
123
+            <div class="sn_img">
124
+                <a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
125
+                    <img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
126
+                </a>
127
+            </div>
128
+            <div class="sn_txt">
129
+                <div class="sn_oi">
130
+                    <span class="sn_snip">Article Content</span>
131
+                    <span class="sn_ST">
132
+                        <cite class="sn_src">metronews.fr</cite>
133
+                        &nbsp;&#0183;&#32;
134
+                        <span class="sn_tm">44 hours ago</span>
135
+                    </span>
136
+                </div>
137
+            </div>
138
+        </div>
139
+        <div class="sn_r">
140
+            <div class="newstitle">
141
+                <a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
142
+                    Title
143
+                </a>
144
+            </div>
145
+            <div class="sn_img">
146
+                <a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
147
+                    <img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
148
+                </a>
149
+            </div>
150
+            <div class="sn_txt">
151
+                <div class="sn_oi">
152
+                    <span class="sn_snip">Article Content</span>
153
+                    <span class="sn_ST">
154
+                        <cite class="sn_src">metronews.fr</cite>
155
+                        &nbsp;&#0183;&#32;
156
+                        <span class="sn_tm">2 days ago</span>
157
+                    </span>
158
+                </div>
159
+            </div>
160
+        </div>
161
+        <div class="sn_r">
162
+            <div class="newstitle">
163
+                <a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
164
+                    Title
165
+                </a>
166
+            </div>
167
+            <div class="sn_img">
168
+                <a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
169
+                    <img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
170
+                </a>
171
+            </div>
172
+            <div class="sn_txt">
173
+                <div class="sn_oi">
174
+                    <span class="sn_snip">Article Content</span>
175
+                    <span class="sn_ST">
176
+                        <cite class="sn_src">metronews.fr</cite>
177
+                        &nbsp;&#0183;&#32;
178
+                        <span class="sn_tm">27/01/2015</span>
179
+                    </span>
180
+                </div>
181
+            </div>
182
+        </div>
183
+        <div class="sn_r">
184
+            <div class="newstitle">
185
+                <a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
186
+                    Title
187
+                </a>
188
+            </div>
189
+            <div class="sn_img">
190
+                <a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
191
+                    <img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
192
+                </a>
193
+            </div>
194
+            <div class="sn_txt">
195
+                <div class="sn_oi">
196
+                    <span class="sn_snip">Article Content</span>
197
+                    <span class="sn_ST">
198
+                        <cite class="sn_src">metronews.fr</cite>
199
+                        &nbsp;&#0183;&#32;
200
+                        <span class="sn_tm">Il y a 3 heures</span>
201
+                    </span>
202
+                </div>
203
+            </div>
204
+        </div>
205
+        """
206
+        response = mock.Mock(content=html)
207
+        results = bing_news.response(response)
208
+        self.assertEqual(type(results), list)
209
+        self.assertEqual(len(results), 6)
210
+
211
+        html = """
212
+        <div class="newstitle">
213
+            <a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
214
+                Title
215
+            </a>
216
+        </div>
217
+        <div class="sn_img">
218
+            <a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
219
+                <img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
220
+            </a>
221
+        </div>
222
+        <div class="sn_txt">
223
+            <div class="sn_oi">
224
+                <span class="sn_snip">Article Content</span>
225
+                <span class="sn_ST">
226
+                    <cite class="sn_src">metronews.fr</cite>
227
+                    &nbsp;&#0183;&#32;
228
+                    <span class="sn_tm">44 minutes ago</span>
229
+                </span>
230
+            </div>
231
+        </div>
232
+        """
233
+        response = mock.Mock(content=html)
234
+        results = bing_news.response(response)
235
+        self.assertEqual(type(results), list)
236
+        self.assertEqual(len(results), 0)

+ 1
- 0
searx/tests/test_engines.py View File

1
 from searx.tests.engines.test_bing import *  # noqa
1
 from searx.tests.engines.test_bing import *  # noqa
2
 from searx.tests.engines.test_bing_images import *  # noqa
2
 from searx.tests.engines.test_bing_images import *  # noqa
3
+from searx.tests.engines.test_bing_news import *  # noqa
3
 from searx.tests.engines.test_dailymotion import *  # noqa
4
 from searx.tests.engines.test_dailymotion import *  # noqa
4
 from searx.tests.engines.test_deezer import *  # noqa
5
 from searx.tests.engines.test_deezer import *  # noqa
5
 from searx.tests.engines.test_deviantart import *  # noqa
6
 from searx.tests.engines.test_deviantart import *  # noqa