|
|
@@ -1,43 +1,62 @@
|
|
1
|
|
-#!/usr/bin/env python
|
|
|
1
|
+## Google (News)
|
|
|
2
|
+#
|
|
|
3
|
+# @website https://www.google.com
|
|
|
4
|
+# @provide-api yes (https://developers.google.com/web-search/docs/), deprecated!
|
|
|
5
|
+#
|
|
|
6
|
+# @using-api yes
|
|
|
7
|
+# @results JSON
|
|
|
8
|
+# @stable yes (but deprecated)
|
|
|
9
|
+# @parse url, title, content, publishedDate
|
|
2
|
10
|
|
|
3
|
11
|
from urllib import urlencode
|
|
4
|
12
|
from json import loads
|
|
5
|
13
|
from dateutil import parser
|
|
6
|
14
|
|
|
|
15
|
# engine dependent config
categories = ['news']
paging = True
language_support = True

# search-url
# {offset}, {query} and {language} are placeholders filled in by request().
url = 'https://ajax.googleapis.com/'
search_url = url + (
    'ajax/services/search/news'
    '?v=2.0&start={offset}&rsz=large&safe=off&filter=off'
    '&{query}&hl={language}'
)
|
|
11
|
23
|
|
|
12
|
|
-paging = True
|
|
13
|
|
-language_support = True
|
|
14
|
|
-
|
|
15
|
24
|
|
|
|
25
|
+# do search-request
|
|
16
|
26
|
def request(query, params):
    """Build the news search URL for ``query`` and store it in ``params``.

    Reads ``params['pageno']`` and ``params['language']``, writes the
    finished URL to ``params['url']`` and returns the same ``params`` dict.
    """
    # each page advances the start index by 8
    offset = 8 * (params['pageno'] - 1)

    # 'all' means no specific language; fall back to en-US, otherwise
    # turn a locale such as 'de_DE' into the dashed form 'de-DE'
    if params['language'] == 'all':
        language = 'en-US'
    else:
        language = params['language'].replace('_', '-')

    encoded_query = urlencode({'q': query})
    params['url'] = search_url.format(offset=offset,
                                      query=encoded_query,
                                      language=language)

    return params
|
|
25
|
38
|
|
|
26
|
39
|
|
|
|
40
|
+# get response from search-request
|
|
27
|
41
|
def response(resp):
    """Turn the JSON reply of the news search into a list of results.

    ``resp.text`` is decoded as JSON and every entry found under
    ``responseData.results`` is converted into a dict with the keys
    ``url``, ``title``, ``publishedDate`` and ``content``.

    Returns an empty list when the reply holds no results, including
    the case where ``responseData`` is present but ``null``.
    """
    search_res = loads(resp.text)

    # A default passed to .get() only covers a missing key. A JSON null
    # decodes to None, which has no .get(), so fall back to an empty
    # dict for any falsy value before looking up the results.
    response_data = search_res.get('responseData') or {}
    raw_results = response_data.get('results')

    # return empty array if there are no results
    if not raw_results:
        return []

    results = []

    # parse results
    for result in raw_results:
        # parse publishedDate, e.g. "Mon, 10 Mar 2014 16:26:15 -0700"
        publishedDate = parser.parse(result['publishedDate'])

        # append result
        results.append({'url': result['unescapedUrl'],
                        'title': result['titleNoFormatting'],
                        'publishedDate': publishedDate,
                        'content': result['content']})

    # return results
    return results
|