|
@@ -1,15 +1,15 @@
|
1
|
|
-## Google (Web)
|
2
|
|
-#
|
|
1
|
+# Google (Web)
|
|
2
|
+#
|
3
|
3
|
# @website https://www.google.com
|
4
|
4
|
# @provide-api yes (https://developers.google.com/custom-search/)
|
5
|
|
-#
|
|
5
|
+#
|
6
|
6
|
# @using-api no
|
7
|
7
|
# @results HTML
|
8
|
8
|
# @stable no (HTML can change)
|
9
|
9
|
# @parse url, title, content, suggestion
|
10
|
10
|
|
11
|
11
|
from urllib import urlencode
|
12
|
|
-from urlparse import unquote,urlparse,parse_qsl
|
|
12
|
+from urlparse import urlparse, parse_qsl
|
13
|
13
|
from lxml import html
|
14
|
14
|
from searx.engines.xpath import extract_text, extract_url
|
15
|
15
|
|
|
@@ -23,10 +23,13 @@ google_hostname = 'www.google.com'
|
23
|
23
|
search_path = '/search'
|
24
|
24
|
redirect_path = '/url'
|
25
|
25
|
images_path = '/images'
|
26
|
|
-search_url = 'https://' + google_hostname + search_path + '?{query}&start={offset}&gbv=1'
|
|
26
|
+search_url = ('https://' +
|
|
27
|
+ google_hostname +
|
|
28
|
+ search_path +
|
|
29
|
+ '?{query}&start={offset}&gbv=1')
|
27
|
30
|
|
28
|
31
|
# specific xpath variables
|
29
|
|
-results_xpath= '//li[@class="g"]'
|
|
32
|
+results_xpath = '//li[@class="g"]'
|
30
|
33
|
url_xpath = './/h3/a/@href'
|
31
|
34
|
title_xpath = './/h3'
|
32
|
35
|
content_xpath = './/span[@class="st"]'
|
|
@@ -36,15 +39,18 @@ images_xpath = './/div/a'
|
36
|
39
|
image_url_xpath = './@href'
|
37
|
40
|
image_img_src_xpath = './img/@src'
|
38
|
41
|
|
|
42
|
+
|
39
|
43
|
# remove google-specific tracking-url
|
40
|
44
|
def parse_url(url_string):
|
41
|
45
|
parsed_url = urlparse(url_string)
|
42
|
|
- if parsed_url.netloc in [google_hostname, ''] and parsed_url.path==redirect_path:
|
|
46
|
+ if (parsed_url.netloc in [google_hostname, '']
|
|
47
|
+ and parsed_url.path == redirect_path):
|
43
|
48
|
query = dict(parse_qsl(parsed_url.query))
|
44
|
49
|
return query['q']
|
45
|
50
|
else:
|
46
|
51
|
return url_string
|
47
|
52
|
|
|
53
|
+
|
48
|
54
|
# do search-request
|
49
|
55
|
def request(query, params):
|
50
|
56
|
offset = (params['pageno'] - 1) * 10
|
|
@@ -52,7 +58,7 @@ def request(query, params):
|
52
|
58
|
if params['language'] == 'all':
|
53
|
59
|
language = 'en'
|
54
|
60
|
else:
|
55
|
|
- language = params['language'].replace('_','-').lower()
|
|
61
|
+ language = params['language'].replace('_', '-').lower()
|
56
|
62
|
|
57
|
63
|
params['url'] = search_url.format(offset=offset,
|
58
|
64
|
query=urlencode({'q': query}))
|
|
@@ -74,19 +80,21 @@ def response(resp):
|
74
|
80
|
try:
|
75
|
81
|
url = parse_url(extract_url(result.xpath(url_xpath), search_url))
|
76
|
82
|
parsed_url = urlparse(url)
|
77
|
|
- if parsed_url.netloc==google_hostname and parsed_url.path==search_path:
|
|
83
|
+ if (parsed_url.netloc == google_hostname
|
|
84
|
+ and parsed_url.path == search_path):
|
78
|
85
|
# remove the link to google news
|
79
|
86
|
continue
|
80
|
87
|
|
81
|
|
- if parsed_url.netloc==google_hostname and parsed_url.path==images_path:
|
|
88
|
+ if (parsed_url.netloc == google_hostname
|
|
89
|
+ and parsed_url.path == images_path):
|
82
|
90
|
# images result
|
83
|
91
|
results = results + parse_images(result)
|
84
|
92
|
else:
|
85
|
93
|
# normal result
|
86
|
94
|
content = extract_text(result.xpath(content_xpath)[0])
|
87
|
95
|
# append result
|
88
|
|
- results.append({'url': url,
|
89
|
|
- 'title': title,
|
|
96
|
+ results.append({'url': url,
|
|
97
|
+ 'title': title,
|
90
|
98
|
'content': content})
|
91
|
99
|
except:
|
92
|
100
|
continue
|
|
@@ -99,12 +107,13 @@ def response(resp):
|
99
|
107
|
# return results
|
100
|
108
|
return results
|
101
|
109
|
|
|
110
|
+
|
102
|
111
|
def parse_images(result):
|
103
|
112
|
results = []
|
104
|
113
|
for image in result.xpath(images_xpath):
|
105
|
114
|
url = parse_url(extract_text(image.xpath(image_url_xpath)[0]))
|
106
|
115
|
img_src = extract_text(image.xpath(image_img_src_xpath)[0])
|
107
|
|
-
|
|
116
|
+
|
108
|
117
|
# append result
|
109
|
118
|
results.append({'url': url,
|
110
|
119
|
'title': '',
|