|
@@ -1,47 +1,79 @@
|
|
1
|
+## Startpage (Web)
|
|
2
|
+#
|
|
3
|
+# @website https://startpage.com
|
|
4
|
+# @provide-api no (nothing found)
|
|
5
|
+#
|
|
6
|
+# @using-api no
|
|
7
|
+# @results HTML
|
|
8
|
+# @stable no (HTML can change)
|
|
9
|
+# @parse url, title, content
|
|
10
|
+#
|
|
11
|
+# @todo paging
|
|
12
|
+
|
1
|
13
|
from urllib import urlencode
|
2
|
14
|
from lxml import html
|
3
|
15
|
from cgi import escape
|
|
16
|
+import re
|
|
17
|
+
|
|
18
|
+# engine dependent config
|
|
19
|
+categories = ['general']
|
|
20
|
+# there is a mechanism to block "bot" searches (probably the parameter qid), which requires storing qids between multiple search calls
|
|
21
|
+#paging = False
|
|
22
|
+language_support = True
|
4
|
23
|
|
5
|
|
-base_url = None
|
6
|
|
-search_url = None
|
|
24
|
+# search-url
|
|
25
|
+base_url = 'https://startpage.com/'
|
|
26
|
+search_url = base_url + 'do/search'
|
7
|
27
|
|
8
|
|
-# TODO paging
|
9
|
|
-paging = False
|
10
|
|
-# TODO complete list of country mapping
|
11
|
|
-country_map = {'en_US': 'eng',
|
12
|
|
- 'en_UK': 'uk',
|
13
|
|
- 'nl_NL': 'ned'}
|
|
28
|
+# specific xpath variables
|
|
29
|
+# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
|
|
30
|
+# not ads: div[@class="result"] are the direct children of div[@id="results"]
|
|
31
|
+results_xpath = '//div[@class="result"]'
|
|
32
|
+link_xpath = './/h3/a'
|
14
|
33
|
|
15
|
34
|
|
|
35
|
+# do search-request
|
16
|
36
|
def request(query, params):
|
|
37
|
+ offset = (params['pageno'] - 1) * 10
|
17
|
38
|
query = urlencode({'q': query})[2:]
|
|
39
|
+
|
18
|
40
|
params['url'] = search_url
|
19
|
41
|
params['method'] = 'POST'
|
20
|
42
|
params['data'] = {'query': query,
|
21
|
|
- 'startat': (params['pageno'] - 1) * 10} # offset
|
22
|
|
- country = country_map.get(params['language'], 'eng')
|
23
|
|
- params['cookies']['preferences'] = \
|
24
|
|
- 'lang_homepageEEEs/air/{country}/N1NsslEEE1N1Nfont_sizeEEEmediumN1Nrecent_results_filterEEE1N1Nlanguage_uiEEEenglishN1Ndisable_open_in_new_windowEEE0N1Ncolor_schemeEEEnewN1Nnum_of_resultsEEE10N1N'.format(country=country) # noqa
|
|
43
|
+ 'startat': offset}
|
|
44
|
+
|
|
45
|
+ # set language if specified
|
|
46
|
+ if params['language'] != 'all':
|
|
47
|
+ params['data']['with_language'] = 'lang_' + params['language'].split('_')[0]
|
|
48
|
+
|
25
|
49
|
return params
|
26
|
50
|
|
27
|
51
|
|
|
52
|
+# get response from search-request
|
28
|
53
|
def response(resp):
|
29
|
54
|
results = []
|
|
55
|
+
|
30
|
56
|
dom = html.fromstring(resp.content)
|
31
|
|
- # ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
|
32
|
|
- # not ads: div[@class="result"] are the direct childs of div[@id="results"]
|
33
|
|
- for result in dom.xpath('//div[@class="result"]'):
|
34
|
|
- link = result.xpath('.//h3/a')[0]
|
|
57
|
+
|
|
58
|
+ # parse results
|
|
59
|
+ for result in dom.xpath(results_xpath):
|
|
60
|
+ link = result.xpath(link_xpath)[0]
|
35
|
61
|
url = link.attrib.get('href')
|
36
|
|
- if url.startswith('http://www.google.')\
|
37
|
|
- or url.startswith('https://www.google.'):
|
38
|
|
- continue
|
39
|
62
|
title = escape(link.text_content())
|
40
|
63
|
|
41
|
|
- content = ''
|
|
64
|
+ # block google-ad URLs
|
|
65
|
+ if re.match("^http(s|)://www.google.[a-z]+/aclk.*$", url):
|
|
66
|
+ continue
|
|
67
|
+
|
42
|
68
|
if result.xpath('./p[@class="desc"]'):
|
43
|
69
|
content = escape(result.xpath('./p[@class="desc"]')[0].text_content())
|
|
70
|
+ else:
|
|
71
|
+ content = ''
|
44
|
72
|
|
45
|
|
- results.append({'url': url, 'title': title, 'content': content})
|
|
73
|
+ # append result
|
|
74
|
+ results.append({'url': url,
|
|
75
|
+ 'title': title,
|
|
76
|
+ 'content': content})
|
46
|
77
|
|
|
78
|
+ # return results
|
47
|
79
|
return results
|