@@ -5,10 +5,10 @@ from cgi import escape
 from lxml.etree import _ElementStringResult
 
 search_url = None
-results_xpath = None
 url_xpath = None
 content_xpath = None
 title_xpath = None
+results_xpath = ''
 
 def extract_url(xpath_results):
     url = ''
@@ -26,7 +26,7 @@ def extract_url(xpath_results):
         else:
             url = xpath_results[0].attrib.get('href')
     else:
-        raise Exception('Cannot handle xpath url resultset')
+        url = xpath_results.attrib.get('href')
     if not url.startswith('http://') or not url.startswith('https://'):
         url = 'http://'+url
     parsed_url = urlparse(url)
@@ -45,10 +45,15 @@ def response(resp):
     results = []
     dom = html.fromstring(resp.text)
     query = resp.search_params['query']
-    for result in dom.xpath(results_xpath):
-        url = extract_url(result.xpath(url_xpath))
-        title = ' '.join(result.xpath(title_xpath))
-        content = escape(' '.join(result.xpath(content_xpath))).replace(query, '<b>{0}</b>'.format(query))
-        results.append({'url': url, 'title': title, 'content': content})
+    if results_xpath:
+        for result in dom.xpath(results_xpath):
+            url = extract_url(result.xpath(url_xpath))
+            title = ' '.join(result.xpath(title_xpath))
+            content = escape(' '.join(result.xpath(content_xpath))).replace(query, '<b>{0}</b>'.format(query))
+            results.append({'url': url, 'title': title, 'content': content})
+    else:
+        for content, url, title in zip(dom.xpath(content_xpath), map(extract_url, dom.xpath(url_xpath)), dom.xpath(title_xpath)):
+            results.append({'url': url, 'title': title, 'content': content})
+
 
     return results
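For illustration only, a minimal sketch of the two configuration styles this change allows. The xpath expressions and the page markup they assume are made up; only the variable names (results_xpath, url_xpath, title_xpath, content_xpath) come from the patch.

# Hypothetical engine configuration, not part of this patch.
# Style 1 - per-result container: results_xpath selects one node per result,
# and the other three expressions are evaluated relative to that node.
results_xpath = '//div[@class="result"]'
url_xpath = './/a/@href'
title_xpath = './/h3//text()'
content_xpath = './/p[@class="snippet"]//text()'

# Style 2 - flat lists: leave results_xpath at its new '' default. The three
# expressions are evaluated against the whole page and zipped together, so each
# must yield exactly one item per result, in the same order. url_xpath selects
# the <a> element itself so extract_url() can read its href attribute.
results_xpath = ''
url_xpath = '//div[@class="result"]//a'
title_xpath = '//div[@class="result"]//h3/text()'
content_xpath = '//div[@class="result"]//p[@class="snippet"]/text()'

Because the fallback pairs the three result lists positionally with zip(), the flat-list style only lines up when every result contributes one url, one title and one content node; a missing snippet shifts all later rows.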