|
@@ -1,5 +1,5 @@
|
1
|
1
|
from lxml import html
|
2
|
|
-from urllib import urlencode
|
|
2
|
+from urllib import urlencode, unquote
|
3
|
3
|
from urlparse import urlparse, urljoin
|
4
|
4
|
from cgi import escape
|
5
|
5
|
from lxml.etree import _ElementStringResult
|
|
@@ -11,32 +11,64 @@ title_xpath = None
|
11
|
11
|
suggestion_xpath = ''
|
12
|
12
|
results_xpath = ''
|
13
|
13
|
|
14
|
|
-def extract_url(xpath_results):
|
15
|
|
- url = ''
|
16
|
|
- parsed_search_url = urlparse(search_url)
|
|
14
|
+'''
|
|
15
|
+if xpath_results is a list, extract the text from each result and concat the list
|
|
16
|
+if xpath_results is an XML element, extract all the text nodes from it ( text_content() method from lxml )
|
|
17
|
+if xpath_results is a string element, then it's already done
|
|
18
|
+'''
|
|
19
|
+def extract_text(xpath_results):
|
17
|
20
|
if type(xpath_results) == list:
|
|
21
|
+        # it's a list of results : concat everything using a recursive call
|
18
|
22
|
if not len(xpath_results):
|
19
|
23
|
raise Exception('Empty url resultset')
|
20
|
|
- if type(xpath_results[0]) == _ElementStringResult:
|
21
|
|
- url = ''.join(xpath_results)
|
22
|
|
- if url.startswith('//'):
|
23
|
|
- url = parsed_search_url.scheme+url
|
24
|
|
- elif url.startswith('/'):
|
25
|
|
- url = urljoin(search_url, url)
|
26
|
|
- #TODO
|
27
|
|
- else:
|
28
|
|
- url = xpath_results[0].attrib.get('href')
|
|
24
|
+ result = ''
|
|
25
|
+ for e in xpath_results:
|
|
26
|
+ result = result + extract_text(e)
|
|
27
|
+ return result
|
|
28
|
+ elif type(xpath_results) == _ElementStringResult:
|
|
29
|
+ # it's a string
|
|
30
|
+ return ''.join(xpath_results)
|
29
|
31
|
else:
|
30
|
|
- url = xpath_results.attrib.get('href')
|
31
|
|
- if not url.startswith('http://') and not url.startswith('https://'):
|
32
|
|
- url = 'http://'+url
|
|
32
|
+        # it's an element
|
|
33
|
+ return xpath_results.text_content()
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+def extract_url(xpath_results):
|
|
37
|
+ url = extract_text(xpath_results)
|
|
38
|
+
|
|
39
|
+ if url.startswith('//'):
|
|
40
|
+ # add http or https to this kind of url //example.com/
|
|
41
|
+ parsed_search_url = urlparse(search_url)
|
|
42
|
+ url = parsed_search_url.scheme+url
|
|
43
|
+ elif url.startswith('/'):
|
|
44
|
+ # fix relative url to the search engine
|
|
45
|
+ url = urljoin(search_url, url)
|
|
46
|
+
|
|
47
|
+ # normalize url
|
|
48
|
+ url = normalize_url(url)
|
|
49
|
+
|
|
50
|
+ return url
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+def normalize_url(url):
|
33
|
54
|
parsed_url = urlparse(url)
|
|
55
|
+
|
|
56
|
+    # add a / at the end of the url if there is no path
|
34
|
57
|
if not parsed_url.netloc:
|
35
|
58
|
raise Exception('Cannot parse url')
|
36
|
59
|
if not parsed_url.path:
|
37
|
60
|
url += '/'
|
|
61
|
+
|
|
62
|
+ # FIXME : hack for yahoo
|
|
63
|
+ if parsed_url.hostname == 'search.yahoo.com' and parsed_url.path.startswith('/r'):
|
|
64
|
+ p = parsed_url.path
|
|
65
|
+ mark = p.find('/**')
|
|
66
|
+ if mark != -1:
|
|
67
|
+ return unquote(p[mark+3:]).decode('utf-8')
|
|
68
|
+
|
38
|
69
|
return url
|
39
|
70
|
|
|
71
|
+
|
40
|
72
|
def request(query, params):
|
41
|
73
|
query = urlencode({'q': query})[2:]
|
42
|
74
|
params['url'] = search_url.format(query=query)
|
|
@@ -50,15 +82,19 @@ def response(resp):
|
50
|
82
|
if results_xpath:
|
51
|
83
|
for result in dom.xpath(results_xpath):
|
52
|
84
|
url = extract_url(result.xpath(url_xpath))
|
53
|
|
- title = ' '.join(result.xpath(title_xpath))
|
54
|
|
- content = escape(' '.join(result.xpath(content_xpath)))
|
|
85
|
+ title = extract_text(result.xpath(title_xpath)[0 ])
|
|
86
|
+ content = extract_text(result.xpath(content_xpath)[0])
|
55
|
87
|
results.append({'url': url, 'title': title, 'content': content})
|
56
|
88
|
else:
|
57
|
|
- for content, url, title in zip(dom.xpath(content_xpath), map(extract_url, dom.xpath(url_xpath)), dom.xpath(title_xpath)):
|
|
89
|
+ for url, title, content in zip(
|
|
90
|
+ map(extract_url, dom.xpath(url_xpath)), \
|
|
91
|
+ map(extract_text, dom.xpath(title_xpath)), \
|
|
92
|
+ map(extract_text, dom.xpath(content_xpath)), \
|
|
93
|
+ ):
|
58
|
94
|
results.append({'url': url, 'title': title, 'content': content})
|
59
|
95
|
|
60
|
96
|
if not suggestion_xpath:
|
61
|
97
|
return results
|
62
|
98
|
for suggestion in dom.xpath(suggestion_xpath):
|
63
|
|
- results.append({'suggestion': escape(''.join(suggestion.xpath('.//text()')))})
|
|
99
|
+ results.append({'suggestion': extract_text(suggestion)})
|
64
|
100
|
return results
|