Ver código fonte

[enh] xpath engine added

asciimoo 11 anos atrás
pai
commit
badd988545
1 arquivos alterados com 54 adições e 0 exclusões
  1. 54
    0
      searx/engines/xpath.py

+ 54
- 0
searx/engines/xpath.py Ver arquivo

@@ -0,0 +1,54 @@
1
+from lxml import html
2
+from urllib import urlencode
3
+from urlparse import urlparse, urljoin
4
+from cgi import escape
5
+from lxml.etree import _ElementStringResult
6
+
7
# Engine settings. All of them default to None here.
# NOTE(review): presumably overwritten by whatever loads this engine with
# values from the engine configuration - confirm against the loader.

# URL template; request() fills its ``{query}`` placeholder via str.format().
search_url = None
# XPath evaluated on the whole response document; yields one node per result.
results_xpath = None
# XPaths evaluated relative to each result node in response().
url_xpath = None
content_xpath = None
title_xpath = None
12
+
13
def extract_url(xpath_results):
    """Turn the result of the url xpath into an absolute http(s) URL.

    ``xpath_results`` must be a non-empty list.  If its first item exposes
    an ``attrib`` mapping (an element node), the URL is read from the
    ``href`` attribute of that item.  Otherwise all items are treated as
    string fragments and concatenated.

    The raw value is then normalized:

    * ``//host/path`` (protocol-relative) gets the scheme of ``search_url``;
    * ``/path`` is resolved against ``search_url``;
    * anything that still lacks an ``http://`` / ``https://`` prefix gets
      ``http://`` prepended.

    Returns the normalized URL string.

    Raises ``Exception`` when ``xpath_results`` is not a list, is empty,
    holds no usable URL, or when the final URL has no network location.
    """
    if not isinstance(xpath_results, list):
        raise Exception('Cannot handle xpath url resultset')
    if not xpath_results:
        raise Exception('Empty url resultset')

    first = xpath_results[0]
    attributes = getattr(first, 'attrib', None)
    if attributes is not None:
        # Element node: take the link target from its href attribute.
        url = attributes.get('href')
    else:
        # String results: the URL may be split across several text nodes.
        url = ''.join(xpath_results)

    if not url:
        # Missing href or empty text: nothing to build a URL from.
        raise Exception('Cannot parse url')

    if url.startswith('//'):
        # Protocol-relative URL: reuse the search URL's scheme.  The ':'
        # separator is required, otherwise the result is 'http//host'.
        scheme = urlparse(search_url).scheme or 'http'
        url = scheme + ':' + url
    elif url.startswith('/'):
        url = urljoin(search_url, url)

    # Only prepend a scheme when neither accepted prefix is present.
    if not url.startswith(('http://', 'https://')):
        url = 'http://' + url

    if not urlparse(url).netloc:
        raise Exception('Cannot parse url')
    return url
36
+
37
def request(query, params):
    """Fill ``params`` with the search URL for ``query`` and return it.

    The query is URL-encoded, substituted for the ``{query}`` placeholder
    of ``search_url`` and stored under ``params['url']``.  The encoded
    query is also stored under ``params['query']``.
    """
    key_prefix = 'q='
    # urlencode() yields 'q=<encoded query>'; keep only the encoded value.
    encoded_query = urlencode({'q': query})[len(key_prefix):]
    params['url'] = search_url.format(query=encoded_query)
    params['query'] = encoded_query
    return params
42
+
43
+
44
def response(resp):
    """Parse the HTML in ``resp.text`` into a list of result dicts.

    Each node matched by ``results_xpath`` produces one dict with the keys
    ``url``, ``title`` and ``content``.  The content is HTML-escaped and
    every occurrence of ``resp.search_params['query']`` in it is wrapped
    in ``<b>`` tags.

    Exceptions raised by ``extract_url`` are not caught here.
    """
    results = []
    dom = html.fromstring(resp.text)
    # NOTE(review): this is the value request() stored, i.e. the
    # URL-encoded query - confirm that is the intended highlight term.
    query = resp.search_params['query']
    for result in dom.xpath(results_xpath):
        url = extract_url(result.xpath(url_xpath))
        title = ' '.join(result.xpath(title_xpath))
        content = escape(' '.join(result.xpath(content_xpath)))
        # str.replace('') would insert the replacement between every
        # character, so only highlight when there is a query to look for.
        if query:
            content = content.replace(query, '<b>{0}</b>'.format(query))
        results.append({'url': url, 'title': title, 'content': content})

    return results