Browse Source

[enh] xpath engine absolute xpath support

asciimoo 11 years ago
parent
commit
5d764f95cf
1 changed file with 12 additions and 7 deletions
  1. 12
    7
      searx/engines/xpath.py

+ 12
- 7
searx/engines/xpath.py View File

5
 from lxml.etree import _ElementStringResult
5
 from lxml.etree import _ElementStringResult
6
 
6
 
7
 search_url    = None
7
 search_url    = None
8
-results_xpath = None
9
 url_xpath     = None
8
 url_xpath     = None
10
 content_xpath = None
9
 content_xpath = None
11
 title_xpath   = None
10
 title_xpath   = None
11
+results_xpath = ''
12
 
12
 
13
 def extract_url(xpath_results):
13
 def extract_url(xpath_results):
14
     url = ''
14
     url = ''
26
         else:
26
         else:
27
             url = xpath_results[0].attrib.get('href')
27
             url = xpath_results[0].attrib.get('href')
28
     else:
28
     else:
29
-        raise Exception('Cannot handle xpath url resultset')
29
+        url = xpath_results.attrib.get('href')
30
     if not url.startswith('http://') or not url.startswith('https://'):
30
     if not url.startswith('http://') or not url.startswith('https://'):
31
         url = 'http://'+url
31
         url = 'http://'+url
32
     parsed_url = urlparse(url)
32
     parsed_url = urlparse(url)
45
     results = []
45
     results = []
46
     dom = html.fromstring(resp.text)
46
     dom = html.fromstring(resp.text)
47
     query = resp.search_params['query']
47
     query = resp.search_params['query']
48
-    for result in dom.xpath(results_xpath):
49
-        url = extract_url(result.xpath(url_xpath))
50
-        title = ' '.join(result.xpath(title_xpath))
51
-        content = escape(' '.join(result.xpath(content_xpath))).replace(query, '<b>{0}</b>'.format(query))
52
-        results.append({'url': url, 'title': title, 'content': content})
48
+    if results_xpath:
49
+        for result in dom.xpath(results_xpath):
50
+            url = extract_url(result.xpath(url_xpath))
51
+            title = ' '.join(result.xpath(title_xpath))
52
+            content = escape(' '.join(result.xpath(content_xpath))).replace(query, '<b>{0}</b>'.format(query))
53
+            results.append({'url': url, 'title': title, 'content': content})
54
+    else:
55
+        for content, url, title in zip(dom.xpath(content_xpath), map(extract_url, dom.xpath(url_xpath)), dom.xpath(title_xpath)):
56
+            results.append({'url': url, 'title': title, 'content': content})
57
+
53
 
58
 
54
     return results
59
     return results