
[enh] removing result html tags

asciimoo committed 11 years ago · commit 17bf00ee42
4 files changed, 7 insertions(+), 6 deletions(-)
  1. searx/engines/duckduckgo.py  (+2 -1)
  2. searx/engines/startpage.py  (+2 -2)
  3. searx/engines/twitter.py  (+2 -1)
  4. searx/engines/xpath.py  (+1 -2)

searx/engines/duckduckgo.py  (+2 -1)

@@ -1,5 +1,6 @@
 from json import loads
 from urllib import urlencode
+from searx.utils import html_to_text
 
 url = 'https://duckduckgo.com/'
 search_url = url + 'd.js?{query}&l=us-en&p=1&s=0'
@@ -16,7 +17,7 @@
         if not r.get('t'):
             continue
         results.append({'title': r['t']
-                       ,'content': r['a']
+                       ,'content': html_to_text(r['a'])
                        ,'url': r['u']
                        })
     return results
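
The helper used above, html_to_text, comes from searx.utils. As a rough sketch of what such a tag-stripping helper does (a minimal illustration, not searx's actual implementation; Python 2, matching the engine code):

from HTMLParser import HTMLParser

class TextExtractor(HTMLParser):
    # Collects character data and drops all tags.
    def __init__(self):
        HTMLParser.__init__(self)
        self.parts = []

    def handle_data(self, data):
        self.parts.append(data)

    def handle_entityref(self, name):
        # Keep entity references instead of silently dropping them.
        self.parts.append('&%s;' % name)

def html_to_text(html_str):
    parser = TextExtractor()
    parser.feed(html_str)
    return ''.join(parser.parts)

# html_to_text('a <b>bold</b> snippet') -> 'a bold snippet'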

searx/engines/startpage.py  (+2 -2)

@@ -1,4 +1,4 @@
-from urllib import quote
+from urllib import urlencode
 from lxml import html
 from urlparse import urlparse
 from cgi import escape
@@ -8,7 +8,7 @@
 
 def request(query, params):
     global search_url
-    query = quote(query.replace(' ', '+'), safe='+')
+    query = urlencode({'q': query})[2:]
     params['url'] = search_url
     params['method'] = 'POST'
     params['data'] = {'query': query}
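
Why the [2:] slice: urlencode({'q': query}) returns the string 'q=<encoded value>', so dropping the first two characters leaves just the encoded query. Unlike the old quote() call, urlencode also percent-encodes reserved characters such as '+' inside the query. A quick illustration (Python 2):

from urllib import urlencode, quote

query = 'foo bar+baz'
print urlencode({'q': query})                    # q=foo+bar%2Bbaz
print urlencode({'q': query})[2:]                # foo+bar%2Bbaz
print quote(query.replace(' ', '+'), safe='+')   # foo+bar+baz (old way: a literal '+' is ambiguous)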

searx/engines/twitter.py  (+2 -1)

@@ -1,6 +1,7 @@
 from urlparse import urljoin
 from urllib import urlencode
 from lxml import html
+from cgi import escape
 
 categories = ['social media']
 
@@ -21,6 +22,6 @@
         link = tweet.xpath('.//small[@class="time"]//a')[0]
         url = urljoin(base_url, link.attrib.get('href'))
         title = ''.join(tweet.xpath('.//span[@class="username js-action-profile-name"]//text()'))
-        content = ''.join(map(html.tostring, tweet.xpath('.//p[@class="js-tweet-text tweet-text"]//*')))
+        content = escape(''.join(tweet.xpath('.//p[@class="js-tweet-text tweet-text"]//text()')))
         results.append({'url': url, 'title': title, 'content': content})
     return results
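
The old line serialized each child element of the tweet back to HTML with html.tostring, so raw markup leaked into the result content and any text outside child elements was lost; the new line collects only the text nodes and HTML-escapes them. An illustration with made-up tweet markup (Python 2):

from lxml import html
from cgi import escape

p = html.fromstring('<p>Hello <a href="/searx">@searx</a> &amp; world</p>')

# Old approach: raw tags leak through and the leading 'Hello ' is dropped.
print ''.join(map(html.tostring, p.xpath('.//*')))
# <a href="/searx">@searx</a> &amp; world

# New approach: plain, escaped text.
print escape(''.join(p.xpath('.//text()')))
# Hello @searx &amp; world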

searx/engines/xpath.py  (+1 -2)

@@ -46,12 +46,11 @@
 def response(resp):
     results = []
     dom = html.fromstring(resp.text)
-    query = resp.search_params['query']
     if results_xpath:
         for result in dom.xpath(results_xpath):
             url = extract_url(result.xpath(url_xpath))
             title = ' '.join(result.xpath(title_xpath))
-            content = escape(' '.join(result.xpath(content_xpath))).replace(query, '<b>{0}</b>'.format(query))
+            content = escape(' '.join(result.xpath(content_xpath)))
             results.append({'url': url, 'title': title, 'content': content})
     else:
         for content, url, title in zip(dom.xpath(content_xpath), map(extract_url, dom.xpath(url_xpath)), dom.xpath(title_xpath)):
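
The dropped .replace() call highlighted the query by injecting raw <b> tags into content that had just been escaped, re-introducing exactly the kind of result HTML this commit removes. A sketch of the old behaviour (Python 2):

from cgi import escape

query = 'tags'
content = escape('no more <b>result</b> tags')
print content                                          # no more &lt;b&gt;result&lt;/b&gt; tags
print content.replace(query, '<b>{0}</b>'.format(query))
# no more &lt;b&gt;result&lt;/b&gt; <b>tags</b>  <- raw markup back in the output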