Преглед изворни кода

rewrite duckduckgo engine and add comments

Thomas Pointhuber пре 10 година
родитељ
комит
e6e4de8ba0
2 измењених фајлова са 37 додато и 36 уклоњено
  1. 37
    34
      searx/engines/duckduckgo.py
  2. 0
    2
      searx/settings.yml

+ 37
- 34
searx/engines/duckduckgo.py Прегледај датотеку

1
+## DuckDuckGo (Web)
2
+# 
3
+# @website     https://duckduckgo.com/
4
+# @provide-api yes (https://duckduckgo.com/api), but it does not return all results of the search site
5
+# 
6
+# @using-api   no
7
+# @results     HTML (using search portal)
8
+# @stable      no (HTML can change)
9
+# @parse       url, title, content
10
+#
11
+# @todo        rewrite to api
12
+# @todo        language support
13
+
1
 from urllib import urlencode
14
 from urllib import urlencode
2
 from lxml.html import fromstring
15
 from lxml.html import fromstring
3
 from searx.utils import html_to_text
16
 from searx.utils import html_to_text
4
 
17
 
5
-url = 'https://duckduckgo.com/html?{query}&s={offset}'
18
+# engine dependent config
19
+categories = ['general']
20
+paging = True
6
 locale = 'us-en'
21
 locale = 'us-en'
7
 
22
 
23
+# search-url
24
+url = 'https://duckduckgo.com/html?{query}&s={offset}'
25
+
26
+# specific xpath variables
27
+result_xpath = '//div[@class="results_links results_links_deep web-result"]'  # noqa
28
+url_xpath = './/a[@class="large"]/@href'
29
+title_xpath = './/a[@class="large"]//text()'
30
+content_xpath = './/div[@class="snippet"]//text()'
8
 
31
 
32
+
33
+# do search-request
9
 def request(query, params):
34
 def request(query, params):
10
     offset = (params['pageno'] - 1) * 30
35
     offset = (params['pageno'] - 1) * 30
11
-    q = urlencode({'q': query,
12
-                   'l': locale})
13
-    params['url'] = url.format(query=q, offset=offset)
36
+
37
+    params['url'] = url.format(
38
+        query=urlencode({'q': query, 'l': locale}),
39
+        offset=offset)
40
+
14
     return params
41
     return params
15
 
42
 
16
 
43
 
44
+# get response from search-request
17
 def response(resp):
45
 def response(resp):
18
-    result_xpath = '//div[@class="results_links results_links_deep web-result"]'  # noqa
19
-    url_xpath = './/a[@class="large"]/@href'
20
-    title_xpath = './/a[@class="large"]//text()'
21
-    content_xpath = './/div[@class="snippet"]//text()'
22
     results = []
46
     results = []
23
 
47
 
24
     doc = fromstring(resp.text)
48
     doc = fromstring(resp.text)
28
             res_url = r.xpath(url_xpath)[-1]
52
             res_url = r.xpath(url_xpath)[-1]
29
         except:
53
         except:
30
             continue
54
             continue
55
+
31
         if not res_url:
56
         if not res_url:
32
             continue
57
             continue
58
+
33
         title = html_to_text(''.join(r.xpath(title_xpath)))
59
         title = html_to_text(''.join(r.xpath(title_xpath)))
34
         content = html_to_text(''.join(r.xpath(content_xpath)))
60
         content = html_to_text(''.join(r.xpath(content_xpath)))
61
+
62
+        # append result
35
         results.append({'title': title,
63
         results.append({'title': title,
36
                         'content': content,
64
                         'content': content,
37
                         'url': res_url})
65
                         'url': res_url})
38
 
66
 
67
+    # return results
39
     return results
68
     return results
40
-
41
-
42
-#from json import loads
43
-#search_url = url + 'd.js?{query}&p=1&s={offset}'
44
-#
45
-#paging = True
46
-#
47
-#
48
-#def request(query, params):
49
-#    offset = (params['pageno'] - 1) * 30
50
-#    q = urlencode({'q': query,
51
-#                   'l': locale})
52
-#    params['url'] = search_url.format(query=q, offset=offset)
53
-#    return params
54
-#
55
-#
56
-#def response(resp):
57
-#    results = []
58
-#    search_res = loads(resp.text[resp.text.find('[{'):-2])[:-1]
59
-#    for r in search_res:
60
-#        if not r.get('t'):
61
-#            continue
62
-#        results.append({'title': r['t'],
63
-#                       'content': html_to_text(r['a']),
64
-#                       'url': r['u']})
65
-#    return results

+ 0
- 2
searx/settings.yml Прегледај датотеку

37
 
37
 
38
   - name : deviantart
38
   - name : deviantart
39
     engine : deviantart
39
     engine : deviantart
40
-    categories : images
41
     shortcut : da
40
     shortcut : da
42
     timeout: 3.0
41
     timeout: 3.0
43
 
42
 
47
 
46
 
48
   - name : duckduckgo
47
   - name : duckduckgo
49
     engine : duckduckgo
48
     engine : duckduckgo
50
-    locale : en-us
51
     shortcut : ddg
49
     shortcut : ddg
52
 
50
 
53
 # down - website is under criminal investigation by the UK
51
 # down - website is under criminal investigation by the UK