소스 검색

[enh] infobox for wolframalpha

TODO:
    - infobox styles
    - unit tests

ISSUES:
    - The no_api version needs to make an extra request to the server for
      additional pods, such as plots, so it is even slower than before.
      If requests reach the timeout, either comment out the part that
      calls get_async_pod or increase the timeout in settings.yml.
a01200356 9 년 전
부모
커밋
78d3f3d6b1
3개의 변경된 파일, 150개의 추가 그리고 46개의 삭제
  1. 61
    23
      searx/engines/wolframalpha_api.py
  2. 86
    20
      searx/engines/wolframalpha_noapi.py
  3. 3
    3
      searx/settings.yml

+ 61
- 23
searx/engines/wolframalpha_api.py 파일 보기

@@ -1,40 +1,56 @@
1
-# Wolfram Alpha (Maths)
1
+# Wolfram Alpha (Science)
2 2
 #
3
-# @website     http://www.wolframalpha.com
4
-# @provide-api yes (http://api.wolframalpha.com/v2/)
3
+# @website     https://www.wolframalpha.com
4
+# @provide-api yes (https://api.wolframalpha.com/v2/)
5 5
 #
6 6
 # @using-api   yes
7 7
 # @results     XML
8 8
 # @stable      yes
9
-# @parse       result
9
+# @parse       url, infobox
10 10
 
11 11
 from urllib import urlencode
12 12
 from lxml import etree
13
-from re import search
14 13
 
15 14
 # search-url
16
-base_url = 'http://api.wolframalpha.com/v2/query'
17
-search_url = base_url + '?appid={api_key}&{query}&format=plaintext'
18
-site_url = 'http://www.wolframalpha.com/input/?{query}'
15
+search_url = 'https://api.wolframalpha.com/v2/query?appid={api_key}&{query}'
16
+site_url = 'https://www.wolframalpha.com/input/?{query}'
19 17
 api_key = ''  # defined in settings.yml
20 18
 
21 19
 # xpath variables
22 20
 failure_xpath = '/queryresult[attribute::success="false"]'
23 21
 answer_xpath = '//pod[attribute::primary="true"]/subpod/plaintext'
24 22
 input_xpath = '//pod[starts-with(attribute::title, "Input")]/subpod/plaintext'
23
+pods_xpath = '//pod'
24
+subpods_xpath = './subpod'
25
+pod_title_xpath = './@title'
26
+plaintext_xpath = './plaintext'
27
+image_xpath = './img'
28
+img_src_xpath = './@src'
29
+img_alt_xpath = './@alt'
30
+
31
+# pods to display as image in infobox
32
+# these pods do return plaintext, but they look better and are more useful as images
33
+image_pods = {'Visual representation',
34
+              'Manipulatives illustration'}
25 35
 
26 36
 
27 37
 # do search-request
28 38
 def request(query, params):
29 39
     params['url'] = search_url.format(query=urlencode({'input': query}),
30 40
                                       api_key=api_key)
41
+    params['headers']['Referer'] = site_url.format(query=urlencode({'i': query}))
31 42
 
32 43
     return params
33 44
 
34 45
 
35 46
 # replace private user area characters to make text legible
36 47
 def replace_pua_chars(text):
37
-    pua_chars = {u'\uf74c': 'd',
48
+    pua_chars = {u'\uf522': u'\u2192',
49
+                 u'\uf7b1': u'\u2115',
50
+                 u'\uf7b4': u'\u211a',
51
+                 u'\uf7b5': u'\u211d',
52
+                 u'\uf7bd': u'\u2124',
53
+                 u'\uf74c': 'd',
38 54
                  u'\uf74d': u'\u212f',
39 55
                  u'\uf74e': 'i',
40 56
                  u'\uf7d9': '='}
@@ -55,23 +71,45 @@ def response(resp):
55 71
     if search_results.xpath(failure_xpath):
56 72
         return []
57 73
 
58
-    # parse answers
59
-    answers = search_results.xpath(answer_xpath)
60
-    if answers:
61
-        for answer in answers:
62
-            answer = replace_pua_chars(answer.text)
74
+    infobox_title = search_results.xpath(input_xpath)
75
+    if infobox_title:
76
+        infobox_title = replace_pua_chars(infobox_title[0].text)
77
+
78
+    pods = search_results.xpath(pods_xpath)
79
+    result_chunks = []
80
+    for pod in pods:
81
+        pod_title = replace_pua_chars(pod.xpath(pod_title_xpath)[0])
82
+
83
+        subpods = pod.xpath(subpods_xpath)
84
+        if not subpods:
85
+            continue
86
+
87
+        for subpod in subpods:
88
+            content = subpod.xpath(plaintext_xpath)[0].text
89
+            image = subpod.xpath(image_xpath)
90
+            if content and pod_title not in image_pods:
91
+                content = replace_pua_chars(content)
92
+                result_chunks.append({'label': pod_title, 'value': content})
63 93
 
64
-            results.append({'answer': answer})
94
+                # if there's no input pod, infobox_title is content of first pod
95
+                if not infobox_title:
96
+                    infobox_title = content
97
+
98
+            elif image:
99
+                result_chunks.append({'label': pod_title,
100
+                                      'image': {'src': image[0].xpath(img_src_xpath)[0],
101
+                                                'alt': image[0].xpath(img_alt_xpath)[0]}})
102
+
103
+    if not result_chunks:
104
+        return []
65 105
 
66
-    # if there's no input section in search_results, check if answer has the input embedded (before their "=" sign)
67
-    try:
68
-        query_input = search_results.xpath(input_xpath)[0].text
69
-    except IndexError:
70
-        query_input = search(u'([^\uf7d9]+)', answers[0].text).group(1)
106
+    results.append({'infobox': infobox_title,
107
+                    'attributes': result_chunks,
108
+                    'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer']}]})
71 109
 
72 110
     # append link to site
73
-    result_url = site_url.format(query=urlencode({'i': query_input.encode('utf-8')}))
74
-    results.append({'url': result_url,
75
-                    'title': query_input + " - Wolfram|Alpha"})
111
+    results.append({'url': resp.request.headers['Referer'],
112
+                    'title': 'Wolfram|Alpha',
113
+                    'content': infobox_title})
76 114
 
77 115
     return results

+ 86
- 20
searx/engines/wolframalpha_noapi.py 파일 보기

@@ -1,23 +1,23 @@
1
-# WolframAlpha (Maths)
1
+# Wolfram|Alpha (Science)
2 2
 #
3
-# @website     http://www.wolframalpha.com/
4
-# @provide-api yes (http://api.wolframalpha.com/v2/)
3
+# @website     https://www.wolframalpha.com/
4
+# @provide-api yes (https://api.wolframalpha.com/v2/)
5 5
 #
6 6
 # @using-api   no
7
-# @results     HTML
7
+# @results     JSON
8 8
 # @stable      no
9
-# @parse       answer
9
+# @parse       url, infobox
10 10
 
11 11
 from cgi import escape
12 12
 from json import loads
13 13
 from time import time
14 14
 from urllib import urlencode
15
+from lxml.etree import XML
15 16
 
16 17
 from searx.poolrequests import get as http_get
17 18
 
18 19
 # search-url
19 20
 url = 'https://www.wolframalpha.com/'
20
-search_url = url + 'input/?{query}'
21 21
 
22 22
 search_url = url + 'input/json.jsp'\
23 23
     '?async=true'\
@@ -33,13 +33,25 @@ search_url = url + 'input/json.jsp'\
33 33
     '&sponsorcategories=true'\
34 34
     '&statemethod=deploybutton'
35 35
 
36
-# xpath variables
37
-scripts_xpath = '//script'
38
-title_xpath = '//title'
39
-failure_xpath = '//p[attribute::class="pfail"]'
36
+referer_url = url + 'input/?{query}'
37
+
40 38
 token = {'value': '',
41 39
          'last_updated': None}
42 40
 
41
+# xpath variables
42
+success_xpath = '/pod[attribute::error="false"]'
43
+plaintext_xpath = './plaintext'
44
+title_xpath = './@title'
45
+image_xpath = './img'
46
+img_src_xpath = './img/@src'
47
+img_alt_xpath = './img/@alt'
48
+
49
+# pods to display as image in infobox
50
+# these pods do return plaintext, but they look better and are more useful as images
51
+image_pods = {'Visual representation',
52
+              'Manipulatives illustration',
53
+              'Symbol'}
54
+
43 55
 
44 56
 # seems, wolframalpha resets its token in every hour
45 57
 def obtain_token():
@@ -62,13 +74,42 @@ def request(query, params):
62 74
     if time() - token['last_updated'] > 3600:
63 75
         obtain_token()
64 76
     params['url'] = search_url.format(query=urlencode({'input': query}), token=token['value'])
65
-    params['headers']['Referer'] = 'https://www.wolframalpha.com/input/?i=' + query
77
+    params['headers']['Referer'] = referer_url.format(query=urlencode({'i': query}))
66 78
 
67 79
     return params
68 80
 
69 81
 
82
+# get additional pod
83
+# NOTE: this makes an additional request to the server, so the response will take longer and might reach the timeout
84
+def get_async_pod(url):
85
+    pod = {'subpods': []}
86
+
87
+    try:
88
+        resp = http_get(url, timeout=2.0)
89
+
90
+        resp_pod = XML(resp.content)
91
+        if resp_pod.xpath(success_xpath):
92
+
93
+            for subpod in resp_pod:
94
+                plaintext = subpod.xpath(plaintext_xpath)[0].text
95
+                if plaintext:
96
+                    pod['subpods'].append({'title': subpod.xpath(title_xpath)[0],
97
+                                           'plaintext': plaintext})
98
+                elif subpod.xpath(image_xpath):
99
+                    pod['subpods'].append({'title': subpod.xpath(title_xpath)[0],
100
+                                           'plaintext': '',
101
+                                           'img': {'src': subpod.xpath(img_src_xpath)[0],
102
+                                                   'alt': subpod.xpath(img_alt_xpath)[0]}})
103
+    except:
104
+        pass
105
+
106
+    return pod
107
+
108
+
70 109
 # get response from search-request
71 110
 def response(resp):
111
+    results = []
112
+
72 113
     resp_json = loads(resp.text)
73 114
 
74 115
     if not resp_json['queryresult']['success']:
@@ -76,20 +117,45 @@ def response(resp):
76 117
 
77 118
     # TODO handle resp_json['queryresult']['assumptions']
78 119
     result_chunks = []
120
+    infobox_title = None
79 121
     for pod in resp_json['queryresult']['pods']:
80 122
         pod_title = pod.get('title', '')
123
+
81 124
         if 'subpods' not in pod:
82
-            continue
125
+            # comment out this section if your requests always reach the timeout
126
+            if pod['async']:
127
+                result = get_async_pod(pod['async'])
128
+                if result:
129
+                    pod = result
130
+            else:
131
+                continue
132
+
133
+        # infobox title is input or text content on first pod
134
+        if pod_title.startswith('Input') or not infobox_title:
135
+            try:
136
+                infobox_title = pod['subpods'][0]['plaintext']
137
+            except:
138
+                infobox_title = ''
139
+                pass
140
+
83 141
         for subpod in pod['subpods']:
84
-            if 'img' in subpod:
85
-                result_chunks.append(u'<p>{0}<br /><img src="{1}" alt="{2}" /></p>'
86
-                                     .format(escape(pod_title or subpod['img']['alt']),
87
-                                             escape(subpod['img']['src']),
88
-                                             escape(subpod['img']['alt'])))
142
+            if subpod['plaintext'] != '' and pod_title not in image_pods:
143
+                # append only if it's an actual answer (skip the interactivity placeholder)
144
+                if subpod['plaintext'] != '(requires interactivity)':
145
+                    result_chunks.append({'label': pod_title, 'value': subpod['plaintext']})
146
+
147
+            elif 'img' in subpod:
148
+                result_chunks.append({'label': pod_title, 'image': subpod['img']})
89 149
 
90 150
     if not result_chunks:
91 151
         return []
92 152
 
93
-    return [{'url': resp.request.headers['Referer'].decode('utf-8'),
94
-             'title': 'Wolframalpha',
95
-             'content': ''.join(result_chunks)}]
153
+    results.append({'infobox': infobox_title,
154
+                    'attributes': result_chunks,
155
+                    'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer']}]})
156
+
157
+    results.append({'url': resp.request.headers['Referer'],
158
+                    'title': 'Wolfram|Alpha',
159
+                    'content': infobox_title})
160
+
161
+    return results

+ 3
- 3
searx/settings.yml 파일 보기

@@ -310,10 +310,10 @@ engines:
310 310
     shortcut : wa
311 311
     # You can use the engine using the official stable API, but you need an API key
312 312
     # See : http://products.wolframalpha.com/api/
313
-    #    engine : wolframalpha_api
314
-    #    api_key: 'apikey' # required!
313
+    # engine : wolframalpha_api
314
+    # api_key: '5952JX-X52L3VKWT8' # required!
315 315
     engine : wolframalpha_noapi
316
-    timeout: 6.0
316
+    timeout: 10.0
317 317
     categories : science
318 318
 
319 319
 #The blekko technology and team have joined IBM Watson! -> https://blekko.com/