Browse Source

[enh] infobox for wolframalpha

TODO:
    - infobox styles
    - unit tests

ISSUES:
    - The no_api version needs to make additional requests to the server for
      extra pods, such as plots, so it is even slower than before. If requests
      reach the timeout, either comment out the part that calls get_async_pod
      or increase the timeout in settings.yml.
a01200356 9 years ago
parent
commit
78d3f3d6b1
3 changed files with 150 additions and 46 deletions
  1. 61
    23
      searx/engines/wolframalpha_api.py
  2. 86
    20
      searx/engines/wolframalpha_noapi.py
  3. 3
    3
      searx/settings.yml

+ 61
- 23
searx/engines/wolframalpha_api.py View File

1
-# Wolfram Alpha (Maths)
1
+# Wolfram Alpha (Science)
2
 #
2
 #
3
-# @website     http://www.wolframalpha.com
4
-# @provide-api yes (http://api.wolframalpha.com/v2/)
3
+# @website     https://www.wolframalpha.com
4
+# @provide-api yes (https://api.wolframalpha.com/v2/)
5
 #
5
 #
6
 # @using-api   yes
6
 # @using-api   yes
7
 # @results     XML
7
 # @results     XML
8
 # @stable      yes
8
 # @stable      yes
9
-# @parse       result
9
+# @parse       url, infobox
10
 
10
 
11
 from urllib import urlencode
11
 from urllib import urlencode
12
 from lxml import etree
12
 from lxml import etree
13
-from re import search
14
 
13
 
15
 # search-url
14
 # search-url
16
-base_url = 'http://api.wolframalpha.com/v2/query'
17
-search_url = base_url + '?appid={api_key}&{query}&format=plaintext'
18
-site_url = 'http://www.wolframalpha.com/input/?{query}'
15
+search_url = 'https://api.wolframalpha.com/v2/query?appid={api_key}&{query}'
16
+site_url = 'https://www.wolframalpha.com/input/?{query}'
19
 api_key = ''  # defined in settings.yml
17
 api_key = ''  # defined in settings.yml
20
 
18
 
21
 # xpath variables
19
 # xpath variables
22
 failure_xpath = '/queryresult[attribute::success="false"]'
20
 failure_xpath = '/queryresult[attribute::success="false"]'
23
 answer_xpath = '//pod[attribute::primary="true"]/subpod/plaintext'
21
 answer_xpath = '//pod[attribute::primary="true"]/subpod/plaintext'
24
 input_xpath = '//pod[starts-with(attribute::title, "Input")]/subpod/plaintext'
22
 input_xpath = '//pod[starts-with(attribute::title, "Input")]/subpod/plaintext'
23
+pods_xpath = '//pod'
24
+subpods_xpath = './subpod'
25
+pod_title_xpath = './@title'
26
+plaintext_xpath = './plaintext'
27
+image_xpath = './img'
28
+img_src_xpath = './@src'
29
+img_alt_xpath = './@alt'
30
+
31
+# pods to display as image in infobox
32
+# these pods do return plaintext, but they look better and are more useful as images
33
+image_pods = {'Visual representation',
34
+              'Manipulatives illustration'}
25
 
35
 
26
 
36
 
27
 # do search-request
37
 # do search-request
28
 def request(query, params):
38
 def request(query, params):
29
     params['url'] = search_url.format(query=urlencode({'input': query}),
39
     params['url'] = search_url.format(query=urlencode({'input': query}),
30
                                       api_key=api_key)
40
                                       api_key=api_key)
41
+    params['headers']['Referer'] = site_url.format(query=urlencode({'i': query}))
31
 
42
 
32
     return params
43
     return params
33
 
44
 
34
 
45
 
35
 # replace private user area characters to make text legible
46
 # replace private user area characters to make text legible
36
 def replace_pua_chars(text):
47
 def replace_pua_chars(text):
37
-    pua_chars = {u'\uf74c': 'd',
48
+    pua_chars = {u'\uf522': u'\u2192',
49
+                 u'\uf7b1': u'\u2115',
50
+                 u'\uf7b4': u'\u211a',
51
+                 u'\uf7b5': u'\u211d',
52
+                 u'\uf7bd': u'\u2124',
53
+                 u'\uf74c': 'd',
38
                  u'\uf74d': u'\u212f',
54
                  u'\uf74d': u'\u212f',
39
                  u'\uf74e': 'i',
55
                  u'\uf74e': 'i',
40
                  u'\uf7d9': '='}
56
                  u'\uf7d9': '='}
55
     if search_results.xpath(failure_xpath):
71
     if search_results.xpath(failure_xpath):
56
         return []
72
         return []
57
 
73
 
58
-    # parse answers
59
-    answers = search_results.xpath(answer_xpath)
60
-    if answers:
61
-        for answer in answers:
62
-            answer = replace_pua_chars(answer.text)
74
+    infobox_title = search_results.xpath(input_xpath)
75
+    if infobox_title:
76
+        infobox_title = replace_pua_chars(infobox_title[0].text)
77
+
78
+    pods = search_results.xpath(pods_xpath)
79
+    result_chunks = []
80
+    for pod in pods:
81
+        pod_title = replace_pua_chars(pod.xpath(pod_title_xpath)[0])
82
+
83
+        subpods = pod.xpath(subpods_xpath)
84
+        if not subpods:
85
+            continue
86
+
87
+        for subpod in subpods:
88
+            content = subpod.xpath(plaintext_xpath)[0].text
89
+            image = subpod.xpath(image_xpath)
90
+            if content and pod_title not in image_pods:
91
+                content = replace_pua_chars(content)
92
+                result_chunks.append({'label': pod_title, 'value': content})
63
 
93
 
64
-            results.append({'answer': answer})
94
+                # if there's no input pod, infobox_title is content of first pod
95
+                if not infobox_title:
96
+                    infobox_title = content
97
+
98
+            elif image:
99
+                result_chunks.append({'label': pod_title,
100
+                                      'image': {'src': image[0].xpath(img_src_xpath)[0],
101
+                                                'alt': image[0].xpath(img_alt_xpath)[0]}})
102
+
103
+    if not result_chunks:
104
+        return []
65
 
105
 
66
-    # if there's no input section in search_results, check if answer has the input embedded (before their "=" sign)
67
-    try:
68
-        query_input = search_results.xpath(input_xpath)[0].text
69
-    except IndexError:
70
-        query_input = search(u'([^\uf7d9]+)', answers[0].text).group(1)
106
+    results.append({'infobox': infobox_title,
107
+                    'attributes': result_chunks,
108
+                    'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer']}]})
71
 
109
 
72
     # append link to site
110
     # append link to site
73
-    result_url = site_url.format(query=urlencode({'i': query_input.encode('utf-8')}))
74
-    results.append({'url': result_url,
75
-                    'title': query_input + " - Wolfram|Alpha"})
111
+    results.append({'url': resp.request.headers['Referer'],
112
+                    'title': 'Wolfram|Alpha',
113
+                    'content': infobox_title})
76
 
114
 
77
     return results
115
     return results

+ 86
- 20
searx/engines/wolframalpha_noapi.py View File

1
-# WolframAlpha (Maths)
1
+# Wolfram|Alpha (Science)
2
 #
2
 #
3
-# @website     http://www.wolframalpha.com/
4
-# @provide-api yes (http://api.wolframalpha.com/v2/)
3
+# @website     https://www.wolframalpha.com/
4
+# @provide-api yes (https://api.wolframalpha.com/v2/)
5
 #
5
 #
6
 # @using-api   no
6
 # @using-api   no
7
-# @results     HTML
7
+# @results     JSON
8
 # @stable      no
8
 # @stable      no
9
-# @parse       answer
9
+# @parse       url, infobox
10
 
10
 
11
 from cgi import escape
11
 from cgi import escape
12
 from json import loads
12
 from json import loads
13
 from time import time
13
 from time import time
14
 from urllib import urlencode
14
 from urllib import urlencode
15
+from lxml.etree import XML
15
 
16
 
16
 from searx.poolrequests import get as http_get
17
 from searx.poolrequests import get as http_get
17
 
18
 
18
 # search-url
19
 # search-url
19
 url = 'https://www.wolframalpha.com/'
20
 url = 'https://www.wolframalpha.com/'
20
-search_url = url + 'input/?{query}'
21
 
21
 
22
 search_url = url + 'input/json.jsp'\
22
 search_url = url + 'input/json.jsp'\
23
     '?async=true'\
23
     '?async=true'\
33
     '&sponsorcategories=true'\
33
     '&sponsorcategories=true'\
34
     '&statemethod=deploybutton'
34
     '&statemethod=deploybutton'
35
 
35
 
36
-# xpath variables
37
-scripts_xpath = '//script'
38
-title_xpath = '//title'
39
-failure_xpath = '//p[attribute::class="pfail"]'
36
+referer_url = url + 'input/?{query}'
37
+
40
 token = {'value': '',
38
 token = {'value': '',
41
          'last_updated': None}
39
          'last_updated': None}
42
 
40
 
41
+# xpath variables
42
+success_xpath = '/pod[attribute::error="false"]'
43
+plaintext_xpath = './plaintext'
44
+title_xpath = './@title'
45
+image_xpath = './img'
46
+img_src_xpath = './img/@src'
47
+img_alt_xpath = './img/@alt'
48
+
49
+# pods to display as image in infobox
50
+# these pods do return plaintext, but they look better and are more useful as images
51
+image_pods = {'Visual representation',
52
+              'Manipulatives illustration',
53
+              'Symbol'}
54
+
43
 
55
 
44
 # seems, wolframalpha resets its token in every hour
56
 # seems, wolframalpha resets its token in every hour
45
 def obtain_token():
57
 def obtain_token():
62
     if time() - token['last_updated'] > 3600:
74
     if time() - token['last_updated'] > 3600:
63
         obtain_token()
75
         obtain_token()
64
     params['url'] = search_url.format(query=urlencode({'input': query}), token=token['value'])
76
     params['url'] = search_url.format(query=urlencode({'input': query}), token=token['value'])
65
-    params['headers']['Referer'] = 'https://www.wolframalpha.com/input/?i=' + query
77
+    params['headers']['Referer'] = referer_url.format(query=urlencode({'i': query}))
66
 
78
 
67
     return params
79
     return params
68
 
80
 
69
 
81
 
82
+# get additional pod
83
+# NOTE: this makes an additional request to the server, so the response will take longer and might reach the timeout
84
+def get_async_pod(url):
85
+    pod = {'subpods': []}
86
+
87
+    try:
88
+        resp = http_get(url, timeout=2.0)
89
+
90
+        resp_pod = XML(resp.content)
91
+        if resp_pod.xpath(success_xpath):
92
+
93
+            for subpod in resp_pod:
94
+                plaintext = subpod.xpath(plaintext_xpath)[0].text
95
+                if plaintext:
96
+                    pod['subpods'].append({'title': subpod.xpath(title_xpath)[0],
97
+                                           'plaintext': plaintext})
98
+                elif subpod.xpath(image_xpath):
99
+                    pod['subpods'].append({'title': subpod.xpath(title_xpath)[0],
100
+                                           'plaintext': '',
101
+                                           'img': {'src': subpod.xpath(img_src_xpath)[0],
102
+                                                   'alt': subpod.xpath(img_alt_xpath)[0]}})
103
+    except:
104
+        pass
105
+
106
+    return pod
107
+
108
+
70
 # get response from search-request
109
 # get response from search-request
71
 def response(resp):
110
 def response(resp):
111
+    results = []
112
+
72
     resp_json = loads(resp.text)
113
     resp_json = loads(resp.text)
73
 
114
 
74
     if not resp_json['queryresult']['success']:
115
     if not resp_json['queryresult']['success']:
76
 
117
 
77
     # TODO handle resp_json['queryresult']['assumptions']
118
     # TODO handle resp_json['queryresult']['assumptions']
78
     result_chunks = []
119
     result_chunks = []
120
+    infobox_title = None
79
     for pod in resp_json['queryresult']['pods']:
121
     for pod in resp_json['queryresult']['pods']:
80
         pod_title = pod.get('title', '')
122
         pod_title = pod.get('title', '')
123
+
81
         if 'subpods' not in pod:
124
         if 'subpods' not in pod:
82
-            continue
125
+            # comment this section if your requests always reach timeout
126
+            if pod['async']:
127
+                result = get_async_pod(pod['async'])
128
+                if result:
129
+                    pod = result
130
+            else:
131
+                continue
132
+
133
+        # infobox title is input or text content on first pod
134
+        if pod_title.startswith('Input') or not infobox_title:
135
+            try:
136
+                infobox_title = pod['subpods'][0]['plaintext']
137
+            except:
138
+                infobox_title = ''
139
+                pass
140
+
83
         for subpod in pod['subpods']:
141
         for subpod in pod['subpods']:
84
-            if 'img' in subpod:
85
-                result_chunks.append(u'<p>{0}<br /><img src="{1}" alt="{2}" /></p>'
86
-                                     .format(escape(pod_title or subpod['img']['alt']),
87
-                                             escape(subpod['img']['src']),
88
-                                             escape(subpod['img']['alt'])))
142
+            if subpod['plaintext'] != '' and pod_title not in image_pods:
143
+                # append unless it's not an actual answer
144
+                if subpod['plaintext'] != '(requires interactivity)':
145
+                    result_chunks.append({'label': pod_title, 'value': subpod['plaintext']})
146
+
147
+            elif 'img' in subpod:
148
+                result_chunks.append({'label': pod_title, 'image': subpod['img']})
89
 
149
 
90
     if not result_chunks:
150
     if not result_chunks:
91
         return []
151
         return []
92
 
152
 
93
-    return [{'url': resp.request.headers['Referer'].decode('utf-8'),
94
-             'title': 'Wolframalpha',
95
-             'content': ''.join(result_chunks)}]
153
+    results.append({'infobox': infobox_title,
154
+                    'attributes': result_chunks,
155
+                    'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer']}]})
156
+
157
+    results.append({'url': resp.request.headers['Referer'],
158
+                    'title': 'Wolfram|Alpha',
159
+                    'content': infobox_title})
160
+
161
+    return results

+ 3
- 3
searx/settings.yml View File

310
     shortcut : wa
310
     shortcut : wa
311
     # You can use the engine using the official stable API, but you need an API key
311
     # You can use the engine using the official stable API, but you need an API key
312
     # See : http://products.wolframalpha.com/api/
312
     # See : http://products.wolframalpha.com/api/
313
-    #    engine : wolframalpha_api
314
-    #    api_key: 'apikey' # required!
313
+    # engine : wolframalpha_api
314
+    # api_key: '5952JX-X52L3VKWT8' # required!
315
     engine : wolframalpha_noapi
315
     engine : wolframalpha_noapi
316
-    timeout: 6.0
316
+    timeout: 10.0
317
     categories : science
317
     categories : science
318
 
318
 
319
 #The blekko technology and team have joined IBM Watson! -> https://blekko.com/
319
 #The blekko technology and team have joined IBM Watson! -> https://blekko.com/