Browse Source

[fix] wolframalpha page changes

related issues: #508 #509
Adam Tauber 9 years ago
parent
commit
d06178139f
2 changed files with 63 additions and 227 deletions
  1. 62
    56
      searx/engines/wolframalpha_noapi.py
  2. 1
    171
      tests/unit/engines/test_wolframalpha_noapi.py

+ 62
- 56
searx/engines/wolframalpha_noapi.py View File

@@ -8,79 +8,85 @@
8 8
 # @stable      no
9 9
 # @parse       answer
10 10
 
11
-from re import search, sub
11
+from cgi import escape
12 12
 from json import loads
13
+from time import time
13 14
 from urllib import urlencode
14
-from lxml import html
15
-import HTMLParser
15
+
16
+from searx.poolrequests import get as http_get
16 17
 
17 18
 # search-url
18
-url = 'http://www.wolframalpha.com/'
19
+url = 'https://www.wolframalpha.com/'
19 20
 search_url = url + 'input/?{query}'
20 21
 
22
+search_url = url + 'input/json.jsp'\
23
+    '?async=true'\
24
+    '&banners=raw'\
25
+    '&debuggingdata=false'\
26
+    '&format=image,plaintext,imagemap,minput,moutput'\
27
+    '&formattimeout=2'\
28
+    '&{query}'\
29
+    '&output=JSON'\
30
+    '&parsetimeout=2'\
31
+    '&proxycode={token}'\
32
+    '&scantimeout=0.5'\
33
+    '&sponsorcategories=true'\
34
+    '&statemethod=deploybutton'
35
+
21 36
 # xpath variables
22 37
 scripts_xpath = '//script'
23 38
 title_xpath = '//title'
24 39
 failure_xpath = '//p[attribute::class="pfail"]'
40
+token = {'value': '',
41
+         'last_updated': None}
42
+
43
+
44
+# it seems that WolframAlpha resets its token every hour
45
+def obtain_token():
46
+    update_time = time() - (time() % 3600)
47
+    token_response = http_get('https://www.wolframalpha.com/input/api/v1/code?ts=9999999999999999999', timeout=2.0)
48
+    token['value'] = loads(token_response.text)['code']
49
+    token['last_updated'] = update_time
50
+    return token
51
+
52
+
53
+obtain_token()
25 54
 
26 55
 
27 56
 # do search-request
28 57
 def request(query, params):
29
-    params['url'] = search_url.format(query=urlencode({'i': query}))
58
+    # obtain a new token if the last update was more than an hour ago
59
+    if time() - token['last_updated'] > 3600:
60
+        obtain_token()
61
+    params['url'] = search_url.format(query=urlencode({'input': query}), token=token['value'])
62
+    params['headers']['Referer'] = 'https://www.wolframalpha.com/input/?i=' + query
30 63
 
31 64
     return params
32 65
 
33 66
 
34 67
 # get response from search-request
35 68
 def response(resp):
36
-    results = []
37
-    line = None
38
-
39
-    dom = html.fromstring(resp.text)
40
-    scripts = dom.xpath(scripts_xpath)
41
-
42
-    # the answer is inside a js function
43
-    # answer can be located in different 'pods', although by default it should be in pod_0200
44
-    possible_locations = ['pod_0200\.push\((.*)',
45
-                          'pod_0100\.push\((.*)']
46
-
47
-    # failed result
48
-    if dom.xpath(failure_xpath):
49
-        return results
50
-
51
-    # get line that matches the pattern
52
-    for pattern in possible_locations:
53
-        for script in scripts:
54
-            try:
55
-                line = search(pattern, script.text_content()).group(1)
56
-                break
57
-            except AttributeError:
58
-                continue
59
-        if line:
60
-            break
61
-
62
-    if line:
63
-        # extract answer from json
64
-        answer = line[line.find('{'):line.rfind('}') + 1]
65
-        try:
66
-            answer = loads(answer)
67
-        except Exception:
68
-            answer = loads(answer.encode('unicode-escape'))
69
-        answer = answer['stringified']
70
-
71
-        # clean plaintext answer
72
-        h = HTMLParser.HTMLParser()
73
-        answer = h.unescape(answer.decode('unicode-escape'))
74
-        answer = sub(r'\\', '', answer)
75
-
76
-        results.append({'answer': answer})
77
-
78
-    # user input is in first part of title
79
-    title = dom.xpath(title_xpath)[0].text.encode('utf-8')
80
-    result_url = request(title[:-16], {})['url']
81
-
82
-    # append result
83
-    results.append({'url': result_url,
84
-                    'title': title.decode('utf-8')})
85
-
86
-    return results
69
+    resp_json = loads(resp.text)
70
+
71
+    if not resp_json['queryresult']['success']:
72
+        return []
73
+
74
+    # TODO handle resp_json['queryresult']['assumptions']
75
+    result_chunks = []
76
+    for pod in resp_json['queryresult']['pods']:
77
+        pod_title = pod.get('title', '')
78
+        if 'subpods' not in pod:
79
+            continue
80
+        for subpod in pod['subpods']:
81
+            if 'img' in subpod:
82
+                result_chunks.append(u'<p>{0}<br /><img src="{1}" alt="{2}" /></p>'
83
+                                     .format(escape(pod_title or subpod['img']['alt']),
84
+                                             escape(subpod['img']['src']),
85
+                                             escape(subpod['img']['alt'])))
86
+
87
+    if not result_chunks:
88
+        return []
89
+
90
+    return [{'url': resp.request.headers['Referer'],
91
+             'title': 'Wolframalpha',
92
+             'content': ''.join(result_chunks)}]

+ 1
- 171
tests/unit/engines/test_wolframalpha_noapi.py View File

@@ -1,6 +1,5 @@
1 1
 # -*- coding: utf-8 -*-
2 2
 from collections import defaultdict
3
-import mock
4 3
 from searx.engines import wolframalpha_noapi
5 4
 from searx.testing import SearxTestCase
6 5
 
@@ -21,173 +20,4 @@ class TestWolframAlphaNoAPIEngine(SearxTestCase):
21 20
         self.assertRaises(AttributeError, wolframalpha_noapi.response, [])
22 21
         self.assertRaises(AttributeError, wolframalpha_noapi.response, '')
23 22
         self.assertRaises(AttributeError, wolframalpha_noapi.response, '[]')
24
-
25
-        html = """
26
-        <!DOCTYPE html>
27
-            <title> Parangaricutirimícuaro - Wolfram|Alpha</title>
28
-            <meta charset="utf-8" />
29
-            <body>
30
-                <div id="closest">
31
-                    <p class="pfail">Wolfram|Alpha doesn't know how to interpret your input.</p>
32
-                    <div id="dtips">
33
-                        <div class="tip">
34
-                            <span class="tip-title">Tip:&nbsp;</span>
35
-                                Check your spelling, and use English
36
-                            <span class="tip-extra"></span>
37
-                        </div>
38
-                    </div>
39
-                </div>
40
-            </body>
41
-        </html>
42
-        """
43
-        # test failed query
44
-        response = mock.Mock(text=html)
45
-        self.assertEqual(wolframalpha_noapi.response(response), [])
46
-
47
-        html = """
48
-        <!DOCTYPE html>
49
-            <title> sqrt(-1) - Wolfram|Alpha</title>
50
-            <meta charset="utf-8" />
51
-            <body>
52
-                <script type="text/javascript">
53
-                  try {
54
-                    if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) {
55
-                      context.jsonArray.popups.pod_0100 = [];
56
-                    }
57
-                    context.jsonArray.popups.pod_0100.push( {"stringified": "sqrt(-1)","mInput": "","mOutput": ""});
58
-                  } catch(e) { }
59
-
60
-                  try {
61
-                    if (typeof context.jsonArray.popups.pod_0200 == "undefined" ) {
62
-                      context.jsonArray.popups.pod_0200 = [];
63
-                    }
64
-                    context.jsonArray.popups.pod_0200.push( {"stringified": "i","mInput": "","mOutput": ""});
65
-                  } catch(e) { }
66
-                </script>
67
-            </body>
68
-        </html>
69
-        """
70
-        # test plaintext
71
-        response = mock.Mock(text=html)
72
-        results = wolframalpha_noapi.response(response)
73
-        self.assertEqual(type(results), list)
74
-        self.assertEqual(len(results), 2)
75
-        self.assertEquals('i', results[0]['answer'])
76
-        self.assertIn('sqrt(-1) - Wolfram|Alpha', results[1]['title'])
77
-        self.assertEquals('http://www.wolframalpha.com/input/?i=+sqrt%28-1%29', results[1]['url'])
78
-
79
-        html = """
80
-        <!DOCTYPE html>
81
-            <title> integral 1/x - Wolfram|Alpha</title>
82
-            <meta charset="utf-8" />
83
-            <body>
84
-                <script type="text/javascript">
85
-                  try {
86
-                    if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) {
87
-                      context.jsonArray.popups.pod_0100 = [];
88
-                    }
89
-                    context.jsonArray.popups.pod_0100.push( {"stringified": "integral 1\/x dx = log(x)+constant"});
90
-                  } catch(e) { }
91
-                </script>
92
-            </body>
93
-        </html>
94
-        """
95
-        # test integral
96
-        response = mock.Mock(text=html)
97
-        results = wolframalpha_noapi.response(response)
98
-        self.assertEqual(type(results), list)
99
-        self.assertEqual(len(results), 2)
100
-        self.assertIn('log(x)+c', results[0]['answer'])
101
-        self.assertIn('integral 1/x - Wolfram|Alpha', results[1]['title'])
102
-        self.assertEquals('http://www.wolframalpha.com/input/?i=+integral+1%2Fx', results[1]['url'])
103
-
104
-        html = """
105
-        <!DOCTYPE html>
106
-            <title> &int;1&#x2f;x &#xf74c;x - Wolfram|Alpha</title>
107
-            <meta charset="utf-8" />
108
-            <body>
109
-                <script type="text/javascript">
110
-                  try {
111
-                    if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) {
112
-                      context.jsonArray.popups.pod_0100 = [];
113
-                    }
114
-                    context.jsonArray.popups.pod_0100.push( {"stringified": "integral 1\/x dx = log(x)+constant"});
115
-                  } catch(e) { }
116
-                </script>
117
-            </body>
118
-        </html>
119
-        """
120
-        # test input in mathematical notation
121
-        response = mock.Mock(text=html)
122
-        results = wolframalpha_noapi.response(response)
123
-        self.assertEqual(type(results), list)
124
-        self.assertEqual(len(results), 2)
125
-        self.assertIn('log(x)+c', results[0]['answer'])
126
-        self.assertIn('∫1/x x - Wolfram|Alpha'.decode('utf-8'), results[1]['title'])
127
-        self.assertEquals('http://www.wolframalpha.com/input/?i=+%E2%88%AB1%2Fx+%EF%9D%8Cx', results[1]['url'])
128
-
129
-        html = """
130
-        <!DOCTYPE html>
131
-            <title> 1 euro to yen - Wolfram|Alpha</title>
132
-            <meta charset="utf-8" />
133
-            <body>
134
-                <script type="text/javascript">
135
-                  try {
136
-                    if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) {
137
-                      context.jsonArray.popups.pod_0100 = [];
138
-                    }
139
-                  context.jsonArray.popups.pod_0100.push( {"stringified": "convert euro1  (euro) to Japanese yen"});
140
-                  } catch(e) { }
141
-
142
-                  try {
143
-                    if (typeof context.jsonArray.popups.pod_0200 == "undefined" ) {
144
-                      context.jsonArray.popups.pod_0200 = [];
145
-                    }
146
-                    context.jsonArray.popups.pod_0200.push( {"stringified": "&yen;130.5  (Japanese yen)"});
147
-                  } catch(e) { }
148
-                </script>
149
-            </body>
150
-        </html>
151
-        """
152
-        # test output with htmlentity
153
-        response = mock.Mock(text=html)
154
-        results = wolframalpha_noapi.response(response)
155
-        self.assertEqual(type(results), list)
156
-        self.assertEqual(len(results), 2)
157
-        self.assertIn('¥'.decode('utf-8'), results[0]['answer'])
158
-        self.assertIn('1 euro to yen - Wolfram|Alpha', results[1]['title'])
159
-        self.assertEquals('http://www.wolframalpha.com/input/?i=+1+euro+to+yen', results[1]['url'])
160
-
161
-        html = """
162
-        <!DOCTYPE html>
163
-            <title> distance from nairobi to kyoto in inches - Wolfram|Alpha</title>
164
-            <meta charset="utf-8" />
165
-            <body>
166
-                <script type="text/javascript">
167
-                  try {
168
-                    if (typeof context.jsonArray.popups.pod_0100 == "undefined" ) {
169
-                      context.jsonArray.popups.pod_0100 = [];
170
-                    }
171
-[...].pod_0100.push( {"stringified": "convert distance | from | Nairobi, Kenya\nto | Kyoto, Japan to inches"});
172
-                  } catch(e) { }
173
-
174
-                  try {
175
-                    if (typeof context.jsonArray.popups.pod_0200 == "undefined" ) {
176
-                      context.jsonArray.popups.pod_0200 = [];
177
-                    }
178
-pod_0200.push({"stringified": "4.295&times;10^8 inches","mOutput": "Quantity[4.295×10^8,&amp;quot;Inches&amp;quot;]"});
179
-
180
-                  } catch(e) { }
181
-                </script>
182
-            </body>
183
-        </html>
184
-        """
185
-        # test output with utf-8 character
186
-        response = mock.Mock(text=html)
187
-        results = wolframalpha_noapi.response(response)
188
-        self.assertEqual(type(results), list)
189
-        self.assertEqual(len(results), 2)
190
-        self.assertIn('4.295×10^8 inches'.decode('utf-8'), results[0]['answer'])
191
-        self.assertIn('distance from nairobi to kyoto in inches - Wolfram|Alpha', results[1]['title'])
192
-        self.assertEquals('http://www.wolframalpha.com/input/?i=+distance+from+nairobi+to+kyoto+in+inches',
193
-                          results[1]['url'])
23
+        # TODO