Browse Source

Merge pull request #357 from asciimoo/google_engine

[enh] google engine : parse map links and more
Adam Tauber 10 years ago
parent
commit
f2cbefeb54
1 changed files with 131 additions and 16 deletions
  1. 131
    16
      searx/engines/google.py

+ 131
- 16
searx/engines/google.py View File

8
 # @stable      no (HTML can change)
8
 # @stable      no (HTML can change)
9
 # @parse       url, title, content, suggestion
9
 # @parse       url, title, content, suggestion
10
 
10
 
11
+import re
11
 from urllib import urlencode
12
 from urllib import urlencode
12
 from urlparse import urlparse, parse_qsl
13
 from urlparse import urlparse, parse_qsl
13
 from lxml import html
14
 from lxml import html
78
     'TW': 'www.google.com.tw'  # Taiwan
79
     'TW': 'www.google.com.tw'  # Taiwan
79
 }
80
 }
80
 
81
 
82
+# osm
83
+url_map = 'https://www.openstreetmap.org/'\
84
+    + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M'
85
+
81
 # search-url
86
 # search-url
82
 search_path = '/search'
87
 search_path = '/search'
83
-maps_path = '/maps/'
84
-redirect_path = '/url'
85
-images_path = '/images'
86
 search_url = ('https://{hostname}' +
88
 search_url = ('https://{hostname}' +
87
               search_path +
89
               search_path +
88
               '?{query}&start={offset}&gbv=1')
90
               '?{query}&start={offset}&gbv=1')
89
 
91
 
92
+# other URLs
93
+map_hostname_start = 'maps.google.'
94
+maps_path = '/maps'
95
+redirect_path = '/url'
96
+images_path = '/images'
97
+
90
 # specific xpath variables
98
 # specific xpath variables
91
 results_xpath = '//li[@class="g"]'
99
 results_xpath = '//li[@class="g"]'
92
 url_xpath = './/h3/a/@href'
100
 url_xpath = './/h3/a/@href'
95
 content_misc_xpath = './/div[@class="f slp"]'
103
 content_misc_xpath = './/div[@class="f slp"]'
96
 suggestion_xpath = '//p[@class="_Bmc"]'
104
 suggestion_xpath = '//p[@class="_Bmc"]'
97
 
105
 
106
+# map : detail location
107
+map_address_xpath = './/div[@class="s"]//table//td[2]/span/text()'
108
+map_phone_xpath = './/div[@class="s"]//table//td[2]/span/span'
109
+map_website_url_xpath = 'h3[2]/a/@href'
110
+map_website_title_xpath = 'h3[2]'
111
+
112
+# map : near the location
113
+map_near = 'table[@class="ts"]//tr'
114
+map_near_title = './/h4'
115
+map_near_url = './/h4/a/@href'
116
+map_near_phone = './/span[@class="nobr"]'
117
+
118
+# images
98
 images_xpath = './/div/a'
119
 images_xpath = './/div/a'
99
 image_url_xpath = './@href'
120
 image_url_xpath = './@href'
100
 image_img_src_xpath = './img/@src'
121
 image_img_src_xpath = './img/@src'
101
 
122
 
123
+# property names
124
+# FIXME : no translation
125
+property_address = "Address"
126
+property_phone = "Phone number"
127
+
128
+# cookies
102
 pref_cookie = ''
129
 pref_cookie = ''
103
 nid_cookie = {}
130
 nid_cookie = {}
104
 
131
 
122
 
149
 
123
 # remove google-specific tracking-url
150
 # remove google-specific tracking-url
124
 def parse_url(url_string, google_hostname):
151
 def parse_url(url_string, google_hostname):
152
+    # sanity check
153
+    if url_string is None:
154
+        return url_string
155
+
156
+    # normal case
125
     parsed_url = urlparse(url_string)
157
     parsed_url = urlparse(url_string)
126
     if (parsed_url.netloc in [google_hostname, '']
158
     if (parsed_url.netloc in [google_hostname, '']
127
             and parsed_url.path == redirect_path):
159
             and parsed_url.path == redirect_path):
151
         if len(language_array) == 2:
183
         if len(language_array) == 2:
152
             country = language_array[1]
184
             country = language_array[1]
153
         else:
185
         else:
154
-            country = '  '
186
+            country = 'US'
155
         language = language_array[0] + ',' + language_array[0] + '-' + country
187
         language = language_array[0] + ',' + language_array[0] + '-' + country
156
 
188
 
157
     if use_locale_domain:
189
     if use_locale_domain:
196
         try:
228
         try:
197
             url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
229
             url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
198
             parsed_url = urlparse(url, google_hostname)
230
             parsed_url = urlparse(url, google_hostname)
199
-            if (parsed_url.netloc == google_hostname
200
-                and (parsed_url.path == search_path
201
-                     or parsed_url.path.startswith(maps_path))):
202
-                # remove the link to google news and google maps
203
-                # FIXME : sometimes the URL is https://maps.google.*/maps
204
-                # no consequence, the result trigger an exception after which is ignored
205
-                continue
231
+
232
+            # map result
233
+            if ((parsed_url.netloc == google_hostname and parsed_url.path.startswith(maps_path))
234
+               or (parsed_url.netloc.startswith(map_hostname_start))):
235
+                x = result.xpath(map_near)
236
+                if len(x) > 0:
237
+                    # map : near the location
238
+                    results = results + parse_map_near(parsed_url, x, google_hostname)
239
+                else:
240
+                    # map : detail about a location
241
+                    results = results + parse_map_detail(parsed_url, result, google_hostname)
242
+
243
+            # google news
244
+            elif (parsed_url.netloc == google_hostname
245
+                  and parsed_url.path == search_path):
246
+                # skipping news results
247
+                pass
206
 
248
 
207
             # images result
249
             # images result
208
-            if (parsed_url.netloc == google_hostname
209
-                    and parsed_url.path == images_path):
250
+            elif (parsed_url.netloc == google_hostname
251
+                  and parsed_url.path == images_path):
210
                 # only thumbnail image provided,
252
                 # only thumbnail image provided,
211
                 # so skipping image results
253
                 # so skipping image results
212
                 # results = results + parse_images(result, google_hostname)
254
                 # results = results + parse_images(result, google_hostname)
213
                 pass
255
                 pass
256
+
214
             else:
257
             else:
215
                 # normal result
258
                 # normal result
216
                 content = extract_text_from_dom(result, content_xpath)
259
                 content = extract_text_from_dom(result, content_xpath)
222
                 # append result
265
                 # append result
223
                 results.append({'url': url,
266
                 results.append({'url': url,
224
                                 'title': title,
267
                                 'title': title,
225
-                                'content': content})
226
-        except Exception:
268
+                                'content': content
269
+                                })
270
+        except:
227
             continue
271
             continue
228
 
272
 
229
     # parse suggestion
273
     # parse suggestion
246
                         'title': '',
290
                         'title': '',
247
                         'content': '',
291
                         'content': '',
248
                         'img_src': img_src,
292
                         'img_src': img_src,
249
-                        'template': 'images.html'})
293
+                        'template': 'images.html'
294
+                        })
295
+
296
+    return results
297
+
298
+
299
def parse_map_near(parsed_url, x, google_hostname):
    """Build one result per place listed in a "near the location" map block.

    :param parsed_url: parsed URL of the enclosing map result
        (not used by this function)
    :param x: iterable of DOM nodes, one per listed place
    :param google_hostname: hostname passed to parse_url together with each
        place link
    :returns: list of result dicts with 'title', 'url' and 'content' keys
    """
    results = []

    for result in x:
        title = extract_text_from_dom(result, map_near_title)
        url = parse_url(extract_text_from_dom(result, map_near_url), google_hostname)

        attributes = []
        phone = extract_text_from_dom(result, map_near_phone)
        # The rest of this module treats extracted text as possibly None, so
        # only build the tel: link when a phone number is actually present;
        # 'tel:' + None would raise TypeError and lose the whole entry.
        if phone:
            add_attributes(attributes, property_phone, phone, 'tel:' + phone)

        results.append({'title': title,
                        'url': url,
                        'content': attributes_to_html(attributes)
                        })

    return results
314
+
315
+
316
def parse_map_detail(parsed_url, result, google_hostname):
    """Build the result for a map block describing a single location.

    The coordinates are read from the map URL, either from its path
    (``@<lat>,<lon>,<zoom>``) or from its query (``ll=<lat>,<lon>&z=<zoom>``).
    When no coordinates are found, or when the block has no website link,
    no result is produced.

    :param parsed_url: parsed map URL; only ``.path`` and ``.query`` are read
    :param result: DOM node of the search result
    :param google_hostname: hostname passed to parse_url together with the
        website link
    :returns: list holding zero or one result dict with 'title', 'content'
        and 'url' keys
    """
    results = []

    # signed decimal number: latitudes and longitudes may be negative
    coord = r'(-?[0-9]+(?:\.[0-9]+)?)'

    # try to parse the geoloc
    m = re.search('@' + coord + ',' + coord + ',([0-9]+)', parsed_url.path)
    if m is None:
        m = re.search('ll=' + coord + ',' + coord + '&z=([0-9]+)', parsed_url.query)
    if m is None:
        # no geoloc : nothing to report
        return results

    # geoloc found (the zoom level, group 3, is not used)
    lat = float(m.group(1))
    lon = float(m.group(2))

    # attributes
    attributes = []
    address = extract_text_from_dom(result, map_address_xpath)
    phone = extract_text_from_dom(result, map_phone_xpath)
    add_attributes(attributes, property_address, address, 'geo:' + str(lat) + ',' + str(lon))
    # build the tel: link only when a phone number is present,
    # 'tel:' + None would raise TypeError
    if phone:
        add_attributes(attributes, property_phone, phone, 'tel:' + phone)

    # title / content / url
    website_title = extract_text_from_dom(result, map_website_title_xpath)
    content = extract_text_from_dom(result, content_xpath)
    website_url = parse_url(extract_text_from_dom(result, map_website_url_xpath), google_hostname)

    # add a result if there is a website
    if website_url is not None:
        description = content + '<br />' if content is not None else ''
        results.append({'title': website_title,
                        'content': description + attributes_to_html(attributes),
                        'url': website_url
                        })

    return results
351
+
352
+
353
def add_attributes(attributes, name, value, url):
    """Append a labelled attribute to ``attributes`` unless its value is empty.

    :param attributes: list to extend in place
    :param name: label of the attribute
    :param value: text of the attribute; None or '' means "nothing to add"
    :param url: link target for the value; when None or '' the entry is
        stored without a 'url' key
    """
    if not value:
        return

    entry = {'label': name, 'value': value}
    if url:
        entry['url'] = url
    attributes.append(entry)
356
+
357
+
358
def attributes_to_html(attributes):
    """Render a list of attributes as an HTML table, one row per attribute.

    :param attributes: list of dicts with 'label' and 'value' keys and an
        optional 'url' key; when a URL is present the value is rendered as
        a link to it
    :returns: the HTML markup of the table as a string
    """
    # local import : xml.sax.saxutils is in the standard library of both
    # Python 2 and Python 3
    from xml.sax.saxutils import escape

    rows = []
    for a in attributes:
        # labels, values and URLs are text, not markup : escape them so they
        # cannot inject HTML into the generated table
        value = escape(a.get('value'))
        url = a.get('url')
        if url:
            value = '<a href="' + escape(url, {'"': '&quot;'}) + '">' + value + '</a>'
        rows.append('<tr><th>' + escape(a.get('label')) + '</th><td>' + value + '</td></tr>')

    return '<table class="table table-striped">' + ''.join(rows) + '</table>'