Przeglądaj źródła

Merge pull request #357 from asciimoo/google_engine

[enh] google engine : parse map links and more
Adam Tauber 9 lat temu
rodzic
commit
f2cbefeb54
1 zmieniony plik: 131 dodań i 16 usunięć
  1. searx/engines/google.py (+131, -16)

+ 131
- 16
searx/engines/google.py Wyświetl plik

@@ -8,6 +8,7 @@
8 8
 # @stable      no (HTML can change)
9 9
 # @parse       url, title, content, suggestion
10 10
 
11
+import re
11 12
 from urllib import urlencode
12 13
 from urlparse import urlparse, parse_qsl
13 14
 from lxml import html
@@ -78,15 +79,22 @@ country_to_hostname = {
78 79
     'TW': 'www.google.com.tw'  # Taiwan
79 80
 }
80 81
 
82
+# osm
83
+url_map = 'https://www.openstreetmap.org/'\
84
+    + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M'
85
+
81 86
 # search-url
82 87
 search_path = '/search'
83
-maps_path = '/maps/'
84
-redirect_path = '/url'
85
-images_path = '/images'
86 88
 search_url = ('https://{hostname}' +
87 89
               search_path +
88 90
               '?{query}&start={offset}&gbv=1')
89 91
 
92
+# other URLs
93
+map_hostname_start = 'maps.google.'
94
+maps_path = '/maps'
95
+redirect_path = '/url'
96
+images_path = '/images'
97
+
90 98
 # specific xpath variables
91 99
 results_xpath = '//li[@class="g"]'
92 100
 url_xpath = './/h3/a/@href'
@@ -95,10 +103,29 @@ content_xpath = './/span[@class="st"]'
95 103
 content_misc_xpath = './/div[@class="f slp"]'
96 104
 suggestion_xpath = '//p[@class="_Bmc"]'
97 105
 
106
+# map : detail location
107
+map_address_xpath = './/div[@class="s"]//table//td[2]/span/text()'
108
+map_phone_xpath = './/div[@class="s"]//table//td[2]/span/span'
109
+map_website_url_xpath = 'h3[2]/a/@href'
110
+map_website_title_xpath = 'h3[2]'
111
+
112
+# map : near the location
113
+map_near = 'table[@class="ts"]//tr'
114
+map_near_title = './/h4'
115
+map_near_url = './/h4/a/@href'
116
+map_near_phone = './/span[@class="nobr"]'
117
+
118
+# images
98 119
 images_xpath = './/div/a'
99 120
 image_url_xpath = './@href'
100 121
 image_img_src_xpath = './img/@src'
101 122
 
123
+# property names
124
+# FIXME : no translation
125
+property_address = "Address"
126
+property_phone = "Phone number"
127
+
128
+# cookies
102 129
 pref_cookie = ''
103 130
 nid_cookie = {}
104 131
 
@@ -122,6 +149,11 @@ def get_google_nid_cookie(google_hostname):
122 149
 
123 150
 # remove google-specific tracking-url
124 151
 def parse_url(url_string, google_hostname):
152
+    # sanity check
153
+    if url_string is None:
154
+        return url_string
155
+
156
+    # normal case
125 157
     parsed_url = urlparse(url_string)
126 158
     if (parsed_url.netloc in [google_hostname, '']
127 159
             and parsed_url.path == redirect_path):
@@ -151,7 +183,7 @@ def request(query, params):
151 183
         if len(language_array) == 2:
152 184
             country = language_array[1]
153 185
         else:
154
-            country = '  '
186
+            country = 'US'
155 187
         language = language_array[0] + ',' + language_array[0] + '-' + country
156 188
 
157 189
     if use_locale_domain:
@@ -196,21 +228,32 @@ def response(resp):
196 228
         try:
197 229
             url = parse_url(extract_url(result.xpath(url_xpath), google_url), google_hostname)
198 230
             parsed_url = urlparse(url, google_hostname)
199
-            if (parsed_url.netloc == google_hostname
200
-                and (parsed_url.path == search_path
201
-                     or parsed_url.path.startswith(maps_path))):
202
-                # remove the link to google news and google maps
203
-                # FIXME : sometimes the URL is https://maps.google.*/maps
204
-                # no consequence, the result trigger an exception after which is ignored
205
-                continue
231
+
232
+            # map result
233
+            if ((parsed_url.netloc == google_hostname and parsed_url.path.startswith(maps_path))
234
+               or (parsed_url.netloc.startswith(map_hostname_start))):
235
+                x = result.xpath(map_near)
236
+                if len(x) > 0:
237
+                    # map : near the location
238
+                    results = results + parse_map_near(parsed_url, x, google_hostname)
239
+                else:
240
+                    # map : detail about a location
241
+                    results = results + parse_map_detail(parsed_url, result, google_hostname)
242
+
243
+            # google news
244
+            elif (parsed_url.netloc == google_hostname
245
+                  and parsed_url.path == search_path):
246
+                # skipping news results
247
+                pass
206 248
 
207 249
             # images result
208
-            if (parsed_url.netloc == google_hostname
209
-                    and parsed_url.path == images_path):
250
+            elif (parsed_url.netloc == google_hostname
251
+                  and parsed_url.path == images_path):
210 252
                 # only thumbnail image provided,
211 253
                 # so skipping image results
212 254
                 # results = results + parse_images(result, google_hostname)
213 255
                 pass
256
+
214 257
             else:
215 258
                 # normal result
216 259
                 content = extract_text_from_dom(result, content_xpath)
@@ -222,8 +265,9 @@ def response(resp):
222 265
                 # append result
223 266
                 results.append({'url': url,
224 267
                                 'title': title,
225
-                                'content': content})
226
-        except Exception:
268
+                                'content': content
269
+                                })
270
+        except:
227 271
             continue
228 272
 
229 273
     # parse suggestion
@@ -246,6 +290,77 @@ def parse_images(result, google_hostname):
246 290
                         'title': '',
247 291
                         'content': '',
248 292
                         'img_src': img_src,
249
-                        'template': 'images.html'})
293
+                        'template': 'images.html'
294
+                        })
295
+
296
+    return results
297
+
298
+
299
+def parse_map_near(parsed_url, x, google_hostname):
300
+    results = []
301
+
302
+    for result in x:
303
+        title = extract_text_from_dom(result, map_near_title)
304
+        url = parse_url(extract_text_from_dom(result, map_near_url), google_hostname)
305
+        attributes = []
306
+        phone = extract_text_from_dom(result, map_near_phone)
307
+        add_attributes(attributes, property_phone, phone, 'tel:' + phone)
308
+        results.append({'title': title,
309
+                        'url': url,
310
+                        'content': attributes_to_html(attributes)
311
+                        })
250 312
 
251 313
     return results
314
+
315
+
316
+def parse_map_detail(parsed_url, result, google_hostname):
317
+    results = []
318
+
319
+    # try to parse the geoloc
320
+    m = re.search('@([0-9\.]+),([0-9\.]+),([0-9]+)', parsed_url.path)
321
+    if m is None:
322
+        m = re.search('ll\=([0-9\.]+),([0-9\.]+)\&z\=([0-9]+)', parsed_url.query)
323
+
324
+    if m is not None:
325
+        # geoloc found (ignored)
326
+        lon = float(m.group(2))  # noqa
327
+        lat = float(m.group(1))  # noqa
328
+        zoom = int(m.group(3))  # noqa
329
+
330
+        # attributes
331
+        attributes = []
332
+        address = extract_text_from_dom(result, map_address_xpath)
333
+        phone = extract_text_from_dom(result, map_phone_xpath)
334
+        add_attributes(attributes, property_address, address, 'geo:' + str(lat) + ',' + str(lon))
335
+        add_attributes(attributes, property_phone, phone, 'tel:' + phone)
336
+
337
+        # title / content / url
338
+        website_title = extract_text_from_dom(result, map_website_title_xpath)
339
+        content = extract_text_from_dom(result, content_xpath)
340
+        website_url = parse_url(extract_text_from_dom(result, map_website_url_xpath), google_hostname)
341
+
342
+        # add a result if there is a website
343
+        if website_url is not None:
344
+            results.append({'title': website_title,
345
+                            'content': (content + '<br />' if content is not None else '')
346
+                            + attributes_to_html(attributes),
347
+                            'url': website_url
348
+                            })
349
+
350
+    return results
351
+
352
+
353
+def add_attributes(attributes, name, value, url):
354
+    if value is not None and len(value) > 0:
355
+        attributes.append({'label': name, 'value': value, 'url': url})
356
+
357
+
358
+def attributes_to_html(attributes):
359
+    retval = '<table class="table table-striped">'
360
+    for a in attributes:
361
+        value = a.get('value')
362
+        if 'url' in a:
363
+            value = '<a href="' + a.get('url') + '">' + value + '</a>'
364
+        retval = retval + '<tr><th>' + a.get('label') + '</th><td>' + value + '</td></tr>'
365
+    retval = retval + '</table>'
366
+    return retval