Merge pull request #152 from pointhi/search_engines

[enh] add photon engine, and fix pep8 errors
Adam Tauber, 10 years ago
commit 813247b37a

searx/engines/bing_news.py (+8 -3)

@@ -57,12 +57,16 @@
         link = result.xpath('.//div[@class="newstitle"]/a')[0]
         url = link.attrib.get('href')
         title = ' '.join(link.xpath('.//text()'))
-        contentXPath = result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]//text()')
+        contentXPath = result.xpath('.//div[@class="sn_txt"]/div'
+                                    '//span[@class="sn_snip"]//text()')
         if contentXPath is not None:
             content = escape(' '.join(contentXPath))
 
         # parse publishedDate
-        publishedDateXPath = result.xpath('.//div[@class="sn_txt"]/div//span[contains(@class,"sn_ST")]//span[contains(@class,"sn_tm")]//text()')
+        publishedDateXPath = result.xpath('.//div[@class="sn_txt"]/div'
+                                          '//span[contains(@class,"sn_ST")]'
+                                          '//span[contains(@class,"sn_tm")]'
+                                          '//text()')
         if publishedDateXPath is not None:
             publishedDate = escape(' '.join(publishedDateXPath))
 
@@ -74,7 +78,8 @@
             timeNumbers = re.findall(r'\d+', publishedDate)
             publishedDate = datetime.now()\
                 - timedelta(hours=int(timeNumbers[0]))
-        elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
+        elif re.match("^[0-9]+ hour(s|),"
+                      " [0-9]+ minute(s|) ago$", publishedDate):
             timeNumbers = re.findall(r'\d+', publishedDate)
             publishedDate = datetime.now()\
                 - timedelta(hours=int(timeNumbers[0]))\
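The wrapped XPath literals rely on Python's implicit concatenation of adjacent string literals, so the split changes only the layout, never the string value. A minimal sketch of the pattern (variable names are illustrative only):

    # Adjacent literals inside parentheses are joined at compile time,
    # so both spellings below produce the identical XPath expression.
    xpath_one_line = './/div[@class="sn_txt"]/div//span[@class="sn_snip"]//text()'
    xpath_wrapped = ('.//div[@class="sn_txt"]/div'
                     '//span[@class="sn_snip"]//text()')
    assert xpath_one_line == xpath_wrapped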

searx/engines/faroo.py (+11 -4)

@@ -22,10 +22,17 @@
 
 # search-url
 url = 'http://www.faroo.com/'
-search_url = url + 'api?{query}&start={offset}&length={number_of_results}&l={language}&src={categorie}&i=false&f=json&key={api_key}'
+search_url = url + 'api?{query}'\
+                      '&start={offset}'\
+                      '&length={number_of_results}'\
+                      '&l={language}'\
+                      '&src={categorie}'\
+                      '&i=false'\
+                      '&f=json'\
+                      '&key={api_key}'  # noqa
 
 search_category = {'general': 'web',
-                'news': 'news'}
+                   'news': 'news'}
 
 
 # do search-request
@@ -80,8 +87,8 @@
     # parse results
     for result in search_res['results']:
         if result['news']:
-            # timestamp (how many milliseconds have passed between now and the beginning of 1970)
-            publishedDate = datetime.datetime.fromtimestamp(result['date']/1000.0)
+            # timestamp (milliseconds since 1970)
+            publishedDate = datetime.datetime.fromtimestamp(result['date']/1000.0)  # noqa
 
             # append news result
             results.append({'url': result['url'],
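The shortened comment still captures the conversion: Faroo's 'date' field is a Unix timestamp in milliseconds, so dividing by 1000.0 yields the float seconds that fromtimestamp() expects. A quick sketch with a made-up value:

    import datetime

    result_date_ms = 1419028245000  # hypothetical millisecond timestamp
    publishedDate = datetime.datetime.fromtimestamp(result_date_ms / 1000.0)
    print(publishedDate.isoformat())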

searx/engines/google_images.py (+1 -1)

@@ -9,7 +9,7 @@
 # @stable      yes (but deprecated)
 # @parse       url, title, img_src
 
-from urllib import urlencode,unquote
+from urllib import urlencode, unquote
 from json import loads
 
 # engine dependent config

searx/engines/kickass.py (+4 -4)

@@ -1,8 +1,8 @@
 ## Kickass Torrent (Videos, Music, Files)
-# 
+#
 # @website     https://kickass.so
 # @provide-api no (nothing found)
-# 
+#
 # @using-api   no
 # @results     HTML (using search portal)
 # @stable      yes (HTML can change)
@@ -13,7 +13,6 @@
 from urllib import quote
 from lxml import html
 from operator import itemgetter
-from dateutil import parser
 
 # engine dependent config
 categories = ['videos', 'music', 'files']
@@ -33,7 +32,8 @@
     params['url'] = search_url.format(search_term=quote(query),
                                       pageno=params['pageno'])
 
-    # FIX: SSLError: hostname 'kickass.so' doesn't match either of '*.kickass.to', 'kickass.to'
+    # FIX: SSLError: hostname 'kickass.so'
+    # doesn't match either of '*.kickass.to', 'kickass.to'
     params['verify'] = False
 
     return params
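Splitting the comment leaves the behaviour untouched: params['verify'] = False works around the mismatched kickass.so certificate, at the cost of man-in-the-middle protection, and searx forwards it to the requests call it makes for the engine. A roughly equivalent standalone call (URL shortened for illustration):

    import requests

    # verify=False skips TLS certificate validation entirely
    resp = requests.get('https://kickass.so/', verify=False)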

searx/engines/mediawiki.py (+7 -4)

@@ -28,15 +28,17 @@
                                  '&srprop=timestamp'\
                                  '&format=json'\
                                  '&sroffset={offset}'\
-                                 '&srlimit={limit}'
+                                 '&srlimit={limit}'     # noqa
 
 
 # do search-request
 def request(query, params):
     offset = (params['pageno'] - 1) * number_of_results
+
     string_args = dict(query=urlencode({'srsearch': query}),
-                        offset=offset,
-                        limit=number_of_results)
+                       offset=offset,
+                       limit=number_of_results)
+
     format_strings = list(Formatter().parse(base_url))
 
     if params['language'] == 'all':
@@ -67,7 +69,8 @@
 
     # parse results
     for result in search_results['query']['search']:
-        url = base_url.format(language=resp.search_params['language']) + 'wiki/' + quote(result['title'].replace(' ', '_').encode('utf-8'))
+        url = base_url.format(language=resp.search_params['language']) +\
+            'wiki/' + quote(result['title'].replace(' ', '_').encode('utf-8'))
 
         # append result
         results.append({'url': url,
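For context on the unchanged format_strings line: string.Formatter().parse() tokenizes a format string into (literal, field_name, format_spec, conversion) tuples, which presumably lets the engine detect whether base_url carries a {language} placeholder before filling it in. A small sketch with a hypothetical base_url:

    from string import Formatter

    base_url = 'https://{language}.wikipedia.org/'  # hypothetical value
    fields = [f for _, f, _, _ in Formatter().parse(base_url) if f]
    print(fields)  # ['language']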

searx/engines/openstreetmap.py (+9 -5)

@@ -9,20 +9,24 @@
 # @parse       url, title
 
 from json import loads
+from searx.utils import searx_useragent
 
 # engine dependent config
 categories = ['map']
 paging = False
 
 # search-url
-url = 'https://nominatim.openstreetmap.org/search/{query}?format=json&polygon_geojson=1&addressdetails=1'
-
+base_url = 'https://nominatim.openstreetmap.org/'
+search_string = 'search/{query}?format=json&polygon_geojson=1&addressdetails=1'
 result_base_url = 'https://openstreetmap.org/{osm_type}/{osm_id}'
 
 
 # do search-request
 def request(query, params):
-    params['url'] = url.format(query=query)
+    params['url'] = base_url + search_string.format(query=query)
+
+    # using searx User-Agent
+    params['headers']['User-Agent'] = searx_useragent()
 
     return params
 
@@ -68,8 +72,8 @@
             address.update({'house_number': address_raw.get('house_number'),
                            'road': address_raw.get('road'),
                            'locality': address_raw.get('city',
-                                       address_raw.get('town',
-                                       address_raw.get('village'))),
+                                       address_raw.get('town',          # noqa
+                                       address_raw.get('village'))),    # noqa
                            'postcode': address_raw.get('postcode'),
                            'country': address_raw.get('country'),
                            'country_code': address_raw.get('country_code')})
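The # noqa markers keep the nested get() fallback readable instead of forcing a rewrite: the chain resolves city, then town, then village, taking the first key present. Illustrated with sample data:

    address_raw = {'town': 'Gmunden', 'country': 'Austria'}  # sample data
    locality = address_raw.get('city',
                               address_raw.get('town',
                               address_raw.get('village')))
    print(locality)  # 'Gmunden'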

searx/engines/photon.py (+128 -0, new file)

@@ -0,0 +1,128 @@
+## Photon (Map)
+#
+# @website     https://photon.komoot.de
+# @provide-api yes (https://photon.komoot.de/)
+#
+# @using-api   yes
+# @results     JSON
+# @stable      yes
+# @parse       url, title
+
+from urllib import urlencode
+from json import loads
+from searx.utils import searx_useragent
+
+# engine dependent config
+categories = ['map']
+paging = False
+language_support = True
+number_of_results = 10
+
+# search-url
+base_url = 'https://photon.komoot.de/'
+search_string = 'api/?{query}&limit={limit}'
+result_base_url = 'https://openstreetmap.org/{osm_type}/{osm_id}'
+
+
+# do search-request
+def request(query, params):
+    params['url'] = base_url +\
+        search_string.format(query=urlencode({'q': query}),
+                             limit=number_of_results)
+
+    if params['language'] != 'all':
+        params['url'] = params['url'] +\
+            "&lang=" + params['language'].replace('_', '-')
+
+    # using searx User-Agent
+    params['headers']['User-Agent'] = searx_useragent()
+
+    # FIX: SSLError: SSL3_GET_SERVER_CERTIFICATE:certificate verify failed
+    params['verify'] = False
+
+    return params
+
+
+# get response from search-request
+def response(resp):
+    results = []
+    json = loads(resp.text)
+
+    # parse results
+    for r in json.get('features', {}):
+
+        properties = r.get('properties')
+
+        if not properties:
+            continue
+
+        # get title
+        title = properties['name']
+
+        # get osm-type
+        if properties.get('osm_type') == 'N':
+            osm_type = 'node'
+        elif properties.get('osm_type') == 'W':
+            osm_type = 'way'
+        elif properties.get('osm_type') == 'R':
+            osm_type = 'relation'
+        else:
+            # continue if invalid osm-type
+            continue
+
+        url = result_base_url.format(osm_type=osm_type,
+                                     osm_id=properties.get('osm_id'))
+
+        osm = {'type': osm_type,
+               'id': properties.get('osm_id')}
+
+        geojson = r.get('geometry')
+
+        if properties.get('extent'):
+            boundingbox = [properties.get('extent')[3],
+                           properties.get('extent')[1],
+                           properties.get('extent')[0],
+                           properties.get('extent')[2]]
+        else:
+            # TODO: better boundingbox calculation
+            boundingbox = [geojson['coordinates'][1],
+                           geojson['coordinates'][1],
+                           geojson['coordinates'][0],
+                           geojson['coordinates'][0]]
+
+        # address calculation
+        address = {}
+
+        # get name
+        if properties.get('osm_key') == 'amenity' or\
+           properties.get('osm_key') == 'shop' or\
+           properties.get('osm_key') == 'tourism' or\
+           properties.get('osm_key') == 'leisure':
+            address = {'name': properties.get('name')}
+
+        # add rest of address data, if something is already found
+        if address.get('name'):
+            address.update({'house_number': properties.get('housenumber'),
+                           'road': properties.get('street'),
+                           'locality': properties.get('city',
+                                       properties.get('town',           # noqa
+                                       properties.get('village'))),     # noqa
+                           'postcode': properties.get('postcode'),
+                           'country': properties.get('country')})
+        else:
+            address = None
+
+        # append result
+        results.append({'template': 'map.html',
+                        'title': title,
+                        'content': '',
+                        'longitude': geojson['coordinates'][0],
+                        'latitude': geojson['coordinates'][1],
+                        'boundingbox': boundingbox,
+                        'geojson': geojson,
+                        'address': address,
+                        'osm': osm,
+                        'url': url})
+
+    # return results
+    return results
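A rough usage sketch, not part of the commit, of how the new engine fits searx's request/response contract (it assumes a searx checkout on the import path; the Resp stub and its sample JSON are invented and only mimic the one attribute, resp.text, that response() reads):

    from searx.engines import photon

    params = {'language': 'de_DE', 'headers': {}}
    params = photon.request('berlin', params)
    print(params['url'])
    # https://photon.komoot.de/api/?q=berlin&limit=10&lang=de-DE

    class Resp(object):
        # minimal stand-in for the real HTTP response object
        text = ('{"features": [{"properties": {"name": "Berlin",'
                ' "osm_type": "N", "osm_id": 240109189},'
                ' "geometry": {"coordinates": [13.38, 52.51]}}]}')

    results = photon.response(Resp())
    print(results[0]['url'])  # https://openstreetmap.org/node/240109189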

searx/engines/soundcloud.py (+6 -1)

@@ -20,7 +20,12 @@
 
 # search-url
 url = 'https://api.soundcloud.com/'
-search_url = url + 'search?{query}&facet=model&limit=20&offset={offset}&linked_partitioning=1&client_id={client_id}'
+search_url = url + 'search?{query}'\
+                         '&facet=model'\
+                         '&limit=20'\
+                         '&offset={offset}'\
+                         '&linked_partitioning=1'\
+                         '&client_id={client_id}'   # noqa
 
 
 # do search-request

searx/engines/yacy.py (+17 -12)

@@ -24,7 +24,11 @@
 
 # search-url
 base_url = 'http://localhost:8090'
-search_url = '/yacysearch.json?{query}&startRecord={offset}&maximumRecords={limit}&contentdom={search_type}&resource=global'
+search_url = '/yacysearch.json?{query}'\
+                             '&startRecord={offset}'\
+                             '&maximumRecords={limit}'\
+                             '&contentdom={search_type}'\
+                             '&resource=global'             # noqa
 
 # yacy specific type-definitions
 search_types = {'general': 'text',
@@ -39,10 +43,11 @@
     offset = (params['pageno'] - 1) * number_of_results
     search_type = search_types.get(params['category'], '0')
 
-    params['url'] = base_url + search_url.format(query=urlencode({'query': query}),
-                                                 offset=offset,
-                                                 limit=number_of_results,
-                                                 search_type=search_type)
+    params['url'] = base_url +\
+        search_url.format(query=urlencode({'query': query}),
+                          offset=offset,
+                          limit=number_of_results,
+                          search_type=search_type)
 
     # add language tag if specified
     if params['language'] != 'all':
@@ -70,19 +75,19 @@
 
             # append result
             results.append({'url': result['link'],
-                        'title': result['title'],
-                        'content': result['description'],
-                        'publishedDate': publishedDate})
+                            'title': result['title'],
+                            'content': result['description'],
+                            'publishedDate': publishedDate})
 
     elif resp.search_params['category'] == 'images':
         # parse image results
         for result in search_results:
             # append result
             results.append({'url': result['url'],
-                        'title': result['title'],
-                        'content': '',
-                        'img_src': result['image'],
-                        'template': 'images.html'})
+                            'title': result['title'],
+                            'content': '',
+                            'img_src': result['image'],
+                            'template': 'images.html'})
 
     #TODO parse video, audio and file results
 

searx/engines/yahoo.py (+5 -4)

@@ -20,7 +20,8 @@
 language_support = True
 
 # search-url
-search_url = 'https://search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}'
+base_url = 'https://search.yahoo.com/'
+search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}'
 
 # specific xpath variables
 results_xpath = '//div[@class="res"]'
@@ -57,9 +58,9 @@
     else:
         language = params['language'].split('_')[0]
 
-    params['url'] = search_url.format(offset=offset,
-                                      query=urlencode({'p': query}),
-                                      lang=language)
+    params['url'] = base_url + search_url.format(offset=offset,
+                                                 query=urlencode({'p': query}),
+                                                 lang=language)
 
     # TODO required?
     params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\

searx/settings.yml (+4 -0)

@@ -95,6 +95,10 @@
     engine : openstreetmap
     shortcut : osm
 
+  - name : photon
+    engine : photon
+    shortcut : ph
+
 #  - name : piratebay
 #    engine : piratebay
 #    shortcut : tpb
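With these four lines photon is active by default: it serves the map category alongside openstreetmap, and the shortcut makes it directly addressable through searx's bang syntax, e.g. a query of !ph heidelberg (assuming the default shortcut handling).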