
Merge pull request #165 from Cqoicebordel/Moar-engines

Moar engines
Adam Tauber, 10 years ago
commit 469e08881e

searx/engines/500px.py (+2 / -2)

@@ -35,9 +35,9 @@
 # get response from search-request
 def response(resp):
     results = []
-    
+
     dom = html.fromstring(resp.text)
-    
+
     # parse results
     for result in dom.xpath('//div[@class="photo"]'):
         link = result.xpath('.//a')[0]

searx/engines/__init__.py (+2 / -2)

@@ -81,7 +81,7 @@
         if engine_attr.startswith('_'):
             continue
         if getattr(engine, engine_attr) is None:
-            print('[E] Engine config error: Missing attribute "{0}.{1}"'\
+            print('[E] Engine config error: Missing attribute "{0}.{1}"'
                   .format(engine.name, engine_attr))
             sys.exit(1)
 
@@ -102,7 +102,7 @@
     if engine.shortcut:
         # TODO check duplications
         if engine.shortcut in engine_shortcuts:
-            print('[E] Engine config error: ambigious shortcut: {0}'\
+            print('[E] Engine config error: ambigious shortcut: {0}'
                   .format(engine.shortcut))
             sys.exit(1)
         engine_shortcuts[engine.shortcut] = engine.name

searx/engines/digg.py (+67 / -0, new file)

@@ -0,0 +1,67 @@
+## Digg (News, Social media)
+#
+# @website     https://digg.com/
+# @provide-api no
+#
+# @using-api   no
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, content, publishedDate, thumbnail
+
+from urllib import quote_plus
+from json import loads
+from lxml import html
+from cgi import escape
+from dateutil import parser
+
+# engine dependent config
+categories = ['news', 'social media']
+paging = True
+
+# search-url
+base_url = 'https://digg.com/'
+search_url = base_url+'api/search/{query}.json?position={position}&format=html'
+
+# specific xpath variables
+results_xpath = '//article'
+link_xpath = './/small[@class="time"]//a'
+title_xpath = './/h2//a//text()'
+content_xpath = './/p//text()'
+pubdate_xpath = './/time'
+
+
+# do search-request
+def request(query, params):
+    offset = (params['pageno'] - 1) * 10
+    params['url'] = search_url.format(position=offset,
+                                      query=quote_plus(query))
+    return params
+
+
+# get response from search-request
+def response(resp):
+    results = []
+
+    search_result = loads(resp.text)
+
+    dom = html.fromstring(search_result['html'])
+
+    # parse results
+    for result in dom.xpath(results_xpath):
+        url = result.attrib.get('data-contenturl')
+        thumbnail = result.xpath('.//img')[0].attrib.get('src')
+        title = ''.join(result.xpath(title_xpath))
+        content = escape(''.join(result.xpath(content_xpath)))
+        pubdate = result.xpath(pubdate_xpath)[0].attrib.get('datetime')
+        publishedDate = parser.parse(pubdate)
+
+        # append result
+        results.append({'url': url,
                        'title': title,
+                        'content': content,
+                        'template': 'videos.html',
+                        'publishedDate': publishedDate,
+                        'thumbnail': thumbnail})
+
+    # return results
+    return results
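For reference, a minimal sketch of the URL the new digg engine builds; the query string and page number below are made-up example values, and the import assumes the searx package is on the Python path:

    from searx.engines.digg import request

    params = {'pageno': 2}
    request('searx', params)
    print(params['url'])
    # https://digg.com/api/search/searx.json?position=10&format=html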

searx/engines/flickr-noapi.py (+9 / -3)

@@ -53,7 +53,8 @@
 
     for photo in photos:
 
-        # In paged configuration, the first pages' photos are represented by a None object
+        # In paged configuration, the first pages' photos
+        # are represented by a None object
         if photo is None:
             continue
 
@@ -74,10 +75,15 @@
 
         title = photo['title']
 
-        content = '<span class="photo-author">' + photo['owner']['username'] + '</span><br />'
+        content = '<span class="photo-author">' +\
+                  photo['owner']['username'] +\
+                  '</span><br />'
 
         if 'description' in photo:
-            content = content + '<span class="description">' + photo['description'] + '</span>'
+            content = content +\
+                      '<span class="description">' +\
+                      photo['description'] +\
+                      '</span>'
 
         # append result
         results.append({'url': url,

searx/engines/flickr.py (+19 / -12)

@@ -1,10 +1,10 @@
 #!/usr/bin/env python
 
 ## Flickr (Images)
-# 
+#
 # @website     https://www.flickr.com
-# @provide-api yes (https://secure.flickr.com/services/api/flickr.photos.search.html) 
-# 
+# @provide-api yes (https://secure.flickr.com/services/api/flickr.photos.search.html)
+#
 # @using-api   yes
 # @results     JSON
 # @stable      yes
@@ -18,16 +18,20 @@
 
 nb_per_page = 15
 paging = True
-api_key= None
+api_key = None
 
 
-url = 'https://api.flickr.com/services/rest/?method=flickr.photos.search&api_key={api_key}&{text}&sort=relevance&extras=description%2C+owner_name%2C+url_o%2C+url_z&per_page={nb_per_page}&format=json&nojsoncallback=1&page={page}'
+url = 'https://api.flickr.com/services/rest/?method=flickr.photos.search' +\
+      '&api_key={api_key}&{text}&sort=relevance' +\
+      '&extras=description%2C+owner_name%2C+url_o%2C+url_z' +\
+      '&per_page={nb_per_page}&format=json&nojsoncallback=1&page={page}'
 photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'
 
 paging = True
 
+
 def build_flickr_url(user_id, photo_id):
-    return photo_url.format(userid=user_id,photoid=photo_id)
+    return photo_url.format(userid=user_id, photoid=photo_id)
 
 
 def request(query, params):
@@ -40,7 +44,7 @@
 
 def response(resp):
     results = []
-    
+
     search_results = loads(resp.text)
 
     # return empty array if there are no results
@@ -64,11 +68,14 @@
         url = build_flickr_url(photo['owner'], photo['id'])
 
         title = photo['title']
-        
-        content = '<span class="photo-author">'+ photo['ownername'] +'</span><br />'
-        
-        content = content + '<span class="description">' + photo['description']['_content'] + '</span>'
-        
+
+        content = '<span class="photo-author">' +\
+                  photo['ownername'] +\
+                  '</span><br />' +\
+                  '<span class="description">' +\
+                  photo['description']['_content'] +\
+                  '</span>'
+
         # append result
         results.append({'url': url,
                         'title': title,

searx/engines/kickass.py (+3 / -2)

@@ -24,7 +24,7 @@
 
 # specific xpath variables
 magnet_xpath = './/a[@title="Torrent magnet link"]'
-#content_xpath = './/font[@class="detDesc"]//text()'
+content_xpath = './/span[@class="font11px lightgrey block"]'
 
 
 # do search-request
@@ -56,7 +56,8 @@
         link = result.xpath('.//a[@class="cellMainLink"]')[0]
         href = urljoin(url, link.attrib['href'])
         title = ' '.join(link.xpath('.//text()'))
-        content = escape(html.tostring(result.xpath('.//span[@class="font11px lightgrey block"]')[0], method="text"))
+        content = escape(html.tostring(result.xpath(content_xpath)[0],
+                                       method="text"))
         seed = result.xpath('.//td[contains(@class, "green")]/text()')[0]
         leech = result.xpath('.//td[contains(@class, "red")]/text()')[0]
 

searx/engines/searchcode_code.py (+9 / -9)

@@ -11,7 +11,6 @@
 from urllib import urlencode
 from json import loads
 import cgi
-import re
 
 # engine dependent config
 categories = ['it']
@@ -33,7 +32,7 @@
 # get response from search-request
 def response(resp):
     results = []
-    
+
     search_results = loads(resp.text)
 
     # parse results
@@ -41,21 +40,22 @@
         href = result['url']
         title = "" + result['name'] + " - " + result['filename']
         content = result['repo'] + "<br />"
-        
+
         lines = dict()
         for line, code in result['lines'].items():
             lines[int(line)] = code
 
         content = content + '<pre class="code-formatter"><table class="code">'
         for line, code in sorted(lines.items()):
-            content = content + '<tr><td class="line-number" style="padding-right:5px;">' 
-            content = content + str(line) + '</td><td class="code-snippet">' 
-            # Replace every two spaces with ' &nbps;' to keep formatting while allowing the browser to break the line if necessary
-            content = content + cgi.escape(code).replace('\t', '    ').replace('  ', '&nbsp; ').replace('  ', ' &nbsp;') 
+            content = content + '<tr><td class="line-number" style="padding-right:5px;">'
+            content = content + str(line) + '</td><td class="code-snippet">'
+            # Replace every two spaces with ' &nbps;' to keep formatting
+            # while allowing the browser to break the line if necessary
+            content = content + cgi.escape(code).replace('\t', '    ').replace('  ', '&nbsp; ').replace('  ', ' &nbsp;')
             content = content + "</td></tr>"
-            
+
         content = content + "</table></pre>"
-        
+
         # append result
         results.append({'url': href,
                         'title': title,

searx/engines/searchcode_doc.py (+11 / -4)

@@ -31,15 +31,22 @@
 # get response from search-request
 def response(resp):
     results = []
-    
+
     search_results = loads(resp.text)
 
     # parse results
     for result in search_results['results']:
         href = result['url']
-        title = "[" + result['type'] + "] " + result['namespace'] + " " + result['name']
-        content = '<span class="highlight">[' + result['type'] + "] " + result['name'] + " " + result['synopsis'] + "</span><br />" + result['description']
-        
+        title = "[" + result['type'] + "] " +\
+                result['namespace'] +\
+                " " + result['name']
+        content = '<span class="highlight">[' +\
+                  result['type'] + "] " +\
+                  result['name'] + " " +\
+                  result['synopsis'] +\
+                  "</span><br />" +\
+                  result['description']
+
         # append result
         results.append({'url': href,
                         'title': title,

searx/engines/subtitleseeker.py (+6 / -2)

@@ -60,10 +60,14 @@
 
         content = result.xpath('.//div[contains(@class,"red")]//text()')[0]
         content = content + " - "
-        content = content + html.tostring(result.xpath('.//div[contains(@class,"grey-web")]')[0], method='text')
+        text = result.xpath('.//div[contains(@class,"grey-web")]')[0]
+        content = content + html.tostring(text, method='text')
 
         if result.xpath(".//span") != []:
-            content = content + " - (" + result.xpath(".//span//text()")[0].strip() + ")"
+            content = content +\
+                      " - (" +\
+                      result.xpath(".//span//text()")[0].strip() +\
+                      ")"
 
         # append result
         results.append({'url': href,

searx/engines/twitter.py (+19 / -8)

@@ -1,6 +1,6 @@
 ## Twitter (Social media)
 #
-# @website     https://www.bing.com/news
+# @website     https://twitter.com/
 # @provide-api yes (https://dev.twitter.com/docs/using-search)
 #
 # @using-api   no
@@ -14,6 +14,7 @@
 from urllib import urlencode
 from lxml import html
 from cgi import escape
+from datetime import datetime
 
 # engine dependent config
 categories = ['social media']
@@ -27,7 +28,8 @@
 results_xpath = '//li[@data-item-type="tweet"]'
 link_xpath = './/small[@class="time"]//a'
 title_xpath = './/span[@class="username js-action-profile-name"]//text()'
-content_xpath = './/p[@class="js-tweet-text tweet-text"]//text()'
+content_xpath = './/p[@class="js-tweet-text tweet-text"]'
+timestamp_xpath = './/span[contains(@class,"_timestamp")]'
 
 
 # do search-request
@@ -52,12 +54,21 @@
         link = tweet.xpath(link_xpath)[0]
         url = urljoin(base_url, link.attrib.get('href'))
         title = ''.join(tweet.xpath(title_xpath))
-        content = escape(''.join(tweet.xpath(content_xpath)))
-
-        # append result
-        results.append({'url': url,
-                        'title': title,
-                        'content': content})
+        content = escape(html.tostring(tweet.xpath(content_xpath)[0], method='text', encoding='UTF-8').decode("utf-8"))
+        pubdate = tweet.xpath(timestamp_xpath)
+        if len(pubdate) > 0:
+            timestamp = float(pubdate[0].attrib.get('data-time'))
+            publishedDate = datetime.fromtimestamp(timestamp, None)
+            # append result
+            results.append({'url': url,
+                            'title': title,
+                            'content': content,
+                            'publishedDate': publishedDate})
+        else:
+            # append result
+            results.append({'url': url,
+                            'title': title,
+                            'content': content})
 
     # return results
     return results
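Side note on the new publishedDate handling: judging from the code above, the tweet's "_timestamp" span carries a Unix epoch in its data-time attribute, which the engine turns into a datetime. A minimal sketch with an assumed example value:

    from datetime import datetime

    data_time = '1417434515'   # assumed example value of the span's data-time attribute
    published = datetime.fromtimestamp(float(data_time), None)
    print(published)           # local time corresponding to that epoch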

searx/https_rewrite.py (+0 / -1)

@@ -154,7 +154,6 @@
     print(' * {n} https-rules loaded'.format(n=len(https_rules)))
 
 
-
 def https_url_rewrite(result):
     skip_https_rewrite = False
     # check if HTTPS rewrite is possible

searx/search.py (+10 / -4)

@@ -69,11 +69,16 @@
                 print('engine timeout: {0}'.format(th._engine_name))
 
 
-
 # get default reqest parameter
 def default_request_params():
     return {
-        'method': 'GET', 'headers': {}, 'data': {}, 'url': '', 'cookies': {}, 'verify': True}
+        'method': 'GET',
+        'headers': {},
+        'data': {},
+        'url': '',
+        'cookies': {},
+        'verify': True
+    }
 
 
 # create a callback wrapper for the search engine results
@@ -487,14 +492,15 @@
                 continue
 
             # append request to list
-            requests.append((req, request_params['url'], request_args, selected_engine['name']))
+            requests.append((req, request_params['url'],
+                             request_args,
+                             selected_engine['name']))
 
         if not requests:
             return results, suggestions, answers, infoboxes
         # send all search-request
         threaded_requests(requests)
 
-
         while not results_queue.empty():
             engine_name, engine_results = results_queue.get_nowait()
 

searx/settings.yml (+31 / -0)

@@ -44,6 +44,10 @@
   - name : ddg definitions
     engine : duckduckgo_definitions
     shortcut : ddd
+
+  - name : digg
+    engine : digg
+    shortcut : dg
 
   - name : wikidata
     engine : wikidata
@@ -99,6 +103,33 @@
     engine : google_news
     shortcut : gon
 
+  - name : google play apps
+    engine        : xpath
+    search_url    : https://play.google.com/store/search?q={query}&c=apps
+    url_xpath     : //a[@class="title"]/@href
+    title_xpath   : //a[@class="title"]
+    content_xpath : //a[@class="subtitle"]
+    categories : files
+    shortcut : gpa
+
+  - name : google play movies
+    engine        : xpath
+    search_url    : https://play.google.com/store/search?q={query}&c=movies
+    url_xpath     : //a[@class="title"]/@href
+    title_xpath   : //a[@class="title"]
+    content_xpath : //a[@class="subtitle"]
+    categories : videos
+    shortcut : gpm
+
+  - name : google play music
+    engine        : xpath
+    search_url    : https://play.google.com/store/search?q={query}&c=music
+    url_xpath     : //a[@class="title"]/@href
+    title_xpath   : //a[@class="title"]
+    content_xpath : //a[@class="subtitle"]
+    categories : music
+    shortcut : gps
+
   - name : openstreetmap
     engine : openstreetmap
     shortcut : osm
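The three Google Play entries reuse the generic xpath engine, so the scraping is configured entirely in the YAML above. As an illustration only (not the xpath engine's actual code), this is roughly what those selectors extract, run against a tiny made-up snippet of a result page:

    from lxml import html

    page = ('<div><a class="title" href="/store/apps/details?id=example">Example App</a>'
            '<a class="subtitle">Example Inc.</a></div>')
    dom = html.fromstring(page)

    urls = dom.xpath('//a[@class="title"]/@href')                              # url_xpath
    titles = [e.text_content() for e in dom.xpath('//a[@class="title"]')]      # title_xpath
    contents = [e.text_content() for e in dom.xpath('//a[@class="subtitle"]')] # content_xpath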

searx/utils.py (+3 / -2)

@@ -30,8 +30,9 @@
 
 
 def searx_useragent():
-    return 'searx/{searx_version} {suffix}'.format(searx_version=VERSION_STRING,
-                                          suffix=settings['server'].get('useragent_suffix', ''))
+    return 'searx/{searx_version} {suffix}'.format(
+           searx_version=VERSION_STRING,
+           suffix=settings['server'].get('useragent_suffix', ''))
 
 
 def highlight_content(content, query):