
Merge pull request #165 from Cqoicebordel/Moar-engines

Moar engines
Adam Tauber committed 10 years ago
parent commit 469e08881e

searx/engines/500px.py (+2 −2)

@@ -35,9 +35,9 @@ def request(query, params):
 # get response from search-request
 def response(resp):
     results = []
-    
+
     dom = html.fromstring(resp.text)
-    
+
     # parse results
     for result in dom.xpath('//div[@class="photo"]'):
         link = result.xpath('.//a')[0]

searx/engines/__init__.py (+2 −2)

@@ -81,7 +81,7 @@ def load_engine(engine_data):
         if engine_attr.startswith('_'):
             continue
         if getattr(engine, engine_attr) is None:
-            print('[E] Engine config error: Missing attribute "{0}.{1}"'\
+            print('[E] Engine config error: Missing attribute "{0}.{1}"'
                   .format(engine.name, engine_attr))
             sys.exit(1)
 
@@ -102,7 +102,7 @@ def load_engine(engine_data):
     if engine.shortcut:
         # TODO check duplications
         if engine.shortcut in engine_shortcuts:
-            print('[E] Engine config error: ambigious shortcut: {0}'\
+            print('[E] Engine config error: ambigious shortcut: {0}'
                   .format(engine.shortcut))
             sys.exit(1)
         engine_shortcuts[engine.shortcut] = engine.name
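
The dropped backslashes are redundant: both print() calls already sit inside parentheses, and Python continues a statement implicitly until the bracket closes. A minimal sketch of the two equivalent forms ('example' and 'attr' are placeholder values):

# Old style: explicit continuation; the backslash is unnecessary noise
print('[E] Engine config error: Missing attribute "{0}.{1}"'\
      .format('example', 'attr'))

# New style: the open parenthesis of print() already keeps the
# statement going across lines, so no backslash is needed
print('[E] Engine config error: Missing attribute "{0}.{1}"'
      .format('example', 'attr'))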

searx/engines/digg.py (+67 −0)

@@ -0,0 +1,67 @@
+## Digg (News, Social media)
+#
+# @website     https://digg.com/
+# @provide-api no
+#
+# @using-api   no
+# @results     HTML (using search portal)
+# @stable      no (HTML can change)
+# @parse       url, title, content, publishedDate, thumbnail
+
+from urllib import quote_plus
+from json import loads
+from lxml import html
+from cgi import escape
+from dateutil import parser
+
+# engine dependent config
+categories = ['news', 'social media']
+paging = True
+
+# search-url
+base_url = 'https://digg.com/'
+search_url = base_url+'api/search/{query}.json?position={position}&format=html'
+
+# specific xpath variables
+results_xpath = '//article'
+link_xpath = './/small[@class="time"]//a'
+title_xpath = './/h2//a//text()'
+content_xpath = './/p//text()'
+pubdate_xpath = './/time'
+
+
+# do search-request
+def request(query, params):
+    offset = (params['pageno'] - 1) * 10
+    params['url'] = search_url.format(position=offset,
+                                      query=quote_plus(query))
+    return params
+
+
+# get response from search-request
+def response(resp):
+    results = []
+
+    search_result = loads(resp.text)
+
+    dom = html.fromstring(search_result['html'])
+
+    # parse results
+    for result in dom.xpath(results_xpath):
+        url = result.attrib.get('data-contenturl')
+        thumbnail = result.xpath('.//img')[0].attrib.get('src')
+        title = ''.join(result.xpath(title_xpath))
+        content = escape(''.join(result.xpath(content_xpath)))
+        pubdate = result.xpath(pubdate_xpath)[0].attrib.get('datetime')
+        publishedDate = parser.parse(pubdate)
+
+        # append result
+        results.append({'url': url,
+                        'title': title,
+                        'content': content,
+                        'template': 'videos.html',
+                        'publishedDate': publishedDate,
+                        'thumbnail': thumbnail})
+
+    # return results
+    return results
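
Every searx engine follows this same two-function contract: request() fills params['url'] for the outgoing HTTP call, and response() maps the reply to a list of result dicts. A minimal sketch of exercising the new engine's request() by hand (the params dict below is a stripped-down stand-in for what searx actually passes; only the 'pageno' key is read here):

from searx.engines import digg

params = {'pageno': 1, 'url': ''}
params = digg.request('example query', params)
print(params['url'])
# https://digg.com/api/search/example+query.json?position=0&format=html

response() then expects a JSON body whose 'html' field contains the <article> markup matched by the XPath variables above.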

searx/engines/flickr-noapi.py (+9 −3)

@@ -53,7 +53,8 @@ def response(resp):
 
     for photo in photos:
 
-        # In paged configuration, the first pages' photos are represented by a None object
+        # In paged configuration, the first pages' photos
+        # are represented by a None object
         if photo is None:
             continue
 
@@ -74,10 +75,15 @@ def response(resp):
 
         title = photo['title']
 
-        content = '<span class="photo-author">' + photo['owner']['username'] + '</span><br />'
+        content = '<span class="photo-author">' +\
+                  photo['owner']['username'] +\
+                  '</span><br />'
 
         if 'description' in photo:
-            content = content + '<span class="description">' + photo['description'] + '</span>'
+            content = content +\
+                      '<span class="description">' +\
+                      photo['description'] +\
+                      '</span>'
 
         # append result
         results.append({'url': url,

searx/engines/flickr.py (+19 −12)

@@ -1,10 +1,10 @@
 #!/usr/bin/env python
 
 ## Flickr (Images)
-# 
+#
 # @website     https://www.flickr.com
-# @provide-api yes (https://secure.flickr.com/services/api/flickr.photos.search.html) 
-# 
+# @provide-api yes (https://secure.flickr.com/services/api/flickr.photos.search.html)
+#
 # @using-api   yes
 # @results     JSON
 # @stable      yes
@@ -18,16 +18,20 @@ categories = ['images']
 
 nb_per_page = 15
 paging = True
-api_key= None
+api_key = None
 
 
-url = 'https://api.flickr.com/services/rest/?method=flickr.photos.search&api_key={api_key}&{text}&sort=relevance&extras=description%2C+owner_name%2C+url_o%2C+url_z&per_page={nb_per_page}&format=json&nojsoncallback=1&page={page}'
+url = 'https://api.flickr.com/services/rest/?method=flickr.photos.search' +\
+      '&api_key={api_key}&{text}&sort=relevance' +\
+      '&extras=description%2C+owner_name%2C+url_o%2C+url_z' +\
+      '&per_page={nb_per_page}&format=json&nojsoncallback=1&page={page}'
 photo_url = 'https://www.flickr.com/photos/{userid}/{photoid}'
 
 paging = True
 
+
 def build_flickr_url(user_id, photo_id):
-    return photo_url.format(userid=user_id,photoid=photo_id)
+    return photo_url.format(userid=user_id, photoid=photo_id)
 
 
 def request(query, params):
@@ -40,7 +44,7 @@ def request(query, params):
 
 def response(resp):
     results = []
-    
+
     search_results = loads(resp.text)
 
     # return empty array if there are no results
@@ -64,11 +68,14 @@ def response(resp):
         url = build_flickr_url(photo['owner'], photo['id'])
 
         title = photo['title']
-        
-        content = '<span class="photo-author">'+ photo['ownername'] +'</span><br />'
-        
-        content = content + '<span class="description">' + photo['description']['_content'] + '</span>'
-        
+
+        content = '<span class="photo-author">' +\
+                  photo['ownername'] +\
+                  '</span><br />' +\
+                  '<span class="description">' +\
+                  photo['description']['_content'] +\
+                  '</span>'
+
         # append result
         results.append({'url': url,
                         'title': title,

searx/engines/kickass.py (+3 −2)

@@ -24,7 +24,7 @@ search_url = url + 'search/{search_term}/{pageno}/'
 
 # specific xpath variables
 magnet_xpath = './/a[@title="Torrent magnet link"]'
-#content_xpath = './/font[@class="detDesc"]//text()'
+content_xpath = './/span[@class="font11px lightgrey block"]'
 
 
 # do search-request
@@ -56,7 +56,8 @@ def response(resp):
         link = result.xpath('.//a[@class="cellMainLink"]')[0]
         href = urljoin(url, link.attrib['href'])
         title = ' '.join(link.xpath('.//text()'))
-        content = escape(html.tostring(result.xpath('.//span[@class="font11px lightgrey block"]')[0], method="text"))
+        content = escape(html.tostring(result.xpath(content_xpath)[0],
+                                       method="text"))
         seed = result.xpath('.//td[contains(@class, "green")]/text()')[0]
         leech = result.xpath('.//td[contains(@class, "red")]/text()')[0]

searx/engines/searchcode_code.py (+9 −9)

@@ -11,7 +11,6 @@
 from urllib import urlencode
 from json import loads
 import cgi
-import re
 
 # engine dependent config
 categories = ['it']
@@ -33,7 +32,7 @@ def request(query, params):
 # get response from search-request
 def response(resp):
     results = []
-    
+
     search_results = loads(resp.text)
 
     # parse results
@@ -41,21 +40,22 @@ def response(resp):
         href = result['url']
         title = "" + result['name'] + " - " + result['filename']
         content = result['repo'] + "<br />"
-        
+
         lines = dict()
         for line, code in result['lines'].items():
             lines[int(line)] = code
 
         content = content + '<pre class="code-formatter"><table class="code">'
         for line, code in sorted(lines.items()):
-            content = content + '<tr><td class="line-number" style="padding-right:5px;">' 
-            content = content + str(line) + '</td><td class="code-snippet">' 
-            # Replace every two spaces with ' &nbps;' to keep formatting while allowing the browser to break the line if necessary
-            content = content + cgi.escape(code).replace('\t', '    ').replace('  ', '&nbsp; ').replace('  ', ' &nbsp;') 
+            content = content + '<tr><td class="line-number" style="padding-right:5px;">'
+            content = content + str(line) + '</td><td class="code-snippet">'
+            # Replace every two spaces with ' &nbps;' to keep formatting
+            # while allowing the browser to break the line if necessary
+            content = content + cgi.escape(code).replace('\t', '    ').replace('  ', '&nbsp; ').replace('  ', ' &nbsp;')
            content = content + "</td></tr>"
-            
+
         content = content + "</table></pre>"
-        
+
         # append result
         results.append({'url': href,
                         'title': title,
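
The replace chain in that loop preserves indentation without producing one unbreakable run of &nbsp; entities: tabs become four spaces, then pairs of spaces are rewritten so every other character stays a plain, breakable space. A quick illustration (Python 2, matching the module's cgi import; the input string is made up):

import cgi

code = '\tif x:  return'
out = cgi.escape(code).replace('\t', '    ') \
                      .replace('  ', '&nbsp; ') \
                      .replace('  ', ' &nbsp;')
print(out)
# &nbsp; &nbsp; if x:&nbsp; return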

searx/engines/searchcode_doc.py (+11 −4)

@@ -31,15 +31,22 @@ def request(query, params):
 # get response from search-request
 def response(resp):
     results = []
-    
+
     search_results = loads(resp.text)
 
     # parse results
     for result in search_results['results']:
         href = result['url']
-        title = "[" + result['type'] + "] " + result['namespace'] + " " + result['name']
-        content = '<span class="highlight">[' + result['type'] + "] " + result['name'] + " " + result['synopsis'] + "</span><br />" + result['description']
-        
+        title = "[" + result['type'] + "] " +\
+                result['namespace'] +\
+                " " + result['name']
+        content = '<span class="highlight">[' +\
+                  result['type'] + "] " +\
+                  result['name'] + " " +\
+                  result['synopsis'] +\
+                  "</span><br />" +\
+                  result['description']
+
         # append result
         results.append({'url': href,
                         'title': title,

searx/engines/subtitleseeker.py (+6 −2)

@@ -60,10 +60,14 @@ def response(resp):
 
         content = result.xpath('.//div[contains(@class,"red")]//text()')[0]
         content = content + " - "
-        content = content + html.tostring(result.xpath('.//div[contains(@class,"grey-web")]')[0], method='text')
+        text = result.xpath('.//div[contains(@class,"grey-web")]')[0]
+        content = content + html.tostring(text, method='text')
 
         if result.xpath(".//span") != []:
-            content = content + " - (" + result.xpath(".//span//text()")[0].strip() + ")"
+            content = content +\
+                      " - (" +\
+                      result.xpath(".//span//text()")[0].strip() +\
+                      ")"
 
         # append result
         results.append({'url': href,

searx/engines/twitter.py (+19 −8)

@@ -1,6 +1,6 @@
 ## Twitter (Social media)
 #
-# @website     https://www.bing.com/news
+# @website     https://twitter.com/
 # @provide-api yes (https://dev.twitter.com/docs/using-search)
 #
 # @using-api   no
@@ -14,6 +14,7 @@ from urlparse import urljoin
 from urllib import urlencode
 from lxml import html
 from cgi import escape
+from datetime import datetime
 
 # engine dependent config
 categories = ['social media']
@@ -27,7 +28,8 @@ search_url = base_url+'search?'
 results_xpath = '//li[@data-item-type="tweet"]'
 link_xpath = './/small[@class="time"]//a'
 title_xpath = './/span[@class="username js-action-profile-name"]//text()'
-content_xpath = './/p[@class="js-tweet-text tweet-text"]//text()'
+content_xpath = './/p[@class="js-tweet-text tweet-text"]'
+timestamp_xpath = './/span[contains(@class,"_timestamp")]'
 
 
 # do search-request
@@ -52,12 +54,21 @@ def response(resp):
         link = tweet.xpath(link_xpath)[0]
         url = urljoin(base_url, link.attrib.get('href'))
         title = ''.join(tweet.xpath(title_xpath))
-        content = escape(''.join(tweet.xpath(content_xpath)))
-
-        # append result
-        results.append({'url': url,
-                        'title': title,
-                        'content': content})
+        content = escape(html.tostring(tweet.xpath(content_xpath)[0], method='text', encoding='UTF-8').decode("utf-8"))
+        pubdate = tweet.xpath(timestamp_xpath)
+        if len(pubdate) > 0:
+            timestamp = float(pubdate[0].attrib.get('data-time'))
+            publishedDate = datetime.fromtimestamp(timestamp, None)
+            # append result
+            results.append({'url': url,
+                            'title': title,
+                            'content': content,
+                            'publishedDate': publishedDate})
+        else:
+            # append result
+            results.append({'url': url,
+                            'title': title,
+                            'content': content})
 
     # return results
     return results
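
Two details of this rewrite are worth spelling out: html.tostring(node, method='text') flattens an element's whole subtree to plain text in document order, which is more robust for tweets containing nested links than joining the //text() nodes, and the _timestamp span carries the tweet's epoch time in its data-time attribute. A small sketch of both against made-up markup:

from datetime import datetime
from lxml import html

# Hypothetical tweet fragment, for illustration only
tweet = html.fromstring(
    '<li><p class="js-tweet-text tweet-text">hello '
    '<a href="/x">world</a></p>'
    '<span class="_timestamp" data-time="1417305600"></span></li>')

p = tweet.xpath('.//p[@class="js-tweet-text tweet-text"]')[0]
print(html.tostring(p, method='text', encoding='UTF-8').decode('utf-8'))
# hello world

ts = float(tweet.xpath('.//span[contains(@class,"_timestamp")]')[0]
           .attrib.get('data-time'))
print(datetime.fromtimestamp(ts, None))
# naive datetime for epoch 1417305600 (2014-11-30 00:00:00 UTC), in local time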

searx/https_rewrite.py (+0 −1)

@@ -154,7 +154,6 @@ def load_https_rules(rules_path):
     print(' * {n} https-rules loaded'.format(n=len(https_rules)))
 
 
-
 def https_url_rewrite(result):
     skip_https_rewrite = False
     # check if HTTPS rewrite is possible

searx/search.py (+10 −4)

@@ -69,11 +69,16 @@ def threaded_requests(requests):
                 print('engine timeout: {0}'.format(th._engine_name))
 
 
-
 # get default reqest parameter
 def default_request_params():
     return {
-        'method': 'GET', 'headers': {}, 'data': {}, 'url': '', 'cookies': {}, 'verify': True}
+        'method': 'GET',
+        'headers': {},
+        'data': {},
+        'url': '',
+        'cookies': {},
+        'verify': True
+    }
 
 
 # create a callback wrapper for the search engine results
@@ -487,14 +492,15 @@ class Search(object):
                 continue
 
             # append request to list
-            requests.append((req, request_params['url'], request_args, selected_engine['name']))
+            requests.append((req, request_params['url'],
+                             request_args,
+                             selected_engine['name']))
 
         if not requests:
             return results, suggestions, answers, infoboxes
         # send all search-request
         threaded_requests(requests)
 
-
         while not results_queue.empty():
             engine_name, engine_results = results_queue.get_nowait()

searx/settings.yml (+31 −0)

@@ -44,6 +44,10 @@ engines:
   - name : ddg definitions
     engine : duckduckgo_definitions
     shortcut : ddd
+
+  - name : digg
+    engine : digg
+    shortcut : dg
 
   - name : wikidata
     engine : wikidata
@@ -99,6 +103,33 @@ engines:
     engine : google_news
     shortcut : gon
 
+  - name : google play apps
+    engine        : xpath
+    search_url    : https://play.google.com/store/search?q={query}&c=apps
+    url_xpath     : //a[@class="title"]/@href
+    title_xpath   : //a[@class="title"]
+    content_xpath : //a[@class="subtitle"]
+    categories : files
+    shortcut : gpa
+
+  - name : google play movies
+    engine        : xpath
+    search_url    : https://play.google.com/store/search?q={query}&c=movies
+    url_xpath     : //a[@class="title"]/@href
+    title_xpath   : //a[@class="title"]
+    content_xpath : //a[@class="subtitle"]
+    categories : videos
+    shortcut : gpm
+
+  - name : google play music
+    engine        : xpath
+    search_url    : https://play.google.com/store/search?q={query}&c=music
+    url_xpath     : //a[@class="title"]/@href
+    title_xpath   : //a[@class="title"]
+    content_xpath : //a[@class="subtitle"]
+    categories : music
+    shortcut : gps
+
   - name : openstreetmap
     engine : openstreetmap
     shortcut : osm
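
The three Google Play entries need no dedicated module: the generic xpath engine builds the query URL from search_url and extracts each result field with the configured XPath expressions. Roughly what it does per page, as a simplified sketch (the real implementation lives in searx/engines/xpath.py and is more careful about relative URLs and result pairing):

from lxml import html

def sketch_parse(page_text):
    # page_text: HTML of https://play.google.com/store/search?q=...&c=apps
    dom = html.fromstring(page_text)
    results = []
    # pair each url_xpath match with its title/content matches
    for url, title, content in zip(dom.xpath('//a[@class="title"]/@href'),
                                   dom.xpath('//a[@class="title"]'),
                                   dom.xpath('//a[@class="subtitle"]')):
        results.append({'url': url,
                        'title': title.text_content().strip(),
                        'content': content.text_content().strip()})
    return results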

searx/utils.py (+3 −2)

@@ -30,8 +30,9 @@ def gen_useragent():
 
 
 def searx_useragent():
-    return 'searx/{searx_version} {suffix}'.format(searx_version=VERSION_STRING,
-                                          suffix=settings['server'].get('useragent_suffix', ''))
+    return 'searx/{searx_version} {suffix}'.format(
+           searx_version=VERSION_STRING,
+           suffix=settings['server'].get('useragent_suffix', ''))
 
 
 def highlight_content(content, query):
 def highlight_content(content, query):