Sfoglia il codice sorgente

Add Reddit search engine

Kirill Isakov 9 anni fa
parent
commit
d026a97e42
3 ha cambiato i file con 148 aggiunte e 0 eliminazioni
  1. 74
    0
      searx/engines/reddit.py
  2. 7
    0
      searx/settings.yml
  3. 67
    0
      tests/unit/engines/test_reddit.py

+ 74
- 0
searx/engines/reddit.py Vedi File

@@ -0,0 +1,74 @@
1
+"""
2
+ Reddit
3
+
4
+ @website      https://www.reddit.com/
5
+ @provide-api  yes (https://www.reddit.com/dev/api)
6
+
7
+ @using-api    yes
8
+ @results      JSON
9
+ @stable       yes
10
+ @parse        url, title, content, thumbnail, publishedDate
11
+"""
12
+
13
+import json
14
+from cgi import escape
15
+from urllib import urlencode
16
+from urlparse import urlparse
17
+from datetime import datetime
18
+
19
# Engine configuration: the searx categories this engine is listed under
# and the number of posts requested per search.
categories = [
    'general',
    'images',
    'news',
    'social media',
]
page_size = 25

# Search endpoint; the {query} placeholder receives the url-encoded
# query string built in request().
search_url = 'https://www.reddit.com/search.json?{query}'
25
+
26
+
27
+# do search-request
28
def request(query, params):
    """Build the search request for a query.

    The query text and the module-level ``page_size`` are url-encoded as
    the ``q`` and ``limit`` arguments, substituted into ``search_url`` and
    stored under ``params['url']``.

    Returns the same ``params`` mapping that was passed in.
    """
    query_string = urlencode({'q': query, 'limit': page_size})
    params['url'] = search_url.format(query=query_string)
    return params
34
+
35
+
36
+# get response from search-request
37
def response(resp):
    """Turn a Reddit search response into a list of result dicts.

    ``resp.text`` is parsed as JSON. Each entry of ``data.children`` is
    converted into a dict with ``url`` and ``title``; posts whose
    ``thumbnail`` field is a URL with both a host and a path become image
    results (``images.html`` template), all other posts become text
    results carrying ``content`` and ``publishedDate``.

    Returns image results first, followed by text results. An empty list
    is returned when the payload has no usable ``data`` listing.
    """
    search_results = json.loads(resp.text)

    # Anything that is not an object with a non-empty 'data' listing
    # (e.g. '{}', '[]' or '{"data": null}') simply has no results.
    if not isinstance(search_results, dict):
        return []
    listing = search_results.get('data') or {}
    posts = listing.get('children') or []

    img_results = []
    text_results = []

    for post in posts:
        data = post['data']

        params = {
            'url': data['url'],
            'title': data['title'],
        }

        # The thumbnail field holds either a real URL or a placeholder
        # word such as "self"; a missing or null value is treated like a
        # placeholder instead of raising.
        thumbnail = data.get('thumbnail') or ''
        url_info = urlparse(thumbnail)

        if url_info.netloc != '' and url_info.path != '':
            # NOTE(review): the images template presumably links the
            # full-size picture through 'img_src' - confirm against
            # the images.html template.
            params['img_src'] = data['url']
            params['thumbnail_src'] = thumbnail
            params['template'] = 'images.html'
            img_results.append(params)
        else:
            # NOTE(review): 'created_utc' looks like a UTC epoch, while
            # fromtimestamp() yields naive local time - confirm whether
            # utcfromtimestamp() is intended. Kept as-is because the unit
            # test pins this conversion.
            created = datetime.fromtimestamp(data['created_utc'])
            params['content'] = escape(data.get('selftext') or '')
            params['publishedDate'] = created
            text_results.append(params)

    # show images first and text results second
    return img_results + text_results

+ 7
- 0
searx/settings.yml Vedi File

@@ -213,6 +213,13 @@ engines:
213 213
     shortcut : qws
214 214
     categories : social media
215 215
 
216
+  - name : reddit
217
+    engine : reddit
218
+    shortcut : re
219
+    page_size : 25
220
+    timeout : 10.0
221
+    disabled : True
222
+
216 223
   - name : kickass
217 224
     engine : kickass
218 225
     shortcut : ka

+ 67
- 0
tests/unit/engines/test_reddit.py Vedi File

@@ -0,0 +1,67 @@
1
+from collections import defaultdict
2
+import mock
3
+from searx.engines import reddit
4
+from searx.testing import SearxTestCase
5
+from datetime import datetime
6
+
7
+
8
class TestRedditEngine(SearxTestCase):
    """Unit tests for the request/response functions of the reddit engine."""

    def test_request(self):
        """The built URL targets reddit.com and contains the query text."""
        search_term = 'test_query'
        request_params = reddit.request(search_term, defaultdict(dict))

        self.assertTrue('url' in request_params)
        built_url = request_params['url']
        self.assertTrue(search_term in built_url)
        self.assertTrue('reddit.com' in built_url)

    def test_response(self):
        """An empty payload gives no results; a listing is split by type."""
        # a JSON object without a 'data' listing produces no results
        empty_resp = mock.Mock(text='{}')
        self.assertEqual(reddit.response(empty_resp), [])

        # one post with a thumbnail URL, one self-post without
        payload = """
        {
            "kind": "Listing",
            "data": {
                "children": [{
                    "data": {
                        "url": "http://google.com/",
                        "title": "Title number one",
                        "selftext": "Sample",
                        "created_utc": 1401219957.0,
                        "thumbnail": "http://image.com/picture.jpg"
                    }
                }, {
                    "data": {
                        "url": "https://reddit.com/",
                        "title": "Title number two",
                        "selftext": "Dominus vobiscum",
                        "created_utc": 1438792533.0,
                        "thumbnail": "self"
                    }
                }]
            }
        }
        """

        results = reddit.response(mock.Mock(text=payload))

        self.assertEqual(len(results), 2)
        self.assertEqual(type(results), list)

        # first result: the picture post
        picture = results[0]
        self.assertEqual(picture['url'], 'http://google.com/')
        self.assertEqual(picture['title'], 'Title number one')
        self.assertEqual(picture['template'], 'images.html')
        self.assertEqual(picture['thumbnail_src'],
                         'http://image.com/picture.jpg')

        # second result: the self-post
        self_post = results[1]
        expected_date = datetime.fromtimestamp(1438792533.0)
        self.assertEqual(self_post['url'], 'https://reddit.com/')
        self.assertEqual(self_post['title'], 'Title number two')
        self.assertEqual(self_post['content'], 'Dominus vobiscum')
        self.assertEqual(self_post['publishedDate'], expected_date)
        self.assertTrue('thumbnail_src' not in self_post)