Bläddra i källkod

Digg + Twitter corrections

Digg engines, with thumbnails
Add pubdate for twitter
Cqoicebordel 10 år sedan
förälder
incheckning
e7e2981536
3 ändrade filer med 86 tillägg och 6 borttagningar
  1. 66
    0
      searx/engines/digg.py
  2. 16
    6
      searx/engines/twitter.py
  3. 4
    0
      searx/settings.yml

+ 66
- 0
searx/engines/digg.py Visa fil

1
+## Digg (News, Social media)
2
+#
3
+# @website     https://digg.com/
4
+# @provide-api no
5
+#
6
+# @using-api   no
7
+# @results     HTML (using search portal)
8
+# @stable      no (HTML can change)
9
+# @parse       url, title, content, publishedDate, thumbnail
10
+
11
+from urllib import quote_plus
12
+from json import loads
13
+from lxml import html
14
+from cgi import escape
15
+from dateutil import parser
16
+
17
# engine dependent config
# digg serves both news stories and social-media content
categories = ['news', 'social media']
# the API pages through results via an absolute `position` offset
paging = True

# search-url
base_url = 'https://digg.com/'
search_url = base_url + 'api/search/{query}.json?position={position}&format=html'

# specific xpath variables
# each result is an <article> element inside the returned HTML fragment
results_xpath = '//article'
link_xpath = './/small[@class="time"]//a'
title_xpath = './/h2//a//text()'
content_xpath = './/p//text()'
pubdate_xpath = './/time'
31
+
32
+
33
# do search-request
def request(query, params):
    # digg paginates with an absolute result offset, 10 results per page
    position = (params['pageno'] - 1) * 10
    params['url'] = search_url.format(query=quote_plus(query),
                                      position=position)
    return params
39
+
40
+
41
# get response from search-request
def response(resp):
    # The API answers with JSON whose 'html' member holds the rendered
    # result list; parse that fragment with lxml and extract each article.
    results = []

    search_result = loads(resp.text)

    dom = html.fromstring(search_result['html'])

    # parse results
    for result in dom.xpath(results_xpath):
        url = result.attrib.get('data-contenturl')
        title = ''.join(result.xpath(title_xpath))
        content = escape(''.join(result.xpath(content_xpath)))

        # an article may lack a thumbnail image; avoid IndexError
        imgs = result.xpath('.//img')
        thumbnail = imgs[0].attrib.get('src') if imgs else ''

        res = {'url': url,
               'title': title,
               'content': content,
               'template': 'videos.html',
               'thumbnail': thumbnail}

        # only attach a date when a <time datetime="..."> element exists
        # (mirrors the guarded pubdate handling used by the twitter engine)
        pubdate = result.xpath(pubdate_xpath)
        if pubdate:
            res['publishedDate'] = parser.parse(
                pubdate[0].attrib.get('datetime'))

        # append result
        results.append(res)

    # return results
    return results

+ 16
- 6
searx/engines/twitter.py Visa fil

1
 ## Twitter (Social media)
1
 ## Twitter (Social media)
2
 #
2
 #
3
-# @website     https://www.bing.com/news
3
+# @website     https://twitter.com/
4
 # @provide-api yes (https://dev.twitter.com/docs/using-search)
4
 # @provide-api yes (https://dev.twitter.com/docs/using-search)
5
 #
5
 #
6
 # @using-api   no
6
 # @using-api   no
14
 from urllib import urlencode
14
 from urllib import urlencode
15
 from lxml import html
15
 from lxml import html
16
 from cgi import escape
16
 from cgi import escape
17
+from datetime import datetime
17
 
18
 
18
 # engine dependent config
19
 # engine dependent config
19
 categories = ['social media']
20
 categories = ['social media']
28
 link_xpath = './/small[@class="time"]//a'
29
 link_xpath = './/small[@class="time"]//a'
29
 title_xpath = './/span[@class="username js-action-profile-name"]//text()'
30
 title_xpath = './/span[@class="username js-action-profile-name"]//text()'
30
 content_xpath = './/p[@class="js-tweet-text tweet-text"]//text()'
31
 content_xpath = './/p[@class="js-tweet-text tweet-text"]//text()'
32
+timestamp_xpath = './/span[contains(@class,"_timestamp")]'
31
 
33
 
32
 
34
 
33
 # do search-request
35
 # do search-request
53
         url = urljoin(base_url, link.attrib.get('href'))
55
         url = urljoin(base_url, link.attrib.get('href'))
54
         title = ''.join(tweet.xpath(title_xpath))
56
         title = ''.join(tweet.xpath(title_xpath))
55
         content = escape(''.join(tweet.xpath(content_xpath)))
57
         content = escape(''.join(tweet.xpath(content_xpath)))
56
-
57
-        # append result
58
-        results.append({'url': url,
59
-                        'title': title,
60
-                        'content': content})
58
+        pubdate = tweet.xpath(timestamp_xpath)
59
+        if len(pubdate) > 0:
60
+            publishedDate = datetime.fromtimestamp(float(pubdate[0].attrib.get('data-time')), None)
61
+            # append result
62
+            results.append({'url': url,
63
+                            'title': title,
64
+                            'content': content,
65
+                            'publishedDate': publishedDate})
66
+        else:
67
+            # append result
68
+            results.append({'url': url,
69
+                            'title': title,
70
+                            'content': content})
61
 
71
 
62
     # return results
72
     # return results
63
     return results
73
     return results

+ 4
- 0
searx/settings.yml Visa fil

44
   - name : ddg definitions
44
   - name : ddg definitions
45
     engine : duckduckgo_definitions
45
     engine : duckduckgo_definitions
46
     shortcut : ddd
46
     shortcut : ddd
47
+    
48
+  - name : digg
49
+    engine : digg
50
+    shortcut : dg
47
 
51
 
48
   - name : wikidata
52
   - name : wikidata
49
     engine : wikidata
53
     engine : wikidata