Browse Source

Merge pull request #57 from pointhi/results

improving publishDate extraction and output of it
Adam Tauber 11 years ago
parent
commit
018a14431b

+ 1
- 0
requirements.txt View File

3
 grequests
3
 grequests
4
 lxml
4
 lxml
5
 pyyaml
5
 pyyaml
6
+python-dateutil

+ 4
- 10
searx/engines/google_news.py View File

2
 
2
 
3
 from urllib import urlencode
3
 from urllib import urlencode
4
 from json import loads
4
 from json import loads
5
+from dateutil import parser
5
 from datetime import datetime
6
 from datetime import datetime
6
 
7
 
7
 categories = ['news']
8
 categories = ['news']
32
         return []
33
         return []
33
 
34
 
34
     for result in search_res['responseData']['results']:
35
     for result in search_res['responseData']['results']:
35
-# S.149 (159), library.pdf
36
-# datetime.strptime("Mon, 10 Mar 2014 16:26:15 -0700",
37
-#                   "%a, %d %b %Y %H:%M:%S %z")
38
-#        publishedDate = parse(result['publishedDate'])
39
-        publishedDate = datetime.strptime(
40
-            str.join(' ', result['publishedDate'].split(None)[0:5]),
41
-            "%a, %d %b %Y %H:%M:%S")
42
-        #utc_offset = timedelta(result['publishedDate'].split(None)[5])
43
-        # local = utc + offset
44
-        #publishedDate = publishedDate + utc_offset
36
+
37
+# Mon, 10 Mar 2014 16:26:15 -0700
38
+        publishedDate = parser.parse(result['publishedDate'])
45
 
39
 
46
         results.append({'url': result['unescapedUrl'],
40
         results.append({'url': result['unescapedUrl'],
47
                         'title': result['titleNoFormatting'],
41
                         'title': result['titleNoFormatting'],

+ 6
- 0
searx/engines/vimeo.py View File

2
 from HTMLParser import HTMLParser
2
 from HTMLParser import HTMLParser
3
 from lxml import html
3
 from lxml import html
4
 from xpath import extract_text
4
 from xpath import extract_text
5
+from datetime import datetime
6
+from dateutil import parser
5
 
7
 
6
 base_url = 'http://vimeo.com'
8
 base_url = 'http://vimeo.com'
7
 search_url = base_url + '/search?{query}'
9
 search_url = base_url + '/search?{query}'
10
 title_xpath = None
12
 title_xpath = None
11
 results_xpath = ''
13
 results_xpath = ''
12
 content_tpl = '<a href="{0}">  <img src="{2}"/> </a>'
14
 content_tpl = '<a href="{0}">  <img src="{2}"/> </a>'
15
+publishedDate_xpath = './/p[@class="meta"]//attribute::datetime'
13
 
16
 
14
 # the cookie set by vimeo contains all the following values,
17
 # the cookie set by vimeo contains all the following values,
15
 # but only __utma seems to be required
18
 # but only __utma seems to be required
40
         url = base_url + result.xpath(url_xpath)[0]
43
         url = base_url + result.xpath(url_xpath)[0]
41
         title = p.unescape(extract_text(result.xpath(title_xpath)))
44
         title = p.unescape(extract_text(result.xpath(title_xpath)))
42
         thumbnail = extract_text(result.xpath(content_xpath)[0])
45
         thumbnail = extract_text(result.xpath(content_xpath)[0])
46
+        publishedDate = parser.parse(extract_text(result.xpath(publishedDate_xpath)[0]))
47
+
43
         results.append({'url': url,
48
         results.append({'url': url,
44
                         'title': title,
49
                         'title': title,
45
                         'content': content_tpl.format(url, title, thumbnail),
50
                         'content': content_tpl.format(url, title, thumbnail),
46
                         'template': 'videos.html',
51
                         'template': 'videos.html',
52
+                        'publishedDate': publishedDate,
47
                         'thumbnail': thumbnail})
53
                         'thumbnail': thumbnail})
48
     return results
54
     return results

+ 2
- 3
searx/engines/yahoo_news.py View File

6
 from searx.engines.yahoo import parse_url
6
 from searx.engines.yahoo import parse_url
7
 from datetime import datetime, timedelta
7
 from datetime import datetime, timedelta
8
 import re
8
 import re
9
+from dateutil import parser
9
 
10
 
10
 categories = ['news']
11
 categories = ['news']
11
 search_url = 'http://news.search.yahoo.com/search?{query}&b={offset}'
12
 search_url = 'http://news.search.yahoo.com/search?{query}&b={offset}'
52
                     - timedelta(hours=int(timeNumbers[0]))\
53
                     - timedelta(hours=int(timeNumbers[0]))\
53
                     - timedelta(minutes=int(timeNumbers[1]))
54
                     - timedelta(minutes=int(timeNumbers[1]))
54
             else:
55
             else:
55
-                # TODO year in string possible?
56
-                publishedDate = datetime.strptime(publishedDate,
57
-                                                  "%b %d %H:%M%p")
56
+                publishedDate = parser.parse(publishedDate)
58
 
57
 
59
         if publishedDate.year == 1900:
58
         if publishedDate.year == 1900:
60
             publishedDate = publishedDate.replace(year=datetime.now().year)
59
             publishedDate = publishedDate.replace(year=datetime.now().year)

+ 7
- 0
searx/engines/youtube.py View File

1
 from json import loads
1
 from json import loads
2
 from urllib import urlencode
2
 from urllib import urlencode
3
+from dateutil import parser
4
+from datetime import datetime
3
 
5
 
4
 categories = ['videos']
6
 categories = ['videos']
5
 
7
 
35
         content = ''
37
         content = ''
36
         thumbnail = ''
38
         thumbnail = ''
37
 
39
 
40
+#"2013-12-31T15:22:51.000Z"
41
+        pubdate = result['published']['$t']
42
+        publishedDate = parser.parse(pubdate)
43
+
38
         if result['media$group']['media$thumbnail']:
44
         if result['media$group']['media$thumbnail']:
39
             thumbnail = result['media$group']['media$thumbnail'][0]['url']
45
             thumbnail = result['media$group']['media$thumbnail'][0]['url']
40
             content += '<a href="{0}" title="{0}" ><img src="{1}" /></a>'.format(url, thumbnail)  # noqa
46
             content += '<a href="{0}" title="{0}" ><img src="{1}" /></a>'.format(url, thumbnail)  # noqa
48
                         'title': title,
54
                         'title': title,
49
                         'content': content,
55
                         'content': content,
50
                         'template': 'videos.html',
56
                         'template': 'videos.html',
57
+                        'publishedDate': publishedDate,
51
                         'thumbnail': thumbnail})
58
                         'thumbnail': thumbnail})
52
 
59
 
53
     return results
60
     return results

+ 1
- 0
searx/templates/opensearch_response_rss.xml View File

16
       <title>{{ r.title }}</title>
16
       <title>{{ r.title }}</title>
17
       <link>{{ r.url }}</link>
17
       <link>{{ r.url }}</link>
18
       <description>{{ r.content }}</description>
18
       <description>{{ r.content }}</description>
19
+      {% if r.pubdate %}<pubDate>{{ r.pubdate }}</pubDate>{% endif %}
19
     </item>
20
     </item>
20
     {% endfor %}
21
     {% endfor %}
21
   </channel>
22
   </channel>

+ 1
- 0
searx/templates/result_templates/videos.html View File

5
 
5
 
6
     <p>
6
     <p>
7
       <h3 class="result_title"><a href="{{ result.url }}">{{ result.title|safe }}</a></h3>
7
       <h3 class="result_title"><a href="{{ result.url }}">{{ result.title|safe }}</a></h3>
8
+      {% if result.publishedDate %}<p class="published_date">{{ result.publishedDate }}</p>{% endif %}
8
       <a href="{{ result.url }}"><img width="400px" src="{{ result.thumbnail }}" title={{ result.title }} alt=" {{ result.title }}"/></a>
9
       <a href="{{ result.url }}"><img width="400px" src="{{ result.thumbnail }}" title={{ result.title }} alt=" {{ result.title }}"/></a>
9
       <p class="url">{{ result.url }}</p>
10
       <p class="url">{{ result.url }}</p>
10
     </p>
11
     </p>

+ 3
- 2
searx/webapp.py View File

159
 
159
 
160
         # TODO, check if timezone is calculated right
160
         # TODO, check if timezone is calculated right
161
         if 'publishedDate' in result:
161
         if 'publishedDate' in result:
162
-            if result['publishedDate'] >= datetime.now() - timedelta(days=1):
163
-                timedifference = datetime.now() - result['publishedDate']
162
+            if result['publishedDate'].replace(tzinfo=None) >= datetime.now() - timedelta(days=1):
163
+                timedifference = datetime.now() - result['publishedDate'].replace(tzinfo=None)
164
                 minutes = int((timedifference.seconds / 60) % 60)
164
                 minutes = int((timedifference.seconds / 60) % 60)
165
                 hours = int(timedifference.seconds / 60 / 60)
165
                 hours = int(timedifference.seconds / 60 / 60)
166
                 if hours == 0:
166
                 if hours == 0:
168
                 else:
168
                 else:
169
                     result['publishedDate'] = gettext(u'{hours} hour(s), {minutes} minute(s) ago').format(hours=hours, minutes=minutes)  # noqa
169
                     result['publishedDate'] = gettext(u'{hours} hour(s), {minutes} minute(s) ago').format(hours=hours, minutes=minutes)  # noqa
170
             else:
170
             else:
171
+                result['pubdate'] = result['publishedDate'].strftime('%a, %d %b %Y %H:%M:%S %z')
171
                 result['publishedDate'] = format_date(result['publishedDate'])
172
                 result['publishedDate'] = format_date(result['publishedDate'])
172
 
173
 
173
     if search.request_data.get('format') == 'json':
174
     if search.request_data.get('format') == 'json':

+ 1
- 0
setup.py View File

35
         'lxml',
35
         'lxml',
36
         'pyyaml',
36
         'pyyaml',
37
         'setuptools',
37
         'setuptools',
38
+        'python-dateutil',
38
     ],
39
     ],
39
     extras_require={
40
     extras_require={
40
         'test': [
41
         'test': [