Improve publishedDate extraction and output

11 年前 · 018a14431b
--- a/requirements.txt
+++ b/requirements.txt
 
															 grequests
														
 
															 lxml
														
 
															 pyyaml
														
 
															+python-dateutil
														
--- a/searx/engines/google_news.py
+++ b/searx/engines/google_news.py
 
															 from urllib import urlencode
														
 
															 from json import loads
														
 
															+from dateutil import parser
														
 
															 from datetime import datetime
														
 
															 categories = ['news']
														
 
															         return []
														
 
															     for result in search_res['responseData']['results']:
														
 
															-# S.149 (159), library.pdf
														
 
															-# datetime.strptime("Mon, 10 Mar 2014 16:26:15 -0700",
														
 
															-#                   "%a, %d %b %Y %H:%M:%S %z")
														
 
															-#        publishedDate = parse(result['publishedDate'])
														
 
															-        publishedDate = datetime.strptime(
														
 
															-            str.join(' ', result['publishedDate'].split(None)[0:5]),
														
 
															-            "%a, %d %b %Y %H:%M:%S")
														
 
															-        #utc_offset = timedelta(result['publishedDate'].split(None)[5])
														
 
															-        # local = utc + offset
														
 
															-        #publishedDate = publishedDate + utc_offset
														
 
															+
														
 
															+# Mon, 10 Mar 2014 16:26:15 -0700
														
 
															+        publishedDate = parser.parse(result['publishedDate'])
														
 
															         results.append({'url': result['unescapedUrl'],
														
 
															                         'title': result['titleNoFormatting'],
														
--- a/searx/engines/vimeo.py
+++ b/searx/engines/vimeo.py
 
															 from HTMLParser import HTMLParser
														
 
															 from lxml import html
														
 
															 from xpath import extract_text
														
 
															+from datetime import datetime
														
 
															+from dateutil import parser
														
 
															 base_url = 'http://vimeo.com'
														
 
															 search_url = base_url + '/search?{query}'
														
 
															 title_xpath = None
														
 
															 results_xpath = ''
														
 
															 content_tpl = '<a href="{0}">  <img src="{2}"/> </a>'
														
 
															+publishedDate_xpath = './/p[@class="meta"]//attribute::datetime'
														
 
															 # the cookie set by vimeo contains all the following values,
														
 
															 # but only __utma seems to be required
														
 
															         url = base_url + result.xpath(url_xpath)[0]
														
 
															         title = p.unescape(extract_text(result.xpath(title_xpath)))
														
 
															         thumbnail = extract_text(result.xpath(content_xpath)[0])
														
 
															+        publishedDate = parser.parse(extract_text(result.xpath(publishedDate_xpath)[0]))
														
 
															+
														
 
															         results.append({'url': url,
														
 
															                         'title': title,
														
 
															                         'content': content_tpl.format(url, title, thumbnail),
														
 
															                         'template': 'videos.html',
														
 
															+                        'publishedDate': publishedDate,
														
 
															                         'thumbnail': thumbnail})
														
 
															     return results
														
--- a/searx/engines/yahoo_news.py
+++ b/searx/engines/yahoo_news.py
 
															 from searx.engines.yahoo import parse_url
														
 
															 from datetime import datetime, timedelta
														
 
															 import re
														
 
															+from dateutil import parser
														
 
															 categories = ['news']
														
 
															 search_url = 'http://news.search.yahoo.com/search?{query}&b={offset}'
														
 
															                     - timedelta(hours=int(timeNumbers[0]))\
														
 
															                     - timedelta(minutes=int(timeNumbers[1]))
														
 
															             else:
														
 
															-                # TODO year in string possible?
														
 
															-                publishedDate = datetime.strptime(publishedDate,
														
 
															-                                                  "%b %d %H:%M%p")
														
 
															+                publishedDate = parser.parse(publishedDate)
														
 
															         if publishedDate.year == 1900:
														
 
															             publishedDate = publishedDate.replace(year=datetime.now().year)
														
--- a/searx/engines/youtube.py
+++ b/searx/engines/youtube.py
 
															 from json import loads
														
 
															 from urllib import urlencode
														
 
															+from dateutil import parser
														
 
															+from datetime import datetime
														
 
															 categories = ['videos']
														
 
															         content = ''
														
 
															         thumbnail = ''
														
 
															+#"2013-12-31T15:22:51.000Z"
														
 
															+        pubdate = result['published']['$t']
														
 
															+        publishedDate = parser.parse(pubdate)
														
 
															+
														
 
															         if result['media$group']['media$thumbnail']:
														
 
															             thumbnail = result['media$group']['media$thumbnail'][0]['url']
														
 
															             content += '<a href="{0}" title="{0}" ><img src="{1}" /></a>'.format(url, thumbnail)  # noqa
														
 
															                         'title': title,
														
 
															                         'content': content,
														
 
															                         'template': 'videos.html',
														
 
															+                        'publishedDate': publishedDate,
														
 
															                         'thumbnail': thumbnail})
														
 
															     return results
														
--- a/searx/templates/opensearch_response_rss.xml
+++ b/searx/templates/opensearch_response_rss.xml
 
															       <title>{{ r.title }}</title>
														
 
															       <link>{{ r.url }}</link>
														
 
															       <description>{{ r.content }}</description>
														
 
															+      {% if r.pubdate %}<pubDate>{{ r.pubdate }}</pubDate>{% endif %}
														
 
															     </item>
														
 
															     {% endfor %}
														
 
															   </channel>
														
--- a/searx/templates/result_templates/videos.html
+++ b/searx/templates/result_templates/videos.html
 
															     <p>
														
 
															       <h3 class="result_title"><a href="{{ result.url }}">{{ result.title|safe }}</a></h3>
														
 
															+      {% if result.publishedDate %}<p class="published_date">{{ result.publishedDate }}</p>{% endif %}
														
 
															       <a href="{{ result.url }}"><img width="400px" src="{{ result.thumbnail }}" title={{ result.title }} alt=" {{ result.title }}"/></a>
														
 
															       <p class="url">{{ result.url }}</p>
														
 
															     </p>
														
--- a/searx/webapp.py
+++ b/searx/webapp.py
 
															         # TODO, check if timezone is calculated right
														
 
															         if 'publishedDate' in result:
														
 
															-            if result['publishedDate'] >= datetime.now() - timedelta(days=1):
														
 
															-                timedifference = datetime.now() - result['publishedDate']
														
 
															+            if result['publishedDate'].replace(tzinfo=None) >= datetime.now() - timedelta(days=1):
														
 
															+                timedifference = datetime.now() - result['publishedDate'].replace(tzinfo=None)
														
 
															                 minutes = int((timedifference.seconds / 60) % 60)
														
 
															                 hours = int(timedifference.seconds / 60 / 60)
														
 
															                 if hours == 0:
														
 
															                 else:
														
 
															                     result['publishedDate'] = gettext(u'{hours} hour(s), {minutes} minute(s) ago').format(hours=hours, minutes=minutes)  # noqa
														
 
															             else:
														
 
															+                result['pubdate'] = result['publishedDate'].strftime('%a, %d %b %Y %H:%M:%S %z')
														
 
															                 result['publishedDate'] = format_date(result['publishedDate'])
														
 
															     if search.request_data.get('format') == 'json':
														
--- a/setup.py
+++ b/setup.py
 
															         'lxml',
														
 
															         'pyyaml',
														
 
															         'setuptools',
														
 
															+        'python-dateutil',
														
 
															     ],
														
 
															     extras_require={
														
 
															         'test': [