Browse Source

extract publishDate from vimeo

Thomas Pointhuber 11 years ago
parent
commit
993271bed3
2 changed files with 7 additions and 1 deletion
  1. 6
    0
      searx/engines/vimeo.py
  2. 1
    1
      searx/engines/yahoo_news.py

+ 6
- 0
searx/engines/vimeo.py View File

2
 from HTMLParser import HTMLParser
2
 from HTMLParser import HTMLParser
3
 from lxml import html
3
 from lxml import html
4
 from xpath import extract_text
4
 from xpath import extract_text
5
+from datetime import datetime
6
+from dateutil import parser
5
 
7
 
6
 base_url = 'http://vimeo.com'
8
 base_url = 'http://vimeo.com'
7
 search_url = base_url + '/search?{query}'
9
 search_url = base_url + '/search?{query}'
10
 title_xpath = None
12
 title_xpath = None
11
 results_xpath = ''
13
 results_xpath = ''
12
 content_tpl = '<a href="{0}">  <img src="{2}"/> </a>'
14
 content_tpl = '<a href="{0}">  <img src="{2}"/> </a>'
15
+publishedDate_xpath = './/p[@class="meta"]//attribute::datetime'
13
 
16
 
14
 # the cookie set by vimeo contains all the following values,
17
 # the cookie set by vimeo contains all the following values,
15
 # but only __utma seems to be required
18
 # but only __utma seems to be required
40
         url = base_url + result.xpath(url_xpath)[0]
43
         url = base_url + result.xpath(url_xpath)[0]
41
         title = p.unescape(extract_text(result.xpath(title_xpath)))
44
         title = p.unescape(extract_text(result.xpath(title_xpath)))
42
         thumbnail = extract_text(result.xpath(content_xpath)[0])
45
         thumbnail = extract_text(result.xpath(content_xpath)[0])
46
+        publishedDate = parser.parse(extract_text(result.xpath(publishedDate_xpath)[0]))
47
+
43
         results.append({'url': url,
48
         results.append({'url': url,
44
                         'title': title,
49
                         'title': title,
45
                         'content': content_tpl.format(url, title, thumbnail),
50
                         'content': content_tpl.format(url, title, thumbnail),
46
                         'template': 'videos.html',
51
                         'template': 'videos.html',
52
+                        'publishedDate': publishedDate,
47
                         'thumbnail': thumbnail})
53
                         'thumbnail': thumbnail})
48
     return results
54
     return results

+ 1
- 1
searx/engines/yahoo_news.py View File

53
                     - timedelta(hours=int(timeNumbers[0]))\
53
                     - timedelta(hours=int(timeNumbers[0]))\
54
                     - timedelta(minutes=int(timeNumbers[1]))
54
                     - timedelta(minutes=int(timeNumbers[1]))
55
             else:
55
             else:
56
-                publishedDate =parser.parse(publishedDate)
56
+                publishedDate = parser.parse(publishedDate)
57
 
57
 
58
         if publishedDate.year == 1900:
58
         if publishedDate.year == 1900:
59
             publishedDate = publishedDate.replace(year=datetime.now().year)
59
             publishedDate = publishedDate.replace(year=datetime.now().year)