|
@@ -12,6 +12,8 @@
|
12
|
12
|
|
13
|
13
|
from lxml import html
|
14
|
14
|
from cgi import escape
|
|
15
|
+from dateutil import parser
|
|
16
|
+from datetime import datetime, timedelta
|
15
|
17
|
import re
|
16
|
18
|
from searx.engines.xpath import extract_text
|
17
|
19
|
|
|
@@ -79,15 +81,44 @@ def response(resp):
|
79
|
81
|
|
80
|
82
|
title = escape(extract_text(link))
|
81
|
83
|
|
82
|
|
- if result.xpath('./p[@class="desc"]'):
|
83
|
|
- content = escape(extract_text(result.xpath('./p[@class="desc"]')))
|
|
84
|
+ if result.xpath('./p[@class="desc clk"]'):
|
|
85
|
+ content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
|
84
|
86
|
else:
|
85
|
87
|
content = ''
|
86
|
88
|
|
87
|
|
- # append result
|
88
|
|
- results.append({'url': url,
|
89
|
|
- 'title': title,
|
90
|
|
- 'content': content})
|
|
89
|
+ published_date = None
|
|
90
|
+
|
|
91
|
+ # check if search result starts with something like: "2 Sep 2014 ... "
|
|
92
|
+ if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
|
|
93
|
+ date_pos = content.find('...')+4
|
|
94
|
+ date_string = content[0:date_pos-5]
|
|
95
|
+ published_date = parser.parse(date_string, dayfirst=True)
|
|
96
|
+
|
|
97
|
+ # fix content string
|
|
98
|
+ content = content[date_pos:]
|
|
99
|
+
|
|
100
|
+ # check if search result starts with something like: "5 days ago ... "
|
|
101
|
+ elif re.match("^[0-9]+ days? ago \.\.\. ", content):
|
|
102
|
+ date_pos = content.find('...')+4
|
|
103
|
+ date_string = content[0:date_pos-5]
|
|
104
|
+
|
|
105
|
+ # calculate datetime
|
|
106
|
+ published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
|
|
107
|
+
|
|
108
|
+ # fix content string
|
|
109
|
+ content = content[date_pos:]
|
|
110
|
+
|
|
111
|
+ if published_date:
|
|
112
|
+ # append result
|
|
113
|
+ results.append({'url': url,
|
|
114
|
+ 'title': title,
|
|
115
|
+ 'content': content,
|
|
116
|
+ 'publishedDate': published_date})
|
|
117
|
+ else:
|
|
118
|
+ # append result
|
|
119
|
+ results.append({'url': url,
|
|
120
|
+ 'title': title,
|
|
121
|
+ 'content': content})
|
91
|
122
|
|
92
|
123
|
# return results
|
93
|
124
|
return results
|