|
@@ -57,12 +57,16 @@ def response(resp):
|
57
|
57
|
link = result.xpath('.//div[@class="newstitle"]/a')[0]
|
58
|
58
|
url = link.attrib.get('href')
|
59
|
59
|
title = ' '.join(link.xpath('.//text()'))
|
60
|
|
- contentXPath = result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]//text()')
|
|
60
|
+ contentXPath = result.xpath('.//div[@class="sn_txt"]/div'
|
|
61
|
+ '//span[@class="sn_snip"]//text()')
|
61
|
62
|
if contentXPath is not None:
|
62
|
63
|
content = escape(' '.join(contentXPath))
|
63
|
64
|
|
64
|
65
|
# parse publishedDate
|
65
|
|
- publishedDateXPath = result.xpath('.//div[@class="sn_txt"]/div//span[contains(@class,"sn_ST")]//span[contains(@class,"sn_tm")]//text()')
|
|
66
|
+ publishedDateXPath = result.xpath('.//div[@class="sn_txt"]/div'
|
|
67
|
+ '//span[contains(@class,"sn_ST")]'
|
|
68
|
+ '//span[contains(@class,"sn_tm")]'
|
|
69
|
+ '//text()')
|
66
|
70
|
if publishedDateXPath is not None:
|
67
|
71
|
publishedDate = escape(' '.join(publishedDateXPath))
|
68
|
72
|
|
|
@@ -74,7 +78,8 @@ def response(resp):
|
74
|
78
|
timeNumbers = re.findall(r'\d+', publishedDate)
|
75
|
79
|
publishedDate = datetime.now()\
|
76
|
80
|
- timedelta(hours=int(timeNumbers[0]))
|
77
|
|
- elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
|
|
81
|
+ elif re.match("^[0-9]+ hour(s|),"
|
|
82
|
+ " [0-9]+ minute(s|) ago$", publishedDate):
|
78
|
83
|
timeNumbers = re.findall(r'\d+', publishedDate)
|
79
|
84
|
publishedDate = datetime.now()\
|
80
|
85
|
- timedelta(hours=int(timeNumbers[0]))\
|