## Bing (News)
#
# @website https://www.bing.com/news
# @provide-api yes (http://datamarket.azure.com/dataset/bing/search), max. 5000 query/month
#
# @using-api no (because of query limit)
# @results HTML (using search portal)
# @stable no (HTML can change)
# @parse url, title, content, publishedDate
# NOTE(review): Python 2 imports -- `urllib.urlencode` and `cgi.escape`
# moved in Python 3 (to urllib.parse.urlencode and html.escape); this
# module appears to target the Python 2 interpreter searx ran on.
from urllib import urlencode
from cgi import escape
from lxml import html
from datetime import datetime, timedelta
from dateutil import parser
import re
  16. # engine dependent config
  17. categories = ['news']
  18. paging = True
  19. language_support = True
  20. # search-url
  21. base_url = 'https://www.bing.com/'
  22. search_string = 'news/search?{query}&first={offset}'
  23. # do search-request
  24. def request(query, params):
  25. offset = (params['pageno'] - 1) * 10 + 1
  26. if params['language'] == 'all':
  27. language = 'en-US'
  28. else:
  29. language = params['language'].replace('_', '-')
  30. search_path = search_string.format(
  31. query=urlencode({'q': query, 'setmkt': language}),
  32. offset=offset)
  33. params['cookies']['SRCHHPGUSR'] = \
  34. 'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0]
  35. params['url'] = base_url + search_path
  36. return params
  37. # get response from search-request
  38. def response(resp):
  39. results = []
  40. dom = html.fromstring(resp.content)
  41. # parse results
  42. for result in dom.xpath('//div[@class="sn_r"]'):
  43. link = result.xpath('.//div[@class="newstitle"]/a')[0]
  44. url = link.attrib.get('href')
  45. title = ' '.join(link.xpath('.//text()'))
  46. content = escape(' '.join(result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]//text()')))
  47. # parse publishedDate
  48. publishedDate = escape(' '.join(result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_ST"]//span[@class="sn_tm"]//text()')))
  49. if re.match("^[0-9]+ minute(s|) ago$", publishedDate):
  50. timeNumbers = re.findall(r'\d+', publishedDate)
  51. publishedDate = datetime.now()\
  52. - timedelta(minutes=int(timeNumbers[0]))
  53. elif re.match("^[0-9]+ hour(s|) ago$", publishedDate):
  54. timeNumbers = re.findall(r'\d+', publishedDate)
  55. publishedDate = datetime.now()\
  56. - timedelta(hours=int(timeNumbers[0]))
  57. elif re.match("^[0-9]+ hour(s|), [0-9]+ minute(s|) ago$", publishedDate):
  58. timeNumbers = re.findall(r'\d+', publishedDate)
  59. publishedDate = datetime.now()\
  60. - timedelta(hours=int(timeNumbers[0]))\
  61. - timedelta(minutes=int(timeNumbers[1]))
  62. else:
  63. publishedDate = parser.parse(publishedDate)
  64. # append result
  65. results.append({'url': url,
  66. 'title': title,
  67. 'publishedDate': publishedDate,
  68. 'content': content})
  69. # return results
  70. return results