twitter.py 2.3KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283
  1. """
  2. Twitter (Social media)
  3. @website https://twitter.com/
  4. @provide-api yes (https://dev.twitter.com/docs/using-search)
  5. @using-api no
  6. @results HTML (using search portal)
  7. @stable no (HTML can change)
  8. @parse url, title, content
  9. @todo publishedDate
  10. """
  11. from urlparse import urljoin
  12. from urllib import urlencode
  13. from lxml import html
  14. from datetime import datetime
  15. from searx.engines.xpath import extract_text
# engine dependent config
categories = ['social media']   # searx category this engine is listed under
language_support = True         # engine honours the user's language setting (via cookie)

# search-url
base_url = 'https://twitter.com/'
search_url = base_url + 'search?'   # query string is appended by request()

# specific xpath variables
# NOTE(review): these match Twitter's HTML search portal markup and will
# break silently when Twitter changes its page layout (see @stable no above)
results_xpath = '//li[@data-item-type="tweet"]'            # one node per tweet
link_xpath = './/small[@class="time"]//a'                  # permalink anchor of the tweet
title_xpath = './/span[contains(@class, "username")]'      # author username
content_xpath = './/p[contains(@class, "tweet-text")]'     # tweet body text
timestamp_xpath = './/span[contains(@class,"_timestamp")]' # carries unix time in @data-time
  28. # do search-request
  29. def request(query, params):
  30. params['url'] = search_url + urlencode({'q': query})
  31. # set language if specified
  32. if params['language'] != 'all':
  33. params['cookies']['lang'] = params['language'].split('_')[0]
  34. else:
  35. params['cookies']['lang'] = 'en'
  36. return params
  37. # get response from search-request
  38. def response(resp):
  39. results = []
  40. dom = html.fromstring(resp.text)
  41. # parse results
  42. for tweet in dom.xpath(results_xpath):
  43. try:
  44. link = tweet.xpath(link_xpath)[0]
  45. content = extract_text(tweet.xpath(content_xpath)[0])
  46. except Exception:
  47. continue
  48. url = urljoin(base_url, link.attrib.get('href'))
  49. title = extract_text(tweet.xpath(title_xpath))
  50. pubdate = tweet.xpath(timestamp_xpath)
  51. if len(pubdate) > 0:
  52. timestamp = float(pubdate[0].attrib.get('data-time'))
  53. publishedDate = datetime.fromtimestamp(timestamp, None)
  54. # append result
  55. results.append({'url': url,
  56. 'title': title,
  57. 'content': content,
  58. 'publishedDate': publishedDate})
  59. else:
  60. # append result
  61. results.append({'url': url,
  62. 'title': title,
  63. 'content': content})
  64. # return results
  65. return results