yahoo.py 2.9KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106
  1. """
  2. Yahoo (Web)
  3. @website https://search.yahoo.com/web
  4. @provide-api yes (https://developer.yahoo.com/boss/search/),
  5. $0.80/1000 queries
  6. @using-api no (because pricing)
  7. @results HTML (using search portal)
  8. @stable no (HTML can change)
  9. @parse url, title, content, suggestion
  10. """
  11. from urllib import urlencode
  12. from urlparse import unquote
  13. from lxml import html
  14. from searx.engines.xpath import extract_text, extract_url
  15. # engine dependent config
  16. categories = ['general']
  17. paging = True
  18. language_support = True
  19. # search-url
  20. base_url = 'https://search.yahoo.com/'
  21. search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}'
  22. # specific xpath variables
  23. results_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' Sr ')]"
  24. url_xpath = './/h3/a/@href'
  25. title_xpath = './/h3/a'
  26. content_xpath = './/div[@class="compText aAbs"]'
  27. suggestion_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' AlsoTry ')]//a"
  28. # remove yahoo-specific tracking-url
  29. def parse_url(url_string):
  30. endings = ['/RS', '/RK']
  31. endpositions = []
  32. start = url_string.find('http', url_string.find('/RU=') + 1)
  33. for ending in endings:
  34. endpos = url_string.rfind(ending)
  35. if endpos > -1:
  36. endpositions.append(endpos)
  37. if start == 0 or len(endpositions) == 0:
  38. return url_string
  39. else:
  40. end = min(endpositions)
  41. return unquote(url_string[start:end])
  42. # do search-request
  43. def request(query, params):
  44. offset = (params['pageno'] - 1) * 10 + 1
  45. if params['language'] == 'all':
  46. language = 'en'
  47. else:
  48. language = params['language'].split('_')[0]
  49. params['url'] = base_url + search_url.format(offset=offset,
  50. query=urlencode({'p': query}),
  51. lang=language)
  52. # TODO required?
  53. params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
  54. .format(lang=language)
  55. return params
  56. # get response from search-request
  57. def response(resp):
  58. results = []
  59. dom = html.fromstring(resp.text)
  60. # parse results
  61. for result in dom.xpath(results_xpath):
  62. try:
  63. url = parse_url(extract_url(result.xpath(url_xpath), search_url))
  64. title = extract_text(result.xpath(title_xpath)[0])
  65. except:
  66. continue
  67. content = extract_text(result.xpath(content_xpath)[0])
  68. # append result
  69. results.append({'url': url,
  70. 'title': title,
  71. 'content': content})
  72. # if no suggestion found, return results
  73. suggestions = dom.xpath(suggestion_xpath)
  74. if not suggestions:
  75. return results
  76. # parse suggestion
  77. for suggestion in suggestions:
  78. # append suggestion
  79. results.append({'suggestion': extract_text(suggestion)})
  80. # return results
  81. return results