# yandex.py
"""
Yandex (Web)
@website https://yandex.ru/
@provide-api ?
@using-api no
@results HTML (using search portal)
@stable no (HTML can change)
@parse url, title, content
"""
try:
    from urllib.parse import urlencode
except ImportError:  # Python 2
    from urllib import urlencode

from lxml import html

from searx.search import logger

logger = logger.getChild('yandex engine')
  14. # engine dependent config
  15. categories = ['general']
  16. paging = True
  17. language_support = True # TODO
  18. default_tld = 'com'
  19. language_map = {'ru': 'ru',
  20. 'ua': 'uk',
  21. 'tr': 'com.tr'}
  22. # search-url
  23. base_url = 'https://yandex.{tld}/'
  24. search_url = 'search/?{query}&p={page}'
  25. results_xpath = '//li[@class="serp-item"]'
  26. url_xpath = './/h2/a/@href'
  27. title_xpath = './/h2/a//text()'
  28. content_xpath = './/div[@class="text-container typo typo_text_m typo_line_m organic__text"]//text()'
  29. def request(query, params):
  30. lang = params['language'].split('_')[0]
  31. host = base_url.format(tld=language_map.get(lang) or default_tld)
  32. params['url'] = host + search_url.format(page=params['pageno'] - 1,
  33. query=urlencode({'text': query}))
  34. return params
  35. # get response from search-request
  36. def response(resp):
  37. dom = html.fromstring(resp.text)
  38. results = []
  39. for result in dom.xpath(results_xpath):
  40. try:
  41. res = {'url': result.xpath(url_xpath)[0],
  42. 'title': ''.join(result.xpath(title_xpath)),
  43. 'content': ''.join(result.xpath(content_xpath))}
  44. except:
  45. logger.exception('yandex parse crash')
  46. continue
  47. results.append(res)
  48. return results