tokyotoshokan.py

  1. """
  2. Tokyo Toshokan (A BitTorrent Library for Japanese Media)
  3. @website https://www.tokyotosho.info/
  4. @provide-api no
  5. @using-api no
  6. @results HTML
  7. @stable no (HTML can change)
  8. @parse url, title, publishedDate, seed, leech,
  9. filesize, magnetlink, content
  10. """
  11. import re
  12. from cgi import escape
  13. from urllib import urlencode
  14. from lxml import html
  15. from searx.engines.xpath import extract_text
  16. from datetime import datetime
  17. from searx.engines.nyaa import int_or_zero, get_filesize_mul
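
# The two helpers imported from the nyaa engine are assumed to behave roughly
# as follows (a sketch; the real implementations live in searx/engines/nyaa.py):
#   int_or_zero('15')       -> 15, falling back to 0 for unparseable input
#   get_filesize_mul('GB')  -> the multiplier that converts a size expressed
#                              in the given unit into bytes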

# engine dependent config
categories = ['files', 'videos', 'music']
paging = True

# search-url
base_url = 'https://www.tokyotosho.info/'
search_url = base_url + 'search.php?{query}'


# do search-request
def request(query, params):
    query = urlencode({'page': params['pageno'],
                       'terms': query})
    params['url'] = search_url.format(query=query)
    return params
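
# Example (illustrative values, not captured from a live run): a first-page
# search for 'madoka' should produce a URL along the lines of
#   request('madoka', {'pageno': 1})
#   params['url'] == 'https://www.tokyotosho.info/search.php?page=1&terms=madoka'
# (the exact parameter order depends on dict iteration order in urlencode)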


# get response from search-request
def response(resp):
    results = []

    dom = html.fromstring(resp.text)
    rows = dom.xpath('//table[@class="listing"]//tr[contains(@class, "category_0")]')

    # bail out if there are no results, or if the page layout has changed so
    # that we cannot parse it; currently each result spans two rows, so the
    # total row count must be even
    if len(rows) == 0 or len(rows) % 2 != 0:
        return []

    # regular expression for parsing torrent size strings
    size_re = re.compile(r'Size:\s*([\d.]+)(TB|GB|MB|B)', re.IGNORECASE)
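    # Note: the unit must directly follow the number, so 'Size: 1.228GB'
    # matches (groups ('1.228', 'GB')) while 'Size: 1.228 GB' does not; the
    # site is assumed to render sizes without a space before the unit.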

    # process the results, two rows at a time
    for i in xrange(0, len(rows), 2):
        # parse the first row
        name_row = rows[i]

        links = name_row.xpath('./td[@class="desc-top"]/a')
        params = {
            'template': 'torrent.html',
            'url': links[-1].attrib.get('href'),
            'title': extract_text(links[-1])
        }
        # I have not yet seen any torrents without magnet links, but
        # it's better to be prepared to stumble upon one some day
        if len(links) == 2:
            magnet = links[0].attrib.get('href')
            if magnet.startswith('magnet'):
                # okay, we have a valid magnet link, let's add it to the result
                params['magnetlink'] = magnet

        # no more info in the first row, start parsing the second one
        info_row = rows[i + 1]
        desc = extract_text(info_row.xpath('./td[@class="desc-bot"]')[0])
        for item in desc.split('|'):
            item = item.strip()
            if item.startswith('Size:'):
                try:
                    # e.g. 'Size: 1.228GB' -> ('1.228', 'GB')
                    groups = size_re.match(item).groups()
                    multiplier = get_filesize_mul(groups[1])
                    params['filesize'] = int(multiplier * float(groups[0]))
                except Exception:
                    pass
            elif item.startswith('Date:'):
                try:
                    # e.g. 'Date: 2016-02-21 21:44 UTC'
                    date = datetime.strptime(item, 'Date: %Y-%m-%d %H:%M UTC')
                    params['publishedDate'] = date
                except Exception:
                    pass
            elif item.startswith('Comment:'):
                params['content'] = item

        stats = info_row.xpath('./td[@class="stats"]/span')

        # seeders and leechers live in the first two spans; skip the cell if
        # the expected three-span layout has changed
        if len(stats) == 3:
            params['seed'] = int_or_zero(extract_text(stats[0]))
            params['leech'] = int_or_zero(extract_text(stats[1]))

        results.append(params)

    return results
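

# Minimal smoke test for response() against a hand-written HTML fragment that
# mimics the two-row listing layout parsed above. The fragment and its values
# are made up for illustration, not captured from the live site; running this
# still requires a searx checkout so the searx.engines imports above resolve.
if __name__ == '__main__':
    class FakeResponse(object):
        text = '''
            <table class="listing">
              <tr class="category_0">
                <td class="desc-top">
                  <a href="magnet:?xt=urn:btih:0000000000000000000000000000000000000000">magnet</a>
                  <a href="details.php?id=1">Example Torrent</a>
                </td>
              </tr>
              <tr class="category_0">
                <td class="desc-bot">Comment: example | Size: 1.228GB | Date: 2016-02-21 21:44 UTC</td>
                <td class="stats"><span>12</span><span>3</span><span>456</span></td>
              </tr>
            </table>'''

    for result in response(FakeResponse()):
        print(result)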