filecrop.py 2.3KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273
  1. from json import loads
  2. from urllib import urlencode
  3. from searx.utils import html_to_text
  4. from HTMLParser import HTMLParser
  5. url = 'http://www.filecrop.com/'
  6. search_url = url + '/search.php?{query}&size_i=0&size_f=100000000&engine_r=1&engine_d=1&engine_e=1&engine_4=1&engine_m=1'
  7. class FilecropResultParser(HTMLParser):
  8. def __init__(self):
  9. HTMLParser.__init__(self)
  10. self.__start_processing = False
  11. self.results = []
  12. self.result = {}
  13. self.tr_counter = 0
  14. self.data_counter = 0
  15. def handle_starttag(self, tag, attrs):
  16. if tag == 'tr':
  17. if ('bgcolor', '#edeff5') in attrs or ('bgcolor', '#ffffff') in attrs:
  18. self.__start_processing = True
  19. if not self.__start_processing:
  20. return
  21. if tag == 'label':
  22. self.result['title'] = [attr[1] for attr in attrs if attr[0] == 'title'][0]
  23. elif tag == 'a' and ('rel', 'nofollow') in attrs and ('class', 'sourcelink') in attrs:
  24. if 'content' in self.result:
  25. self.result['content'] += [attr[1] for attr in attrs if attr[0] == 'title'][0]
  26. else:
  27. self.result['content'] = [attr[1] for attr in attrs if attr[0] == 'title'][0]
  28. self.result['content'] += ' '
  29. elif tag == 'a':
  30. self.result['url'] = url + [attr[1] for attr in attrs if attr[0] == 'href'][0]
  31. def handle_endtag(self, tag):
  32. if self.__start_processing is False:
  33. return
  34. if tag == 'tr':
  35. self.tr_counter += 1
  36. if self.tr_counter == 2:
  37. self.__start_processing = False
  38. self.tr_counter = 0
  39. self.data_counter = 0
  40. self.results.append(self.result)
  41. self.result = {}
  42. def handle_data(self, data):
  43. if not self.__start_processing:
  44. return
  45. if 'content' in self.result:
  46. self.result['content'] += data + ' '
  47. else:
  48. self.result['content'] = data + ' '
  49. self.data_counter += 1
  50. def request(query, params):
  51. params['url'] = search_url.format(query=urlencode({'w' :query}))
  52. return params
  53. def response(resp):
  54. parser = FilecropResultParser()
  55. parser.feed(resp.text)
  56. return parser.results