# utils.py
import cStringIO
import csv
import os
import re
from codecs import getincrementalencoder
from HTMLParser import HTMLParser
from htmlentitydefs import name2codepoint
from random import choice
  9. ua_versions = ('26.0', '27.0', '28.0')
  10. ua_os = ('Windows NT 6.3; WOW64',
  11. 'X11; Linux x86_64',
  12. 'X11; Linux x86')
  13. ua = "Mozilla/5.0 ({os}) Gecko/20100101 Firefox/{version}"
  14. def gen_useragent():
  15. # TODO
  16. return ua.format(os=choice(ua_os), version=choice(ua_versions))
  17. def highlight_content(content, query):
  18. if not content:
  19. return None
  20. # ignoring html contents
  21. # TODO better html content detection
  22. if content.find('<') != -1:
  23. return content
  24. query = query.decode('utf-8')
  25. if content.lower().find(query.lower()) > -1:
  26. query_regex = u'({0})'.format(re.escape(query))
  27. content = re.sub(query_regex, '<span class="highlight">\\1</span>',
  28. content, flags=re.I | re.U)
  29. else:
  30. regex_parts = []
  31. for chunk in query.split():
  32. if len(chunk) == 1:
  33. regex_parts.append(u'\W+{0}\W+'.format(re.escape(chunk)))
  34. else:
  35. regex_parts.append(u'{0}'.format(re.escape(chunk)))
  36. query_regex = u'({0})'.format('|'.join(regex_parts))
  37. content = re.sub(query_regex, '<span class="highlight">\\1</span>',
  38. content, flags=re.I | re.U)
  39. return content
  40. class HTMLTextExtractor(HTMLParser):
  41. def __init__(self):
  42. HTMLParser.__init__(self)
  43. self.result = []
  44. def handle_data(self, d):
  45. self.result.append(d)
  46. def handle_charref(self, number):
  47. if number[0] in (u'x', u'X'):
  48. codepoint = int(number[1:], 16)
  49. else:
  50. codepoint = int(number)
  51. self.result.append(unichr(codepoint))
  52. def handle_entityref(self, name):
  53. #codepoint = htmlentitydefs.name2codepoint[name]
  54. #self.result.append(unichr(codepoint))
  55. self.result.append(name)
  56. def get_text(self):
  57. return u''.join(self.result)
  58. def html_to_text(html):
  59. s = HTMLTextExtractor()
  60. s.feed(html)
  61. return s.get_text()
  62. class UnicodeWriter:
  63. """
  64. A CSV writer which will write rows to CSV file "f",
  65. which is encoded in the given encoding.
  66. """
  67. def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
  68. # Redirect output to a queue
  69. self.queue = cStringIO.StringIO()
  70. self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
  71. self.stream = f
  72. self.encoder = getincrementalencoder(encoding)()
  73. def writerow(self, row):
  74. unicode_row = []
  75. for col in row:
  76. if type(col) == str or type(col) == unicode:
  77. unicode_row.append(col.encode('utf-8').strip())
  78. else:
  79. unicode_row.append(col)
  80. self.writer.writerow(unicode_row)
  81. # Fetch UTF-8 output from the queue ...
  82. data = self.queue.getvalue()
  83. data = data.decode("utf-8")
  84. # ... and reencode it into the target encoding
  85. data = self.encoder.encode(data)
  86. # write to the target stream
  87. self.stream.write(data)
  88. # empty queue
  89. self.queue.truncate(0)
  90. def writerows(self, rows):
  91. for row in rows:
  92. self.writerow(row)
  93. def get_themes(root):
  94. """Returns available themes list."""
  95. static_path = os.path.join(root, 'static')
  96. static_names = set(os.listdir(static_path))
  97. templates_path = os.path.join(root, 'templates')
  98. templates_names = set(os.listdir(templates_path))
  99. themes = []
  100. for name in static_names.intersection(templates_names):
  101. themes += [name]
  102. return static_path, templates_path, themes