# https_rewrite.py
  1. '''
  2. searx is free software: you can redistribute it and/or modify
  3. it under the terms of the GNU Affero General Public License as published by
  4. the Free Software Foundation, either version 3 of the License, or
  5. (at your option) any later version.
  6. searx is distributed in the hope that it will be useful,
  7. but WITHOUT ANY WARRANTY; without even the implied warranty of
  8. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  9. GNU Affero General Public License for more details.
  10. You should have received a copy of the GNU Affero General Public License
  11. along with searx. If not, see < http://www.gnu.org/licenses/ >.
  12. (C) 2013- by Adam Tauber, <asciimoo@gmail.com>
  13. '''
  14. import re
  15. from urlparse import urlparse
  16. from lxml import etree
  17. from os import listdir
  18. from os.path import isfile, isdir, join
  19. from searx import logger
  20. logger = logger.getChild("https_rewrite")
  21. # https://gitweb.torproject.org/\
  22. # pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules
  23. # HTTPS rewrite rules
  24. https_rules = []
  25. # load single ruleset from a xml file
  26. def load_single_https_ruleset(filepath):
  27. ruleset = ()
  28. # init parser
  29. parser = etree.XMLParser()
  30. # load and parse xml-file
  31. try:
  32. tree = etree.parse(filepath, parser)
  33. except:
  34. # TODO, error message
  35. return ()
  36. # get root node
  37. root = tree.getroot()
  38. # check if root is a node with the name ruleset
  39. # TODO improve parsing
  40. if root.tag != 'ruleset':
  41. return ()
  42. # check if rule is deactivated by default
  43. if root.attrib.get('default_off'):
  44. return ()
  45. # check if rule does only work for specific platforms
  46. if root.attrib.get('platform'):
  47. return ()
  48. hosts = []
  49. rules = []
  50. exclusions = []
  51. # parse childs from ruleset
  52. for ruleset in root:
  53. # this child define a target
  54. if ruleset.tag == 'target':
  55. # check if required tags available
  56. if not ruleset.attrib.get('host'):
  57. continue
  58. # convert host-rule to valid regex
  59. host = ruleset.attrib.get('host')\
  60. .replace('.', '\.').replace('*', '.*')
  61. # append to host list
  62. hosts.append(host)
  63. # this child define a rule
  64. elif ruleset.tag == 'rule':
  65. # check if required tags available
  66. if not ruleset.attrib.get('from')\
  67. or not ruleset.attrib.get('to'):
  68. continue
  69. # TODO hack, which convert a javascript regex group
  70. # into a valid python regex group
  71. rule_from = ruleset.attrib['from'].replace('$', '\\')
  72. if rule_from.endswith('\\'):
  73. rule_from = rule_from[:-1]+'$'
  74. rule_to = ruleset.attrib['to'].replace('$', '\\')
  75. if rule_to.endswith('\\'):
  76. rule_to = rule_to[:-1]+'$'
  77. # TODO, not working yet because of the hack above,
  78. # currently doing that in webapp.py
  79. # rule_from_rgx = re.compile(rule_from, re.I)
  80. # append rule
  81. try:
  82. rules.append((re.compile(rule_from, re.I | re.U), rule_to))
  83. except:
  84. # TODO log regex error
  85. continue
  86. # this child define an exclusion
  87. elif ruleset.tag == 'exclusion':
  88. # check if required tags available
  89. if not ruleset.attrib.get('pattern'):
  90. continue
  91. exclusion_rgx = re.compile(ruleset.attrib.get('pattern'))
  92. # append exclusion
  93. exclusions.append(exclusion_rgx)
  94. # convert list of possible hosts to a simple regex
  95. # TODO compress regex to improve performance
  96. try:
  97. target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
  98. except:
  99. return ()
  100. # return ruleset
  101. return (target_hosts, rules, exclusions)
  102. # load all https rewrite rules
  103. def load_https_rules(rules_path):
  104. # check if directory exists
  105. if not isdir(rules_path):
  106. logger.error("directory not found: '" + rules_path + "'")
  107. return
  108. # search all xml files which are stored in the https rule directory
  109. xml_files = [join(rules_path, f)
  110. for f in listdir(rules_path)
  111. if isfile(join(rules_path, f)) and f[-4:] == '.xml']
  112. # load xml-files
  113. for ruleset_file in xml_files:
  114. # calculate rewrite-rules
  115. ruleset = load_single_https_ruleset(ruleset_file)
  116. # skip if no ruleset returned
  117. if not ruleset:
  118. continue
  119. # append ruleset
  120. https_rules.append(ruleset)
  121. logger.info('{n} rules loaded'.format(n=len(https_rules)))
  122. def https_url_rewrite(result):
  123. skip_https_rewrite = False
  124. # check if HTTPS rewrite is possible
  125. for target, rules, exclusions in https_rules:
  126. # check if target regex match with url
  127. if target.match(result['parsed_url'].netloc):
  128. # process exclusions
  129. for exclusion in exclusions:
  130. # check if exclusion match with url
  131. if exclusion.match(result['url']):
  132. skip_https_rewrite = True
  133. break
  134. # skip https rewrite if required
  135. if skip_https_rewrite:
  136. break
  137. # process rules
  138. for rule in rules:
  139. try:
  140. new_result_url = rule[0].sub(rule[1], result['url'])
  141. except:
  142. break
  143. # parse new url
  144. new_parsed_url = urlparse(new_result_url)
  145. # continiue if nothing was rewritten
  146. if result['url'] == new_result_url:
  147. continue
  148. # get domainname from result
  149. # TODO, does only work correct with TLD's like
  150. # asdf.com, not for asdf.com.de
  151. # TODO, using publicsuffix instead of this rewrite rule
  152. old_result_domainname = '.'.join(
  153. result['parsed_url'].hostname.split('.')[-2:])
  154. new_result_domainname = '.'.join(
  155. new_parsed_url.hostname.split('.')[-2:])
  156. # check if rewritten hostname is the same,
  157. # to protect against wrong or malicious rewrite rules
  158. if old_result_domainname == new_result_domainname:
  159. # set new url
  160. result['url'] = new_result_url
  161. # target has matched, do not search over the other rules
  162. break
  163. return result