|  | @@ -16,6 +16,7 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
 | 
	
		
			
			| 16 | 16 |  '''
 | 
	
		
			
			| 17 | 17 |  
 | 
	
		
			
			| 18 | 18 |  import re
 | 
	
		
			
			|  | 19 | +from urlparse import urlparse
 | 
	
		
			
			| 19 | 20 |  from lxml import etree
 | 
	
		
			
			| 20 | 21 |  from os import listdir
 | 
	
		
			
			| 21 | 22 |  from os.path import isfile, isdir, join
 | 
	
	
		
			
			|  | @@ -86,15 +87,23 @@ def load_single_https_ruleset(filepath):
 | 
	
		
			
			| 86 | 87 |  
 | 
	
		
			
			| 87 | 88 |              # TODO hack, which convert a javascript regex group
 | 
	
		
			
			| 88 | 89 |              # into a valid python regex group
 | 
	
		
			
			| 89 |  | -            rule_from = ruleset.attrib.get('from').replace('$', '\\')
 | 
	
		
			
			| 90 |  | -            rule_to = ruleset.attrib.get('to').replace('$', '\\')
 | 
	
		
			
			|  | 90 | +            rule_from = ruleset.attrib['from'].replace('$', '\\')
 | 
	
		
			
			|  | 91 | +            if rule_from.endswith('\\'):
 | 
	
		
			
			|  | 92 | +                rule_from = rule_from[:-1]+'$'
 | 
	
		
			
			|  | 93 | +            rule_to = ruleset.attrib['to'].replace('$', '\\')
 | 
	
		
			
			|  | 94 | +            if rule_to.endswith('\\'):
 | 
	
		
			
			|  | 95 | +                rule_to = rule_to[:-1]+'$'
 | 
	
		
			
			| 91 | 96 |  
 | 
	
		
			
			| 92 | 97 |              # TODO, not working yet because of the hack above,
 | 
	
		
			
			| 93 | 98 |              # currently doing that in webapp.py
 | 
	
		
			
			| 94 | 99 |              # rule_from_rgx = re.compile(rule_from, re.I)
 | 
	
		
			
			| 95 | 100 |  
 | 
	
		
			
			| 96 | 101 |              # append rule
 | 
	
		
			
			| 97 |  | -            rules.append((rule_from, rule_to))
 | 
	
		
			
			|  | 102 | +            try:
 | 
	
		
			
			|  | 103 | +                rules.append((re.compile(rule_from, re.I | re.U), rule_to))
 | 
	
		
			
			|  | 104 | +            except:
 | 
	
		
			
			|  | 105 | +                # TODO log regex error
 | 
	
		
			
			|  | 106 | +                continue
 | 
	
		
			
			| 98 | 107 |  
 | 
	
		
			
			| 99 | 108 |          # this child define an exclusion
 | 
	
		
			
			| 100 | 109 |          elif ruleset.tag == 'exclusion':
 | 
	
	
		
			
			|  | @@ -143,3 +152,56 @@ def load_https_rules(rules_path):
 | 
	
		
			
			| 143 | 152 |          https_rules.append(ruleset)
 | 
	
		
			
			| 144 | 153 |  
 | 
	
		
			
			| 145 | 154 |      print(' * {n} https-rules loaded'.format(n=len(https_rules)))
 | 
	
		
			
			|  | 155 | +
 | 
	
		
			
			|  | 156 | +
 | 
	
		
			
			|  | 157 | +
 | 
	
		
			
			|  | 158 | +def https_url_rewrite(result):
 | 
	
		
			
			|  | 159 | +    skip_https_rewrite = False
 | 
	
		
			
			|  | 160 | +    # check if HTTPS rewrite is possible
 | 
	
		
			
			|  | 161 | +    for target, rules, exclusions in https_rules:
 | 
	
		
			
			|  | 162 | +
 | 
	
		
			
			|  | 163 | +        # check if target regex match with url
 | 
	
		
			
			|  | 164 | +        if target.match(result['parsed_url'].netloc):
 | 
	
		
			
			|  | 165 | +            # process exclusions
 | 
	
		
			
			|  | 166 | +            for exclusion in exclusions:
 | 
	
		
			
			|  | 167 | +                # check if exclusion match with url
 | 
	
		
			
			|  | 168 | +                if exclusion.match(result['url']):
 | 
	
		
			
			|  | 169 | +                    skip_https_rewrite = True
 | 
	
		
			
			|  | 170 | +                    break
 | 
	
		
			
			|  | 171 | +
 | 
	
		
			
			|  | 172 | +            # skip https rewrite if required
 | 
	
		
			
			|  | 173 | +            if skip_https_rewrite:
 | 
	
		
			
			|  | 174 | +                break
 | 
	
		
			
			|  | 175 | +
 | 
	
		
			
			|  | 176 | +            # process rules
 | 
	
		
			
			|  | 177 | +            for rule in rules:
 | 
	
		
			
			|  | 178 | +                try:
 | 
	
		
			
			|  | 179 | +                    new_result_url = rule[0].sub(rule[1], result['url'])
 | 
	
		
			
			|  | 180 | +                except:
 | 
	
		
			
			|  | 181 | +                    break
 | 
	
		
			
			|  | 182 | +
 | 
	
		
			
			|  | 183 | +                # parse new url
 | 
	
		
			
			|  | 184 | +                new_parsed_url = urlparse(new_result_url)
 | 
	
		
			
			|  | 185 | +
 | 
	
		
			
			|  | 186 | +                # continiue if nothing was rewritten
 | 
	
		
			
			|  | 187 | +                if result['url'] == new_result_url:
 | 
	
		
			
			|  | 188 | +                    continue
 | 
	
		
			
			|  | 189 | +
 | 
	
		
			
			|  | 190 | +                # get domainname from result
 | 
	
		
			
			|  | 191 | +                # TODO, does only work correct with TLD's like
 | 
	
		
			
			|  | 192 | +                #  asdf.com, not for asdf.com.de
 | 
	
		
			
			|  | 193 | +                # TODO, using publicsuffix instead of this rewrite rule
 | 
	
		
			
			|  | 194 | +                old_result_domainname = '.'.join(
 | 
	
		
			
			|  | 195 | +                    result['parsed_url'].hostname.split('.')[-2:])
 | 
	
		
			
			|  | 196 | +                new_result_domainname = '.'.join(
 | 
	
		
			
			|  | 197 | +                    new_parsed_url.hostname.split('.')[-2:])
 | 
	
		
			
			|  | 198 | +
 | 
	
		
			
			|  | 199 | +                # check if rewritten hostname is the same,
 | 
	
		
			
			|  | 200 | +                # to protect against wrong or malicious rewrite rules
 | 
	
		
			
			|  | 201 | +                if old_result_domainname == new_result_domainname:
 | 
	
		
			
			|  | 202 | +                    # set new url
 | 
	
		
			
			|  | 203 | +                    result['url'] = new_result_url
 | 
	
		
			
			|  | 204 | +
 | 
	
		
			
			|  | 205 | +            # target has matched, do not search over the other rules
 | 
	
		
			
			|  | 206 | +            break
 | 
	
		
			
			|  | 207 | +    return result
 |