浏览代码

Implementing https rewrite support #71

* parsing XML-Files which contain target, exclusions and rules
* convert regex if required (this is a little hack and probably does not
work for all rules)
* check if target rule apply for http url, and use the rules to rewrite
it
* add piece of code to check that the domain name has not changed during
rewrite (should be rewritten, using publicsuffix instead of little hack)
Thomas Pointhuber 10 年前
父节点
当前提交
9b9f097adb
共有 3 个文件被更改,包括 187 次插入14 次删除
  1. 8
    0
      searx/__init__.py
  2. 133
    8
      searx/https_rewrite.py
  3. 46
    6
      searx/webapp.py

+ 8
- 0
searx/__init__.py 查看文件

@@ -1,5 +1,6 @@
1 1
 from os import environ
2 2
 from os.path import realpath, dirname, join, abspath
3
+from searx.https_rewrite import load_https_rules
3 4
 try:
4 5
     from yaml import load
5 6
 except:
@@ -15,6 +16,13 @@ if 'SEARX_SETTINGS_PATH' in environ:
15 16
 else:
16 17
     settings_path = join(searx_dir, 'settings.yml')
17 18
 
19
+if 'SEARX_HTTPS_REWRITE_PATH' in environ:
20
+    https_rewrite_path = environ['SEARX_HTTPS_REWRITE_PATH']
21
+else:
22
+    https_rewrite_path = join(searx_dir, 'https_rules')
18 23
 
19 24
 with open(settings_path) as settings_yaml:
20 25
     settings = load(settings_yaml)
26
+
27
+# load https rules
28
+load_https_rules(https_rewrite_path)

+ 133
- 8
searx/https_rewrite.py 查看文件

@@ -1,14 +1,139 @@
1
+'''
2
+searx is free software: you can redistribute it and/or modify
3
+it under the terms of the GNU Affero General Public License as published by
4
+the Free Software Foundation, either version 3 of the License, or
5
+(at your option) any later version.
6
+
7
+searx is distributed in the hope that it will be useful,
8
+but WITHOUT ANY WARRANTY; without even the implied warranty of
9
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
+GNU Affero General Public License for more details.
11
+
12
+You should have received a copy of the GNU Affero General Public License
13
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
14
+
15
+(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
16
+'''
17
+
1 18
 import re
19
+from lxml import etree
20
+from os import listdir
21
+from os.path import isfile, join
22
+
2 23
 
3 24
 # https://gitweb.torproject.org/\
4 25
 # pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules
5 26
 
6 27
 # HTTPS rewrite rules
7
-https_rules = (
8
-    # from
9
-    (re.compile(r'^http://(www\.|m\.|)?xkcd\.(?:com|org)/', re.I | re.U),
10
-     # to
11
-     r'https://\1xkcd.com/'),
12
-    (re.compile(r'^https?://(?:ssl)?imgs\.xkcd\.com/', re.I | re.U),
13
-     r'https://sslimgs.xkcd.com/'),
14
-)
28
+https_rules = []
29
+
30
+
31
# load single ruleset from an xml file
def load_single_https_ruleset(filepath):
    """Parse one HTTPS-Everywhere ruleset XML file.

    Returns a tuple ``(target_hosts, rules, exclusions)`` where
    ``target_hosts`` is a compiled regex matching every declared target
    host, ``rules`` is a list of ``(from, to)`` rewrite pattern strings
    and ``exclusions`` is a list of compiled exclusion regexes.
    Returns an empty tuple when the file cannot be parsed or the
    ruleset is unusable (wrong root tag, disabled, platform-specific,
    or without any target).
    """
    # init parser
    parser = etree.XMLParser()

    # load and parse xml-file
    try:
        tree = etree.parse(filepath, parser)
    except Exception:
        # catch Exception (not a bare except) so KeyboardInterrupt and
        # SystemExit still propagate
        # TODO, error message
        return ()

    # get root node
    root = tree.getroot()

    # check if root is a node with the name ruleset
    # TODO improve parsing
    if root.tag != 'ruleset':
        return ()

    # check if rule is deactivated by default
    if root.attrib.get('default_off'):
        return ()

    # check if rule does only work for specific platforms
    if root.attrib.get('platform'):
        return ()

    hosts = []
    rules = []
    exclusions = []

    # parse children of the ruleset
    # (distinct loop variable; do not shadow any outer name)
    for element in root:
        # this child defines a target
        if element.tag == 'target':
            # check if the required attribute is available
            if not element.attrib.get('host'):
                continue

            # convert host-rule to a valid regex
            host = element.attrib.get('host')\
                .replace('.', r'\.').replace('*', '.*')

            # append to host list
            hosts.append(host)

        # this child defines a rule
        elif element.tag == 'rule':
            # check if the required attributes are available
            if not element.attrib.get('from')\
               or not element.attrib.get('to'):
                continue

            # TODO hack, which converts a javascript regex group
            # into a valid python regex group
            rule_from = element.attrib.get('from').replace('$', '\\')
            rule_to = element.attrib.get('to').replace('$', '\\')

            # TODO, not working yet because of the hack above,
            # currently doing that in webapp.py
            # rule_from_rgx = re.compile(rule_from, re.I)

            # append rule
            rules.append((rule_from, rule_to))

        # this child defines an exclusion
        elif element.tag == 'exclusion':
            # check if the required attribute is available
            if not element.attrib.get('pattern'):
                continue

            exclusion_rgx = re.compile(element.attrib.get('pattern'))

            # append exclusion
            exclusions.append(exclusion_rgx)

    # a ruleset without any target would compile to '^()' which matches
    # every URL — reject it instead
    if not hosts:
        return ()

    # convert list of possible hosts to a simple regex
    # TODO compress regex to improve performance
    try:
        target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
    except re.error:
        # invalid host pattern — skip this ruleset
        return ()

    # return ruleset
    return (target_hosts, rules, exclusions)
118
+
119
+
120
# load all https rewrite rules
def load_https_rules(rules_path):
    """Load every ruleset XML file stored in *rules_path* and append the
    parsed rulesets to the module-level ``https_rules`` list.

    Files that do not end in ``.xml`` or that yield no usable ruleset
    are silently skipped.
    """
    # search all xml files which are stored in the https rule directory
    # (os.path.join inserts the separator itself, so the old hack of
    #  appending '/' to the path is unnecessary and has been removed)
    xml_files = [join(rules_path, f)
                 for f in listdir(rules_path)
                 if isfile(join(rules_path, f)) and f.endswith('.xml')]

    # load xml-files
    for ruleset_file in xml_files:
        # calculate rewrite-rules
        ruleset = load_single_https_ruleset(ruleset_file)

        # skip if no usable ruleset was returned
        if not ruleset:
            continue

        # append ruleset
        https_rules.append(ruleset)

+ 46
- 6
searx/webapp.py 查看文件

@@ -49,6 +49,9 @@ from searx.languages import language_codes
49 49
 from searx.search import Search
50 50
 from searx.autocomplete import backends as autocomplete_backends
51 51
 
52
+from urlparse import urlparse
53
+import re
54
+
52 55
 
53 56
 static_path, templates_path, themes =\
54 57
     get_themes(settings['themes_path']
@@ -197,16 +200,53 @@ def index():
197 200
         if not search.paging and engines[result['engine']].paging:
198 201
             search.paging = True
199 202
 
203
+        # check if HTTPS rewrite is required 
200 204
         if settings['server']['https_rewrite']\
201 205
            and result['parsed_url'].scheme == 'http':
202 206
 
203
-            for http_regex, https_url in https_rules:
204
-                if http_regex.match(result['url']):
205
-                    result['url'] = http_regex.sub(https_url, result['url'])
206
-                    # TODO result['parsed_url'].scheme
207
-                    break
207
+            skip_https_rewrite = False
208
+
209
+            # check if HTTPS rewrite is possible
210
+            for target, rules, exclusions in https_rules:
211
+
212
+                # check if target regex match with url
213
+                if target.match(result['url']):
214
+                    # process exclusions
215
+                    for exclusion in exclusions:
216
+                        # check if exclusion match with url
217
+                        if exclusion.match(result['url']):
218
+                            skip_https_rewrite = True
219
+                            break
220
+
221
+                    # skip https rewrite if required
222
+                    if skip_https_rewrite:
223
+                        break
224
+
225
+                    # process rules
226
+                    for rule in rules:
227
+                        # TODO, precompile rule
228
+                        p = re.compile(rule[0])
229
+                        # rewrite url if possible
230
+                        new_result_url = p.sub(rule[1], result['url'])
231
+
232
+                        # parse new url
233
+                        new_parsed_url = urlparse(new_result_url)
234
+
235
+                        # continue if nothing was rewritten
236
+                        if result['url'] == new_result_url:
237
+                            continue
238
+
239
+                        # get domainname from result
240
+                        # TODO, does only work correct with TLD's like asdf.com, not for asdf.com.de
241
+                        # TODO, using publicsuffix instead of this rewrite rule
242
+                        old_result_domainname = '.'.join(result['parsed_url'].hostname.split('.')[-2:])
243
+                        new_result_domainname = '.'.join(new_parsed_url.hostname.split('.')[-2:])
244
+
245
+                        # check if rewritten hostname is the same, to protect against wrong or malicious rewrite rules
246
+                        if old_result_domainname == new_result_domainname:
247
+                            # set new url
248
+                            result['url'] = new_result_url
208 249
 
209
-        # HTTPS rewrite
210 250
         if search.request_data.get('format', 'html') == 'html':
211 251
             if 'content' in result:
212 252
                 result['content'] = highlight_content(result['content'],