浏览代码

Implementing https rewrite support #71

* parsing XML-Files which contain target, exclusions and rules
* convert regex if required (this is a little hack and probably does not
work for all rules)
* check if target rule apply for http url, and use the rules to rewrite
it
* add piece of code to check that the domain name has not changed during
rewrite (should be rewritten, using publicsuffix instead of little hack)
Thomas Pointhuber 10 年前
父节点
当前提交
9b9f097adb
共有 3 个文件被更改,包括 187 次插入14 次删除
  1. 8
    0
      searx/__init__.py
  2. 133
    8
      searx/https_rewrite.py
  3. 46
    6
      searx/webapp.py

+ 8
- 0
searx/__init__.py 查看文件

@@ -1,5 +1,6 @@
1 1
 from os import environ
2 2
 from os.path import realpath, dirname, join, abspath
3
+from searx.https_rewrite import load_https_rules
3 4
 try:
4 5
     from yaml import load
5 6
 except:
@@ -15,6 +16,13 @@ if 'SEARX_SETTINGS_PATH' in environ:
15 16
 else:
16 17
     settings_path = join(searx_dir, 'settings.yml')
17 18
 
19
+if 'SEARX_HTTPS_REWRITE_PATH' in environ:
20
+    https_rewrite_path = environ['SEARX_HTTPS_REWRITE_PATH']
21
+else:
22
+    https_rewrite_path = join(searx_dir, 'https_rules')
18 23
 
19 24
 with open(settings_path) as settings_yaml:
20 25
     settings = load(settings_yaml)
26
+
27
+# load https rules
28
+load_https_rules(https_rewrite_path)

+ 133
- 8
searx/https_rewrite.py 查看文件

@@ -1,14 +1,139 @@
1
+'''
2
+searx is free software: you can redistribute it and/or modify
3
+it under the terms of the GNU Affero General Public License as published by
4
+the Free Software Foundation, either version 3 of the License, or
5
+(at your option) any later version.
6
+
7
+searx is distributed in the hope that it will be useful,
8
+but WITHOUT ANY WARRANTY; without even the implied warranty of
9
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
+GNU Affero General Public License for more details.
11
+
12
+You should have received a copy of the GNU Affero General Public License
13
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
14
+
15
+(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
16
+'''
17
+
1 18
 import re
19
+from lxml import etree
20
+from os import listdir
21
+from os.path import isfile, join
22
+
2 23
 
3 24
 # https://gitweb.torproject.org/\
4 25
 # pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules
5 26
 
6 27
 # HTTPS rewrite rules
7
-https_rules = (
8
-    # from
9
-    (re.compile(r'^http://(www\.|m\.|)?xkcd\.(?:com|org)/', re.I | re.U),
10
-     # to
11
-     r'https://\1xkcd.com/'),
12
-    (re.compile(r'^https?://(?:ssl)?imgs\.xkcd\.com/', re.I | re.U),
13
-     r'https://sslimgs.xkcd.com/'),
14
-)
28
+https_rules = []
29
+
30
+
31
# load single ruleset from an xml file
def load_single_https_ruleset(filepath):
    """Parse one HTTPS-Everywhere ruleset XML file.

    Returns a tuple ``(target_hosts, rules, exclusions)`` where
    ``target_hosts`` is a compiled regex matching every declared target
    host, ``rules`` is a list of ``(from, to)`` rewrite pattern strings
    and ``exclusions`` is a list of compiled exclusion regexes.
    Returns an empty tuple when the file cannot be parsed or the
    ruleset is unusable (wrong root tag, disabled, platform-specific,
    or without any target).
    """
    # init parser
    parser = etree.XMLParser()

    # load and parse xml-file
    try:
        tree = etree.parse(filepath, parser)
    except Exception:
        # catch Exception (not a bare except) so KeyboardInterrupt and
        # SystemExit still propagate
        # TODO, error message
        return ()

    # get root node
    root = tree.getroot()

    # check if root is a node with the name ruleset
    # TODO improve parsing
    if root.tag != 'ruleset':
        return ()

    # check if rule is deactivated by default
    if root.attrib.get('default_off'):
        return ()

    # check if rule does only work for specific platforms
    if root.attrib.get('platform'):
        return ()

    hosts = []
    rules = []
    exclusions = []

    # parse children of the ruleset
    # (distinct loop variable; do not shadow any outer name)
    for element in root:
        # this child defines a target
        if element.tag == 'target':
            # check if the required attribute is available
            if not element.attrib.get('host'):
                continue

            # convert host-rule to a valid regex
            host = element.attrib.get('host')\
                .replace('.', r'\.').replace('*', '.*')

            # append to host list
            hosts.append(host)

        # this child defines a rule
        elif element.tag == 'rule':
            # check if the required attributes are available
            if not element.attrib.get('from')\
               or not element.attrib.get('to'):
                continue

            # TODO hack, which converts a javascript regex group
            # into a valid python regex group
            rule_from = element.attrib.get('from').replace('$', '\\')
            rule_to = element.attrib.get('to').replace('$', '\\')

            # TODO, not working yet because of the hack above,
            # currently doing that in webapp.py
            # rule_from_rgx = re.compile(rule_from, re.I)

            # append rule
            rules.append((rule_from, rule_to))

        # this child defines an exclusion
        elif element.tag == 'exclusion':
            # check if the required attribute is available
            if not element.attrib.get('pattern'):
                continue

            exclusion_rgx = re.compile(element.attrib.get('pattern'))

            # append exclusion
            exclusions.append(exclusion_rgx)

    # a ruleset without any target would compile to '^()' which matches
    # every URL — reject it instead
    if not hosts:
        return ()

    # convert list of possible hosts to a simple regex
    # TODO compress regex to improve performance
    try:
        target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
    except re.error:
        # invalid host pattern — skip this ruleset
        return ()

    # return ruleset
    return (target_hosts, rules, exclusions)
118
+
119
+
120
# load all https rewrite rules
def load_https_rules(rules_path):
    """Load every ruleset XML file stored in *rules_path* and append the
    parsed rulesets to the module-level ``https_rules`` list.

    Files that do not end in ``.xml`` or that yield no usable ruleset
    are silently skipped.
    """
    # search all xml files which are stored in the https rule directory
    # (os.path.join inserts the separator itself, so the old hack of
    #  appending '/' to the path is unnecessary and has been removed)
    xml_files = [join(rules_path, f)
                 for f in listdir(rules_path)
                 if isfile(join(rules_path, f)) and f.endswith('.xml')]

    # load xml-files
    for ruleset_file in xml_files:
        # calculate rewrite-rules
        ruleset = load_single_https_ruleset(ruleset_file)

        # skip if no usable ruleset was returned
        if not ruleset:
            continue

        # append ruleset
        https_rules.append(ruleset)

+ 46
- 6
searx/webapp.py 查看文件

@@ -49,6 +49,9 @@ from searx.languages import language_codes
49 49
 from searx.search import Search
50 50
 from searx.autocomplete import backends as autocomplete_backends
51 51
 
52
+from urlparse import urlparse
53
+import re
54
+
52 55
 
53 56
 static_path, templates_path, themes =\
54 57
     get_themes(settings['themes_path']
@@ -197,16 +200,53 @@ def index():
197 200
         if not search.paging and engines[result['engine']].paging:
198 201
             search.paging = True
199 202
 
203
+        # check if HTTPS rewrite is required 
200 204
         if settings['server']['https_rewrite']\
201 205
            and result['parsed_url'].scheme == 'http':
202 206
 
203
-            for http_regex, https_url in https_rules:
204
-                if http_regex.match(result['url']):
205
-                    result['url'] = http_regex.sub(https_url, result['url'])
206
-                    # TODO result['parsed_url'].scheme
207
-                    break
207
+            skip_https_rewrite = False
208
+
209
+            # check if HTTPS rewrite is possible
210
+            for target, rules, exclusions in https_rules:
211
+
212
+                # check if target regex match with url
213
+                if target.match(result['url']):
214
+                    # process exclusions
215
+                    for exclusion in exclusions:
216
+                        # check if exclusion match with url
217
+                        if exclusion.match(result['url']):
218
+                            skip_https_rewrite = True
219
+                            break
220
+
221
+                    # skip https rewrite if required
222
+                    if skip_https_rewrite:
223
+                        break
224
+
225
+                    # process rules
226
+                    for rule in rules:
227
+                        # TODO, precompile rule
228
+                        p = re.compile(rule[0])
229
+                        # rewrite url if possible
230
+                        new_result_url = p.sub(rule[1], result['url'])
231
+
232
+                        # parse new url
233
+                        new_parsed_url = urlparse(new_result_url)
234
+
235
+                        # continue if nothing was rewritten
236
+                        if result['url'] == new_result_url:
237
+                            continue
238
+
239
+                        # get domainname from result
240
+                        # TODO, does only work correct with TLD's like asdf.com, not for asdf.com.de
241
+                        # TODO, using publicsuffix instead of this rewrite rule
242
+                        old_result_domainname = '.'.join(result['parsed_url'].hostname.split('.')[-2:])
243
+                        new_result_domainname = '.'.join(new_parsed_url.hostname.split('.')[-2:])
244
+
245
+                        # check if rewritten hostname is the same, to protect against wrong or malicious rewrite rules
246
+                        if old_result_domainname == new_result_domainname:
247
+                            # set new url
248
+                            result['url'] = new_result_url
208 249
 
209
-        # HTTPS rewrite
210 250
         if search.request_data.get('format', 'html') == 'html':
211 251
             if 'content' in result:
212 252
                 result['content'] = highlight_content(result['content'],