소스 검색

Implementing https rewrite support #71

* parsing XML-Files which contain target, exclusions and rules
* convert regex if required (is a little hack, probably does not work
for all rules)
* check if target rule apply for http url, and use the rules to rewrite
it
* add piece of code, to check if domain name has not changed during
rewrite (should be rewritten, using publicsuffix instead of little hack)
Thomas Pointhuber 10 년 전
부모
커밋
9b9f097adb
3개의 변경된 파일187개의 추가작업 그리고 14개의 파일을 삭제
  1. 8
    0
      searx/__init__.py
  2. 133
    8
      searx/https_rewrite.py
  3. 46
    6
      searx/webapp.py

+ 8
- 0
searx/__init__.py 파일 보기

@@ -1,5 +1,6 @@
1 1
 from os import environ
2 2
 from os.path import realpath, dirname, join, abspath
3
+from searx.https_rewrite import load_https_rules
3 4
 try:
4 5
     from yaml import load
5 6
 except:
@@ -15,6 +16,13 @@ if 'SEARX_SETTINGS_PATH' in environ:
15 16
 else:
16 17
     settings_path = join(searx_dir, 'settings.yml')
17 18
 
19
+if 'SEARX_HTTPS_REWRITE_PATH' in environ:
20
+    https_rewrite_path = environ['SEARX_HTTPS_REWRITE_PATH']
21
+else:
22
+    https_rewrite_path = join(searx_dir, 'https_rules')
18 23
 
19 24
 with open(settings_path) as settings_yaml:
20 25
     settings = load(settings_yaml)
26
+
27
+# load https rules
28
+load_https_rules(https_rewrite_path)

+ 133
- 8
searx/https_rewrite.py 파일 보기

@@ -1,14 +1,139 @@
1
+'''
2
+searx is free software: you can redistribute it and/or modify
3
+it under the terms of the GNU Affero General Public License as published by
4
+the Free Software Foundation, either version 3 of the License, or
5
+(at your option) any later version.
6
+
7
+searx is distributed in the hope that it will be useful,
8
+but WITHOUT ANY WARRANTY; without even the implied warranty of
9
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
+GNU Affero General Public License for more details.
11
+
12
+You should have received a copy of the GNU Affero General Public License
13
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
14
+
15
+(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
16
+'''
17
+
1 18
 import re
19
+from lxml import etree
20
+from os import listdir
21
+from os.path import isfile, join
22
+
2 23
 
3 24
 # https://gitweb.torproject.org/\
4 25
 # pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules
5 26
 
6 27
 # HTTPS rewrite rules
7
-https_rules = (
8
-    # from
9
-    (re.compile(r'^http://(www\.|m\.|)?xkcd\.(?:com|org)/', re.I | re.U),
10
-     # to
11
-     r'https://\1xkcd.com/'),
12
-    (re.compile(r'^https?://(?:ssl)?imgs\.xkcd\.com/', re.I | re.U),
13
-     r'https://sslimgs.xkcd.com/'),
14
-)
28
+https_rules = []
29
+
30
+
31
+# load single ruleset from a xml file
32
+def load_single_https_ruleset(filepath):
33
+    ruleset = ()
34
+
35
+    # init parser
36
+    parser = etree.XMLParser()
37
+
38
+    # load and parse xml-file
39
+    try:
40
+        tree = etree.parse(filepath, parser)
41
+    except:
42
+        # TODO, error message
43
+        return ()
44
+
45
+    # get root node
46
+    root = tree.getroot()
47
+
48
+    #print(etree.tostring(tree))
49
+
50
+    # check if root is a node with the name ruleset
51
+    # TODO improve parsing
52
+    if root.tag != 'ruleset':        
53
+        return ()
54
+
55
+    # check if rule is deactivated by default
56
+    if root.attrib.get('default_off'):
57
+        return ()
58
+
59
+    # check if rule does only work for specific platforms
60
+    if root.attrib.get('platform'):
61
+        return ()
62
+
63
+    hosts = []
64
+    rules = []
65
+    exclusions = []
66
+
67
+    # parse childs from ruleset
68
+    for ruleset in root:
69
+        # this child define a target
70
+        if ruleset.tag == 'target':
71
+            # check if required tags available 
72
+            if not ruleset.attrib.get('host'):
73
+                continue
74
+
75
+            # convert host-rule to valid regex
76
+            host = ruleset.attrib.get('host').replace('.', '\.').replace('*', '.*')
77
+
78
+            # append to host list
79
+            hosts.append(host)
80
+
81
+        # this child define a rule
82
+        elif ruleset.tag == 'rule':
83
+            # check if required tags available 
84
+            if not ruleset.attrib.get('from')\
85
+               or not ruleset.attrib.get('to'):
86
+                continue
87
+
88
+            # TODO hack, which convert a javascript regex group into a valid python regex group
89
+            rule_from = ruleset.attrib.get('from').replace('$', '\\')
90
+            rule_to = ruleset.attrib.get('to').replace('$', '\\')
91
+
92
+            # TODO, not working yet because of the hack above, currently doing that in webapp.py
93
+            #rule_from_rgx = re.compile(rule_from, re.I)
94
+
95
+            # append rule
96
+            rules.append((rule_from, rule_to))
97
+
98
+        # this child define an exclusion
99
+        elif ruleset.tag == 'exclusion':
100
+            # check if required tags available 
101
+            if not ruleset.attrib.get('pattern'):
102
+                continue
103
+
104
+            exclusion_rgx = re.compile(ruleset.attrib.get('pattern'))
105
+
106
+            # append exclusion
107
+            exclusions.append(exclusion_rgx)
108
+
109
+    # convert list of possible hosts to a simple regex
110
+    # TODO compress regex to improve performance
111
+    try:
112
+        target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
113
+    except:
114
+        return ()
115
+
116
+    # return ruleset
117
+    return (target_hosts, rules, exclusions)
118
+
119
+
120
+# load all https rewrite rules
121
+def load_https_rules(rules_path):
122
+    # add / to path if not set yet
123
+    if rules_path[-1:] != '/':
124
+        rules_path += '/'
125
+
126
+    # search all xml files which are stored in the https rule directory
127
+    xml_files = [ join(rules_path,f) for f in listdir(rules_path) if isfile(join(rules_path,f)) and f[-4:] == '.xml' ]
128
+
129
+    # load xml-files
130
+    for ruleset_file in xml_files:
131
+        # calculate rewrite-rules
132
+        ruleset = load_single_https_ruleset(ruleset_file)
133
+
134
+        # skip if no ruleset returned
135
+        if not ruleset:
136
+            continue
137
+
138
+        # append ruleset
139
+        https_rules.append(ruleset)

+ 46
- 6
searx/webapp.py 파일 보기

@@ -49,6 +49,9 @@ from searx.languages import language_codes
49 49
 from searx.search import Search
50 50
 from searx.autocomplete import backends as autocomplete_backends
51 51
 
52
+from urlparse import urlparse
53
+import re
54
+
52 55
 
53 56
 static_path, templates_path, themes =\
54 57
     get_themes(settings['themes_path']
@@ -197,16 +200,53 @@ def index():
197 200
         if not search.paging and engines[result['engine']].paging:
198 201
             search.paging = True
199 202
 
203
+        # check if HTTPS rewrite is required 
200 204
         if settings['server']['https_rewrite']\
201 205
            and result['parsed_url'].scheme == 'http':
202 206
 
203
-            for http_regex, https_url in https_rules:
204
-                if http_regex.match(result['url']):
205
-                    result['url'] = http_regex.sub(https_url, result['url'])
206
-                    # TODO result['parsed_url'].scheme
207
-                    break
207
+            skip_https_rewrite = False
208
+
209
+            # check if HTTPS rewrite is possible
210
+            for target, rules, exclusions in https_rules:
211
+
212
+                # check if target regex match with url
213
+                if target.match(result['url']):
214
+                    # process exclusions
215
+                    for exclusion in exclusions:
216
+                        # check if exclusion match with url
217
+                        if exclusion.match(result['url']):
218
+                            skip_https_rewrite = True
219
+                            break
220
+
221
+                    # skip https rewrite if required
222
+                    if skip_https_rewrite:
223
+                        break
224
+
225
+                    # process rules
226
+                    for rule in rules:
227
+                        # TODO, precompile rule
228
+                        p = re.compile(rule[0])
229
+                        # rewrite url if possible
230
+                        new_result_url = p.sub(rule[1], result['url'])
231
+
232
+                        # parse new url
233
+                        new_parsed_url = urlparse(new_result_url)
234
+
235
+                        # continue if nothing was rewritten
236
+                        if result['url'] == new_result_url:
237
+                            continue
238
+
239
+                        # get domainname from result
240
+                        # TODO, does only work correct with TLD's like asdf.com, not for asdf.com.de
241
+                        # TODO, using publicsuffix instead of this rewrite rule
242
+                        old_result_domainname = '.'.join(result['parsed_url'].hostname.split('.')[-2:])
243
+                        new_result_domainname = '.'.join(new_parsed_url.hostname.split('.')[-2:])
244
+
245
+                        # check if rewritten hostname is the same, to protect against wrong or malicious rewrite rules
246
+                        if old_result_domainname == new_result_domainname:
247
+                            # set new url
248
+                            result['url'] = new_result_url
208 249
 
209
-        # HTTPS rewrite
210 250
         if search.request_data.get('format', 'html') == 'html':
211 251
             if 'content' in result:
212 252
                 result['content'] = highlight_content(result['content'],