Explorar el Código

[mod][fix] https rewrite refactor ++ fixes

Adam Tauber hace 10 años
padre
commit
f141773814
Se han modificado 3 ficheros con 68 adiciones y 61 borrados
  1. 65
    3
      searx/https_rewrite.py
  2. 1
    1
      searx/https_rules/Soundcloud.xml
  3. 2
    57
      searx/webapp.py

+ 65
- 3
searx/https_rewrite.py Ver fichero

16
 '''
16
 '''
17
 
17
 
18
 import re
18
 import re
19
+from urlparse import urlparse
19
 from lxml import etree
20
 from lxml import etree
20
 from os import listdir
21
 from os import listdir
21
 from os.path import isfile, isdir, join
22
 from os.path import isfile, isdir, join
86
 
87
 
87
             # TODO hack, which convert a javascript regex group
88
             # TODO hack, which convert a javascript regex group
88
             # into a valid python regex group
89
             # into a valid python regex group
89
-            rule_from = ruleset.attrib.get('from').replace('$', '\\')
90
-            rule_to = ruleset.attrib.get('to').replace('$', '\\')
90
+            rule_from = ruleset.attrib['from'].replace('$', '\\')
91
+            if rule_from.endswith('\\'):
92
+                rule_from = rule_from[:-1]+'$'
93
+            rule_to = ruleset.attrib['to'].replace('$', '\\')
94
+            if rule_to.endswith('\\'):
95
+                rule_to = rule_to[:-1]+'$'
91
 
96
 
92
             # TODO, not working yet because of the hack above,
97
             # TODO, not working yet because of the hack above,
93
             # currently doing that in webapp.py
98
             # currently doing that in webapp.py
94
             # rule_from_rgx = re.compile(rule_from, re.I)
99
             # rule_from_rgx = re.compile(rule_from, re.I)
95
 
100
 
96
             # append rule
101
             # append rule
97
-            rules.append((rule_from, rule_to))
102
+            try:
103
+                rules.append((re.compile(rule_from, re.I | re.U), rule_to))
104
+            except:
105
+                # TODO log regex error
106
+                continue
98
 
107
 
99
         # this child define an exclusion
108
         # this child define an exclusion
100
         elif ruleset.tag == 'exclusion':
109
         elif ruleset.tag == 'exclusion':
143
         https_rules.append(ruleset)
152
         https_rules.append(ruleset)
144
 
153
 
145
     print(' * {n} https-rules loaded'.format(n=len(https_rules)))
154
     print(' * {n} https-rules loaded'.format(n=len(https_rules)))
155
+
156
+
157
+
158
+def https_url_rewrite(result):
159
+    skip_https_rewrite = False
160
+    # check if HTTPS rewrite is possible
161
+    for target, rules, exclusions in https_rules:
162
+
163
+        # check if target regex match with url
164
+        if target.match(result['parsed_url'].netloc):
165
+            # process exclusions
166
+            for exclusion in exclusions:
167
+                # check if exclusion match with url
168
+                if exclusion.match(result['url']):
169
+                    skip_https_rewrite = True
170
+                    break
171
+
172
+            # skip https rewrite if required
173
+            if skip_https_rewrite:
174
+                break
175
+
176
+            # process rules
177
+            for rule in rules:
178
+                try:
179
+                    new_result_url = rule[0].sub(rule[1], result['url'])
180
+                except:
181
+                    break
182
+
183
+                # parse new url
184
+                new_parsed_url = urlparse(new_result_url)
185
+
186
+                # continue if nothing was rewritten
187
+                if result['url'] == new_result_url:
188
+                    continue
189
+
190
+                # get domainname from result
191
+                # TODO, does only work correct with TLD's like
192
+                #  asdf.com, not for asdf.com.de
193
+                # TODO, using publicsuffix instead of this rewrite rule
194
+                old_result_domainname = '.'.join(
195
+                    result['parsed_url'].hostname.split('.')[-2:])
196
+                new_result_domainname = '.'.join(
197
+                    new_parsed_url.hostname.split('.')[-2:])
198
+
199
+                # check if rewritten hostname is the same,
200
+                # to protect against wrong or malicious rewrite rules
201
+                if old_result_domainname == new_result_domainname:
202
+                    # set new url
203
+                    result['url'] = new_result_url
204
+
205
+            # target has matched, do not search over the other rules
206
+            break
207
+    return result

+ 1
- 1
searx/https_rules/Soundcloud.xml Ver fichero

89
 	<rule from="^http://([aiw]\d|api|wis)\.sndcdn\.com/"
89
 	<rule from="^http://([aiw]\d|api|wis)\.sndcdn\.com/"
90
 		to="https://$1.sndcdn.com/" />
90
 		to="https://$1.sndcdn.com/" />
91
 
91
 
92
-	<rule from="^http://((?:api|backstage|blog|connect|developers|ec-media|eventlogger|help-assets|media|visuals|w|www)\.)?soundcloud\.com/"
92
+	<rule from="^http://((?:api|backstage|blog|connect|developers|ec-media|eventlogger|help-assets|media|visuals|w|www)\.|)soundcloud\.com/"
93
 		to="https://$1soundcloud.com/" />
93
 		to="https://$1soundcloud.com/" />
94
 
94
 
95
 	<rule from="^https?://scbackstage\.wpengine\.netdna-cdn\.com/"
95
 	<rule from="^https?://scbackstage\.wpengine\.netdna-cdn\.com/"

+ 2
- 57
searx/webapp.py Ver fichero

41
     UnicodeWriter, highlight_content, html_to_text, get_themes
41
     UnicodeWriter, highlight_content, html_to_text, get_themes
42
 )
42
 )
43
 from searx.version import VERSION_STRING
43
 from searx.version import VERSION_STRING
44
-from searx.https_rewrite import https_rules
45
 from searx.languages import language_codes
44
 from searx.languages import language_codes
45
+from searx.https_rewrite import https_url_rewrite
46
 from searx.search import Search
46
 from searx.search import Search
47
 from searx.query import Query
47
 from searx.query import Query
48
 from searx.autocomplete import backends as autocomplete_backends
48
 from searx.autocomplete import backends as autocomplete_backends
49
 
49
 
50
-from urlparse import urlparse
51
-import re
52
-
53
 
50
 
54
 static_path, templates_path, themes =\
51
 static_path, templates_path, themes =\
55
     get_themes(settings['themes_path']
52
     get_themes(settings['themes_path']
215
         if settings['server']['https_rewrite']\
212
         if settings['server']['https_rewrite']\
216
            and result['parsed_url'].scheme == 'http':
213
            and result['parsed_url'].scheme == 'http':
217
 
214
 
218
-            skip_https_rewrite = False
219
-
220
-            # check if HTTPS rewrite is possible
221
-            for target, rules, exclusions in https_rules:
222
-
223
-                # check if target regex match with url
224
-                if target.match(result['url']):
225
-                    # process exclusions
226
-                    for exclusion in exclusions:
227
-                        # check if exclusion match with url
228
-                        if exclusion.match(result['url']):
229
-                            skip_https_rewrite = True
230
-                            break
231
-
232
-                    # skip https rewrite if required
233
-                    if skip_https_rewrite:
234
-                        break
235
-
236
-                    # process rules
237
-                    for rule in rules:
238
-                        try:
239
-                            # TODO, precompile rule
240
-                            p = re.compile(rule[0])
241
-
242
-                            # rewrite url if possible
243
-                            new_result_url = p.sub(rule[1], result['url'])
244
-                        except:
245
-                            break
246
-
247
-                        # parse new url
248
-                        new_parsed_url = urlparse(new_result_url)
249
-
250
-                        # continue if nothing was rewritten
251
-                        if result['url'] == new_result_url:
252
-                            continue
253
-
254
-                        # get domainname from result
255
-                        # TODO, does only work correct with TLD's like
256
-                        #  asdf.com, not for asdf.com.de
257
-                        # TODO, using publicsuffix instead of this rewrite rule
258
-                        old_result_domainname = '.'.join(
259
-                            result['parsed_url'].hostname.split('.')[-2:])
260
-                        new_result_domainname = '.'.join(
261
-                            new_parsed_url.hostname.split('.')[-2:])
262
-
263
-                        # check if rewritten hostname is the same,
264
-                        # to protect against wrong or malicious rewrite rules
265
-                        if old_result_domainname == new_result_domainname:
266
-                            # set new url
267
-                            result['url'] = new_result_url
268
-
269
-                    # target has matched, do not search over the other rules
270
-                    break
215
+            result = https_url_rewrite(result)
271
 
216
 
272
         if search.request_data.get('format', 'html') == 'html':
217
         if search.request_data.get('format', 'html') == 'html':
273
             if 'content' in result:
218
             if 'content' in result: