Ver código fonte

[mod][fix] https rewrite refactor ++ fixes

Adam Tauber 10 anos atrás
pai
commit
f141773814
3 arquivos alterados com 68 adições e 61 exclusões
  1. 65
    3
      searx/https_rewrite.py
  2. 1
    1
      searx/https_rules/Soundcloud.xml
  3. 2
    57
      searx/webapp.py

+ 65
- 3
searx/https_rewrite.py Ver arquivo

@@ -16,6 +16,7 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
16 16
 '''
17 17
 
18 18
 import re
19
+from urlparse import urlparse
19 20
 from lxml import etree
20 21
 from os import listdir
21 22
 from os.path import isfile, isdir, join
@@ -86,15 +87,23 @@ def load_single_https_ruleset(filepath):
86 87
 
87 88
             # TODO hack, which converts a javascript regex group
88 89
             # into a valid python regex group
89
-            rule_from = ruleset.attrib.get('from').replace('$', '\\')
90
-            rule_to = ruleset.attrib.get('to').replace('$', '\\')
90
+            rule_from = ruleset.attrib['from'].replace('$', '\\')
91
+            if rule_from.endswith('\\'):
92
+                rule_from = rule_from[:-1]+'$'
93
+            rule_to = ruleset.attrib['to'].replace('$', '\\')
94
+            if rule_to.endswith('\\'):
95
+                rule_to = rule_to[:-1]+'$'
91 96
 
92 97
             # TODO, not working yet because of the hack above,
93 98
             # currently doing that in webapp.py
94 99
             # rule_from_rgx = re.compile(rule_from, re.I)
95 100
 
96 101
             # append rule
97
-            rules.append((rule_from, rule_to))
102
+            try:
103
+                rules.append((re.compile(rule_from, re.I | re.U), rule_to))
104
+            except:
105
+                # TODO log regex error
106
+                continue
98 107
 
99 108
         # this child defines an exclusion
100 109
         elif ruleset.tag == 'exclusion':
@@ -143,3 +152,56 @@ def load_https_rules(rules_path):
143 152
         https_rules.append(ruleset)
144 153
 
145 154
     print(' * {n} https-rules loaded'.format(n=len(https_rules)))
155
+
156
+
157
+
158
+def https_url_rewrite(result):
159
+    skip_https_rewrite = False
160
+    # check if HTTPS rewrite is possible
161
+    for target, rules, exclusions in https_rules:
162
+
163
+        # check if the target regex matches the url's netloc
164
+        if target.match(result['parsed_url'].netloc):
165
+            # process exclusions
166
+            for exclusion in exclusions:
167
+                # check if the exclusion matches the url
168
+                if exclusion.match(result['url']):
169
+                    skip_https_rewrite = True
170
+                    break
171
+
172
+            # skip https rewrite if required
173
+            if skip_https_rewrite:
174
+                break
175
+
176
+            # process rules
177
+            for rule in rules:
178
+                try:
179
+                    new_result_url = rule[0].sub(rule[1], result['url'])
180
+                except:
181
+                    break
182
+
183
+                # parse new url
184
+                new_parsed_url = urlparse(new_result_url)
185
+
186
+                # continue if nothing was rewritten
187
+                if result['url'] == new_result_url:
188
+                    continue
189
+
190
+                # get domainname from result
191
+                # TODO, only works correctly with TLDs like
192
+                #  asdf.com, not for asdf.com.de
193
+                # TODO, using publicsuffix instead of this rewrite rule
194
+                old_result_domainname = '.'.join(
195
+                    result['parsed_url'].hostname.split('.')[-2:])
196
+                new_result_domainname = '.'.join(
197
+                    new_parsed_url.hostname.split('.')[-2:])
198
+
199
+                # check if rewritten hostname is the same,
200
+                # to protect against wrong or malicious rewrite rules
201
+                if old_result_domainname == new_result_domainname:
202
+                    # set new url
203
+                    result['url'] = new_result_url
204
+
205
+            # target has matched, do not search over the other rules
206
+            break
207
+    return result

+ 1
- 1
searx/https_rules/Soundcloud.xml Ver arquivo

@@ -89,7 +89,7 @@
89 89
 	<rule from="^http://([aiw]\d|api|wis)\.sndcdn\.com/"
90 90
 		to="https://$1.sndcdn.com/" />
91 91
 
92
-	<rule from="^http://((?:api|backstage|blog|connect|developers|ec-media|eventlogger|help-assets|media|visuals|w|www)\.)?soundcloud\.com/"
92
+	<rule from="^http://((?:api|backstage|blog|connect|developers|ec-media|eventlogger|help-assets|media|visuals|w|www)\.|)soundcloud\.com/"
93 93
 		to="https://$1soundcloud.com/" />
94 94
 
95 95
 	<rule from="^https?://scbackstage\.wpengine\.netdna-cdn\.com/"

+ 2
- 57
searx/webapp.py Ver arquivo

@@ -41,15 +41,12 @@ from searx.utils import (
41 41
     UnicodeWriter, highlight_content, html_to_text, get_themes
42 42
 )
43 43
 from searx.version import VERSION_STRING
44
-from searx.https_rewrite import https_rules
45 44
 from searx.languages import language_codes
45
+from searx.https_rewrite import https_url_rewrite
46 46
 from searx.search import Search
47 47
 from searx.query import Query
48 48
 from searx.autocomplete import backends as autocomplete_backends
49 49
 
50
-from urlparse import urlparse
51
-import re
52
-
53 50
 
54 51
 static_path, templates_path, themes =\
55 52
     get_themes(settings['themes_path']
@@ -215,59 +212,7 @@ def index():
215 212
         if settings['server']['https_rewrite']\
216 213
            and result['parsed_url'].scheme == 'http':
217 214
 
218
-            skip_https_rewrite = False
219
-
220
-            # check if HTTPS rewrite is possible
221
-            for target, rules, exclusions in https_rules:
222
-
223
-                # check if target regex match with url
224
-                if target.match(result['url']):
225
-                    # process exclusions
226
-                    for exclusion in exclusions:
227
-                        # check if exclusion match with url
228
-                        if exclusion.match(result['url']):
229
-                            skip_https_rewrite = True
230
-                            break
231
-
232
-                    # skip https rewrite if required
233
-                    if skip_https_rewrite:
234
-                        break
235
-
236
-                    # process rules
237
-                    for rule in rules:
238
-                        try:
239
-                            # TODO, precompile rule
240
-                            p = re.compile(rule[0])
241
-
242
-                            # rewrite url if possible
243
-                            new_result_url = p.sub(rule[1], result['url'])
244
-                        except:
245
-                            break
246
-
247
-                        # parse new url
248
-                        new_parsed_url = urlparse(new_result_url)
249
-
250
-                        # continue if nothing was rewritten
251
-                        if result['url'] == new_result_url:
252
-                            continue
253
-
254
-                        # get domainname from result
255
-                        # TODO, only works correctly with TLDs like
256
-                        #  asdf.com, not for asdf.com.de
257
-                        # TODO, using publicsuffix instead of this rewrite rule
258
-                        old_result_domainname = '.'.join(
259
-                            result['parsed_url'].hostname.split('.')[-2:])
260
-                        new_result_domainname = '.'.join(
261
-                            new_parsed_url.hostname.split('.')[-2:])
262
-
263
-                        # check if rewritten hostname is the same,
264
-                        # to protect against wrong or malicious rewrite rules
265
-                        if old_result_domainname == new_result_domainname:
266
-                            # set new url
267
-                            result['url'] = new_result_url
268
-
269
-                    # target has matched, do not search over the other rules
270
-                    break
215
+            result = https_url_rewrite(result)
271 216
 
272 217
         if search.request_data.get('format', 'html') == 'html':
273 218
             if 'content' in result: