|
@@ -16,6 +16,7 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
|
16
|
16
|
'''
|
17
|
17
|
|
18
|
18
|
import re
|
|
19
|
+from urlparse import urlparse
|
19
|
20
|
from lxml import etree
|
20
|
21
|
from os import listdir
|
21
|
22
|
from os.path import isfile, isdir, join
|
|
@@ -86,15 +87,23 @@ def load_single_https_ruleset(filepath):
|
86
|
87
|
|
87
|
88
|
# TODO hack, which convert a javascript regex group
|
88
|
89
|
# into a valid python regex group
|
89
|
|
- rule_from = ruleset.attrib.get('from').replace('$', '\\')
|
90
|
|
- rule_to = ruleset.attrib.get('to').replace('$', '\\')
|
|
90
|
+ rule_from = ruleset.attrib['from'].replace('$', '\\')
|
|
91
|
+ if rule_from.endswith('\\'):
|
|
92
|
+ rule_from = rule_from[:-1]+'$'
|
|
93
|
+ rule_to = ruleset.attrib['to'].replace('$', '\\')
|
|
94
|
+ if rule_to.endswith('\\'):
|
|
95
|
+ rule_to = rule_to[:-1]+'$'
|
91
|
96
|
|
92
|
97
|
# TODO, not working yet because of the hack above,
|
93
|
98
|
# currently doing that in webapp.py
|
94
|
99
|
# rule_from_rgx = re.compile(rule_from, re.I)
|
95
|
100
|
|
96
|
101
|
# append rule
|
97
|
|
- rules.append((rule_from, rule_to))
|
|
102
|
+ try:
|
|
103
|
+ rules.append((re.compile(rule_from, re.I | re.U), rule_to))
|
|
104
|
+ except:
|
|
105
|
+ # TODO log regex error
|
|
106
|
+ continue
|
98
|
107
|
|
99
|
108
|
# this child define an exclusion
|
100
|
109
|
elif ruleset.tag == 'exclusion':
|
|
@@ -143,3 +152,56 @@ def load_https_rules(rules_path):
|
143
|
152
|
https_rules.append(ruleset)
|
144
|
153
|
|
145
|
154
|
print(' * {n} https-rules loaded'.format(n=len(https_rules)))
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
|
|
158
|
def https_url_rewrite(result):
    """Rewrite ``result['url']`` to HTTPS according to the loaded rulesets.

    The module-level ``https_rules`` list is scanned for the first ruleset
    whose ``target`` regex matches the network location of the result. Only
    that ruleset is considered:

    * if one of its exclusion regexes matches the URL, nothing is rewritten;
    * otherwise each ``(compiled_from, to_template)`` rule is applied in turn
      to the current URL, and a rewritten URL is accepted only when its
      last two hostname labels equal those of the original hostname.

    :param result: dict providing at least ``'url'`` (string) and
        ``'parsed_url'`` (an ``urlparse`` result for that URL).
    :returns: the same ``result`` dict; ``result['url']`` may have been
        replaced in place. ``result['parsed_url']`` is not modified.
    """
    netloc = result['parsed_url'].netloc

    # check if HTTPS rewrite is possible
    for target, rules, exclusions in https_rules:
        # check if target regex match with url
        if not target.match(netloc):
            continue

        # an exclusion hit disables the rewrite for this result entirely
        if any(exclusion.match(result['url']) for exclusion in exclusions):
            break

        # get domainname from result
        # TODO, does only work correct with TLD's like
        #  asdf.com, not for asdf.com.de
        # TODO, using publicsuffix instead of this rewrite rule
        # hostname can be None (e.g. malformed netloc); the empty string
        # then never equals a real domain, so no rewrite is accepted.
        old_hostname = result['parsed_url'].hostname or ''
        old_result_domainname = '.'.join(old_hostname.split('.')[-2:])

        # process rules
        for rule_from, rule_to in rules:
            try:
                new_result_url = rule_from.sub(rule_to, result['url'])
            except (re.error, IndexError):
                # broken replacement template (e.g. invalid group
                # reference): give up on this ruleset, keep the URL as is
                break

            # continue if nothing was rewritten
            if result['url'] == new_result_url:
                continue

            # parse new url; a rewrite that yields no hostname is rejected
            new_hostname = urlparse(new_result_url).hostname
            if not new_hostname:
                continue
            new_result_domainname = '.'.join(new_hostname.split('.')[-2:])

            # check if rewritten hostname is the same,
            # to protect against wrong or malicious rewrite rules
            if old_result_domainname == new_result_domainname:
                # set new url
                result['url'] = new_result_url

        # target has matched, do not search over the other rules
        break

    return result
|