@@ -1,14 +1,139 @@
+'''
+searx is free software: you can redistribute it and/or modify
+it under the terms of the GNU Affero General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+searx is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU Affero General Public License for more details.
+
+You should have received a copy of the GNU Affero General Public License
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
+
+(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
+'''
+
 import re
+from lxml import etree
+from os import listdir
+from os.path import isfile, join
+
 
 # https://gitweb.torproject.org/\
 # pde/https-everywhere.git/tree/4.0:/src/chrome/content/rules
 
 # HTTPS rewrite rules
-https_rules = (
-    # from
-    (re.compile(r'^http://(www\.|m\.|)?xkcd\.(?:com|org)/', re.I | re.U),
-     # to
-     r'https://\1xkcd.com/'),
-    (re.compile(r'^https?://(?:ssl)?imgs\.xkcd\.com/', re.I | re.U),
-     r'https://sslimgs.xkcd.com/'),
-)
+https_rules = []
+
+
+# load a single ruleset from an xml file
+def load_single_https_ruleset(filepath):
+    ruleset = ()
+
+    # init parser
+    parser = etree.XMLParser()
+
+    # load and parse xml-file
+    try:
+        tree = etree.parse(filepath, parser)
+    except:
+        # TODO, error message
+        return ()
+
+    # get root node
+    root = tree.getroot()
+
+    #print(etree.tostring(tree))
+
+    # check if root is a node with the name ruleset
+    # TODO improve parsing
+    if root.tag != 'ruleset':
+        return ()
+
+    # check if the ruleset is deactivated by default
+    if root.attrib.get('default_off'):
+        return ()
+
+    # check if the ruleset only applies to specific platforms
+    if root.attrib.get('platform'):
+        return ()
+
+    hosts = []
+    rules = []
+    exclusions = []
+
+    # parse children of the ruleset
+    for ruleset in root:
+        # this child defines a target
+        if ruleset.tag == 'target':
+            # check if the required attribute is available
+            if not ruleset.attrib.get('host'):
+                continue
+
+            # convert host-rule to a valid regex
+            host = ruleset.attrib.get('host').replace('.', '\.').replace('*', '.*')
+
+            # append to host list
+            hosts.append(host)
+
+        # this child defines a rule
+        elif ruleset.tag == 'rule':
+            # check if the required attributes are available
+            if not ruleset.attrib.get('from')\
+                or not ruleset.attrib.get('to'):
+                continue
+
+            # TODO hack, which converts a javascript regex group into a valid python regex group
+            rule_from = ruleset.attrib.get('from').replace('$', '\\')
+            rule_to = ruleset.attrib.get('to').replace('$', '\\')
+
+            # TODO, not working yet because of the hack above, currently doing that in webapp.py
+            #rule_from_rgx = re.compile(rule_from, re.I)
+
+            # append rule
+            rules.append((rule_from, rule_to))
+
+        # this child defines an exclusion
+        elif ruleset.tag == 'exclusion':
+            # check if the required attribute is available
+            if not ruleset.attrib.get('pattern'):
+                continue
+
+            exclusion_rgx = re.compile(ruleset.attrib.get('pattern'))
+
+            # append exclusion
+            exclusions.append(exclusion_rgx)
+
+    # convert the list of possible hosts to a single regex
+    # TODO compress regex to improve performance
+    try:
+        target_hosts = re.compile('^(' + '|'.join(hosts) + ')', re.I | re.U)
+    except:
+        return ()
+
+    # return ruleset
+    return (target_hosts, rules, exclusions)
+
+
+# load all https rewrite rules
+def load_https_rules(rules_path):
+    # add / to path if not set yet
+    if rules_path[-1:] != '/':
+        rules_path += '/'
+
+    # search all xml files which are stored in the https rule directory
+    xml_files = [ join(rules_path,f) for f in listdir(rules_path) if isfile(join(rules_path,f)) and f[-4:] == '.xml' ]
+
+    # load xml-files
+    for ruleset_file in xml_files:
+        # calculate rewrite-rules
+        ruleset = load_single_https_ruleset(ruleset_file)
+
+        # skip if no ruleset returned
+        if not ruleset:
+            continue
+
+        # append ruleset
+        https_rules.append(ruleset)
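
Note: this patch only loads the rulesets into https_rules; applying them to result URLs happens elsewhere (the TODO comment above points at webapp.py). The snippet below is an illustrative sketch, not part of this patch, of how a (target_hosts, rules, exclusions) tuple could be applied to a single URL. The helper name apply_https_rules, the use of urlparse, and matching target_hosts against the netloc are assumptions made here for illustration.

# illustrative sketch only -- not part of this patch
import re
from urlparse import urlparse  # assuming Python 2, as searx targeted at the time

def apply_https_rules(url, rulesets):
    # hypothetical helper showing how a loaded ruleset could be used
    host = urlparse(url).netloc
    for target_hosts, rules, exclusions in rulesets:
        # the ruleset only applies if one of its target hosts matches
        if not target_hosts.match(host):
            continue
        # exclusion patterns disable rewriting for matching URLs
        if any(exclusion.match(url) for exclusion in exclusions):
            return url
        # apply the first from->to rule that actually changes the URL
        for rule_from, rule_to in rules:
            new_url = re.sub(rule_from, rule_to, url)
            if new_url != url:
                return new_url
    return url

# example: apply_https_rules('http://www.example.com/', https_rules)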