|
@@ -23,6 +23,9 @@ ua_os = ('Windows NT 6.3; WOW64',
|
23
|
23
|
|
24
|
24
|
# User-agent template; the ``{os}`` and ``{version}`` placeholders are filled
# in with ``str.format`` (presumably by gen_useragent -- confirm against its body).
ua = "Mozilla/5.0 ({os}) Gecko/20100101 Firefox/{version}"

# Elements whose contents are not human-readable text: while one of these is
# the innermost open tag, HTMLTextExtractor.is_valid_tag() returns False and
# the data/charref/entityref handlers skip their input.
blocked_tags = ('script', 'style')
|
|
28
|
+
|
26
|
29
|
|
27
|
30
|
def gen_useragent():
|
28
|
31
|
# TODO
|
|
@@ -67,11 +70,29 @@ class HTMLTextExtractor(HTMLParser):
|
67
|
70
|
def __init__(self):
    """Initialise the base parser and the extractor's own state.

    ``result`` collects the text fragments appended by the data, charref and
    entityref handlers; ``tags`` is the stack of currently open tag names
    (innermost last) maintained by handle_starttag / handle_endtag.
    """
    HTMLParser.__init__(self)
    self.result, self.tags = [], []
|
|
74
|
+
|
|
75
|
def handle_starttag(self, tag, attrs):
    """Record *tag* as the innermost open element.

    Args:
        tag: name of the element being opened, as reported by HTMLParser.
        attrs: the element's attributes; unused, accepted only because the
            HTMLParser callback signature requires it.
    """
    # No debug output here: this runs for every tag of every parsed
    # document, so printing would flood stdout.
    self.tags.append(tag)
|
|
78
|
+
|
|
79
|
def handle_endtag(self, tag):
    """Close the innermost open element, which must be *tag*.

    Void elements such as ``<br>`` or ``<img>`` never receive an end tag, so
    any of them still sitting on top of the stack are discarded before the
    comparison; otherwise well-formed markup like ``<p>a<br>b</p>`` would be
    rejected.

    Args:
        tag: name of the element being closed, as reported by HTMLParser.

    Raises:
        Exception: with the message "invalid html" when *tag* does not match
            the innermost open element, or when no element is open at all.
    """
    # Elements the HTML standard defines as having no end tag.
    void_tags = ('area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
                 'link', 'meta', 'param', 'source', 'track', 'wbr')
    open_tags = self.tags

    # Drop unclosed void elements, but keep one that is being closed
    # explicitly: HTMLParser's default handle_startendtag reports ``<br/>``
    # as a start tag immediately followed by an end tag.
    while open_tags and open_tags[-1] != tag and open_tags[-1] in void_tags:
        open_tags.pop()

    # Check for emptiness first so a stray end tag is reported as invalid
    # markup instead of surfacing as an IndexError from ``open_tags[-1]``.
    if not open_tags or open_tags[-1] != tag:
        raise Exception("invalid html")

    open_tags.pop()
|
|
84
|
+
|
|
85
|
def is_valid_tag(self):
    """Return True when text at the current position should be kept.

    Text is kept when no element is open, or when the innermost open
    element is not listed in the module-level ``blocked_tags``.
    """
    if not self.tags:
        return True
    innermost = self.tags[-1]
    return innermost not in blocked_tags
|
70
|
87
|
|
71
|
88
|
def handle_data(self, d):
    """Collect the text node *d* unless is_valid_tag() rejects the position.

    Args:
        d: the text passed in by HTMLParser; appended unchanged to
            ``self.result`` when it is kept.
    """
    if self.is_valid_tag():
        self.result.append(d)
|
73
|
92
|
|
74
|
93
|
def handle_charref(self, number):
|
|
94
|
+ if not self.is_valid_tag():
|
|
95
|
+ return
|
75
|
96
|
if number[0] in (u'x', u'X'):
|
76
|
97
|
codepoint = int(number[1:], 16)
|
77
|
98
|
else:
|
|
@@ -79,6 +100,8 @@ class HTMLTextExtractor(HTMLParser):
|
79
|
100
|
self.result.append(unichr(codepoint))
|
80
|
101
|
|
81
|
102
|
def handle_entityref(self, name):
|
|
103
|
+ if not self.is_valid_tag():
|
|
104
|
+ return
|
82
|
105
|
# codepoint = htmlentitydefs.name2codepoint[name]
|
83
|
106
|
# self.result.append(unichr(codepoint))
|
84
|
107
|
self.result.append(name)
|