|  | @@ -23,6 +23,9 @@ ua_os = ('Windows NT 6.3; WOW64',
 | 
	
		
			
			| 23 | 23 |  
 | 
	
		
			
			| 24 | 24 |  ua = "Mozilla/5.0 ({os}) Gecko/20100101 Firefox/{version}"
 | 
	
		
			
			| 25 | 25 |  
 | 
	
		
			
			|  | 26 | +blocked_tags = ('script',
 | 
	
		
			
			|  | 27 | +                'style')
 | 
	
		
			
			|  | 28 | +
 | 
	
		
			
			| 26 | 29 |  
 | 
	
		
			
			| 27 | 30 |  def gen_useragent():
 | 
	
		
			
			| 28 | 31 |      # TODO
 | 
	
	
		
			
			|  | @@ -67,11 +70,29 @@ class HTMLTextExtractor(HTMLParser):
 | 
	
		
			
			| 67 | 70 |      def __init__(self):
 | 
	
		
			
			| 68 | 71 |          HTMLParser.__init__(self)
 | 
	
		
			
			| 69 | 72 |          self.result = []
 | 
	
		
			
			|  | 73 | +        self.tags = []
 | 
	
		
			
			|  | 74 | +
 | 
	
		
			
			|  | 75 | +    def handle_starttag(self, tag, attrs):
 | 
	
		
			
			|  | 76 | +        print tag
 | 
	
		
			
			|  | 77 | +        self.tags.append(tag)
 | 
	
		
			
			|  | 78 | +
 | 
	
		
			
			|  | 79 | +    def handle_endtag(self, tag):
 | 
	
		
			
			|  | 80 | +        print tag,tag
 | 
	
		
			
			|  | 81 | +        if tag != self.tags[-1]:
 | 
	
		
			
			|  | 82 | +            raise Exception("invalid html")
 | 
	
		
			
			|  | 83 | +        self.tags.pop()
 | 
	
		
			
			|  | 84 | +
 | 
	
		
			
			|  | 85 | +    def is_valid_tag(self):
 | 
	
		
			
			|  | 86 | +        return not self.tags or self.tags[-1] not in blocked_tags
 | 
	
		
			
			| 70 | 87 |  
 | 
	
		
			
			| 71 | 88 |      def handle_data(self, d):
 | 
	
		
			
			|  | 89 | +        if not self.is_valid_tag():
 | 
	
		
			
			|  | 90 | +            return
 | 
	
		
			
			| 72 | 91 |          self.result.append(d)
 | 
	
		
			
			| 73 | 92 |  
 | 
	
		
			
			| 74 | 93 |      def handle_charref(self, number):
 | 
	
		
			
			|  | 94 | +        if not self.is_valid_tag():
 | 
	
		
			
			|  | 95 | +            return
 | 
	
		
			
			| 75 | 96 |          if number[0] in (u'x', u'X'):
 | 
	
		
			
			| 76 | 97 |              codepoint = int(number[1:], 16)
 | 
	
		
			
			| 77 | 98 |          else:
 | 
	
	
		
			
			|  | @@ -79,6 +100,8 @@ class HTMLTextExtractor(HTMLParser):
 | 
	
		
			
			| 79 | 100 |          self.result.append(unichr(codepoint))
 | 
	
		
			
			| 80 | 101 |  
 | 
	
		
			
			| 81 | 102 |      def handle_entityref(self, name):
 | 
	
		
			
			|  | 103 | +        if not self.is_valid_tag():
 | 
	
		
			
			|  | 104 | +            return
 | 
	
		
			
			| 82 | 105 |          # codepoint = htmlentitydefs.name2codepoint[name]
 | 
	
		
			
			| 83 | 106 |          # self.result.append(unichr(codepoint))
 | 
	
		
			
			| 84 | 107 |          self.result.append(name)
 |