| 
				
			 | 
			
			
				@@ -0,0 +1,26 @@ 
			 | 
		
	
		
			
			| 
				
			 | 
			
				1
			 | 
			
			
				+from HTMLParser import HTMLParser 
			 | 
		
	
		
			
			| 
				
			 | 
			
				2
			 | 
			
			
				+import htmlentitydefs 
			 | 
		
	
		
			
			| 
				
			 | 
			
				3
			 | 
			
			
				+ 
			 | 
		
	
		
			
			| 
				
			 | 
			
				4
			 | 
			
			
				+class HTMLTextExtractor(HTMLParser): 
			 | 
		
	
		
			
			| 
				
			 | 
			
				5
			 | 
			
			
				+    def __init__(self): 
			 | 
		
	
		
			
			| 
				
			 | 
			
				6
			 | 
			
			
				+        HTMLParser.__init__(self) 
			 | 
		
	
		
			
			| 
				
			 | 
			
				7
			 | 
			
			
				+        self.result = [ ] 
			 | 
		
	
		
			
			| 
				
			 | 
			
				8
			 | 
			
			
				+ 
			 | 
		
	
		
			
			| 
				
			 | 
			
				9
			 | 
			
			
				+    def handle_data(self, d): 
			 | 
		
	
		
			
			| 
				
			 | 
			
				10
			 | 
			
			
				+        self.result.append(d) 
			 | 
		
	
		
			
			| 
				
			 | 
			
				11
			 | 
			
			
				+ 
			 | 
		
	
		
			
			| 
				
			 | 
			
				12
			 | 
			
			
				+    def handle_charref(self, number): 
			 | 
		
	
		
			
			| 
				
			 | 
			
				13
			 | 
			
			
				+        codepoint = int(number[1:], 16) if number[0] in (u'x', u'X') else int(number) 
			 | 
		
	
		
			
			| 
				
			 | 
			
				14
			 | 
			
			
				+        self.result.append(unichr(codepoint)) 
			 | 
		
	
		
			
			| 
				
			 | 
			
				15
			 | 
			
			
				+ 
			 | 
		
	
		
			
			| 
				
			 | 
			
				16
			 | 
			
			
				+    def handle_entityref(self, name): 
			 | 
		
	
		
			
			| 
				
			 | 
			
				17
			 | 
			
			
				+        codepoint = htmlentitydefs.name2codepoint[name] 
			 | 
		
	
		
			
			| 
				
			 | 
			
				18
			 | 
			
			
				+        self.result.append(unichr(codepoint)) 
			 | 
		
	
		
			
			| 
				
			 | 
			
				19
			 | 
			
			
				+ 
			 | 
		
	
		
			
			| 
				
			 | 
			
				20
			 | 
			
			
				+    def get_text(self): 
			 | 
		
	
		
			
			| 
				
			 | 
			
				21
			 | 
			
			
				+        return u''.join(self.result) 
			 | 
		
	
		
			
			| 
				
			 | 
			
				22
			 | 
			
			
				+ 
			 | 
		
	
		
			
			| 
				
			 | 
			
				23
			 | 
			
			
				+def html_to_text(html): 
			 | 
		
	
		
			
			| 
				
			 | 
			
				24
			 | 
			
			
				+    s = HTMLTextExtractor() 
			 | 
		
	
		
			
			| 
				
			 | 
			
				25
			 | 
			
			
				+    s.feed(html) 
			 | 
		
	
		
			
			| 
				
			 | 
			
				26
			 | 
			
			
				+    return s.get_text() 
			 |