|
@@ -1,5 +1,5 @@
|
1
|
1
|
from HTMLParser import HTMLParser
|
2
|
|
-import htmlentitydefs
|
|
2
|
+#import htmlentitydefs
|
3
|
3
|
import csv
|
4
|
4
|
import codecs
|
5
|
5
|
import cStringIO
|
|
@@ -17,8 +17,9 @@ class HTMLTextExtractor(HTMLParser):
|
17
|
17
|
self.result.append(unichr(codepoint))
|
18
|
18
|
|
19
|
19
|
def handle_entityref(self, name):
|
20
|
|
- codepoint = htmlentitydefs.name2codepoint[name]
|
21
|
|
- self.result.append(unichr(codepoint))
|
|
20
|
+ #codepoint = htmlentitydefs.name2codepoint[name]
|
|
21
|
+ #self.result.append(unichr(codepoint))
|
|
22
|
+ self.result.append(name)
|
22
|
23
|
|
23
|
24
|
def get_text(self):
|
24
|
25
|
return u''.join(self.result)
|