浏览代码

[fix] ignore scripts/styles in html_to_text

Adam Tauber 10 年前
父节点
当前提交
1408859b4b
共有 1 个文件被更改，包括 23 次插入和 0 次删除
  1. 23
    0
      searx/utils.py

+ 23
- 0
searx/utils.py 查看文件

@@ -23,6 +23,9 @@ ua_os = ('Windows NT 6.3; WOW64',
23 23
 
24 24
 ua = "Mozilla/5.0 ({os}) Gecko/20100101 Firefox/{version}"
25 25
 
26
+blocked_tags = ('script',
27
+                'style')
28
+
26 29
 
27 30
 def gen_useragent():
28 31
     # TODO
@@ -67,11 +70,29 @@ class HTMLTextExtractor(HTMLParser):
67 70
     def __init__(self):
68 71
         HTMLParser.__init__(self)
69 72
         self.result = []
73
+        self.tags = []
74
+
75
+    def handle_starttag(self, tag, attrs):
76
+        print tag
77
+        self.tags.append(tag)
78
+
79
+    def handle_endtag(self, tag):
80
+        print tag,tag
81
+        if tag != self.tags[-1]:
82
+            raise Exception("invalid html")
83
+        self.tags.pop()
84
+
85
+    def is_valid_tag(self):
86
+        return not self.tags or self.tags[-1] not in blocked_tags
70 87
 
71 88
     def handle_data(self, d):
89
+        if not self.is_valid_tag():
90
+            return
72 91
         self.result.append(d)
73 92
 
74 93
     def handle_charref(self, number):
94
+        if not self.is_valid_tag():
95
+            return
75 96
         if number[0] in (u'x', u'X'):
76 97
             codepoint = int(number[1:], 16)
77 98
         else:
@@ -79,6 +100,8 @@ class HTMLTextExtractor(HTMLParser):
79 100
         self.result.append(unichr(codepoint))
80 101
 
81 102
     def handle_entityref(self, name):
103
+        if not self.is_valid_tag():
104
+            return
82 105
         # codepoint = htmlentitydefs.name2codepoint[name]
83 106
         # self.result.append(unichr(codepoint))
84 107
         self.result.append(name)