Browse Source

[fix] ignore scripts/styles in html_to_text

Adam Tauber 10 years ago
parent
commit
1408859b4b
1 changed file with 23 additions and 0 deletions
  1. 23
    0
      searx/utils.py

+ 23
- 0
searx/utils.py View File

23
 
23
 
24
 ua = "Mozilla/5.0 ({os}) Gecko/20100101 Firefox/{version}"
24
 ua = "Mozilla/5.0 ({os}) Gecko/20100101 Firefox/{version}"
25
 
25
 
26
+blocked_tags = ('script',
27
+                'style')
28
+
26
 
29
 
27
 def gen_useragent():
30
 def gen_useragent():
28
     # TODO
31
     # TODO
67
     def __init__(self):
70
     def __init__(self):
68
         HTMLParser.__init__(self)
71
         HTMLParser.__init__(self)
69
         self.result = []
72
         self.result = []
73
+        self.tags = []
74
+
75
+    def handle_starttag(self, tag, attrs):
76
+        print tag
77
+        self.tags.append(tag)
78
+
79
+    def handle_endtag(self, tag):
80
+        print tag,tag
81
+        if tag != self.tags[-1]:
82
+            raise Exception("invalid html")
83
+        self.tags.pop()
84
+
85
+    def is_valid_tag(self):
86
+        return not self.tags or self.tags[-1] not in blocked_tags
70
 
87
 
71
     def handle_data(self, d):
88
     def handle_data(self, d):
89
+        if not self.is_valid_tag():
90
+            return
72
         self.result.append(d)
91
         self.result.append(d)
73
 
92
 
74
     def handle_charref(self, number):
93
     def handle_charref(self, number):
94
+        if not self.is_valid_tag():
95
+            return
75
         if number[0] in (u'x', u'X'):
96
         if number[0] in (u'x', u'X'):
76
             codepoint = int(number[1:], 16)
97
             codepoint = int(number[1:], 16)
77
         else:
98
         else:
79
         self.result.append(unichr(codepoint))
100
         self.result.append(unichr(codepoint))
80
 
101
 
81
     def handle_entityref(self, name):
102
     def handle_entityref(self, name):
103
+        if not self.is_valid_tag():
104
+            return
82
         # codepoint = htmlentitydefs.name2codepoint[name]
105
         # codepoint = htmlentitydefs.name2codepoint[name]
83
         # self.result.append(unichr(codepoint))
106
         # self.result.append(unichr(codepoint))
84
         self.result.append(name)
107
         self.result.append(name)