[mod] do not escape html content in engines

Adam Tauber, 8 years ago
Commit 16bdc0baf4
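
Context: every engine below imported cgi.escape and HTML-escaped titles and
contents while parsing results. This commit makes engines return plain text
and leaves escaping to a single place downstream (assumption based on the
commit message; the escaping site itself is not part of this diff). A minimal
standalone sketch of the double-escaping that per-engine escaping risks, not
searx code:

    # Python 2, matching the engines below
    from cgi import escape

    snippet = 'Tom & Jerry <b>clips</b>'

    once = escape(snippet)   # 'Tom &amp; Jerry &lt;b&gt;clips&lt;/b&gt;'
    twice = escape(once)     # 'Tom &amp;amp; Jerry &amp;lt;b&amp;gt;...'

    # If an engine escapes once and the output layer escapes again, users
    # see literal '&amp;' and '&lt;b&gt;' in result snippets. Dropping the
    # engine-side escape() leaves exactly one escaping step.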

searx/engines/archlinux.py (+1 -2)

@@ -12,7 +12,6 @@
 """
 
 from urlparse import urljoin
-from cgi import escape
 from urllib import urlencode
 from lxml import html
 from searx.engines.xpath import extract_text
@@ -135,7 +134,7 @@ def response(resp):
     for result in dom.xpath(xpath_results):
         link = result.xpath(xpath_link)[0]
         href = urljoin(base_url, link.attrib.get('href'))
-        title = escape(extract_text(link))
+        title = extract_text(link)
 
         results.append({'url': href,
                         'title': title})

searx/engines/base.py (+1 -2)

@@ -16,7 +16,6 @@
 from lxml import etree
 from urllib import urlencode
 from searx.utils import searx_useragent
-from cgi import escape
 from datetime import datetime
 import re
 
@@ -94,7 +93,7 @@ def response(resp):
                 url = item.text
 
             elif item.attrib["name"] == "dcdescription":
-                content = escape(item.text[:300])
+                content = item.text[:300]
                 if len(item.text) > 300:
                     content += "..."
 

searx/engines/bing.py (+2 -3)

@@ -14,7 +14,6 @@
 """
 
 from urllib import urlencode
-from cgi import escape
 from lxml import html
 from searx.engines.xpath import extract_text
 
@@ -61,7 +60,7 @@ def response(resp):
         link = result.xpath('.//h3/a')[0]
         url = link.attrib.get('href')
         title = extract_text(link)
-        content = escape(extract_text(result.xpath('.//p')))
+        content = extract_text(result.xpath('.//p'))
 
         # append result
         results.append({'url': url,
@@ -73,7 +72,7 @@ def response(resp):
         link = result.xpath('.//h2/a')[0]
         url = link.attrib.get('href')
         title = extract_text(link)
-        content = escape(extract_text(result.xpath('.//p')))
+        content = extract_text(result.xpath('.//p'))
 
         # append result
         results.append({'url': url,

searx/engines/btdigg.py (+2 -3)

@@ -11,7 +11,6 @@
 """
 
 from urlparse import urljoin
-from cgi import escape
 from urllib import quote
 from lxml import html
 from operator import itemgetter
@@ -51,8 +50,8 @@ def response(resp):
     for result in search_res:
         link = result.xpath('.//td[@class="torrent_name"]//a')[0]
         href = urljoin(url, link.attrib.get('href'))
-        title = escape(extract_text(link))
-        content = escape(extract_text(result.xpath('.//pre[@class="snippet"]')[0]))
+        title = extract_text(link)
+        content = extract_text(result.xpath('.//pre[@class="snippet"]')[0])
         content = "<br />".join(content.split("\n"))
 
         filesize = result.xpath('.//span[@class="attr_val"]/text()')[0].split()[0]

searx/engines/dailymotion.py (+1 -2)

@@ -14,7 +14,6 @@
 
 from urllib import urlencode
 from json import loads
-from cgi import escape
 from datetime import datetime
 
 # engine dependent config
@@ -57,7 +56,7 @@ def response(resp):
     for res in search_res['list']:
         title = res['title']
         url = res['url']
-        content = escape(res['description'])
+        content = res['description']
         thumbnail = res['thumbnail_360_url']
         publishedDate = datetime.fromtimestamp(res['created_time'], None)
         embedded = embedded_url.format(videoid=res['id'])

searx/engines/deezer.py (+5 -4)

@@ -51,10 +51,11 @@ def response(resp):
             if url.startswith('http://'):
                 url = 'https' + url[4:]
 
-            content = result['artist']['name'] +\
-                " &bull; " +\
-                result['album']['title'] +\
-                " &bull; " + result['title']
+            content = '{} - {} - {}'.format(
+                result['artist']['name'],
+                result['album']['title'],
+                result['title'])
+
             embedded = embedded_url.format(audioid=result['id'])
 
             # append result
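
Besides removing escape(), this hunk (and the identical spotify.py hunk
below) drops the HTML '&bull;' entity as a separator: once content is plain
text that gets escaped downstream, the entity would be displayed literally.
A quick check of the new format string, with a hypothetical track:

    result = {'artist': {'name': 'Daft Punk'},
              'album': {'title': 'Discovery'},
              'title': 'One More Time'}
    content = '{} - {} - {}'.format(
        result['artist']['name'],
        result['album']['title'],
        result['title'])
    # content == 'Daft Punk - Discovery - One More Time'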

searx/engines/dictzone.py (+2 -3)

@@ -12,7 +12,6 @@
 import re
 from urlparse import urljoin
 from lxml import html
-from cgi import escape
 from searx.utils import is_valid_lang
 
 categories = ['general']
@@ -62,8 +61,8 @@ def response(resp):
 
         results.append({
             'url': urljoin(resp.url, '?%d' % k),
-            'title': escape(from_result.text_content()),
-            'content': escape('; '.join(to_results))
+            'title': from_result.text_content(),
+            'content': '; '.join(to_results)
         })
 
     return results

searx/engines/digg.py (+1 -2)

@@ -13,7 +13,6 @@
 from urllib import quote_plus
 from json import loads
 from lxml import html
-from cgi import escape
 from dateutil import parser
 
 # engine dependent config
@@ -56,7 +55,7 @@ def response(resp):
         url = result.attrib.get('data-contenturl')
         thumbnail = result.xpath('.//img')[0].attrib.get('src')
         title = ''.join(result.xpath(title_xpath))
-        content = escape(''.join(result.xpath(content_xpath)))
+        content = ''.join(result.xpath(content_xpath))
         pubdate = result.xpath(pubdate_xpath)[0].attrib.get('datetime')
         publishedDate = parser.parse(pubdate)
 

searx/engines/fdroid.py (+1 -2)

@@ -9,7 +9,6 @@
  @parse        url, title, content
 """
 
-from cgi import escape
 from urllib import urlencode
 from searx.engines.xpath import extract_text
 from lxml import html
@@ -43,7 +42,7 @@ def response(resp):
         img_src = app.xpath('.//img/@src')[0]
 
         content = extract_text(app.xpath('./p')[0])
-        content = escape(content.replace(title, '', 1).strip())
+        content = content.replace(title, '', 1).strip()
 
         results.append({'url': url,
                         'title': title,

searx/engines/flickr.py (+3 -11)

@@ -77,21 +77,13 @@ def response(resp):
 
         url = build_flickr_url(photo['owner'], photo['id'])
 
-        title = photo['title']
-
-        content = '<span class="photo-author">' +\
-                  photo['ownername'] +\
-                  '</span><br />' +\
-                  '<span class="description">' +\
-                  photo['description']['_content'] +\
-                  '</span>'
-
         # append result
         results.append({'url': url,
-                        'title': title,
+                        'title': photo['title'],
                         'img_src': img_src,
                         'thumbnail_src': thumbnail_src,
-                        'content': content,
+                        'content': photo['description']['_content'],
+                        'author': photo['ownername'],
                         'template': 'images.html'})
 
     # return results

searx/engines/flickr_noapi.py (+3 -4)

@@ -102,16 +102,15 @@ def response(resp):
 
         title = photo.get('title', '')
 
-        content = '<span class="photo-author">' +\
-                  photo['username'] +\
-                  '</span><br />'
+        author = photo['username']
 
         # append result
         results.append({'url': url,
                         'title': title,
                         'img_src': img_src,
                         'thumbnail_src': thumbnail_src,
-                        'content': content,
+                        'content': '',
+                        'author': author,
                         'template': 'images.html'})
 
     return results
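
Both flickr engines stop packing the photographer into an HTML content
string and instead emit a dedicated 'author' key (presumably consumed by
the images.html template; the template itself is not part of this diff).
The appended dict now has the rough shape:

    {'url': url, 'title': title,
     'img_src': img_src, 'thumbnail_src': thumbnail_src,
     'content': '', 'author': photo['username'],
     'template': 'images.html'}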

searx/engines/gigablast.py (+2 -3)

@@ -10,7 +10,6 @@
  @parse       url, title, content
 """
 
-from cgi import escape
 from json import loads
 from random import randint
 from time import time
@@ -78,8 +77,8 @@ def response(resp):
     for result in response_json['results']:
         # append result
         results.append({'url': result['url'],
-                        'title': escape(result['title']),
-                        'content': escape(result['sum'])})
+                        'title': result['title'],
+                        'content': result['sum']})
 
     # return results
     return results

searx/engines/github.py (+1 -2)

@@ -12,7 +12,6 @@
 
 from urllib import urlencode
 from json import loads
-from cgi import escape
 
 # engine dependent config
 categories = ['it']
@@ -48,7 +47,7 @@ def response(resp):
         url = res['html_url']
 
         if res['description']:
-            content = escape(res['description'][:500])
+            content = res['description'][:500]
         else:
             content = ''
 

searx/engines/google.py (+2 -3)

@@ -9,7 +9,6 @@
 # @parse       url, title, content, suggestion
 
 import re
-from cgi import escape
 from urllib import urlencode
 from urlparse import urlparse, parse_qsl
 from lxml import html, etree
@@ -155,7 +154,7 @@ def parse_url(url_string, google_hostname):
 def extract_text_from_dom(result, xpath):
     r = result.xpath(xpath)
     if len(r) > 0:
-        return escape(extract_text(r[0]))
+        return extract_text(r[0])
     return None
 
 
@@ -264,7 +263,7 @@ def response(resp):
     # parse suggestion
     for suggestion in dom.xpath(suggestion_xpath):
         # append suggestion
-        results.append({'suggestion': escape(extract_text(suggestion))})
+        results.append({'suggestion': extract_text(suggestion)})
 
     # return results
     return results

searx/engines/kickass.py (+1 -2)

@@ -11,7 +11,6 @@
 """
 
 from urlparse import urljoin
-from cgi import escape
 from urllib import quote
 from lxml import html
 from operator import itemgetter
@@ -57,7 +56,7 @@ def response(resp):
         link = result.xpath('.//a[@class="cellMainLink"]')[0]
         href = urljoin(url, link.attrib['href'])
         title = extract_text(link)
-        content = escape(extract_text(result.xpath(content_xpath)))
+        content = extract_text(result.xpath(content_xpath))
         seed = extract_text(result.xpath('.//td[contains(@class, "green")]'))
         leech = extract_text(result.xpath('.//td[contains(@class, "red")]'))
         filesize_info = extract_text(result.xpath('.//td[contains(@class, "nobr")]'))

searx/engines/nyaa.py (+2 -4)

@@ -9,7 +9,6 @@
  @parse        url, title, content, seed, leech, torrentfile
 """
 
-from cgi import escape
 from urllib import urlencode
 from lxml import html
 from searx.engines.xpath import extract_text
@@ -78,7 +77,7 @@ def response(resp):
 
         # torrent title
         page_a = result.xpath(xpath_title)[0]
-        title = escape(extract_text(page_a))
+        title = extract_text(page_a)
 
         # link to the page
         href = page_a.attrib.get('href')
@@ -90,7 +89,7 @@ def response(resp):
         try:
             file_size, suffix = result.xpath(xpath_filesize)[0].split(' ')
             file_size = int(float(file_size) * get_filesize_mul(suffix))
-        except Exception as e:
+        except:
            file_size = None
 
         # seed count
@@ -105,7 +104,6 @@ def response(resp):
         # content string contains all information not included into template
         content = 'Category: "{category}". Downloaded {downloads} times.'
         content = content.format(category=category, downloads=downloads)
-        content = escape(content)
 
         results.append({'url': href,
                         'title': title,

searx/engines/piratebay.py (+1 -2)

@@ -9,7 +9,6 @@
 # @parse       url, title, content, seed, leech, magnetlink
 
 from urlparse import urljoin
-from cgi import escape
 from urllib import quote
 from lxml import html
 from operator import itemgetter
@@ -62,7 +61,7 @@ def response(resp):
         link = result.xpath('.//div[@class="detName"]//a')[0]
         href = urljoin(url, link.attrib.get('href'))
         title = extract_text(link)
-        content = escape(extract_text(result.xpath(content_xpath)))
+        content = extract_text(result.xpath(content_xpath))
         seed, leech = result.xpath('.//td[@align="right"]/text()')[:2]
 
         # convert seed to int if possible

searx/engines/reddit.py (+1 -2)

@@ -11,7 +11,6 @@
 """
 
 import json
-from cgi import escape
 from urllib import urlencode
 from urlparse import urlparse, urljoin
 from datetime import datetime
@@ -68,7 +67,7 @@ def response(resp):
             img_results.append(params)
         else:
             created = datetime.fromtimestamp(data['created_utc'])
-            content = escape(data['selftext'])
+            content = data['selftext']
             if len(content) > 500:
                 content = content[:500] + '...'
             params['content'] = content

searx/engines/searchcode_doc.py (+2 -10)

@@ -44,20 +44,12 @@ def response(resp):
     # parse results
     for result in search_results.get('results', []):
         href = result['url']
-        title = "[" + result['type'] + "] " +\
-                result['namespace'] +\
-                " " + result['name']
-        content = '<span class="highlight">[' +\
-                  result['type'] + "] " +\
-                  result['name'] + " " +\
-                  result['synopsis'] +\
-                  "</span><br />" +\
-                  result['description']
+        title = "[{}] {} {}".format(result['type'], result['namespace'], result['name'])
 
         # append result
         results.append({'url': href,
                         'title': title,
-                        'content': content})
+                        'content': result['description']})
 
     # return results
     return results

searx/engines/seedpeer.py (+0 -1)

@@ -9,7 +9,6 @@
 # @parse       url, title, content, seed, leech, magnetlink
 
 from urlparse import urljoin
-from cgi import escape
 from urllib import quote
 from lxml import html
 from operator import itemgetter

searx/engines/spotify.py (+5 -4)

@@ -46,10 +46,11 @@ def response(resp):
         if result['type'] == 'track':
             title = result['name']
             url = result['external_urls']['spotify']
-            content = result['artists'][0]['name'] +\
-                " &bull; " +\
-                result['album']['name'] +\
-                " &bull; " + result['name']
+            content = '{} - {} - {}'.format(
+                result['artists'][0]['name'],
+                result['album']['name'],
+                result['name'])
+
             embedded = embedded_url.format(audioid=result['id'])
 
             # append result

searx/engines/stackoverflow.py (+2 -3)

@@ -11,7 +11,6 @@
 """
 
 from urlparse import urljoin
-from cgi import escape
 from urllib import urlencode
 from lxml import html
 from searx.engines.xpath import extract_text
@@ -48,8 +47,8 @@ def response(resp):
     for result in dom.xpath(results_xpath):
         link = result.xpath(link_xpath)[0]
         href = urljoin(url, link.attrib.get('href'))
-        title = escape(extract_text(link))
-        content = escape(extract_text(result.xpath(content_xpath)))
+        title = extract_text(link)
+        content = extract_text(result.xpath(content_xpath))
 
         # append result
         results.append({'url': href,

searx/engines/startpage.py (+2 -3)

@@ -11,7 +11,6 @@
 # @todo        paging
 
 from lxml import html
-from cgi import escape
 from dateutil import parser
 from datetime import datetime, timedelta
 import re
@@ -79,10 +78,10 @@ def response(resp):
         if re.match(r"^http(s|)://(www\.)?ixquick\.com/do/search\?.*$", url):
             continue
 
-        title = escape(extract_text(link))
+        title = extract_text(link)
 
         if result.xpath('./p[@class="desc clk"]'):
-            content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
+            content = extract_text(result.xpath('./p[@class="desc clk"]'))
         else:
             content = ''
 

searx/engines/subtitleseeker.py (+2 -3)

@@ -10,7 +10,6 @@
  @parse       url, title, content
 """
 
-from cgi import escape
 from urllib import quote_plus
 from lxml import html
 from searx.languages import language_codes
@@ -59,7 +58,7 @@ def response(resp):
         elif search_lang:
             href = href + search_lang + '/'
 
-        title = escape(extract_text(link))
+        title = extract_text(link)
 
         content = extract_text(result.xpath('.//div[contains(@class,"red")]'))
         content = content + " - "
@@ -75,7 +74,7 @@ def response(resp):
         # append result
         results.append({'url': href,
                         'title': title,
-                        'content': escape(content)})
+                        'content': content})
 
     # return results
    return results

searx/engines/swisscows.py (+4 -5)

@@ -10,7 +10,6 @@
  @parse       url, title, content
 """
 
-from cgi import escape
 from json import loads
 from urllib import urlencode, unquote
 import re
@@ -78,7 +77,7 @@ def response(resp):
 
             # append result
             results.append({'url': result['SourceUrl'],
-                            'title': escape(result['Title']),
+                            'title': result['Title'],
                             'content': '',
                             'img_src': img_url,
                             'template': 'images.html'})
@@ -90,8 +89,8 @@ def response(resp):
 
             # append result
             results.append({'url': result_url,
-                            'title': escape(result_title),
-                            'content': escape(result_content)})
+                            'title': result_title,
+                            'content': result_content})
 
     # parse images
     for result in json.get('Images', []):
@@ -100,7 +99,7 @@ def response(resp):
 
         # append result
        results.append({'url': result['SourceUrl'],
-                        'title': escape(result['Title']),
+                        'title': result['Title'],
                         'content': '',
                         'img_src': img_url,
                         'template': 'images.html'})

searx/engines/tokyotoshokan.py (+0 -1)

@@ -11,7 +11,6 @@
 """
 
 import re
-from cgi import escape
 from urllib import urlencode
 from lxml import html
 from searx.engines.xpath import extract_text

searx/engines/torrentz.py (+0 -1)

@@ -12,7 +12,6 @@
 """
 
 import re
-from cgi import escape
 from urllib import urlencode
 from lxml import html
 from searx.engines.xpath import extract_text

searx/engines/translated.py (+5 -6)

@@ -9,7 +9,6 @@
  @parse       url, title, content
 """
 import re
-from cgi import escape
 from searx.utils import is_valid_lang
 
 categories = ['general']
@@ -52,14 +51,14 @@ def request(query, params):
 def response(resp):
     results = []
     results.append({
-        'url': escape(web_url.format(
+        'url': web_url.format(
             from_lang=resp.search_params['from_lang'][2],
             to_lang=resp.search_params['to_lang'][2],
-            query=resp.search_params['query'])),
-        'title': escape('[{0}-{1}] {2}'.format(
+            query=resp.search_params['query']),
+        'title': '[{0}-{1}] {2}'.format(
             resp.search_params['from_lang'][1],
             resp.search_params['to_lang'][1],
-            resp.search_params['query'])),
-        'content': escape(resp.json()['responseData']['translatedText'])
+            resp.search_params['query']),
+        'content': resp.json()['responseData']['translatedText']
     })
     return results
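
Note that translated.py previously ran even the result's 'url' field through
escape(), so any '&' in the generated URL came back as '&amp;' in the stored
value. A quick illustration of why that field has to stay raw (standalone
snippet, not searx code; example URL is hypothetical):

    from cgi import escape
    escape('http://example.com/translate?from=en&to=de')
    # -> 'http://example.com/translate?from=en&amp;to=de'
    # no longer the URL that anything downstream can request verbatim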

searx/engines/wolframalpha_noapi.py (+0 -1)

@@ -8,7 +8,6 @@
 # @stable      no
 # @parse       url, infobox
 
-from cgi import escape
 from json import loads
 from time import time
 from urllib import urlencode

searx/engines/yandex.py (+2 -3)

@@ -9,7 +9,6 @@
  @parse       url, title, content
 """
 
-from cgi import escape
 from urllib import urlencode
 from lxml import html
 from searx.search import logger
@@ -52,8 +51,8 @@ def response(resp):
     for result in dom.xpath(results_xpath):
         try:
             res = {'url': result.xpath(url_xpath)[0],
-                   'title': escape(''.join(result.xpath(title_xpath))),
-                   'content': escape(''.join(result.xpath(content_xpath)))}
+                   'title': ''.join(result.xpath(title_xpath)),
+                   'content': ''.join(result.xpath(content_xpath))}
         except:
             logger.exception('yandex parse crash')
             continue