瀏覽代碼

[fix] highlighting only html

asciimoo 11 年之前
父節點
當前提交
7b4ec5c5e9
共有 3 個文件被更改,包括 35 次插入和 28 次刪除
  1. 0
    28
      searx/engines/__init__.py
  2. 26
    0
      searx/utils.py
  3. 9
    0
      searx/webapp.py

+ 0
- 28
searx/engines/__init__.py 查看文件

@@ -25,7 +25,6 @@ from urlparse import urlparse
25 25
 from searx import settings
26 26
 import ConfigParser
27 27
 import sys
28
-import re
29 28
 from datetime import datetime
30 29
 
31 30
 engine_dir = dirname(realpath(__file__))
@@ -106,31 +105,6 @@ def make_callback(engine_name, results, suggestions, callback, params):
106 105
         results[engine_name] = cb_res
107 106
     return process_callback
108 107
 
109
-def highlight_content(content, query):
110
-
111
-    if not content:
112
-        return None
113
-    # ignoring html contents
114
-    # TODO better html content detection
115
-    if content.find('<') != -1:
116
-        return content
117
-
118
-    query = query.decode('utf-8')
119
-    if content.lower().find(query.lower()) > -1:
120
-        query_regex = u'({0})'.format(re.escape(query))
121
-        content = re.sub(query_regex, '<b>\\1</b>', content, flags=re.I | re.U)
122
-    else:
123
-        regex_parts = []
124
-        for chunk in query.split():
125
-            if len(chunk) == 1:
126
-                regex_parts.append(u'\W+{0}\W+'.format(re.escape(chunk)))
127
-            else:
128
-                regex_parts.append(u'{0}'.format(re.escape(chunk)))
129
-        query_regex = u'({0})'.format('|'.join(regex_parts))
130
-        content = re.sub(query_regex, '<b>\\1</b>', content, flags=re.I | re.U)
131
-
132
-    return content
133
-
134 108
 def score_results(results):
135 109
     flat_res = filter(None, chain.from_iterable(izip_longest(*results.values())))
136 110
     flat_len = len(flat_res)
@@ -218,8 +192,6 @@ def search(query, request, selected_engines):
218 192
     results = score_results(results)
219 193
 
220 194
     for result in results:
221
-        if 'content' in result:
222
-            result['content'] = highlight_content(result['content'], query)
223 195
         for res_engine in result['engines']:
224 196
             engines[result['engine']].stats['score_count'] += result['score']
225 197
 

+ 26
- 0
searx/utils.py 查看文件

@@ -3,6 +3,32 @@ from HTMLParser import HTMLParser
3 3
 import csv
4 4
 import codecs
5 5
 import cStringIO
6
+import re
7
+
8
+def highlight_content(content, query):
9
+
10
+    if not content:
11
+        return None
12
+    # ignoring html contents
13
+    # TODO better html content detection
14
+    if content.find('<') != -1:
15
+        return content
16
+
17
+    query = query.decode('utf-8')
18
+    if content.lower().find(query.lower()) > -1:
19
+        query_regex = u'({0})'.format(re.escape(query))
20
+        content = re.sub(query_regex, '<b>\\1</b>', content, flags=re.I | re.U)
21
+    else:
22
+        regex_parts = []
23
+        for chunk in query.split():
24
+            if len(chunk) == 1:
25
+                regex_parts.append(u'\W+{0}\W+'.format(re.escape(chunk)))
26
+            else:
27
+                regex_parts.append(u'{0}'.format(re.escape(chunk)))
28
+        query_regex = u'({0})'.format('|'.join(regex_parts))
29
+        content = re.sub(query_regex, '<b>\\1</b>', content, flags=re.I | re.U)
30
+
31
+    return content
6 32
 
7 33
 class HTMLTextExtractor(HTMLParser):
8 34
     def __init__(self):

+ 9
- 0
searx/webapp.py 查看文件

@@ -29,6 +29,7 @@ import json
29 29
 import cStringIO
30 30
 from searx.utils import UnicodeWriter
31 31
 from flask import send_from_directory
32
+from searx.utils import highlight_content, html_to_text
32 33
 
33 34
 
34 35
 
@@ -104,6 +105,14 @@ def index():
104 105
     results, suggestions = search(query, request, selected_engines)
105 106
 
106 107
     for result in results:
108
+        if request_data.get('format', 'html') == 'html':
109
+            if 'content' in result:
110
+                result['content'] = highlight_content(result['content'], query)
111
+            result['title'] = highlight_content(result['title'], query)
112
+        else:
113
+            if 'content' in result:
114
+                result['content'] = html_to_text(result['content']).strip()
115
+            result['title'] = html_to_text(result['title']).strip()
107 116
         if len(result['url']) > 74:
108 117
             result['pretty_url'] = result['url'][:35] + '[..]' + result['url'][-35:]
109 118
         else: