Browse Source

[fix] highlighting only html

asciimoo 11 years ago
parent
commit
7b4ec5c5e9
3 changed files with 35 additions and 28 deletions
  1. 0
    28
      searx/engines/__init__.py
  2. 26
    0
      searx/utils.py
  3. 9
    0
      searx/webapp.py

+ 0
- 28
searx/engines/__init__.py View File

25
 from searx import settings
25
 from searx import settings
26
 import ConfigParser
26
 import ConfigParser
27
 import sys
27
 import sys
28
-import re
29
 from datetime import datetime
28
 from datetime import datetime
30
 
29
 
31
 engine_dir = dirname(realpath(__file__))
30
 engine_dir = dirname(realpath(__file__))
106
         results[engine_name] = cb_res
105
         results[engine_name] = cb_res
107
     return process_callback
106
     return process_callback
108
 
107
 
109
-def highlight_content(content, query):
110
-
111
-    if not content:
112
-        return None
113
-    # ignoring html contents
114
-    # TODO better html content detection
115
-    if content.find('<') != -1:
116
-        return content
117
-
118
-    query = query.decode('utf-8')
119
-    if content.lower().find(query.lower()) > -1:
120
-        query_regex = u'({0})'.format(re.escape(query))
121
-        content = re.sub(query_regex, '<b>\\1</b>', content, flags=re.I | re.U)
122
-    else:
123
-        regex_parts = []
124
-        for chunk in query.split():
125
-            if len(chunk) == 1:
126
-                regex_parts.append(u'\W+{0}\W+'.format(re.escape(chunk)))
127
-            else:
128
-                regex_parts.append(u'{0}'.format(re.escape(chunk)))
129
-        query_regex = u'({0})'.format('|'.join(regex_parts))
130
-        content = re.sub(query_regex, '<b>\\1</b>', content, flags=re.I | re.U)
131
-
132
-    return content
133
-
134
 def score_results(results):
108
 def score_results(results):
135
     flat_res = filter(None, chain.from_iterable(izip_longest(*results.values())))
109
     flat_res = filter(None, chain.from_iterable(izip_longest(*results.values())))
136
     flat_len = len(flat_res)
110
     flat_len = len(flat_res)
218
     results = score_results(results)
192
     results = score_results(results)
219
 
193
 
220
     for result in results:
194
     for result in results:
221
-        if 'content' in result:
222
-            result['content'] = highlight_content(result['content'], query)
223
         for res_engine in result['engines']:
195
         for res_engine in result['engines']:
224
             engines[result['engine']].stats['score_count'] += result['score']
196
             engines[result['engine']].stats['score_count'] += result['score']
225
 
197
 

+ 26
- 0
searx/utils.py View File

3
 import csv
3
 import csv
4
 import codecs
4
 import codecs
5
 import cStringIO
5
 import cStringIO
6
+import re
7
+
8
def highlight_content(content, query):
    """Wrap occurrences of the search query in ``content`` with ``<b>`` tags.

    When the whole query occurs in the content (compared case-insensitively)
    every occurrence of the full phrase is highlighted.  Otherwise each
    whitespace-separated query term is highlighted on its own; a
    one-character term is only highlighted where it stands alone, i.e. where
    it is not directly next to another word character.

    :param content: text of a result field.  It is returned untouched when
        it contains ``<``, because it is then treated as HTML.
    :param query: the search query, either UTF-8 encoded bytes or text that
        has already been decoded.
    :returns: the content with matches wrapped in ``<b>...</b>``, or
        ``None`` when ``content`` is empty.
    """
    if not content:
        return None
    # ignoring html contents
    # TODO better html content detection
    if content.find('<') != -1:
        return content

    # Decode only raw bytes; calling .decode() on already decoded text fails.
    if isinstance(query, bytes):
        query = query.decode('utf-8')

    terms = query.split()
    if not terms:
        # An empty pattern matches at every position and would scatter
        # empty <b></b> pairs between all characters of the content.
        return content

    if query.lower() in content.lower():
        # The complete phrase is present: highlight it as one unit.
        patterns = [re.escape(query)]
    else:
        patterns = []
        for term in terms:
            escaped = re.escape(term)
            if len(term) == 1:
                # Lookarounds keep the neighbouring separators out of the
                # <b> tag and also match at the very start/end of content.
                patterns.append(u'(?<!\\w){0}(?!\\w)'.format(escaped))
            else:
                patterns.append(escaped)

    query_regex = u'({0})'.format(u'|'.join(patterns))
    return re.sub(query_regex, u'<b>\\1</b>', content, flags=re.I | re.U)
6
 
32
 
7
 class HTMLTextExtractor(HTMLParser):
33
 class HTMLTextExtractor(HTMLParser):
8
     def __init__(self):
34
     def __init__(self):

+ 9
- 0
searx/webapp.py View File

29
 import cStringIO
29
 import cStringIO
30
 from searx.utils import UnicodeWriter
30
 from searx.utils import UnicodeWriter
31
 from flask import send_from_directory
31
 from flask import send_from_directory
32
+from searx.utils import highlight_content, html_to_text
32
 
33
 
33
 
34
 
34
 
35
 
104
     results, suggestions = search(query, request, selected_engines)
105
     results, suggestions = search(query, request, selected_engines)
105
 
106
 
106
     for result in results:
107
     for result in results:
108
+        if request_data.get('format', 'html') == 'html':
109
+            if 'content' in result:
110
+                result['content'] = highlight_content(result['content'], query)
111
+            result['title'] = highlight_content(result['title'], query)
112
+        else:
113
+            if 'content' in result:
114
+                result['content'] = html_to_text(result['content']).strip()
115
+            result['title'] = html_to_text(result['title']).strip()
107
         if len(result['url']) > 74:
116
         if len(result['url']) > 74:
108
             result['pretty_url'] = result['url'][:35] + '[..]' + result['url'][-35:]
117
             result['pretty_url'] = result['url'][:35] + '[..]' + result['url'][-35:]
109
         else:
118
         else: