Browse Source

[enh][mod] code refactor ++ search engine filtering

asciimoo 11 years ago
parent
commit
0540ea9ee2
3 changed files with 133 additions and 100 deletions
  1. 94
    0
      searx/search.py
  2. 4
    4
      searx/tests/test_webapp.py
  3. 35
    96
      searx/webapp.py

+ 94
- 0
searx/search.py View File

1
+from searx.engines import (
2
+    categories, engines, engine_shortcuts
3
+)
4
+from searx.languages import language_codes
5
+
6
+
7
class Search(object):

    """Search information container.

    Collects everything needed to run one search from an incoming request:
    the query string, the page number, the language, the engines to ask and
    the categories they belong to.  ``results`` and ``suggestions`` start
    empty and are meant to be filled in by the caller.
    """

    def __init__(self, request):
        """Parse ``request`` into search parameters.

        ``request`` must expose ``cookies``, ``method``, ``form`` and
        ``args`` (mapping-like objects with ``get``).

        Raises ``Exception('noquery')`` when no usable ``q`` parameter is
        present and ``Exception('wrong pagenumber')`` when ``pageno`` is not
        a positive integer.
        """
        super(Search, self).__init__()
        self.query = None
        self.engines = []
        self.categories = []
        self.paging = False
        self.pageno = 1
        self.lang = 'all'
        self.results = []
        self.suggestions = []
        self.request_data = {}

        # Engines the user disabled; stored as a comma separated cookie.
        blocked = request.cookies.get('blocked_engines')
        self.blocked_engines = blocked.split(',') if blocked else []

        # Only accept a language cookie that names a known language code.
        cookie_lang = request.cookies.get('language')
        if cookie_lang and cookie_lang in (x[0] for x in language_codes):
            self.lang = cookie_lang

        if request.method == 'POST':
            self.request_data = request.form
        else:
            self.request_data = request.args

        # TODO better exceptions
        query = self.request_data.get('q')
        # A whitespace-only query has no tokens; treat it like a missing one
        # instead of failing later with an IndexError.
        if not query or not query.strip():
            raise Exception('noquery')
        self.query = query

        pageno_param = self.request_data.get('pageno', '1')
        if not pageno_param.isdigit() or int(pageno_param) < 1:
            raise Exception('wrong pagenumber')
        self.pageno = int(pageno_param)

        self._select_engines_from_prefix()

        if self.engines:
            # Engines were chosen explicitly via the "!prefix" syntax.
            self.categories = list(set(engine['category']
                                       for engine in self.engines))
        else:
            self._select_categories(request)
            for categ in self.categories:
                self.engines.extend({'category': categ,
                                     'name': x.name}
                                    for x in categories[categ]
                                    if x.name not in self.blocked_engines)

    def _select_engines_from_prefix(self):
        """Handle a leading ``!name`` token (shortcut, engine or category).

        On a match the selected engines are appended to ``self.engines`` and
        the token is removed from ``self.query``.
        """
        query_parts = self.query.split()
        if not query_parts or not query_parts[0].startswith('!'):
            return

        # Underscores stand in for spaces in multi-word names.
        prefix = query_parts[0][1:].replace('_', ' ')
        if prefix in engine_shortcuts\
           and engine_shortcuts[prefix] not in self.blocked_engines:
            self.engines.append({'category': 'none',
                                 'name': engine_shortcuts[prefix]})
        elif prefix in engines\
                and prefix not in self.blocked_engines:
            self.engines.append({'category': 'none',
                                 'name': prefix})
        elif prefix in categories:
            # blocked_engines holds names, so compare by engine.name.
            self.engines.extend({'category': prefix,
                                 'name': engine.name}
                                for engine in categories[prefix]
                                if engine.name not in self.blocked_engines)

        # Strip the prefix only when it actually selected something.
        if self.engines:
            self.query = self.query.replace(query_parts[0], '', 1).strip()

    def _select_categories(self, request):
        """Fill ``self.categories`` from request params, cookie or default.

        Order of precedence: ``category_<name>`` request parameters, then the
        ``categories`` cookie, then the single category ``'general'``.
        Unknown category names are ignored.
        """
        self.categories = []
        for pd_name, pd in self.request_data.items():
            if pd_name.startswith('category_'):
                category = pd_name[9:]  # len('category_') == 9
                if category in categories:
                    self.categories.append(category)

        if not self.categories:
            cookie_categories = request.cookies.get('categories', '')
            for ccateg in cookie_categories.split(','):
                if ccateg in categories:
                    self.categories.append(ccateg)

        if not self.categories:
            self.categories = ['general']

+ 4
- 4
searx/tests/test_webapp.py View File

39
         self.assertEqual(result.status_code, 200)
39
         self.assertEqual(result.status_code, 200)
40
         self.assertIn('<div class="title"><h1>searx</h1></div>', result.data)
40
         self.assertIn('<div class="title"><h1>searx</h1></div>', result.data)
41
 
41
 
42
-    @patch('searx.webapp.search')
42
+    @patch('searx.webapp.do_search')
43
     def test_index_html(self, search):
43
     def test_index_html(self, search):
44
         search.return_value = (
44
         search.return_value = (
45
             self.test_results,
45
             self.test_results,
55
             result.data
55
             result.data
56
         )
56
         )
57
 
57
 
58
-    @patch('searx.webapp.search')
58
+    @patch('searx.webapp.do_search')
59
     def test_index_json(self, search):
59
     def test_index_json(self, search):
60
         search.return_value = (
60
         search.return_value = (
61
             self.test_results,
61
             self.test_results,
71
         self.assertEqual(
71
         self.assertEqual(
72
             result_dict['results'][0]['url'], 'http://first.test.xyz')
72
             result_dict['results'][0]['url'], 'http://first.test.xyz')
73
 
73
 
74
-    @patch('searx.webapp.search')
74
+    @patch('searx.webapp.do_search')
75
     def test_index_csv(self, search):
75
     def test_index_csv(self, search):
76
         search.return_value = (
76
         search.return_value = (
77
             self.test_results,
77
             self.test_results,
86
             result.data
86
             result.data
87
         )
87
         )
88
 
88
 
89
-    @patch('searx.webapp.search')
89
+    @patch('searx.webapp.do_search')
90
     def test_index_rss(self, search):
90
     def test_index_rss(self, search):
91
         search.return_value = (
91
         search.return_value = (
92
             self.test_results,
92
             self.test_results,

+ 35
- 96
searx/webapp.py View File

28
 from flask.ext.babel import Babel
28
 from flask.ext.babel import Babel
29
 from searx import settings, searx_dir
29
 from searx import settings, searx_dir
30
 from searx.engines import (
30
 from searx.engines import (
31
-    search, categories, engines, get_engines_stats, engine_shortcuts
31
+    search as do_search, categories, engines, get_engines_stats,
32
+    engine_shortcuts
32
 )
33
 )
33
 from searx.utils import UnicodeWriter, highlight_content, html_to_text
34
 from searx.utils import UnicodeWriter, highlight_content, html_to_text
34
 from searx.languages import language_codes
35
 from searx.languages import language_codes
36
+from searx.search import Search
35
 
37
 
36
 
38
 
37
 app = Flask(
39
 app = Flask(
94
     return render_template(template_name, **kwargs)
96
     return render_template(template_name, **kwargs)
95
 
97
 
96
 
98
 
97
-def parse_query(query):
98
-    query_engines = []
99
-    query_parts = query.split()
100
-
101
-    if query_parts[0].startswith('!'):
102
-        prefix = query_parts[0][1:].replace('_', ' ')
103
-        if prefix in engine_shortcuts:
104
-            query_engines.append({'category': 'none',
105
-                                  'name': engine_shortcuts[prefix]})
106
-        elif prefix in engines:
107
-            query_engines.append({'category': 'none',
108
-                                  'name': prefix})
109
-        elif prefix in categories:
110
-            query_engines.extend({'category': prefix,
111
-                                  'name': engine.name}
112
-                                 for engine in categories[prefix])
113
-
114
-    if len(query_engines):
115
-        query = query.replace(query_parts[0], '', 1).strip()
116
-    return query, query_engines
117
-
118
-
119
 @app.route('/', methods=['GET', 'POST'])
99
 @app.route('/', methods=['GET', 'POST'])
120
 def index():
100
 def index():
121
     """Render index page.
101
     """Render index page.
122
 
102
 
123
     Supported outputs: html, json, csv, rss.
103
     Supported outputs: html, json, csv, rss.
124
     """
104
     """
125
-    paging = False
126
-    lang = 'all'
127
-
128
-    if request.cookies.get('language')\
129
-       and request.cookies['language'] in (x[0] for x in language_codes):
130
-        lang = request.cookies['language']
131
-
132
-    if request.method == 'POST':
133
-        request_data = request.form
134
-    else:
135
-        request_data = request.args
136
-    if not request_data.get('q'):
137
-        return render('index.html')
138
 
105
 
139
-    pageno_param = request_data.get('pageno', '1')
140
-    if not pageno_param.isdigit() or int(pageno_param) < 1:
106
+    try:
107
+        search = Search(request)
108
+    except:
141
         return render('index.html')
109
         return render('index.html')
142
 
110
 
143
-    pageno = int(pageno_param)
144
-
145
-    selected_categories = []
146
-
147
-    query, selected_engines = parse_query(request_data['q'].encode('utf-8'))
148
-
149
-    if len(selected_engines):
150
-        selected_categories = list(set(engine['category']
151
-                                       for engine in selected_engines))
152
-    else:
153
-        for pd_name, pd in request_data.items():
154
-            if pd_name.startswith('category_'):
155
-                category = pd_name[9:]
156
-                if not category in categories:
157
-                    continue
158
-                selected_categories.append(category)
159
-        if not len(selected_categories):
160
-            cookie_categories = request.cookies.get('categories', '')
161
-            cookie_categories = cookie_categories.split(',')
162
-            for ccateg in cookie_categories:
163
-                if ccateg in categories:
164
-                    selected_categories.append(ccateg)
165
-        if not len(selected_categories):
166
-            selected_categories = ['general']
167
-
168
-        for categ in selected_categories:
169
-            selected_engines.extend({'category': categ,
170
-                                     'name': x.name}
171
-                                    for x in categories[categ])
172
-
173
-    results, suggestions = search(query,
174
-                                  request,
175
-                                  selected_engines,
176
-                                  pageno,
177
-                                  lang)
178
-
179
-    for result in results:
180
-        if not paging and engines[result['engine']].paging:
181
-            paging = True
182
-        if request_data.get('format', 'html') == 'html':
111
+    # TODO moar refactor - do_search integration into Search class
112
+    search.results, search.suggestions = do_search(search.query,
113
+                                                   request,
114
+                                                   search.engines,
115
+                                                   search.pageno,
116
+                                                   search.lang)
117
+
118
+    for result in search.results:
119
+        if not search.paging and engines[result['engine']].paging:
120
+            search.paging = True
121
+        if search.request_data.get('format', 'html') == 'html':
183
             if 'content' in result:
122
             if 'content' in result:
184
-                result['content'] = highlight_content(result['content'], query)
185
-            result['title'] = highlight_content(result['title'], query)
123
+                result['content'] = highlight_content(result['content'], search.query)
124
+            result['title'] = highlight_content(result['title'], search.query)
186
         else:
125
         else:
187
             if 'content' in result:
126
             if 'content' in result:
188
                 result['content'] = html_to_text(result['content']).strip()
127
                 result['content'] = html_to_text(result['content']).strip()
199
             if engine in favicons:
138
             if engine in favicons:
200
                 result['favicon'] = engine
139
                 result['favicon'] = engine
201
 
140
 
202
-    if request_data.get('format') == 'json':
203
-        return Response(json.dumps({'query': query, 'results': results}),
141
+    if search.request_data.get('format') == 'json':
142
+        return Response(json.dumps({'query': search.query, 'results': search.results}),
204
                         mimetype='application/json')
143
                         mimetype='application/json')
205
-    elif request_data.get('format') == 'csv':
144
+    elif search.request_data.get('format') == 'csv':
206
         csv = UnicodeWriter(cStringIO.StringIO())
145
         csv = UnicodeWriter(cStringIO.StringIO())
207
         keys = ('title', 'url', 'content', 'host', 'engine', 'score')
146
         keys = ('title', 'url', 'content', 'host', 'engine', 'score')
208
-        if len(results):
147
+        if len(search.results):
209
             csv.writerow(keys)
148
             csv.writerow(keys)
210
-            for row in results:
149
+            for row in search.results:
211
                 row['host'] = row['parsed_url'].netloc
150
                 row['host'] = row['parsed_url'].netloc
212
                 csv.writerow([row.get(key, '') for key in keys])
151
                 csv.writerow([row.get(key, '') for key in keys])
213
         csv.stream.seek(0)
152
         csv.stream.seek(0)
214
         response = Response(csv.stream.read(), mimetype='application/csv')
153
         response = Response(csv.stream.read(), mimetype='application/csv')
215
-        content_disp = 'attachment;Filename=searx_-_{0}.csv'.format(query)
154
+        content_disp = 'attachment;Filename=searx_-_{0}.csv'.format(search.query)
216
         response.headers.add('Content-Disposition', content_disp)
155
         response.headers.add('Content-Disposition', content_disp)
217
         return response
156
         return response
218
-    elif request_data.get('format') == 'rss':
157
+    elif search.request_data.get('format') == 'rss':
219
         response_rss = render(
158
         response_rss = render(
220
             'opensearch_response_rss.xml',
159
             'opensearch_response_rss.xml',
221
-            results=results,
222
-            q=request_data['q'],
223
-            number_of_results=len(results),
160
+            results=search.results,
161
+            q=search.request_data['q'],
162
+            number_of_results=len(search.results),
224
             base_url=get_base_url()
163
             base_url=get_base_url()
225
         )
164
         )
226
         return Response(response_rss, mimetype='text/xml')
165
         return Response(response_rss, mimetype='text/xml')
227
 
166
 
228
     return render(
167
     return render(
229
         'results.html',
168
         'results.html',
230
-        results=results,
231
-        q=request_data['q'],
232
-        selected_categories=selected_categories,
233
-        paging=paging,
234
-        pageno=pageno,
235
-        suggestions=suggestions
169
+        results=search.results,
170
+        q=search.request_data['q'],
171
+        selected_categories=search.categories,
172
+        paging=search.paging,
173
+        pageno=search.pageno,
174
+        suggestions=search.suggestions
236
     )
175
     )
237
 
176
 
238
 
177