Parcourir la source

Merge pull request #106 from pointhi/query_parser

FIX: #101 implement query parser and use it inside autocompletion and search query extraction
Adam Tauber il y a 10 ans
Parent
révision
983339bb03
3 fichiers modifiés avec 164 ajouts et 58 suppressions
  1. 127
    0
      searx/query.py
  2. 10
    55
      searx/search.py
  3. 27
    3
      searx/webapp.py

+ 127
- 0
searx/query.py Voir le fichier

1
+#!/usr/bin/env python
2
+
3
+'''
4
+searx is free software: you can redistribute it and/or modify
5
+it under the terms of the GNU Affero General Public License as published by
6
+the Free Software Foundation, either version 3 of the License, or
7
+(at your option) any later version.
8
+
9
+searx is distributed in the hope that it will be useful,
10
+but WITHOUT ANY WARRANTY; without even the implied warranty of
11
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12
+GNU Affero General Public License for more details.
13
+
14
+You should have received a copy of the GNU Affero General Public License
15
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
16
+
17
+(C) 2014 by Thomas Pointhuber, <thomas.pointhuber@gmx.at>
18
+'''
19
+
20
+from searx.languages import language_codes
21
+from searx.engines import (
22
+    categories, engines, engine_shortcuts
23
+)
24
+import string
25
+import re
26
+
27
+
28
class Query(object):
    """Parse a raw search query into tags and the actual search terms.

    The leading whitespace-separated parts of a query may be tags:

    * ``:<language>`` selects a search language; it is matched against the
      entries of ``language_codes``,
    * ``!<name>`` selects an engine shortcut, an engine name or a whole
      category (underscores in ``<name>`` stand for spaces).

    Tags are only recognised up to the first part that is not a valid tag.
    That part and everything after it form the search query.
    """

    def __init__(self, query, blocked_engines):
        """Store the raw query and the engines that must not be selected.

        :param query: raw query string
        :param blocked_engines: list of blocked engine names; any falsy
            value (``None``, empty list) means that nothing is blocked
        """
        self.query = query
        self.blocked_engines = blocked_engines if blocked_engines else []

        # Every accepted tag and every whitespace run between tags is kept
        # as its own item; the last item accumulates the search terms.
        self.query_parts = []
        self.engines = []
        self.languages = []

    def parse_query(self):
        """Split ``self.query`` into tags and search terms.

        Fills ``query_parts``, ``engines`` and ``languages``. All three are
        rebuilt from scratch, so parsing the same object twice does not
        accumulate duplicate engines or languages.
        """
        self.query_parts = []
        self.engines = []
        self.languages = []

        # While True, the next part may still be a tag. It turns False at
        # the first part that is not an accepted tag and stays False.
        parse_next = True

        # The capturing group makes re.split keep the whitespace runs, so
        # the full query can be rebuilt by joining the parts again.
        for query_part in re.split(r'(\s+)', self.query):
            if not parse_next:
                # tag section is over: glue everything onto the search terms
                self.query_parts[-1] += query_part
                continue

            # Whitespace (and the empty string re.split yields at the edges
            # of the query) never ends the tag section.
            if query_part == '' or query_part.isspace():
                self.query_parts.append(query_part)
                continue

            if query_part.startswith(':'):
                parse_next = self._parse_language_tag(query_part[1:])
            elif query_part.startswith('!'):
                parse_next = self._parse_engine_tag(query_part[1:])
            else:
                parse_next = False

            self.query_parts.append(query_part)

    def _parse_language_tag(self, tag):
        """Record ``tag`` as search language if it matches a known language.

        Returns True when the tag was accepted, False otherwise.
        """
        lang = tag.lower()

        # A bare ":" must not count as a language: the empty string is a
        # prefix of every language id and would match the first entry.
        if not lang:
            return False

        for lc in language_codes:
            # .lower() on each field works for byte and unicode strings
            lang_id, lang_name, country = (field.lower() for field in lc)

            if lang == lang_id\
               or lang_id.startswith(lang)\
               or lang == lang_name\
               or lang == country:
                self.languages.append(lang)
                return True

        return False

    def _parse_engine_tag(self, tag):
        """Select the engine(s) named by ``tag`` unless they are blocked.

        ``tag`` is tried as engine shortcut, then as engine name, then as
        category name. Returns True when the tag was accepted.
        """
        prefix = tag.replace('_', ' ')

        # engine shortcut
        if prefix in engine_shortcuts\
           and engine_shortcuts[prefix] not in self.blocked_engines:
            self.engines.append({'category': 'none',
                                 'name': engine_shortcuts[prefix]})
            return True

        # engine name
        if prefix in engines and prefix not in self.blocked_engines:
            self.engines.append({'category': 'none',
                                 'name': prefix})
            return True

        # category name: use every engine of that category that is not
        # blocked. blocked_engines holds engine names, so the name (not the
        # engine object itself) has to be checked.
        if prefix in categories:
            self.engines.extend({'category': prefix,
                                 'name': engine.name}
                                for engine in categories[prefix]
                                if engine.name not in self.blocked_engines)
            return True

        return False

    def changeSearchQuery(self, search_query):
        """Replace the search terms (the last query part) with ``search_query``."""
        if self.query_parts:
            self.query_parts[-1] = search_query
        else:
            self.query_parts.append(search_query)

    def getSearchQuery(self):
        """Return the last query part, or ``''`` if there are no parts."""
        if self.query_parts:
            return self.query_parts[-1]
        return ''

    def getFullQuery(self):
        """Return all query parts joined back together, whitespace included."""
        # str.join replaces string.join, which only exists in Python 2
        return ''.join(self.query_parts)
127
+

+ 10
- 55
searx/search.py Voir le fichier

25
 )
25
 )
26
 from searx.languages import language_codes
26
 from searx.languages import language_codes
27
 from searx.utils import gen_useragent
27
 from searx.utils import gen_useragent
28
+from searx.query import Query
28
 
29
 
29
 
30
 
30
 number_of_searches = 0
31
 number_of_searches = 0
235
         self.pageno = int(pageno_param)
236
         self.pageno = int(pageno_param)
236
 
237
 
237
         # parse query, if tags are set, which change the search engine or search-language
238
         # parse query, if tags are set, which change the search engine or search-language
238
-        self.parse_query()
239
+        query_obj = Query(self.query, self.blocked_engines)
240
+        query_obj.parse_query()        
241
+
242
+        # get last selected language in query, if possible
243
+        # TODO support search with multiple languages
244
+        if len(query_obj.languages):
245
+            self.lang = query_obj.languages[-1]
246
+
247
+        self.engines = query_obj.engines
239
 
248
 
240
         self.categories = []
249
         self.categories = []
241
 
250
 
276
                                     for x in categories[categ]
285
                                     for x in categories[categ]
277
                                     if not x.name in self.blocked_engines)
286
                                     if not x.name in self.blocked_engines)
278
 
287
 
279
-    # parse query, if tags are set, which change the serch engine or search-language
280
-    def parse_query(self):
281
-        query_parts = self.query.split()
282
-        modified = False
283
-
284
-        # check if language-prefix is set
285
-        if query_parts[0].startswith(':'):
286
-            lang = query_parts[0][1:].lower()
287
-
288
-            # check if any language-code is equal with declared language-codes
289
-            for lc in language_codes:
290
-                lang_id, lang_name, country = map(str.lower, lc)
291
-
292
-                # if correct language-code is found, set it as new search-language
293
-                if lang == lang_id\
294
-                   or lang_id.startswith(lang)\
295
-                   or lang == lang_name\
296
-                   or lang == country:
297
-                    self.lang = lang
298
-                    modified = True
299
-                    break
300
-
301
-        # check if category/engine prefix is set
302
-        elif query_parts[0].startswith('!'):
303
-            prefix = query_parts[0][1:].replace('_', ' ')
304
-
305
-            # check if prefix is equal with engine shortcut
306
-            if prefix in engine_shortcuts\
307
-               and not engine_shortcuts[prefix] in self.blocked_engines:
308
-                modified = True
309
-                self.engines.append({'category': 'none',
310
-                                     'name': engine_shortcuts[prefix]})
311
-
312
-            # check if prefix is equal with engine name
313
-            elif prefix in engines\
314
-                    and not prefix in self.blocked_engines:
315
-                modified = True
316
-                self.engines.append({'category': 'none',
317
-                                    'name': prefix})
318
-
319
-            # check if prefix is equal with categorie name
320
-            elif prefix in categories:
321
-                modified = True
322
-                # using all engines for that search, which are declared under that categorie name
323
-                self.engines.extend({'category': prefix,
324
-                                    'name': engine.name}
325
-                                    for engine in categories[prefix]
326
-                                    if not engine in self.blocked_engines)
327
-
328
-        # if language, category or engine were specificed in this query, search for more tags which does the same
329
-        if modified:
330
-            self.query = self.query.replace(query_parts[0], '', 1).strip()
331
-            self.parse_query()
332
-
333
     # do search-request
288
     # do search-request
334
     def search(self, request):
289
     def search(self, request):
335
         global number_of_searches
290
         global number_of_searches

+ 27
- 3
searx/webapp.py Voir le fichier

47
 from searx.https_rewrite import https_rules
47
 from searx.https_rewrite import https_rules
48
 from searx.languages import language_codes
48
 from searx.languages import language_codes
49
 from searx.search import Search
49
 from searx.search import Search
50
+from searx.query import Query
50
 from searx.autocomplete import backends as autocomplete_backends
51
 from searx.autocomplete import backends as autocomplete_backends
51
 
52
 
52
 
53
 
308
     """Return autocompleter results"""
309
     """Return autocompleter results"""
309
     request_data = {}
310
     request_data = {}
310
 
311
 
312
+    # select request method
311
     if request.method == 'POST':
313
     if request.method == 'POST':
312
         request_data = request.form
314
         request_data = request.form
313
     else:
315
     else:
314
         request_data = request.args
316
         request_data = request.args
315
 
317
 
316
-    query = request_data.get('q', '').encode('utf-8')
318
+    # set blocked engines
319
+    if request.cookies.get('blocked_engines'):
320
+        blocked_engines = request.cookies['blocked_engines'].split(',')  # noqa
321
+    else:
322
+        blocked_engines = []
323
+
324
+    # parse query
325
+    query = Query(request_data.get('q', '').encode('utf-8'), blocked_engines)
326
+    query.parse_query()
317
 
327
 
318
-    if not query:
328
+    # check if search query is set
329
+    if not query.getSearchQuery():
319
         return
330
         return
320
 
331
 
332
+    # run autocompleter
321
     completer = autocomplete_backends.get(request.cookies.get('autocomplete'))
333
     completer = autocomplete_backends.get(request.cookies.get('autocomplete'))
322
 
334
 
335
+    # check if valid autocompleter is selected
323
     if not completer:
336
     if not completer:
324
         return
337
         return
325
 
338
 
326
-    results = completer(query)
339
+    # run autocompletion
340
+    raw_results = completer(query.getSearchQuery())
341
+
342
+    # parse results (write :language and !engine back to result string)
343
+    results = []
344
+    for result in raw_results:
345
+        result_query = query
346
+        result_query.changeSearchQuery(result)
347
+
348
+        # add parsed result
349
+        results.append(result_query.getFullQuery())
327
 
350
 
351
+    # return autocompleter results
328
     if request_data.get('format') == 'x-suggestions':
352
     if request_data.get('format') == 'x-suggestions':
329
         return Response(json.dumps([query, results]),
353
         return Response(json.dumps([query, results]),
330
                         mimetype='application/json')
354
                         mimetype='application/json')