|
@@ -1,3 +1,20 @@
|
|
1
|
+'''
|
|
2
|
+searx is free software: you can redistribute it and/or modify
|
|
3
|
+it under the terms of the GNU Affero General Public License as published by
|
|
4
|
+the Free Software Foundation, either version 3 of the License, or
|
|
5
|
+(at your option) any later version.
|
|
6
|
+
|
|
7
|
+searx is distributed in the hope that it will be useful,
|
|
8
|
+but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
9
|
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
10
|
+GNU Affero General Public License for more details.
|
|
11
|
+
|
|
12
|
+You should have received a copy of the GNU Affero General Public License
|
|
13
|
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
|
|
14
|
+
|
|
15
|
+(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
|
|
16
|
+'''
|
|
17
|
+
|
1
|
18
|
import grequests
|
2
|
19
|
from itertools import izip_longest, chain
|
3
|
20
|
from datetime import datetime
|
|
@@ -9,45 +26,65 @@ from searx.engines import (
|
9
|
26
|
from searx.languages import language_codes
|
10
|
27
|
from searx.utils import gen_useragent
|
11
|
28
|
|
|
29
|
+
|
12
|
30
|
number_of_searches = 0
|
13
|
31
|
|
14
|
32
|
|
|
33
|
+# get default request parameter
|
15
|
34
|
def default_request_params():
    """Return a fresh set of baseline parameters for an engine request.

    A new dict (with new nested containers) is built on every call so
    engines can mutate headers/data/cookies without affecting each other.
    """
    params = {}
    params['method'] = 'GET'
    params['headers'] = {}
    params['data'] = {}
    params['url'] = ''
    params['cookies'] = {}
    return params
|
18
|
37
|
|
19
|
38
|
|
|
39
|
+# create a callback wrapper for the search engine results
|
20
|
40
|
def make_callback(engine_name, results, suggestions, callback, params):
    """Build a grequests response hook for one engine.

    The returned ``process_callback`` parses the HTTP response with the
    engine-specific *callback*, stores parsed results into the shared
    *results* dict (keyed by *engine_name*) and collected suggestions into
    the shared *suggestions* set.  Both containers are mutated in place so
    the caller can gather results from many concurrent requests.
    """
    # creating a callback wrapper for the search engine results
    def process_callback(response, **kwargs):
        cb_res = []
        # attach the request parameters so engine parsers can inspect them
        response.search_params = params

        # update stats with current page-load-time
        # ('started' was stamped when the request was created)
        engines[engine_name].stats['page_load_time'] += \
            (datetime.now() - params['started']).total_seconds()

        try:
            # engine-specific parsing of the raw response
            search_results = callback(response)
        except Exception, e:
            # increase errors stats
            engines[engine_name].stats['errors'] += 1
            # record an empty result list so the engine still appears done
            results[engine_name] = cb_res

            # print engine name and specific error message
            print '[E] Error with engine "{0}":\n\t{1}'.format(
                engine_name, str(e))
            return

        for result in search_results:
            # tag each result with the engine that produced it
            result['engine'] = engine_name

            # if it is a suggestion, add it to list of suggestions
            # instead of the regular result list
            if 'suggestion' in result:
                # TODO type checks
                suggestions.add(result['suggestion'])
                continue

            # append result
            cb_res.append(result)

        # publish this engine's results into the shared dict
        results[engine_name] = cb_res

    return process_callback
|
44
|
78
|
|
45
|
79
|
|
|
80
|
+# score results and remove duplications
|
46
|
81
|
def score_results(results):
|
|
82
|
+ # calculate scoring parameters
|
47
|
83
|
flat_res = filter(
|
48
|
84
|
None, chain.from_iterable(izip_longest(*results.values())))
|
49
|
85
|
flat_len = len(flat_res)
|
50
|
86
|
engines_len = len(results)
|
|
87
|
+
|
51
|
88
|
results = []
|
52
|
89
|
|
53
|
90
|
# pass 1: deduplication + scoring
|
|
@@ -63,34 +100,53 @@ def score_results(results):
|
63
|
100
|
res['engines'] = [res['engine']]
|
64
|
101
|
weight = 1.0
|
65
|
102
|
|
|
103
|
+ # get weight of this engine if possible
|
66
|
104
|
if hasattr(engines[res['engine']], 'weight'):
|
67
|
105
|
weight = float(engines[res['engine']].weight)
|
68
|
106
|
|
|
107
|
+ # calculate score for that engine
|
69
|
108
|
score = int((flat_len - i) / engines_len) * weight + 1
|
|
109
|
+
|
70
|
110
|
duplicated = False
|
71
|
111
|
|
|
112
|
+ # check for duplicates
|
72
|
113
|
for new_res in results:
|
|
114
|
+ # remove / from the end of the url if required
|
73
|
115
|
p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path # noqa
|
74
|
116
|
p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path # noqa
|
|
117
|
+
|
|
118
|
+ # check if that result is a duplicate
|
75
|
119
|
if res['host'] == new_res['host'] and\
|
76
|
120
|
unquote(p1) == unquote(p2) and\
|
77
|
121
|
res['parsed_url'].query == new_res['parsed_url'].query and\
|
78
|
122
|
res.get('template') == new_res.get('template'):
|
79
|
123
|
duplicated = new_res
|
80
|
124
|
break
|
|
125
|
+
|
|
126
|
+ # merge duplicates together
|
81
|
127
|
if duplicated:
|
|
128
|
+ # using content with more text
|
82
|
129
|
if res.get('content') > duplicated.get('content'):
|
83
|
130
|
duplicated['content'] = res['content']
|
|
131
|
+
|
|
132
|
+ # increase result-score
|
84
|
133
|
duplicated['score'] += score
|
|
134
|
+
|
|
135
|
+ # add engine to list of result-engines
|
85
|
136
|
duplicated['engines'].append(res['engine'])
|
|
137
|
+
|
|
138
|
+ # using https if possible
|
86
|
139
|
if duplicated['parsed_url'].scheme == 'https':
|
87
|
140
|
continue
|
88
|
141
|
elif res['parsed_url'].scheme == 'https':
|
89
|
142
|
duplicated['url'] = res['parsed_url'].geturl()
|
90
|
143
|
duplicated['parsed_url'] = res['parsed_url']
|
|
144
|
+
|
|
145
|
+ # if there is no duplicate found, append result
|
91
|
146
|
else:
|
92
|
147
|
res['score'] = score
|
93
|
148
|
results.append(res)
|
|
149
|
+
|
94
|
150
|
results = sorted(results, key=itemgetter('score'), reverse=True)
|
95
|
151
|
|
96
|
152
|
# pass 2 : group results by category and template
|
|
@@ -99,7 +155,7 @@ def score_results(results):
|
99
|
155
|
|
100
|
156
|
for i, res in enumerate(results):
|
101
|
157
|
# FIXME : handle more than one category per engine
|
102
|
|
- category = engines[res['engine']].categories[0] + ':' + '' if 'template' not in res else res['template']
|
|
158
|
+ category = engines[res['engine']].categories[0] + ':' + '' if 'template' not in res else res['template']
|
103
|
159
|
|
104
|
160
|
current = None if category not in categoryPositions else categoryPositions[category]
|
105
|
161
|
|
|
@@ -134,6 +190,7 @@ class Search(object):
|
134
|
190
|
"""Search information container"""
|
135
|
191
|
|
136
|
192
|
def __init__(self, request):
|
|
193
|
+ # init vars
|
137
|
194
|
super(Search, self).__init__()
|
138
|
195
|
self.query = None
|
139
|
196
|
self.engines = []
|
|
@@ -141,18 +198,23 @@ class Search(object):
|
141
|
198
|
self.paging = False
|
142
|
199
|
self.pageno = 1
|
143
|
200
|
self.lang = 'all'
|
|
201
|
+
|
|
202
|
+ # set blocked engines
|
144
|
203
|
if request.cookies.get('blocked_engines'):
|
145
|
204
|
self.blocked_engines = request.cookies['blocked_engines'].split(',') # noqa
|
146
|
205
|
else:
|
147
|
206
|
self.blocked_engines = []
|
|
207
|
+
|
148
|
208
|
self.results = []
|
149
|
209
|
self.suggestions = []
|
150
|
210
|
self.request_data = {}
|
151
|
211
|
|
|
212
|
+ # set specific language if set
|
152
|
213
|
if request.cookies.get('language')\
|
153
|
214
|
and request.cookies['language'] in (x[0] for x in language_codes):
|
154
|
215
|
self.lang = request.cookies['language']
|
155
|
216
|
|
|
217
|
+ # set request method
|
156
|
218
|
if request.method == 'POST':
|
157
|
219
|
self.request_data = request.form
|
158
|
220
|
else:
|
|
@@ -162,51 +224,72 @@ class Search(object):
|
162
|
224
|
if not self.request_data.get('q'):
|
163
|
225
|
raise Exception('noquery')
|
164
|
226
|
|
|
227
|
+ # set query
|
165
|
228
|
self.query = self.request_data['q']
|
166
|
229
|
|
|
230
|
+ # set pagenumber
|
167
|
231
|
pageno_param = self.request_data.get('pageno', '1')
|
168
|
232
|
if not pageno_param.isdigit() or int(pageno_param) < 1:
|
169
|
233
|
raise Exception('wrong pagenumber')
|
170
|
234
|
|
171
|
235
|
self.pageno = int(pageno_param)
|
172
|
236
|
|
|
237
|
+ # parse query, if tags are set, which change the search engine or search-language
|
173
|
238
|
self.parse_query()
|
174
|
239
|
|
175
|
240
|
self.categories = []
|
176
|
241
|
|
|
242
|
+ # if engines are calculated from query, set categories by using that informations
|
177
|
243
|
if self.engines:
|
178
|
244
|
self.categories = list(set(engine['category']
|
179
|
245
|
for engine in self.engines))
|
|
246
|
+
|
|
247
|
+ # otherwise, using defined categories to calculate which engines should be used
|
180
|
248
|
else:
|
|
249
|
+ # set used categories
|
181
|
250
|
for pd_name, pd in self.request_data.items():
|
182
|
251
|
if pd_name.startswith('category_'):
|
183
|
252
|
category = pd_name[9:]
|
|
253
|
+ # if category is not found in list, skip
|
184
|
254
|
if not category in categories:
|
185
|
255
|
continue
|
|
256
|
+
|
|
257
|
+ # add category to list
|
186
|
258
|
self.categories.append(category)
|
|
259
|
+
|
|
260
|
+ # if no category is specified for this search, use the user-defined default configuration (which is stored in a cookie)
|
187
|
261
|
if not self.categories:
|
188
|
262
|
cookie_categories = request.cookies.get('categories', '')
|
189
|
263
|
cookie_categories = cookie_categories.split(',')
|
190
|
264
|
for ccateg in cookie_categories:
|
191
|
265
|
if ccateg in categories:
|
192
|
266
|
self.categories.append(ccateg)
|
|
267
|
+
|
|
268
|
+ # if still no category is specified, using general as default-category
|
193
|
269
|
if not self.categories:
|
194
|
270
|
self.categories = ['general']
|
195
|
271
|
|
|
272
|
+ # using all engines for that search, which are declared under the specific categories
|
196
|
273
|
for categ in self.categories:
|
197
|
274
|
self.engines.extend({'category': categ,
|
198
|
275
|
'name': x.name}
|
199
|
276
|
for x in categories[categ]
|
200
|
277
|
if not x.name in self.blocked_engines)
|
201
|
278
|
|
|
279
|
+ # parse query, if tags are set, which change the search engine or search-language
|
202
|
280
|
def parse_query(self):
|
203
|
281
|
query_parts = self.query.split()
|
204
|
282
|
modified = False
|
|
283
|
+
|
|
284
|
+ # check if language-prefix is set
|
205
|
285
|
if query_parts[0].startswith(':'):
|
206
|
286
|
lang = query_parts[0][1:].lower()
|
207
|
287
|
|
|
288
|
+ # check if any language-code is equal with declared language-codes
|
208
|
289
|
for lc in language_codes:
|
209
|
290
|
lang_id, lang_name, country = map(str.lower, lc)
|
|
291
|
+
|
|
292
|
+ # if correct language-code is found, set it as new search-language
|
210
|
293
|
if lang == lang_id\
|
211
|
294
|
or lang_id.startswith(lang)\
|
212
|
295
|
or lang == lang_name\
|
|
@@ -215,56 +298,78 @@ class Search(object):
|
215
|
298
|
modified = True
|
216
|
299
|
break
|
217
|
300
|
|
|
301
|
+ # check if category/engine prefix is set
|
218
|
302
|
elif query_parts[0].startswith('!'):
|
219
|
303
|
prefix = query_parts[0][1:].replace('_', ' ')
|
220
|
304
|
|
|
305
|
+ # check if prefix is equal with engine shortcut
|
221
|
306
|
if prefix in engine_shortcuts\
|
222
|
307
|
and not engine_shortcuts[prefix] in self.blocked_engines:
|
223
|
308
|
modified = True
|
224
|
309
|
self.engines.append({'category': 'none',
|
225
|
310
|
'name': engine_shortcuts[prefix]})
|
|
311
|
+
|
|
312
|
+ # check if prefix is equal with engine name
|
226
|
313
|
elif prefix in engines\
|
227
|
314
|
and not prefix in self.blocked_engines:
|
228
|
315
|
modified = True
|
229
|
316
|
self.engines.append({'category': 'none',
|
230
|
317
|
'name': prefix})
|
|
318
|
+
|
|
319
|
+ # check if prefix is equal with category name
|
231
|
320
|
elif prefix in categories:
|
232
|
321
|
modified = True
|
|
322
|
+ # using all engines for that search, which are declared under that category name
|
233
|
323
|
self.engines.extend({'category': prefix,
|
234
|
324
|
'name': engine.name}
|
235
|
325
|
for engine in categories[prefix]
|
236
|
326
|
if not engine in self.blocked_engines)
|
|
327
|
+
|
|
328
|
+ # if a language, category or engine was specified in this query, search for more tags which do the same
|
237
|
329
|
if modified:
|
238
|
330
|
self.query = self.query.replace(query_parts[0], '', 1).strip()
|
239
|
331
|
self.parse_query()
|
240
|
332
|
|
|
333
|
+ # do search-request
|
241
|
334
|
def search(self, request):
|
242
|
335
|
global number_of_searches
|
|
336
|
+
|
|
337
|
+ # init vars
|
243
|
338
|
requests = []
|
244
|
339
|
results = {}
|
245
|
340
|
suggestions = set()
|
|
341
|
+
|
|
342
|
+ # increase number of searches
|
246
|
343
|
number_of_searches += 1
|
|
344
|
+
|
|
345
|
+ # set default useragent
|
247
|
346
|
#user_agent = request.headers.get('User-Agent', '')
|
248
|
347
|
user_agent = gen_useragent()
|
249
|
348
|
|
|
349
|
+ # start search-reqest for all selected engines
|
250
|
350
|
for selected_engine in self.engines:
|
251
|
351
|
if selected_engine['name'] not in engines:
|
252
|
352
|
continue
|
253
|
353
|
|
254
|
354
|
engine = engines[selected_engine['name']]
|
255
|
355
|
|
|
356
|
+ # if paging is not supported, skip
|
256
|
357
|
if self.pageno > 1 and not engine.paging:
|
257
|
358
|
continue
|
258
|
359
|
|
|
360
|
+ # if search-language is set and engine does not provide language-support, skip
|
259
|
361
|
if self.lang != 'all' and not engine.language_support:
|
260
|
362
|
continue
|
261
|
363
|
|
|
364
|
+ # set default request parameters
|
262
|
365
|
request_params = default_request_params()
|
263
|
366
|
request_params['headers']['User-Agent'] = user_agent
|
264
|
367
|
request_params['category'] = selected_engine['category']
|
265
|
368
|
request_params['started'] = datetime.now()
|
266
|
369
|
request_params['pageno'] = self.pageno
|
267
|
370
|
request_params['language'] = self.lang
|
|
371
|
+
|
|
372
|
+ # update request parameters dependent on search-engine (contained in engines folder)
|
268
|
373
|
request_params = engine.request(self.query.encode('utf-8'),
|
269
|
374
|
request_params)
|
270
|
375
|
|
|
@@ -272,6 +377,7 @@ class Search(object):
|
272
|
377
|
# TODO add support of offline engines
|
273
|
378
|
pass
|
274
|
379
|
|
|
380
|
+ # create a callback wrapper for the search engine results
|
275
|
381
|
callback = make_callback(
|
276
|
382
|
selected_engine['name'],
|
277
|
383
|
results,
|
|
@@ -280,6 +386,7 @@ class Search(object):
|
280
|
386
|
request_params
|
281
|
387
|
)
|
282
|
388
|
|
|
389
|
+ # create dictionary which contain all informations about the request
|
283
|
390
|
request_args = dict(
|
284
|
391
|
headers=request_params['headers'],
|
285
|
392
|
hooks=dict(response=callback),
|
|
@@ -287,6 +394,7 @@ class Search(object):
|
287
|
394
|
timeout=engine.timeout
|
288
|
395
|
)
|
289
|
396
|
|
|
397
|
+ # specific type of request (GET or POST)
|
290
|
398
|
if request_params['method'] == 'GET':
|
291
|
399
|
req = grequests.get
|
292
|
400
|
else:
|
|
@@ -297,17 +405,25 @@ class Search(object):
|
297
|
405
|
if not request_params['url']:
|
298
|
406
|
continue
|
299
|
407
|
|
|
408
|
+ # append request to list
|
300
|
409
|
requests.append(req(request_params['url'], **request_args))
|
|
410
|
+
|
|
411
|
+ # send all search-request
|
301
|
412
|
grequests.map(requests)
|
|
413
|
+
|
|
414
|
+ # update engine-specific stats
|
302
|
415
|
for engine_name, engine_results in results.items():
|
303
|
416
|
engines[engine_name].stats['search_count'] += 1
|
304
|
417
|
engines[engine_name].stats['result_count'] += len(engine_results)
|
305
|
418
|
|
|
419
|
+ # score results and remove duplications
|
306
|
420
|
results = score_results(results)
|
307
|
421
|
|
|
422
|
+ # update engine stats, using calculated score
|
308
|
423
|
for result in results:
|
309
|
424
|
for res_engine in result['engines']:
|
310
|
425
|
engines[result['engine']]\
|
311
|
426
|
.stats['score_count'] += result['score']
|
312
|
427
|
|
|
428
|
+ # return results and suggestions
|
313
|
429
|
return results, suggestions
|