|
@@ -1,3 +1,20 @@
|
|
1
|
+'''
|
|
2
|
+searx is free software: you can redistribute it and/or modify
|
|
3
|
+it under the terms of the GNU Affero General Public License as published by
|
|
4
|
+the Free Software Foundation, either version 3 of the License, or
|
|
5
|
+(at your option) any later version.
|
|
6
|
+
|
|
7
|
+searx is distributed in the hope that it will be useful,
|
|
8
|
+but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
9
|
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
10
|
+GNU Affero General Public License for more details.
|
|
11
|
+
|
|
12
|
+You should have received a copy of the GNU Affero General Public License
|
|
13
|
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
|
|
14
|
+
|
|
15
|
+(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
|
|
16
|
+'''
|
|
17
|
+
|
1
|
18
|
import grequests
|
2
|
19
|
from itertools import izip_longest, chain
|
3
|
20
|
from datetime import datetime
|
|
@@ -9,46 +26,67 @@ from searx.engines import (
|
9
|
26
|
from searx.languages import language_codes
|
10
|
27
|
from searx.utils import gen_useragent
|
11
|
28
|
|
|
29
|
+
|
12
|
30
|
number_of_searches = 0
|
13
|
31
|
|
14
|
32
|
|
|
33
|
+# get default request parameters
|
15
|
34
|
def default_request_params():
    """Return a fresh dictionary of baseline request parameters.

    The result holds the keys 'method' ('GET'), 'url' (empty string) and
    'headers', 'data', 'cookies' (each a new, empty dict).  A new
    dictionary is built on every call, so callers may mutate it freely.
    """
    params = dict(method='GET', url='')

    # every container-valued entry gets its own empty dict
    for key in ('headers', 'data', 'cookies'):
        params[key] = {}

    return params
|
18
|
37
|
|
19
|
38
|
|
|
39
|
+# create a callback wrapper for the search engine results
|
20
|
40
|
def make_callback(engine_name, results, suggestions, callback, params):
    """Create the response hook for a single engine request.

    engine_name -- key of the engine in the module-level ``engines`` mapping
    results     -- mapping that receives ``results[engine_name]`` (a list)
    suggestions -- collection with an ``add`` method; receives every
                   'suggestion' value found in the parsed results
    callback    -- callable that turns the response into an iterable of
                   result dictionaries
    params      -- request parameters; ``params['started']`` must be the
                   datetime at which the request was started

    Returns a function accepting ``(response, **kwargs)``.
    """

    def process_callback(response, **kwargs):
        cb_res = []
        response.search_params = params

        # update stats with current page-load-time
        engines[engine_name].stats['page_load_time'] += \
            (datetime.now() - params['started']).total_seconds()

        try:
            search_results = callback(response)
        except Exception as e:
            # increase errors stats and record an empty result list
            engines[engine_name].stats['errors'] += 1
            results[engine_name] = cb_res

            # print engine name and specific error message; the single
            # parenthesised argument behaves the same on Python 2 and 3
            print('[E] Error with engine "{0}":\n\t{1}'.format(
                engine_name, str(e)))
            return

        for result in search_results:
            result['engine'] = engine_name

            # if it is a suggestion, add it to list of suggestions
            if 'suggestion' in result:
                # TODO type checks
                suggestions.add(result['suggestion'])
                continue

            # append result
            cb_res.append(result)

        results[engine_name] = cb_res

    return process_callback
|
44
|
78
|
|
45
|
79
|
|
|
80
|
+# score results and remove duplications
|
46
|
81
|
def score_results(results):
|
|
82
|
+ # calculate scoring parameters
|
47
|
83
|
flat_res = filter(
|
48
|
84
|
None, chain.from_iterable(izip_longest(*results.values())))
|
49
|
85
|
flat_len = len(flat_res)
|
50
|
86
|
engines_len = len(results)
|
|
87
|
+
|
51
|
88
|
results = []
|
|
89
|
+
|
52
|
90
|
# deduplication + scoring
|
53
|
91
|
for i, res in enumerate(flat_res):
|
54
|
92
|
|
|
@@ -62,34 +100,54 @@ def score_results(results):
|
62
|
100
|
res['engines'] = [res['engine']]
|
63
|
101
|
weight = 1.0
|
64
|
102
|
|
|
103
|
+ # get weight of this engine if possible
|
65
|
104
|
if hasattr(engines[res['engine']], 'weight'):
|
66
|
105
|
weight = float(engines[res['engine']].weight)
|
67
|
106
|
|
|
107
|
+ # calculate score for that engine
|
68
|
108
|
score = int((flat_len - i) / engines_len) * weight + 1
|
|
109
|
+
|
69
|
110
|
duplicated = False
|
70
|
111
|
|
|
112
|
+ # check for duplicates
|
71
|
113
|
for new_res in results:
|
|
114
|
+ # remove / from the end of the url if required
|
72
|
115
|
p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path # noqa
|
73
|
116
|
p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path # noqa
|
|
117
|
+
|
|
118
|
+ # check if that result is a duplicate
|
74
|
119
|
if res['host'] == new_res['host'] and\
|
75
|
120
|
unquote(p1) == unquote(p2) and\
|
76
|
121
|
res['parsed_url'].query == new_res['parsed_url'].query and\
|
77
|
122
|
res.get('template') == new_res.get('template'):
|
78
|
123
|
duplicated = new_res
|
79
|
124
|
break
|
|
125
|
+
|
|
126
|
+ # merge duplicates together
|
80
|
127
|
if duplicated:
|
|
128
|
+ # using content with more text
|
81
|
129
|
if res.get('content') > duplicated.get('content'):
|
82
|
130
|
duplicated['content'] = res['content']
|
|
131
|
+
|
|
132
|
+ # increase result-score
|
83
|
133
|
duplicated['score'] += score
|
|
134
|
+
|
|
135
|
+ # add engine to list of result-engines
|
84
|
136
|
duplicated['engines'].append(res['engine'])
|
|
137
|
+
|
|
138
|
+ # using https if possible
|
85
|
139
|
if duplicated['parsed_url'].scheme == 'https':
|
86
|
140
|
continue
|
87
|
141
|
elif res['parsed_url'].scheme == 'https':
|
88
|
142
|
duplicated['url'] = res['parsed_url'].geturl()
|
89
|
143
|
duplicated['parsed_url'] = res['parsed_url']
|
|
144
|
+
|
|
145
|
+ # if there is no duplicate found, append result
|
90
|
146
|
else:
|
91
|
147
|
res['score'] = score
|
92
|
148
|
results.append(res)
|
|
149
|
+
|
|
150
|
+ # return results sorted by score
|
93
|
151
|
return sorted(results, key=itemgetter('score'), reverse=True)
|
94
|
152
|
|
95
|
153
|
|
|
@@ -98,6 +156,7 @@ class Search(object):
|
98
|
156
|
"""Search information container"""
|
99
|
157
|
|
100
|
158
|
def __init__(self, request):
|
|
159
|
+ # init vars
|
101
|
160
|
super(Search, self).__init__()
|
102
|
161
|
self.query = None
|
103
|
162
|
self.engines = []
|
|
@@ -105,18 +164,23 @@ class Search(object):
|
105
|
164
|
self.paging = False
|
106
|
165
|
self.pageno = 1
|
107
|
166
|
self.lang = 'all'
|
|
167
|
+
|
|
168
|
+ # set blocked engines
|
108
|
169
|
if request.cookies.get('blocked_engines'):
|
109
|
170
|
self.blocked_engines = request.cookies['blocked_engines'].split(',') # noqa
|
110
|
171
|
else:
|
111
|
172
|
self.blocked_engines = []
|
|
173
|
+
|
112
|
174
|
self.results = []
|
113
|
175
|
self.suggestions = []
|
114
|
176
|
self.request_data = {}
|
115
|
177
|
|
|
178
|
+ # set specific language if set
|
116
|
179
|
if request.cookies.get('language')\
|
117
|
180
|
and request.cookies['language'] in (x[0] for x in language_codes):
|
118
|
181
|
self.lang = request.cookies['language']
|
119
|
182
|
|
|
183
|
+ # set request method
|
120
|
184
|
if request.method == 'POST':
|
121
|
185
|
self.request_data = request.form
|
122
|
186
|
else:
|
|
@@ -126,51 +190,72 @@ class Search(object):
|
126
|
190
|
if not self.request_data.get('q'):
|
127
|
191
|
raise Exception('noquery')
|
128
|
192
|
|
|
193
|
+ # set query
|
129
|
194
|
self.query = self.request_data['q']
|
130
|
195
|
|
|
196
|
+ # set pagenumber
|
131
|
197
|
pageno_param = self.request_data.get('pageno', '1')
|
132
|
198
|
if not pageno_param.isdigit() or int(pageno_param) < 1:
|
133
|
199
|
raise Exception('wrong pagenumber')
|
134
|
200
|
|
135
|
201
|
self.pageno = int(pageno_param)
|
136
|
202
|
|
|
203
|
+ # parse query, if tags are set, which change the search engine or search-language
|
137
|
204
|
self.parse_query()
|
138
|
205
|
|
139
|
206
|
self.categories = []
|
140
|
207
|
|
|
208
|
+ # if engines are calculated from query, set categories by using that information
|
141
|
209
|
if self.engines:
|
142
|
210
|
self.categories = list(set(engine['category']
|
143
|
211
|
for engine in self.engines))
|
|
212
|
+
|
|
213
|
+ # otherwise, using defined categories to calculate which engines should be used
|
144
|
214
|
else:
|
|
215
|
+ # set used categories
|
145
|
216
|
for pd_name, pd in self.request_data.items():
|
146
|
217
|
if pd_name.startswith('category_'):
|
147
|
218
|
category = pd_name[9:]
|
|
219
|
+ # if category is not found in list, skip
|
148
|
220
|
if not category in categories:
|
149
|
221
|
continue
|
|
222
|
+
|
|
223
|
+ # add category to list
|
150
|
224
|
self.categories.append(category)
|
|
225
|
+
|
|
226
|
+ # if no category is specified for this search, using user-defined default-configuration (which is stored in cookie)
|
151
|
227
|
if not self.categories:
|
152
|
228
|
cookie_categories = request.cookies.get('categories', '')
|
153
|
229
|
cookie_categories = cookie_categories.split(',')
|
154
|
230
|
for ccateg in cookie_categories:
|
155
|
231
|
if ccateg in categories:
|
156
|
232
|
self.categories.append(ccateg)
|
|
233
|
+
|
|
234
|
+ # if still no category is specified, using general as default-category
|
157
|
235
|
if not self.categories:
|
158
|
236
|
self.categories = ['general']
|
159
|
237
|
|
|
238
|
+ # using all engines for that search, which are declared under the specific categories
|
160
|
239
|
for categ in self.categories:
|
161
|
240
|
self.engines.extend({'category': categ,
|
162
|
241
|
'name': x.name}
|
163
|
242
|
for x in categories[categ]
|
164
|
243
|
if not x.name in self.blocked_engines)
|
165
|
244
|
|
|
245
|
+ # parse query, if tags are set, which change the search engine or search-language
|
166
|
246
|
def parse_query(self):
|
167
|
247
|
query_parts = self.query.split()
|
168
|
248
|
modified = False
|
|
249
|
+
|
|
250
|
+ # check if language-prefix is set
|
169
|
251
|
if query_parts[0].startswith(':'):
|
170
|
252
|
lang = query_parts[0][1:].lower()
|
171
|
253
|
|
|
254
|
+ # check if any language-code equal with declared language-codes
|
172
|
255
|
for lc in language_codes:
|
173
|
256
|
lang_id, lang_name, country = map(str.lower, lc)
|
|
257
|
+
|
|
258
|
+ # if correct language-code is found, set it as new search-language
|
174
|
259
|
if lang == lang_id\
|
175
|
260
|
or lang_id.startswith(lang)\
|
176
|
261
|
or lang == lang_name\
|
|
@@ -179,56 +264,78 @@ class Search(object):
|
179
|
264
|
modified = True
|
180
|
265
|
break
|
181
|
266
|
|
|
267
|
+ # check if category/engine prefix is set
|
182
|
268
|
elif query_parts[0].startswith('!'):
|
183
|
269
|
prefix = query_parts[0][1:].replace('_', ' ')
|
184
|
270
|
|
|
271
|
+ # check if prefix equal with engine shortcut
|
185
|
272
|
if prefix in engine_shortcuts\
|
186
|
273
|
and not engine_shortcuts[prefix] in self.blocked_engines:
|
187
|
274
|
modified = True
|
188
|
275
|
self.engines.append({'category': 'none',
|
189
|
276
|
'name': engine_shortcuts[prefix]})
|
|
277
|
+
|
|
278
|
+ # check if prefix equal with engine name
|
190
|
279
|
elif prefix in engines\
|
191
|
280
|
and not prefix in self.blocked_engines:
|
192
|
281
|
modified = True
|
193
|
282
|
self.engines.append({'category': 'none',
|
194
|
283
|
'name': prefix})
|
|
284
|
+
|
|
285
|
+ # check if prefix equal with category name
|
195
|
286
|
elif prefix in categories:
|
196
|
287
|
modified = True
|
|
288
|
+ # using all engines for that search, which are declared under that category name
|
197
|
289
|
self.engines.extend({'category': prefix,
|
198
|
290
|
'name': engine.name}
|
199
|
291
|
for engine in categories[prefix]
|
200
|
292
|
if not engine in self.blocked_engines)
|
|
293
|
+
|
|
294
|
+ # if language, category or engine were specified in this query, search for more tags which do the same
|
201
|
295
|
if modified:
|
202
|
296
|
self.query = self.query.replace(query_parts[0], '', 1).strip()
|
203
|
297
|
self.parse_query()
|
204
|
298
|
|
|
299
|
+ # do search-request
|
205
|
300
|
def search(self, request):
|
206
|
301
|
global number_of_searches
|
|
302
|
+
|
|
303
|
+ # init vars
|
207
|
304
|
requests = []
|
208
|
305
|
results = {}
|
209
|
306
|
suggestions = set()
|
|
307
|
+
|
|
308
|
+ # increase number of active searches
|
210
|
309
|
number_of_searches += 1
|
|
310
|
+
|
|
311
|
+ # set default useragent
|
211
|
312
|
#user_agent = request.headers.get('User-Agent', '')
|
212
|
313
|
user_agent = gen_useragent()
|
213
|
314
|
|
|
315
|
+ # start search-request for all selected engines
|
214
|
316
|
for selected_engine in self.engines:
|
215
|
317
|
if selected_engine['name'] not in engines:
|
216
|
318
|
continue
|
217
|
319
|
|
218
|
320
|
engine = engines[selected_engine['name']]
|
219
|
321
|
|
|
322
|
+ # if paging is not supported, skip
|
220
|
323
|
if self.pageno > 1 and not engine.paging:
|
221
|
324
|
continue
|
222
|
325
|
|
|
326
|
+ # if search-language is set and engine does not provide language-support, skip
|
223
|
327
|
if self.lang != 'all' and not engine.language_support:
|
224
|
328
|
continue
|
225
|
329
|
|
|
330
|
+ # set default request parameters
|
226
|
331
|
request_params = default_request_params()
|
227
|
332
|
request_params['headers']['User-Agent'] = user_agent
|
228
|
333
|
request_params['category'] = selected_engine['category']
|
229
|
334
|
request_params['started'] = datetime.now()
|
230
|
335
|
request_params['pageno'] = self.pageno
|
231
|
336
|
request_params['language'] = self.lang
|
|
337
|
+
|
|
338
|
+ # update request parameters dependent on search-engine (contained in engines folder)
|
232
|
339
|
request_params = engine.request(self.query.encode('utf-8'),
|
233
|
340
|
request_params)
|
234
|
341
|
|
|
@@ -236,6 +343,7 @@ class Search(object):
|
236
|
343
|
# TODO add support of offline engines
|
237
|
344
|
pass
|
238
|
345
|
|
|
346
|
+ # create a callback wrapper for the search engine results
|
239
|
347
|
callback = make_callback(
|
240
|
348
|
selected_engine['name'],
|
241
|
349
|
results,
|
|
@@ -244,6 +352,7 @@ class Search(object):
|
244
|
352
|
request_params
|
245
|
353
|
)
|
246
|
354
|
|
|
355
|
+ # create dictionary which contains all information about the request
|
247
|
356
|
request_args = dict(
|
248
|
357
|
headers=request_params['headers'],
|
249
|
358
|
hooks=dict(response=callback),
|
|
@@ -251,6 +360,7 @@ class Search(object):
|
251
|
360
|
timeout=engine.timeout
|
252
|
361
|
)
|
253
|
362
|
|
|
363
|
+ # specific type of request (GET or POST)
|
254
|
364
|
if request_params['method'] == 'GET':
|
255
|
365
|
req = grequests.get
|
256
|
366
|
else:
|
|
@@ -261,17 +371,25 @@ class Search(object):
|
261
|
371
|
if not request_params['url']:
|
262
|
372
|
continue
|
263
|
373
|
|
|
374
|
+ # append request to list
|
264
|
375
|
requests.append(req(request_params['url'], **request_args))
|
|
376
|
+
|
|
377
|
+ # send all search-request
|
265
|
378
|
grequests.map(requests)
|
|
379
|
+
|
|
380
|
+ # update engine-specific stats
|
266
|
381
|
for engine_name, engine_results in results.items():
|
267
|
382
|
engines[engine_name].stats['search_count'] += 1
|
268
|
383
|
engines[engine_name].stats['result_count'] += len(engine_results)
|
269
|
384
|
|
|
385
|
+ # score results and remove duplications
|
270
|
386
|
results = score_results(results)
|
271
|
387
|
|
|
388
|
+ # update engine stats, using calculated score
|
272
|
389
|
for result in results:
|
273
|
390
|
for res_engine in result['engines']:
|
274
|
391
|
engines[result['engine']]\
|
275
|
392
|
.stats['score_count'] += result['score']
|
276
|
393
|
|
|
394
|
+ # return results and suggestions
|
277
|
395
|
return results, suggestions
|