|
@@ -22,7 +22,7 @@ from datetime import datetime
|
22
|
22
|
from operator import itemgetter
|
23
|
23
|
from urlparse import urlparse, unquote
|
24
|
24
|
from searx.engines import (
|
25
|
|
- categories, engines, engine_shortcuts
|
|
25
|
+ categories, engines
|
26
|
26
|
)
|
27
|
27
|
from searx.languages import language_codes
|
28
|
28
|
from searx.utils import gen_useragent
|
|
@@ -39,7 +39,13 @@ def default_request_params():
|
39
|
39
|
|
40
|
40
|
|
41
|
41
|
# create a callback wrapper for the search engine results
|
42
|
|
-def make_callback(engine_name, results, suggestions, answers, infoboxes, callback, params):
|
|
42
|
+def make_callback(engine_name,
|
|
43
|
+ results,
|
|
44
|
+ suggestions,
|
|
45
|
+ answers,
|
|
46
|
+ infoboxes,
|
|
47
|
+ callback,
|
|
48
|
+ params):
|
43
|
49
|
|
44
|
50
|
# creating a callback wrapper for the search engine results
|
45
|
51
|
def process_callback(response, **kwargs):
|
|
@@ -95,7 +101,7 @@ def make_callback(engine_name, results, suggestions, answers, infoboxes, callbac
|
95
|
101
|
# Characters ignored when measuring how much text a result carries:
# , ; : ! ? . / \ space ( ) _ and -
# The hyphen is deliberately last so it is a literal. Written as ")-_" it
# forms a range from ")" to "_" that also strips digits and upper-case
# letters, which made e.g. "ABC 123" count as empty.
_CONTENT_IGNORED_CHARS = re.compile(r'[,;:!?./\\ ()_-]')

try:
    _CONTENT_STRING_TYPES = basestring  # Python 2: covers str and unicode
except NameError:
    _CONTENT_STRING_TYPES = str  # Python 3 has no basestring


def content_result_len(content):
    """Return the length of ``content`` with punctuation and spaces removed.

    :param content: value to measure; anything that is not a string
        counts as having no text.
    :returns: number of characters left after dropping the characters
        ``, ; : ! ? . / \\ ( ) _ -`` and spaces, or ``0`` when ``content``
        is not a string.
    """
    if not isinstance(content, _CONTENT_STRING_TYPES):
        return 0
    return len(_CONTENT_IGNORED_CHARS.sub('', content))
|
101
|
107
|
|
|
@@ -126,7 +132,8 @@ def score_results(results):
|
126
|
132
|
|
127
|
133
|
# strip multiple spaces and carriage returns from content
|
128
|
134
|
if 'content' in res:
|
129
|
|
- res['content'] = re.sub(' +', ' ', res['content'].strip().replace('\n', ''))
|
|
135
|
+ res['content'] = re.sub(' +', ' ',
|
|
136
|
+ res['content'].strip().replace('\n', ''))
|
130
|
137
|
|
131
|
138
|
# get weight of this engine if possible
|
132
|
139
|
if hasattr(engines[res['engine']], 'weight'):
|
|
@@ -139,8 +146,12 @@ def score_results(results):
|
139
|
146
|
duplicated = False
|
140
|
147
|
for new_res in results:
|
141
|
148
|
# remove / from the end of the url if required
|
142
|
|
- p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path # noqa
|
143
|
|
- p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path # noqa
|
|
149
|
+ p1 = res['parsed_url'].path[:-1]\
|
|
150
|
+ if res['parsed_url'].path.endswith('/')\
|
|
151
|
+ else res['parsed_url'].path
|
|
152
|
+ p2 = new_res['parsed_url'].path[:-1]\
|
|
153
|
+ if new_res['parsed_url'].path.endswith('/')\
|
|
154
|
+ else new_res['parsed_url'].path
|
144
|
155
|
|
145
|
156
|
# check if that result is a duplicate
|
146
|
157
|
if res['host'] == new_res['host'] and\
|
|
@@ -153,7 +164,8 @@ def score_results(results):
|
153
|
164
|
# merge duplicates together
|
154
|
165
|
if duplicated:
|
155
|
166
|
# using content with more text
|
156
|
|
- if content_result_len(res.get('content', '')) > content_result_len(duplicated.get('content', '')):
|
|
167
|
+ if content_result_len(res.get('content', '')) >\
|
|
168
|
+ content_result_len(duplicated.get('content', '')):
|
157
|
169
|
duplicated['content'] = res['content']
|
158
|
170
|
|
159
|
171
|
# increase result-score
|
|
@@ -182,17 +194,25 @@ def score_results(results):
|
182
|
194
|
|
183
|
195
|
for i, res in enumerate(results):
|
184
|
196
|
# FIXME : handle more than one category per engine
|
185
|
|
- category = engines[res['engine']].categories[0] + ':' + '' if 'template' not in res else res['template']
|
186
|
|
-
|
187
|
|
- current = None if category not in categoryPositions else categoryPositions[category]
|
188
|
|
-
|
189
|
|
- # group with previous results using the same category if the group can accept more result and is not too far from the current position
|
190
|
|
- if current != None and (current['count'] > 0) and (len(gresults) - current['index'] < 20):
|
191
|
|
- # group with the previous results using the same category with this one
|
|
197
|
+ category = engines[res['engine']].categories[0] + ':' + ''\
|
|
198
|
+ if 'template' not in res\
|
|
199
|
+ else res['template']
|
|
200
|
+
|
|
201
|
+ current = None if category not in categoryPositions\
|
|
202
|
+ else categoryPositions[category]
|
|
203
|
+
|
|
204
|
+ # group with previous results using the same category
|
|
205
|
+ # if the group can accept more result and is not too far
|
|
206
|
+ # from the current position
|
|
207
|
+ if current is not None and (current['count'] > 0)\
|
|
208
|
+ and (len(gresults) - current['index'] < 20):
|
|
209
|
+ # group with the previous results using
|
|
210
|
+ # the same category with this one
|
192
|
211
|
index = current['index']
|
193
|
212
|
gresults.insert(index, res)
|
194
|
213
|
|
195
|
|
- # update every index after the current one (including the current one)
|
|
214
|
+ # update every index after the current one
|
|
215
|
+ # (including the current one)
|
196
|
216
|
for k in categoryPositions:
|
197
|
217
|
v = categoryPositions[k]['index']
|
198
|
218
|
if v >= index:
|
|
@@ -206,7 +226,7 @@ def score_results(results):
|
206
|
226
|
gresults.append(res)
|
207
|
227
|
|
208
|
228
|
# update categoryIndex
|
209
|
|
- categoryPositions[category] = { 'index' : len(gresults), 'count' : 8 }
|
|
229
|
+ categoryPositions[category] = {'index': len(gresults), 'count': 8}
|
210
|
230
|
|
211
|
231
|
# return gresults
|
212
|
232
|
return gresults
|
|
@@ -215,21 +235,21 @@ def score_results(results):
|
215
|
235
|
def merge_two_infoboxes(infobox1, infobox2):
|
216
|
236
|
if 'urls' in infobox2:
|
217
|
237
|
urls1 = infobox1.get('urls', None)
|
218
|
|
- if urls1 == None:
|
|
238
|
+ if urls1 is None:
|
219
|
239
|
urls1 = []
|
220
|
240
|
infobox1.set('urls', urls1)
|
221
|
241
|
|
222
|
242
|
urlSet = set()
|
223
|
243
|
for url in infobox1.get('urls', []):
|
224
|
244
|
urlSet.add(url.get('url', None))
|
225
|
|
-
|
|
245
|
+
|
226
|
246
|
for url in infobox2.get('urls', []):
|
227
|
247
|
if url.get('url', None) not in urlSet:
|
228
|
248
|
urls1.append(url)
|
229
|
249
|
|
230
|
250
|
if 'attributes' in infobox2:
|
231
|
251
|
attributes1 = infobox1.get('attributes', None)
|
232
|
|
- if attributes1 == None:
|
|
252
|
+ if attributes1 is None:
|
233
|
253
|
attributes1 = []
|
234
|
254
|
infobox1.set('attributes', attributes1)
|
235
|
255
|
|
|
@@ -237,14 +257,14 @@ def merge_two_infoboxes(infobox1, infobox2):
|
237
|
257
|
for attribute in infobox1.get('attributes', []):
|
238
|
258
|
if attribute.get('label', None) not in attributeSet:
|
239
|
259
|
attributeSet.add(attribute.get('label', None))
|
240
|
|
-
|
|
260
|
+
|
241
|
261
|
for attribute in infobox2.get('attributes', []):
|
242
|
262
|
attributes1.append(attribute)
|
243
|
263
|
|
244
|
264
|
if 'content' in infobox2:
|
245
|
265
|
content1 = infobox1.get('content', None)
|
246
|
266
|
content2 = infobox2.get('content', '')
|
247
|
|
- if content1 != None:
|
|
267
|
+ if content1 is not None:
|
248
|
268
|
if content_result_len(content2) > content_result_len(content1):
|
249
|
269
|
infobox1['content'] = content2
|
250
|
270
|
else:
|
|
@@ -257,12 +277,12 @@ def merge_infoboxes(infoboxes):
|
257
|
277
|
for infobox in infoboxes:
|
258
|
278
|
add_infobox = True
|
259
|
279
|
infobox_id = infobox.get('id', None)
|
260
|
|
- if infobox_id != None:
|
|
280
|
+ if infobox_id is not None:
|
261
|
281
|
existingIndex = infoboxes_id.get(infobox_id, None)
|
262
|
|
- if existingIndex != None:
|
|
282
|
+ if existingIndex is not None:
|
263
|
283
|
merge_two_infoboxes(results[existingIndex], infobox)
|
264
|
|
- add_infobox=False
|
265
|
|
-
|
|
284
|
+ add_infobox = False
|
|
285
|
+
|
266
|
286
|
if add_infobox:
|
267
|
287
|
results.append(infobox)
|
268
|
288
|
infoboxes_id[infobox_id] = len(results)-1
|
|
@@ -318,7 +338,8 @@ class Search(object):
|
318
|
338
|
|
319
|
339
|
self.pageno = int(pageno_param)
|
320
|
340
|
|
321
|
|
- # parse query, if tags are set, which change the serch engine or search-language
|
|
341
|
+ # parse query, if tags are set, which change
|
|
342
|
+ # the serch engine or search-language
|
322
|
343
|
query_obj = Query(self.request_data['q'], self.blocked_engines)
|
323
|
344
|
query_obj.parse_query()
|
324
|
345
|
|
|
@@ -334,25 +355,29 @@ class Search(object):
|
334
|
355
|
|
335
|
356
|
self.categories = []
|
336
|
357
|
|
337
|
|
- # if engines are calculated from query, set categories by using that informations
|
|
358
|
+ # if engines are calculated from query,
|
|
359
|
+ # set categories by using that informations
|
338
|
360
|
if self.engines:
|
339
|
361
|
self.categories = list(set(engine['category']
|
340
|
362
|
for engine in self.engines))
|
341
|
363
|
|
342
|
|
- # otherwise, using defined categories to calculate which engines should be used
|
|
364
|
+ # otherwise, using defined categories to
|
|
365
|
+ # calculate which engines should be used
|
343
|
366
|
else:
|
344
|
367
|
# set used categories
|
345
|
368
|
for pd_name, pd in self.request_data.items():
|
346
|
369
|
if pd_name.startswith('category_'):
|
347
|
370
|
category = pd_name[9:]
|
348
|
371
|
# if category is not found in list, skip
|
349
|
|
- if not category in categories:
|
|
372
|
+ if category not in categories:
|
350
|
373
|
continue
|
351
|
374
|
|
352
|
375
|
# add category to list
|
353
|
376
|
self.categories.append(category)
|
354
|
377
|
|
355
|
|
- # if no category is specified for this search, using user-defined default-configuration which (is stored in cookie)
|
|
378
|
+ # if no category is specified for this search,
|
|
379
|
+ # using user-defined default-configuration which
|
|
380
|
+ # (is stored in cookie)
|
356
|
381
|
if not self.categories:
|
357
|
382
|
cookie_categories = request.cookies.get('categories', '')
|
358
|
383
|
cookie_categories = cookie_categories.split(',')
|
|
@@ -360,16 +385,18 @@ class Search(object):
|
360
|
385
|
if ccateg in categories:
|
361
|
386
|
self.categories.append(ccateg)
|
362
|
387
|
|
363
|
|
- # if still no category is specified, using general as default-category
|
|
388
|
+ # if still no category is specified, using general
|
|
389
|
+ # as default-category
|
364
|
390
|
if not self.categories:
|
365
|
391
|
self.categories = ['general']
|
366
|
392
|
|
367
|
|
- # using all engines for that search, which are declared under the specific categories
|
|
393
|
+ # using all engines for that search, which are
|
|
394
|
+ # declared under the specific categories
|
368
|
395
|
for categ in self.categories:
|
369
|
396
|
self.engines.extend({'category': categ,
|
370
|
397
|
'name': x.name}
|
371
|
398
|
for x in categories[categ]
|
372
|
|
- if not x.name in self.blocked_engines)
|
|
399
|
+ if x.name not in self.blocked_engines)
|
373
|
400
|
|
374
|
401
|
# do search-request
|
375
|
402
|
def search(self, request):
|
|
@@ -386,7 +413,7 @@ class Search(object):
|
386
|
413
|
number_of_searches += 1
|
387
|
414
|
|
388
|
415
|
# set default useragent
|
389
|
|
- #user_agent = request.headers.get('User-Agent', '')
|
|
416
|
+ # user_agent = request.headers.get('User-Agent', '')
|
390
|
417
|
user_agent = gen_useragent()
|
391
|
418
|
|
392
|
419
|
# start search-request for all selected engines
|
|
@@ -400,7 +427,8 @@ class Search(object):
|
400
|
427
|
if self.pageno > 1 and not engine.paging:
|
401
|
428
|
continue
|
402
|
429
|
|
403
|
|
- # if search-language is set and engine does not provide language-support, skip
|
|
430
|
+ # if search-language is set and engine does not
|
|
431
|
+ # provide language-support, skip
|
404
|
432
|
if self.lang != 'all' and not engine.language_support:
|
405
|
433
|
continue
|
406
|
434
|
|
|
@@ -412,7 +440,8 @@ class Search(object):
|
412
|
440
|
request_params['pageno'] = self.pageno
|
413
|
441
|
request_params['language'] = self.lang
|
414
|
442
|
|
415
|
|
- # update request parameters dependent on search-engine (contained in engines folder)
|
|
443
|
+ # update request parameters dependent on
|
|
444
|
+ # search-engine (contained in engines folder)
|
416
|
445
|
request_params = engine.request(self.query.encode('utf-8'),
|
417
|
446
|
request_params)
|
418
|
447
|
|
|
@@ -431,7 +460,8 @@ class Search(object):
|
431
|
460
|
request_params
|
432
|
461
|
)
|
433
|
462
|
|
434
|
|
- # create dictionary which contain all informations about the request
|
|
463
|
+ # create dictionary which contain all
|
|
464
|
+ # informations about the request
|
435
|
465
|
request_args = dict(
|
436
|
466
|
headers=request_params['headers'],
|
437
|
467
|
hooks=dict(response=callback),
|