@@ -19,19 +19,12 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
 from os.path import realpath, dirname, splitext, join
 import sys
 from imp import load_source
-from itertools import izip_longest, chain
-from operator import itemgetter
-from urlparse import urlparse, unquote
-from datetime import datetime
-import grequests
 from flask.ext.babel import gettext
+from operator import itemgetter
 from searx import settings
-from searx.utils import gen_useragent
 
 engine_dir = dirname(realpath(__file__))
 
-number_of_searches = 0
-
 engines = {}
 
 categories = {'general': []}
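A note for readers: the imports removed above are Python 2 spellings, which dates this module. Their Python 3 equivalents, given here purely as a reference sketch and not as part of this change, would be:

    # Python 3 spellings of the Python 2 imports removed above (reference only)
    from itertools import zip_longest, chain    # izip_longest was renamed zip_longest
    from urllib.parse import urlparse, unquote  # the urlparse module moved into urllib.parse

datetime and grequests import the same way under Python 3.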
@@ -114,160 +107,6 @@ for engine_data in settings['engines']:
     engine_shortcuts[engine.shortcut] = engine.name
 
 
-def default_request_params():
-    return {
-        'method': 'GET', 'headers': {}, 'data': {}, 'url': '', 'cookies': {}}
-
-
-def make_callback(engine_name, results, suggestions, callback, params):
-    # creating a callback wrapper for the search engine results
-    def process_callback(response, **kwargs):
-        cb_res = []
-        response.search_params = params
-        engines[engine_name].stats['page_load_time'] += \
-            (datetime.now() - params['started']).total_seconds()
-        try:
-            search_results = callback(response)
-        except Exception, e:
-            engines[engine_name].stats['errors'] += 1
-            results[engine_name] = cb_res
-            print '[E] Error with engine "{0}":\n\t{1}'.format(
-                engine_name, str(e))
-            return
-        for result in search_results:
-            result['engine'] = engine_name
-            if 'suggestion' in result:
-                # TODO type checks
-                suggestions.add(result['suggestion'])
-                continue
-            cb_res.append(result)
-        results[engine_name] = cb_res
-    return process_callback
-
-
-def score_results(results):
-    flat_res = filter(
-        None, chain.from_iterable(izip_longest(*results.values())))
-    flat_len = len(flat_res)
-    engines_len = len(results)
-    results = []
-    # deduplication + scoring
-    for i, res in enumerate(flat_res):
-
-        res['parsed_url'] = urlparse(res['url'])
-
-        res['host'] = res['parsed_url'].netloc
-
-        if res['host'].startswith('www.'):
-            res['host'] = res['host'].replace('www.', '', 1)
-
-        res['engines'] = [res['engine']]
-        weight = 1.0
-
-        if hasattr(engines[res['engine']], 'weight'):
-            weight = float(engines[res['engine']].weight)
-
-        score = int((flat_len - i) / engines_len) * weight + 1
-        duplicated = False
-
-        for new_res in results:
-            p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path  # noqa
-            p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path  # noqa
-            if res['host'] == new_res['host'] and\
-               unquote(p1) == unquote(p2) and\
-               res['parsed_url'].query == new_res['parsed_url'].query and\
-               res.get('template') == new_res.get('template'):
-                duplicated = new_res
-                break
-        if duplicated:
-            if res.get('content') > duplicated.get('content'):
-                duplicated['content'] = res['content']
-            duplicated['score'] += score
-            duplicated['engines'].append(res['engine'])
-            if duplicated['parsed_url'].scheme == 'https':
-                continue
-            elif res['parsed_url'].scheme == 'https':
-                duplicated['url'] = res['parsed_url'].geturl()
-                duplicated['parsed_url'] = res['parsed_url']
-        else:
-            res['score'] = score
-            results.append(res)
-    return sorted(results, key=itemgetter('score'), reverse=True)
-
-
-def search(query, request, selected_engines, pageno=1, lang='all'):
-    global engines, categories, number_of_searches
-    requests = []
-    results = {}
-    suggestions = set()
-    number_of_searches += 1
-    #user_agent = request.headers.get('User-Agent', '')
-    user_agent = gen_useragent()
-
-    for selected_engine in selected_engines:
-        if selected_engine['name'] not in engines:
-            continue
-
-        engine = engines[selected_engine['name']]
-
-        if pageno > 1 and not engine.paging:
-            continue
-
-        if lang != 'all' and not engine.language_support:
-            continue
-
-        request_params = default_request_params()
-        request_params['headers']['User-Agent'] = user_agent
-        request_params['category'] = selected_engine['category']
-        request_params['started'] = datetime.now()
-        request_params['pageno'] = pageno
-        request_params['language'] = lang
-        request_params = engine.request(query.encode('utf-8'), request_params)
-
-        if request_params['url'] is None:
-            # TODO add support of offline engines
-            pass
-
-        callback = make_callback(
-            selected_engine['name'],
-            results,
-            suggestions,
-            engine.response,
-            request_params
-        )
-
-        request_args = dict(
-            headers=request_params['headers'],
-            hooks=dict(response=callback),
-            cookies=request_params['cookies'],
-            timeout=engine.timeout
-        )
-
-        if request_params['method'] == 'GET':
-            req = grequests.get
-        else:
-            req = grequests.post
-            request_args['data'] = request_params['data']
-
-        # ignoring empty urls
-        if not request_params['url']:
-            continue
-
-        requests.append(req(request_params['url'], **request_args))
-    grequests.map(requests)
-    for engine_name, engine_results in results.items():
-        engines[engine_name].stats['search_count'] += 1
-        engines[engine_name].stats['result_count'] += len(engine_results)
-
-    results = score_results(results)
-
-    for result in results:
-        for res_engine in result['engines']:
-            engines[result['engine']].stats['score_count'] += result['score']
-
-    return results, suggestions
-
-
 def get_engines_stats():
     # TODO refactor
     pageloads = []
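The bulk of what this hunk removes is the result-scoring logic. The idea: interleave the per-engine result lists round-robin with izip_longest so every engine's n-th hit competes at the same rank, give earlier positions higher scores (scaled by an optional per-engine weight), and merge results whose host, unquoted path, and query string match, pooling their scores and preferring the https URL. A minimal, self-contained Python 3 sketch of that ranking idea (not the project's code; the function name is mine, the dedup key is simplified, and per-engine weights are omitted):

    from itertools import chain, zip_longest
    from operator import itemgetter
    from urllib.parse import urlparse, unquote

    def rank_results(results_by_engine):
        # results_by_engine: {engine_name: [{'url': ...}, ...]}
        # Round-robin interleave, so each engine's top result lands near the front.
        flat = [r for r in chain.from_iterable(
            zip_longest(*results_by_engine.values())) if r is not None]
        flat_len, n_engines = len(flat), len(results_by_engine)
        ranked = []
        for i, res in enumerate(flat):
            parsed = urlparse(res['url'])
            host = parsed.netloc[4:] if parsed.netloc.startswith('www.') else parsed.netloc
            path = unquote(parsed.path.rstrip('/'))
            score = (flat_len - i) // n_engines + 1  # earlier position -> higher score
            dup = next((r for r in ranked
                        if (r['host'], r['path'], r['query']) == (host, path, parsed.query)),
                       None)
            if dup is not None:
                dup['score'] += score  # duplicates pool their scores across engines
            else:
                ranked.append({'url': res['url'], 'host': host, 'path': path,
                               'query': parsed.query, 'score': score})
        return sorted(ranked, key=itemgetter('score'), reverse=True)

For example, rank_results({'a': [{'url': 'https://example.org/x'}], 'b': [{'url': 'http://example.org/x/'}]}) merges both hits into a single entry with the pooled score, just as the removed score_results did.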
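The removed search() also illustrates the dispatch pattern: each engine request is built as a grequests call carrying a per-engine response hook, and grequests.map() then fires the whole batch concurrently on gevent. A stripped-down sketch of that pattern (the URLs and handler are illustrative; grequests' hooks argument is passed straight through to requests):

    import grequests  # gevent-backed wrapper around requests

    def on_response(response, **kwargs):
        # Runs as each response arrives, like the per-engine callbacks above.
        print(response.url, response.status_code)

    urls = ['https://example.com/', 'https://example.org/']  # placeholder URLs
    reqs = [grequests.get(u, hooks={'response': on_response}, timeout=2.0)
            for u in urls]
    grequests.map(reqs)  # dispatch all requests concurrently

Keeping the per-engine state (results, suggestions, timing stats) inside the closure returned by make_callback is what lets all engines share one concurrent batch while still being accounted for separately.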
|