
Simplify search.py; basically an updated PR #518

The timeouts in settings.yml now cover the total time: not only the HTTP request, but also preparing the request and parsing the response.
This was more or less the case before, since the threaded_requests function stopped waiting for a thread after the timeout even if its HTTP request had already finished.

New / changed stats:
* page_load_time: records the HTTP request time
* page_load_count: the number of HTTP requests
* engine_time: the total execution time of an engine
* engine_time_count: the number of "engine_time" measures

The avg response times shown in the preferences are the engine response times (engine_time / engine_time_count).
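
For illustration, that average falls out of the two counters like this (a minimal sketch; avg_engine_time is a hypothetical helper, the diff below computes the same ratio inline in get_engines_stats):

def avg_engine_time(stats):
    # average over completed runs; guard against engines that never ran
    if stats['engine_time_count'] == 0:
        return 0.0
    return stats['engine_time'] / float(stats['engine_time_count'])

print(avg_engine_time({'engine_time': 1.5, 'engine_time_count': 3}))  # 0.5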

To sum up:
* Search.search() filters out the engines that can't process the request
* Search.search() calls the search_multiple_requests function
* search_multiple_requests creates one thread per engine; each thread runs the search_one_request function
* search_one_request calls the engine's request function, makes the HTTP request, calls the engine's response function, and extends the result_container
* search_multiple_requests waits for the threads to finish (or time out), as the sketch below illustrates
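
A minimal, self-contained sketch of that flow (the names mirror the new functions, but this is an illustration under simplifying assumptions, not the actual searx code; the real request/response handling is in the diff below):

import threading
import time

def search_one_request(engine_name, results, lock):
    # stand-in for: engine.request(), the HTTP request, engine.response()
    time.sleep(0.1)
    with lock:
        results.append(engine_name)

def search_multiple_requests(engine_names, timeout_limit):
    start_time = time.time()
    results, lock = [], threading.Lock()
    threads = []
    for name in engine_names:
        th = threading.Thread(target=search_one_request, args=(name, results, lock))
        th._engine_name = name
        th.start()
        threads.append(th)
    for th in threads:
        # all joins share one deadline: the remaining total budget, not a per-thread timeout
        remaining_time = max(0.0, timeout_limit - (time.time() - start_time))
        th.join(remaining_time)
        if th.is_alive():
            print('engine timeout: {0}'.format(th._engine_name))
    return results

print(search_multiple_requests(['a', 'b', 'c'], timeout_limit=2.0))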
Alexandre Flament authored 8 years ago
parent commit 01e2648e93
3 changed files with 153 additions and 131 deletions
1. searx/engines/__init__.py (+40, -33)
2. searx/search.py (+111, -98)
3. searx/webapp.py (+2, -0)

searx/engines/__init__.py (+40, -33)

@@ -99,6 +99,9 @@
         'result_count': 0,
         'search_count': 0,
         'page_load_time': 0,
+        'page_load_count': 0,
+        'engine_time': 0,
+        'engine_time_count': 0,
         'score_count': 0,
         'errors': 0
     }
@@ -115,73 +118,77 @@
     return engine


+def to_percentage(stats, maxvalue):
+    for engine_stat in stats:
+        if maxvalue:
+            engine_stat['percentage'] = int(engine_stat['avg'] / maxvalue * 100)
+        else:
+            engine_stat['percentage'] = 0
+    return stats
+
+
 def get_engines_stats():
     # TODO refactor
     pageloads = []
+    engine_times = []
     results = []
     scores = []
     errors = []
     scores_per_result = []

-    max_pageload = max_results = max_score = max_errors = max_score_per_result = 0  # noqa
+    max_pageload = max_engine_times = max_results = max_score = max_errors = max_score_per_result = 0  # noqa
     for engine in engines.values():
         if engine.stats['search_count'] == 0:
             continue
         results_num = \
             engine.stats['result_count'] / float(engine.stats['search_count'])
-        load_times = engine.stats['page_load_time'] / float(engine.stats['search_count'])  # noqa
+
+        if engine.stats['page_load_count'] != 0:
+            load_times = engine.stats['page_load_time'] / float(engine.stats['page_load_count'])  # noqa
+        else:
+            load_times = 0
+
+        if engine.stats['engine_time_count'] != 0:
+            this_engine_time = engine.stats['engine_time'] / float(engine.stats['engine_time_count'])  # noqa
+        else:
+            this_engine_time = 0
+
         if results_num:
             score = engine.stats['score_count'] / float(engine.stats['search_count'])  # noqa
             score_per_result = score / results_num
         else:
             score = score_per_result = 0.0
-        max_results = max(results_num, max_results)
+
         max_pageload = max(load_times, max_pageload)
+        max_engine_times = max(this_engine_time, max_engine_times)
+        max_results = max(results_num, max_results)
         max_score = max(score, max_score)
         max_score_per_result = max(score_per_result, max_score_per_result)
         max_errors = max(max_errors, engine.stats['errors'])
+
         pageloads.append({'avg': load_times, 'name': engine.name})
+        engine_times.append({'avg': this_engine_time, 'name': engine.name})
         results.append({'avg': results_num, 'name': engine.name})
         scores.append({'avg': score, 'name': engine.name})
         errors.append({'avg': engine.stats['errors'], 'name': engine.name})
         scores_per_result.append({
             'avg': score_per_result,
             'name': engine.name
         })

-    for engine in pageloads:
-        if max_pageload:
-            engine['percentage'] = int(engine['avg'] / max_pageload * 100)
-        else:
-            engine['percentage'] = 0
-
-    for engine in results:
-        if max_results:
-            engine['percentage'] = int(engine['avg'] / max_results * 100)
-        else:
-            engine['percentage'] = 0
-
-    for engine in scores:
-        if max_score:
-            engine['percentage'] = int(engine['avg'] / max_score * 100)
-        else:
-            engine['percentage'] = 0
-
-    for engine in scores_per_result:
-        if max_score_per_result:
-            engine['percentage'] = int(engine['avg']
-                                       / max_score_per_result * 100)
-        else:
-            engine['percentage'] = 0
-
-    for engine in errors:
-        if max_errors:
-            engine['percentage'] = int(float(engine['avg']) / max_errors * 100)
-        else:
-            engine['percentage'] = 0
+    pageloads = to_percentage(pageloads, max_pageload)
+    engine_times = to_percentage(engine_times, max_engine_times)
+    results = to_percentage(results, max_results)
+    scores = to_percentage(scores, max_score)
+    scores_per_result = to_percentage(scores_per_result, max_score_per_result)
+    errors = to_percentage(errors, max_errors)

     return [
         (
+            gettext('Engine time (sec)'),
+            sorted(engine_times, key=itemgetter('avg'))
+        ),
+        (
             gettext('Page loads (sec)'),
             sorted(pageloads, key=itemgetter('avg'))
         ),
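
The new to_percentage helper collapses the five copy-pasted percentage loops it replaces into one function. A quick usage sketch (values invented for illustration):

stats = [{'avg': 0.5, 'name': 'a'}, {'avg': 2.0, 'name': 'b'}]
print(to_percentage(stats, maxvalue=2.0))
# [{'avg': 0.5, 'name': 'a', 'percentage': 25},
#  {'avg': 2.0, 'name': 'b', 'percentage': 100}]

Note that it mutates the dicts in place and also returns the list, so the reassignments above are equivalent to plain calls.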

searx/search.py (+111, -98)

@@ -35,44 +35,125 @@
 number_of_searches = 0


-def search_request_wrapper(fn, url, engine_name, **kwargs):
-    ret = None
-    engine = engines[engine_name]
+def send_http_request(engine, request_params, timeout_limit):
+    response = None
     try:
-        ret = fn(url, **kwargs)
+        # create a dictionary which contains all
+        # information about the request
+        request_args = dict(
+            headers=request_params['headers'],
+            cookies=request_params['cookies'],
+            timeout=timeout_limit,
+            verify=request_params['verify']
+        )
+        # specific type of request (GET or POST)
+        if request_params['method'] == 'GET':
+            req = requests_lib.get
+        else:
+            req = requests_lib.post
+            request_args['data'] = request_params['data']
+
+        # for page_load_time stats
+        time_before_request = time()
+
+        # send the request
+        response = req(request_params['url'], **request_args)
+
         with threading.RLock():
+            # no error: reset the suspend variables
             engine.continuous_errors = 0
             engine.suspend_end_time = 0
+            # update stats with the current page load time
+            # (only the HTTP request)
+            engine.stats['page_load_time'] += time() - time_before_request
+            engine.stats['page_load_count'] += 1
+
+        # is there a timeout? (no parsing in this case)
+        timeout_overhead = 0.2  # seconds
+        search_duration = time() - request_params['started']
+        if search_duration > timeout_limit + timeout_overhead:
+            logger.exception('engine timeout on HTTP request: '
+                             '{0} (search duration: {1} s, timeout: {2} s)'
+                             .format(engine.name, search_duration, timeout_limit))
+            with threading.RLock():
+                engine.stats['errors'] += 1
+            return False
+
+        # everything is ok: return the response
+        return response
+
     except:
         # increase errors stats
         with threading.RLock():
             engine.stats['errors'] += 1
             engine.continuous_errors += 1
             engine.suspend_end_time = time() + min(60, engine.continuous_errors)

         # print engine name and specific error message
-        logger.exception('engine crash: {0}'.format(engine_name))
-    return ret
+        logger.exception('engine crash: {0}'.format(engine.name))
+        return False
+
+
+def search_one_request(engine_name, query, request_params, result_container, timeout_limit):
+    engine = engines[engine_name]
+
+    # update request parameters dependent on
+    # search-engine (contained in engines folder)
+    engine.request(query, request_params)
+
+    # TODO add support of offline engines
+    if request_params['url'] is None:
+        return False
+
+    # ignoring empty urls
+    if not request_params['url']:
+        return False
+
+    # send request
+    response = send_http_request(engine, request_params, timeout_limit)
+
+    # parse the response
+    success = None
+    if response:
+        # attach the request params to the response
+        response.search_params = request_params
+        search_results = engine.response(response)
+
+        # add results
+        for result in search_results:
+            result['engine'] = engine.name
+
+        result_container.extend(engine.name, search_results)
+
+        success = True
+    else:
+        success = False

+    with threading.RLock():
+        # update stats: total time
+        engine.stats['engine_time'] += time() - request_params['started']
+        engine.stats['engine_time_count'] += 1

-def threaded_requests(requests):
-    timeout_limit = max(r[2]['timeout'] for r in requests)
-    search_start = time()
+    # return whether the request succeeded
+    return success
+
+
+def search_multiple_requests(requests, result_container, timeout_limit):
+    start_time = time()
     search_id = uuid4().__str__()
-    for fn, url, request_args, engine_name in requests:
-        request_args['timeout'] = timeout_limit
+
+    for engine_name, query, request_params in requests:
         th = threading.Thread(
-            target=search_request_wrapper,
-            args=(fn, url, engine_name),
-            kwargs=request_args,
+            target=search_one_request,
+            args=(engine_name, query, request_params, result_container, timeout_limit),
             name=search_id,
         )
         th._engine_name = engine_name
         th.start()

     for th in threading.enumerate():
         if th.name == search_id:
-            remaining_time = max(0.0, timeout_limit - (time() - search_start))
+            remaining_time = max(0.0, timeout_limit - (time() - start_time))
             th.join(remaining_time)
             if th.isAlive():
                 logger.warning('engine timeout: {0}'.format(th._engine_name))
@@ -90,44 +171,6 @@
     }


-# create a callback wrapper for the search engine results
-def make_callback(engine_name, callback, params, result_container):
-
-    # creating a callback wrapper for the search engine results
-    def process_callback(response, **kwargs):
-        # check if redirect comparing to the True value,
-        # because resp can be a Mock object, and any attribut name returns something.
-        if response.is_redirect is True:
-            logger.debug('{0} redirect on: {1}'.format(engine_name, response))
-            return
-
-        response.search_params = params
-
-        search_duration = time() - params['started']
-        # update stats with current page-load-time
-        with threading.RLock():
-            engines[engine_name].stats['page_load_time'] += search_duration
-
-        timeout_overhead = 0.2  # seconds
-        timeout_limit = engines[engine_name].timeout + timeout_overhead
-
-        if search_duration > timeout_limit:
-            with threading.RLock():
-                engines[engine_name].stats['errors'] += 1
-            return
-
-        # callback
-        search_results = callback(response)
-
-        # add results
-        for result in search_results:
-            result['engine'] = engine_name
-
-        result_container.extend(engine_name, search_results)
-
-    return process_callback
-
-
 def get_search_query_from_webapp(preferences, form):
     query = None
     query_engines = []
@@ -254,6 +297,9 @@
     def search(self):
         global number_of_searches

+        # start time
+        start_time = time()
+
         # init vars
         requests = []

@@ -266,6 +312,9 @@

         search_query = self.search_query

+        # max of all selected engine timeouts
+        timeout_limit = 0
+
         # start a search request for all selected engines
         for selected_engine in search_query.engines:
             if selected_engine['name'] not in engines:
@@ -294,7 +343,7 @@
             request_params = default_request_params()
             request_params['headers']['User-Agent'] = user_agent
             request_params['category'] = selected_engine['category']
-            request_params['started'] = time()
+            request_params['started'] = start_time
             request_params['pageno'] = search_query.pageno

             if hasattr(engine, 'language') and engine.language:
@@ -306,52 +355,16 @@
             request_params['safesearch'] = search_query.safesearch
             request_params['time_range'] = search_query.time_range

-            # update request parameters dependent on
-            # search-engine (contained in engines folder)
-            engine.request(search_query.query.encode('utf-8'), request_params)
-
-            if request_params['url'] is None:
-                # TODO add support of offline engines
-                pass
-
-            # create a callback wrapper for the search engine results
-            callback = make_callback(
-                selected_engine['name'],
-                engine.response,
-                request_params,
-                self.result_container)
-
-            # create dictionary which contain all
-            # informations about the request
-            request_args = dict(
-                headers=request_params['headers'],
-                hooks=dict(response=callback),
-                cookies=request_params['cookies'],
-                timeout=engine.timeout,
-                verify=request_params['verify']
-            )
-
-            # specific type of request (GET or POST)
-            if request_params['method'] == 'GET':
-                req = requests_lib.get
-            else:
-                req = requests_lib.post
-                request_args['data'] = request_params['data']
+            # append request to list
+            requests.append((selected_engine['name'], search_query.query.encode('utf-8'), request_params))

-            # ignoring empty urls
-            if not request_params['url']:
-                continue
+            # update timeout_limit
+            timeout_limit = max(timeout_limit, engine.timeout)

-            # append request to list
-            requests.append((req, request_params['url'],
-                             request_args,
-                             selected_engine['name']))
-
-        if not requests:
-            return self.result_container
-        # send all search-request
-        threaded_requests(requests)
-        start_new_thread(gc.collect, tuple())
+        if requests:
+            # send all search requests
+            search_multiple_requests(requests, self.result_container, timeout_limit - (time() - start_time))
+            start_new_thread(gc.collect, tuple())

         # return results, suggestions, answers and infoboxes
         return self.result_container
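
Worth noting: the budget handed to search_multiple_requests is timeout_limit - (time() - start_time), so time spent building request parameters already counts against the total. A worked example with invented numbers:

timeout_limit = 3.0        # max of the selected engines' timeouts (from settings.yml)
prepare_duration = 0.05    # hypothetical time() - start_time after building the requests
budget = timeout_limit - prepare_duration  # 2.95 s left for HTTP requests + parsing

# inside send_http_request, a response arriving after the budget plus the
# 0.2 s overhead allowance is counted as an error even though it arrived:
timeout_overhead = 0.2
search_duration = 3.2      # hypothetical time() - request_params['started']
print(search_duration > budget + timeout_overhead)  # True -> stats['errors'] += 1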

searx/webapp.py (+2, -0)

@@ -593,6 +593,8 @@
             if e.timeout > settings['outgoing']['request_timeout']:
                 stats[e.name]['warn_timeout'] = True

+    # take the first element [0], the engine time stats,
+    # then its second element [1]: the list of stats (the first one is the label)
     for engine_stat in get_engines_stats()[0][1]:
         stats[engine_stat.get('name')]['time'] = round(engine_stat.get('avg'), 3)
         if engine_stat.get('avg') > settings['outgoing']['request_timeout']:
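
After this change, get_engines_stats() returns the engine-time tuple first, so [0][1] picks the list of per-engine averages. A sketch of the shape being indexed (illustrative values):

engines_stats = [
    ('Engine time (sec)', [{'name': 'a', 'avg': 0.8, 'percentage': 100}]),
    ('Page loads (sec)',  [{'name': 'a', 'avg': 0.3, 'percentage': 100}]),
    # ... followed by the results, scores and errors tuples
]
for engine_stat in engines_stats[0][1]:  # [0] engine time tuple, [1] its stats list
    print(engine_stat['name'], round(engine_stat['avg'], 3))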