Simplify search.py; essentially an updated version of PR #518

The timeouts in settings.yml now apply to the total time: not only the HTTP request, but also preparing the request and parsing the response.
This was already more or less the case before, since the threaded_requests function abandoned a thread once the timeout expired, even if its HTTP request had finished.
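
Sketched in Python (simplified from the diff below; the numeric values are hypothetical):

    from time import time

    start_time = time()   # taken at the start of Search.search()
    timeout_limit = 3.0   # max of the selected engines' timeouts (settings.yml)

    # ... engine.request() runs for every selected engine while the clock ticks ...

    # the worker threads get only what is left of the budget
    remaining = timeout_limit - (time() - start_time)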

New / changed stats:
* page_load_time: records the time spent in the HTTP request
* page_load_count: the number of HTTP requests
* engine_time: the total execution time of an engine
* engine_time_count: the number of "engine_time" measurements

The average response times shown in the preferences are the engine response times (engine_time / engine_time_count).
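
For one engine, that is (field names as in the diff below):

    avg_response_time = engine.stats['engine_time'] / float(engine.stats['engine_time_count'])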

To sum up (see the sketch after this list):
* Search.search() filters out the engines that can't process the request
* Search.search() calls the search_multiple_requests function
* search_multiple_requests creates one thread per engine; each thread runs the search_one_request function
* search_one_request calls the engine's request function, makes the HTTP request, calls the engine's response function, and extends the result_container
* search_multiple_requests waits for the threads to finish (or time out)
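
A minimal standalone sketch of that flow, with stand-ins for the real engines and result container (the real search_one_request does the request/response handling shown in the diff):

    import threading
    from time import time
    from uuid import uuid4

    def search_one_request(engine_name, query, request_params, result_container, timeout_limit):
        # stand-in for: engine.request() -> HTTP request -> engine.response()
        result_container.append({'engine': engine_name, 'query': query})

    def search_multiple_requests(requests, result_container, timeout_limit):
        start_time = time()
        search_id = str(uuid4())
        for engine_name, query, request_params in requests:
            th = threading.Thread(
                target=search_one_request,
                args=(engine_name, query, request_params, result_container, timeout_limit),
                name=search_id,
            )
            th.start()
        # all threads share one global budget: join with whatever time remains
        for th in threading.enumerate():
            if th.name == search_id:
                remaining_time = max(0.0, timeout_limit - (time() - start_time))
                th.join(remaining_time)

    results = []
    search_multiple_requests([('engine-a', 'test', {}), ('engine-b', 'test', {})], results, 2.0)
    print(results)  # one stand-in result per engine
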
Alexandre Flament, 8 years ago
Commit 01e2648e93
3 changed files, 153 additions and 131 deletions
  1. searx/engines/__init__.py (+40 -33)
  2. searx/search.py (+111 -98)
  3. searx/webapp.py (+2 -0)

searx/engines/__init__.py (+40 -33)

@@ -99,6 +99,9 @@ def load_engine(engine_data):
         'result_count': 0,
         'search_count': 0,
         'page_load_time': 0,
+        'page_load_count': 0,
+        'engine_time': 0,
+        'engine_time_count': 0,
         'score_count': 0,
         'errors': 0
     }
@@ -115,32 +118,56 @@ def load_engine(engine_data):
     return engine


+def to_percentage(stats, maxvalue):
+    for engine_stat in stats:
+        if maxvalue:
+            engine_stat['percentage'] = int(engine_stat['avg'] / maxvalue * 100)
+        else:
+            engine_stat['percentage'] = 0
+    return stats
+
+
 def get_engines_stats():
     # TODO refactor
     pageloads = []
+    engine_times = []
     results = []
     scores = []
     errors = []
     scores_per_result = []

-    max_pageload = max_results = max_score = max_errors = max_score_per_result = 0  # noqa
+    max_pageload = max_engine_times = max_results = max_score = max_errors = max_score_per_result = 0  # noqa
     for engine in engines.values():
         if engine.stats['search_count'] == 0:
             continue
         results_num = \
             engine.stats['result_count'] / float(engine.stats['search_count'])
-        load_times = engine.stats['page_load_time'] / float(engine.stats['search_count'])  # noqa
+
+        if engine.stats['page_load_count'] != 0:
+            load_times = engine.stats['page_load_time'] / float(engine.stats['page_load_count'])  # noqa
+        else:
+            load_times = 0
+
+        if engine.stats['engine_time_count'] != 0:
+            this_engine_time = engine.stats['engine_time'] / float(engine.stats['engine_time_count'])  # noqa
+        else:
+            this_engine_time = 0
+
         if results_num:
             score = engine.stats['score_count'] / float(engine.stats['search_count'])  # noqa
             score_per_result = score / results_num
         else:
             score = score_per_result = 0.0
-        max_results = max(results_num, max_results)
+
         max_pageload = max(load_times, max_pageload)
+        max_engine_times = max(this_engine_time, max_engine_times)
+        max_results = max(results_num, max_results)
         max_score = max(score, max_score)
         max_score_per_result = max(score_per_result, max_score_per_result)
         max_errors = max(max_errors, engine.stats['errors'])
+
         pageloads.append({'avg': load_times, 'name': engine.name})
+        engine_times.append({'avg': this_engine_time, 'name': engine.name})
         results.append({'avg': results_num, 'name': engine.name})
         scores.append({'avg': score, 'name': engine.name})
         errors.append({'avg': engine.stats['errors'], 'name': engine.name})
@@ -149,39 +176,19 @@ def get_engines_stats():
             'name': engine.name
         })

-    for engine in pageloads:
-        if max_pageload:
-            engine['percentage'] = int(engine['avg'] / max_pageload * 100)
-        else:
-            engine['percentage'] = 0
-
-    for engine in results:
-        if max_results:
-            engine['percentage'] = int(engine['avg'] / max_results * 100)
-        else:
-            engine['percentage'] = 0
-
-    for engine in scores:
-        if max_score:
-            engine['percentage'] = int(engine['avg'] / max_score * 100)
-        else:
-            engine['percentage'] = 0
-
-    for engine in scores_per_result:
-        if max_score_per_result:
-            engine['percentage'] = int(engine['avg']
-                                       / max_score_per_result * 100)
-        else:
-            engine['percentage'] = 0
-
-    for engine in errors:
-        if max_errors:
-            engine['percentage'] = int(float(engine['avg']) / max_errors * 100)
-        else:
-            engine['percentage'] = 0
+    pageloads = to_percentage(pageloads, max_pageload)
+    engine_times = to_percentage(engine_times, max_engine_times)
+    results = to_percentage(results, max_results)
+    scores = to_percentage(scores, max_score)
+    scores_per_result = to_percentage(scores_per_result, max_score_per_result)
+    errors = to_percentage(errors, max_errors)
 
     return [
         (
+            gettext('Engine time (sec)'),
+            sorted(engine_times, key=itemgetter('avg'))
+        ),
+        (
             gettext('Page loads (sec)'),
             sorted(pageloads, key=itemgetter('avg'))
         ),
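
For illustration, the new to_percentage helper applied to made-up stats (engine names and values hypothetical):

    stats = [{'name': 'engine-a', 'avg': 0.5}, {'name': 'engine-b', 'avg': 2.0}]
    to_percentage(stats, 2.0)
    # engine-a -> 'percentage': 25, engine-b -> 'percentage': 100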

searx/search.py (+111 -98)

@@ -35,14 +35,53 @@ logger = logger.getChild('search')
 number_of_searches = 0


-def search_request_wrapper(fn, url, engine_name, **kwargs):
-    ret = None
-    engine = engines[engine_name]
+def send_http_request(engine, request_params, timeout_limit):
+    response = None
     try:
-        ret = fn(url, **kwargs)
+        # create dictionary which contain all
+        # informations about the request
+        request_args = dict(
+            headers=request_params['headers'],
+            cookies=request_params['cookies'],
+            timeout=timeout_limit,
+            verify=request_params['verify']
+        )
+        # specific type of request (GET or POST)
+        if request_params['method'] == 'GET':
+            req = requests_lib.get
+        else:
+            req = requests_lib.post
+            request_args['data'] = request_params['data']
+
+        # for page_load_time stats
+        time_before_request = time()
+
+        # send the request
+        response = req(request_params['url'], **request_args)
+
         with threading.RLock():
+            # no error : reset the suspend variables
             engine.continuous_errors = 0
             engine.suspend_end_time = 0
+            # update stats with current page-load-time
+            # only the HTTP request
+            engine.stats['page_load_time'] += time() - time_before_request
+            engine.stats['page_load_count'] += 1
+
+        # is there a timeout (no parsing in this case)
+        timeout_overhead = 0.2  # seconds
+        search_duration = time() - request_params['started']
+        if search_duration > timeout_limit + timeout_overhead:
+            logger.exception('engine timeout on HTTP request:'
+                             '{0} (search duration : {1} ms, time-out: {2} )'
+                             .format(engine.name, search_duration, timeout_limit))
+            with threading.RLock():
+                engine.stats['errors'] += 1
+            return False
+
+        # everything is ok : return the response
+        return response
+
     except:
         # increase errors stats
         with threading.RLock():
@@ -51,20 +90,62 @@ def search_request_wrapper(fn, url, engine_name, **kwargs):
             engine.suspend_end_time = time() + min(60, engine.continuous_errors)

         # print engine name and specific error message
-        logger.exception('engine crash: {0}'.format(engine_name))
-    return ret
+        logger.exception('engine crash: {0}'.format(engine.name))
+        return False
+
+
+def search_one_request(engine_name, query, request_params, result_container, timeout_limit):
+    engine = engines[engine_name]
+
+    # update request parameters dependent on
+    # search-engine (contained in engines folder)
+    engine.request(query, request_params)
+
+    # TODO add support of offline engines
+    if request_params['url'] is None:
+        return False
+
+    # ignoring empty urls
+    if not request_params['url']:
+        return False
+
+    # send request
+    response = send_http_request(engine, request_params, timeout_limit)
+
+    # parse response
+    success = None
+    if response:
+        # parse the response
+        response.search_params = request_params
+        search_results = engine.response(response)
+
+        # add results
+        for result in search_results:
+            result['engine'] = engine.name

+        result_container.extend(engine.name, search_results)
+
+        success = True
+    else:
+        success = False

+    with threading.RLock():
+        # update stats : total time
+        engine.stats['engine_time'] += time() - request_params['started']
+        engine.stats['engine_time_count'] += 1

-def threaded_requests(requests):
-    timeout_limit = max(r[2]['timeout'] for r in requests)
-    search_start = time()
+    #
+    return success
+
+
+def search_multiple_requests(requests, result_container, timeout_limit):
+    start_time = time()
     search_id = uuid4().__str__()
-    for fn, url, request_args, engine_name in requests:
-        request_args['timeout'] = timeout_limit
+
+    for engine_name, query, request_params in requests:
         th = threading.Thread(
-            target=search_request_wrapper,
-            args=(fn, url, engine_name),
-            kwargs=request_args,
+            target=search_one_request,
+            args=(engine_name, query, request_params, result_container, timeout_limit),
             name=search_id,
         )
         th._engine_name = engine_name
@@ -72,7 +153,7 @@ def threaded_requests(requests):

     for th in threading.enumerate():
         if th.name == search_id:
-            remaining_time = max(0.0, timeout_limit - (time() - search_start))
+            remaining_time = max(0.0, timeout_limit - (time() - start_time))
             th.join(remaining_time)
             if th.isAlive():
                 logger.warning('engine timeout: {0}'.format(th._engine_name))
@@ -90,44 +171,6 @@ def default_request_params():
     }


-# create a callback wrapper for the search engine results
-def make_callback(engine_name, callback, params, result_container):
-
-    # creating a callback wrapper for the search engine results
-    def process_callback(response, **kwargs):
-        # check if redirect comparing to the True value,
-        # because resp can be a Mock object, and any attribut name returns something.
-        if response.is_redirect is True:
-            logger.debug('{0} redirect on: {1}'.format(engine_name, response))
-            return
-
-        response.search_params = params
-
-        search_duration = time() - params['started']
-        # update stats with current page-load-time
-        with threading.RLock():
-            engines[engine_name].stats['page_load_time'] += search_duration
-
-        timeout_overhead = 0.2  # seconds
-        timeout_limit = engines[engine_name].timeout + timeout_overhead
-
-        if search_duration > timeout_limit:
-            with threading.RLock():
-                engines[engine_name].stats['errors'] += 1
-            return
-
-        # callback
-        search_results = callback(response)
-
-        # add results
-        for result in search_results:
-            result['engine'] = engine_name
-
-        result_container.extend(engine_name, search_results)
-
-    return process_callback
-
-
 def get_search_query_from_webapp(preferences, form):
     query = None
     query_engines = []
@@ -254,6 +297,9 @@ class Search(object):
     def search(self):
         global number_of_searches

+        # start time
+        start_time = time()
+
         # init vars
         requests = []

@@ -266,6 +312,9 @@ class Search(object):

         search_query = self.search_query

+        # max of all selected engine timeout
+        timeout_limit = 0
+
         # start search-reqest for all selected engines
         for selected_engine in search_query.engines:
             if selected_engine['name'] not in engines:
@@ -294,7 +343,7 @@ class Search(object):
             request_params = default_request_params()
             request_params['headers']['User-Agent'] = user_agent
             request_params['category'] = selected_engine['category']
-            request_params['started'] = time()
+            request_params['started'] = start_time
             request_params['pageno'] = search_query.pageno

             if hasattr(engine, 'language') and engine.language:
@@ -306,52 +355,16 @@ class Search(object):
             request_params['safesearch'] = search_query.safesearch
             request_params['time_range'] = search_query.time_range

-            # update request parameters dependent on
-            # search-engine (contained in engines folder)
-            engine.request(search_query.query.encode('utf-8'), request_params)
-
-            if request_params['url'] is None:
-                # TODO add support of offline engines
-                pass
-
-            # create a callback wrapper for the search engine results
-            callback = make_callback(
-                selected_engine['name'],
-                engine.response,
-                request_params,
-                self.result_container)
-
-            # create dictionary which contain all
-            # informations about the request
-            request_args = dict(
-                headers=request_params['headers'],
-                hooks=dict(response=callback),
-                cookies=request_params['cookies'],
-                timeout=engine.timeout,
-                verify=request_params['verify']
-            )
-
-            # specific type of request (GET or POST)
-            if request_params['method'] == 'GET':
-                req = requests_lib.get
-            else:
-                req = requests_lib.post
-                request_args['data'] = request_params['data']
+            # append request to list
+            requests.append((selected_engine['name'], search_query.query.encode('utf-8'), request_params))

-            # ignoring empty urls
-            if not request_params['url']:
-                continue
+            # update timeout_limit
+            timeout_limit = max(timeout_limit, engine.timeout)

-            # append request to list
-            requests.append((req, request_params['url'],
-                             request_args,
-                             selected_engine['name']))
-
-        if not requests:
-            return self.result_container
-        # send all search-request
-        threaded_requests(requests)
-        start_new_thread(gc.collect, tuple())
+        if requests:
+            # send all search-request
+            search_multiple_requests(requests, self.result_container, timeout_limit - (time() - start_time))
+            start_new_thread(gc.collect, tuple())

         # return results, suggestions, answers and infoboxes
         return self.result_container
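
A worked example of the overhead check in send_http_request (numbers hypothetical):

    timeout_limit = 3.0     # seconds, this engine's timeout
    timeout_overhead = 0.2  # fixed grace period, as in send_http_request

    search_duration = 3.25  # hypothetical total time since the search started
    if search_duration > timeout_limit + timeout_overhead:
        print('counted as an engine timeout; the response is not parsed')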

searx/webapp.py (+2 -0)

@@ -593,6 +593,8 @@ def preferences():
             if e.timeout > settings['outgoing']['request_timeout']:
                 stats[e.name]['warn_timeout'] = True

+    # get first element [0], the engine time,
+    # and then the second element [1] : the time (the first one is the label)
     for engine_stat in get_engines_stats()[0][1]:
         stats[engine_stat.get('name')]['time'] = round(engine_stat.get('avg'), 3)
         if engine_stat.get('avg') > settings['outgoing']['request_timeout']:
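
For context, get_engines_stats() now returns the engine-time block first, hence the [0][1] indexing above. A sketch of the shape being consumed (values hypothetical):

    stats_tuples = [
        ('Engine time (sec)', [{'name': 'engine-a', 'avg': 0.42, 'percentage': 100}]),
        ('Page loads (sec)', [{'name': 'engine-a', 'avg': 0.21, 'percentage': 100}]),
        # ... results, scores, errors ...
    ]
    for engine_stat in stats_tuples[0][1]:  # [0] first block, [1] its stats list
        print(engine_stat['name'], round(engine_stat['avg'], 3))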