浏览代码

add comments to search.py

* add comments
* add licence-header
Thomas Pointhuber 10 年前
父节点
当前提交
c9bab0e833
共有 1 个文件被更改,包括 118 次插入0 次删除
  1. 118
    0
      searx/search.py

+ 118
- 0
searx/search.py 查看文件

1
+'''
2
+searx is free software: you can redistribute it and/or modify
3
+it under the terms of the GNU Affero General Public License as published by
4
+the Free Software Foundation, either version 3 of the License, or
5
+(at your option) any later version.
6
+
7
+searx is distributed in the hope that it will be useful,
8
+but WITHOUT ANY WARRANTY; without even the implied warranty of
9
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
+GNU Affero General Public License for more details.
11
+
12
+You should have received a copy of the GNU Affero General Public License
13
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
14
+
15
+(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
16
+'''
17
+
1
 import grequests
18
 import grequests
2
 from itertools import izip_longest, chain
19
 from itertools import izip_longest, chain
3
 from datetime import datetime
20
 from datetime import datetime
9
 from searx.languages import language_codes
26
 from searx.languages import language_codes
10
 from searx.utils import gen_useragent
27
 from searx.utils import gen_useragent
11
 
28
 
29
+
12
 number_of_searches = 0
30
 number_of_searches = 0
13
 
31
 
14
 
32
 
33
+# get default request parameters
15
 def default_request_params():
34
 def default_request_params():
16
     return {
35
     return {
17
         'method': 'GET', 'headers': {}, 'data': {}, 'url': '', 'cookies': {}}
36
         'method': 'GET', 'headers': {}, 'data': {}, 'url': '', 'cookies': {}}
18
 
37
 
19
 
38
 
39
+# create a callback wrapper for the search engine results
20
 def make_callback(engine_name, results, suggestions, callback, params):
40
 def make_callback(engine_name, results, suggestions, callback, params):
41
+
21
     # creating a callback wrapper for the search engine results
42
     # creating a callback wrapper for the search engine results
22
     def process_callback(response, **kwargs):
43
     def process_callback(response, **kwargs):
23
         cb_res = []
44
         cb_res = []
24
         response.search_params = params
45
         response.search_params = params
46
+
47
+        # update stats with current page-load-time
25
         engines[engine_name].stats['page_load_time'] += \
48
         engines[engine_name].stats['page_load_time'] += \
26
             (datetime.now() - params['started']).total_seconds()
49
             (datetime.now() - params['started']).total_seconds()
50
+
27
         try:
51
         try:
28
             search_results = callback(response)
52
             search_results = callback(response)
29
         except Exception, e:
53
         except Exception, e:
54
+            # increase errors stats
30
             engines[engine_name].stats['errors'] += 1
55
             engines[engine_name].stats['errors'] += 1
31
             results[engine_name] = cb_res
56
             results[engine_name] = cb_res
57
+
58
+            # print engine name and specific error message
32
             print '[E] Error with engine "{0}":\n\t{1}'.format(
59
             print '[E] Error with engine "{0}":\n\t{1}'.format(
33
                 engine_name, str(e))
60
                 engine_name, str(e))
34
             return
61
             return
62
+            
35
         for result in search_results:
63
         for result in search_results:
36
             result['engine'] = engine_name
64
             result['engine'] = engine_name
65
+
66
+            # if it is a suggestion, add it to list of suggestions
37
             if 'suggestion' in result:
67
             if 'suggestion' in result:
38
                 # TODO type checks
68
                 # TODO type checks
39
                 suggestions.add(result['suggestion'])
69
                 suggestions.add(result['suggestion'])
40
                 continue
70
                 continue
71
+
72
+            # append result
41
             cb_res.append(result)
73
             cb_res.append(result)
74
+
42
         results[engine_name] = cb_res
75
         results[engine_name] = cb_res
76
+
43
     return process_callback
77
     return process_callback
44
 
78
 
45
 
79
 
80
+# score results and remove duplications
46
 def score_results(results):
81
 def score_results(results):
82
+    # calculate scoring parameters
47
     flat_res = filter(
83
     flat_res = filter(
48
         None, chain.from_iterable(izip_longest(*results.values())))
84
         None, chain.from_iterable(izip_longest(*results.values())))
49
     flat_len = len(flat_res)
85
     flat_len = len(flat_res)
50
     engines_len = len(results)
86
     engines_len = len(results)
87
+
51
     results = []
88
     results = []
89
+
52
     # deduplication + scoring
90
     # deduplication + scoring
53
     for i, res in enumerate(flat_res):
91
     for i, res in enumerate(flat_res):
54
 
92
 
62
         res['engines'] = [res['engine']]
100
         res['engines'] = [res['engine']]
63
         weight = 1.0
101
         weight = 1.0
64
 
102
 
103
+        # get weight of this engine if possible
65
         if hasattr(engines[res['engine']], 'weight'):
104
         if hasattr(engines[res['engine']], 'weight'):
66
             weight = float(engines[res['engine']].weight)
105
             weight = float(engines[res['engine']].weight)
67
 
106
 
107
+        # calculate score for that engine
68
         score = int((flat_len - i) / engines_len) * weight + 1
108
         score = int((flat_len - i) / engines_len) * weight + 1
109
+
69
         duplicated = False
110
         duplicated = False
70
 
111
 
112
+        # check for duplicates
71
         for new_res in results:
113
         for new_res in results:
114
+            # remove / from the end of the url if required
72
             p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path  # noqa
115
             p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path  # noqa
73
             p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path  # noqa
116
             p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path  # noqa
117
+
118
+            # check if that result is a duplicate
74
             if res['host'] == new_res['host'] and\
119
             if res['host'] == new_res['host'] and\
75
                unquote(p1) == unquote(p2) and\
120
                unquote(p1) == unquote(p2) and\
76
                res['parsed_url'].query == new_res['parsed_url'].query and\
121
                res['parsed_url'].query == new_res['parsed_url'].query and\
77
                res.get('template') == new_res.get('template'):
122
                res.get('template') == new_res.get('template'):
78
                 duplicated = new_res
123
                 duplicated = new_res
79
                 break
124
                 break
125
+
126
+        # merge duplicates together
80
         if duplicated:
127
         if duplicated:
128
+            # keep the content that compares greater (NOTE: `>` on strings is lexicographic, not a length comparison)
81
             if res.get('content') > duplicated.get('content'):
129
             if res.get('content') > duplicated.get('content'):
82
                 duplicated['content'] = res['content']
130
                 duplicated['content'] = res['content']
131
+
132
+            # increase result-score
83
             duplicated['score'] += score
133
             duplicated['score'] += score
134
+
135
+            # add engine to list of result-engines
84
             duplicated['engines'].append(res['engine'])
136
             duplicated['engines'].append(res['engine'])
137
+
138
+            # using https if possible
85
             if duplicated['parsed_url'].scheme == 'https':
139
             if duplicated['parsed_url'].scheme == 'https':
86
                 continue
140
                 continue
87
             elif res['parsed_url'].scheme == 'https':
141
             elif res['parsed_url'].scheme == 'https':
88
                 duplicated['url'] = res['parsed_url'].geturl()
142
                 duplicated['url'] = res['parsed_url'].geturl()
89
                 duplicated['parsed_url'] = res['parsed_url']
143
                 duplicated['parsed_url'] = res['parsed_url']
144
+
145
+        # if there is no duplicate found, append result
90
         else:
146
         else:
91
             res['score'] = score
147
             res['score'] = score
92
             results.append(res)
148
             results.append(res)
149
+
150
+    # return results sorted by score
93
     return sorted(results, key=itemgetter('score'), reverse=True)
151
     return sorted(results, key=itemgetter('score'), reverse=True)
94
 
152
 
95
 
153
 
98
     """Search information container"""
156
     """Search information container"""
99
 
157
 
100
     def __init__(self, request):
158
     def __init__(self, request):
159
+        # init vars
101
         super(Search, self).__init__()
160
         super(Search, self).__init__()
102
         self.query = None
161
         self.query = None
103
         self.engines = []
162
         self.engines = []
105
         self.paging = False
164
         self.paging = False
106
         self.pageno = 1
165
         self.pageno = 1
107
         self.lang = 'all'
166
         self.lang = 'all'
167
+
168
+        # set blocked engines
108
         if request.cookies.get('blocked_engines'):
169
         if request.cookies.get('blocked_engines'):
109
             self.blocked_engines = request.cookies['blocked_engines'].split(',')  # noqa
170
             self.blocked_engines = request.cookies['blocked_engines'].split(',')  # noqa
110
         else:
171
         else:
111
             self.blocked_engines = []
172
             self.blocked_engines = []
173
+
112
         self.results = []
174
         self.results = []
113
         self.suggestions = []
175
         self.suggestions = []
114
         self.request_data = {}
176
         self.request_data = {}
115
 
177
 
178
+        # set specific language if set
116
         if request.cookies.get('language')\
179
         if request.cookies.get('language')\
117
            and request.cookies['language'] in (x[0] for x in language_codes):
180
            and request.cookies['language'] in (x[0] for x in language_codes):
118
             self.lang = request.cookies['language']
181
             self.lang = request.cookies['language']
119
 
182
 
183
+        # set request method
120
         if request.method == 'POST':
184
         if request.method == 'POST':
121
             self.request_data = request.form
185
             self.request_data = request.form
122
         else:
186
         else:
126
         if not self.request_data.get('q'):
190
         if not self.request_data.get('q'):
127
             raise Exception('noquery')
191
             raise Exception('noquery')
128
 
192
 
193
+        # set query
129
         self.query = self.request_data['q']
194
         self.query = self.request_data['q']
130
 
195
 
196
+        # set pagenumber
131
         pageno_param = self.request_data.get('pageno', '1')
197
         pageno_param = self.request_data.get('pageno', '1')
132
         if not pageno_param.isdigit() or int(pageno_param) < 1:
198
         if not pageno_param.isdigit() or int(pageno_param) < 1:
133
             raise Exception('wrong pagenumber')
199
             raise Exception('wrong pagenumber')
134
 
200
 
135
         self.pageno = int(pageno_param)
201
         self.pageno = int(pageno_param)
136
 
202
 
203
+        # parse query for tags which change the search engine or search-language
137
         self.parse_query()
204
         self.parse_query()
138
 
205
 
139
         self.categories = []
206
         self.categories = []
140
 
207
 
208
+        # if engines were determined from the query, derive the categories from them
141
         if self.engines:
209
         if self.engines:
142
             self.categories = list(set(engine['category']
210
             self.categories = list(set(engine['category']
143
                                        for engine in self.engines))
211
                                        for engine in self.engines))
212
+
213
+        # otherwise, using defined categories to calculate which engines should be used
144
         else:
214
         else:
215
+            # set used categories
145
             for pd_name, pd in self.request_data.items():
216
             for pd_name, pd in self.request_data.items():
146
                 if pd_name.startswith('category_'):
217
                 if pd_name.startswith('category_'):
147
                     category = pd_name[9:]
218
                     category = pd_name[9:]
219
+                    # if category is not found in list, skip
148
                     if not category in categories:
220
                     if not category in categories:
149
                         continue
221
                         continue
222
+
223
+                    # add category to list
150
                     self.categories.append(category)
224
                     self.categories.append(category)
225
+
226
+            # if no category is specified for this search, use the user-defined default configuration (which is stored in a cookie)
151
             if not self.categories:
227
             if not self.categories:
152
                 cookie_categories = request.cookies.get('categories', '')
228
                 cookie_categories = request.cookies.get('categories', '')
153
                 cookie_categories = cookie_categories.split(',')
229
                 cookie_categories = cookie_categories.split(',')
154
                 for ccateg in cookie_categories:
230
                 for ccateg in cookie_categories:
155
                     if ccateg in categories:
231
                     if ccateg in categories:
156
                         self.categories.append(ccateg)
232
                         self.categories.append(ccateg)
233
+
234
+            # if still no category is specified, using general as default-category
157
             if not self.categories:
235
             if not self.categories:
158
                 self.categories = ['general']
236
                 self.categories = ['general']
159
 
237
 
238
+            # using all engines for that search, which are declared under the specific categories
160
             for categ in self.categories:
239
             for categ in self.categories:
161
                 self.engines.extend({'category': categ,
240
                 self.engines.extend({'category': categ,
162
                                      'name': x.name}
241
                                      'name': x.name}
163
                                     for x in categories[categ]
242
                                     for x in categories[categ]
164
                                     if not x.name in self.blocked_engines)
243
                                     if not x.name in self.blocked_engines)
165
 
244
 
245
+    # parse query for tags which change the search engine or search-language
166
     def parse_query(self):
246
     def parse_query(self):
167
         query_parts = self.query.split()
247
         query_parts = self.query.split()
168
         modified = False
248
         modified = False
249
+
250
+        # check if language-prefix is set
169
         if query_parts[0].startswith(':'):
251
         if query_parts[0].startswith(':'):
170
             lang = query_parts[0][1:].lower()
252
             lang = query_parts[0][1:].lower()
171
 
253
 
254
+            # check if the prefix matches any of the declared language-codes
172
             for lc in language_codes:
255
             for lc in language_codes:
173
                 lang_id, lang_name, country = map(str.lower, lc)
256
                 lang_id, lang_name, country = map(str.lower, lc)
257
+
258
+                # if correct language-code is found, set it as new search-language
174
                 if lang == lang_id\
259
                 if lang == lang_id\
175
                    or lang_id.startswith(lang)\
260
                    or lang_id.startswith(lang)\
176
                    or lang == lang_name\
261
                    or lang == lang_name\
179
                     modified = True
264
                     modified = True
180
                     break
265
                     break
181
 
266
 
267
+        # check if category/engine prefix is set
182
         elif query_parts[0].startswith('!'):
268
         elif query_parts[0].startswith('!'):
183
             prefix = query_parts[0][1:].replace('_', ' ')
269
             prefix = query_parts[0][1:].replace('_', ' ')
184
 
270
 
271
+            # check if prefix equal with engine shortcut
185
             if prefix in engine_shortcuts\
272
             if prefix in engine_shortcuts\
186
                and not engine_shortcuts[prefix] in self.blocked_engines:
273
                and not engine_shortcuts[prefix] in self.blocked_engines:
187
                 modified = True
274
                 modified = True
188
                 self.engines.append({'category': 'none',
275
                 self.engines.append({'category': 'none',
189
                                      'name': engine_shortcuts[prefix]})
276
                                      'name': engine_shortcuts[prefix]})
277
+
278
+            # check if prefix equal with engine name
190
             elif prefix in engines\
279
             elif prefix in engines\
191
                     and not prefix in self.blocked_engines:
280
                     and not prefix in self.blocked_engines:
192
                 modified = True
281
                 modified = True
193
                 self.engines.append({'category': 'none',
282
                 self.engines.append({'category': 'none',
194
                                     'name': prefix})
283
                                     'name': prefix})
284
+
285
+            # check if prefix is equal to a category name
195
             elif prefix in categories:
286
             elif prefix in categories:
196
                 modified = True
287
                 modified = True
288
+                # use all engines for that search which are declared under that category name
197
                 self.engines.extend({'category': prefix,
289
                 self.engines.extend({'category': prefix,
198
                                     'name': engine.name}
290
                                     'name': engine.name}
199
                                     for engine in categories[prefix]
291
                                     for engine in categories[prefix]
200
                                     if not engine in self.blocked_engines)
292
                                     if not engine in self.blocked_engines)
293
+
294
+        # if a language, category or engine was specified in this query, search for more tags which do the same
201
         if modified:
295
         if modified:
202
             self.query = self.query.replace(query_parts[0], '', 1).strip()
296
             self.query = self.query.replace(query_parts[0], '', 1).strip()
203
             self.parse_query()
297
             self.parse_query()
204
 
298
 
299
+    # do search-request
205
     def search(self, request):
300
     def search(self, request):
206
         global number_of_searches
301
         global number_of_searches
302
+
303
+        # init vars
207
         requests = []
304
         requests = []
208
         results = {}
305
         results = {}
209
         suggestions = set()
306
         suggestions = set()
307
+
308
+        # increase number of active searches
210
         number_of_searches += 1
309
         number_of_searches += 1
310
+
311
+        # set default useragent
211
         #user_agent = request.headers.get('User-Agent', '')
312
         #user_agent = request.headers.get('User-Agent', '')
212
         user_agent = gen_useragent()
313
         user_agent = gen_useragent()
213
 
314
 
315
+        # start search-request for all selected engines
214
         for selected_engine in self.engines:
316
         for selected_engine in self.engines:
215
             if selected_engine['name'] not in engines:
317
             if selected_engine['name'] not in engines:
216
                 continue
318
                 continue
217
 
319
 
218
             engine = engines[selected_engine['name']]
320
             engine = engines[selected_engine['name']]
219
 
321
 
322
+            # if paging is not supported, skip
220
             if self.pageno > 1 and not engine.paging:
323
             if self.pageno > 1 and not engine.paging:
221
                 continue
324
                 continue
222
 
325
 
326
+            # if search-language is set and engine does not provide language-support, skip
223
             if self.lang != 'all' and not engine.language_support:
327
             if self.lang != 'all' and not engine.language_support:
224
                 continue
328
                 continue
225
 
329
 
330
+            # set default request parameters
226
             request_params = default_request_params()
331
             request_params = default_request_params()
227
             request_params['headers']['User-Agent'] = user_agent
332
             request_params['headers']['User-Agent'] = user_agent
228
             request_params['category'] = selected_engine['category']
333
             request_params['category'] = selected_engine['category']
229
             request_params['started'] = datetime.now()
334
             request_params['started'] = datetime.now()
230
             request_params['pageno'] = self.pageno
335
             request_params['pageno'] = self.pageno
231
             request_params['language'] = self.lang
336
             request_params['language'] = self.lang
337
+
338
+            # update request parameters dependent on search-engine (contained in engines folder)
232
             request_params = engine.request(self.query.encode('utf-8'),
339
             request_params = engine.request(self.query.encode('utf-8'),
233
                                             request_params)
340
                                             request_params)
234
 
341
 
236
                 # TODO add support of offline engines
343
                 # TODO add support of offline engines
237
                 pass
344
                 pass
238
 
345
 
346
+            # create a callback wrapper for the search engine results
239
             callback = make_callback(
347
             callback = make_callback(
240
                 selected_engine['name'],
348
                 selected_engine['name'],
241
                 results,
349
                 results,
244
                 request_params
352
                 request_params
245
             )
353
             )
246
 
354
 
355
+            # create dictionary which contains all information about the request
247
             request_args = dict(
356
             request_args = dict(
248
                 headers=request_params['headers'],
357
                 headers=request_params['headers'],
249
                 hooks=dict(response=callback),
358
                 hooks=dict(response=callback),
251
                 timeout=engine.timeout
360
                 timeout=engine.timeout
252
             )
361
             )
253
 
362
 
363
+            # specific type of request (GET or POST)
254
             if request_params['method'] == 'GET':
364
             if request_params['method'] == 'GET':
255
                 req = grequests.get
365
                 req = grequests.get
256
             else:
366
             else:
261
             if not request_params['url']:
371
             if not request_params['url']:
262
                 continue
372
                 continue
263
 
373
 
374
+            # append request to list
264
             requests.append(req(request_params['url'], **request_args))
375
             requests.append(req(request_params['url'], **request_args))
376
+
377
+        # send all search-requests
265
         grequests.map(requests)
378
         grequests.map(requests)
379
+
380
+        # update engine-specific stats
266
         for engine_name, engine_results in results.items():
381
         for engine_name, engine_results in results.items():
267
             engines[engine_name].stats['search_count'] += 1
382
             engines[engine_name].stats['search_count'] += 1
268
             engines[engine_name].stats['result_count'] += len(engine_results)
383
             engines[engine_name].stats['result_count'] += len(engine_results)
269
 
384
 
385
+        # score results and remove duplications
270
         results = score_results(results)
386
         results = score_results(results)
271
 
387
 
388
+        # update engine stats, using calculated score
272
         for result in results:
389
         for result in results:
273
             for res_engine in result['engines']:
390
             for res_engine in result['engines']:
274
                 engines[result['engine']]\
391
                 engines[result['engine']]\
275
                     .stats['score_count'] += result['score']
392
                     .stats['score_count'] += result['score']
276
 
393
 
394
+        # return results and suggestions
277
         return results, suggestions
395
         return results, suggestions