浏览代码

Merge branch 'comments' of https://github.com/pointhi/searx

Conflicts:
	searx/search.py
Adam Tauber 10 年前
父节点
当前提交
bd2db71fa6
共有 4 个文件被更改,包括 174 次插入2 次删除
  1. 20
    0
      searx/__init__.py
  2. 19
    1
      searx/autocomplete.py
  3. 18
    0
      searx/languages.py
  4. 117
    1
      searx/search.py

+ 20
- 0
searx/__init__.py 查看文件

@@ -1,3 +1,20 @@
1
+'''
2
+searx is free software: you can redistribute it and/or modify
3
+it under the terms of the GNU Affero General Public License as published by
4
+the Free Software Foundation, either version 3 of the License, or
5
+(at your option) any later version.
6
+
7
+searx is distributed in the hope that it will be useful,
8
+but WITHOUT ANY WARRANTY; without even the implied warranty of
9
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
+GNU Affero General Public License for more details.
11
+
12
+You should have received a copy of the GNU Affero General Public License
13
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
14
+
15
+(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
16
+'''
17
+
1 18
 from os import environ
2 19
 from os.path import realpath, dirname, join, abspath
3 20
 try:
@@ -10,11 +27,14 @@ except:
10 27
 searx_dir = abspath(dirname(__file__))
11 28
 engine_dir = dirname(realpath(__file__))
12 29
 
30
+# if possible, set path to settings using the environment variable SEARX_SETTINGS_PATH
13 31
 if 'SEARX_SETTINGS_PATH' in environ:
14 32
     settings_path = environ['SEARX_SETTINGS_PATH']
33
+# otherwise using default path
15 34
 else:
16 35
     settings_path = join(searx_dir, 'settings.yml')
17 36
 
18 37
 
38
+# load settings
19 39
 with open(settings_path) as settings_yaml:
20 40
     settings = load(settings_yaml)

+ 19
- 1
searx/autocomplete.py 查看文件

@@ -1,3 +1,21 @@
1
+'''
2
+searx is free software: you can redistribute it and/or modify
3
+it under the terms of the GNU Affero General Public License as published by
4
+the Free Software Foundation, either version 3 of the License, or
5
+(at your option) any later version.
6
+
7
+searx is distributed in the hope that it will be useful,
8
+but WITHOUT ANY WARRANTY; without even the implied warranty of
9
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
+GNU Affero General Public License for more details.
11
+
12
+You should have received a copy of the GNU Affero General Public License
13
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
14
+
15
+(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
16
+'''
17
+
18
+
1 19
 from lxml import etree
2 20
 from requests import get
3 21
 from json import loads
@@ -22,7 +40,7 @@ def dbpedia(query):
22 40
 
23 41
 
24 42
 def duckduckgo(query):
25
-    # wikipedia autocompleter
43
+    # duckduckgo autocompleter
26 44
     url = 'https://ac.duckduckgo.com/ac/?{0}&type=list'
27 45
 
28 46
     resp = loads(get(url.format(urlencode(dict(q=query)))).text)

+ 18
- 0
searx/languages.py 查看文件

@@ -1,3 +1,21 @@
1
+'''
2
+searx is free software: you can redistribute it and/or modify
3
+it under the terms of the GNU Affero General Public License as published by
4
+the Free Software Foundation, either version 3 of the License, or
5
+(at your option) any later version.
6
+
7
+searx is distributed in the hope that it will be useful,
8
+but WITHOUT ANY WARRANTY; without even the implied warranty of
9
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
+GNU Affero General Public License for more details.
11
+
12
+You should have received a copy of the GNU Affero General Public License
13
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
14
+
15
+(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
16
+'''
17
+
18
+# list of language codes
1 19
 language_codes = (
2 20
     ("ar_XA", "Arabic", "Arabia"),
3 21
     ("bg_BG", "Bulgarian", "Bulgaria"),

+ 117
- 1
searx/search.py 查看文件

@@ -1,3 +1,20 @@
1
+'''
2
+searx is free software: you can redistribute it and/or modify
3
+it under the terms of the GNU Affero General Public License as published by
4
+the Free Software Foundation, either version 3 of the License, or
5
+(at your option) any later version.
6
+
7
+searx is distributed in the hope that it will be useful,
8
+but WITHOUT ANY WARRANTY; without even the implied warranty of
9
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
+GNU Affero General Public License for more details.
11
+
12
+You should have received a copy of the GNU Affero General Public License
13
+along with searx. If not, see < http://www.gnu.org/licenses/ >.
14
+
15
+(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
16
+'''
17
+
1 18
 import grequests
2 19
 from itertools import izip_longest, chain
3 20
 from datetime import datetime
@@ -9,45 +26,65 @@ from searx.engines import (
9 26
 from searx.languages import language_codes
10 27
 from searx.utils import gen_useragent
11 28
 
29
+
12 30
 number_of_searches = 0
13 31
 
14 32
 
33
+# get default request parameters
15 34
 def default_request_params():
16 35
     return {
17 36
         'method': 'GET', 'headers': {}, 'data': {}, 'url': '', 'cookies': {}}
18 37
 
19 38
 
39
+# create a callback wrapper for the search engine results
20 40
 def make_callback(engine_name, results, suggestions, callback, params):
41
+
21 42
     # creating a callback wrapper for the search engine results
22 43
     def process_callback(response, **kwargs):
23 44
         cb_res = []
24 45
         response.search_params = params
46
+
47
+        # update stats with current page-load-time
25 48
         engines[engine_name].stats['page_load_time'] += \
26 49
             (datetime.now() - params['started']).total_seconds()
50
+
27 51
         try:
28 52
             search_results = callback(response)
29 53
         except Exception, e:
54
+            # increase errors stats
30 55
             engines[engine_name].stats['errors'] += 1
31 56
             results[engine_name] = cb_res
57
+
58
+            # print engine name and specific error message
32 59
             print '[E] Error with engine "{0}":\n\t{1}'.format(
33 60
                 engine_name, str(e))
34 61
             return
62
+
35 63
         for result in search_results:
36 64
             result['engine'] = engine_name
65
+
66
+            # if it is a suggestion, add it to list of suggestions
37 67
             if 'suggestion' in result:
38 68
                 # TODO type checks
39 69
                 suggestions.add(result['suggestion'])
40 70
                 continue
71
+
72
+            # append result
41 73
             cb_res.append(result)
74
+
42 75
         results[engine_name] = cb_res
76
+
43 77
     return process_callback
44 78
 
45 79
 
80
+# score results and remove duplicates
46 81
 def score_results(results):
82
+    # calculate scoring parameters
47 83
     flat_res = filter(
48 84
         None, chain.from_iterable(izip_longest(*results.values())))
49 85
     flat_len = len(flat_res)
50 86
     engines_len = len(results)
87
+
51 88
     results = []
52 89
 
53 90
     # pass 1: deduplication + scoring
@@ -63,34 +100,53 @@ def score_results(results):
63 100
         res['engines'] = [res['engine']]
64 101
         weight = 1.0
65 102
 
103
+        # get weight of this engine if possible
66 104
         if hasattr(engines[res['engine']], 'weight'):
67 105
             weight = float(engines[res['engine']].weight)
68 106
 
107
+        # calculate score for that engine
69 108
         score = int((flat_len - i) / engines_len) * weight + 1
109
+
70 110
         duplicated = False
71 111
 
112
+        # check for duplicates
72 113
         for new_res in results:
114
+            # remove / from the end of the url if required
73 115
             p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path  # noqa
74 116
             p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path  # noqa
117
+
118
+            # check if that result is a duplicate
75 119
             if res['host'] == new_res['host'] and\
76 120
                unquote(p1) == unquote(p2) and\
77 121
                res['parsed_url'].query == new_res['parsed_url'].query and\
78 122
                res.get('template') == new_res.get('template'):
79 123
                 duplicated = new_res
80 124
                 break
125
+
126
+        # merge duplicates together
81 127
         if duplicated:
128
+            # intended: use the content with more text (NOTE: the check below compares the values directly, not their lengths)
82 129
             if res.get('content') > duplicated.get('content'):
83 130
                 duplicated['content'] = res['content']
131
+
132
+            # increase result-score
84 133
             duplicated['score'] += score
134
+
135
+            # add engine to list of result-engines
85 136
             duplicated['engines'].append(res['engine'])
137
+
138
+            # using https if possible
86 139
             if duplicated['parsed_url'].scheme == 'https':
87 140
                 continue
88 141
             elif res['parsed_url'].scheme == 'https':
89 142
                 duplicated['url'] = res['parsed_url'].geturl()
90 143
                 duplicated['parsed_url'] = res['parsed_url']
144
+
145
+        # if there is no duplicate found, append result
91 146
         else:
92 147
             res['score'] = score
93 148
             results.append(res)
149
+
94 150
     results = sorted(results, key=itemgetter('score'), reverse=True)
95 151
 
96 152
     # pass 2 : group results by category and template
@@ -99,7 +155,7 @@ def score_results(results):
99 155
 
100 156
     for i, res in enumerate(results):
101 157
         # FIXME : handle more than one category per engine
102
-        category = engines[res['engine']].categories[0] + ':' + '' if 'template' not in res else res['template'] 
158
+        category = engines[res['engine']].categories[0] + ':' + '' if 'template' not in res else res['template']
103 159
 
104 160
         current = None if category not in categoryPositions else categoryPositions[category]
105 161
 
@@ -134,6 +190,7 @@ class Search(object):
134 190
     """Search information container"""
135 191
 
136 192
     def __init__(self, request):
193
+        # init vars
137 194
         super(Search, self).__init__()
138 195
         self.query = None
139 196
         self.engines = []
@@ -141,18 +198,23 @@ class Search(object):
141 198
         self.paging = False
142 199
         self.pageno = 1
143 200
         self.lang = 'all'
201
+
202
+        # set blocked engines
144 203
         if request.cookies.get('blocked_engines'):
145 204
             self.blocked_engines = request.cookies['blocked_engines'].split(',')  # noqa
146 205
         else:
147 206
             self.blocked_engines = []
207
+
148 208
         self.results = []
149 209
         self.suggestions = []
150 210
         self.request_data = {}
151 211
 
212
+        # set specific language if set
152 213
         if request.cookies.get('language')\
153 214
            and request.cookies['language'] in (x[0] for x in language_codes):
154 215
             self.lang = request.cookies['language']
155 216
 
217
+        # set request method
156 218
         if request.method == 'POST':
157 219
             self.request_data = request.form
158 220
         else:
@@ -162,51 +224,72 @@ class Search(object):
162 224
         if not self.request_data.get('q'):
163 225
             raise Exception('noquery')
164 226
 
227
+        # set query
165 228
         self.query = self.request_data['q']
166 229
 
230
+        # set pagenumber
167 231
         pageno_param = self.request_data.get('pageno', '1')
168 232
         if not pageno_param.isdigit() or int(pageno_param) < 1:
169 233
             raise Exception('wrong pagenumber')
170 234
 
171 235
         self.pageno = int(pageno_param)
172 236
 
237
+        # parse the query for tags that change the search engine or search language
173 238
         self.parse_query()
174 239
 
175 240
         self.categories = []
176 241
 
242
+        # if engines were determined from the query, set categories using that information
177 243
         if self.engines:
178 244
             self.categories = list(set(engine['category']
179 245
                                        for engine in self.engines))
246
+
247
+        # otherwise, using defined categories to calculate which engines should be used
180 248
         else:
249
+            # set used categories
181 250
             for pd_name, pd in self.request_data.items():
182 251
                 if pd_name.startswith('category_'):
183 252
                     category = pd_name[9:]
253
+                    # if category is not found in list, skip
184 254
                     if not category in categories:
185 255
                         continue
256
+
257
+                    # add category to list
186 258
                     self.categories.append(category)
259
+
260
+            # if no category is specified for this search, use the user-defined default configuration (which is stored in a cookie)
187 261
             if not self.categories:
188 262
                 cookie_categories = request.cookies.get('categories', '')
189 263
                 cookie_categories = cookie_categories.split(',')
190 264
                 for ccateg in cookie_categories:
191 265
                     if ccateg in categories:
192 266
                         self.categories.append(ccateg)
267
+
268
+            # if still no category is specified, using general as default-category
193 269
             if not self.categories:
194 270
                 self.categories = ['general']
195 271
 
272
+            # using all engines for that search, which are declared under the specific categories
196 273
             for categ in self.categories:
197 274
                 self.engines.extend({'category': categ,
198 275
                                      'name': x.name}
199 276
                                     for x in categories[categ]
200 277
                                     if not x.name in self.blocked_engines)
201 278
 
279
+    # parse the query for tags that change the search engine or search language
202 280
     def parse_query(self):
203 281
         query_parts = self.query.split()
204 282
         modified = False
283
+
284
+        # check if language-prefix is set
205 285
         if query_parts[0].startswith(':'):
206 286
             lang = query_parts[0][1:].lower()
207 287
 
288
+            # check if any language-code is equal with declared language-codes
208 289
             for lc in language_codes:
209 290
                 lang_id, lang_name, country = map(str.lower, lc)
291
+
292
+                # if correct language-code is found, set it as new search-language
210 293
                 if lang == lang_id\
211 294
                    or lang_id.startswith(lang)\
212 295
                    or lang == lang_name\
@@ -215,56 +298,78 @@ class Search(object):
215 298
                     modified = True
216 299
                     break
217 300
 
301
+        # check if category/engine prefix is set
218 302
         elif query_parts[0].startswith('!'):
219 303
             prefix = query_parts[0][1:].replace('_', ' ')
220 304
 
305
+            # check if prefix is equal with engine shortcut
221 306
             if prefix in engine_shortcuts\
222 307
                and not engine_shortcuts[prefix] in self.blocked_engines:
223 308
                 modified = True
224 309
                 self.engines.append({'category': 'none',
225 310
                                      'name': engine_shortcuts[prefix]})
311
+
312
+            # check if prefix is equal with engine name
226 313
             elif prefix in engines\
227 314
                     and not prefix in self.blocked_engines:
228 315
                 modified = True
229 316
                 self.engines.append({'category': 'none',
230 317
                                     'name': prefix})
318
+
319
+            # check if prefix is equal to a category name
231 320
             elif prefix in categories:
232 321
                 modified = True
322
+                # use all engines for that search which are declared under that category name
233 323
                 self.engines.extend({'category': prefix,
234 324
                                     'name': engine.name}
235 325
                                     for engine in categories[prefix]
236 326
                                     if not engine in self.blocked_engines)
327
+
328
+        # if a language, category or engine was specified in this query, search for more tags that do the same
237 329
         if modified:
238 330
             self.query = self.query.replace(query_parts[0], '', 1).strip()
239 331
             self.parse_query()
240 332
 
333
+    # do search-request
241 334
     def search(self, request):
242 335
         global number_of_searches
336
+
337
+        # init vars
243 338
         requests = []
244 339
         results = {}
245 340
         suggestions = set()
341
+
342
+        # increase number of searches
246 343
         number_of_searches += 1
344
+
345
+        # set default useragent
247 346
         #user_agent = request.headers.get('User-Agent', '')
248 347
         user_agent = gen_useragent()
249 348
 
349
+        # start search-requests for all selected engines
250 350
         for selected_engine in self.engines:
251 351
             if selected_engine['name'] not in engines:
252 352
                 continue
253 353
 
254 354
             engine = engines[selected_engine['name']]
255 355
 
356
+            # if paging is not supported, skip
256 357
             if self.pageno > 1 and not engine.paging:
257 358
                 continue
258 359
 
360
+            # if search-language is set and engine does not provide language-support, skip
259 361
             if self.lang != 'all' and not engine.language_support:
260 362
                 continue
261 363
 
364
+            # set default request parameters
262 365
             request_params = default_request_params()
263 366
             request_params['headers']['User-Agent'] = user_agent
264 367
             request_params['category'] = selected_engine['category']
265 368
             request_params['started'] = datetime.now()
266 369
             request_params['pageno'] = self.pageno
267 370
             request_params['language'] = self.lang
371
+
372
+            # update request parameters dependent on search-engine (contained in engines folder)
268 373
             request_params = engine.request(self.query.encode('utf-8'),
269 374
                                             request_params)
270 375
 
@@ -272,6 +377,7 @@ class Search(object):
272 377
                 # TODO add support of offline engines
273 378
                 pass
274 379
 
380
+            # create a callback wrapper for the search engine results
275 381
             callback = make_callback(
276 382
                 selected_engine['name'],
277 383
                 results,
@@ -280,6 +386,7 @@ class Search(object):
280 386
                 request_params
281 387
             )
282 388
 
389
+            # create dictionary which contains all information about the request
283 390
             request_args = dict(
284 391
                 headers=request_params['headers'],
285 392
                 hooks=dict(response=callback),
@@ -287,6 +394,7 @@ class Search(object):
287 394
                 timeout=engine.timeout
288 395
             )
289 396
 
397
+            # specific type of request (GET or POST)
290 398
             if request_params['method'] == 'GET':
291 399
                 req = grequests.get
292 400
             else:
@@ -297,17 +405,25 @@ class Search(object):
297 405
             if not request_params['url']:
298 406
                 continue
299 407
 
408
+            # append request to list
300 409
             requests.append(req(request_params['url'], **request_args))
410
+
411
+        # send all search-requests
301 412
         grequests.map(requests)
413
+
414
+        # update engine-specific stats
302 415
         for engine_name, engine_results in results.items():
303 416
             engines[engine_name].stats['search_count'] += 1
304 417
             engines[engine_name].stats['result_count'] += len(engine_results)
305 418
 
419
+        # score results and remove duplicates
306 420
         results = score_results(results)
307 421
 
422
+        # update engine stats, using calculated score
308 423
         for result in results:
309 424
             for res_engine in result['engines']:
310 425
                 engines[result['engine']]\
311 426
                     .stats['score_count'] += result['score']
312 427
 
428
+        # return results and suggestions
313 429
         return results, suggestions