'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.

(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''
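
# This module loads every engine listed in settings.yml, fires the search
# requests concurrently via grequests, and merges, deduplicates and scores
# the responses. It targets Python 2 (izip_longest, urlparse, print
# statements), so the sketches in the comments below assume a Python 2
# interpreter.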

from os.path import realpath, dirname, splitext, join
import sys
from imp import load_source
from itertools import izip_longest, chain
from operator import itemgetter
from urlparse import urlparse
from datetime import datetime

import grequests
from flask.ext.babel import gettext

from searx import settings
from searx.utils import gen_useragent

engine_dir = dirname(realpath(__file__))

number_of_searches = 0

engines = {}

categories = {'general': []}

engine_shortcuts = {}
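

# load_module() imports one engine module (for example 'duckduckgo.py';
# the name is illustrative) from this directory, evicting any previously
# imported module of the same name so a reload picks up fresh source.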
def load_module(filename):
    modname = splitext(filename)[0]
    if modname in sys.modules:
        del sys.modules[modname]
    filepath = join(engine_dir, filename)
    module = load_source(modname, filepath)
    module.name = modname
    return module


if 'engines' not in settings or not settings['engines']:
    print '[E] Error: no engines found. Edit your settings.yml'
    exit(2)
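
# Every entry in settings['engines'] names an engine module plus optional
# overrides (categories, shortcut, timeout, weight, ...); each override is
# copied onto the module, then missing attributes get defaults.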
for engine_data in settings['engines']:
    engine_name = engine_data['engine']
    engine = load_module(engine_name + '.py')
    for param_name in engine_data:
        if param_name == 'engine':
            continue
        if param_name == 'categories':
            if engine_data['categories'] == 'none':
                engine.categories = []
            else:
                engine.categories = map(
                    str.strip, engine_data['categories'].split(','))
            continue
        setattr(engine, param_name, engine_data[param_name])
    if not hasattr(engine, 'paging'):
        engine.paging = False
    if not hasattr(engine, 'categories'):
        engine.categories = ['general']
    if not hasattr(engine, 'language_support'):
        engine.language_support = True
    if not hasattr(engine, 'timeout'):
        engine.timeout = settings['server']['request_timeout']
    if not hasattr(engine, 'shortcut'):
        engine.shortcut = ''
    # checking required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
            continue
        if getattr(engine, engine_attr) is None:
            print '[E] Engine config error: Missing attribute "{0}.{1}"'.format(engine.name, engine_attr)  # noqa
            sys.exit(1)
    engines[engine.name] = engine
    engine.stats = {
        'result_count': 0,
        'search_count': 0,
        'page_load_time': 0,
        'score_count': 0,
        'errors': 0
    }
    if hasattr(engine, 'categories'):
        for category_name in engine.categories:
            categories.setdefault(category_name, []).append(engine)
    else:
        categories['general'].append(engine)
    if engine.shortcut:
        # TODO check duplications
        engine_shortcuts[engine.shortcut] = engine.name
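

# default_request_params() returns the template dict each engine's request()
# hook fills in (url, method, headers, ...) before the query is dispatched.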
def default_request_params():
    return {
        'method': 'GET', 'headers': {}, 'data': {}, 'url': '', 'cookies': {}}
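

# make_callback() builds the grequests response hook for one engine. The
# closure writes into the shared results dict and suggestions set, so a
# single search() call can collect the output of all engines as the
# concurrent requests complete.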
def make_callback(engine_name, results, suggestions, callback, params):
    # creating a callback wrapper for the search engine results
    def process_callback(response, **kwargs):
        cb_res = []
        response.search_params = params
        engines[engine_name].stats['page_load_time'] += \
            (datetime.now() - params['started']).total_seconds()
        try:
            search_results = callback(response)
        except Exception as e:
            engines[engine_name].stats['errors'] += 1
            results[engine_name] = cb_res
            print '[E] Error with engine "{0}":\n\t{1}'.format(
                engine_name, str(e))
            return
        for result in search_results:
            result['engine'] = engine_name
            if 'suggestion' in result:
                # TODO type checks
                suggestions.add(result['suggestion'])
                continue
            cb_res.append(result)
        results[engine_name] = cb_res
    return process_callback
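

# score_results() interleaves the per-engine result lists round-robin
# (izip_longest pads the shorter lists with None, filter drops the padding),
# so position i reflects each engine's own ranking. A result's base score is
# int((flat_len - i) / engines_len) * weight + 1: earlier positions score
# higher, scaled by the engine's optional weight. Duplicates (same host,
# path, query and template) are merged, their scores summed, and the https
# variant of a duplicated URL is preferred.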
def score_results(results):
    flat_res = filter(
        None, chain.from_iterable(izip_longest(*results.values())))
    flat_len = len(flat_res)
    engines_len = len(results)
    results = []
    # deduplication + scoring
    for i, res in enumerate(flat_res):
        res['parsed_url'] = urlparse(res['url'])
        res['host'] = res['parsed_url'].netloc
        if res['host'].startswith('www.'):
            res['host'] = res['host'].replace('www.', '', 1)
        res['engines'] = [res['engine']]
        weight = 1.0
        if hasattr(engines[res['engine']], 'weight'):
            weight = float(engines[res['engine']].weight)
        score = int((flat_len - i) / engines_len) * weight + 1
        duplicated = False
        for new_res in results:
            p1 = res['parsed_url'].path[:-1] if res['parsed_url'].path.endswith('/') else res['parsed_url'].path  # noqa
            p2 = new_res['parsed_url'].path[:-1] if new_res['parsed_url'].path.endswith('/') else new_res['parsed_url'].path  # noqa
            if res['host'] == new_res['host'] and\
               p1 == p2 and\
               res['parsed_url'].query == new_res['parsed_url'].query and\
               res.get('template') == new_res.get('template'):
                duplicated = new_res
                break
        if duplicated:
            # keep the longer content snippet of the two duplicates
            if len(res.get('content') or '') > len(duplicated.get('content') or ''):
                duplicated['content'] = res['content']
            duplicated['score'] += score
            duplicated['engines'].append(res['engine'])
            if duplicated['parsed_url'].scheme == 'https':
                continue
            elif res['parsed_url'].scheme == 'https':
                duplicated['url'] = res['parsed_url'].geturl()
                duplicated['parsed_url'] = res['parsed_url']
        else:
            res['score'] = score
            results.append(res)
    return sorted(results, key=itemgetter('score'), reverse=True)
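

# search() fans the query out to the selected engines, waits for all
# responses (grequests.map) and returns the scored, merged results plus any
# suggestions. A minimal sketch of a call, assuming a Flask request object
# and an engine named 'duckduckgo' in settings.yml (both hypothetical here):
#
#   results, suggestions = search(u'free software', flask.request,
#                                 [{'name': 'duckduckgo',
#                                   'category': 'general'}])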
def search(query, request, selected_engines, pageno=1, lang='all'):
    global engines, categories, number_of_searches
    requests = []
    results = {}
    suggestions = set()
    number_of_searches += 1
    #user_agent = request.headers.get('User-Agent', '')
    user_agent = gen_useragent()
    for selected_engine in selected_engines:
        if selected_engine['name'] not in engines:
            continue
        engine = engines[selected_engine['name']]
        if pageno > 1 and not engine.paging:
            continue
        if lang != 'all' and not engine.language_support:
            continue
        request_params = default_request_params()
        request_params['headers']['User-Agent'] = user_agent
        request_params['category'] = selected_engine['category']
        request_params['started'] = datetime.now()
        request_params['pageno'] = pageno
        request_params['language'] = lang
        request_params = engine.request(query.encode('utf-8'), request_params)
        callback = make_callback(
            selected_engine['name'],
            results,
            suggestions,
            engine.response,
            request_params
        )
        request_args = dict(
            headers=request_params['headers'],
            hooks=dict(response=callback),
            cookies=request_params['cookies'],
            timeout=engine.timeout
        )
        if request_params['method'] == 'GET':
            req = grequests.get
        else:
            req = grequests.post
            request_args['data'] = request_params['data']
        # ignoring empty urls
        if not request_params['url']:
            continue
        requests.append(req(request_params['url'], **request_args))
    grequests.map(requests)
    for engine_name, engine_results in results.items():
        engines[engine_name].stats['search_count'] += 1
        engines[engine_name].stats['result_count'] += len(engine_results)
    results = score_results(results)
    for result in results:
        for res_engine in result['engines']:
            # credit the score to every engine that returned this result
            engines[res_engine].stats['score_count'] += result['score']
    return results, suggestions
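

# get_engines_stats() aggregates the per-engine counters into averages and
# normalizes each metric to a percentage of the best-performing engine.
# Presumably this feeds a statistics view (the gettext labels suggest UI
# display); this module itself only returns the sorted lists.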
def get_engines_stats():
    # TODO refactor
    pageloads = []
    results = []
    scores = []
    errors = []
    scores_per_result = []

    max_pageload = max_results = max_score = max_errors = max_score_per_result = 0  # noqa
    for engine in engines.values():
        if engine.stats['search_count'] == 0:
            continue
        results_num = \
            engine.stats['result_count'] / float(engine.stats['search_count'])
        load_times = engine.stats['page_load_time'] / float(engine.stats['search_count'])  # noqa
        if results_num:
            score = engine.stats['score_count'] / float(engine.stats['search_count'])  # noqa
            score_per_result = score / results_num
        else:
            score = score_per_result = 0.0
        max_results = max(results_num, max_results)
        max_pageload = max(load_times, max_pageload)
        max_score = max(score, max_score)
        max_score_per_result = max(score_per_result, max_score_per_result)
        max_errors = max(max_errors, engine.stats['errors'])
        pageloads.append({'avg': load_times, 'name': engine.name})
        results.append({'avg': results_num, 'name': engine.name})
        scores.append({'avg': score, 'name': engine.name})
        errors.append({'avg': engine.stats['errors'], 'name': engine.name})
        scores_per_result.append({
            'avg': score_per_result,
            'name': engine.name
        })
    for engine in pageloads:
        engine['percentage'] = int(engine['avg'] / max_pageload * 100)
    for engine in results:
        engine['percentage'] = int(engine['avg'] / max_results * 100)
    for engine in scores:
        engine['percentage'] = int(engine['avg'] / max_score * 100)
    for engine in scores_per_result:
        engine['percentage'] = int(engine['avg'] / max_score_per_result * 100)
    for engine in errors:
        if max_errors:
            engine['percentage'] = int(float(engine['avg']) / max_errors * 100)
        else:
            engine['percentage'] = 0
    return [
        (
            gettext('Page loads (sec)'),
            sorted(pageloads, key=itemgetter('avg'))
        ),
        (
            gettext('Number of results'),
            sorted(results, key=itemgetter('avg'), reverse=True)
        ),
        (
            gettext('Scores'),
            sorted(scores, key=itemgetter('avg'), reverse=True)
        ),
        (
            gettext('Scores per result'),
            sorted(scores_per_result, key=itemgetter('avg'), reverse=True)
        ),
        (
            gettext('Errors'),
            sorted(errors, key=itemgetter('avg'), reverse=True)
        ),
    ]