123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432
  1. '''
  2. searx is free software: you can redistribute it and/or modify
  3. it under the terms of the GNU Affero General Public License as published by
  4. the Free Software Foundation, either version 3 of the License, or
  5. (at your option) any later version.
  6. searx is distributed in the hope that it will be useful,
  7. but WITHOUT ANY WARRANTY; without even the implied warranty of
  8. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  9. GNU Affero General Public License for more details.
  10. You should have received a copy of the GNU Affero General Public License
  11. along with searx. If not, see < http://www.gnu.org/licenses/ >.
  12. (C) 2013- by Adam Tauber, <asciimoo@gmail.com>
  13. '''
  14. import gc
  15. import threading
  16. from thread import start_new_thread
  17. from time import time
  18. from uuid import uuid4
  19. import requests.exceptions
  20. import searx.poolrequests as requests_lib
  21. from searx.engines import (
  22. categories, engines
  23. )
  24. from searx.answerers import ask
  25. from searx.utils import gen_useragent
  26. from searx.query import RawTextQuery, SearchQuery
  27. from searx.results import ResultContainer
  28. from searx import logger
  29. from searx.plugins import plugins
  30. from searx.languages import language_codes
  31. from searx.exceptions import SearxParameterException
# module-scoped child logger for the search machinery
logger = logger.getChild('search')

# process-wide counter of searches served (incremented in Search.search)
number_of_searches = 0

# lowercase set of every supported language code, plus the 'all' wildcard
language_code_set = set(l[0].lower() for l in language_codes)
language_code_set.add('all')
  36. def send_http_request(engine, request_params, start_time, timeout_limit):
  37. # for page_load_time stats
  38. time_before_request = time()
  39. # create dictionary which contain all
  40. # informations about the request
  41. request_args = dict(
  42. headers=request_params['headers'],
  43. cookies=request_params['cookies'],
  44. timeout=timeout_limit,
  45. verify=request_params['verify']
  46. )
  47. # specific type of request (GET or POST)
  48. if request_params['method'] == 'GET':
  49. req = requests_lib.get
  50. else:
  51. req = requests_lib.post
  52. request_args['data'] = request_params['data']
  53. # send the request
  54. response = req(request_params['url'], **request_args)
  55. # is there a timeout (no parsing in this case)
  56. timeout_overhead = 0.2 # seconds
  57. time_after_request = time()
  58. search_duration = time_after_request - start_time
  59. if search_duration > timeout_limit + timeout_overhead:
  60. raise requests.exceptions.Timeout(response=response)
  61. with threading.RLock():
  62. # no error : reset the suspend variables
  63. engine.continuous_errors = 0
  64. engine.suspend_end_time = 0
  65. # update stats with current page-load-time
  66. # only the HTTP request
  67. engine.stats['page_load_time'] += time_after_request - time_before_request
  68. engine.stats['page_load_count'] += 1
  69. # everything is ok : return the response
  70. return response
  71. def search_one_request(engine, query, request_params, start_time, timeout_limit):
  72. # update request parameters dependent on
  73. # search-engine (contained in engines folder)
  74. engine.request(query, request_params)
  75. # ignoring empty urls
  76. if request_params['url'] is None:
  77. return []
  78. if not request_params['url']:
  79. return []
  80. # send request
  81. response = send_http_request(engine, request_params, start_time, timeout_limit)
  82. # parse the response
  83. response.search_params = request_params
  84. return engine.response(response)
  85. def search_one_request_safe(engine_name, query, request_params, result_container, start_time, timeout_limit):
  86. engine = engines[engine_name]
  87. try:
  88. # send requests and parse the results
  89. search_results = search_one_request(engine, query, request_params, start_time, timeout_limit)
  90. # add results
  91. result_container.extend(engine_name, search_results)
  92. # update engine time when there is no exception
  93. with threading.RLock():
  94. engine.stats['engine_time'] += time() - start_time
  95. engine.stats['engine_time_count'] += 1
  96. return True
  97. except Exception as e:
  98. engine.stats['errors'] += 1
  99. search_duration = time() - start_time
  100. requests_exception = False
  101. if (issubclass(e.__class__, requests.exceptions.Timeout)):
  102. # requests timeout (connect or read)
  103. logger.error("engine {0} : HTTP requests timeout"
  104. "(search duration : {1} s, timeout: {2} s) : {3}"
  105. .format(engine_name, search_duration, timeout_limit, e.__class__.__name__))
  106. requests_exception = True
  107. elif (issubclass(e.__class__, requests.exceptions.RequestException)):
  108. # other requests exception
  109. logger.exception("engine {0} : requests exception"
  110. "(search duration : {1} s, timeout: {2} s) : {3}"
  111. .format(engine_name, search_duration, timeout_limit, e))
  112. requests_exception = True
  113. else:
  114. # others errors
  115. logger.exception('engine {0} : exception : {1}'.format(engine_name, e))
  116. # update continuous_errors / suspend_end_time
  117. if requests_exception:
  118. with threading.RLock():
  119. engine.continuous_errors += 1
  120. engine.suspend_end_time = time() + min(60, engine.continuous_errors)
  121. #
  122. return False
  123. def search_multiple_requests(requests, result_container, start_time, timeout_limit):
  124. search_id = uuid4().__str__()
  125. for engine_name, query, request_params in requests:
  126. th = threading.Thread(
  127. target=search_one_request_safe,
  128. args=(engine_name, query, request_params, result_container, start_time, timeout_limit),
  129. name=search_id,
  130. )
  131. th._engine_name = engine_name
  132. th.start()
  133. for th in threading.enumerate():
  134. if th.name == search_id:
  135. remaining_time = max(0.0, timeout_limit - (time() - start_time))
  136. th.join(remaining_time)
  137. if th.isAlive():
  138. logger.warning('engine timeout: {0}'.format(th._engine_name))
  139. # get default reqest parameter
  140. def default_request_params():
  141. return {
  142. 'method': 'GET',
  143. 'headers': {},
  144. 'data': {},
  145. 'url': '',
  146. 'cookies': {},
  147. 'verify': True
  148. }
def get_search_query_from_webapp(preferences, form):
    """Validate the submitted *form* and build a SearchQuery from it.

    *preferences* supplies the per-user defaults (language, safesearch,
    categories, disabled engines); *form* is the request's key/value
    mapping.  Raises SearxParameterException for any invalid parameter.
    """
    # no text for the query ?
    if not form.get('q'):
        raise SearxParameterException('q', '')

    # set blocked engines
    disabled_engines = preferences.engines.get_disabled()

    # parse query, if tags are set, which change
    # the search engine or search-language
    raw_text_query = RawTextQuery(form['q'], disabled_engines)
    raw_text_query.parse_query()

    # query text with the bang/language tags stripped
    query = raw_text_query.getSearchQuery()

    # get and check page number (must be a positive integer)
    pageno_param = form.get('pageno', '1')
    if not pageno_param.isdigit() or int(pageno_param) < 1:
        raise SearxParameterException('pageno', pageno_param)
    query_pageno = int(pageno_param)

    # get language
    # set specific language if set on request, query or preferences
    # TODO support search with multible languages
    if len(raw_text_query.languages):
        # a :lang tag in the query wins; the last one parsed is used
        query_lang = raw_text_query.languages[-1]
    elif 'language' in form:
        query_lang = form.get('language')
    else:
        query_lang = preferences.get_value('language')

    # check language against the known codes (plus the 'all' wildcard)
    if query_lang.lower() not in language_code_set:
        raise SearxParameterException('language', query_lang)

    # get safesearch
    if 'safesearch' in form:
        query_safesearch = form.get('safesearch')
        # first check safesearch
        if not query_safesearch.isdigit():
            raise SearxParameterException('safesearch', query_safesearch)
        query_safesearch = int(query_safesearch)
    else:
        query_safesearch = preferences.get_value('safesearch')

    # safesearch : second check (0 = None, 1 = Moderate, 2 = Strict)
    if query_safesearch < 0 or query_safesearch > 2:
        raise SearxParameterException('safesearch', query_safesearch)

    # get time_range
    query_time_range = form.get('time_range')

    # check time_range ('None'/''/None all mean "no restriction")
    if query_time_range not in ('None', None, '', 'day', 'week', 'month', 'year'):
        raise SearxParameterException('time_range', query_time_range)

    # engines selected by bangs in the query text
    query_engines = raw_text_query.engines

    # query_categories
    query_categories = []

    # if engines are calculated from query,
    # set categories by using that informations
    if query_engines and raw_text_query.specific:
        query_categories = list(set(engine['category']
                                    for engine in query_engines))

    # otherwise, using defined categories to
    # calculate which engines should be used
    else:
        # set categories/engines
        load_default_categories = True
        for pd_name, pd in form.items():
            if pd_name == 'categories':
                # comma-separated category list (py2: unicode.strip)
                query_categories.extend(categ for categ in map(unicode.strip, pd.split(',')) if categ in categories)
            elif pd_name == 'engines':
                # comma-separated engine list; each engine contributes its
                # first declared category
                pd_engines = [{'category': engines[engine].categories[0],
                               'name': engine}
                              for engine in map(unicode.strip, pd.split(',')) if engine in engines]
                if pd_engines:
                    query_engines.extend(pd_engines)
                    load_default_categories = False
            elif pd_name.startswith('category_'):
                category = pd_name[9:]

                # if category is not found in list, skip
                if category not in categories:
                    continue

                if pd != 'off':
                    # add category to list
                    query_categories.append(category)
                elif category in query_categories:
                    # remove category from list if property is set to 'off'
                    query_categories.remove(category)

        if not load_default_categories:
            if not query_categories:
                # derive the categories from the explicitly chosen engines
                query_categories = list(set(engine['category']
                                            for engine in query_engines))
        else:
            # if no category is specified for this search,
            # using user-defined default-configuration which
            # (is stored in cookie)
            if not query_categories:
                cookie_categories = preferences.get_value('categories')
                for ccateg in cookie_categories:
                    if ccateg in categories:
                        query_categories.append(ccateg)

            # if still no category is specified, using general
            # as default-category
            if not query_categories:
                query_categories = ['general']

            # using all engines for that search, which are
            # declared under the specific categories
            for categ in query_categories:
                query_engines.extend({'category': categ,
                                      'name': engine.name}
                                     for engine in categories[categ]
                                     if (engine.name, categ) not in disabled_engines)

    return SearchQuery(query, query_engines, query_categories,
                       query_lang, query_safesearch, query_pageno, query_time_range)
class Search(object):
    """Search information container: runs one query against the selected
    engines and collects everything into a ResultContainer."""

    def __init__(self, search_query):
        # init vars
        super(Search, self).__init__()
        # the validated SearchQuery to execute
        self.search_query = search_query
        # accumulates results/answers/suggestions from all engines
        self.result_container = ResultContainer()

    # do search-request
    def search(self):
        """Execute the search and return the filled ResultContainer.

        Answerers short-circuit the engine requests entirely.  Otherwise
        one request per eligible engine is prepared and dispatched through
        search_multiple_requests with a shared timeout budget.
        """
        global number_of_searches

        # start time
        start_time = time()

        # answeres ? (if any answerer matches, skip the engines altogether)
        answerers_results = ask(self.search_query)

        if answerers_results:
            for results in answerers_results:
                self.result_container.extend('answer', results)
            return self.result_container

        # init vars
        requests = []

        # increase number of searches
        number_of_searches += 1

        # set default useragent
        # user_agent = request.headers.get('User-Agent', '')
        user_agent = gen_useragent()

        search_query = self.search_query

        # max of all selected engine timeout
        timeout_limit = 0

        # start search-reqest for all selected engines
        for selected_engine in search_query.engines:
            if selected_engine['name'] not in engines:
                continue

            engine = engines[selected_engine['name']]

            # skip suspended engines (back-off set after repeated errors)
            if engine.suspend_end_time >= time():
                logger.debug('Engine currently suspended: %s', selected_engine['name'])
                continue

            # if paging is not supported, skip
            if search_query.pageno > 1 and not engine.paging:
                continue

            # if search-language is set and engine does not
            # provide language-support, skip
            if search_query.lang != 'all' and not engine.language_support:
                continue

            # if time_range is not supported, skip
            if search_query.time_range and not engine.time_range_support:
                continue

            # set default request parameters
            request_params = default_request_params()
            request_params['headers']['User-Agent'] = user_agent
            request_params['category'] = selected_engine['category']
            request_params['pageno'] = search_query.pageno

            # an engine-level language setting overrides the query language
            if hasattr(engine, 'language') and engine.language:
                request_params['language'] = engine.language
            else:
                request_params['language'] = search_query.lang

            # 0 = None, 1 = Moderate, 2 = Strict
            request_params['safesearch'] = search_query.safesearch
            request_params['time_range'] = search_query.time_range

            # append request to list (query encoded to bytes — py2 style)
            requests.append((selected_engine['name'], search_query.query.encode('utf-8'), request_params))

            # update timeout_limit: the budget is the slowest engine's timeout
            timeout_limit = max(timeout_limit, engine.timeout)

        if requests:
            # send all search-request
            search_multiple_requests(requests, self.result_container, start_time, timeout_limit)
            # opportunistic GC on a background thread after the fan-out
            start_new_thread(gc.collect, tuple())

        # return results, suggestions, answers and infoboxes
        return self.result_container
  325. class SearchWithPlugins(Search):
  326. """Similar to the Search class but call the plugins."""
  327. def __init__(self, search_query, request):
  328. super(SearchWithPlugins, self).__init__(search_query)
  329. self.request = request
  330. def search(self):
  331. if plugins.call('pre_search', self.request, self):
  332. super(SearchWithPlugins, self).search()
  333. plugins.call('post_search', self.request, self)
  334. results = self.result_container.get_ordered_results()
  335. for result in results:
  336. plugins.call('on_result', self.request, self, result)
  337. return self.result_container