'''
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with searx. If not, see < http://www.gnu.org/licenses/ >.

(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''
import requests as requests_lib
import threading
import re
from itertools import izip_longest, chain
from datetime import datetime
from operator import itemgetter
from Queue import Queue
from time import time
from urlparse import urlparse, unquote

from searx.engines import (
    categories, engines
)
from searx.languages import language_codes
from searx.utils import gen_useragent
from searx.query import Query


number_of_searches = 0

def threaded_requests(requests):
    timeout_limit = max(r[2]['timeout'] for r in requests)
    search_start = time()
    for fn, url, request_args in requests:
        th = threading.Thread(
            target=fn,
            args=(url,),
            kwargs=request_args,
            name='search_request',
        )
        th.start()
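
    # wait for the worker threads, sharing a single deadline: every join()
    # only gets the time left of the largest engine timeout, so the whole
    # batch never blocks longer than timeout_limit seconds in total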
    for th in threading.enumerate():
        if th.name == 'search_request':
            remaining_time = max(0.0, timeout_limit - (time() - search_start))
            th.join(remaining_time)
            if th.isAlive():
                print('engine timeout')


# get default request parameters
def default_request_params():
    return {
        'method': 'GET',
        'headers': {},
        'data': {},
        'url': '',
        'cookies': {},
        'verify': True
    }


# create a callback wrapper for the search engine results
def make_callback(engine_name,
                  results_queue,
                  suggestions,
                  answers,
                  infoboxes,
                  callback,
                  params):

    # the returned function is registered as a requests 'response' hook and
    # therefore runs inside the worker thread of the engine request
    def process_callback(response, **kwargs):
        response.search_params = params

        # callback
        try:
            search_results = callback(response)
        except Exception, e:
            # increase errors stats
            engines[engine_name].stats['errors'] += 1

            # print engine name and specific error message
            print '[E] Error with engine "{0}":\n\t{1}'.format(
                engine_name, str(e))
            return

        # add results
        for result in search_results:
            result['engine'] = engine_name

        results_queue.put_nowait((engine_name, search_results))

        # update stats with current page-load-time
        engines[engine_name].stats['page_load_time'] += \
            (datetime.now() - params['started']).total_seconds()

    return process_callback


# return the meaningful length of the content for a result
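# (punctuation, separators and whitespace are stripped first, so that
# near-empty snippets do not win the duplicate-content comparison below)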
def content_result_len(content):
    if isinstance(content, basestring):
        content = re.sub('[,;:!?\./\\\\ ()-_]', '', content)
        return len(content)
    else:
        return 0


# score results and remove duplications
def score_results(results):
    # calculate scoring parameters
    flat_res = filter(
        None, chain.from_iterable(izip_longest(*results.values())))
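    # izip_longest interleaves the per-engine result lists round-robin, so a
    # result's position i in flat_res reflects its rank inside its own engine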
    flat_len = len(flat_res)
    engines_len = len(results)

    results = []

    # pass 1: deduplication + scoring
    for i, res in enumerate(flat_res):

        res['parsed_url'] = urlparse(res['url'])
        res['host'] = res['parsed_url'].netloc

        if res['host'].startswith('www.'):
            res['host'] = res['host'].replace('www.', '', 1)

        res['engines'] = [res['engine']]
        weight = 1.0

        # strip multiple spaces and carriage returns from content
        if res.get('content'):
            res['content'] = re.sub(' +', ' ',
                                    res['content'].strip().replace('\n', ''))

        # get weight of this engine if possible
        if hasattr(engines[res['engine']], 'weight'):
            weight = float(engines[res['engine']].weight)

        # calculate score for that engine
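        # e.g. 30 flat results from 3 engines: the top result scores
        # int((30 - 0) / 3) * weight + 1 = 11, the last one only 1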
        score = int((flat_len - i) / engines_len) * weight + 1

        # check for duplicates
        duplicated = False
        for new_res in results:
            # remove / from the end of the url if required
            p1 = res['parsed_url'].path[:-1]\
                if res['parsed_url'].path.endswith('/')\
                else res['parsed_url'].path
            p2 = new_res['parsed_url'].path[:-1]\
                if new_res['parsed_url'].path.endswith('/')\
                else new_res['parsed_url'].path

            # check if that result is a duplicate
            if res['host'] == new_res['host'] and\
               unquote(p1) == unquote(p2) and\
               res['parsed_url'].query == new_res['parsed_url'].query and\
               res.get('template') == new_res.get('template'):
                duplicated = new_res
                break

        # merge duplicates together
        if duplicated:
            # use the content with more text
            if content_result_len(res.get('content', '')) >\
                    content_result_len(duplicated.get('content', '')):
                duplicated['content'] = res['content']

            # increase result-score
            duplicated['score'] += score

            # add engine to list of result-engines
            duplicated['engines'].append(res['engine'])

            # use https if possible
            if duplicated['parsed_url'].scheme == 'https':
                continue
            elif res['parsed_url'].scheme == 'https':
                duplicated['url'] = res['parsed_url'].geturl()
                duplicated['parsed_url'] = res['parsed_url']

        # if no duplicate is found, append the result
        else:
            res['score'] = score
            results.append(res)

    results = sorted(results, key=itemgetter('score'), reverse=True)

    # pass 2: group results by category and template
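    # categoryPositions maps a 'category:template' key to the position of the
    # last group of that kind and a budget of how many more results may still
    # join it; a result is merged into the group only while the budget lasts
    # and the group is less than 20 positions behind the current end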
    gresults = []
    categoryPositions = {}

    for i, res in enumerate(results):

        # group key: primary category of the engine plus the result template
        # FIXME : handle more than one category per engine
        category = engines[res['engine']].categories[0] + ':' + \
            res.get('template', '')

        current = None if category not in categoryPositions\
            else categoryPositions[category]

        # group with previous results using the same category
        # if the group can accept more results and is not too far
        # from the current position
        if current is not None and (current['count'] > 0)\
                and (len(gresults) - current['index'] < 20):
            # group with the previous results using
            # the same category as this one
            index = current['index']
            gresults.insert(index, res)

            # update every index after the current one
            # (including the current one)
            for k in categoryPositions:
                v = categoryPositions[k]['index']
                if v >= index:
                    categoryPositions[k]['index'] = v + 1

            # update this category
            current['count'] -= 1

        else:
            # start a new group for this category
            gresults.append(res)

            # update categoryIndex
            categoryPositions[category] = {'index': len(gresults), 'count': 8}

    # return gresults
    return gresults


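# merge the urls, attributes and content of two infoboxes that share the
# same id: urls and attributes are deduplicated, the longer content wins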
def merge_two_infoboxes(infobox1, infobox2):
    if 'urls' in infobox2:
        urls1 = infobox1.get('urls', None)
        if urls1 is None:
            urls1 = []
            infobox1['urls'] = urls1

        urlSet = set()
        for url in infobox1.get('urls', []):
            urlSet.add(url.get('url', None))

        for url in infobox2.get('urls', []):
            if url.get('url', None) not in urlSet:
                urls1.append(url)

    if 'attributes' in infobox2:
        attributes1 = infobox1.get('attributes', None)
        if attributes1 is None:
            attributes1 = []
            infobox1['attributes'] = attributes1

        attributeSet = set()
        for attribute in infobox1.get('attributes', []):
            if attribute.get('label', None) not in attributeSet:
                attributeSet.add(attribute.get('label', None))

        for attribute in infobox2.get('attributes', []):
            if attribute.get('label', None) not in attributeSet:
                attributes1.append(attribute)

    if 'content' in infobox2:
        content1 = infobox1.get('content', None)
        content2 = infobox2.get('content', '')
        if content1 is not None:
            if content_result_len(content2) > content_result_len(content1):
                infobox1['content'] = content2
        else:
            infobox1['content'] = content2


def merge_infoboxes(infoboxes):
    results = []
    infoboxes_id = {}
    for infobox in infoboxes:
        add_infobox = True
        infobox_id = infobox.get('id', None)
        if infobox_id is not None:
            existingIndex = infoboxes_id.get(infobox_id, None)
            if existingIndex is not None:
                merge_two_infoboxes(results[existingIndex], infobox)
                add_infobox = False

        if add_infobox:
            results.append(infobox)
            infoboxes_id[infobox_id] = len(results) - 1

    return results


class Search(object):

    """Search information container"""

    def __init__(self, request):
        # init vars
        super(Search, self).__init__()
        self.query = None
        self.engines = []
        self.categories = []
        self.paging = False
        self.pageno = 1
        self.lang = 'all'

        # set blocked engines
        if request.cookies.get('blocked_engines'):
            self.blocked_engines = request.cookies['blocked_engines'].split(',')  # noqa
        else:
            self.blocked_engines = []

        self.results = []
        self.suggestions = []
        self.answers = []
        self.infoboxes = []
        self.request_data = {}

        # set specific language if set
        if request.cookies.get('language')\
           and request.cookies['language'] in (x[0] for x in language_codes):
            self.lang = request.cookies['language']

        # set request method
        if request.method == 'POST':
            self.request_data = request.form
        else:
            self.request_data = request.args

        # TODO better exceptions
        if not self.request_data.get('q'):
            raise Exception('noquery')

        # set page number
        pageno_param = self.request_data.get('pageno', '1')
        if not pageno_param.isdigit() or int(pageno_param) < 1:
            raise Exception('wrong pagenumber')

        self.pageno = int(pageno_param)

        # parse query; tags inside the query can change
        # the search engines or the search language
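        # (a '!engine' / '!category' prefix selects engines, a ':lang'
        # prefix forces the search language; see searx/query.py)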
        query_obj = Query(self.request_data['q'], self.blocked_engines)
        query_obj.parse_query()

        # set query
        self.query = query_obj.getSearchQuery()

        # get last selected language in query, if possible
        # TODO support search with multiple languages
        if len(query_obj.languages):
            self.lang = query_obj.languages[-1]

        self.engines = query_obj.engines

        self.categories = []

        # if engines are calculated from the query,
        # set categories by using that information
        if self.engines:
            self.categories = list(set(engine['category']
                                       for engine in self.engines))

        # otherwise, use the selected categories to
        # calculate which engines should be used
        else:
            # set used categories
            for pd_name, pd in self.request_data.items():
                if pd_name.startswith('category_'):
                    category = pd_name[9:]
                    # if category is not found in list, skip
                    if category not in categories:
                        continue

                    # add category to list
                    self.categories.append(category)

            # if no category is specified for this search,
            # use the user-defined default configuration
            # (stored in a cookie)
            if not self.categories:
                cookie_categories = request.cookies.get('categories', '')
                cookie_categories = cookie_categories.split(',')
                for ccateg in cookie_categories:
                    if ccateg in categories:
                        self.categories.append(ccateg)

            # if still no category is specified, use 'general'
            # as the default category
            if not self.categories:
                self.categories = ['general']

            # use all engines which are declared
            # under the selected categories
            for categ in self.categories:
                self.engines.extend({'category': categ,
                                     'name': x.name}
                                    for x in categories[categ]
                                    if x.name not in self.blocked_engines)

    # do search-request
    def search(self, request):
        global number_of_searches

        # init vars
        requests = []
        results_queue = Queue()

        suggestions = set()
        answers = set()
        infoboxes = []

        # increase number of searches
        number_of_searches += 1

        # set default useragent
        # user_agent = request.headers.get('User-Agent', '')
        user_agent = gen_useragent()

        # start search-request for all selected engines
        for selected_engine in self.engines:
            if selected_engine['name'] not in engines:
                continue

            engine = engines[selected_engine['name']]

            # if paging is not supported, skip
            if self.pageno > 1 and not engine.paging:
                continue

            # if search-language is set and engine does not
            # provide language-support, skip
            if self.lang != 'all' and not engine.language_support:
                continue

            # set default request parameters
            request_params = default_request_params()
            request_params['headers']['User-Agent'] = user_agent
            request_params['category'] = selected_engine['category']
            request_params['started'] = datetime.now()
            request_params['pageno'] = self.pageno
            request_params['language'] = self.lang

            # update request parameters depending on the
            # search-engine (contained in the engines folder)
            request_params = engine.request(self.query.encode('utf-8'),
                                            request_params)

            if request_params['url'] is None:
                # TODO add support of offline engines
                pass

            # create a callback wrapper for the search engine results
            callback = make_callback(
                selected_engine['name'],
                results_queue,
                suggestions,
                answers,
                infoboxes,
                engine.response,
                request_params
            )

            # create a dictionary which contains all
            # information about the request
            request_args = dict(
                headers=request_params['headers'],
                hooks=dict(response=callback),
                cookies=request_params['cookies'],
                timeout=engine.timeout,
                verify=request_params['verify']
            )

            # specific type of request (GET or POST)
            if request_params['method'] == 'GET':
                req = requests_lib.get
            else:
                req = requests_lib.post
                request_args['data'] = request_params['data']

            # ignore empty urls
            if not request_params['url']:
                continue

            # append request to list
            requests.append((req, request_params['url'], request_args))

        # send all search-requests
        threaded_requests(requests)

        results = {}
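        # drain the results queue: suggestions, answers and infoboxes are
        # pulled out of each engine's result list (the
        # `engine_results.remove(x) is None` clause removes them while
        # iterating over a copy of the list) before the remaining results
        # are kept for scoring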
        while not results_queue.empty():
            engine_name, engine_results = results_queue.get_nowait()

            # TODO type checks
            [suggestions.add(x['suggestion'])
             for x in list(engine_results)
             if 'suggestion' in x
             and engine_results.remove(x) is None]

            [answers.add(x['answer'])
             for x in list(engine_results)
             if 'answer' in x
             and engine_results.remove(x) is None]

            infoboxes.extend(x for x in list(engine_results)
                             if 'infobox' in x
                             and engine_results.remove(x) is None)

            results[engine_name] = engine_results

        # update engine-specific stats
        for engine_name, engine_results in results.items():
            engines[engine_name].stats['search_count'] += 1
            engines[engine_name].stats['result_count'] += len(engine_results)

        # score results and remove duplications
        results = score_results(results)

        # merge infoboxes according to their ids
        infoboxes = merge_infoboxes(infoboxes)

        # update engine stats, using the calculated score;
        # every engine that contributed to a merged result gets the credit
        for result in results:
            for res_engine in result['engines']:
                engines[res_engine]\
                    .stats['score_count'] += result['score']

        # return results, suggestions, answers and infoboxes
        return results, suggestions, answers, infoboxes