results.py

import re
from collections import defaultdict
from operator import itemgetter
from threading import RLock
from urlparse import urlparse, unquote
from searx.engines import engines

CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile('[,;:!?\./\\\\ ()-_]', re.M | re.U)
WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U)


# return the meaningful length of the content for a result
def result_content_len(content):
    if isinstance(content, basestring):
        return len(CONTENT_LEN_IGNORED_CHARS_REGEX.sub('', content))
    else:
        return 0
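
# Illustrative values (not from the original code): punctuation, spaces and the other
# ignored characters do not count, so result_content_len(u'Hello, world!') == 10,
# and any non-string content (e.g. None) has length 0.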


def compare_urls(url_a, url_b):
    if url_a.netloc != url_b.netloc or url_a.query != url_b.query:
        return False

    # remove / from the end of the url if required
    path_a = url_a.path[:-1]\
        if url_a.path.endswith('/')\
        else url_a.path
    path_b = url_b.path[:-1]\
        if url_b.path.endswith('/')\
        else url_b.path

    return unquote(path_a) == unquote(path_b)
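
# Illustrative values (not from the original code): the scheme is never compared, and a
# trailing slash and percent-encoding of the path are ignored, so
# urlparse('https://example.com/a%20b/') and urlparse('http://example.com/a b')
# compare as equal, while urls with different query strings do not.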


def merge_two_infoboxes(infobox1, infobox2):
    # add urls from infobox2 whose url is not already present in infobox1
    if 'urls' in infobox2:
        urls1 = infobox1.get('urls', None)
        if urls1 is None:
            urls1 = []
            infobox1['urls'] = urls1

        urlSet = set()
        for url in infobox1.get('urls', []):
            urlSet.add(url.get('url', None))

        for url in infobox2.get('urls', []):
            if url.get('url', None) not in urlSet:
                urls1.append(url)

    # keep the existing image, otherwise take the one from infobox2
    if 'img_src' in infobox2:
        img1 = infobox1.get('img_src', None)
        img2 = infobox2.get('img_src')
        if img1 is None:
            infobox1['img_src'] = img2

    # append the attributes from infobox2
    if 'attributes' in infobox2:
        attributes1 = infobox1.get('attributes', None)
        if attributes1 is None:
            attributes1 = []
            infobox1['attributes'] = attributes1

        attributeSet = set()
        for attribute in infobox1.get('attributes', []):
            if attribute.get('label', None) not in attributeSet:
                attributeSet.add(attribute.get('label', None))

        for attribute in infobox2.get('attributes', []):
            attributes1.append(attribute)

    # keep whichever content is longer
    if 'content' in infobox2:
        content1 = infobox1.get('content', None)
        content2 = infobox2.get('content', '')
        if content1 is not None:
            if result_content_len(content2) > result_content_len(content1):
                infobox1['content'] = content2
        else:
            infobox1['content'] = content2
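
# Illustrative values (not from the original code):
#   a = {'id': 'X', 'content': 'short', 'urls': [{'url': 'https://example.org'}]}
#   b = {'id': 'X', 'content': 'a longer description', 'img_src': 'img.png',
#        'urls': [{'url': 'https://example.org'}, {'url': 'https://example.com'}]}
#   merge_two_infoboxes(a, b)
# afterwards a holds one entry per distinct url, the image from b and the longer content.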


def result_score(result):
    # multiply the weights of all engines that returned this result
    weight = 1.0

    for result_engine in result['engines']:
        if hasattr(engines[result_engine], 'weight'):
            weight *= float(engines[result_engine].weight)

    occurrences = len(result['positions'])

    # each position contributes (occurrences * weight) / position to the score
    return sum((occurrences * weight) / position for position in result['positions'])
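
# Worked example (not from the original code): a result found at positions 1 and 3 by
# engines with a combined weight of 1.0 has occurrences == 2, so its score is
# 2/1 + 2/3, roughly 2.67; more occurrences and better positions both raise the score.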


class ResultContainer(object):
    """Merge, deduplicate and order the results returned by several engines"""

    def __init__(self):
        super(ResultContainer, self).__init__()
        self.results = defaultdict(list)
        self._merged_results = []
        self.infoboxes = []
        self._infobox_ids = {}
        self.suggestions = set()
        self.answers = set()
        self.number_of_results = 0

    def extend(self, engine_name, results):
        # pull suggestions, answers, infoboxes and result counts out of the result list
        for result in list(results):
            if 'suggestion' in result:
                self.suggestions.add(result['suggestion'])
                results.remove(result)
            elif 'answer' in result:
                self.answers.add(result['answer'])
                results.remove(result)
            elif 'infobox' in result:
                self._merge_infobox(result)
                results.remove(result)
            elif 'number_of_results' in result:
                self.number_of_results = max(self.number_of_results, result['number_of_results'])
                results.remove(result)

        # update the engine statistics
        with RLock():
            engines[engine_name].stats['search_count'] += 1
            engines[engine_name].stats['result_count'] += len(results)

        if not results:
            return

        self.results[engine_name].extend(results)

        # merge the remaining standard results, positions are 1-based
        for i, result in enumerate(results):
            try:
                result['url'] = result['url'].decode('utf-8')
            except:
                pass
            position = i + 1
            self._merge_result(result, position)
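
    # Items that are not suggestions, answers, infoboxes or result counts are treated as
    # standard results; _merge_result() expects each of them to carry at least a 'url'
    # and an 'engine' key.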

    def _merge_infobox(self, infobox):
        # merge with an already collected infobox that has the same id, otherwise append
        add_infobox = True
        infobox_id = infobox.get('id', None)
        if infobox_id is not None:
            existingIndex = self._infobox_ids.get(infobox_id, None)
            if existingIndex is not None:
                merge_two_infoboxes(self.infoboxes[existingIndex], infobox)
                add_infobox = False

        if add_infobox:
            self.infoboxes.append(infobox)
            self._infobox_ids[infobox_id] = len(self.infoboxes) - 1

    def _merge_result(self, result, position):
        result['parsed_url'] = urlparse(result['url'])

        # if the result has no scheme, use http as default
        if not result['parsed_url'].scheme:
            result['parsed_url'] = result['parsed_url']._replace(scheme="http")
            result['url'] = result['parsed_url'].geturl()

        result['host'] = result['parsed_url'].netloc
        if result['host'].startswith('www.'):
            result['host'] = result['host'].replace('www.', '', 1)

        result['engines'] = [result['engine']]

        # strip multiple spaces and carriage returns from content
        if result.get('content'):
            result['content'] = WHITESPACE_REGEX.sub(' ', result['content'])

        # check for duplicates
        duplicated = False
        for merged_result in self._merged_results:
            if compare_urls(result['parsed_url'], merged_result['parsed_url'])\
               and result.get('template') == merged_result.get('template'):
                duplicated = merged_result
                break

        # merge duplicates together
        if duplicated:
            # use the content with more text
            if result_content_len(result.get('content', '')) >\
                    result_content_len(duplicated.get('content', '')):
                duplicated['content'] = result['content']

            # add the new position
            duplicated['positions'].append(position)

            # add the engine to the list of result-engines
            duplicated['engines'].append(result['engine'])

            # use https if possible
            if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https':
                duplicated['url'] = result['parsed_url'].geturl()
                duplicated['parsed_url'] = result['parsed_url']

        # if no duplicate was found, append the result
        else:
            result['positions'] = [position]
            with RLock():
                self._merged_results.append(result)
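
    # A merged duplicate accumulates every 1-based position in 'positions' and every
    # engine name in 'engines'; both lists are what result_score() reads later.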

    def get_ordered_results(self):
        # pass 1 : score every merged result and update the engine statistics
        for result in self._merged_results:
            score = result_score(result)
            result['score'] = score
            with RLock():
                for result_engine in result['engines']:
                    engines[result_engine].stats['score_count'] += score

        results = sorted(self._merged_results, key=itemgetter('score'), reverse=True)

        # pass 2 : group results by category and template
        gresults = []
        categoryPositions = {}

        for i, res in enumerate(results):
            # FIXME : handle more than one category per engine
            category = engines[res['engine']].categories[0] + ':' + ''\
                if 'template' not in res\
                else res['template']

            current = None if category not in categoryPositions\
                else categoryPositions[category]

            # group with previous results using the same category
            # if the group can accept more results and is not too far
            # from the current position
            if current is not None and (current['count'] > 0)\
                    and (len(gresults) - current['index'] < 20):
                # group with the previous results using
                # the same category with this one
                index = current['index']
                gresults.insert(index, res)

                # update every index after the current one
                # (including the current one)
                for k in categoryPositions:
                    v = categoryPositions[k]['index']
                    if v >= index:
                        categoryPositions[k]['index'] = v + 1

                # update this category
                current['count'] -= 1

            else:
                # start a new group for this category
                gresults.append(res)

                # update the category index
                categoryPositions[category] = {'index': len(gresults), 'count': 8}

        return gresults

    def results_length(self):
        return len(self._merged_results)
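

# Usage sketch (hypothetical engine names and values; in searx the container is driven
# by the search code, which also registers the engines and their stats):
#
#   container = ResultContainer()
#   container.extend('engine_a', [
#       {'url': 'https://example.org/doc', 'title': 'Doc',
#        'content': 'text', 'engine': 'engine_a'},
#   ])
#   container.extend('engine_b', [
#       {'url': 'http://example.org/doc/', 'title': 'Doc',
#        'content': 'some longer text', 'engine': 'engine_b'},
#   ])
#   container.results_length()          # 1 -- both urls point to the same document
#   for res in container.get_ordered_results():
#       print res['url'], res['score'], res['engines']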