"""
searx is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

searx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with searx. If not, see <http://www.gnu.org/licenses/>.

(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
"""

import threading
import re
import searx.poolrequests as requests_lib
from itertools import izip_longest, chain
from operator import itemgetter
from Queue import Queue
from time import time
from urlparse import urlparse, unquote
from searx import settings
from searx.engines import engines

from searx.utils import gen_useragent, prettify_url, highlight_content, html_to_text
from searx.plugins import plugins
from searx.query import Query
from searx import logger

logger = logger.getChild('search')

number_of_searches = 0


def search_request_wrapper(fn, url, engine_name, **kwargs):
    try:
        return fn(url, **kwargs)
    except Exception:
        # increase error stats
        engines[engine_name].stats['errors'] += 1

        # log engine name and the specific error message
        logger.exception('engine crash: {0}'.format(engine_name))
        return


def threaded_requests(requests):
    timeout_limit = max(r[2]['timeout'] for r in requests)
    search_start = time()
    for fn, url, request_args, engine_name in requests:
        request_args['timeout'] = timeout_limit
        th = threading.Thread(
            target=search_request_wrapper,
            args=(fn, url, engine_name),
            kwargs=request_args,
            name='search_request',
        )
        th._engine_name = engine_name
        th.start()

    for th in threading.enumerate():
        if th.name == 'search_request':
            remaining_time = max(0.0, timeout_limit - (time() - search_start))
            th.join(remaining_time)
            if th.isAlive():
                logger.warning('engine timeout: {0}'.format(th._engine_name))
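

# Illustrative sketch, not part of the original flow: each entry in
# `requests` is a (fn, url, request_args, engine_name) tuple; the url and
# engine name below are hypothetical.
#
#   example_request = (requests_lib.get,
#                      'https://example.com/search?q=searx',
#                      {'timeout': 3.0, 'headers': {}, 'cookies': {},
#                       'verify': True},
#                      'example engine')
#   threaded_requests([example_request])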


# get default request parameters
def default_request_params():
    return {
        'method': 'GET',
        'headers': {},
        'data': {},
        'url': '',
        'cookies': {},
        'verify': True
    }


# create a callback wrapper for the search engine results
def make_callback(engine_name, results_queue, callback, params):
    # creating a callback wrapper for the search engine results
    def process_callback(response, **kwargs):
        # compare is_redirect against the literal True value, because
        # response can be a Mock object, and any attribute access on a
        # Mock returns something truthy.
        if response.is_redirect is True:
            logger.debug('{0} redirect on: {1}'.format(engine_name, response))
            return

        response.search_params = params

        timeout_overhead = 0.2  # seconds
        search_duration = time() - params['started']
        timeout_limit = engines[engine_name].timeout + timeout_overhead
        if search_duration > timeout_limit:
            engines[engine_name].stats['page_load_time'] += timeout_limit
            engines[engine_name].stats['errors'] += 1
            return

        # callback
        search_results = callback(response)

        # add results
        for result in search_results:
            result['engine'] = engine_name

        results_queue.put_nowait((engine_name, search_results))

        # update stats with the current page-load time
        engines[engine_name].stats['page_load_time'] += search_duration

    return process_callback
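

# Sketch of how the wrapper is used (names as defined in this module;
# the engine name is hypothetical): the returned callback is attached
# as a requests response hook, so it runs on the worker thread:
#
#   callback = make_callback('example engine', results_queue,
#                            engine.response, request_params)
#   requests_lib.get(url, hooks=dict(response=callback), timeout=3.0)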


# return the meaningful length of the content for a result
def content_result_len(content):
    if isinstance(content, basestring):
        content = re.sub(r'[,;:!?\./\\ ()\-_]', '', content)
        return len(content)
    else:
        return 0
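

# For example (illustrative): content_result_len('Lorem ipsum, dolor sit!')
# returns 18, because only the characters left after stripping punctuation
# and spaces are counted.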


# score results and remove duplicates
def score_results(results):
    # calculate scoring parameters
    flat_res = filter(
        None, chain.from_iterable(izip_longest(*results.values())))
    flat_len = len(flat_res)
    engines_len = len(results)

    results = []

    # pass 1: deduplication + scoring
    for i, res in enumerate(flat_res):

        res['parsed_url'] = urlparse(res['url'])

        res['host'] = res['parsed_url'].netloc

        if res['host'].startswith('www.'):
            res['host'] = res['host'].replace('www.', '', 1)

        res['engines'] = [res['engine']]

        weight = 1.0

        # strip multiple spaces and carriage returns from content
        if res.get('content'):
            res['content'] = re.sub(' +', ' ',
                                    res['content'].strip().replace('\n', ''))

        # get weight of this engine if possible
        if hasattr(engines[res['engine']], 'weight'):
            weight = float(engines[res['engine']].weight)

        # calculate score for that engine
        score = int((flat_len - i) / engines_len) * weight + 1

        # check for duplicates
        duplicated = False
        for new_res in results:
            # remove / from the end of the url if required
            p1 = res['parsed_url'].path[:-1] \
                if res['parsed_url'].path.endswith('/') \
                else res['parsed_url'].path
            p2 = new_res['parsed_url'].path[:-1] \
                if new_res['parsed_url'].path.endswith('/') \
                else new_res['parsed_url'].path

            # check if that result is a duplicate
            if res['host'] == new_res['host'] \
                    and unquote(p1) == unquote(p2) \
                    and res['parsed_url'].query == new_res['parsed_url'].query \
                    and res.get('template') == new_res.get('template'):
                duplicated = new_res
                break

        # merge duplicates together
        if duplicated:
            # use the content with more text
            if content_result_len(res.get('content', '')) > \
                    content_result_len(duplicated.get('content', '')):
                duplicated['content'] = res['content']

            # increase result-score
            duplicated['score'] += score

            # add engine to list of result-engines
            duplicated['engines'].append(res['engine'])

            # use https if possible
            if duplicated['parsed_url'].scheme == 'https':
                continue
            elif res['parsed_url'].scheme == 'https':
                duplicated['url'] = res['parsed_url'].geturl()
                duplicated['parsed_url'] = res['parsed_url']

        # if there is no duplicate found, append result
        else:
            res['score'] = score
            # if the result has no scheme, use http as default
            if res['parsed_url'].scheme == '':
                res['parsed_url'] = res['parsed_url']._replace(scheme="http")

            results.append(res)

    results = sorted(results, key=itemgetter('score'), reverse=True)

    # pass 2: group results by category and template
    gresults = []
    categoryPositions = {}

    for i, res in enumerate(results):
        # FIXME: handle more than one category per engine
        category = engines[res['engine']].categories[0] + ':' + \
            ('' if 'template' not in res else res['template'])

        current = None if category not in categoryPositions \
            else categoryPositions[category]

        # group with previous results using the same category
        # if the group can accept more results and is not too far
        # from the current position
        if current is not None and (current['count'] > 0) \
                and (len(gresults) - current['index'] < 20):
            # group with the previous results using
            # the same category as this one
            index = current['index']
            gresults.insert(index, res)

            # update every index after the current one
            # (including the current one)
            for k in categoryPositions:
                v = categoryPositions[k]['index']
                if v >= index:
                    categoryPositions[k]['index'] = v + 1

            # update this category
            current['count'] -= 1

        else:
            # start a new group for this category
            gresults.append(res)

            # update categoryPositions
            categoryPositions[category] = {'index': len(gresults), 'count': 8}

    return gresults
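

# Worked example (illustrative): with two engines contributing six
# interleaved results (flat_len == 6, engines_len == 2) and weight == 1.0,
# position 0 scores int((6 - 0) / 2) * 1.0 + 1 == 4.0 while position 5
# scores int((6 - 5) / 2) * 1.0 + 1 == 1.0, so earlier positions outrank
# later ones before duplicate scores are merged.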


def merge_two_infoboxes(infobox1, infobox2):
    if 'urls' in infobox2:
        urls1 = infobox1.get('urls', None)
        if urls1 is None:
            urls1 = []
            infobox1['urls'] = urls1

        urlSet = set()
        for url in infobox1.get('urls', []):
            urlSet.add(url.get('url', None))

        for url in infobox2.get('urls', []):
            if url.get('url', None) not in urlSet:
                urls1.append(url)

    if 'attributes' in infobox2:
        attributes1 = infobox1.get('attributes', None)
        if attributes1 is None:
            attributes1 = []
            infobox1['attributes'] = attributes1

        attributeSet = set()
        for attribute in infobox1.get('attributes', []):
            if attribute.get('label', None) not in attributeSet:
                attributeSet.add(attribute.get('label', None))

        # only add attributes whose label is not already present
        for attribute in infobox2.get('attributes', []):
            if attribute.get('label', None) not in attributeSet:
                attributes1.append(attribute)

    if 'content' in infobox2:
        content1 = infobox1.get('content', None)
        content2 = infobox2.get('content', '')
        if content1 is not None:
            # keep the content with more text
            if content_result_len(content2) > content_result_len(content1):
                infobox1['content'] = content2
        else:
            infobox1['content'] = content2
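

# Sketch of the merge behaviour (hypothetical infobox data):
#
#   a = {'id': 'Q42', 'content': 'short',
#        'urls': [{'url': 'https://a.example'}]}
#   b = {'id': 'Q42', 'content': 'a noticeably longer description',
#        'urls': [{'url': 'https://a.example'}, {'url': 'https://b.example'}]}
#   merge_two_infoboxes(a, b)
#   # a adopts the longer content and gains only the previously unseen url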


def merge_infoboxes(infoboxes):
    results = []
    infoboxes_id = {}
    for infobox in infoboxes:
        add_infobox = True
        infobox_id = infobox.get('id', None)
        if infobox_id is not None:
            existing_index = infoboxes_id.get(infobox_id, None)
            if existing_index is not None:
                merge_two_infoboxes(results[existing_index], infobox)
                add_infobox = False

        if add_infobox:
            results.append(infobox)
            infoboxes_id[infobox_id] = len(results) - 1

    return results


class Search(object):
    """Search information container"""

    def __init__(self, task):
        # init vars
        self.query = None
        self.engines = []
        self.plugins = []
        self.categories = []
        self.paging = False
        self.pageno = 1
        self.lang = 'all'

        # set blocked engines
        self.blocked_engines = []  # get_blocked_engines(engines, request.cookies)

        self.results = []
        self.suggestions = []
        self.answers = []
        self.infoboxes = []
        self.request_data = {}

        # set a specific language if requested
        if 'language' in task['settings']:
            self.lang = task['settings']['language']

        if 'plugins' in task['settings']:
            for plugin in task['settings']['plugins']:
                if plugin['allow']:
                    self.plugins.append(plugin)

        if task['pageno']:
            self.pageno = int(task['pageno'])

        # parse the query for special tags that change
        # the search engines or the search language
        query_obj = Query(str(task['query']), self.blocked_engines)
        query_obj.parse_query()

        # set query
        self.query = query_obj.getSearchQuery()

        # get the last language selected in the query, if any
        # TODO support searching with multiple languages
        if len(query_obj.languages):
            self.lang = query_obj.languages[-1]

        self.engines = query_obj.engines

        self.categories = []

        # if engines are calculated from the query,
        # set categories by using that information
        if self.engines and query_obj.specific:
            self.categories = list(set(engine['category']
                                       for engine in self.engines))

        # otherwise, use the selected categories to
        # calculate which engines should be used
        else:
            if 'selected_categories' in task and task['selected_categories']:
                self.categories = task['selected_categories']

            # if still no category is specified, use general
            # as the default category
            if not self.categories:
                self.categories = ['general']

            # set categories/engines
            for engine in task['settings']['engines']:
                if not engine['disabled']:
                    for categ in engine['categories']:
                        if categ in self.categories:
                            self.engines.append({'category': categ,
                                                 'name': engine['name']})
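
    # Sketch of the task structure this class consumes (field names are
    # taken from the code above; the values are hypothetical):
    #
    #   task = {
    #       'query': u'example query',
    #       'pageno': 1,
    #       'selected_categories': ['general'],
    #       'settings': {
    #           'language': 'en',
    #           'plugins': [{'allow': True}],
    #           'engines': [{'name': 'example engine',
    #                        'disabled': False,
    #                        'categories': ['general']}],
    #       },
    #   }
    #   result_container = Search(task).search(task)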

    # do search-request
    def search(self, task):
        global number_of_searches

        # init vars
        requests = []
        results_queue = Queue()
        results = {}

        # increase number of searches
        number_of_searches += 1

        # set default user agent
        # user_agent = request.headers.get('User-Agent', '')
        user_agent = gen_useragent()

        # start search-request for all selected engines
        for selected_engine in self.engines:
            if selected_engine['name'] not in engines:
                continue

            engine = engines[selected_engine['name']]

            # if paging is not supported, skip
            if self.pageno > 1 and not engine.paging:
                continue

            # if a search language is set and the engine does not
            # provide language support, skip
            if self.lang != 'all' and not engine.language_support:
                continue

            # set default request parameters
            request_params = default_request_params()
            request_params['headers']['User-Agent'] = user_agent
            request_params['category'] = selected_engine['category']
            request_params['started'] = time()
            request_params['pageno'] = self.pageno

            if hasattr(engine, 'language') and engine.language:
                request_params['language'] = engine.language
            else:
                request_params['language'] = self.lang

            # 0 = None, 1 = Moderate, 2 = Strict
            # request_params['safesearch'] = int(request.cookies.get('safesearch'))
            request_params['safesearch'] = settings['search']['safe_search']

            # update request parameters dependent on
            # search-engine (contained in engines folder)
            engine.request(task['query'].encode('utf-8'), request_params)

            if request_params['url'] is None:
                # TODO add support for offline engines
                pass

            # create a callback wrapper for the search engine results
            callback = make_callback(
                selected_engine['name'],
                results_queue,
                engine.response,
                request_params)

            # create a dictionary which contains all
            # information about the request
            request_args = dict(
                headers=request_params['headers'],
                hooks=dict(response=callback),
                cookies=request_params['cookies'],
                timeout=engine.timeout,
                verify=request_params['verify']
            )

            # specific type of request (GET or POST)
            if request_params['method'] == 'GET':
                req = requests_lib.get
            else:
                req = requests_lib.post
                request_args['data'] = request_params['data']

            # ignore empty urls
            if not request_params['url']:
                continue

            # append request to list
            requests.append((req, request_params['url'],
                             request_args,
                             selected_engine['name']))

        if not requests:
            return self
        # send all search-requests
        threaded_requests(requests)

        while not results_queue.empty():
            engine_name, engine_results = results_queue.get_nowait()

            # TODO type checks
            # move suggestions, answers and infoboxes out of the result list
            for x in list(engine_results):
                if 'suggestion' in x:
                    self.suggestions.append(x['suggestion'])
                    engine_results.remove(x)
                elif 'answer' in x:
                    self.answers.append(x['answer'])
                    engine_results.remove(x)
                elif 'infobox' in x:
                    self.infoboxes.append(x)
                    engine_results.remove(x)

            results[engine_name] = engine_results

        # update engine-specific stats
        for engine_name, engine_results in results.items():
            engines[engine_name].stats['search_count'] += 1
            engines[engine_name].stats['result_count'] += len(engine_results)

        # score results and remove duplicates
        self.results = score_results(results)

        # merge infoboxes according to their ids
        self.infoboxes = merge_infoboxes(self.infoboxes)

        # update engine stats with the calculated scores
        # and postprocess every result
        for result in self.results:
            plugins.callAPI('on_result', self.plugins, locals())

            for res_engine in result['engines']:
                engines[res_engine] \
                    .stats['score_count'] += result['score']

            result['pretty_url'] = prettify_url(result['url'])

            # TODO, check if timezone is calculated right
            if 'publishedDate' in result:
                result['pubdate'] = result['publishedDate'].strftime('%Y-%m-%d %H:%M:%S%z')

            if not self.paging and engines[result['engine']].paging:
                self.paging = True

            if 'content' in result:
                result['content_html'] = highlight_content(result['content'],
                                                           self.query.encode('utf-8'))  # noqa
            result['title_html'] = highlight_content(result['title'],
                                                     self.query.encode('utf-8'))

            if result.get('content'):
                result['content'] = html_to_text(result['content']).strip()
            # remove html content and duplicated whitespace
            result['title'] = ' '.join(html_to_text(result['title']).strip().split())

        # return results, suggestions, answers and infoboxes
        return self