|  | @@ -2,83 +2,40 @@
 | 
	
		
			
			| 2 | 2 |  
 | 
	
		
			
			| 3 | 3 |  # This script generates languages.py from intersecting each engine's supported languages.
 | 
	
		
			
			| 4 | 4 |  #
 | 
	
		
			
			| 5 |  | -# The country names are obtained from http://api.geonames.org which requires registering as a user.
 | 
	
		
			
			| 6 |  | -#
 | 
	
		
			
			| 7 | 5 |  # Output files (engines_languages.json and languages.py)
 | 
	
		
			
			| 8 | 6 |  # are written in current directory to avoid overwriting in case something goes wrong.
 | 
	
		
			
			| 9 | 7 |  
 | 
	
		
			
			| 10 |  | -from requests import get
 | 
	
		
			
			| 11 |  | -from lxml.html import fromstring
 | 
	
		
			
			| 12 |  | -from json import loads, dump
 | 
	
		
			
			|  | 8 | +from json import dump
 | 
	
		
			
			| 13 | 9 |  import io
 | 
	
		
			
			| 14 | 10 |  from sys import path
 | 
	
		
			
			|  | 11 | +from babel import Locale, UnknownLocaleError
 | 
	
		
			
			|  | 12 | +from babel.languages import get_global
 | 
	
		
			
			|  | 13 | +
 | 
	
		
			
			| 15 | 14 |  path.append('../searx')  # noqa
 | 
	
		
			
			| 16 | 15 |  from searx import settings
 | 
	
		
			
			| 17 |  | -from searx.url_utils import urlencode
 | 
	
		
			
			| 18 | 16 |  from searx.engines import initialize_engines, engines
 | 
	
		
			
			| 19 | 17 |  
 | 
	
		
			
			| 20 |  | -# Geonames API for country names.
 | 
	
		
			
			| 21 |  | -geonames_user = ''  # ADD USER NAME HERE
 | 
	
		
			
			| 22 |  | -country_names_url = 'http://api.geonames.org/countryInfoJSON?{parameters}'
 | 
	
		
			
			| 23 |  | -
 | 
	
		
			
			| 24 | 18 |  # Output files.
 | 
	
		
			
			| 25 | 19 |  engines_languages_file = 'engines_languages.json'
 | 
	
		
			
			| 26 | 20 |  languages_file = 'languages.py'
 | 
	
		
			
			| 27 | 21 |  
 | 
	
		
			
			| 28 |  | -engines_languages = {}
 | 
	
		
			
			| 29 |  | -
 | 
	
		
			
			| 30 |  | -
 | 
	
		
			
			| 31 |  | -# To filter out invalid codes and dialects.
 | 
	
		
			
			| 32 |  | -def valid_code(lang_code):
 | 
	
		
			
			| 33 |  | -    # filter invalid codes
 | 
	
		
			
			| 34 |  | -    # sl-SL is technically not invalid, but still a mistake
 | 
	
		
			
			| 35 |  | -    invalid_codes = ['sl-SL', 'wt-WT', 'jw']
 | 
	
		
			
			| 36 |  | -    invalid_countries = ['UK', 'XA', 'XL']
 | 
	
		
			
			| 37 |  | -    if lang_code[:2] == 'xx'\
 | 
	
		
			
			| 38 |  | -       or lang_code in invalid_codes\
 | 
	
		
			
			| 39 |  | -       or lang_code[-2:] in invalid_countries\
 | 
	
		
			
			| 40 |  | -       or is_dialect(lang_code):
 | 
	
		
			
			| 41 |  | -        return False
 | 
	
		
			
			| 42 |  | -
 | 
	
		
			
			| 43 |  | -    return True
 | 
	
		
			
			| 44 |  | -
 | 
	
		
			
			| 45 |  | -
 | 
	
		
			
			| 46 |  | -# Language codes with any additional tags other than language and country.
 | 
	
		
			
			| 47 |  | -def is_dialect(lang_code):
 | 
	
		
			
			| 48 |  | -    lang_code = lang_code.split('-')
 | 
	
		
			
			| 49 |  | -    if len(lang_code) > 2 or len(lang_code[0]) > 3:
 | 
	
		
			
			| 50 |  | -        return True
 | 
	
		
			
			| 51 |  | -    if len(lang_code) == 2 and len(lang_code[1]) > 2:
 | 
	
		
			
			| 52 |  | -        return True
 | 
	
		
			
			| 53 |  | -
 | 
	
		
			
			| 54 |  | -    return False
 | 
	
		
			
			| 55 |  | -
 | 
	
		
			
			| 56 |  | -
 | 
	
		
			
			| 57 |  | -# Get country name in specified language.
 | 
	
		
			
			| 58 |  | -def get_country_name(locale):
 | 
	
		
			
			| 59 |  | -    if geonames_user is '':
 | 
	
		
			
			| 60 |  | -        return ''
 | 
	
		
			
			| 61 |  | -
 | 
	
		
			
			| 62 |  | -    locale = locale.split('-')
 | 
	
		
			
			| 63 |  | -    if len(locale) != 2:
 | 
	
		
			
			| 64 |  | -        return ''
 | 
	
		
			
			| 65 |  | -
 | 
	
		
			
			| 66 |  | -    url = country_names_url.format(parameters=urlencode({'lang': locale[0],
 | 
	
		
			
			| 67 |  | -                                                         'country': locale[1],
 | 
	
		
			
			| 68 |  | -                                                         'username': geonames_user}))
 | 
	
		
			
			| 69 |  | -    response = get(url)
 | 
	
		
			
			| 70 |  | -    json = loads(response.text)
 | 
	
		
			
			| 71 |  | -    content = json.get('geonames', None)
 | 
	
		
			
			| 72 |  | -    if content is None or len(content) != 1:
 | 
	
		
			
			| 73 |  | -        print("No country name found for " + locale[0] + "-" + locale[1])
 | 
	
		
			
			| 74 |  | -        return ''
 | 
	
		
			
			| 75 |  | -
 | 
	
		
			
			| 76 |  | -    return content[0].get('countryName', '')
 | 
	
		
			
			|  | 22 | +# custom fixes for non standard locale codes
 | 
	
		
			
			|  | 23 | +# sl-SL is technically not invalid, but still a mistake
 | 
	
		
			
			|  | 24 | +# TODO: move to respective engines
 | 
	
		
			
			# Maps a lower-cased, non-standard locale code reported by an engine to the
			# standard code that should be used in its place.
			locale_fixes = {
			    # sl-SL is technically not invalid, but still a mistake
			    'sl-sl': 'sl-SI',
			    # non-standard territory codes
			    'ar-xa': 'ar-SA',
			    'es-xl': 'es-419',
			    # non-standard codes for Chinese variants
			    'zh-chs': 'zh-Hans-CN',
			    'zh-cht': 'zh-Hant-TW',
			    'tzh-tw': 'zh-Hant-TW',
			    'tzh-hk': 'zh-Hant-HK',
			}
 | 
	
		
			
			| 77 | 34 |  
 | 
	
		
			
			| 78 | 35 |  
 | 
	
		
			
			| 79 | 36 |  # Fetches supported languages for each engine and writes json file with those.
 | 
	
		
			
			| 80 | 37 |  def fetch_supported_languages():
 | 
	
		
			
			| 81 |  | -    initialize_engines(settings['engines'])
 | 
	
		
			
			|  | 38 | +    engines_languages = {}
 | 
	
		
			
			| 82 | 39 |      for engine_name in engines:
 | 
	
		
			
			| 83 | 40 |          if hasattr(engines[engine_name], 'fetch_supported_languages'):
 | 
	
		
			
			| 84 | 41 |              try:
 | 
	
	
		
			
			|  | @@ -90,81 +47,134 @@ def fetch_supported_languages():
 | 
	
		
			
			| 90 | 47 |      with io.open(engines_languages_file, "w", encoding="utf-8") as f:
 | 
	
		
			
			| 91 | 48 |          dump(engines_languages, f, ensure_ascii=False)
 | 
	
		
			
			| 92 | 49 |  
 | 
	
		
			
			|  | 50 | +    return engines_languages
 | 
	
		
			
			|  | 51 | +
 | 
	
		
			
			|  | 52 | +
 | 
	
		
			
			|  | 53 | +# Get babel Locale object from lang_code if possible.
 | 
	
		
			
			def get_locale(lang_code):
			    """Return the babel ``Locale`` for ``lang_code``, or ``None``.

			    ``lang_code`` is parsed with ``-`` as the separator between its parts.
			    ``None`` is returned when babel raises ``UnknownLocaleError`` or
			    ``ValueError`` for the code.
			    """
			    try:
			        parsed = Locale.parse(lang_code, sep='-')
			    except (UnknownLocaleError, ValueError):
			        return None
			    return parsed
 | 
	
		
			
			|  | 60 | +
 | 
	
		
			
			|  | 61 | +
 | 
	
		
			
			|  | 62 | +# Append engine_name to list of engines that support locale.
 | 
	
		
			
			def add_engine_counter(lang_code, engine_name, languages):
			    """Record that ``engine_name`` supports ``lang_code``.

			    ``languages`` maps language codes to dicts. The engine name is appended to
			    the ``'counter'`` list of the matching entry (the list is created on first
			    use) unless it is already listed. Codes that are not in ``languages`` are
			    ignored. The mapping is modified in place; nothing is returned.
			    """
			    if lang_code not in languages:
			        return

			    supporters = languages[lang_code].setdefault('counter', [])
			    if engine_name not in supporters:
			        supporters.append(engine_name)
 | 
	
		
			
			| 93 | 69 |  
 | 
	
		
			
			| 94 |  | -# Join all language lists.
 | 
	
		
			
			| 95 |  | -# Iterate all languages supported by each engine.
 | 
	
		
			
			| 96 |  | -def join_language_lists():
 | 
	
		
			
			| 97 |  | -    global languages
 | 
	
		
			
			| 98 |  | -    # include wikipedia first for more accurate language names
 | 
	
		
			
			| 99 |  | -    languages = {code: lang for code, lang
 | 
	
		
			
			| 100 |  | -                 in engines_languages['wikipedia'].items()
 | 
	
		
			
			| 101 |  | -                 if valid_code(code)}
 | 
	
		
			
			| 102 | 70 |  
 | 
	
		
			
			|  | 71 | +# Join all language lists.
 | 
	
		
			
			|  | 72 | +# TODO: Add language names from engine's language list if name not known by babel.
 | 
	
		
			
			def join_language_lists(engines_languages):
			    """Merge the language codes of all engines into a single dict.

			    ``engines_languages`` maps an engine name to an iterable of the language
			    codes that engine reports (iterating a dict yields its keys).

			    Returns a dict keyed by language code. Codes babel can parse get an entry
			    with ``'name'``, ``'english_name'`` and ``'country'``; a country-less entry
			    for the bare language is added as well. Codes babel cannot parse get an
			    empty dict. Every entry additionally receives a ``'counter'`` list naming
			    the engines that support it (see ``add_engine_counter``).
			    """
			    language_list = {}
			    for engine_name in engines_languages:
			        for lang_code in engines_languages[engine_name]:

			            # apply custom fixes if necessary
			            lang_code = locale_fixes.get(lang_code.lower(), lang_code)

			            locale = get_locale(lang_code)

			            # ensure that lang_code uses standard language and country codes
			            if locale and locale.territory:
			                lang_code = locale.language + '-' + locale.territory

			            # add locale if it's not in list
			            if lang_code not in language_list:
			                if locale:
			                    language_name = locale.get_language_name().title()
			                    language_list[lang_code] = {
			                        'name': language_name,
			                        'english_name': locale.english_name,
			                        'country': locale.get_territory_name() or '',
			                    }

			                    # also add language without country
			                    if locale.language not in language_list:
			                        # Locale.english_name includes the territory, e.g.
			                        # "German (Germany)"; the country-less entry must
			                        # only carry the language's own English name.
			                        language_list[locale.language] = {
			                            'name': language_name,
			                            'english_name': locale.get_language_name('en'),
			                        }
			                else:
			                    language_list[lang_code] = {}

			            # count engine for both language_country combination and language alone
			            add_engine_counter(lang_code, engine_name, language_list)
			            add_engine_counter(lang_code.split('-')[0], engine_name, language_list)

			    return language_list
 | 
	
		
			
			|  | 107 | +
 | 
	
		
			
			|  | 108 | +
 | 
	
		
			
			|  | 109 | +# Filter language list so it only includes the most supported languages and countries.
 | 
	
		
			
			def filter_language_list(all_languages):
			    """Return the subset of ``all_languages`` that is widely supported.

			    An entry is kept when its ``'counter'`` list names at least 10 engines, or
			    when it names every "main" engine. Main engines are the entries of the
			    module-level ``engines`` that are in the ``'general'`` category, have a
			    truthy ``supported_languages`` and are not disabled.

			    The returned dict shares its value dicts with ``all_languages``.
			    """
			    min_supported_engines = 10

			    # collect the names of the default general engines that have a language list
			    main_engines = []
			    for engine_name in engines.keys():
			        if 'general' not in engines[engine_name].categories:
			            continue
			        if not engines[engine_name].supported_languages:
			            continue
			        if engines[engine_name].disabled:
			            continue
			        main_engines.append(engine_name)

			    # filter list to include only languages supported by most engines or all default general engines
			    filtered_languages = {}
			    for code, lang in all_languages.items():
			        if len(lang.get('counter', [])) >= min_supported_engines:
			            filtered_languages[code] = lang
			            continue

			        # NOTE: with no main engines all() is vacuously true, so every entry is kept
			        supporters = lang.get('counter', [])
			        if all(main_engine in supporters for main_engine in main_engines):
			            filtered_languages[code] = lang

			    return filtered_languages
 | 
	
		
			
			|  | 125 | +
 | 
	
		
			
			|  | 126 | +
 | 
	
		
			
			|  | 127 | +# Add country codes to languages without one and filter out language codes.
 | 
	
		
			
			def _settle_language_group(lang, last_code, countries, sorted_languages,
			                           filtered_languages, all_languages):
			    """Give one language group a country code and drop its bare language code.

			    ``lang`` is the group's language, ``last_code`` the last code of the group
			    seen in sorted order and ``countries`` the number of codes counted for the
			    group after its first one. ``filtered_languages`` is modified in place.
			    """
			    # if language has no single country code
			    if countries == 0:
			        # try to get country code with most supported engines
			        for candidate in sorted_languages:
			            candidate_parts = candidate.split('-')
			            if len(candidate_parts) == 2 and candidate_parts[0] == lang:
			                filtered_languages[candidate] = all_languages[candidate]
			                filtered_languages[candidate]['country'] = ''
			                countries = 1
			                break

			        if countries == 0:
			            # get most likely country code from babel
			            subtags = get_global('likely_subtags').get(lang)
			            # the bare language may be missing from all_languages (e.g. for
			            # codes babel could not parse); there is nothing to copy then
			            if subtags and lang in all_languages:
			                subtag_parts = subtags.split('_')
			                new_code = subtag_parts[0] + '-' + subtag_parts[-1]
			                filtered_languages[new_code] = all_languages[lang]
			                countries = 1

			    if countries == 1:
			        # remove countryless version of language if there's only one country;
			        # pop() instead of del because the bare code is not always present
			        filtered_languages.pop(lang, None)
			        if last_code in filtered_languages:
			            filtered_languages[last_code]['country'] = ''


			def assign_country_codes(filtered_languages, all_languages):
			    """Add country codes to languages without one and filter out language codes.

			    ``filtered_languages`` is walked in sorted order so that all codes of one
			    language are adjacent. For each language group:

			    * if no country-specific code is present, the two-part code of that
			      language with the longest ``'counter'`` list in ``all_languages`` is
			      added, or failing that a code built from babel's ``likely_subtags``;
			    * if the group then has exactly one country, the bare language code is
			      removed and the remaining entry's ``'country'`` is set to ``''``.

			    ``filtered_languages`` is modified in place (entries are shared with
			    ``all_languages``, so their ``'country'`` changes there too). Returns None.
			    """
			    sorted_languages = sorted(all_languages,
			                              key=lambda lang: len(all_languages[lang].get('counter', [])),
			                              reverse=True)
			    previous_lang = None
			    previous_code = None
			    countries = 0
			    # sorted() returns a list copy, so the dict may be modified while looping
			    for current_code in sorted(filtered_languages):
			        current_lang = current_code.split('-')[0]

			        # count country codes per language
			        if current_lang == previous_lang:
			            countries += 1
			        else:
			            if previous_lang is not None:
			                _settle_language_group(previous_lang, previous_code, countries,
			                                       sorted_languages, filtered_languages, all_languages)
			            countries = 0
			            previous_lang = current_lang

			        previous_code = current_code

			    # the last group is never followed by a language change, so it has to be
			    # settled explicitly once the loop is done
			    if previous_lang is not None:
			        _settle_language_group(previous_lang, previous_code, countries,
			                               sorted_languages, filtered_languages, all_languages)
 | 
	
		
			
			| 164 | 174 |  
 | 
	
		
			
			| 165 | 175 |  
 | 
	
		
			
			| 166 | 176 |  # Write languages.py.
 | 
	
		
			
			| 167 |  | -def write_languages_file():
 | 
	
		
			
			|  | 177 | +def write_languages_file(languages):
 | 
	
		
			
			| 168 | 178 |      new_file = open(languages_file, 'wb')
 | 
	
		
			
			| 169 | 179 |      file_content = '# -*- coding: utf-8 -*-\n'\
 | 
	
		
			
			| 170 | 180 |                     + '# list of language codes\n'\
 | 
	
	
		
			
			|  | @@ -183,7 +193,9 @@ def write_languages_file():
 | 
	
		
			
			| 183 | 193 |  
 | 
	
		
			
			| 184 | 194 |  
 | 
	
		
			
			if __name__ == "__main__":
			    # presumably loads the configured engines into `engines`, which the
			    # following steps iterate -- confirm against searx.engines
			    initialize_engines(settings['engines'])

			    # 1. collect each engine's language list (also written to engines_languages.json)
			    supported = fetch_supported_languages()
			    # 2. merge all lists into one dict keyed by language code
			    every_language = join_language_lists(supported)
			    # 3. keep only the widely supported languages
			    selected = filter_language_list(every_language)
			    # 4. make sure the kept languages carry a country code (modifies `selected` in place)
			    assign_country_codes(selected, every_language)
			    # 5. generate languages.py
			    write_languages_file(selected)
 |