|  | @@ -32,23 +32,26 @@ languages = {}
 | 
	
		
			
			| 32 | 32 |  def valid_code(lang_code):
 | 
	
		
			
			| 33 | 33 |      # filter invalid codes
 | 
	
		
			
			| 34 | 34 |      # sl-SL is technically not invalid, but still a mistake
 | 
	
		
			
			|  | 35 | +    invalid_codes = ['sl-SL', 'wt-WT', 'jw']
 | 
	
		
			
			|  | 36 | +    invalid_countries = ['UK', 'XA', 'XL']
 | 
	
		
			
			| 35 | 37 |      if lang_code[:2] == 'xx'\
 | 
	
		
			
			| 36 |  | -       or lang_code == 'sl-SL'\
 | 
	
		
			
			| 37 |  | -       or lang_code == 'wt-WT'\
 | 
	
		
			
			| 38 |  | -       or lang_code == 'jw'\
 | 
	
		
			
			| 39 |  | -       or lang_code[-2:] == 'UK'\
 | 
	
		
			
			| 40 |  | -       or lang_code[-2:] == 'XA'\
 | 
	
		
			
			| 41 |  | -       or lang_code[-2:] == 'XL':
 | 
	
		
			
			|  | 38 | +       or lang_code in invalid_codes\
 | 
	
		
			
			|  | 39 | +       or lang_code[-2:] in invalid_countries\
 | 
	
		
			
			|  | 40 | +       or is_dialect(lang_code):
 | 
	
		
			
			| 42 | 41 |          return False
 | 
	
		
			
			| 43 | 42 |  
 | 
	
		
			
			| 44 |  | -    # filter dialects
 | 
	
		
			
			|  | 43 | +    return True
 | 
	
		
			
			|  | 44 | +
 | 
	
		
			
			|  | 45 | +
 | 
	
		
			
			|  | 46 | +# Language codes with any additional tags other than language and country.
 | 
	
		
			
			|  | 47 | +def is_dialect(lang_code):
 | 
	
		
			
			| 45 | 48 |      lang_code = lang_code.split('-')
 | 
	
		
			
			| 46 | 49 |      if len(lang_code) > 2 or len(lang_code[0]) > 3:
 | 
	
		
			
			| 47 |  | -        return False
 | 
	
		
			
			|  | 50 | +        return True
 | 
	
		
			
			| 48 | 51 |      if len(lang_code) == 2 and len(lang_code[1]) > 2:
 | 
	
		
			
			| 49 |  | -        return False
 | 
	
		
			
			|  | 52 | +        return True
 | 
	
		
			
			| 50 | 53 |  
 | 
	
		
			
			| 51 |  | -    return True
 | 
	
		
			
			|  | 54 | +    return False
 | 
	
		
			
			| 52 | 55 |  
 | 
	
		
			
			| 53 | 56 |  
 | 
	
		
			
			| 54 | 57 |  # Get country name in specified language.
 | 
	
	
		
			
			|  | @@ -83,19 +86,17 @@ def fetch_supported_languages():
 | 
	
		
			
			| 83 | 86 |                  print e
 | 
	
		
			
			| 84 | 87 |  
 | 
	
		
			
			| 85 | 88 |      # write json file
 | 
	
		
			
			| 86 |  | -    f = io.open(engines_languages_file, "w", encoding="utf-8")
 | 
	
		
			
			| 87 |  | -    f.write(unicode(dumps(engines_languages, ensure_ascii=False, encoding="utf-8")))
 | 
	
		
			
			| 88 |  | -    f.close()
 | 
	
		
			
			|  | 89 | +    with io.open(engines_languages_file, "w", encoding="utf-8") as f:
 | 
	
		
			
			|  | 90 | +        f.write(unicode(dumps(engines_languages, ensure_ascii=False, encoding="utf-8")))
 | 
	
		
			
			| 89 | 91 |  
 | 
	
		
			
			| 90 | 92 |  
 | 
	
		
			
			| 91 | 93 |  # Join all language lists.
 | 
	
		
			
			| 92 | 94 |  # Iterate all languages supported by each engine.
 | 
	
		
			
			| 93 | 95 |  def join_language_lists():
 | 
	
		
			
			| 94 | 96 |      # include wikipedia first for more accurate language names
 | 
	
		
			
			| 95 |  | -    # exclude languages with too few articles
 | 
	
		
			
			| 96 | 97 |      languages.update({code: lang for code, lang
 | 
	
		
			
			| 97 | 98 |                        in engines_languages['wikipedia'].iteritems()
 | 
	
		
			
			| 98 |  | -                      if valid_code(code) and lang['articles'] >= 100000})
 | 
	
		
			
			|  | 99 | +                      if valid_code(code)})
 | 
	
		
			
			| 99 | 100 |  
 | 
	
		
			
			| 100 | 101 |      for engine_name in engines_languages:
 | 
	
		
			
			| 101 | 102 |          for locale in engines_languages[engine_name]:
 | 
	
	
		
			
			|  | @@ -104,25 +105,27 @@ def join_language_lists():
 | 
	
		
			
			| 104 | 105 |  
 | 
	
		
			
			| 105 | 106 |              # if language is not on list or if it has no name yet
 | 
	
		
			
			| 106 | 107 |              if locale not in languages or not languages[locale].get('name'):
 | 
	
		
			
			| 107 |  | -                if isinstance(engines_languages[engine_name], dict) \
 | 
	
		
			
			| 108 |  | -                  and engines_languages[engine_name][locale].get('articles', float('inf')) >= 100000:
 | 
	
		
			
			|  | 108 | +                if isinstance(engines_languages[engine_name], dict):
 | 
	
		
			
			| 109 | 109 |                      languages[locale] = engines_languages[engine_name][locale]
 | 
	
		
			
			| 110 | 110 |                  else:
 | 
	
		
			
			| 111 | 111 |                      languages[locale] = {}
 | 
	
		
			
			| 112 | 112 |  
 | 
	
		
			
			| 113 | 113 |      # get locales that have no name or country yet
 | 
	
		
			
			| 114 | 114 |      for locale in languages.keys():
 | 
	
		
			
			|  | 115 | +        # try to get language names
 | 
	
		
			
			| 115 | 116 |          if not languages[locale].get('name'):
 | 
	
		
			
			| 116 |  | -            # try to get language names
 | 
	
		
			
			| 117 | 117 |              name = languages.get(locale.split('-')[0], {}).get('name', None)
 | 
	
		
			
			| 118 | 118 |              if name:
 | 
	
		
			
			| 119 | 119 |                  languages[locale]['name'] = name
 | 
	
		
			
			| 120 |  | -                languages[locale]['english_name'] = languages.get(locale.split('-')[0], {}).get('english_name', '')
 | 
	
		
			
			| 121 | 120 |              else:
 | 
	
		
			
			| 122 | 121 |                  # filter out locales with no name
 | 
	
		
			
			| 123 | 122 |                  del languages[locale]
 | 
	
		
			
			| 124 | 123 |                  continue
 | 
	
		
			
			| 125 | 124 |  
 | 
	
		
			
			|  | 125 | +        # try to get language name in english
 | 
	
		
			
			|  | 126 | +        if not languages[locale].get('english_name'):
 | 
	
		
			
			|  | 127 | +            languages[locale]['english_name'] = languages.get(locale.split('-')[0], {}).get('english_name', '')
 | 
	
		
			
			|  | 128 | +
 | 
	
		
			
			| 126 | 129 |          # try to get country name
 | 
	
		
			
			| 127 | 130 |          if locale.find('-') > 0 and not languages[locale].get('country'):
 | 
	
		
			
			| 128 | 131 |              languages[locale]['country'] = get_country_name(locale) or ''
 | 
	
	
		
			
			|  | @@ -145,10 +148,10 @@ def filter_single_country_languages():
 | 
	
		
			
			| 145 | 148 |  # Write languages.py.
 | 
	
		
			
			| 146 | 149 |  def write_languages_file():
 | 
	
		
			
			| 147 | 150 |      new_file = open(languages_file, 'w')
 | 
	
		
			
			| 148 |  | -    file_content = '# -*- coding: utf-8 -*-\n'
 | 
	
		
			
			| 149 |  | -    file_content += '# list of language codes\n'
 | 
	
		
			
			| 150 |  | -    file_content += '# this file is generated automatically by utils/update_search_languages.py\n'
 | 
	
		
			
			| 151 |  | -    file_content += '\nlanguage_codes = ('
 | 
	
		
			
			|  | 151 | +    file_content = '# -*- coding: utf-8 -*-\n'\
 | 
	
		
			
			|  | 152 | +                   + '# list of language codes\n'\
 | 
	
		
			
			|  | 153 | +                   + '# this file is generated automatically by utils/update_search_languages.py\n'\
 | 
	
		
			
			|  | 154 | +                   + '\nlanguage_codes = ('
 | 
	
		
			
			| 152 | 155 |      for code in sorted(languages):
 | 
	
		
			
			| 153 | 156 |          file_content += '\n    (u"' + code + '"'\
 | 
	
		
			
			| 154 | 157 |                          + ', u"' + languages[code]['name'].split(' (')[0] + '"'\
 |