fetch_languages.py 5.8KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168
  1. # -*- coding: utf-8 -*-
# This script generates languages.py by merging the languages supported by each engine.
  3. #
  4. # The country names are obtained from http://api.geonames.org which requires registering as a user.
  5. #
  6. # Output files (engines_languages.json and languages.py)
  7. # are written in current directory to avoid overwriting in case something goes wrong.
  8. from requests import get
  9. from urllib import urlencode
  10. from lxml.html import fromstring
  11. from json import loads, dumps
  12. import io
  13. from sys import path
  14. path.append('../searx') # noqa
  15. from searx.engines import engines
# Geonames API for country names.
# While geonames_user is left empty, get_country_name() returns '' without
# contacting the API.
geonames_user = ''  # ADD USER NAME HERE
# '{parameters}' is replaced by the url-encoded query string.
country_names_url = 'http://api.geonames.org/countryInfoJSON?{parameters}'

# Output files.
engines_languages_file = 'engines_languages.json'
languages_file = 'languages.py'

# Filled by fetch_supported_languages(): engine name -> whatever that engine's
# fetch_supported_languages() returned.
engines_languages = {}
# Filled by join_language_lists(): language code -> dict that may hold
# 'name', 'english_name' and 'country'.
languages = {}
  24. # To filter out invalid codes and dialects.
  25. def valid_code(lang_code):
  26. # filter invalid codes
  27. # sl-SL is technically not invalid, but still a mistake
  28. if lang_code[:2] == 'xx'\
  29. or lang_code == 'sl-SL'\
  30. or lang_code == 'wt-WT'\
  31. or lang_code == 'jw'\
  32. or lang_code[-2:] == 'UK'\
  33. or lang_code[-2:] == 'XA'\
  34. or lang_code[-2:] == 'XL':
  35. return False
  36. # filter dialects
  37. lang_code = lang_code.split('-')
  38. if len(lang_code) > 2 or len(lang_code[0]) > 3:
  39. return False
  40. if len(lang_code) == 2 and len(lang_code[1]) > 2:
  41. return False
  42. return True
  43. # Get country name in specified language.
  44. def get_country_name(locale):
  45. if geonames_user is '':
  46. return ''
  47. locale = locale.split('-')
  48. if len(locale) != 2:
  49. return ''
  50. url = country_names_url.format(parameters=urlencode({'lang': locale[0],
  51. 'country': locale[1],
  52. 'username': geonames_user}))
  53. response = get(url)
  54. json = loads(response.text)
  55. content = json.get('geonames', None)
  56. if content is None or len(content) != 1:
  57. print "No country name found for " + locale[0] + "-" + locale[1]
  58. return ''
  59. return content[0].get('countryName', '')
  60. # Fetchs supported languages for each engine and writes json file with those.
  61. def fetch_supported_languages():
  62. for engine_name in engines:
  63. if hasattr(engines[engine_name], 'fetch_supported_languages'):
  64. try:
  65. engines_languages[engine_name] = engines[engine_name].fetch_supported_languages()
  66. except Exception as e:
  67. print e
  68. # write json file
  69. f = io.open(engines_languages_file, "w", encoding="utf-8")
  70. f.write(unicode(dumps(engines_languages, ensure_ascii=False, encoding="utf-8")))
  71. f.close()
  72. # Join all language lists.
  73. # Iterate all languages supported by each engine.
  74. def join_language_lists():
  75. # include wikipedia first for more accurate language names
  76. # exclude languages with too few articles
  77. languages.update({code: lang for code, lang
  78. in engines_languages['wikipedia'].iteritems()
  79. if valid_code(code) and lang['articles'] >= 100000})
  80. for engine_name in engines_languages:
  81. for locale in engines_languages[engine_name]:
  82. if not valid_code(locale):
  83. continue
  84. # if language is not on list or if it has no name yet
  85. if locale not in languages or not languages[locale].get('name'):
  86. if isinstance(engines_languages[engine_name], dict) \
  87. and engines_languages[engine_name][locale].get('articles', float('inf')) >= 100000:
  88. languages[locale] = engines_languages[engine_name][locale]
  89. else:
  90. languages[locale] = {}
  91. # get locales that have no name or country yet
  92. for locale in languages.keys():
  93. if not languages[locale].get('name'):
  94. # try to get language names
  95. name = languages.get(locale.split('-')[0], {}).get('name', None)
  96. if name:
  97. languages[locale]['name'] = name
  98. languages[locale]['english_name'] = languages.get(locale.split('-')[0], {}).get('english_name', '')
  99. else:
  100. # filter out locales with no name
  101. del languages[locale]
  102. continue
  103. # try to get country name
  104. if locale.find('-') > 0 and not languages[locale].get('country'):
  105. languages[locale]['country'] = get_country_name(locale) or ''
  106. # Remove countryless language if language is featured in only one country.
  107. def filter_single_country_languages():
  108. prev_lang = None
  109. for code in sorted(languages):
  110. lang = code.split('-')[0]
  111. if lang == prev_lang:
  112. countries += 1
  113. else:
  114. if prev_lang is not None and countries == 1:
  115. del languages[prev_lang]
  116. countries = 0
  117. prev_lang = lang
  118. # Write languages.py.
  119. def write_languages_file():
  120. new_file = open(languages_file, 'w')
  121. file_content = '# -*- coding: utf-8 -*-\n'
  122. file_content += '# list of language codes\n'
  123. file_content += '# this file is generated automatically by utils/update_search_languages.py\n'
  124. file_content += '\nlanguage_codes = ('
  125. for code in sorted(languages):
  126. file_content += '\n (u"' + code + '"'\
  127. + ', u"' + languages[code]['name'].split(' (')[0] + '"'\
  128. + ', u"' + languages[code].get('country', '') + '"'\
  129. + ', u"' + languages[code].get('english_name', '').split(' (')[0] + '"),'
  130. # remove last comma
  131. file_content = file_content[:-1]
  132. file_content += '\n)\n'
  133. new_file.write(file_content.encode('utf8'))
  134. new_file.close()
  135. if __name__ == "__main__":
  136. fetch_supported_languages()
  137. join_language_lists()
  138. filter_single_country_languages()
  139. write_languages_file()