# update_languages.py
  1. # -*- coding: utf-8 -*-
  2. # This script generates languages.py from
  3. # intersecting each engine's supported languages.
  4. #
  5. # The language's native names are obtained from
  6. # Wikipedia and Google's supported languages.
  7. #
  8. # The country names are obtained from http://api.geonames.org
  9. # which requires registering as a user.
  10. #
  11. # Output file (languages.py) is written in current directory
  12. # to avoid overwriting in case something goes wrong.
  13. from requests import get
  14. from urllib import urlencode
  15. from lxml.html import fromstring
  16. from json import loads
  17. from sys import path
  18. path.append('../searx')
  19. from searx.engines import engines
# list of names
# Wikipedia table of all Wikipedias: source of native + English language names
wiki_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
# Google preferences page: source of additional language display names
google_languages_url = 'https://www.google.com/preferences?#languages'
# geonames country-info endpoint; `parameters` is filled by urlencode()
country_names_url = 'http://api.geonames.org/countryInfoJSON?{parameters}'
geonames_user = ''  # add user name here (geonames requires a registered user)
# NOTE(review): google_json_name is never referenced in this script — TODO confirm it is still needed
google_json_name = 'google.preferences.langMap'
# accumulator: language code -> (native name, country name, english name)
languages = {}
  27. # To filter out invalid codes and dialects.
  28. def valid_code(lang_code):
  29. # filter invalid codes
  30. if lang_code[:2] == 'xx'\
  31. or lang_code == 'jw'\
  32. or lang_code[-2:] == 'UK'\
  33. or lang_code[-2:] == 'XA'\
  34. or lang_code[-2:] == 'XL':
  35. return False
  36. # filter dialects
  37. lang_code = lang_code.split('-')
  38. if len(lang_code) > 2 or len(lang_code[0]) > 3:
  39. return False
  40. if len(lang_code) == 2 and len(lang_code[1]) > 2:
  41. return False
  42. return True
  43. # Get country name in specified language.
  44. def get_country_name(locale):
  45. if geonames_user is '':
  46. return ''
  47. locale = locale.split('-')
  48. if len(locale) != 2:
  49. return ''
  50. url = country_names_url.format(parameters=urlencode({'lang': locale[0],
  51. 'country': locale[1],
  52. 'username': geonames_user}))
  53. response = get(url)
  54. json = loads(response.text)
  55. content = json.get('geonames', None)
  56. if content is None or len(content) != 1:
  57. print "No country name found for " + locale[0] + "-" + locale[1]
  58. print json
  59. return ''
  60. return content[0].get('countryName', '')
  61. # Get language names from Wikipedia.
  62. def get_wikipedia_languages():
  63. response = get(wiki_languages_url)
  64. dom = fromstring(response.text)
  65. tables = dom.xpath('//table[contains(@class,"sortable")]')
  66. for table in tables:
  67. # exclude header row
  68. trs = table.xpath('.//tr')[1:]
  69. for tr in trs:
  70. td = tr.xpath('./td')
  71. code = td[3].xpath('./a')[0].text
  72. name = td[2].xpath('./a')[0].text
  73. english_name = td[1].xpath('./a')[0].text
  74. articles = int(td[4].xpath('./a/b')[0].text.replace(',',''))
  75. # exclude language variants and languages with few articles
  76. if code not in languages and articles >= 10000 and valid_code(code):
  77. languages[code] = (name, '', english_name)
  78. # Get language names from Google.
  79. def get_google_languages():
  80. response = get(google_languages_url)
  81. dom = fromstring(response.text)
  82. options = dom.xpath('//select[@name="hl"]/option')
  83. for option in options:
  84. code = option.xpath('./@value')[0].split('-')[0]
  85. name = option.text[:-1].title()
  86. if code not in languages and valid_code(code):
  87. languages[code] = (name, '', '')
  88. # Join all language lists.
  89. # iterate all languages supported by each engine
  90. def join_language_lists():
  91. for engine_name in engines:
  92. for locale in engines[engine_name].supported_languages:
  93. locale = locale.replace('_', '-')
  94. if locale not in languages and valid_code(locale):
  95. # try to get language name
  96. language = languages.get(locale.split('-')[0], None)
  97. if language == None:
  98. print engine_name + ": " + locale
  99. continue
  100. country = get_country_name(locale)
  101. languages[locale] = (language[0], country, language[2])
  102. # Remove countryless language if language is featured in only one country.
  103. def filter_single_country_languages():
  104. prev_lang = None
  105. for code in sorted(languages):
  106. lang = code.split('-')[0]
  107. if lang == prev_lang:
  108. countries += 1
  109. else:
  110. if prev_lang is not None and countries == 1:
  111. del languages[prev_lang]
  112. countries = 0
  113. prev_lang = lang
  114. # Write languages.py.
  115. def write_languages_file():
  116. new_file = open('languages.py', 'w')
  117. file_content = '# -*- coding: utf-8 -*-\n'
  118. file_content += '# list of language codes\n'
  119. file_content += '# this file is generated automatically by utils/update_search_languages.py\n'
  120. file_content += '\nlanguage_codes = ('
  121. for code in sorted(languages):
  122. (name, country, english) = languages[code]
  123. file_content += '\n (u"' + code + '"'\
  124. + ', u"' + name + '"'\
  125. + ', u"' + country + '"'\
  126. + ', u"' + english + '"),'
  127. # remove last comma
  128. file_content = file_content[:-1]
  129. file_content += '\n)\n'
  130. new_file.write(file_content.encode('utf8'))
  131. new_file.close()
if __name__ == "__main__":
    # order matters: collect names first (Wikipedia, then Google fallbacks),
    # then merge engine locales, prune single-country bases, and write out
    get_wikipedia_languages()
    get_google_languages()
    join_language_lists()
    filter_single_country_languages()
    write_languages_file()