
# -*- coding: utf-8 -*-

# This script generates languages.py from
# intersecting each engine's supported languages.
#
# The languages' native names are obtained from
# Wikipedia and Google's supported languages.
#
# The country names are obtained from http://api.geonames.org
# which requires registering as a user.
#
# The output file (languages.py) is written in the current directory
# to avoid overwriting in case something goes wrong.
from requests import get
from urllib import urlencode  # Python 2; in Python 3 this lives in urllib.parse
from lxml.html import fromstring
from json import loads
from sys import path

# make the local searx package importable
path.append('../searx')

from searx.engines import engines
# list of names
wiki_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
google_languages_url = 'https://www.google.com/preferences?#languages'
country_names_url = 'http://api.geonames.org/countryInfoJSON?{parameters}'

geonames_user = ''  # add user name here

google_json_name = 'google.preferences.langMap'  # currently unused in this script

# language code -> (native name, country name, English name)
languages = {}
# To filter out invalid codes and dialects.
def valid_code(lang_code):
    # filter invalid codes
    # sl-SL is technically not invalid, but still a mistake
    if lang_code[:2] == 'xx'\
       or lang_code == 'sl-SL'\
       or lang_code == 'jw'\
       or lang_code[-2:] == 'UK'\
       or lang_code[-2:] == 'XA'\
       or lang_code[-2:] == 'XL':
        return False

    # filter dialects
    lang_code = lang_code.split('-')
    if len(lang_code) > 2 or len(lang_code[0]) > 3:
        return False
    if len(lang_code) == 2 and len(lang_code[1]) > 2:
        return False

    return True
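
# A quick sanity check of valid_code (illustrative, not part of the script):
#   valid_code('en')       -> True
#   valid_code('en-US')    -> True   (two-letter country suffix is kept)
#   valid_code('sl-SL')    -> False  (explicitly blacklisted above)
#   valid_code('zh-Hans')  -> False  (suffix longer than two characters)
#   valid_code('be-x-old') -> False  (more than one '-' separator)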
# Get country name in specified language.
def get_country_name(locale):
    # without a geonames account we cannot resolve country names
    if geonames_user == '':
        return ''

    locale = locale.split('-')
    if len(locale) != 2:
        return ''

    url = country_names_url.format(parameters=urlencode({'lang': locale[0],
                                                         'country': locale[1],
                                                         'username': geonames_user}))
    response = get(url)
    json = loads(response.text)
    content = json.get('geonames', None)
    if content is None or len(content) != 1:
        print "No country name found for " + locale[0] + "-" + locale[1]
        print json
        return ''

    return content[0].get('countryName', '')
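
# For reference, a successful countryInfoJSON response is expected to look
# roughly like this (shape inferred from the parsing above, values illustrative):
#   {"geonames": [{"countryName": "Österreich", "countryCode": "AT", ...}]}
# so get_country_name('de-AT') would return the country name localized in
# German, as requested via the 'lang' parameter.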
# Get language names from Wikipedia.
def get_wikipedia_languages():
    response = get(wiki_languages_url)
    dom = fromstring(response.text)
    tables = dom.xpath('//table[contains(@class,"sortable")]')
    for table in tables:
        # exclude header row
        trs = table.xpath('.//tr')[1:]
        for tr in trs:
            td = tr.xpath('./td')
            code = td[3].xpath('./a')[0].text
            name = td[2].xpath('./a')[0].text
            english_name = td[1].xpath('./a')[0].text
            articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
            # exclude language variants and languages with few articles
            if code not in languages and articles >= 10000 and valid_code(code):
                languages[code] = (name, '', english_name)
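
# The scraper above assumes the List_of_Wikipedias table layout at the time of
# writing: column 1 holds the English name, column 2 the native name, column 3
# the language code and column 4 the article count, each wrapped in a link.
# If Wikipedia reorders these columns, the td[...] indices must be adjusted.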
# Get language names from Google.
def get_google_languages():
    response = get(google_languages_url)
    dom = fromstring(response.text)
    options = dom.xpath('//select[@name="hl"]/option')
    for option in options:
        code = option.xpath('./@value')[0].split('-')[0]
        # drop the trailing character from the label and title-case the name
        name = option.text[:-1].title()
        if code not in languages and valid_code(code):
            languages[code] = (name, '', '')
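
# After the two scrapers run, the dict holds base language codes only, e.g.
# (illustrative): languages['fi'] == (u'suomi', '', u'Finnish') from Wikipedia,
# while Google-only entries carry an empty English name.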
# Join all language lists.
# iterate all languages supported by each engine
def join_language_lists():
    for engine_name in engines:
        for locale in engines[engine_name].supported_languages:
            locale = locale.replace('_', '-')
            if locale not in languages and valid_code(locale):
                # try to get language name
                language = languages.get(locale.split('-')[0], None)
                if language is None:
                    # engine supports a locale we have no name for
                    print engine_name + ": " + locale
                    continue

                country = get_country_name(locale)
                languages[locale] = (language[0], country, language[2])
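
# For a locale such as 'de-AT' this copies the native and English names from
# the base entry for 'de' and asks geonames for the country, producing e.g.
#   languages['de-AT'] == (u'Deutsch', u'Österreich', u'German')
# (values illustrative; they depend on what the scrapers and geonames return).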
# Remove countryless language if language is featured in only one country.
def filter_single_country_languages():
    prev_lang = None
    countries = 0
    for code in sorted(languages):
        lang = code.split('-')[0]
        if lang == prev_lang:
            countries += 1
        else:
            if prev_lang is not None and countries == 1:
                del languages[prev_lang]
            countries = 0
            prev_lang = lang
    # the loop above never re-checks the final language, so do it here
    if prev_lang is not None and countries == 1:
        del languages[prev_lang]
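
# Illustrative effect: if the list ends up with both 'pt' and 'pt-BR' but no
# other 'pt-*' entry, the bare 'pt' is dropped and only 'pt-BR' is kept; with
# 'de', 'de-AT' and 'de-CH' all present, the bare 'de' survives.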
# Write languages.py.
def write_languages_file():
    new_file = open('languages.py', 'w')
    file_content = '# -*- coding: utf-8 -*-\n'
    file_content += '# list of language codes\n'
    file_content += '# this file is generated automatically by utils/update_languages.py\n'
    file_content += '\nlanguage_codes = ('
    for code in sorted(languages):
        (name, country, english) = languages[code]
        file_content += '\n    (u"' + code + '"'\
                        + ', u"' + name + '"'\
                        + ', u"' + country + '"'\
                        + ', u"' + english + '"),'
    # remove last comma
    file_content = file_content[:-1]
    file_content += '\n)\n'
    new_file.write(file_content.encode('utf8'))
    new_file.close()
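
# The generated languages.py has this shape (entries illustrative):
#
#   # -*- coding: utf-8 -*-
#   # list of language codes
#   # this file is generated automatically by utils/update_languages.py
#
#   language_codes = (
#       (u"de", u"Deutsch", u"", u"German"),
#       (u"de-AT", u"Deutsch", u"Österreich", u"German"),
#   )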
if __name__ == "__main__":
    get_wikipedia_languages()
    get_google_languages()
    join_language_lists()
    filter_single_country_languages()
    write_languages_file()
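
# To regenerate the list (assuming a Python 2 environment with requests and
# lxml installed, and a geonames user name filled in above for country names):
#   cd utils && python update_languages.py
# then review the freshly written languages.py in the current directory before
# replacing the copy shipped with the searx package.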