update_languages.py 4.0KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128
  1. # -*- coding: utf-8 -*-
  2. # This script generates languages.py from
  3. # intersecting each engine's supported languages.
  4. #
  5. # The language's native names are obtained from
  6. # Wikipedia's supported languages.
  7. #
  8. # Output file (languages.py) is written in current directory
  9. # to avoid overwriting in case something goes wrong.
  10. from requests import get
  11. from re import sub
  12. from lxml.html import fromstring, tostring
  13. from json import loads
  14. from sys import path
  15. path.append('../searx')
  16. from searx.engines import engines
# list of language names
# page listing every Wikipedia edition with its native/English names
wiki_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
# Google search preferences page (contains the UI-language <select>)
google_languages_url = 'https://www.google.com/preferences?#languages'
# name of the JSON object on Google's preferences page
# NOTE(review): appears unreferenced in this file — confirm before removing
google_json_name = 'google.preferences.langMap'
# accumulator: code -> (native name, country, english name);
# filled by the functions below and finally dumped to languages.py
languages = {}
  22. # To filter out invalid codes and dialects.
  23. def valid_code(lang_code):
  24. # filter invalid codes
  25. if lang_code[:2] == 'xx'\
  26. or lang_code == 'jw'\
  27. or lang_code[-2:] == 'UK'\
  28. or lang_code[-2:] == 'XA'\
  29. or lang_code[-2:] == 'XL':
  30. return False
  31. # filter dialects
  32. lang_code = lang_code.split('-')
  33. if len(lang_code) > 2 or len(lang_code[0]) > 3:
  34. return False
  35. if len(lang_code) == 2 and len(lang_code[1]) > 2:
  36. print lang_code
  37. return False
  38. return True
  39. # Get language names from Wikipedia.
  40. def get_wikipedia_languages():
  41. response = get(wiki_languages_url)
  42. dom = fromstring(response.text)
  43. tables = dom.xpath('//table[contains(@class,"sortable")]')
  44. for table in tables:
  45. # exclude header row
  46. trs = table.xpath('.//tr')[1:]
  47. for tr in trs:
  48. td = tr.xpath('./td')
  49. code = td[3].xpath('./a')[0].text
  50. name = td[2].xpath('./a')[0].text
  51. english_name = td[1].xpath('./a')[0].text
  52. articles = int(td[4].xpath('./a/b')[0].text.replace(',',''))
  53. # exclude languages with few articles and language variants
  54. if code not in languages and articles >= 100 and valid_code(code):
  55. languages[code] = (name, '', english_name)
  56. # Get language names from Google.
  57. def get_google_languages():
  58. response = get(google_languages_url)
  59. dom = fromstring(response.text)
  60. options = dom.xpath('//select[@name="hl"]/option')
  61. for option in options:
  62. code = option.xpath('./@value')[0]
  63. name = option.text[:-1]
  64. if code not in languages and valid_code(code):
  65. languages[code] = (name, '', '')
  66. # Join all language lists.
  67. # iterate all languages supported by each engine
  68. def join_language_lists():
  69. for engine_name in engines:
  70. for locale in engines[engine_name].supported_languages:
  71. locale = locale.replace('_', '-')
  72. if locale not in languages and valid_code(locale):
  73. # try to get language name
  74. language = languages.get(locale.split('-')[0], None)
  75. if language == None:
  76. # print engine_name + ": " + locale
  77. continue
  78. (name, country, english) = language
  79. languages[locale] = (name, country, english)
  80. # Write languages.py.
  81. def write_languages_file():
  82. new_file = open('languages.py', 'w')
  83. file_content = '# -*- coding: utf-8 -*-\n'
  84. file_content += '# list of language codes\n'
  85. file_content += '# this file is generated automatically by utils/update_search_languages.py\n'
  86. file_content += '\nlanguage_codes = ('
  87. for code in languages:
  88. (name, country, english) = languages[code]
  89. file_content += '\n (u"' + code + '"'\
  90. + ', u"' + name + '"'\
  91. + ', u"' + country + '"'\
  92. + ', u"' + english + '"),'
  93. # remove last comma
  94. file_content = file_content[:-1]
  95. file_content += '\n)\n'
  96. new_file.write(file_content.encode('utf8'))
  97. new_file.close()
# Entry point: collect language names from Wikipedia and Google, merge in
# the locales supported by the engines, then write languages.py.
def main():
    get_wikipedia_languages()
    get_google_languages()
    join_language_lists()
    write_languages_file()


if __name__ == "__main__":
    main()