update_languages.py

# -*- coding: utf-8 -*-
# This script generates languages.py by
# intersecting each engine's supported languages.
#
# Each language's native name is obtained from
# Wikipedia's list of Wikipedias.
#
# The output file (languages.py) is written to the current directory
# to avoid overwriting the existing one in case something goes wrong.
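
# For orientation (illustrative, not part of the original script): write_languages_file()
# at the bottom emits a file of roughly this shape; the sample entries are hypothetical.
#
#     # -*- coding: utf-8 -*-
#     # list of language codes
#     # this file is generated automatically by utils/update_languages.py
#
#     language_codes = (
#      (u"af", u"Afrikaans", u"", u"Afrikaans"),
#      (u"de", u"Deutsch", u"", u"German"),
#      ...
#     )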

from requests import get
from re import sub
from lxml.html import fromstring, tostring
from json import loads
from sys import path
path.append('../searx')
from searx.engines import engines

# list of language names
wiki_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
google_languages_url = 'https://www.google.com/preferences?#languages'
google_json_name = 'google.preferences.langMap'

languages = {}


# To filter out invalid codes and dialects.
def valid_code(lang_code):
    # filter invalid codes
    if lang_code[:2] == 'xx'\
       or lang_code == 'jw'\
       or lang_code[-2:] == 'UK'\
       or lang_code[-2:] == 'XA'\
       or lang_code[-2:] == 'XL':
        return False

    # filter dialects
    lang_code = lang_code.split('-')
    if len(lang_code) > 2 or len(lang_code[0]) > 3:
        return False
    if len(lang_code) == 2 and len(lang_code[1]) > 2:
        return False

    return True
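
# Illustrative examples (not part of the original script) of what the filter above
# accepts and rejects:
#   valid_code('pt-BR')     -> True   (language plus two-letter region code)
#   valid_code('nds-NL')    -> True   (three-letter language codes are allowed)
#   valid_code('xx-elmer')  -> False  ('xx' placeholder codes are dropped)
#   valid_code('be-x-old')  -> False  (more than two subtags is treated as a dialect)
#   valid_code('en-UK')     -> False  ('UK', 'XA' and 'XL' region codes are dropped)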


# Get language names from Wikipedia.
def get_wikipedia_languages():
    response = get(wiki_languages_url)
    dom = fromstring(response.text)
    tables = dom.xpath('//table[contains(@class,"sortable")]')
    for table in tables:
        # exclude header row
        trs = table.xpath('.//tr')[1:]
        for tr in trs:
            td = tr.xpath('./td')
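            # Column layout assumed here (based on the XPath expressions below, as of
            # the List_of_Wikipedias table at the time of writing): td[1] = English
            # name, td[2] = local name, td[3] = wiki code, td[4] = article count.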
            code = td[3].xpath('./a')[0].text
            name = td[2].xpath('./a')[0].text
            english_name = td[1].xpath('./a')[0].text
            articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
            # exclude language variants and languages with few articles
            if code not in languages and articles >= 1000 and valid_code(code):
                languages[code] = (name, '', english_name)


# Get language names from Google.
def get_google_languages():
    response = get(google_languages_url)
    dom = fromstring(response.text)
    options = dom.xpath('//select[@name="hl"]/option')
    for option in options:
        code = option.xpath('./@value')[0]
        name = option.text[:-1]
        if code not in languages and valid_code(code):
            languages[code] = (name, '', '')


# Join all language lists.
# iterate all languages supported by each engine
def join_language_lists():
    for engine_name in engines:
        for locale in engines[engine_name].supported_languages:
            locale = locale.replace('_', '-')
            if locale not in languages and valid_code(locale):
                # try to get language name
                language = languages.get(locale.split('-')[0], None)
                if language is None:
                    print engine_name + ": " + locale
                    continue

                (name, country, english) = language
                languages[locale] = (name, country, english)
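
# Illustration (not part of the original script): if an engine advertises 'pt_BR' while
# only 'pt' was collected from Wikipedia or Google, the loop above adds 'pt-BR' under
# the names stored for 'pt'; locales whose base language is unknown are printed and skipped.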


# Write languages.py.
def write_languages_file():
    new_file = open('languages.py', 'w')
    file_content = '# -*- coding: utf-8 -*-\n'
    file_content += '# list of language codes\n'
    file_content += '# this file is generated automatically by utils/update_languages.py\n'
    file_content += '\nlanguage_codes = ('
    for code in languages:
        (name, country, english) = languages[code]
        file_content += '\n (u"' + code + '"'\
            + ', u"' + name + '"'\
            + ', u"' + country + '"'\
            + ', u"' + english + '"),'
    # remove the trailing comma after the last entry
    file_content = file_content[:-1]
    file_content += '\n)\n'
    new_file.write(file_content.encode('utf8'))
    new_file.close()


if __name__ == "__main__":
    get_wikipedia_languages()
    get_google_languages()
    join_language_lists()
    write_languages_file()