# update_languages.py
  1. # -*- coding: utf-8 -*-
  2. # This script generates languages.py from
  3. # intersecting each engine's supported languages.
  4. #
  5. # The language's native names are obtained from
  6. # Wikipedia's supported languages.
  7. #
  8. # Output file (languages.py) is written in current directory
  9. # to avoid overwriting in case something goes wrong.
  10. from requests import get
  11. from re import sub
  12. from lxml.html import fromstring
  13. from json import loads
  14. from sys import path
  15. path.append('../searx')
  16. from searx.engines import engines
# list of language names
# Wikimedia meta page listing every Wikipedia edition (native + English names)
wiki_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
# Google search preferences page containing the interface-language selector
google_languages_url = 'https://www.google.com/preferences?#languages'
# NOTE(review): not referenced anywhere in this file — possibly dead; confirm
google_json_name = 'google.preferences.langMap'
# code -> (native name, country, english name); filled in by the helpers below
languages = {}
  22. # Get language names from Wikipedia.
  23. def get_wikipedia_languages():
  24. response = get(wiki_languages_url)
  25. dom = fromstring(response.text)
  26. tables = dom.xpath('//table[contains(@class,"sortable")]')
  27. for table in tables:
  28. # exclude header row
  29. trs = table.xpath('.//tr')[1:]
  30. for tr in trs:
  31. td = tr.xpath('./td')
  32. code = td[3].xpath('./a')[0].text
  33. name = td[2].xpath('./a')[0].text
  34. english_name = td[1].xpath('./a')[0].text
  35. if code not in languages:
  36. languages[code] = (name, '', english_name)
  37. # Get language names from Google.
  38. def get_google_languages():
  39. response = get(google_languages_url)
  40. dom = fromstring(response.text)
  41. options = dom.xpath('//select[@name="hl"]/option')
  42. for option in options:
  43. code = option.xpath('./@value')[0]
  44. name = option.text[:-1]
  45. if code not in languages:
  46. languages[code] = (name, '', '')
  47. # Join all language lists.
  48. # iterate all languages supported by each engine
  49. def join_language_lists():
  50. for engine_name in engines:
  51. for locale in engines[engine_name].supported_languages:
  52. locale = locale.replace('_', '-')
  53. if locale not in languages:
  54. # try to get language name
  55. language = languages.get(locale.split('-')[0], None)
  56. if language == None:
  57. print engine_name + ": " + locale
  58. continue
  59. (name, country, english) = language
  60. languages[locale] = (name, country, english)
  61. # Write languages.py.
  62. def write_languages_file():
  63. new_file = open('languages.py', 'w')
  64. file_content = '# -*- coding: utf-8 -*-\n'
  65. file_content += '# list of language codes\n'
  66. file_content += '# this file is generated automatically by utils/update_search_languages.py\n'
  67. file_content += '\nlanguage_codes = ('
  68. for code in languages:
  69. (name, country, english) = languages[code]
  70. file_content += '\n (u"' + code + '"'\
  71. + ', u"' + name + '"'\
  72. + ', u"' + country[1:-1] + '"'\
  73. + ', u"' + english + '"),'
  74. # remove last comma
  75. file_content = file_content[:-1]
  76. file_content += '\n)\n'
  77. new_file.write(file_content.encode('utf8'))
  78. new_file.close()
  79. def main():
  80. get_wikipedia_languages()
  81. get_google_languages()
  82. join_language_lists()
  83. write_languages_file()
  84. if __name__ == "__main__":
  85. main()