duckduckgo_definitions.py 5.2KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152
  1. import json
  2. from urllib import urlencode
  3. from lxml import html
  4. from searx.engines.xpath import extract_text
  5. url = 'https://api.duckduckgo.com/'\
  6. + '?{query}&format=json&pretty=0&no_redirect=1&d=1'
  7. def result_to_text(url, text, htmlResult):
  8. # TODO : remove result ending with "Meaning" or "Category"
  9. dom = html.fromstring(htmlResult)
  10. a = dom.xpath('//a')
  11. if len(a) >= 1:
  12. return extract_text(a[0])
  13. else:
  14. return text
  15. def html_to_text(htmlFragment):
  16. dom = html.fromstring(htmlFragment)
  17. return extract_text(dom)
  18. def request(query, params):
  19. # TODO add kl={locale}
  20. params['url'] = url.format(query=urlencode({'q': query}))
  21. return params
  22. def response(resp):
  23. search_res = json.loads(resp.text)
  24. results = []
  25. content = ''
  26. heading = search_res.get('Heading', '')
  27. attributes = []
  28. urls = []
  29. infobox_id = None
  30. relatedTopics = []
  31. # add answer if there is one
  32. answer = search_res.get('Answer', '')
  33. if answer != '':
  34. results.append({'answer': html_to_text(answer)})
  35. # add infobox
  36. if 'Definition' in search_res:
  37. content = content + search_res.get('Definition', '')
  38. if 'Abstract' in search_res:
  39. content = content + search_res.get('Abstract', '')
  40. # image
  41. image = search_res.get('Image', '')
  42. image = None if image == '' else image
  43. # attributes
  44. if 'Infobox' in search_res:
  45. infobox = search_res.get('Infobox', None)
  46. if 'content' in infobox:
  47. for info in infobox.get('content'):
  48. attributes.append({'label': info.get('label'),
  49. 'value': info.get('value')})
  50. # urls
  51. for ddg_result in search_res.get('Results', []):
  52. if 'FirstURL' in ddg_result:
  53. firstURL = ddg_result.get('FirstURL', '')
  54. text = ddg_result.get('Text', '')
  55. urls.append({'title': text, 'url': firstURL})
  56. results.append({'title': heading, 'url': firstURL})
  57. # related topics
  58. for ddg_result in search_res.get('RelatedTopics', None):
  59. if 'FirstURL' in ddg_result:
  60. suggestion = result_to_text(ddg_result.get('FirstURL', None),
  61. ddg_result.get('Text', None),
  62. ddg_result.get('Result', None))
  63. if suggestion != heading:
  64. results.append({'suggestion': suggestion})
  65. elif 'Topics' in ddg_result:
  66. suggestions = []
  67. relatedTopics.append({'name': ddg_result.get('Name', ''),
  68. 'suggestions': suggestions})
  69. for topic_result in ddg_result.get('Topics', []):
  70. suggestion = result_to_text(topic_result.get('FirstURL', None),
  71. topic_result.get('Text', None),
  72. topic_result.get('Result', None))
  73. if suggestion != heading:
  74. suggestions.append(suggestion)
  75. # abstract
  76. abstractURL = search_res.get('AbstractURL', '')
  77. if abstractURL != '':
  78. # add as result ? problem always in english
  79. infobox_id = abstractURL
  80. urls.append({'title': search_res.get('AbstractSource'),
  81. 'url': abstractURL})
  82. # definition
  83. definitionURL = search_res.get('DefinitionURL', '')
  84. if definitionURL != '':
  85. # add as result ? as answer ? problem always in english
  86. infobox_id = definitionURL
  87. urls.append({'title': search_res.get('DefinitionSource'),
  88. 'url': definitionURL})
  89. # entity
  90. entity = search_res.get('Entity', None)
  91. # TODO continent / country / department / location / waterfall /
  92. # mountain range :
  93. # link to map search, get weather, near by locations
  94. # TODO musician : link to music search
  95. # TODO concert tour : ??
  96. # TODO film / actor / television / media franchise :
  97. # links to IMDB / rottentomatoes (or scrap result)
  98. # TODO music : link tu musicbrainz / last.fm
  99. # TODO book : ??
  100. # TODO artist / playwright : ??
  101. # TODO compagny : ??
  102. # TODO software / os : ??
  103. # TODO software engineer : ??
  104. # TODO prepared food : ??
  105. # TODO website : ??
  106. # TODO performing art : ??
  107. # TODO prepared food : ??
  108. # TODO programming language : ??
  109. # TODO file format : ??
  110. if len(heading) > 0:
  111. # TODO get infobox.meta.value where .label='article_title'
  112. if image is None and len(attributes) == 0 and len(urls) == 1 and\
  113. len(relatedTopics) == 0 and len(content) == 0:
  114. results.append({
  115. 'url': urls[0]['url'],
  116. 'title': heading,
  117. 'content': content
  118. })
  119. else:
  120. results.append({
  121. 'infobox': heading,
  122. 'id': infobox_id,
  123. 'entity': entity,
  124. 'content': content,
  125. 'img_src': image,
  126. 'attributes': attributes,
  127. 'urls': urls,
  128. 'relatedTopics': relatedTopics
  129. })
  130. return results