@@ -0,0 +1,160 @@
+# -*- coding: utf-8 -*-
+from collections import defaultdict
+import mock
+from searx.engines import wikipedia
+from searx.testing import SearxTestCase
+
+
+class TestWikipediaEngine(SearxTestCase):
+
+    def test_request(self):
+        query = 'test_query'
+        dicto = defaultdict(dict)
+        dicto['language'] = 'fr_FR'
+        params = wikipedia.request(query, dicto)
+        self.assertIn('url', params)
+        self.assertIn(query, params['url'])
+        self.assertIn('test_query', params['url'])
+        self.assertIn('Test_Query', params['url'])
+        self.assertIn('fr.wikipedia.org', params['url'])
+
+        query = 'Test_Query'
+        params = wikipedia.request(query, dicto)
+        self.assertIn('Test_Query', params['url'])
+        self.assertNotIn('test_query', params['url'])
+
+        dicto['language'] = 'all'
+        params = wikipedia.request(query, dicto)
+        self.assertIn('en', params['url'])
+
+    def test_response(self):
+        dicto = defaultdict(dict)
+        dicto['language'] = 'fr'
+
+        self.assertRaises(AttributeError, wikipedia.response, None)
+        self.assertRaises(AttributeError, wikipedia.response, [])
+        self.assertRaises(AttributeError, wikipedia.response, '')
+        self.assertRaises(AttributeError, wikipedia.response, '[]')
+
+        # page not found
+        json = """
+        {
+            "batchcomplete": "",
+            "query": {
+                "normalized": [],
+                "pages": {
+                    "-1": {
+                        "ns": 0,
+                        "title": "",
+                        "missing": ""
+                    }
+                }
+            }
+        }"""
+        response = mock.Mock(content=json, search_params=dicto)
+        self.assertEqual(wikipedia.response(response), [])
+
+        # normal case
+        json = """
+        {
+            "batchcomplete": "",
+            "query": {
+                "normalized": [],
+                "pages": {
+                    "12345": {
+                        "pageid": 12345,
+                        "ns": 0,
+                        "title": "The Title",
+                        "extract": "The Title is...",
+                        "thumbnail": {
+                            "source": "img_src.jpg"
+                        },
+                        "pageimage": "img_name.jpg"
+                    }
+                }
+            }
+        }"""
+        response = mock.Mock(content=json, search_params=dicto)
+        results = wikipedia.response(response)
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 2)
+        self.assertEqual(results[0]['title'], u'The Title')
+        self.assertIn('fr.wikipedia.org/wiki/The_Title', results[0]['url'])
+        self.assertEqual(results[1]['infobox'], u'The Title')
+        self.assertIn('fr.wikipedia.org/wiki/The_Title', results[1]['id'])
+        self.assertIn('The Title is...', results[1]['content'])
+        self.assertEqual(results[1]['img_src'], 'img_src.jpg')
+
+        # disambiguation page
+        json = """
+        {
+            "batchcomplete": "",
+            "query": {
+                "normalized": [],
+                "pages": {
+                    "12345": {
+                        "pageid": 12345,
+                        "ns": 0,
+                        "title": "The Title",
+                        "extract": "The Title can be:\\nThe Title 1\\nThe Title 2\\nThe Title 3\\nThe Title 4......................................................................................................................................." """  # noqa
+        json += """
+                    }
+                }
+            }
+        }"""
+        response = mock.Mock(content=json, search_params=dicto)
+        results = wikipedia.response(response)
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 0)
+
+        # no image
+        json = """
+        {
+            "batchcomplete": "",
+            "query": {
+                "normalized": [],
+                "pages": {
+                    "12345": {
+                        "pageid": 12345,
+                        "ns": 0,
+                        "title": "The Title",
+                        "extract": "The Title is......................................................................................................................................................................................." """  # noqa
+        json += """
+                    }
+                }
+            }
+        }"""
+        response = mock.Mock(content=json, search_params=dicto)
+        results = wikipedia.response(response)
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 2)
+        self.assertIn('The Title is...', results[1]['content'])
+        self.assertEqual(results[1]['img_src'], None)
+
+        # title not in first paragraph
+        json = u"""
+        {
+            "batchcomplete": "",
+            "query": {
+                "normalized": [],
+                "pages": {
+                    "12345": {
+                        "pageid": 12345,
+                        "ns": 0,
+                        "title": "披頭四樂隊",
+                        "extract": "披头士乐队....................................................................................................................................................................................................\\n披頭四樂隊...", """  # noqa
+        json += """
+                        "thumbnail": {
+                            "source": "img_src.jpg"
+                        },
+                        "pageimage": "img_name.jpg"
+                    }
+                }
+            }
+        }"""
+        response = mock.Mock(content=json, search_params=dicto)
+        results = wikipedia.response(response)
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 2)
+        self.assertEqual(results[1]['infobox'], u'披頭四樂隊')
+        self.assertIn(u'披头士乐队...', results[1]['content'])