Browse Source

[enh] wikipedia infobox

creates a simple multilingual infobox using wikipedia's api

a01200356 committed 9 years ago · commit 8d335dbdae

+ 1 - 0   AUTHORS.rst

@@ -43,3 +43,4 @@ generally made searx better:
 - Kang-min Liu
 - Kirill Isakov
 - Guilhem Bonnefille
+- Marc Abonce Seguin

+ 8 - 8   searx/engines/wikidata.py

@@ -86,15 +86,15 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
         results.append({'title': title, 'url': official_website})

     wikipedia_link_count = 0
+    wikipedia_link = get_wikilink(result, language + 'wiki')
+    wikipedia_link_count += add_url(urls,
+                                    'Wikipedia (' + language + ')',
+                                    wikipedia_link)
     if language != 'en':
+        wikipedia_en_link = get_wikilink(result, 'enwiki')
         wikipedia_link_count += add_url(urls,
-                                        'Wikipedia (' + language + ')',
-                                        get_wikilink(result, language +
-                                                     'wiki'))
-    wikipedia_en_link = get_wikilink(result, 'enwiki')
-    wikipedia_link_count += add_url(urls,
-                                    'Wikipedia (en)',
-                                    wikipedia_en_link)
+                                        'Wikipedia (en)',
+                                        wikipedia_en_link)
     if wikipedia_link_count == 0:
         misc_language = get_wiki_firstlanguage(result, 'wiki')
         if misc_language is not None:
@@ -188,7 +188,7 @@ def getDetail(jsonresponse, wikidata_id, language, locale):
     else:
         results.append({
                        'infobox': title,
-                       'id': wikipedia_en_link,
+                       'id': wikipedia_link,
                        'content': description,
                        'attributes': attributes,
                        'urls': urls
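
Note on this change (not spelled out in the commit message, but implied by the new engine's "mergeable with wikidata's" comment): the infobox 'id' switches from the English article URL to the local-language one, which is exactly the URL the new wikipedia engine emits, so the two infoboxes share a merge key. An illustrative sketch with hypothetical values:

# Hypothetical values, for illustration only: both engines now key
# their infobox on the same local-language article URL.
wikipedia_link = 'https://fr.wikipedia.org/wiki/Paris'

from_wikidata = {'infobox': 'Paris', 'id': wikipedia_link,
                 'attributes': [], 'urls': []}
from_wikipedia = {'infobox': 'Paris', 'id': wikipedia_link,
                  'content': 'Paris est la capitale de la France...',
                  'img_src': 'thumb.jpg'}
# equal 'id' values are what let merge_two_infoboxes() in
# searx/results.py (patched below) fold these into a single box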

+ 114 - 0   searx/engines/wikipedia.py

@@ -0,0 +1,114 @@
+"""
+ Wikipedia (Web)
+
+ @website     https://{language}.wikipedia.org
+ @provide-api yes
+
+ @using-api   yes
+ @results     JSON
+ @stable      yes
+ @parse       url, infobox
+"""
+
+from json import loads
+from urllib import urlencode, quote
+
+# search-url
+base_url = 'https://{language}.wikipedia.org/'
+search_postfix = 'w/api.php?'\
+    'action=query'\
+    '&format=json'\
+    '&{query}'\
+    '&prop=extracts|pageimages'\
+    '&exintro'\
+    '&explaintext'\
+    '&pithumbsize=300'\
+    '&redirects'
+
+
+# set language in base_url
+def url_lang(lang):
+    if lang == 'all':
+        language = 'en'
+    else:
+        language = lang.split('_')[0]
+
+    return base_url.format(language=language)
+
+
+# do search-request
+def request(query, params):
+    if query.islower():
+        query += '|' + query.title()
+
+    params['url'] = url_lang(params['language']) \
+        + search_postfix.format(query=urlencode({'titles': query}))
+
+    return params
+
+
+# get first meaningful paragraph
+# this should filter out disambiguation pages and notes above first paragraph
+# "magic numbers" were obtained by fine tuning
+def extract_first_paragraph(content, title, image):
+    first_paragraph = None
+
+    failed_attempts = 0
+    for paragraph in content.split('\n'):
+
+        starts_with_title = paragraph.lower().find(title.lower(), 0, len(title) + 35)
+        length = len(paragraph)
+
+        if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)):
+            first_paragraph = paragraph
+            break
+
+        failed_attempts += 1
+        if failed_attempts > 3:
+            return None
+
+    return first_paragraph
+
+
+# get response from search-request
+def response(resp):
+    results = []
+
+    search_result = loads(resp.content)
+
+    # wikipedia article's unique id
+    # first valid id is assumed to be the requested article
+    for article_id in search_result['query']['pages']:
+        page = search_result['query']['pages'][article_id]
+        if int(article_id) > 0:
+            break
+
+    if int(article_id) < 0:
+        return []
+
+    title = page.get('title')
+
+    image = page.get('thumbnail')
+    if image:
+        image = image.get('source')
+
+    extract = page.get('extract')
+
+    summary = extract_first_paragraph(extract, title, image)
+    if not summary:
+        return []
+
+    # link to wikipedia article
+    # parentheses are not quoted to make infobox mergeable with wikidata's
+    wikipedia_link = url_lang(resp.search_params['language']) \
+        + 'wiki/' + quote(title.replace(' ', '_').encode('utf8')).replace('%28', '(').replace('%29', ')')
+
+    results.append({'url': wikipedia_link, 'title': title})
+
+    results.append({'infobox': title,
+                    'id': wikipedia_link,
+                    'content': summary,
+                    'img_src': image,
+                    'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})
+
+    return results
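
A short usage sketch of the helpers above (assuming a searx checkout where searx.engines.wikipedia is importable; Python 2, as the urllib import implies; the query and printed output are illustrative):

from searx.engines import wikipedia

# request() appends a title-cased variant to lowercase queries so the
# API can resolve either capitalization, then fills in params['url']
params = wikipedia.request('paris', {'language': 'fr_FR'})
print(params['url'])
# https://fr.wikipedia.org/w/api.php?action=query&format=json
#   &titles=paris%7CParis&prop=extracts|pageimages&exintro&explaintext
#   &pithumbsize=300&redirects

# the first-paragraph filter: a short disambiguation-style extract is
# rejected, a long lead paragraph is returned as-is
disambig = 'Paris can be:\nParis, France\nParis, Texas'
print(wikipedia.extract_first_paragraph(disambig, 'Paris', None))      # None

lead = 'Paris is the capital of France. ' * 10  # well over 200 chars
print(wikipedia.extract_first_paragraph(lead, 'Paris', None) == lead)  # True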

+ 9 - 3   searx/results.py

@@ -37,7 +37,7 @@ def merge_two_infoboxes(infobox1, infobox2):
         urls1 = infobox1.get('urls', None)
         if urls1 is None:
             urls1 = []
-            infobox1.set('urls', urls1)
+            infobox1['urls'] = urls1

         urlSet = set()
         for url in infobox1.get('urls', []):
@@ -47,11 +47,17 @@ def merge_two_infoboxes(infobox1, infobox2):
             if url.get('url', None) not in urlSet:
                 urls1.append(url)

+    if 'img_src' in infobox2:
+        img1 = infobox1.get('img_src', None)
+        img2 = infobox2.get('img_src')
+        if img1 is None:
+            infobox1['img_src'] = img2
+
     if 'attributes' in infobox2:
         attributes1 = infobox1.get('attributes', None)
         if attributes1 is None:
             attributes1 = []
-            infobox1.set('attributes', attributes1)
+            infobox1['attributes'] = attributes1

         attributeSet = set()
         for attribute in infobox1.get('attributes', []):
@@ -68,7 +74,7 @@ def merge_two_infoboxes(infobox1, infobox2):
             if result_content_len(content2) > result_content_len(content1):
                 infobox1['content'] = content2
         else:
-            infobox1.set('content', content2)
+            infobox1['content'] = content2


 def result_score(result):
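
The img_src hunk above only fills in a missing image. A standalone restatement of that rule for illustration (not the searx function itself):

# sketch of the new img_src merge rule from merge_two_infoboxes
def merge_img_src(infobox1, infobox2):
    if 'img_src' in infobox2:
        if infobox1.get('img_src', None) is None:
            infobox1['img_src'] = infobox2['img_src']

box = {'infobox': 'Paris'}
merge_img_src(box, {'img_src': 'thumb.jpg'})
assert box['img_src'] == 'thumb.jpg'   # filled in while missing
merge_img_src(box, {'img_src': 'other.jpg'})
assert box['img_src'] == 'thumb.jpg'   # an existing image is kept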

+ 2 - 2   searx/settings.yml

@@ -43,10 +43,9 @@ engines:
     shortcut : bs

   - name : wikipedia
-    engine : mediawiki
+    engine : wikipedia
     shortcut : wp
     base_url : 'https://{language}.wikipedia.org/'
-    number_of_results : 1

   - name : bing
     engine : bing
@@ -93,6 +92,7 @@ engines:
   - name : ddg definitions
     engine : duckduckgo_definitions
     shortcut : ddd
+    disabled : True

   - name : digg
     engine : digg

+ 3 - 1   searx/templates/oscar/infobox.html

@@ -1,8 +1,9 @@
 <div class="panel panel-default infobox">
     <div class="panel-heading">
-        <h4 class="panel-title infobox_part">{{ infobox.infobox }}</h4>
+        <bdi><h4 class="panel-title infobox_part">{{ infobox.infobox }}</h4></bdi>
     </div>
     <div class="panel-body">
+        <bdi>
         {% if infobox.img_src %}<img class="img-responsive center-block infobox_part" src="{{ image_proxify(infobox.img_src) }}" alt="{{ infobox.infobox }}" />{% endif %}
         {% if infobox.content %}<p class="infobox_part">{{ infobox.content }}</p>{% endif %}

@@ -28,5 +29,6 @@
             {% endfor %}
         </div>
         {% endif %}
+        </bdi>
     </div>
 </div>

+ 160 - 0   tests/unit/engines/test_wikipedia.py

@@ -0,0 +1,160 @@
+# -*- coding: utf-8 -*-
+from collections import defaultdict
+import mock
+from searx.engines import wikipedia
+from searx.testing import SearxTestCase
+
+
+class TestWikipediaEngine(SearxTestCase):
+
+    def test_request(self):
+        query = 'test_query'
+        dicto = defaultdict(dict)
+        dicto['language'] = 'fr_FR'
+        params = wikipedia.request(query, dicto)
+        self.assertIn('url', params)
+        self.assertIn(query, params['url'])
+        self.assertIn('test_query', params['url'])
+        self.assertIn('Test_Query', params['url'])
+        self.assertIn('fr.wikipedia.org', params['url'])
+
+        query = 'Test_Query'
+        params = wikipedia.request(query, dicto)
+        self.assertIn('Test_Query', params['url'])
+        self.assertNotIn('test_query', params['url'])
+
+        dicto['language'] = 'all'
+        params = wikipedia.request(query, dicto)
+        self.assertIn('en', params['url'])
+
+    def test_response(self):
+        dicto = defaultdict(dict)
+        dicto['language'] = 'fr'
+
+        self.assertRaises(AttributeError, wikipedia.response, None)
+        self.assertRaises(AttributeError, wikipedia.response, [])
+        self.assertRaises(AttributeError, wikipedia.response, '')
+        self.assertRaises(AttributeError, wikipedia.response, '[]')
+
+        # page not found
+        json = """
+        {
+            "batchcomplete": "",
+            "query": {
+                "normalized": [],
+                "pages": {
+                    "-1": {
+                        "ns": 0,
+                        "title": "",
+                        "missing": ""
+                    }
+                }
+            }
+        }"""
+        response = mock.Mock(content=json, search_params=dicto)
+        self.assertEqual(wikipedia.response(response), [])
+
+        # normal case
+        json = """
+        {
+            "batchcomplete": "",
+            "query": {
+                "normalized": [],
+                "pages": {
+                    "12345": {
+                        "pageid": 12345,
+                        "ns": 0,
+                        "title": "The Title",
+                        "extract": "The Title is...",
+                        "thumbnail": {
+                            "source": "img_src.jpg"
+                        },
+                        "pageimage": "img_name.jpg"
+                    }
+                }
+            }
+        }"""
+        response = mock.Mock(content=json, search_params=dicto)
+        results = wikipedia.response(response)
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 2)
+        self.assertEqual(results[0]['title'], u'The Title')
+        self.assertIn('fr.wikipedia.org/wiki/The_Title', results[0]['url'])
+        self.assertEqual(results[1]['infobox'], u'The Title')
+        self.assertIn('fr.wikipedia.org/wiki/The_Title', results[1]['id'])
+        self.assertIn('The Title is...', results[1]['content'])
+        self.assertEqual(results[1]['img_src'], 'img_src.jpg')
+
+        # disambiguation page
+        json = """
+        {
+            "batchcomplete": "",
+            "query": {
+                "normalized": [],
+                "pages": {
+                    "12345": {
+                        "pageid": 12345,
+                        "ns": 0,
+                        "title": "The Title",
+                        "extract": "The Title can be:\\nThe Title 1\\nThe Title 2\\nThe Title 3\\nThe Title 4......................................................................................................................................." """  # noqa
+        json += """
+                    }
+                }
+            }
+        }"""
+        response = mock.Mock(content=json, search_params=dicto)
+        results = wikipedia.response(response)
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 0)
+
+        # no image
+        json = """
+        {
+            "batchcomplete": "",
+            "query": {
+                "normalized": [],
+                "pages": {
+                    "12345": {
+                        "pageid": 12345,
+                        "ns": 0,
+                        "title": "The Title",
+                        "extract": "The Title is......................................................................................................................................................................................." """  # noqa
+        json += """
+                    }
+                }
+            }
+        }"""
+        response = mock.Mock(content=json, search_params=dicto)
+        results = wikipedia.response(response)
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 2)
+        self.assertIn('The Title is...', results[1]['content'])
+        self.assertEqual(results[1]['img_src'], None)
+
+        # title not in first paragraph
+        json = u"""
+        {
+            "batchcomplete": "",
+            "query": {
+                "normalized": [],
+                "pages": {
+                    "12345": {
+                        "pageid": 12345,
+                        "ns": 0,
+                        "title": "披頭四樂隊",
+                        "extract": "披头士乐队....................................................................................................................................................................................................\\n披頭四樂隊...", """  # noqa
+        json += """
+                        "thumbnail": {
+                            "source": "img_src.jpg"
+                        },
+                        "pageimage": "img_name.jpg"
+                    }
+                }
+            }
+        }"""
+        response = mock.Mock(content=json, search_params=dicto)
+        results = wikipedia.response(response)
+        self.assertEqual(type(results), list)
+        self.assertEqual(len(results), 2)
+        self.assertEqual(results[1]['infobox'], u'披頭四樂隊')
+        self.assertIn(u'披头士乐队...', results[1]['content'])
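
A minimal sketch for running just this new test case in isolation (assuming the repository root is on sys.path and the test dependencies, mock included, are installed):

import unittest
from tests.unit.engines.test_wikipedia import TestWikipediaEngine

suite = unittest.TestLoader().loadTestsFromTestCase(TestWikipediaEngine)
unittest.TextTestRunner(verbosity=2).run(suite)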