瀏覽代碼

[add] arxiv engine

jibe-b 7 年之前
父節點
當前提交
3e3672e079
共有 3 個檔案被更改,包括 137 行新增0 行删除
  1. 73
    0
      searx/engines/arxiv.py
  2. 6
    0
      searx/settings.yml
  3. 58
    0
      tests/unit/engines/test_arxiv.py

+ 73
- 0
searx/engines/arxiv.py 查看文件

@@ -0,0 +1,73 @@
1
+#!/usr/bin/env python
2
+
3
+"""
4
+ ArXiV (Scientific preprints)
5
+ @website     https://arxiv.org
6
+ @provide-api yes (export.arxiv.org/api/query)
7
+ @using-api   yes
8
+ @results     XML-Atom
9
+ @stable      yes
10
+ @parse       url, title, publishedDate, content
11
+ More info on api: https://arxiv.org/help/api/user-manual
12
+"""
13
+
14
+from lxml import html
15
+from datetime import datetime
16
+from searx.url_utils import urlencode
17
+
18
+
19
# searx category this engine is listed under
categories = ['science']

# URL template for the arXiv query API; the placeholders are filled in by
# request(). HTTPS is used so that search terms are not sent in clear text.
base_url = 'https://export.arxiv.org/api/query?search_query=all:'\
           + '{query}&start={offset}&max_results={number_of_results}'

# engine dependent config
# number of entries requested per result page
number_of_results = 10
26
+
27
+
28
def request(query, params):
    """Build the arXiv API request URL for a search query.

    :param query: search terms entered by the user
    :param params: request parameters; ``params['pageno']`` is read
                   (page 1 maps to offset 0) and ``params['url']`` is set
    :return: the same ``params`` object with ``url`` filled in
    """
    # the API paginates by absolute index of the first result
    offset = (params['pageno'] - 1) * number_of_results

    # Percent-encode the user input so that characters such as '&', '#' or
    # '=' cannot cut short or extend the query string. urlencode() only
    # handles key/value pairs, so encode a throwaway key and strip it again.
    encoded_query = urlencode({'q': query})[len('q='):]

    params['url'] = base_url.format(query=encoded_query,
                                    offset=offset,
                                    number_of_results=number_of_results)

    return params
39
+
40
+
41
def response(resp):
    """Convert an arXiv API feed into a list of result dicts.

    :param resp: HTTP response object; only its ``text`` attribute is read
    :return: list of dicts with the keys ``url``, ``title``,
             ``publishedDate`` (naive ``datetime``) and ``content``
    :raises AttributeError: if ``resp`` has no ``text`` attribute
    """
    results = []

    # lxml refuses unicode strings that carry an XML encoding declaration,
    # so the parser is always handed bytes. Input that is already bytes is
    # passed through untouched instead of being encoded a second time.
    raw = resp.text
    if not isinstance(raw, bytes):
        raw = raw.encode('utf-8')

    for entry in html.fromstring(raw).xpath('//entry'):
        title = entry.xpath('.//title')[0].text

        url = entry.xpath('.//id')[0].text

        # an empty <summary/> has no text; treat it as an empty abstract
        content = entry.xpath('.//summary')[0].text or ''

        # If a DOI is available, add it to the snippet. Atom <link> elements
        # usually have no text and carry their target in the href attribute,
        # so fall back to href when the element text is empty.
        doi_links = entry.xpath('.//link[@title="doi"]')
        if doi_links:
            doi = doi_links[0].text or doi_links[0].get('href')
            if doi:
                content = 'DOI: ' + doi + ' Abstract: ' + content

        # keep the snippet short
        if len(content) > 300:
            content = content[0:300] + "..."
        # TODO: center snippet on query term

        published = entry.xpath('.//published')[0].text
        publishedDate = datetime.strptime(published, '%Y-%m-%dT%H:%M:%SZ')

        results.append({'url': url,
                        'title': title,
                        'publishedDate': publishedDate,
                        'content': content})

    return results

+ 6
- 0
searx/settings.yml 查看文件

@@ -60,6 +60,12 @@ engines:
60 60
     disabled : True
61 61
     shortcut : ai
62 62
 
63
+  - name : arxiv
64
+    engine : arxiv
65
+    shortcut : arx
66
+    categories : science
67
+    timeout : 4.0
68
+
63 69
   - name : base
64 70
     engine : base
65 71
     shortcut : bs

+ 58
- 0
tests/unit/engines/test_arxiv.py 查看文件

@@ -0,0 +1,58 @@
1
+# -*- coding: utf-8 -*-
2
+from collections import defaultdict
3
+import mock
4
+from searx.engines import arxiv
5
+from searx.testing import SearxTestCase
6
+
7
+
8
class TestBaseEngine(SearxTestCase):
    """Unit tests for the arxiv engine's request() and response()."""

    def test_request(self):
        """request() must set a URL that points at the arXiv API."""
        query = 'test_query'
        dicto = defaultdict(dict)
        dicto['pageno'] = 1
        params = arxiv.request(query, dicto)
        self.assertIn('url', params)
        self.assertIn('export.arxiv.org/api/', params['url'])

    def test_response(self):
        """response() must reject non-responses and parse feed entries."""
        # objects without a ``text`` attribute are not valid responses
        self.assertRaises(AttributeError, arxiv.response, None)
        self.assertRaises(AttributeError, arxiv.response, [])
        self.assertRaises(AttributeError, arxiv.response, '')
        self.assertRaises(AttributeError, arxiv.response, '[]')

        # a feed without entries yields no results
        response = mock.Mock(text='''<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom"></feed>''')
        self.assertEqual(arxiv.response(response), [])

        xml_mock = '''<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <title type="html">ArXiv Query: search_query=all:test_query&amp;id_list=&amp;start=0&amp;max_results=1</title>
  <id>http://arxiv.org/api/1</id>
  <updated>2000-01-21T00:00:00-01:00</updated>
  <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">1</opensearch:totalResults>
  <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>
  <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">1</opensearch:itemsPerPage>
  <entry>
    <id>http://arxiv.org/1</id>
    <updated>2000-01-01T00:00:01Z</updated>
    <published>2000-01-01T00:00:01Z</published>
    <title>Mathematical proof.</title>
    <summary>Mathematical formula.</summary>
    <author>
      <name>A. B.</name>
    </author>
    <link href="http://arxiv.org/1" rel="alternate" type="text/html"/>
    <link title="pdf" href="http://arxiv.org/1" rel="related" type="application/pdf"/>
    <category term="math.QA" scheme="http://arxiv.org/schemas/atom"/>
    <category term="1" scheme="http://arxiv.org/schemas/atom"/>
  </entry>
</feed>
'''

        # ``text`` of a real response is text, not bytes, so the fixture is
        # passed as-is; the engine does the encoding itself.
        response = mock.Mock(text=xml_mock)
        results = arxiv.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0]['url'], 'http://arxiv.org/1')
        self.assertEqual(results[0]['title'], 'Mathematical proof.')
        self.assertEqual(results[0]['content'], 'Mathematical formula.')