jibe-b 7 лет назад
Родитель
Сommit
df0d915806
4 измененных файлов: 146 добавлений и 0 удалений
  1. 101
    0
      searx/engines/pubmed.py
  2. 6
    0
      searx/settings.yml
  3. 2
    0
      searx/url_utils.py
  4. 37
    0
      tests/unit/engines/pubmed.py

+ 101
- 0
searx/engines/pubmed.py Просмотреть файл

@@ -0,0 +1,101 @@
1
+#!/usr/bin/env python
2
+
3
+"""
4
+ PubMed (Scholar publications)
5
+ @website     https://www.ncbi.nlm.nih.gov/pubmed/
6
+ @provide-api yes (https://www.ncbi.nlm.nih.gov/home/develop/api/)
7
+ @using-api   yes
8
+ @results     XML
9
+ @stable      yes
10
+ @parse       url, title, publishedDate, content
11
+ More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/
12
+"""
13
+
14
+from lxml import etree
15
+from datetime import datetime
16
+from searx.url_utils import urlencode, urlopen
17
+
18
+
19
# Category under which results from this engine are listed.
categories = ['science']

# ESearch URL template; `query` is an already url-encoded `term=...` pair,
# `offset` and `hits` select the window of ids to return.
base_url = (
    'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    '?db=pubmed&{query}&retstart={offset}&retmax={hits}'
)

# engine dependent config
number_of_results = 10
# Prefix to which a PMID is appended to build the link of a result.
pubmed_url = 'https://www.ncbi.nlm.nih.gov/pubmed/'
27
+
28
+
29
def request(query, params):
    """Store the PubMed ESearch URL for *query* in ``params['url']``.

    The start offset is ``(params['pageno'] - 1) * number_of_results``, so
    page 1 starts at offset 0. The search term is url-encoded as a
    ``term=...`` pair before being placed into ``base_url``.

    Returns the same ``params`` mapping that was passed in.
    """
    page_index = params['pageno'] - 1
    encoded_term = urlencode({'term': query})

    params['url'] = base_url.format(query=encoded_term,
                                    offset=page_index * number_of_results,
                                    hits=number_of_results)
    return params
40
+
41
+
42
def _first_text(node, path):
    """Return the text of the first element matching *path* below *node*.

    Returns None when nothing matches or when the matched element has no
    leading text.
    """
    matches = node.xpath(path)
    return matches[0].text if matches else None


def response(resp):
    """Turn a PubMed ESearch reply into a list of result dicts.

    ``resp.text`` is parsed as the ESearch XML to collect the PMIDs, then a
    second, blocking request is sent to EFetch (via ``urlopen``) to retrieve
    the citation of every PMID in one call.

    Returns a list with one dict per ``MedlineCitation``; each dict has the
    keys ``url``, ``title`` and ``content``, plus ``publishedDate`` when a
    complete, parseable ``DateCreated`` element is present. An empty list is
    returned when ESearch yielded no ids.

    Raises ``AttributeError`` when *resp* has no ``text`` attribute.
    """
    results = []

    # First retrieve notice of each result
    pubmed_retrieve_api_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'\
                              + 'db=pubmed&retmode=xml&id={pmids_string}'

    # handle Python2 vs Python3 management of bytes and strings:
    # text is encoded to bytes before parsing; if `.encode` is unavailable
    # the value is handed to the parser unchanged.
    try:
        pmids_results = etree.XML(resp.text.encode('utf-8'))
    except AttributeError:
        pmids_results = etree.XML(resp.text)

    pmids = [item.text
             for item in pmids_results.xpath('//eSearchResult/IdList/Id')
             if item.text]

    # Nothing matched: skip the EFetch round trip entirely.
    if not pmids:
        return results

    retrieve_url_encoded = pubmed_retrieve_api_url.format(pmids_string=','.join(pmids))

    # try/finally instead of `with`: the handle is always closed, without
    # relying on it being a context manager.
    efetch_handle = urlopen(retrieve_url_encoded)
    try:
        search_results_xml = efetch_handle.read()
    finally:
        efetch_handle.close()

    search_results = etree.XML(search_results_xml).xpath('//PubmedArticleSet/PubmedArticle/MedlineCitation')

    for entry in search_results:
        title = entry.xpath('.//Article/ArticleTitle')[0].text

        pmid = entry.xpath('.//PMID')[0].text
        url = pubmed_url + pmid

        # A missing element and an element without leading text are both
        # treated as "no abstract", so `content` is always a string below.
        content = _first_text(entry, './/Abstract/AbstractText')
        if content is None:
            content = 'No abstract is available for this publication.'

        #  If a doi is available, add it to the snippet
        doi = _first_text(entry, './/ELocationID[@EIdType="doi"]')
        if doi is not None:
            content = 'DOI: ' + doi + ' Abstract: ' + content

        if len(content) > 300:
            content = content[0:300] + "..."
        # TODO: center snippet on query term

        res_dict = {'url': url,
                    'title': title,
                    'content': content}

        # The publication date is optional: a citation without a complete
        # DateCreated must not abort the whole result list.
        date_parts = [_first_text(entry, './/DateCreated/' + field)
                      for field in ('Year', 'Month', 'Day')]
        if None not in date_parts:
            try:
                res_dict['publishedDate'] = datetime.strptime('-'.join(date_parts), '%Y-%m-%d')
            except ValueError:
                # Unparseable date: keep the result, just without a date.
                pass

        results.append(res_dict)

    # Returned after the loop so that every citation is included and an
    # empty article set yields [] rather than None.
    return results

+ 6
- 0
searx/settings.yml Просмотреть файл

@@ -460,6 +460,12 @@ engines:
460 460
     url: https://pirateproxy.red/
461 461
     timeout : 3.0
462 462
 
463
+  - name : pubmed
464
+    engine : pubmed
465
+    shortcut : pub
466
+    categories: science
467
+    oa_first : false
468
+
463 469
   - name : qwant
464 470
     engine : qwant
465 471
     shortcut : qw

+ 2
- 0
searx/url_utils.py Просмотреть файл

@@ -3,6 +3,7 @@ from sys import version_info
3 3
 if version_info[0] == 2:
4 4
     from urllib import quote, quote_plus, unquote, urlencode
5 5
     from urlparse import parse_qs, parse_qsl, urljoin, urlparse, urlunparse, ParseResult
6
+    from urllib2 import urlopen
6 7
 else:
7 8
     from urllib.parse import (
8 9
         parse_qs,
@@ -16,6 +17,7 @@ else:
16 17
         urlunparse,
17 18
         ParseResult
18 19
     )
20
+    from urllib.request import urlopen
19 21
 
20 22
 
21 23
 __export__ = (parse_qs,

+ 37
- 0
tests/unit/engines/pubmed.py Просмотреть файл

@@ -0,0 +1,37 @@
1
+# -*- coding: utf-8 -*-
2
+from collections import defaultdict
3
+import mock
4
+from searx.engines import pubmed
5
+from searx.testing import SearxTestCase
6
+
7
+
8
class TestPubmedEngine(SearxTestCase):
    """Unit tests for the PubMed engine's ``request`` and ``response``.

    The EFetch call made by ``pubmed.response`` is replaced with canned XML,
    so the tests neither touch the network nor depend on live PubMed data.
    """

    def test_request(self):
        query = 'test_query'
        dicto = defaultdict(dict)
        dicto['pageno'] = 1
        params = pubmed.request(query, dicto)
        self.assertIn('url', params)
        self.assertIn('eutils.ncbi.nlm.nih.gov/', params['url'])
        self.assertIn('term', params['url'])

    def test_response(self):
        # Objects without a `text` attribute are rejected before any request.
        for invalid in (None, [], '', '[]'):
            self.assertRaises(AttributeError, pubmed.response, invalid)

        # A reply without any id yields an empty result list.
        with mock.patch.object(pubmed, 'urlopen') as urlopen_mock:
            urlopen_mock.return_value.read.return_value = b'<PubmedArticleSet></PubmedArticleSet>'
            response = mock.Mock(text='<PubmedArticleSet></PubmedArticleSet>')
            self.assertEqual(pubmed.response(response), [])

        xml_mock = """<eSearchResult><Count>1</Count><RetMax>1</RetMax><RetStart>0</RetStart><IdList>
<Id>1</Id>
</IdList></eSearchResult>
"""

        # Canned EFetch reply: one citation that has no abstract.
        efetch_mock = b"""<PubmedArticleSet><PubmedArticle><MedlineCitation>
<PMID>1</PMID>
<DateCreated><Year>1976</Year><Month>01</Month><Day>16</Day></DateCreated>
<Article><ArticleTitle>Test title</ArticleTitle></Article>
</MedlineCitation></PubmedArticle></PubmedArticleSet>
"""

        with mock.patch.object(pubmed, 'urlopen') as urlopen_mock:
            urlopen_mock.return_value.read.return_value = efetch_mock
            response = mock.Mock(text=xml_mock.encode('utf-8'))
            results = pubmed.response(response)

            # Exactly one EFetch request, asking for the id found by ESearch.
            self.assertEqual(urlopen_mock.call_count, 1)
            self.assertIn('id=1', urlopen_mock.call_args[0][0])

        self.assertIsInstance(results, list)
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0]['title'], 'Test title')
        self.assertEqual(results[0]['url'], pubmed.pubmed_url + '1')
        self.assertEqual(results[0]['content'], 'No abstract is available for this publication.')