Browse Source

Merge pull request #1075 from kvch/finish-jibe-b-engines

Finish PRs of @jibe-b: pubmed, oa_doi_rewrite, openaire, arxiv
Adam Tauber 7 years ago
parent
commit
3d50b0288d
No account linked to committer's email

+ 76
- 0
searx/engines/arxiv.py View File

1
+#!/usr/bin/env python
2
+
3
+"""
4
+ ArXiV (Scientific preprints)
5
+ @website     https://arxiv.org
6
+ @provide-api yes (export.arxiv.org/api/query)
7
+ @using-api   yes
8
+ @results     XML-RSS
9
+ @stable      yes
10
+ @parse       url, title, publishedDate, content
11
+ More info on api: https://arxiv.org/help/api/user-manual
12
+"""
13
+
14
+from lxml import html
15
+from datetime import datetime
16
+from searx.url_utils import urlencode
17
+
18
+
19
# engine configuration
categories = ['science']

# engine dependent config: results requested per page
number_of_results = 10

# query template of the arXiv export API
base_url = ('http://export.arxiv.org/api/query?search_query=all:'
            '{query}&start={offset}&max_results={number_of_results}')


def request(query, params):
    """Build the arXiv API request URL for *query* and the requested page.

    Fills ``params['url']`` and returns the (mutated) params dict.
    """
    # translate the 1-based page number into the API's zero-based offset
    start = (params['pageno'] - 1) * number_of_results

    params['url'] = base_url.format(query=query,
                                    offset=start,
                                    number_of_results=number_of_results)

    return params
39
+
40
+
41
def response(resp):
    """Parse the arXiv Atom feed into searx result dicts.

    Each ``<entry>`` yields url, title, publishedDate and a content
    snippet made of the DOI (when present) followed by the abstract.
    """
    results = []

    dom = html.fromstring(resp.content)
    search_results = dom.xpath('//entry')

    # snippet template: DOI (if any) directly followed by the abstract
    content_string = '{doi_content}{abstract_content}'

    for entry in search_results:
        title = entry.xpath('.//title')[0].text

        url = entry.xpath('.//id')[0].text

        abstract = entry.xpath('.//summary')[0].text

        #  If a doi is available, add it to the snippet
        try:
            doi_content = entry.xpath('.//link[@title="doi"]')[0].text
            content = content_string.format(doi_content=doi_content, abstract_content=abstract)
        except IndexError:
            # no <link title="doi"> element in this entry
            content = content_string.format(doi_content="", abstract_content=abstract)

        # truncate overly long snippets
        if len(content) > 300:
            content = content[0:300] + "..."
        # TODO: center snippet on query term

        publishedDate = datetime.strptime(entry.xpath('.//published')[0].text, '%Y-%m-%dT%H:%M:%SZ')

        res_dict = {'url': url,
                    'title': title,
                    'publishedDate': publishedDate,
                    'content': content}

        results.append(res_dict)

    return results

+ 1
- 1
searx/engines/base.py View File

73
 def response(resp):
73
 def response(resp):
74
     results = []
74
     results = []
75
 
75
 
76
-    search_results = etree.XML(resp.text)
76
+    search_results = etree.XML(resp.content)
77
 
77
 
78
     for entry in search_results.xpath('./result/doc'):
78
     for entry in search_results.xpath('./result/doc'):
79
         content = "No description available"
79
         content = "No description available"

+ 98
- 0
searx/engines/pubmed.py View File

1
+#!/usr/bin/env python
2
+
3
+"""
4
+ PubMed (Scholar publications)
5
+ @website     https://www.ncbi.nlm.nih.gov/pubmed/
6
+ @provide-api yes (https://www.ncbi.nlm.nih.gov/home/develop/api/)
7
+ @using-api   yes
8
+ @results     XML
9
+ @stable      yes
10
+ @parse       url, title, publishedDate, content
11
+ More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/
12
+"""
13
+
14
+from flask_babel import gettext
15
+from lxml import etree
16
+from datetime import datetime
17
+from searx.url_utils import urlencode
18
+from searx.poolrequests import get
19
+
20
+
21
# engine configuration
categories = ['science']

# engine dependent config: results requested per page
number_of_results = 10

# template of the esearch endpoint used to look up matching PMIDs
base_url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
            '?db=pubmed&{query}&retstart={offset}&retmax={hits}')

# base url of the human-readable article pages
pubmed_url = 'https://www.ncbi.nlm.nih.gov/pubmed/'


def request(query, params):
    """Fill ``params['url']`` with the esearch URL for *query* and return params."""
    # the API paginates with a zero-based record offset
    offset = (params['pageno'] - 1) * number_of_results

    params['url'] = base_url.format(query=urlencode({'term': query}),
                                    offset=offset,
                                    hits=number_of_results)

    return params
42
+
43
+
44
def response(resp):
    """Parse the esearch id list, fetch the article notices and build results.

    Performs a second HTTP request (efetch) via searx's ``get()`` to retrieve
    the full notice for every PMID returned by the search.
    """
    results = []

    # First retrieve notice of each result
    pubmed_retrieve_api_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?'\
                              + 'db=pubmed&retmode=xml&id={pmids_string}'

    pmids_results = etree.XML(resp.content)
    pmids = pmids_results.xpath('//eSearchResult/IdList/Id')
    pmids_string = ''

    for item in pmids:
        pmids_string += item.text + ','

    retrieve_notice_args = dict(pmids_string=pmids_string)

    retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args)

    search_results_xml = get(retrieve_url_encoded).content
    search_results = etree.XML(search_results_xml).xpath('//PubmedArticleSet/PubmedArticle/MedlineCitation')

    for entry in search_results:
        title = entry.xpath('.//Article/ArticleTitle')[0].text

        pmid = entry.xpath('.//PMID')[0].text
        url = pubmed_url + pmid

        try:
            content = entry.xpath('.//Abstract/AbstractText')[0].text
        except IndexError:
            # article carries no Abstract element at all
            content = None
        if content is None:
            # also covers an empty <AbstractText/>, whose .text is None and
            # would previously have crashed len(content) below
            content = gettext('No abstract is available for this publication.')

        #  If a doi is available, add it to the snippet
        try:
            doi = entry.xpath('.//ELocationID[@EIdType="doi"]')[0].text
            content = 'DOI: {doi} Abstract: {content}'.format(doi=doi, content=content)
        except IndexError:
            # no DOI recorded for this article
            pass

        # truncate overly long snippets
        if len(content) > 300:
            content = content[0:300] + "..."
        # TODO: center snippet on query term

        publishedDate = datetime.strptime(entry.xpath('.//DateCreated/Year')[0].text
                                          + '-' + entry.xpath('.//DateCreated/Month')[0].text
                                          + '-' + entry.xpath('.//DateCreated/Day')[0].text, '%Y-%m-%d')

        res_dict = {'url': url,
                    'title': title,
                    'publishedDate': publishedDate,
                    'content': content}

        results.append(res_dict)

    # return after the loop: the original returned inside it, so only the
    # first article of every response was ever reported
    return results

+ 2
- 2
searx/plugins/__init__.py View File

22
 
22
 
23
 logger = logger.getChild('plugins')
23
 logger = logger.getChild('plugins')
24
 
24
 
25
-from searx.plugins import (doai_rewrite,
25
+from searx.plugins import (oa_doi_rewrite,
26
                            https_rewrite,
26
                            https_rewrite,
27
                            infinite_scroll,
27
                            infinite_scroll,
28
                            open_results_on_new_tab,
28
                            open_results_on_new_tab,
78
 
78
 
79
 
79
 
80
 plugins = PluginStore()
80
 plugins = PluginStore()
81
-plugins.register(doai_rewrite)
81
+plugins.register(oa_doi_rewrite)
82
 plugins.register(https_rewrite)
82
 plugins.register(https_rewrite)
83
 plugins.register(infinite_scroll)
83
 plugins.register(infinite_scroll)
84
 plugins.register(open_results_on_new_tab)
84
 plugins.register(open_results_on_new_tab)

searx/plugins/doai_rewrite.py → searx/plugins/oa_doi_rewrite.py View File

1
 from flask_babel import gettext
1
 from flask_babel import gettext
2
 import re
2
 import re
3
 from searx.url_utils import urlparse, parse_qsl
3
 from searx.url_utils import urlparse, parse_qsl
4
+from searx import settings
5
+
4
 
6
 
5
 regex = re.compile(r'10\.\d{4,9}/[^\s]+')
7
 regex = re.compile(r'10\.\d{4,9}/[^\s]+')
6
 
8
 
7
-name = gettext('DOAI rewrite')
9
+name = gettext('Open Access DOI rewrite')
8
 description = gettext('Avoid paywalls by redirecting to open-access versions of publications when available')
10
 description = gettext('Avoid paywalls by redirecting to open-access versions of publications when available')
9
 default_on = False
11
 default_on = False
10
 preference_section = 'privacy'
12
 preference_section = 'privacy'
11
 
13
 
14
+doi_resolvers = settings['doi_resolvers']
15
+
12
 
16
 
13
 def extract_doi(url):
17
 def extract_doi(url):
14
     match = regex.search(url.path)
18
     match = regex.search(url.path)
21
     return None
25
     return None
22
 
26
 
23
 
27
 
28
def get_doi_resolver(args, preference_doi_resolver):
    """Return the DOI resolver name selected via URL args or preferences.

    Falls back to ``settings['default_doi_resolver']`` when the requested
    resolver is not one of the configured ``settings['doi_resolvers']``.
    """
    doi_resolvers = settings['doi_resolvers']
    # NOTE(review): the trailing [0] assumes the looked-up value is
    # list-like (MultipleChoiceSetting returns a list) — confirm that
    # request.args supplies the same shape
    doi_resolver = args.get('doi_resolver', preference_doi_resolver)[0]
    if doi_resolver not in doi_resolvers:
        # fix: the original assigned the fallback to `doi_resolvers`
        # (plural), so an unknown resolver name was returned unchanged
        doi_resolver = settings['default_doi_resolver']
    return doi_resolver
34
+
35
+
24
 def on_result(request, search, result):
36
 def on_result(request, search, result):
25
     doi = extract_doi(result['parsed_url'])
37
     doi = extract_doi(result['parsed_url'])
26
     if doi and len(doi) < 50:
38
     if doi and len(doi) < 50:
27
         for suffix in ('/', '.pdf', '/full', '/meta', '/abstract'):
39
         for suffix in ('/', '.pdf', '/full', '/meta', '/abstract'):
28
             if doi.endswith(suffix):
40
             if doi.endswith(suffix):
29
                 doi = doi[:-len(suffix)]
41
                 doi = doi[:-len(suffix)]
30
-        result['url'] = 'http://doai.io/' + doi
42
+        result['url'] = get_doi_resolver(request.args, request.preferences.get_value('doi_resolver')) + doi
31
         result['parsed_url'] = urlparse(result['url'])
43
         result['parsed_url'] = urlparse(result['url'])
32
     return True
44
     return True

+ 4
- 1
searx/preferences.py View File

15
 LANGUAGE_CODES.append('all')
15
 LANGUAGE_CODES.append('all')
16
 DISABLED = 0
16
 DISABLED = 0
17
 ENABLED = 1
17
 ENABLED = 1
18
+DOI_RESOLVERS = list(settings['doi_resolvers'])
18
 
19
 
19
 
20
 
20
 class MissingArgumentException(Exception):
21
 class MissingArgumentException(Exception):
266
                                    'results_on_new_tab': MapSetting(False, map={'0': False,
267
                                    'results_on_new_tab': MapSetting(False, map={'0': False,
267
                                                                                 '1': True,
268
                                                                                 '1': True,
268
                                                                                 'False': False,
269
                                                                                 'False': False,
269
-                                                                                'True': True})}
270
+                                                                                'True': True}),
271
+                                   'doi_resolver': MultipleChoiceSetting(['oadoi.org'], choices=DOI_RESOLVERS),
272
+                                   }
270
 
273
 
271
         self.engines = EnginesSetting('engines', choices=engines)
274
         self.engines = EnginesSetting('engines', choices=engines)
272
         self.plugins = PluginsSetting('plugins', choices=plugins)
275
         self.plugins = PluginsSetting('plugins', choices=plugins)

+ 31
- 0
searx/settings.yml View File

60
     disabled : True
60
     disabled : True
61
     shortcut : ai
61
     shortcut : ai
62
 
62
 
63
+  - name : arxiv
64
+    engine : arxiv
65
+    shortcut : arx
66
+    categories : science
67
+    timeout : 4.0
68
+
63
   - name : base
69
   - name : base
64
     engine : base
70
     engine : base
65
     shortcut : bs
71
     shortcut : bs
409
     shortcut : nt
415
     shortcut : nt
410
     disabled : True
416
     disabled : True
411
 
417
 
418
+  - name : openaire
419
+    engine : json_engine
420
+    paging : True
421
+    search_url : http://api.openaire.eu/search/datasets?format=json&page={pageno}&size=10&title={query}
422
+    results_query : response/results/result
423
+    url_query : metadata/oaf:entity/oaf:result/children/instance/webresource/url/$
424
+    title_query : metadata/oaf:entity/oaf:result/title/$
425
+    content_query : metadata/oaf:entity/oaf:result/description/$
426
+    categories : science
427
+    shortcut : oa
428
+    timeout: 5.0
429
+
412
   - name : openstreetmap
430
   - name : openstreetmap
413
     engine : openstreetmap
431
     engine : openstreetmap
414
     shortcut : osm
432
     shortcut : osm
442
     url: https://pirateproxy.red/
460
     url: https://pirateproxy.red/
443
     timeout : 3.0
461
     timeout : 3.0
444
 
462
 
463
+  - name : pubmed
464
+    engine : pubmed
465
+    shortcut : pub
466
+    categories: science
467
+    timeout : 3.0
468
+
445
   - name : qwant
469
   - name : qwant
446
     engine : qwant
470
     engine : qwant
447
     shortcut : qw
471
     shortcut : qw
694
     tr : Türkçe (Turkish)
718
     tr : Türkçe (Turkish)
695
     uk : українська мова (Ukrainian)
719
     uk : українська мова (Ukrainian)
696
     zh : 中文 (Chinese)
720
     zh : 中文 (Chinese)
721
+
722
+doi_resolvers :
723
+  oadoi.org : 'https://oadoi.org/'
724
+  doi.org : 'https://doi.org/'
725
+  doai.io  : 'http://doai.io/'
726
+
727
+default_doi_resolver : 'oadoi.org'

+ 12
- 0
searx/templates/oscar/preferences.html View File

118
                             <option value="0" {% if not results_on_new_tab %}selected="selected"{% endif %}>{{ _('Off')}}</option>
118
                             <option value="0" {% if not results_on_new_tab %}selected="selected"{% endif %}>{{ _('Off')}}</option>
119
                         </select>
119
                         </select>
120
                     {{ preferences_item_footer(info, label, rtl) }}
120
                     {{ preferences_item_footer(info, label, rtl) }}
121
+
122
+                    {% set label = _('Open Access DOI resolver') %}
123
+                    {% set info = _('Redirect to open-access versions of publications when available (plugin required)') %}
124
+                    {{ preferences_item_header(info, label, rtl) }}
125
+                        <select class="form-control" id='doi_resolver' name='doi_resolver'>
126
+                            {% for doi_resolver_name,doi_resolver_url in doi_resolvers.items() %}
127
+                            <option value="{{ doi_resolver_name }}" {% if doi_resolver_name == current_doi_resolver %}selected="selected"{% endif %}>
128
+                                    {{ doi_resolver_name }} - {{ doi_resolver_url }}
129
+                            </option>
130
+                             {% endfor %}
131
+                         </select>
132
+                    {{ preferences_item_footer(info, label, rtl) }}
121
                 </div>
133
                 </div>
122
                 </fieldset>
134
                 </fieldset>
123
             </div>
135
             </div>

+ 7
- 1
searx/webapp.py View File

66
 from searx.query import RawTextQuery
66
 from searx.query import RawTextQuery
67
 from searx.autocomplete import searx_bang, backends as autocomplete_backends
67
 from searx.autocomplete import searx_bang, backends as autocomplete_backends
68
 from searx.plugins import plugins
68
 from searx.plugins import plugins
69
+from searx.plugins.oa_doi_rewrite import get_doi_resolver
69
 from searx.preferences import Preferences, ValidationException
70
 from searx.preferences import Preferences, ValidationException
70
 from searx.answerers import answerers
71
 from searx.answerers import answerers
71
 from searx.url_utils import urlencode, urlparse, urljoin
72
 from searx.url_utils import urlencode, urlparse, urljoin
695
                   shortcuts={y: x for x, y in engine_shortcuts.items()},
696
                   shortcuts={y: x for x, y in engine_shortcuts.items()},
696
                   themes=themes,
697
                   themes=themes,
697
                   plugins=plugins,
698
                   plugins=plugins,
699
+                  doi_resolvers=settings['doi_resolvers'],
700
+                  current_doi_resolver=get_doi_resolver(request.args, request.preferences.get_value('doi_resolver')),
698
                   allowed_plugins=allowed_plugins,
701
                   allowed_plugins=allowed_plugins,
699
                   theme=get_current_theme_name(),
702
                   theme=get_current_theme_name(),
700
                   preferences_url_params=request.preferences.get_as_url_params(),
703
                   preferences_url_params=request.preferences.get_as_url_params(),
839
                     'autocomplete': settings['search']['autocomplete'],
842
                     'autocomplete': settings['search']['autocomplete'],
840
                     'safe_search': settings['search']['safe_search'],
843
                     'safe_search': settings['search']['safe_search'],
841
                     'default_theme': settings['ui']['default_theme'],
844
                     'default_theme': settings['ui']['default_theme'],
842
-                    'version': VERSION_STRING})
845
+                    'version': VERSION_STRING,
846
+                    'doi_resolvers': [r for r in search['doi_resolvers']],
847
+                    'default_doi_resolver': settings['default_doi_resolver'],
848
+                    })
843
 
849
 
844
 
850
 
845
 @app.errorhandler(404)
851
 @app.errorhandler(404)

+ 37
- 0
tests/unit/engines/pubmed.py View File

1
+# -*- coding: utf-8 -*-
2
+from collections import defaultdict
3
+import mock
4
+from searx.engines import pubmed
5
+from searx.testing import SearxTestCase
6
+
7
+
8
+class TestPubmedEngine(SearxTestCase):
9
+
10
+    def test_request(self):
11
+        query = 'test_query'
12
+        dicto = defaultdict(dict)
13
+        dicto['pageno'] = 1
14
+        params = pubmed.request(query, dicto)
15
+        self.assertIn('url', params)
16
+        self.assertIn('eutils.ncbi.nlm.nih.gov/', params['url'])
17
+        self.assertIn('term', params['url'])
18
+
19
+    def test_response(self):
20
+        self.assertRaises(AttributeError, pubmed.response, None)
21
+        self.assertRaises(AttributeError, pubmed.response, [])
22
+        self.assertRaises(AttributeError, pubmed.response, '')
23
+        self.assertRaises(AttributeError, pubmed.response, '[]')
24
+
25
+        response = mock.Mock(text='<PubmedArticleSet></PubmedArticleSet>')
26
+        self.assertEqual(pubmed.response(response), [])
27
+
28
+        xml_mock = """<eSearchResult><Count>1</Count><RetMax>1</RetMax><RetStart>0</RetStart><IdList>
29
+<Id>1</Id>
30
+</IdList></eSearchResult>
31
+"""
32
+
33
+        response = mock.Mock(text=xml_mock.encode('utf-8'))
34
+        results = pubmed.response(response)
35
+        self.assertEqual(type(results), list)
36
+        self.assertEqual(len(results), 1)
37
+        self.assertEqual(results[0]['content'], 'No abstract is available for this publication.')

+ 58
- 0
tests/unit/engines/test_arxiv.py View File

1
+# -*- coding: utf-8 -*-
2
+from collections import defaultdict
3
+import mock
4
+from searx.engines import arxiv
5
+from searx.testing import SearxTestCase
6
+
7
+
8
class TestArxivEngine(SearxTestCase):
    """Unit tests for the arxiv engine.

    Renamed from TestBaseEngine: that name was a copy-paste leftover from
    the base.py tests and collided conceptually with tests/unit/engines/
    test_base.py; the new name matches the TestPubmedEngine convention.
    """

    def test_request(self):
        """request() must build an export.arxiv.org API URL."""
        query = 'test_query'
        dicto = defaultdict(dict)
        dicto['pageno'] = 1
        params = arxiv.request(query, dicto)
        self.assertIn('url', params)
        self.assertIn('export.arxiv.org/api/', params['url'])

    def test_response(self):
        """response() must reject non-response inputs and parse Atom feeds."""
        self.assertRaises(AttributeError, arxiv.response, None)
        self.assertRaises(AttributeError, arxiv.response, [])
        self.assertRaises(AttributeError, arxiv.response, '')
        self.assertRaises(AttributeError, arxiv.response, '[]')

        # an empty feed yields no results
        response = mock.Mock(content=b'''<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom"></feed>''')
        self.assertEqual(arxiv.response(response), [])

        # one-entry feed: title and summary must be extracted
        xml_mock = b'''<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <title type="html">ArXiv Query: search_query=all:test_query&amp;id_list=&amp;start=0&amp;max_results=1</title>
  <id>http://arxiv.org/api/1</id>
  <updated>2000-01-21T00:00:00-01:00</updated>
  <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">1</opensearch:totalResults>
  <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>
  <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">1</opensearch:itemsPerPage>
  <entry>
    <id>http://arxiv.org/1</id>
    <updated>2000-01-01T00:00:01Z</updated>
    <published>2000-01-01T00:00:01Z</published>
    <title>Mathematical proof.</title>
    <summary>Mathematical formula.</summary>
    <author>
      <name>A. B.</name>
    </author>
    <link href="http://arxiv.org/1" rel="alternate" type="text/html"/>
    <link title="pdf" href="http://arxiv.org/1" rel="related" type="application/pdf"/>
    <category term="math.QA" scheme="http://arxiv.org/schemas/atom"/>
    <category term="1" scheme="http://arxiv.org/schemas/atom"/>
  </entry>
</feed>
'''

        response = mock.Mock(content=xml_mock)
        results = arxiv.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0]['title'], 'Mathematical proof.')
        self.assertEqual(results[0]['content'], 'Mathematical formula.')

+ 3
- 3
tests/unit/engines/test_base.py View File

21
         self.assertRaises(AttributeError, base.response, '')
21
         self.assertRaises(AttributeError, base.response, '')
22
         self.assertRaises(AttributeError, base.response, '[]')
22
         self.assertRaises(AttributeError, base.response, '[]')
23
 
23
 
24
-        response = mock.Mock(text='<response></response>')
24
+        response = mock.Mock(content=b'<response></response>')
25
         self.assertEqual(base.response(response), [])
25
         self.assertEqual(base.response(response), [])
26
 
26
 
27
-        xml_mock = """<?xml version="1.0"?>
27
+        xml_mock = b"""<?xml version="1.0"?>
28
 <response>
28
 <response>
29
   <lst name="responseHeader">
29
   <lst name="responseHeader">
30
     <int name="status">0</int>
30
     <int name="status">0</int>
83
   </result>
83
   </result>
84
 </response>"""
84
 </response>"""
85
 
85
 
86
-        response = mock.Mock(text=xml_mock.encode('utf-8'))
86
+        response = mock.Mock(content=xml_mock)
87
         results = base.response(response)
87
         results = base.response(response)
88
         self.assertEqual(type(results), list)
88
         self.assertEqual(type(results), list)
89
         self.assertEqual(len(results), 1)
89
         self.assertEqual(len(results), 1)