Selaa lähdekoodia

Merge pull request #881 from mmuman/framalibre

Add framalibre engine
Adam Tauber 8 vuotta sitten
vanhempi
commit
80df181575
3 muutettua tiedostoa jossa 180 lisäystä ja 0 poistoa
  1. 72
    0
      searx/engines/framalibre.py
  2. 5
    0
      searx/settings.yml
  3. 103
    0
      tests/unit/engines/test_framalibre.py

+ 72
- 0
searx/engines/framalibre.py Näytä tiedosto

@@ -0,0 +1,72 @@
1
+"""
2
+ FramaLibre (It)
3
+
4
+ @website     https://framalibre.org/
5
+ @provide-api no
6
+
7
+ @using-api   no
8
+ @results     HTML
9
+ @stable      no (HTML can change)
10
+ @parse       url, title, content, thumbnail, img_src
11
+"""
12
+
13
+from urlparse import urljoin
14
+from cgi import escape
15
+from urllib import urlencode
16
+from lxml import html
17
+from searx.engines.xpath import extract_text
18
+from dateutil import parser
19
+
20
# Engine-level settings: paging is enabled and results are listed under "it".
paging = True
categories = ['it']

# XPath selectors for the result page.
# results_xpath matches one node per hit; the other three are evaluated
# relative to such a node.
results_xpath = '//div[@class="nodes-list-row"]/div[contains(@typeof,"sioc:Item")]'
link_xpath = './/h3[@class="node-title"]/a[@href]'
thumbnail_xpath = './/img[@class="media-object img-responsive"]/@src'
content_xpath = './/div[@class="content"]//p'

# Site root (with trailing slash) and the search URL template built on it.
# {query} receives the url-encoded query string, {offset} the page index.
base_url = 'https://framalibre.org/'
search_url = base_url + 'recherche-par-crit-res?{query}&page={offset}'
33
+
34
+
35
+# do search-request
36
def request(query, params):
    """Store the search URL for ``query`` in ``params['url']``.

    The ``page`` URL argument is ``params['pageno']`` minus one, and the
    query is sent url-encoded under the ``keys`` argument.

    Returns the same ``params`` mapping that was passed in.
    """
    page_index = params['pageno'] - 1
    encoded_query = urlencode({'keys': query})

    params['url'] = search_url.format(offset=page_index, query=encoded_query)
    return params
42
+
43
+
44
+# get response from search-request
45
def response(resp):
    """Parse a result page into a list of result dicts.

    ``resp.text`` is parsed as HTML and every node matched by
    ``results_xpath`` becomes one dict with the keys ``url``, ``title``,
    ``thumbnail``, ``img_src`` and ``content``.  ``thumbnail`` and
    ``img_src`` carry the same value, or ``None`` when the hit has no
    usable image.

    Hits without a title link are skipped instead of aborting the page.
    """
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        links = result.xpath(link_xpath)
        if not links:
            # no title link means there is neither a url nor a title to report
            continue
        link = links[0]

        href = urljoin(base_url, link.attrib.get('href'))
        # there's also a span (class="rdf-meta element-hidden" property="dc:title")'s content property for this...
        title = escape(extract_text(link))

        thumbnail = None
        thumbnail_tags = result.xpath(thumbnail_xpath)
        if thumbnail_tags:
            src = extract_text(thumbnail_tags[0])
            if src:
                # urljoin leaves absolute urls alone and resolves site-relative
                # ones without doubling the slash base_url already ends with
                thumbnail = urljoin(base_url, src)

        content = escape(extract_text(result.xpath(content_xpath)))

        # append result
        results.append({'url': href,
                        'title': title,
                        'thumbnail': thumbnail,
                        'img_src': thumbnail,
                        'content': content})

    # return results
    return results

+ 5
- 0
searx/settings.yml Näytä tiedosto

@@ -465,6 +465,11 @@ engines:
465 465
     shortcut : scc
466 466
     disabled : True
467 467
 
468
+  - name : framalibre
469
+    engine : framalibre
470
+    shortcut : frl
471
+    disabled : True
472
+
468 473
 #  - name : searx
469 474
 #    engine : searx_engine
470 475
 #    shortcut : se

+ 103
- 0
tests/unit/engines/test_framalibre.py Näytä tiedosto

@@ -0,0 +1,103 @@
1
+# -*- coding: utf-8 -*-
2
+from collections import defaultdict
3
+import mock
4
+from searx.engines import framalibre
5
+from searx.testing import SearxTestCase
6
+
7
+
8
class TestFramalibreEngine(SearxTestCase):
    """Unit tests for the framalibre engine's request/response functions."""

    def test_request(self):
        """The built url targets the site, carries the query and a page index."""
        query = 'test_query'
        dicto = defaultdict(dict)
        # first page: pageno is one-based, the url argument is pageno - 1
        dicto['pageno'] = 1
        params = framalibre.request(query, dicto)
        self.assertIn('url', params)
        self.assertIn(query, params['url'])
        self.assertIn('framalibre.org', params['url'])
        self.assertIn('page=0', params['url'])

        dicto['pageno'] = 2
        params = framalibre.request(query, dicto)
        self.assertIn('page=1', params['url'])

    def test_response(self):
        """Invalid responses raise, empty pages give [], a real hit is parsed."""
        # objects without a .text attribute are rejected
        self.assertRaises(AttributeError, framalibre.response, None)
        self.assertRaises(AttributeError, framalibre.response, [])
        self.assertRaises(AttributeError, framalibre.response, '')
        self.assertRaises(AttributeError, framalibre.response, '[]')

        # bodies without any result node yield no results
        response = mock.Mock(text='{}')
        self.assertEqual(framalibre.response(response), [])

        response = mock.Mock(text='{"data": []}')
        self.assertEqual(framalibre.response(response), [])

        html = u"""
        <div class="nodes-list-row">
          <div id="node-431"
              class="node node-logiciel-annuaires node-promoted node-teaser node-teaser node-sheet clearfix nodes-list"
              about="/content/gogs" typeof="sioc:Item foaf:Document">
            <header class="media">
              <div class="media-left">
                <div class="field field-name-field-logo field-type-image field-label-hidden">
                  <div class="field-items">
                    <div class="field-item even">
                      <a href="/content/gogs">
                        <img class="media-object img-responsive" typeof="foaf:Image"
 src="https://framalibre.org/sites/default/files/styles/teaser_logo/public/leslogos/gogs-lg.png?itok=rrCxKKBy"
 width="70" height="70" alt="" />
                      </a>
                    </div>
                  </div>
                </div>
              </div>
              <div class="media-body">
                <h3 class="node-title"><a href="/content/gogs">Gogs</a></h3>
                <span property="dc:title" content="Gogs" class="rdf-meta element-hidden"></span>
                <div class="field field-name-field-annuaires field-type-taxonomy-term-reference field-label-hidden">
                  <div class="field-items">
                    <div class="field-item even">
                      <a href="/annuaires/cloudwebapps"
 typeof="skos:Concept" property="rdfs:label skos:prefLabel"
 datatype="" class="label label-primary">Cloud/webApps</a>
                    </div>
                  </div>
                </div>
              </div>
            </header>
            <div class="content">
              <div class="field field-name-field-votre-appr-ciation field-type-fivestar field-label-hidden">
                <div class="field-items">
                  <div class="field-item even">
                  </div>
                </div>
              </div>
              <div class="field field-name-body field-type-text-with-summary field-label-hidden">
                <div class="field-items">
                  <div class="field-item even" property="content:encoded">
                    <p>Gogs est une interface web basée sur git et une bonne alternative à GitHub.</p>
                  </div>
                </div>
              </div>
            </div>
            <footer>
              <a href="/content/gogs" class="read-more btn btn-default btn-sm">Voir la notice</a>
              <div class="field field-name-field-lien-officiel field-type-link-field field-label-hidden">
                <div class="field-items">
                  <div class="field-item even">
                    <a href="https://gogs.io/" target="_blank" title="Voir le site officiel">
                      <span class="glyphicon glyphicon-globe"></span>
                      <span class="sr-only">Lien officiel</span>
                    </a>
                  </div>
                </div>
              </div>
            </footer>
          </div>
        </div>
        """
        response = mock.Mock(text=html)
        results = framalibre.response(response)
        self.assertEqual(type(results), list)
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0]['title'], 'Gogs')
        self.assertEqual(results[0]['url'],
                         'https://framalibre.org/content/gogs')
        self.assertEqual(results[0]['content'],
                         u"Gogs est une interface web basée sur git et une bonne alternative à GitHub.")
        # the logo url is already absolute and must come through unchanged,
        # under both the thumbnail and the img_src key
        logo = ('https://framalibre.org/sites/default/files/styles/teaser_logo/'
                'public/leslogos/gogs-lg.png?itok=rrCxKKBy')
        self.assertEqual(results[0]['thumbnail'], logo)
        self.assertEqual(results[0]['img_src'], logo)