Browse Source

Add Torrentz.eu search engine

Kirill Isakov 9 years ago
parent
commit
7fbc12ee4e
4 changed files with 203 additions and 12 deletions
  1. 14
    12
      searx/engines/nyaa.py
  2. 93
    0
      searx/engines/torrentz.py
  3. 5
    0
      searx/settings.yml
  4. 91
    0
      tests/unit/engines/test_torrentz.py

+ 14
- 12
searx/engines/nyaa.py View File

@@ -43,6 +43,19 @@ def int_or_zero(num):
43 43
         return int(num)
44 44
     return 0
45 45
 
46
+# get multiplier to convert torrent size to bytes
47
+def get_filesize_mul(suffix):
48
+    return {
49
+        'KB': 1024,
50
+        'MB': 1024 ** 2,
51
+        'GB': 1024 ** 3,
52
+        'TB': 1024 ** 4,
53
+
54
+        'KIB': 1024,
55
+        'MIB': 1024 ** 2,
56
+        'GIB': 1024 ** 3,
57
+        'TIB': 1024 ** 4
58
+    }[str(suffix).upper()]
46 59
 
47 60
 # do search-request
48 61
 def request(query, params):
@@ -74,18 +87,7 @@ def response(resp):
74 87
         # torrent size
75 88
         try:
76 89
             file_size, suffix = result.xpath(xpath_filesize)[0].split(' ')
77
-
78
-            # convert torrent size to bytes.
79
-            # if there is no correct index in this dictionary,
80
-            # the try block fails as it should
81
-            multiplier = {
82
-                'KIB': 1024,
83
-                'MIB': 1024 ** 2,
84
-                'GIB': 1024 ** 3,
85
-                'TIB': 1024 ** 4
86
-            }[suffix.upper()]
87
-
88
-            file_size = int(float(file_size) * multiplier)
90
+            file_size = int(float(file_size) * get_filesize_mul(suffix))
89 91
         except Exception as e:
90 92
             file_size = None
91 93
 

+ 93
- 0
searx/engines/torrentz.py View File

@@ -0,0 +1,93 @@
1
+"""
2
+ Torrentz.eu (BitTorrent meta-search engine)
3
+
4
+ @website      https://torrentz.eu/
5
+ @provide-api  no
6
+
7
+ @using-api    no
8
+ @results      HTML
9
+ @stable       no (HTML can change, although unlikely,
10
+                   see https://torrentz.eu/torrentz.btsearch)
11
+ @parse        url, title, publishedDate, seed, leech, filesize, magnetlink
12
+"""
13
+
14
+import re
15
+from cgi import escape
16
+from urllib import urlencode
17
+from lxml import html
18
+from searx.engines.xpath import extract_text
19
+from datetime import datetime
20
+from searx.engines.nyaa import int_or_zero, get_filesize_mul
21
+
22
+# engine dependent config
23
+categories = ['files', 'videos', 'music']
24
+paging = True
25
+
26
+# search-url
27
+# https://torrentz.eu/search?f=EXAMPLE&p=6
28
+base_url = 'https://torrentz.eu/'
29
+search_url = base_url + 'search?{query}'
30
+
31
+
32
+# do search-request
33
+def request(query, params):
34
+    page = params['pageno'] - 1
35
+    query = urlencode({'q': query, 'p': page})
36
+    params['url'] = search_url.format(query=query)
37
+    return params
38
+
39
+
40
+# get response from search-request
41
+def response(resp):
42
+    results = []
43
+
44
+    dom = html.fromstring(resp.text)
45
+
46
+    for result in dom.xpath('//div[@class="results"]/dl'):
47
+        name_cell = result.xpath('./dt')[0]
48
+        title = extract_text(name_cell)
49
+
50
+        # skip rows that do not contain a link to a torrent
51
+        links = name_cell.xpath('./a')
52
+        if len(links) != 1:
53
+            continue
54
+
55
+        # extract the relative url and strip any leading slashes
56
+        link = links[0].attrib.get('href').lstrip('/')
57
+
58
+        seed = result.xpath('./dd/span[@class="u"]/text()')[0].replace(',', '')
59
+        leech = result.xpath('./dd/span[@class="d"]/text()')[0].replace(',', '')
60
+
61
+        params = {
62
+            'url': base_url + link,
63
+            'title': title,
64
+            'seed': int_or_zero(seed),
65
+            'leech': int_or_zero(leech),
66
+            'template': 'torrent.html'
67
+        }
68
+
69
+        # let's try to calculate the torrent size
70
+        try:
71
+            size_str = result.xpath('./dd/span[@class="s"]/text()')[0]
72
+            size, suffix = size_str.split()
73
+            params['filesize'] = int(size) * get_filesize_mul(suffix)
74
+        except Exception as e:
75
+            pass
76
+
77
+        # does our link start with 40 hex digits (i.e. a SHA1 info hash)?
78
+        if re.compile('[0-9a-fA-F]{40}').match(link):
79
+            # add a magnet link to the result
80
+            params['magnetlink'] = 'magnet:?xt=urn:btih:' + link
81
+
82
+        # extract and convert creation date
83
+        try:
84
+            date_str = result.xpath('./dd/span[@class="a"]/span')[0].attrib.get('title')
85
+            # Fri, 25 Mar 2016 16:29:01
86
+            date = datetime.strptime(date_str, '%a, %d %b %Y %H:%M:%S')
87
+            params['publishedDate'] = date
88
+        except Exception as e:
89
+            pass
90
+
91
+        results.append(params)
92
+
93
+    return results

+ 5
- 0
searx/settings.yml View File

@@ -271,6 +271,11 @@ engines:
271 271
     shortcut : sw
272 272
     disabled : True
273 273
 
274
+  - name : torrentz
275
+    engine : torrentz
276
+    timeout : 5.0
277
+    shortcut : to
278
+
274 279
   - name : twitter
275 280
     engine : twitter
276 281
     shortcut : tw

+ 91
- 0
tests/unit/engines/test_torrentz.py View File

@@ -0,0 +1,91 @@
1
+import mock
2
+from collections import defaultdict
3
+from searx.engines import torrentz
4
+from searx.testing import SearxTestCase
5
+from datetime import datetime
6
+
7
+
8
+class TestTorrentzEngine(SearxTestCase):
9
+
10
+    def test_request(self):
11
+        query = 'test_query'
12
+        dic = defaultdict(dict)
13
+        dic['pageno'] = 1
14
+        params = torrentz.request(query, dic)
15
+        self.assertTrue('url' in params)
16
+        self.assertTrue(query in params['url'])
17
+        self.assertTrue('torrentz.eu' in params['url'])
18
+
19
+    def test_response(self):
20
+        resp = mock.Mock(text='<html></html>')
21
+        self.assertEqual(torrentz.response(resp), [])
22
+
23
+        html = """
24
+        <div class="results">
25
+          <dl>
26
+            <dt>
27
+              <a href="/4362e08b1d80e1820fb2550b752f9f3126fe76d6">
28
+                Completely valid info
29
+              </a>
30
+              books ebooks
31
+            </dt>
32
+            <dd>
33
+              <span class="v">1</span>
34
+              <span class="a">
35
+                <span title="Sun, 22 Nov 2015 03:01:42">4 months</span>
36
+              </span>
37
+              <span class="s">30 MB</span>
38
+              <span class="u">14</span>
39
+              <span class="d">1</span>
40
+            </dd>
41
+          </dl>
42
+
43
+          <dl>
44
+            <dt>
45
+              <a href="/poaskdpokaspod">
46
+                Invalid hash and date and filesize
47
+              </a>
48
+              books ebooks
49
+            </dt>
50
+            <dd>
51
+              <span class="v">1</span>
52
+              <span class="a">
53
+                <span title="Sun, 2124091j0j190gm42">4 months</span>
54
+              </span>
55
+              <span class="s">30MB</span>
56
+              <span class="u">5,555</span>
57
+              <span class="d">1,234,567</span>
58
+            </dd>
59
+          </dl>
60
+        </div>
61
+        """
62
+
63
+        resp = mock.Mock(text=html)
64
+        results = torrentz.response(resp)
65
+
66
+        self.assertEqual(type(results), list)
67
+        self.assertEqual(len(results), 2)
68
+
69
+        # testing against the first result
70
+        r = results[0]
71
+        self.assertEqual(r['url'], 'https://torrentz.eu/4362e08b1d80e1820fb2550b752f9f3126fe76d6')
72
+        self.assertEqual(r['title'], 'Completely valid info books ebooks')
73
+        # 22 Nov 2015 03:01:42
74
+        self.assertEqual(r['publishedDate'], datetime(2015, 11, 22, 3, 1, 42))
75
+        self.assertEqual(r['seed'], 14)
76
+        self.assertEqual(r['leech'], 1)
77
+        self.assertEqual(r['filesize'], 30 * 1024 * 1024)
78
+        self.assertEqual(r['magnetlink'], 'magnet:?xt=urn:btih:4362e08b1d80e1820fb2550b752f9f3126fe76d6')
79
+
80
+        # testing against the second result
81
+        r = results[1]
82
+        self.assertEqual(r['url'], 'https://torrentz.eu/poaskdpokaspod')
83
+        self.assertEqual(r['title'], 'Invalid hash and date and filesize books ebooks')
84
+        self.assertEqual(r['seed'], 5555)
85
+        self.assertEqual(r['leech'], 1234567)
86
+
87
+        # in the second result we have invalid hash, creation date & torrent size,
88
+        # so these keys must be absent from the result
89
+        self.assertFalse('magnetlink' in r)
90
+        self.assertFalse('filesize' in r)
91
+        self.assertFalse('publishedDate' in r)