Browse Source

Add Tokyo Toshokan search engine

Kirill Isakov 9 years ago
parent
commit
547b8a8765
3 changed files with 218 additions and 0 deletions
  1. 102
    0
      searx/engines/tokyotoshokan.py
  2. 6
    0
      searx/settings.yml
  3. 110
    0
      tests/unit/engines/test_tokyotoshokan.py

+ 102
- 0
searx/engines/tokyotoshokan.py View File

@@ -0,0 +1,102 @@
1
+"""
2
+ Tokyo Toshokan (A BitTorrent Library for Japanese Media)
3
+
4
+ @website      https://www.tokyotosho.info/
5
+ @provide-api  no
6
+ @using-api    no
7
+ @results      HTML
8
+ @stable       no (HTML can change)
9
+ @parse        url, title, publishedDate, seed, leech,
10
+               filesize, magnetlink, content
11
+"""
12
+
13
+import re
14
+from cgi import escape
15
+from urllib import urlencode
16
+from lxml import html
17
+from searx.engines.xpath import extract_text
18
+from datetime import datetime
19
+from searx.engines.nyaa import int_or_zero, get_filesize_mul
20
+
21
# search-url: site root and the search endpoint derived from it; the
# ``{query}`` placeholder receives the url-encoded arguments in request()
base_url = 'https://www.tokyotosho.info/'
search_url = base_url + 'search.php?{query}'

# engine dependent config: result categories served by this engine, and
# paging support (request() forwards the requested page number)
categories = ['files', 'videos', 'music']
paging = True
28
+
29
+
30
+# do search-request
31
def request(query, params):
    """Build the search URL for ``query`` and store it in ``params``.

    The page number is read from ``params['pageno']`` and sent as ``page``,
    the search terms are sent as ``terms``. The finished URL is written to
    ``params['url']`` and the same ``params`` mapping is returned.
    """
    url_args = {'page': params['pageno'],
                'terms': query}
    encoded_args = urlencode(url_args)
    params['url'] = search_url.format(query=encoded_args)
    return params
36
+
37
+
38
+# get response from search-request
39
def response(resp):
    """Parse a Tokyo Toshokan result page into a list of result dicts.

    ``resp.text`` is parsed as HTML. Each torrent is expected to span two
    consecutive ``category_0`` rows of the ``listing`` table: the first row
    holds the links (optional magnet link, then the torrent link), the
    second row holds a ``|``-separated description and the seed/leech stats.

    Every result has ``template``, ``url`` and ``title``; ``magnetlink``,
    ``filesize``, ``publishedDate``, ``content``, ``seed`` and ``leech`` are
    added only when they can be parsed. An empty list is returned when no
    rows are found or the row count is odd (unexpected layout).
    """
    dom = html.fromstring(resp.text)
    rows = dom.xpath('//table[@class="listing"]//tr[contains(@class, "category_0")]')

    # check if there are no results or page layout was changed so we cannot parse it
    # currently there are two rows for each result, so total count must be even
    if not rows or len(rows) % 2 != 0:
        return []

    # regular expression for parsing torrent size strings, e.g. "Size: 1.228GB";
    # raw string so that \s and \d are not treated as string escapes
    size_re = re.compile(r'Size:\s*([\d.]+)(TB|GB|MB|KB|B)', re.IGNORECASE)

    results = []

    # processing the results, two rows at a time
    for name_row, info_row in zip(rows[::2], rows[1::2]):
        # parse the first row
        links = name_row.xpath('./td[@class="desc-top"]/a')
        if not links:
            # no link means no url and no title: nothing useful to report
            continue

        result = {
            'template': 'torrent.html',
            'url': links[-1].attrib.get('href'),
            'title': extract_text(links[-1])
        }

        # a magnet link is optional; when present it precedes the torrent link
        if len(links) == 2:
            magnet = links[0].attrib.get('href')
            # the anchor may lack an href altogether, hence the truthiness check
            if magnet and magnet.startswith('magnet'):
                result['magnetlink'] = magnet

        # no more info in the first row, start parsing the second one
        desc_cells = info_row.xpath('./td[@class="desc-bot"]')
        desc = extract_text(desc_cells[0]) if desc_cells else ''
        for item in (desc or '').split('|'):
            item = item.strip()
            if item.startswith('Size:'):
                match = size_re.match(item)
                if match is None:
                    # unknown unit or malformed number: leave filesize out
                    continue
                # ('1.228', 'GB')
                number, unit = match.groups()
                try:
                    multiplier = get_filesize_mul(unit)
                    result['filesize'] = int(multiplier * float(number))
                except Exception:
                    # deliberately best-effort: float() rejects strings such
                    # as "1.2.3", and get_filesize_mul is defined elsewhere
                    # and may not know every unit matched above
                    pass
            elif item.startswith('Date:'):
                try:
                    # Date: 2016-02-21 21:44 UTC
                    result['publishedDate'] = datetime.strptime(item, 'Date: %Y-%m-%d %H:%M UTC')
                except ValueError:
                    # unparsable date: leave publishedDate out
                    pass
            elif item.startswith('Comment:'):
                result['content'] = item

        stats = info_row.xpath('./td[@class="stats"]/span')
        # has the layout not changed yet? (three spans expected, the first
        # two being seeders and leechers)
        if len(stats) == 3:
            result['seed'] = int_or_zero(extract_text(stats[0]))
            result['leech'] = int_or_zero(extract_text(stats[1]))

        results.append(result)

    return results

+ 6
- 0
searx/settings.yml View File

@@ -271,6 +271,12 @@ engines:
271 271
     shortcut : sw
272 272
     disabled : True
273 273
 
274
+  - name : tokyotoshokan
275
+    engine : tokyotoshokan
276
+    shortcut : tt
277
+    timeout : 6.0
278
+    disabled : True
279
+
274 280
   - name : torrentz
275 281
     engine : torrentz
276 282
     timeout : 5.0

+ 110
- 0
tests/unit/engines/test_tokyotoshokan.py View File

@@ -0,0 +1,110 @@
1
+import mock
2
+from collections import defaultdict
3
+from searx.engines import tokyotoshokan
4
+from searx.testing import SearxTestCase
5
+from datetime import datetime
6
+
7
+
8
class TestTokyotoshokanEngine(SearxTestCase):
    """Unit tests for the Tokyo Toshokan engine's request/response hooks."""

    def test_request(self):
        """request() must produce a URL containing the query and the site host."""
        query = 'test_query'
        dic = defaultdict(dict)
        dic['pageno'] = 1
        params = tokyotoshokan.request(query, dic)
        self.assertIn('url', params)
        self.assertIn(query, params['url'])
        self.assertIn('tokyotosho.info', params['url'])

    def test_response(self):
        """response() must parse well-formed rows and tolerate malformed ones."""
        # a page without the listing table yields no results
        resp = mock.Mock(text='<html></html>')
        self.assertEqual(tokyotoshokan.response(resp), [])

        html = """
        <table class="listing">
          <tbody>
            <tr class="shade category_0">
              <td rowspan="2">
                <a href="/?cat=7"><span class="sprite_cat-raw"></span></a>
              </td>
              <td class="desc-top">
                <a href="magnet:?xt=urn:btih:4c19eb46b5113685fbd2288ed2531b0b">
                  <span class="sprite_magnet"></span>
                </a>
                <a rel="nofollow" type="application/x-bittorrent" href="http://www.nyaa.se/f">
                  Koyomimonogatari
                </a>
              </td>
              <td class="web"><a rel="nofollow" href="details.php?id=975700">Details</a></td>
            </tr>
            <tr class="shade category_0">
              <td class="desc-bot">
                Authorized: <span class="auth_ok">Yes</span>
                Submitter: <a href="?username=Ohys">Ohys</a> |
                Size: 10.5MB |
                Date: 2016-03-26 16:41 UTC |
                Comment: sample comment
              </td>
              <td style="color: #BBB; font-family: monospace" class="stats" align="right">
                S: <span style="color: red">53</span>
                L: <span style="color: red">18</span>
                C: <span style="color: red">0</span>
                ID: 975700
              </td>
            </tr>

            <tr class="category_0">
              <td rowspan="2">
                <a href="/?cat=7"><span class="sprite_cat-raw"></span></a>
              </td>
              <td class="desc-top">
                <a rel="nofollow" type="application/x-bittorrent" href="http://google.com/q">
                  Owarimonogatari
                </a>
              </td>
              <td class="web"><a rel="nofollow" href="details.php?id=975700">Details</a></td>
            </tr>
            <tr class="category_0">
              <td class="desc-bot">
                Submitter: <a href="?username=Ohys">Ohys</a> |
                Size: 932.84EB |
                Date: QWERTY-03-26 16:41 UTC
              </td>
              <td style="color: #BBB; font-family: monospace" class="stats" align="right">
                S: <span style="color: red">0</span>
              </td>
            </tr>
          </tbody>
        </table>
        """

        resp = mock.Mock(text=html)
        results = tokyotoshokan.response(resp)

        self.assertIsInstance(results, list)
        self.assertEqual(len(results), 2)

        # testing the first result, which has correct format
        # and should have all information fields filled
        r = results[0]
        self.assertEqual(r['url'], 'http://www.nyaa.se/f')
        self.assertEqual(r['title'], 'Koyomimonogatari')
        self.assertEqual(r['magnetlink'], 'magnet:?xt=urn:btih:4c19eb46b5113685fbd2288ed2531b0b')
        self.assertEqual(r['filesize'], int(1024 * 1024 * 10.5))
        # plain ``3`` instead of ``03``: a leading zero is a syntax error on Python 3
        self.assertEqual(r['publishedDate'], datetime(2016, 3, 26, 16, 41))
        self.assertEqual(r['content'], 'Comment: sample comment')
        self.assertEqual(r['seed'], 53)
        self.assertEqual(r['leech'], 18)

        # testing the second result, which does not include magnet link,
        # seed & leech info, and has incorrect size & creation date
        r = results[1]
        self.assertEqual(r['url'], 'http://google.com/q')
        self.assertEqual(r['title'], 'Owarimonogatari')

        for missing_key in ('magnetlink', 'filesize', 'content',
                            'publishedDate', 'seed', 'leech'):
            self.assertNotIn(missing_key, r)