Browse Source

Merge pull request #208 from pointhi/new_engines

add 1x.com engine, improve yacy-engine
Adam Tauber 10 years ago
parent
commit
03137eebd9
5 changed files with 157 additions and 13 deletions
  1. 82
    0
      searx/engines/www1x.py
  2. 12
    13
      searx/engines/yacy.py
  3. 5
    0
      searx/settings.yml
  4. 57
    0
      searx/tests/engines/test_www1x.py
  5. 1
    0
      searx/tests/test_engines.py

+ 82
- 0
searx/engines/www1x.py View File

@@ -0,0 +1,82 @@
1
+## 1x (Images)
2
+#
3
+# @website     http://1x.com/
4
+# @provide-api no
5
+#
6
+# @using-api   no
7
+# @results     HTML
8
+# @stable      no (HTML can change)
9
+# @parse       url, title, thumbnail, img_src, content
10
+
11
+
12
+from urllib import urlencode
13
+from urlparse import urljoin
14
+from lxml import html
15
+import string
16
+import re
17
+
18
+# engine dependent config
19
+categories = ['images']
20
+paging = False
21
+
22
+# search-url
23
+base_url = 'http://1x.com'
24
+search_url = base_url+'/backend/search.php?{query}'
25
+
26
+
27
+# do search-request
28
+def request(query, params):
29
+    params['url'] = search_url.format(query=urlencode({'q': query}))
30
+
31
+    return params
32
+
33
+
34
+# get response from search-request
35
+def response(resp):
36
+    results = []
37
+
38
+    # get links from result-text
39
+    regex = re.compile('(</a>|<a)')
40
+    results_parts = re.split(regex, resp.text)
41
+
42
+    cur_element = ''
43
+
44
+    # iterate over link parts
45
+    for result_part in results_parts:
46
+        # handle start and end of link
47
+        if result_part == '<a':
48
+            cur_element = result_part
49
+            continue
50
+        elif result_part != '</a>':
51
+            cur_element += result_part
52
+            continue
53
+
54
+        cur_element += result_part
55
+
56
+        # fix xml-error
57
+        cur_element = string.replace(cur_element, '"></a>', '"/></a>')
58
+
59
+        dom = html.fromstring(cur_element)
60
+        link = dom.xpath('//a')[0]
61
+
62
+        url = urljoin(base_url, link.attrib.get('href'))
63
+        title = link.attrib.get('title', '')
64
+
65
+        thumbnail_src = urljoin(base_url, link.xpath('.//img')[0].attrib['src'])
66
+        # TODO: get image with higher resolution
67
+        img_src = thumbnail_src
68
+
69
+        # check if url is pointing to a photo
70
+        if '/photo/' not in url:
71
+            continue
72
+
73
+        # append result
74
+        results.append({'url': url,
75
+                        'title': title,
76
+                        'img_src': img_src,
77
+                        'content': '',
78
+                        'thumbnail_src': thumbnail_src,
79
+                        'template': 'images.html'})
80
+
81
+    # return results
82
+    return results

+ 12
- 13
searx/engines/yacy.py View File

@@ -68,9 +68,18 @@ def response(resp):
68 68
 
69 69
     search_results = raw_search_results.get('channels', {})[0].get('items', [])
70 70
 
71
-    if resp.search_params['category'] == 'general':
71
+    for result in search_results:
72
+        # parse image results
73
+        if result.get('image'):
74
+            # append result
75
+            results.append({'url': result['url'],
76
+                            'title': result['title'],
77
+                            'content': '',
78
+                            'img_src': result['image'],
79
+                            'template': 'images.html'})
80
+
72 81
         # parse general results
73
-        for result in search_results:
82
+        else:
74 83
             publishedDate = parser.parse(result['pubDate'])
75 84
 
76 85
             # append result
@@ -79,17 +88,7 @@ def response(resp):
79 88
                             'content': result['description'],
80 89
                             'publishedDate': publishedDate})
81 90
 
82
-    elif resp.search_params['category'] == 'images':
83
-        # parse image results
84
-        for result in search_results:
85
-            # append result
86
-            results.append({'url': result['url'],
87
-                            'title': result['title'],
88
-                            'content': '',
89
-                            'img_src': result['image'],
90
-                            'template': 'images.html'})
91
-
92
-    #TODO parse video, audio and file results
91
+        #TODO parse video, audio and file results
93 92
 
94 93
     # return results
95 94
     return results

+ 5
- 0
searx/settings.yml View File

@@ -83,6 +83,11 @@ engines:
83 83
     engine : www500px
84 84
     shortcut : px
85 85
 
86
+  - name : 1x
87
+    engine : www1x
88
+    shortcut : 1x
89
+    disabled : True
90
+
86 91
   - name : flickr
87 92
     categories : images
88 93
     shortcut : fl

+ 57
- 0
searx/tests/engines/test_www1x.py View File

@@ -0,0 +1,57 @@
1
+from collections import defaultdict
2
+import mock
3
+from searx.engines import www1x
4
+from searx.testing import SearxTestCase
5
+
6
+
7
+class TestWww1xEngine(SearxTestCase):
8
+
9
+    def test_request(self):
10
+        query = 'test_query'
11
+        params = www1x.request(query, defaultdict(dict))
12
+        self.assertTrue('url' in params)
13
+        self.assertTrue(query in params['url'])
14
+        self.assertTrue('1x.com' in params['url'])
15
+
16
+    def test_response(self):
17
+        self.assertRaises(AttributeError, www1x.response, None)
18
+        self.assertRaises(AttributeError, www1x.response, [])
19
+        self.assertRaises(AttributeError, www1x.response, '')
20
+        self.assertRaises(AttributeError, www1x.response, '[]')
21
+
22
+        response = mock.Mock(text='<html></html>')
23
+        self.assertEqual(www1x.response(response), [])
24
+        html = """
25
+        <?xml version="1.0" encoding="UTF-8"?><!DOCTYPE characters
26
+        [
27
+        <!ELEMENT characters (character*) >
28
+        <!ELEMENT character  (#PCDATA   ) >
29
+
30
+        <!ENTITY iexcl   "&#161;" >
31
+        <!ENTITY cent    "&#162;" >
32
+        <!ENTITY pound   "&#163;" >
33
+        ]
34
+        ><root><searchresult><![CDATA[<table border="0" cellpadding="0" cellspacing="0" width="100%">
35
+        <tr>
36
+            <td style="min-width: 220px;" valign="top">
37
+                <div style="font-size: 30px; margin: 0px 0px 20px 0px;">Photos</div>
38
+                <div>
39
+                    <a href="/photo/123456" class="dynamiclink">
40
+<img border="0" class="searchresult" src="/images/user/testimage-123456.jpg" style="width: 125px; height: 120px;">
41
+                    </a>
42
+                    <a title="sjoerd lammers street photography" href="/member/sjoerdlammers" class="dynamiclink">
43
+<img border="0" class="searchresult" src="/images/profile/60c48b394c677d2fa4d9e7d263aabf44-square.jpg">
44
+                    </a>
45
+                </div>
46
+            </td>
47
+        </table>
48
+        ]]></searchresult></root>
49
+        """
50
+        response = mock.Mock(text=html)
51
+        results = www1x.response(response)
52
+        self.assertEqual(type(results), list)
53
+        self.assertEqual(len(results), 1)
54
+        self.assertEqual(results[0]['url'], 'http://1x.com/photo/123456')
55
+        self.assertEqual(results[0]['thumbnail_src'], 'http://1x.com/images/user/testimage-123456.jpg')
56
+        self.assertEqual(results[0]['content'], '')
57
+        self.assertEqual(results[0]['template'], 'images.html')

+ 1
- 0
searx/tests/test_engines.py View File

@@ -1,2 +1,3 @@
1 1
 from searx.tests.engines.test_dummy import *  # noqa
2 2
 from searx.tests.engines.test_github import *  # noqa
3
+from searx.tests.engines.test_www1x import *  # noqa