Ver código fonte

add bing_images

Thomas Pointhuber 10 anos atrás
pai
commit
cdf74fe563
2 arquivos alterados com 86 adições e 0 exclusões
  1. 81
    0
      searx/engines/bing_images.py
  2. 5
    0
      searx/settings.yml

+ 81
- 0
searx/engines/bing_images.py Ver arquivo

@@ -0,0 +1,81 @@
1
+## Bing (Images)
2
+# 
3
+# @website     https://www.bing.com/images
4
+# @provide-api yes (http://datamarket.azure.com/dataset/bing/search), max. 5000 query/month
5
+# 
6
+# @using-api   no (because of query limit)
7
+# @results     HTML (using search portal)
8
+# @stable      no (HTML can change)
9
+# @parse       url, title, img_src
10
+#
11
+# @todo        currently up to 35 images are received per page, because bing does not honor count=10; the response is therefore limited to 10 images
12
+
13
+from urllib import urlencode
14
+from cgi import escape
15
+from lxml import html
16
+from yaml import load
17
+import re
18
+
19
+# engine dependent config
20
+categories = ['images']
21
+paging = True
22
+
23
+# search-url
24
+base_url = 'https://www.bing.com/'
25
+search_string = 'images/search?{query}&count=10&first={offset}'
26
+
27
+# do search-request
28
def request(query, params):
    """Build the Bing image-search request for one result page.

    Fills in ``params['url']`` and the ``SRCHHPGUSR`` cookie and hands the
    same ``params`` dict back to the caller.

    :param query: search terms; url-encoded as the ``q`` parameter
    :param params: request parameters; ``params['pageno']`` is read,
        ``params['cookies']`` and ``params['url']`` are written
    :returns: the updated ``params`` dict
    """
    # 10 results per page: page 1 -> first=1, page 2 -> first=11, ...
    # (assumes pageno starts at 1 -- confirm against the caller)
    offset = (params['pageno'] - 1) * 10 + 1

    # required for cookie
    language = 'en-US'

    search_path = search_string.format(
        query=urlencode({'q': query}),
        offset=offset)

    # only the language part ("en" of "en-US") goes into the cookie
    params['cookies']['SRCHHPGUSR'] = \
        'NEWWND=0&NRSLT=-1&SRCHLANG=' + language.split('-')[0]

    params['url'] = base_url + search_path

    # NOTE: the former debug print of the URL was dropped here; it wrote
    # every user query to stdout.
    return params
46
+
47
+
48
+# get response from search-request
49
def response(resp):
    """Extract image results from a Bing image-search HTML page.

    :param resp: HTTP response object; only ``resp.content`` is read
    :returns: list of result dicts for the ``images.html`` template
        (keys ``template``, ``url``, ``title``, ``content``, ``img_src``),
        capped at 10 entries
    """
    # Function-scope import: the metadata comes from a remote page, so it
    # is parsed with safe_load instead of the unrestricted yaml.load.
    from yaml import YAMLError, safe_load

    results = []

    dom = html.fromstring(resp.content)

    # The "m" attribute holds a mapping written as {key:"value",...};
    # inserting a space after each key's colon makes it parsable as YAML.
    key_pattern = re.compile(r'({|,)([a-z]+):(")')

    # parse results
    for result in dom.xpath('//div[@class="dg_u"]'):
        anchors = result.xpath('./a')
        if not anchors:
            # tile without a link: nothing to extract
            continue
        link = anchors[0]

        metadata = link.attrib.get('m')
        if not metadata:
            continue

        try:
            yaml_data = safe_load(key_pattern.sub(r'\1\2: \3', metadata))
        except YAMLError:
            # one malformed tile must not discard the rest of the page
            continue
        if not isinstance(yaml_data, dict):
            continue

        # append result
        results.append({'template': 'images.html',
                        'url': yaml_data.get('surl'),
                        'title': link.attrib.get('t1'),
                        'content': '',
                        'img_src': yaml_data.get('imgurl')})

        # the page can carry more images than requested; keep the first 10
        if len(results) >= 10:
            break

    # return results
    return results

+ 5
- 0
searx/settings.yml Ver arquivo

@@ -20,6 +20,11 @@ engines:
20 20
     locale : en-US
21 21
     shortcut : bi
22 22
 
23
+  - name : bing images
24
+    engine : bing_images
25
+    locale : en-US
26
+    shortcut : bii
27
+
23 28
   - name : bing news
24 29
     engine : bing_news
25 30
     locale : en-US