Преглед на файлове

[enh] fix content fetching, parse published date from description

Thomas Pointhuber преди 9 години
родител
ревизия
4508c96667
променени са 2 файла, в които са добавени 40 реда и са изтрити 9 реда
  1. 37
    6
      searx/engines/startpage.py
  2. 3
    3
      searx/tests/engines/test_startpage.py

+ 37
- 6
searx/engines/startpage.py Целия файл

@@ -12,6 +12,8 @@
12 12
 
13 13
 from lxml import html
14 14
 from cgi import escape
15
+from dateutil import parser
16
+from datetime import datetime, timedelta
15 17
 import re
16 18
 from searx.engines.xpath import extract_text
17 19
 
@@ -79,15 +81,44 @@ def response(resp):
79 81
 
80 82
         title = escape(extract_text(link))
81 83
 
82
-        if result.xpath('./p[@class="desc"]'):
83
-            content = escape(extract_text(result.xpath('./p[@class="desc"]')))
84
+        if result.xpath('./p[@class="desc clk"]'):
85
+            content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
84 86
         else:
85 87
             content = ''
86 88
 
87
-        # append result
88
-        results.append({'url': url,
89
-                        'title': title,
90
-                        'content': content})
89
+        published_date = None
90
+
91
+        # check if search result starts with something like: "2 Sep 2014 ... "
92
+        if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
93
+            date_pos = content.find('...')+4
94
+            date_string = content[0:date_pos-5]
95
+            published_date = parser.parse(date_string, dayfirst=True)
96
+
97
+            # fix content string
98
+            content = content[date_pos:]
99
+
100
+        # check if search result starts with something like: "5 days ago ... "
101
+        elif re.match("^[0-9]+ days? ago \.\.\. ", content):
102
+            date_pos = content.find('...')+4
103
+            date_string = content[0:date_pos-5]
104
+
105
+            # calculate datetime
106
+            published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
107
+
108
+            # fix content string
109
+            content = content[date_pos:]
110
+
111
+        if published_date:
112
+            # append result
113
+            results.append({'url': url,
114
+                            'title': title,
115
+                            'content': content,
116
+                            'publishedDate': published_date})
117
+        else:
118
+            # append result
119
+            results.append({'url': url,
120
+                            'title': title,
121
+                            'content': content})
91 122
 
92 123
     # return results
93 124
     return results

+ 3
- 3
searx/tests/engines/test_startpage.py Целия файл

@@ -42,7 +42,7 @@ class TestStartpageEngine(SearxTestCase):
42 42
                 </a>
43 43
                 <span id='title_stars_2' name='title_stars_2'>  </span>
44 44
             </h3>
45
-            <p class='desc'>
45
+            <p class='desc clk'>
46 46
                 This should be the content.
47 47
             </p>
48 48
             <p>
@@ -78,7 +78,7 @@ class TestStartpageEngine(SearxTestCase):
78 78
                 </a>
79 79
                 <span id='title_stars_2' name='title_stars_2'>  </span>
80 80
             </h3>
81
-            <p class='desc'>
81
+            <p class='desc clk'>
82 82
                 This should be the content.
83 83
             </p>
84 84
             <p>
@@ -101,7 +101,7 @@ class TestStartpageEngine(SearxTestCase):
101 101
             <h3>
102 102
                 <span id='title_stars_2' name='title_stars_2'>  </span>
103 103
             </h3>
104
-            <p class='desc'>
104
+            <p class='desc clk'>
105 105
                 This should be the content.
106 106
             </p>
107 107
             <p>