|
|
|
|
24
|
url = urljoin(base_url, link.attrib.get('href'))
|
24
|
url = urljoin(base_url, link.attrib.get('href'))
|
25
|
parsed_url = urlparse(url)
|
25
|
parsed_url = urlparse(url)
|
26
|
|
26
|
|
27
|
- if parsed_url.netloc.find('google.com') >= 0:
|
|
|
|
|
27
|
+ if parsed_url.netloc.find('www.google.com') >= 0:
|
28
|
continue
|
28
|
continue
|
29
|
title = ' '.join(link.xpath('.//text()'))
|
29
|
title = ' '.join(link.xpath('.//text()'))
|
30
|
content = escape(' '.join(result.xpath('.//p[@class="desc"]//text()')))
|
30
|
content = escape(' '.join(result.xpath('.//p[@class="desc"]//text()')))
|