This post walks through adding cache support to a link crawler; I hope it serves as a useful reference.

Overview

The script below is a Python 2 crawler that follows links matching a regular expression, respects robots.txt, throttles requests per domain, and checks an optional cache before downloading.

# -*- coding: UTF-8 -*-
# Note: a "name is not defined" error when creating an object or calling a
# module-level function usually means the class or function was defined after
# the code that uses it; definitions must come before the main block.
import urlparse
import urllib2
import random
import time
from datetime import datetime
import socket
import robotparser
import csv
import re
import lxml.html

DEFAULT_AGENT = 'wswp'
DEFAULT_DELAY = 5
DEFAULT_RETRIES = 1
DEFAULT_TIMEOUT = 60

def link_crawler(seed_url, link_regex=None, max_depth=-1, max_urls=-1,
                 user_agent='wswp', scrape_callback=None, cache=None):
    """Crawl from the given seed URL following links matched by link_regex"""
    # the queue of URLs that still need to be crawled
    crawl_queue = [seed_url]
    # the URLs that have been seen, and at what depth
    seen = {seed_url: 0}
    # track how many URLs have been downloaded
    num_urls = 0
    rp = get_robots(seed_url)  # fetch and parse this site's robots.txt
    # pass the supplied cache through to the Downloader
    D = Downloader(DEFAULT_DELAY, DEFAULT_AGENT, None, DEFAULT_RETRIES,
                   DEFAULT_TIMEOUT, None, cache)
    while crawl_queue:
        url = crawl_queue.pop()  # pop the most recently added URL (depth-first order)
        depth = seen[url]
        # check that the URL passes the robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            html = D(url)
            links = []
            if scrape_callback:
                links.extend(scrape_callback(url, html) or [])
            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html)
                                 if re.match(link_regex, link))
                for link in links:
                    link = normalize(seed_url, link)  # resolve to an absolute link
                    # check whether this link has already been crawled
                    if link not in seen:
                        seen[link] = depth + 1
                        # check that the link is within the same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to the queue
                            crawl_queue.append(link)
            # check whether the download maximum has been reached
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print 'Blocked by robots.txt:', url

def get_robots(url):
    """Initialize a robots.txt parser for this domain"""
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))  # absolute URL of robots.txt
    rp.read()
    return rp

def normalize(seed_url, link):
    """Normalize this URL by removing the hash fragment and adding the domain"""
    # urldefrag(url) splits a URL into (url without fragment, fragment);
    # removing the fragment avoids treating duplicates as distinct pages
    link, _ = urlparse.urldefrag(link)
    return urlparse.urljoin(seed_url, link)  # resolve to an absolute link
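
# For illustration (comment only, so the listing stays runnable):
#   normalize('http://example.webscraping.com', '/index/1#top')
# strips the fragment and resolves the relative path, returning
#   'http://example.webscraping.com/index/1'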

def same_domain(url1, url2):
    """Return True if both URLs belong to the same domain"""
    # urlparse() splits a URL into a 6-tuple of components; netloc is the host
    return urlparse.urlparse(url1).netloc == urlparse.urlparse(url2).netloc

def get_links(html):
    """Return a list of links from this HTML"""
    # re.compile() turns the pattern string into a Pattern object; its
    # findall() then returns every captured href value in the page
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)
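
# A quick sanity check of the extractor (comment only): given
#   html = '<a href="/index/1">Next</a>'
# get_links(html) returns ['/index/1']. Being regex-based, this is only a
# sketch of link extraction; unquoted or oddly formatted href attributes
# will be missed.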

class Downloader:
    def __init__(self, delay=DEFAULT_DELAY, user_agent=DEFAULT_AGENT,
                 proxies=None, num_retries=DEFAULT_RETRIES,
                 timeout=DEFAULT_TIMEOUT, opener=None, cache=None):
        socket.setdefaulttimeout(timeout)
        self.throttle = Throttle(delay)
        self.user_agent = user_agent
        self.proxies = proxies
        self.num_retries = num_retries
        self.opener = opener
        self.cache = cache

    def __call__(self, url):
        result = None
        if self.cache:
            try:
                result = self.cache[url]
            except KeyError:
                # url is not available in cache
                pass
            else:
                if self.num_retries > 0 and 500 <= result['code'] < 600:
                    # server error, so ignore the cached result and re-download
                    result = None
        if result is None:
            # result was not loaded from cache, so still need to download
            self.throttle.wait(url)
            proxy = random.choice(self.proxies) if self.proxies else None
            headers = {'User-agent': self.user_agent}
            result = self.download(url, headers, proxy=proxy,
                                   num_retries=self.num_retries)
            if self.cache:
                # save result to cache
                self.cache[url] = result
        return result['html']

    def download(self, url, headers, proxy, num_retries, data=None):
        print 'Downloading:', url
        request = urllib2.Request(url, data, headers or {})
        opener = self.opener or urllib2.build_opener()
        if proxy:
            proxy_params = {urlparse.urlparse(url).scheme: proxy}
            opener.add_handler(urllib2.ProxyHandler(proxy_params))
        try:
            response = opener.open(request)
            html = response.read()
            code = response.code
        except Exception as e:
            print 'Download error:', str(e)
            html = ''
            if hasattr(e, 'code'):
                code = e.code
                if num_retries > 0 and 500 <= code < 600:
                    # retry 5XX HTTP errors
                    return self.download(url, headers, proxy, num_retries - 1, data)
            else:
                code = None
        return {'html': html, 'code': code}

class ScrapeCallback:
    def __init__(self):
        self.writer = csv.writer(open('countries.csv', 'w'))
        self.fields = ('area', 'population', 'iso', 'country', 'capital',
                       'continent', 'tld', 'currency_code', 'currency_name',
                       'phone', 'postal_code_format', 'postal_code_regex',
                       'languages')
        self.writer.writerow(self.fields)

    def __call__(self, url, html):
        if re.search('view', url):
            tree = lxml.html.fromstring(html)
            row = []
            for field in self.fields:
                row.append(tree.cssselect(
                    'table > tr#places_{}__row > td.w2p_fw'.format(field)
                )[0].text_content())
            self.writer.writerow(row)

class Throttle:
    """Throttle downloading by sleeping between requests to the same domain"""
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        """Delay if this domain has been accessed recently"""
        domain = urlparse.urlsplit(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()
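
# Behaviour sketch (comment only): with Throttle(5), a second wait() call for
# the same domain sleeps until roughly 5 seconds have elapsed since the first
# access; separate domains are timed independently.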

if __name__ == '__main__':
    link_crawler('http://example.webscraping.com', '/(index|view)',
                 scrape_callback=ScrapeCallback())

Sample output from a run:

C:\Anaconda\python.exe C:/Users/Administrator/PythonWorkSpace/python04.py
Downloading: http://example.webscraping.com
Downloading: http://example.webscraping.com/index/1
Downloading: http://example.webscraping.com/index/2
Process finished with exit code 1
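
One thing the listing leaves open is what the cache object actually is: the Downloader only indexes it like a dictionary (cache[url] raising KeyError on a miss, cache[url] = result to store), so any dict-like object works. Below is a minimal sketch of such a cache that pickles each result to disk; the DiskCache name and the cache_dir layout are my own illustration, not part of the original listing.

import os
import pickle
import urllib

class DiskCache:
    """Minimal dict-like cache: one pickle file per URL."""
    def __init__(self, cache_dir='cache'):
        self.cache_dir = cache_dir
        if not os.path.exists(cache_dir):
            os.makedirs(cache_dir)

    def url_to_path(self, url):
        # percent-encode the URL so it is safe to use as a filename
        return os.path.join(self.cache_dir, urllib.quote(url, safe=''))

    def __getitem__(self, url):
        path = self.url_to_path(url)
        if not os.path.exists(path):
            raise KeyError(url + ' is not cached')
        with open(path, 'rb') as fp:
            return pickle.load(fp)

    def __setitem__(self, url, result):
        with open(self.url_to_path(url), 'wb') as fp:
            pickle.dump(result, fp)

With something like this defined, the crawler can be invoked with caching enabled, and a second run will then serve pages from disk instead of re-downloading them:

link_crawler('http://example.webscraping.com', '/(index|view)',
             scrape_callback=ScrapeCallback(), cache=DiskCache())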
 
