There are many ways to crawl a website; which one fits best depends on the structure of the target site.
Common approaches to crawling a website:
1. Crawling the sitemap
2. Iterating over each page's database ID (a sketch follows the sitemap crawler below)
3. Following page links
import urllib.request, urllib.error

def download(url, user_agent='wswp', num_retries=2):
    '''
    Download a web page.
    :param url: URL to download
    :param user_agent: user agent string
    :param num_retries: number of times to retry on server error
    :return: page HTML as a string, or None on failure
    '''
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        # decode (assuming UTF-8) so the regex-based parsers below can work on str
        html = urllib.request.urlopen(request).read().decode('utf-8')
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:  # retry the download
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # retry only on 5xx server errors
                return download(url, user_agent, num_retries - 1)
    return html
print(download('http://httpstat.us/500'))   # test the retry logic
print(download('http://www.meetup.com'))    # test the user agent
import re

def crawl_sitemap(url):
    '''
    Sitemap crawler: download every page listed in the sitemap.
    :param url: sitemap URL
    :return:
    '''
    sitemap = download(url)
    # extract the page URLs listed in the sitemap's <loc> tags
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    for link in links:
        html = download(link)

crawl_sitemap('http://example.webscraping.com/sitemap.xml')
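Method 2 from the list at the top (iterating over each page's database ID) is not shown above, so here is a minimal sketch. It assumes the site serves records at sequentially numbered URLs such as http://example.webscraping.com/view/-1, -2, ... (that URL pattern is an assumption, not taken from the code above), and it gives up after several consecutive missing IDs.

import itertools

def crawl_ids(base_url, max_errors=5):
    '''Sketch of method 2: iterate over page IDs until several consecutive misses'''
    num_errors = 0
    for page_id in itertools.count(1):
        url = '{}/-{}'.format(base_url, page_id)  # assumed URL pattern
        html = download(url)
        if html is None:
            num_errors += 1
            if num_errors == max_errors:
                break  # several IDs in a row failed: assume we passed the last record
        else:
            num_errors = 0  # reset the error count after a successful download

crawl_ids('http://example.webscraping.com/view')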
Extracting the links from a page
def get_links(html):
    '''Return a list of links from html'''
    # match the href attribute of <a> tags
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)
from urllib.parse import urljoin

def link_crawler(seed_url, link_regex):
    '''Crawl from the given seed url following links matched by link_regex'''
    crawl_queue = [seed_url]
    seen = set(crawl_queue)  # URLs that have already been queued
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        for link in get_links(html):
            # only follow links that match the regex
            if re.match(link_regex, link):
                # resolve relative links against the seed URL
                link = urljoin(seed_url, link)
                if link not in seen:  # check if we have already seen this link
                    seen.add(link)
                    crawl_queue.append(link)

link_crawler('http://example.webscraping.com', '/(index|view)')
Parsing robots.txt
import urllib.robotparser

def link_crawler(seed_url, link_regex, user_agent='wswp'):
    '''Crawl from the given seed url following links matched by link_regex'''
    crawl_queue = [seed_url]
    seen = set(crawl_queue)  # URLs that have already been queued
    # parse the site's robots.txt once before crawling
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(urljoin(seed_url, '/robots.txt'))
    rp.read()
    while crawl_queue:
        url = crawl_queue.pop()
        if rp.can_fetch(user_agent, url):  # obey robots.txt
            html = download(url)
            for link in get_links(html):
                if re.match(link_regex, link):
                    link = urljoin(seed_url, link)
                    if link not in seen:  # check if we have already seen this link
                        seen.add(link)
                        crawl_queue.append(link)
        else:
            print('Blocked by robots.txt:', url)
# standalone check of the robots.txt rules
rp = urllib.robotparser.RobotFileParser()
rp.set_url('http://example.webscraping.com/robots.txt')
rp.read()
result = rp.can_fetch('GoodCrawler', 'http://example.webscraping.com')
print(result)
Proxy support
from urllib.parse import urlparse

def download(url, user_agent='wswp', proxy=None, num_retries=2):
    '''
    Download a web page, optionally through a proxy.
    :param url: URL to download
    :param user_agent: user agent string
    :param proxy: proxy address such as 'host:port', or None
    :param num_retries: number of times to retry on server error
    :return: page HTML as a string, or None on failure
    '''
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    opener = urllib.request.build_opener()
    if proxy:
        # route requests for this URL's scheme through the proxy
        proxy_params = {urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        # open via the opener so the ProxyHandler is actually used
        html = opener.open(request).read().decode('utf-8')
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:  # retry the download
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, proxy, num_retries - 1)
    return html
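For example, if an HTTP proxy happened to be listening at 127.0.0.1:8080 (a purely illustrative address, not from the text above), it could be passed in like this:

proxy = '127.0.0.1:8080'   # hypothetical proxy address, for illustration only
html = download('http://example.webscraping.com', proxy=proxy)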
Throttling downloads
import time
import datetime

class Throttle:
    '''Add a delay between downloads to the same domain'''
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        domain = urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                # domain has been accessed recently,
                # so we need to sleep
                time.sleep(sleep_secs)
        # update the last accessed time
        self.domains[domain] = datetime.datetime.now()
# inside the crawl loop: wait before each request, then download
throttle = Throttle(delay)
throttle.wait(url)
result = download(url, user_agent=user_agent, proxy=proxy, num_retries=num_retries)
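Putting the pieces together, here is a rough sketch of a link crawler that obeys robots.txt, throttles requests to the same domain, and downloads through an optional proxy. The function name throttled_link_crawler and the default delay of 1 second are assumptions for illustration, not a definitive implementation.

def throttled_link_crawler(seed_url, link_regex, user_agent='wswp',
                           delay=1, proxy=None, num_retries=2):
    '''Sketch: crawl seed_url following matching links, with robots.txt and rate limiting'''
    crawl_queue = [seed_url]
    seen = set(crawl_queue)
    throttle = Throttle(delay)
    rp = urllib.robotparser.RobotFileParser()
    rp.set_url(urljoin(seed_url, '/robots.txt'))
    rp.read()
    while crawl_queue:
        url = crawl_queue.pop()
        if not rp.can_fetch(user_agent, url):
            print('Blocked by robots.txt:', url)
            continue
        throttle.wait(url)  # rate-limit requests to the same domain
        html = download(url, user_agent=user_agent, proxy=proxy,
                        num_retries=num_retries)
        if html is None:
            continue  # skip pages that failed to download
        for link in get_links(html):
            if re.match(link_regex, link):
                link = urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)

throttled_link_crawler('http://example.webscraping.com', '/(index|view)', delay=1)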