2021-05-18

There are many ways to crawl a website; which approach is most suitable depends on the structure of the target site.

Common approaches to crawling a website:

1. Crawling the sitemap

2. Iterating over each page's database ID (a sketch of this approach appears after the sitemap crawler below)

3. Following page links


import re
import time
import datetime
import urllib.request, urllib.error
from urllib import robotparser
from urllib.parse import urljoin, urlparse


def download(url, user_agent='wswp', num_retries=2):
    '''
    Download a web page
    :param url: URL to download
    :param user_agent: user agent string sent with the request
    :param num_retries: number of times to retry on server error
    :return: page content, or None on failure
    '''
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    try:
        html = urllib.request.urlopen(request).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:  # retry the download
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # only retry on 5xx server errors
                return download(url, user_agent, num_retries - 1)
    return html

print(download('http://httpstat.us/500'))   # test retrying on a server error
print(download('http://www.meetup.com'))    # test the user agent
def crawl_sitemap(url):
    '''
    Sitemap crawler
    :param url: URL of the sitemap
    :return:
    '''
    sitemap = download(url)
    # download() returns bytes, so decode before applying a text regex
    links = re.findall('<loc>(.*?)</loc>', sitemap.decode('utf-8'))
    for link in links:
        html = download(link)

crawl_sitemap('http://example.webscraping.com/sitemap.xml')
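
Approach 2 from the list above, iterating over each page's database ID, never appears in code in these notes, so here is a minimal sketch of it that reuses the download() helper defined earlier. The '/view/-<id>' URL pattern and the rule of stopping after five consecutive failed IDs are illustrative assumptions, not something taken from the target site.

import itertools

def crawl_ids(max_errors=5):
    '''Iterate over numeric database IDs until several consecutive pages fail'''
    num_errors = 0
    for page_id in itertools.count(1):
        # assumed URL pattern; adjust to the target site's actual scheme
        url = 'http://example.webscraping.com/view/-%d' % page_id
        html = download(url)
        if html is None:
            num_errors += 1
            if num_errors == max_errors:
                break  # too many consecutive failures: assume we ran out of IDs
        else:
            num_errors = 0  # success, so reset the error counter

crawl_ids()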

Extracting a page's links

def get_links(html):
    '''Return a list of links from html'''
    # regex to extract the href value from every <a> tag
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    return webpage_regex.findall(html)


def link_crawler(seed_url, link_regex):
    '''Crawl from the given seed url following links matched by link_regex'''
    crawl_queue = [seed_url]
    seen = set(crawl_queue)  # keep track of which URLs have already been seen
    while crawl_queue:
        url = crawl_queue.pop()
        html = download(url)
        if html is None:
            continue
        for link in get_links(html.decode('utf-8')):
            if re.match(link_regex, link):
                link = urljoin(seed_url, link)  # resolve relative links
                if link not in seen:    # check if we have already seen this link
                    seen.add(link)
                    crawl_queue.append(link)

link_crawler('http://example.webscraping.com', '/(index|view)')

Parsing robots.txt

def link_crawler(seed_url, link_regex, user_agent='wswp'):
    '''Crawl from the given seed url following links matched by link_regex'''
    crawl_queue = [seed_url]
    seen = set(crawl_queue)  # keep track of which URLs have already been seen
    # parse the site's robots.txt once, before crawling
    rp = robotparser.RobotFileParser()
    rp.set_url(urljoin(seed_url, '/robots.txt'))
    rp.read()
    while crawl_queue:
        url = crawl_queue.pop()
        if rp.can_fetch(user_agent, url):  # respect robots.txt
            html = download(url)
            if html is None:
                continue
            for link in get_links(html.decode('utf-8')):
                if re.match(link_regex, link):
                    link = urljoin(seed_url, link)
                    if link not in seen:  # check if we have already seen this link
                        seen.add(link)
                        crawl_queue.append(link)
        else:
            print('Blocked by robots.txt:', url)
rp = robotparser.RobotFileParser()
rp.set_url('http://example.webscraping.com/robots.txt')
rp.read()

result = rp.can_fetch('GoodCrawler', 'http://example.webscraping.com')
print(result)

Proxy support

def download(url, user_agent='wswp', proxy=None, num_retries=2):
    '''
    Download a web page
    :param url: URL to download
    :param user_agent: user agent string sent with the request
    :param proxy: optional proxy server, e.g. 'http://host:port'
    :param num_retries: number of times to retry on server error
    :return: page content, or None on failure
    '''
    print('Downloading:', url)
    headers = {'User-agent': user_agent}
    request = urllib.request.Request(url, headers=headers)

    opener = urllib.request.build_opener()
    if proxy:
        proxy_params = {urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        # use the opener so the proxy handler (if any) is applied
        html = opener.open(request).read()
    except urllib.error.URLError as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:  # retry the download
            if hasattr(e, 'code') and 500 <= e.code < 600:
                return download(url, user_agent, proxy, num_retries - 1)
    return html
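
A quick usage sketch of the proxy path; the proxy address below is a hypothetical placeholder and must be replaced with a real proxy server.

# hypothetical local proxy address, for illustration only
html = download('http://example.webscraping.com', proxy='http://127.0.0.1:8080')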

Throttling downloads

class Throttle:
    '''Add a delay between downloads to the same domain'''
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        domain = urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                # domain has been accessed recently,
                # so we need to sleep
                time.sleep(sleep_secs)
        # update the last accessed time
        self.domains[domain] = datetime.datetime.now()

throttle = Throttle(delay)   # create once, with the chosen delay in seconds
throttle.wait(url)           # then call before each request
result = download(url, user_agent, proxy=proxy, num_retries=num_retries)
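
To show how Throttle fits into the crawler, here is a minimal sketch wiring it into the link crawler defined above. It assumes the download(), get_links() and Throttle definitions from earlier in these notes; the 5-second delay is an arbitrary example value.

def throttled_link_crawler(seed_url, link_regex, user_agent='wswp', delay=5):
    '''Link crawler that rate-limits requests to each domain using Throttle'''
    throttle = Throttle(delay)
    crawl_queue = [seed_url]
    seen = set(crawl_queue)
    while crawl_queue:
        url = crawl_queue.pop()
        throttle.wait(url)  # sleep if this domain was accessed too recently
        html = download(url, user_agent)
        if html is None:
            continue
        for link in get_links(html.decode('utf-8')):
            if re.match(link_regex, link):
                link = urljoin(seed_url, link)
                if link not in seen:
                    seen.add(link)
                    crawl_queue.append(link)

throttled_link_crawler('http://example.webscraping.com', '/(index|view)', delay=5)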

 
