For the past two months I have been busy with final exams and with studying computer networking and Java concurrency, so I haven't written any blog posts. Over the last couple of days I started learning web scraping in Python and wrote a simple little crawler for practice.
The idea is to use Python's BeautifulSoup to parse the HTML page and then work with the parsed tree. The full crawler is below, but first a quick look at the parsing step itself.
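As a warm-up, here is a minimal sketch of what BeautifulSoup gives you once a page has been fetched. The URL here is only a placeholder for illustration; any reachable HTML page works the same way:

```python
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Fetch a page and parse it with the built-in html.parser backend.
# 'https://example.com' is just a placeholder URL.
html = urlopen('https://example.com')
bs = BeautifulSoup(html, 'html.parser')

# Once parsed, the document can be queried like a tree:
print(bs.title.get_text())             # the <title> text
for a in bs.find_all('a', href=True):  # every <a> tag that has an href
    print(a['href'])
```

The crawler builds on exactly this kind of `find_all` call, adding regular expressions to separate internal links from external ones: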
```python
from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random

# Seed the RNG with the current timestamp (random.seed() wants a
# hashable value such as a number; a datetime object itself is not
# accepted in newer Python versions).
random.seed(datetime.datetime.now().timestamp())

# Collect every internal link (same site) found on the page.
def getInternalLinks(bs, includeUrl):
    includeUrl = '{}://{}'.format(urlparse(includeUrl).scheme,
                                  urlparse(includeUrl).netloc)
    internalLinks = []
    # Find all links that start with "/" or contain the site's own URL.
    for link in bs.find_all('a', href=re.compile('^(/|.*' + includeUrl + ')')):
        href = link.attrs['href']
        if href is not None and href not in internalLinks:
            if href.startswith('/'):
                internalLinks.append(includeUrl + href)
            else:
                internalLinks.append(href)
    return internalLinks

# Collect every external link (pointing off-site) found on the page.
def getExternalLinks(bs, excludeUrl):
    externalLinks = []
    for link in bs.find_all(
            'a', href=re.compile('^(http|www|https)((?!' + excludeUrl + ').)*$')):
        href = link.attrs['href']
        if href is not None and href not in externalLinks:
            externalLinks.append(href)
    return externalLinks

def getRandomLink(link):
    html = urlopen(link)
    bs = BeautifulSoup(html, 'html.parser')
    externalLinks = getExternalLinks(bs, urlparse(link).netloc)
    # If there are no external links, fall back to internal links.
    if len(externalLinks) == 0:
        print('No external links; looking for internal links')
        internalLinks = getInternalLinks(bs, link)
        if len(internalLinks) == 0:
            print('No internal links either; stopping')
            return None
        # Follow a random internal link and keep looking from there.
        return getRandomLink(random.choice(internalLinks))
    else:
        return random.choice(externalLinks)

def findRandomLink(link):
    externalLink = getRandomLink(link)
    # Stop once a page yields no links at all (getRandomLink returned None);
    # otherwise the next call would crash on urlopen(None).
    if externalLink is None:
        return
    print(externalLink)
    findRandomLink(externalLink)

findRandomLink('https://baidu.com')
```
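One caveat with this random walk: it stops with an exception at the first unreachable or non-HTML page, because `urlopen` raises on HTTP and network errors. Below is a minimal hardening sketch built on the `getRandomLink` function above; the `findRandomLinkSafely` wrapper and its `max_depth` parameter are my own hypothetical additions, not part of the original script:

```python
from urllib.error import HTTPError, URLError

def findRandomLinkSafely(link, max_depth=20):
    # Hypothetical hardened wrapper around getRandomLink: it catches
    # fetch errors and caps the recursion depth so the walk always ends.
    if max_depth == 0:
        print('Reached maximum depth; stopping')
        return
    try:
        externalLink = getRandomLink(link)
    except (HTTPError, URLError) as e:
        print('Failed to fetch {}: {}'.format(link, e))
        return
    if externalLink is None:
        return
    print(externalLink)
    findRandomLinkSafely(externalLink, max_depth - 1)

findRandomLinkSafely('https://baidu.com')
```

The depth cap also sidesteps Python's recursion limit, which the original version would eventually hit on a long enough chain of links.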