A Small Python Crawler Example

I've spent the last two months busy with final-exam review and with studying computer networks and Java concurrency, so I haven't written any blog posts. Over the past couple of days I started learning web crawling with Python and wrote a simple little crawler for practice.

The idea is to parse each HTML page with Python's BeautifulSoup and then work with the parsed result: collect the page's external links and jump to a random one, falling back to internal links when a page has no external links. The full code is as follows:

from urllib.request import urlopen
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import re
import datetime
import random

# Seed the generator with the current timestamp; recent Python versions
# only accept int/float/str/bytes as a seed, not a datetime object
random.seed(datetime.datetime.now().timestamp())

# Collect a list of all internal links found on the page
def getInternalLinks(bs, includeUrl):
    includeUrl = '{}://{}'.format(urlparse(includeUrl).scheme, urlparse(includeUrl).netloc)
    internalLinks = []
    # Find all links that begin with "/" or that contain the site's own URL
    # (re.escape keeps the dots in the domain from acting as regex wildcards)
    for link in bs.find_all('a', href=re.compile('^(/|.*' + re.escape(includeUrl) + ')')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if link.attrs['href'].startswith('/'):
                    internalLinks.append(includeUrl + link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks

# Collect a list of all external links found on the page
def getExternalLinks(bs, excludeUrl):
    externalLinks = []
    # Find all links that start with "http" or "www" and do not contain the current domain
    for link in bs.find_all('a', href=re.compile('^(http|www|https)((?!' + re.escape(excludeUrl) + ').)*$')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

def getRandomLink(link):
    html = urlopen(link)
    bs = BeautifulSoup(html, 'html.parser')
    externalLinks = getExternalLinks(bs, urlparse(link).netloc)
    # If the page has no external links, fall back to its internal links
    if len(externalLinks) == 0:
        print('No external links; looking for internal links')
        internalLinks = getInternalLinks(bs, link)
        if len(internalLinks) == 0:
            print('No internal links either; stopping the search')
            return None
        # Recurse with a random internal link as the next page to search
        return getRandomLink(random.choice(internalLinks))
    else:
        return random.choice(externalLinks)

def findRandomLink(link):
    externalLink = getRandomLink(link)
    # getRandomLink returns None when it runs out of links; stop instead of crashing
    if externalLink is None:
        return
    print('{}'.format(externalLink))
    findRandomLink(externalLink)

findRandomLink('https://baidu.com')
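One caveat about this version: findRandomLink recurses once per page it visits, so a long random walk will eventually hit Python's recursion limit, and a dead or malformed link makes urlopen raise an uncaught exception. Below is a minimal iterative sketch of the same walk. It reuses the getInternalLinks/getExternalLinks helpers above; the maxHops cap and the browser-like User-Agent header are my own assumptions rather than part of the original script:

from urllib.request import Request, urlopen
from urllib.error import URLError
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import random

def walkRandomLinks(startLink, maxHops=50):
    # Iterative version of findRandomLink: no recursion-depth limit,
    # and a failed fetch ends the walk instead of crashing it
    link = startLink
    for _ in range(maxHops):  # maxHops is an assumed safety cap
        try:
            req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
            html = urlopen(req)
        except (URLError, ValueError) as e:
            # URLError covers HTTP errors and unreachable hosts;
            # ValueError covers links like 'www...' that are not full URLs
            print('Failed to fetch {}: {}'.format(link, e))
            return
        bs = BeautifulSoup(html, 'html.parser')
        externalLinks = getExternalLinks(bs, urlparse(link).netloc)
        if len(externalLinks) == 0:
            print('No external links; looking for internal links')
            internalLinks = getInternalLinks(bs, link)
            if len(internalLinks) == 0:
                print('No internal links either; stopping the search')
                return
            link = random.choice(internalLinks)
        else:
            link = random.choice(externalLinks)
            print(link)

walkRandomLinks('https://baidu.com')

Sending a User-Agent header also matters in practice: some sites reject urllib's default one outright, which would otherwise cut the walk short with a 403.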
