I worked through w3cschool's Python 3 scraping tutorial (Python3 爬虫实战教程_w3cschool).
This is my first scraper. The practice target is Biquge (笔趣阁, http://www.ibiqu.net/); after all, they scrape other people's content too ^_^!
I'm posting the source code here for beginners like me to reference. The code is a bit messy: everything sits in one big loop with no function definitions (def) and no optimization.
Two third-party libraries need to be installed separately:
pip install beautifulsoup4
pip install requests
BeautifulSoup manual (Chinese): http://beautifulsoup.readthedocs.io/zh_CN/latest/
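
To confirm both libraries installed correctly, here is a quick sanity check (my own snippet, not from the tutorial):

from bs4 import BeautifulSoup
import requests

print(requests.__version__)  # should print the installed requests version
soup = BeautifulSoup('<div id="content"><p>hello</p></div>', 'html.parser')
print(soup.find_all('div', id='content')[0].p.string)  # prints: hello
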
from bs4 import BeautifulSoup
import requests
import re
import time

if __name__ == '__main__':
    t = 1
    while t == 1:
        bookname = input('Enter the title of the book to download: ')
        # URL of the site's built-in search engine
        target = 'http://www.ibiqu.net//modules/article/search.php?searchkey=' + bookname
        req = requests.get(url=target)
        html = req.text
        bf = BeautifulSoup(html, 'html.parser')
        texts = bf.find_all('a')
        x = 1
        for a in texts:
            if a.string == bookname:
                url_a = a.get('href')
                target2 = 'http://www.ibiqu.net' + url_a  # table-of-contents URL
                req2 = requests.get(url=target2)
                html2 = req2.text
                # Slice the TOC page between the '正文' ("main text") marker and '</dl>'
                q = re.search('正文', html2).end() + 5
                h = re.search('</dl>', html2).start()
                m = html2[q:h]
                bf2 = BeautifulSoup(m, 'html.parser')
                texts2 = bf2.find_all('a')
                print('Found ' + str(len(texts2)) + ' chapters in this book')
                n = int(input('Enter the chapter number to start from: '))  # also used as the flush counter
                path = 'D:/pydown/' + bookname + '.txt'
                f = open(path, mode='a', encoding='utf-8')  # output file
                for chapter in texts2[n - 1:]:
                    url_b = chapter.get('href')
                    name_b = chapter.string  # chapter title
                    f.write(name_b + '\n')  # write the chapter title
                    target3 = 'http://www.ibiqu.net' + url_b  # chapter URL
                    req3 = requests.get(url=target3)
                    html3 = req3.text
                    bf3 = BeautifulSoup(html3, 'html.parser')
                    d = bf3.find_all('div', id='content')
                    p0 = d[0]
                    p1 = p0.find_all('p')
                    print('Writing ' + name_b)
                    for p in p1:
                        if p.string:  # skip empty paragraphs
                            f.write(p.string + '\n')  # write the chapter text
                    n += 1
                    if n % 500 == 0:  # flush to disk every 500 chapters
                        f.close()
                        f = open(path, mode='a', encoding='utf-8')
                        print('************ flushed to disk! ************')
                    time.sleep(2)  # pause two seconds so we don't crash their server
                    # y = input('1 to break >>>')
                    # if y:
                    #     break
                print('Download finished!')
                f.close()  # close the file
                x = 0
        if x:
            print('Book not found, please enter the exact title!')
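
For anyone who wants to tidy this up, here is a rough sketch of the same flow split into functions. Just an illustration: the function names (search_book, get_chapter_links, write_chapter) are made up by me, not part of the working script above, but the URLs and parsing steps mirror it.

from bs4 import BeautifulSoup
import requests
import re

BASE = 'http://www.ibiqu.net'

def search_book(bookname):
    # Return the href of the search-result link whose text matches the title, or None.
    html = requests.get(BASE + '//modules/article/search.php?searchkey=' + bookname).text
    for a in BeautifulSoup(html, 'html.parser').find_all('a'):
        if a.string == bookname:
            return a.get('href')
    return None

def get_chapter_links(book_href):
    # Slice the TOC page between the '正文' marker and '</dl>', then collect the chapter links.
    html = requests.get(BASE + book_href).text
    start = re.search('正文', html).end() + 5
    end = re.search('</dl>', html).start()
    return BeautifulSoup(html[start:end], 'html.parser').find_all('a')

def write_chapter(f, link):
    # Fetch one chapter page and append its title and non-empty paragraphs to the open file f.
    f.write(link.string + '\n')
    html = requests.get(BASE + link.get('href')).text
    content = BeautifulSoup(html, 'html.parser').find_all('div', id='content')[0]
    for p in content.find_all('p'):
        if p.string:
            f.write(p.string + '\n')

# Usage would then shrink to something like:
#   href = search_book(bookname)
#   for link in get_chapter_links(href):
#       write_chapter(f, link)

With the parsing pulled out like this, the main loop only has to handle the prompts, the periodic flush, and the two-second sleep.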