# Removing specific tags with bs (BeautifulSoup).
# url
import easygui as g
import urllib.request
from bs4 import BeautifulSoup
import os
import sys
import re
import config.story2 as urls # 获取url
def set_url():
    """Prompt for the crawl settings in an easygui multi-field dialog.

    The dialog is shown again, with the missing fields listed in its
    message, until every field whose label starts with ``*`` is non-blank
    or the dialog yields ``None``.

    Returns:
        The list of entered values, in the order of the field labels, or
        ``None`` if the dialog returned ``None`` (presumably a cancel --
        confirm against the easygui documentation).
    """
    msg = "请填写一下信息(其中带*号的项为必填项)"
    title = "爬虫练习"
    fieldNames = ["*小说目录地址", "*组装前半段", "后半段"]
    fieldValues = g.multenterbox(msg, title, fieldNames)
    # A None result ends the loop and is handed back to the caller as-is.
    while fieldValues is not None:
        # Labels starting with "*" mark required fields.
        missing = [
            name
            for name, value in zip(fieldNames, fieldValues)
            if name.strip().startswith("*") and not value.strip()
        ]
        if not missing:
            break
        errmsg = "".join("【%s】为必填项 " % name for name in missing)
        # Re-open the dialog pre-filled with what was already typed.
        fieldValues = g.multenterbox(errmsg, title, fieldNames, fieldValues)
    return fieldValues


# Download the page content and find each article title with its download path
def get_urls(seed_url, pre_url, last_url):
    """Collect story titles and their download URLs from an index page.

    Args:
        seed_url: URL of the index page that lists the stories.
        pre_url: Text placed before each entry's ``data-nsrc`` value.
        last_url: Text placed after each entry's ``data-nsrc`` value.

    Returns:
        A dict mapping each entry's title (the ``.string`` of its ``<p>``)
        to ``pre_url + nsrc + last_url``.
    """
    # Use the response as a context manager so the connection is closed
    # as soon as the body has been read.
    with urllib.request.urlopen(seed_url) as response:
        html = response.read().decode('utf-8')
    bs = BeautifulSoup(html, "html.parser")

    # Maps story title -> story URL.
    storyList = {}
    for each in bs.find_all("div", {"class": "c-line-bottom"}):
        # The link target is stored in the anchor's data-nsrc attribute.
        nsrc = each.a["data-nsrc"]
        # The story title is the text of the entry's <p> element.
        title = each.p.string
        # Assemble the URL without overwriting the seed_url parameter.
        storyList[title] = pre_url + nsrc + last_url
    return storyList


# Fetch each story and download it
def getStory(savepath=r"E:\\stories\\"):
    """Download every story listed on the index page into text files.

    The index URL and the URL prefix/suffix are read from ``urls.url1``,
    ``urls.url2`` and ``urls.url3``. Each story page is fetched, its
    ``<br>`` tags are removed, and the string form of its ``<p>`` elements
    is written to ``savepath + title + ".txt"``.

    Args:
        savepath: Prefix prepended to every output file name. Defaults to
            the location that was previously hard-coded.
    """
    storyList = get_urls(urls.url1, urls.url2, urls.url3)
    for name, story_url in storyList.items():
        # Fetch the story page; the context manager closes the connection.
        with urllib.request.urlopen(story_url) as response:
            html = response.read().decode('utf-8')
        bs = BeautifulSoup(html, "html.parser")
        # Strip <br> tags from the soup itself before collecting <p> tags:
        # the ResultSet returned by find_all() is not callable, and
        # replacing the styled <br .../> markup as a string did not work.
        for br in bs('br'):
            br.extract()
        content = bs.find_all('p')
        # Write UTF-8 explicitly so the result does not depend on the
        # platform's default codec, which may be unable to encode the text.
        with open(savepath + name + ".txt", 'w', encoding='utf-8') as f:
            f.write(str(content))
# get_url()  -- disabled call left over from an earlier version
# Start the download. This runs at module level, so importing the file
# triggers it as well as executing it directly.
getStory()