python 爬虫练习

2022-10-30 14:17:44
使用 BeautifulSoup（bs4）去除特定标签（例如 <br>）。
# url

import easygui as g

import urllib.request

from bs4 import BeautifulSoup

import os

import sys

import re

import config.story2 as urls

# 获取url

def set_url():
    """Ask the user for the crawl settings through an easygui form.

    The form has three fields; those whose label starts with ``*`` are
    required. The dialog is shown again, pre-filled with the previous
    answers, until every required field is non-blank.

    Returns:
        The list of entered values in field order, or ``None`` when
        ``g.multenterbox`` returns ``None`` (treated here as "the user
        dismissed the dialog").
    """
    msg = "请填写一下信息(其中带*号的项为必填项)"
    title = "爬虫练习"
    fieldNames = ["*小说目录地址", "*组装前半段", "后半段"]

    fieldValues = g.multenterbox(msg, title, fieldNames)

    # Identity test against None (PEP 8) instead of `== None`.
    while fieldValues is not None:
        # Labels of required fields that were left blank.
        missing = [
            name
            for name, value in zip(fieldNames, fieldValues)
            if value.strip() == "" and name.strip().startswith("*")
        ]
        if not missing:
            break

        # One notice per missing field, shown as the dialog message.
        errmsg = "".join("【%s】为必填项   " % name for name in missing)
        fieldValues = g.multenterbox(errmsg, title, fieldNames, fieldValues)

    return fieldValues

# 下载网页内容,找到文章标题和对应的下载路径

def get_urls(seed_url, pre_url, last_url):
    """Download the index page and map each story title to its URL.

    Every ``<div class="c-line-bottom">`` on the page is treated as one
    entry: the ``data-nsrc`` attribute of its first ``<a>`` is wrapped
    between ``pre_url`` and ``last_url`` to build the story URL, and the
    ``.string`` of its first ``<p>`` is used as the title.

    Args:
        seed_url: URL of the index page to download (decoded as UTF-8).
        pre_url: Text placed in front of each ``data-nsrc`` value.
        last_url: Text appended after each ``data-nsrc`` value.

    Returns:
        dict mapping title -> assembled story URL. If two entries share a
        title, the later one wins. Entries missing the anchor, the
        ``data-nsrc`` attribute, the paragraph, or a single-string title
        are skipped instead of raising.
    """
    story_urls = {}

    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(seed_url) as response:
        html = response.read().decode('utf-8')

    soup = BeautifulSoup(html, "html.parser")

    for entry in soup.find_all("div", {"class": "c-line-bottom"}):
        link = entry.a
        heading = entry.p
        if link is None or heading is None:
            continue

        nsrc = link.get("data-nsrc")
        # `.string` is None when the <p> does not hold exactly one string.
        title = heading.string
        if nsrc is None or title is None:
            continue

        # Build the URL in a fresh name; the original overwrote `seed_url`.
        story_urls[title] = pre_url + nsrc + last_url

    return story_urls

# 获取每个小说并下载

def getStory(savepath=r"E:\\stories\\"):
    """Download every story listed on the configured index page.

    The index is read with ``get_urls(urls.url1, urls.url2, urls.url3)``.
    For each story the page is fetched and decoded as UTF-8, all ``<br>``
    tags are removed, and the string form of the list of remaining
    ``<p>`` tags is written to ``savepath + <title> + ".txt"``.

    Args:
        savepath: Prefix prepended to each file name. The default is the
            original hard-coded value, kept byte-for-byte (a raw string,
            so the backslashes are doubled in the actual path text).

    Returns:
        None. The result is the set of files written under ``savepath``.
    """
    storyList = get_urls(urls.url1, urls.url2, urls.url3)

    for name, story_url in storyList.items():
        # Fetch the story page and release the connection right away.
        with urllib.request.urlopen(story_url) as response:
            html = response.read().decode('utf-8')

        soup = BeautifulSoup(html, "html.parser")

        # Strip <br> tags from the tree *before* collecting paragraphs.
        # NOTE: this must be done on the soup itself; the ResultSet
        # returned by find_all() is not callable, so it cannot be
        # filtered the same way afterwards.
        for br in soup('br'):
            br.extract()

        paragraphs = soup.find_all('p')

        # Explicit UTF-8 so the output does not depend on the platform's
        # default encoding (which may be unable to encode the text).
        with open(savepath + name + ".txt", 'w', encoding='utf-8') as f:
            f.write(str(paragraphs))

# Earlier experiments, left commented out:
# download(get_url())

# get_url()

# Entry point: the full download starts as soon as this file is executed
# (there is no __main__ guard, so importing the file triggers it as well).
getStory()
码农公寓

相关文章