import requests
import os
import re

# Sample chapter page: https://www.17k.com/chapter/263899/5856183.html


# Extract the URL of every chapter from the table-of-contents page
def get_toc(html):
    print('get url')
    to_url_list = []
    # The chapter list sits between the "正文" heading and the bottom ad banner
    toc_block = re.findall('class="tit">正文(.*?)BAIDU_banner_bottom', html, re.S)[0]
    toc_url = re.findall('href="(.*?)"', toc_block, re.S)
    start_url = 'https://www.17k.com'
    for url in toc_url[:-1]:  # the last href in the block is not a chapter link
        to_url_list.append(start_url + url)
    return to_url_list


# Extract the chapter title and the novel text from a chapter page
def get_article(html):
    print('get chapter and text')
    # strip() keeps stray whitespace out of the file name built in save()
    chapter_name = re.search('<h1>(.*?)</h1>', html, re.S).group(1).strip()
    text_block = re.search('class="p">(.*?)<p class="copy ">', html, re.S).group(1)
    # print(chapter_name)
    # print(text_block.replace('<p>', '').replace('</p>', ''))
    text_content = text_block.replace('<p>', '').replace('</p>', '')
    save(chapter_name, re.sub('[ \t]', '', text_content))  # sub strips the long runs of spaces in the text


# Save the chapter text to disk, one .txt file per chapter
def save(chapter, article):
    file_path = r'C:\Users\coremail\Desktop\爬虫\仙剑四'
    file_name = os.path.join(file_path, chapter + '.txt')
    os.makedirs(file_path, exist_ok=True)
    with open(file_name, 'w', encoding='utf-8') as f:
        f.write(article)


# Table-of-contents URL of 仙剑四 (Chinese Paladin 4)
url = 'https://www.17k.com/list/263899.html'
html_content = requests.get(url).content.decode('UTF-8')
url_list = get_toc(html_content)
for novel_url in url_list:
    print(novel_url)
    try:
        get_article(requests.get(novel_url).content.decode('UTF-8'))
    except Exception as e:
        print(e)
# get_article(requests.get('https://www.17k.com/chapter/263899/5868069.html').content.decode('utf-8'))
print('over')
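

# --- Optional hardening (a sketch, not part of the original script) ---
# Chapter titles scraped from <h1> can contain characters that Windows forbids
# in file names ('?', '*', ':', '"', ...), which would make open() fail inside
# save(). One way to guard against that is shown below; the helper name
# sanitize_filename and the exact character set are assumptions for illustration.
def sanitize_filename(name):
    # Replace every character Windows rejects in file names with an underscore
    return re.sub(r'[\\/:*?"<>|\r\n\t]', '_', name).strip()

# Usage sketch: inside get_article, pass the cleaned title on to save(), e.g.
#   save(sanitize_filename(chapter_name), ...)
#
# The crawl loop above also fires requests back-to-back with the default
# requests User-Agent. If the site starts refusing connections or rate-limits
# the crawl, sending a browser-like UA and sleeping between chapters is a
# common mitigation (the header value below is an assumed generic one, not
# something 17k.com documents):
#
#   import time
#   headers = {'User-Agent': 'Mozilla/5.0'}
#   requests.get(novel_url, headers=headers)
#   time.sleep(1)  # one request per second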