Purpose
Scrape book content from HTML pages on the web and turn it into a Markdown document. The Markdown conventions in this post follow Typora (a Markdown editor), so the resulting file can be opened directly in Typora.
System environment
Ubuntu 16.04, Python 3.9
Approach
Crawler tool comparison
- BeautifulSoup4 + requests
  Pros: simple, minimal API, good for getting started.
  Cons: fetching is relatively slow because it is not asynchronous; if throughput matters, you can write the async part yourself (a sketch follows the full script below).
- Scrapy
  Pros: fast, and comes as a complete framework.
  Cons: more complex, with a higher barrier to entry; it takes some time to pick up.
- html2text
  I gave it a try: very simple, almost foolproof, but also quite limited, because you cannot control how the Markdown document gets written. It felt less satisfying to use than BeautifulSoup4 (a minimal example follows this list).
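To make the trade-off concrete, here is a minimal html2text sketch (requires `pip install html2text`, which is not in the dependency list below), using the same sample chapter URL as the script later in this post. It converts the whole page in one call, which is exactly its limitation:

```python
# Sketch only: html2text converts the entire page in one call, with no
# control over which parts end up in the Markdown document.
import html2text
import requests

html = requests.get('https://m.dushu.com/showbook/113574/1412792.html').text
md = html2text.html2text(html)  # navigation, footers etc. are converted too
print(md[:500])
```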
Dependencies:
```
pip install BeautifulSoup4
pip install fake-useragent
pip install requests
```
```python
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import uuid
from fake_useragent import UserAgent
import os

ua = UserAgent()
# headers = {"User-Agent": ua.random}  # a random UA works as well
headers = {
    "User-Agent": ua.chrome,
}
# Browser-like headers for the image CDN (img.dushu.com), copied from a
# browser session.
img_headers = {
    # "User-Agent": ua.firefox,
    'Host': 'img.dushu.com',
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/12.0 Mobile/15A372 Safari/604.1',
    'Accept': 'image/webp, */*',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Referer': 'https://m.dushu.com/showbook/113574/1412792.html',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'Upgrade-Insecure-Requests': '1'
}
# Cookies copied from a browser session; the script does not send them,
# but they can be passed via requests.get(..., cookies=cookies) if needed.
cookies = {
    '__gads': "ID=f5d687364432864d-2285bab46db80007:T=1606648841:RT=1606648841:S=ALNI_MYS8QpBvJ7f8bM2PWydyfr1bEn7Ig",
    'Hm_lpvt_8008bbd51b8bc504162e1a61c3741a9d': "1606658490",
    'Hm_lvt_8008bbd51b8bc504162e1a61c3741a9d': "1606648421"
}
class m_doshu_com(object):
    """
    Scrapes a book from https://m.dushu.com chapter by chapter and writes
    it out as a single Markdown file.
    """

    def __init__(self, url="https://m.dushu.com/showbook/113574/1412792.html", page_begin=1412792, page_end=1412816, filename='test.md'):
        self.url = url
        self.page_begin = page_begin
        self.page_end = page_end
        self.filename = filename
        # Explicit UTF-8 so the Chinese text is written out correctly.
        self.fp_file = open(os.path.join("./", self.filename), 'w', encoding='utf-8')
    def m_doshu_com_write_1_page(self, url="https://m.dushu.com/showbook/113574/1412792.html"):
        """
        Fetch one page with requests, parse it with BeautifulSoup, and
        append the title, paragraphs, and images to the Markdown file.
        """
        response = requests.get(url, headers=headers)
        response.encoding = response.apparent_encoding
        # print(response.status_code)
        soup = BeautifulSoup(response.text, 'html.parser')

        # The chapter title sits in <div class="article-detail"><h1>.
        title = soup.find_all('div', class_="article-detail")[0].find_all('h1')[0].get_text()
        print(title)
        # One level-2 heading per chapter keeps the Typora outline tidy.
        self.fp_file.write('## {}\n\n'.format(title))
        print("--------")

        # The body text lives in <div class="text">, one <p> per paragraph.
        text = soup.find_all('div', class_="text")[0]
        for p in text.find_all('p'):
            print(p.get_text())
            self.fp_file.write('{}\n\n'.format(p.get_text()))
            if p.find('img'):
                img_src = p.find('img')['src']
                img_response = requests.get(url=img_src, headers=img_headers)
                if img_response.status_code == 200:
                    # Save the image under assets/ with a random name and
                    # reference it with Markdown image syntax.
                    file_name = str(uuid.uuid4()) + '.jpg'
                    if not os.path.exists("assets"):
                        os.mkdir('assets')
                    with open(os.path.join('assets/', file_name), 'wb') as fp:
                        fp.write(img_response.content)
                    print('![](assets/{})'.format(file_name))
                    self.fp_file.write('![](assets/{})\n\n'.format(file_name))
                else:
                    print('!!! img({}) not ok'.format(img_src))
        print('--')  # end-of-page marker
        del soup
        return response.status_code
    def generate(self):
        # Chapter ids are consecutive integers, so just walk the range.
        # Note that the book id (113574) is hard-coded in the URL template.
        for index in range(self.page_begin, self.page_end + 1):
            tmp_url = 'https://m.dushu.com/showbook/113574/{}.html'.format(index)
            self.m_doshu_com_write_1_page(url=tmp_url)
            print(tmp_url)
        self.fp_file.close()
if __name__ == '__main__':
    book1 = m_doshu_com(url="https://m.dushu.com/showbook/113574/1412792.html", page_begin=1412787, page_end=1412816, filename='test.md')
    book1.generate()
```
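The script above fetches pages strictly one after another. For the do-it-yourself async route mentioned in the comparison, here is a minimal sketch assuming `pip install aiohttp` (an extra dependency, not used by the script) and reusing the `headers` dict defined above:

```python
# Sketch only: fetch all chapter pages concurrently with aiohttp.
# asyncio.gather returns results in page order, so the BeautifulSoup
# parsing and Markdown writing above can stay unchanged.
import asyncio
import aiohttp

async def fetch(session, url):
    async with session.get(url, headers=headers) as resp:
        return await resp.text()

async def fetch_all(page_begin, page_end):
    urls = ['https://m.dushu.com/showbook/113574/{}.html'.format(i)
            for i in range(page_begin, page_end + 1)]
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch(session, u) for u in urls))

# pages = asyncio.run(fetch_all(1412787, 1412816))
# Each entry of `pages` is raw HTML, ready to feed into BeautifulSoup.
```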