Scraping HTML Content with BeautifulSoup and Outputting Markdown

Purpose

Scrape a book's content from HTML pages on the web and write it out as a Markdown document. The Markdown syntax in this article targets Typora (a Markdown editor), so the resulting file can be opened directly in Typora.
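For each chapter, the script below emits a level-2 heading, then the paragraph text, then relative links to locally saved images, so the output roughly looks like this (the UUID file name is a placeholder):

    ## Chapter title

    First paragraph of the chapter...

    ![](assets/<uuid>.jpg)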

System Environment

Ubuntu 16.04, Python 3.9

Approach

Comparison of scraping tools

  • BeautifulSoup4 + requests
    Pros: simple, with a straightforward API; good for getting started.
    Cons: scraping is relatively slow because requests is not asynchronous; if throughput matters, you can add async fetching yourself (see the sketch after this list).
  • Scrapy
    Pros: fast, and a complete crawling framework.
    Cons: more complex, with a higher barrier to entry; it takes some time to learn.
  • html2text
    I tried it; although it is simple and almost turnkey (see the one-liner after this list), it is quite limited: you cannot write the Markdown document exactly the way you want, so it feels much less flexible than BeautifulSoup4.
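For comparison, the html2text route really is almost a one-liner, which is both its appeal and its limitation; a minimal sketch (example.com is just a placeholder URL):

import html2text
import requests

html = requests.get("https://example.com").text
print(html2text.html2text(html))  # converts the whole page at once; little control over the result

And if throughput matters, the fetching side can be made asynchronous. Below is a minimal sketch assuming aiohttp is installed (pip install aiohttp); it is an optional improvement, not part of the script later in this post:

import asyncio
import aiohttp

async def fetch(session, url):
    # fetch one page and return its HTML text
    async with session.get(url) as resp:
        return await resp.text()

async def fetch_all(urls):
    # reuse one connection pool and fetch all pages concurrently
    async with aiohttp.ClientSession() as session:
        return await asyncio.gather(*(fetch(session, u) for u in urls))

urls = ['https://m.dushu.com/showbook/113574/{}.html'.format(i) for i in range(1412792, 1412795)]
pages = asyncio.run(fetch_all(urls))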

Install the dependencies first:
pip install BeautifulSoup4
pip install fake-useragent
pip install requests

# -*- coding:utf-8 -*-

# import html2text  # tried during the tool comparison above, but not used in this script
import requests
from bs4 import BeautifulSoup
import uuid
from fake_useragent import UserAgent
import os

ua = UserAgent()
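# fake-useragent supplies real-world browser User-Agent strings;
# ua.chrome returns a random Chrome UA so requests look like a normal browser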
# headers = {"User-Agent": ua.random}
headers = {
    "User-Agent": ua.chrome,
}

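# Separate headers for image downloads: they mimic a mobile Safari request
# referred from the article page (note Host and Referer are set explicitly)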
img_headers = {
    # "User-Agent": ua.firefox,
    'Host': 'img.dushu.com',
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/12.0 Mobile/15A372 Safari/604.1',
    'Accept': 'image/webp, */*',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Referer': 'https://m.dushu.com/showbook/113574/1412792.html',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'Upgrade-Insecure-Requests': '1'
}

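# Cookies captured from a browser session; note the script never actually
# passes these to requests.get(), so they are kept here only for reference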
cookies = {
    '__gads': "ID=f5d687364432864d-2285bab46db80007:T=1606648841:RT=1606648841:S=ALNI_MYS8QpBvJ7f8bM2PWydyfr1bEn7Ig",
    'Hm_lpvt_8008bbd51b8bc504162e1a61c3741a9d': "1606658490",
    'Hm_lvt_8008bbd51b8bc504162e1a61c3741a9d': "1606648421"
}


class m_doshu_com(object):
    """
    Scrapes book chapters from https://m.dushu.com
    """

    def __init__(self, url="https://m.dushu.com/showbook/113574/1412792.html", page_begin=1412792, page_end=1412816, filename='test.md'):
        self.url = url
        self.page_begin = page_begin
        self.page_end = page_end
        self.filename = filename
        # open the output Markdown file; utf-8 so Chinese text is written correctly
        self.fp_file = open(os.path.join("./", self.filename), 'w', encoding='utf-8')

    def m_doshu_com_write_1_page(self, url="https://m.dushu.com/showbook/113574/1412792.html"):
        """
        use BeautifulSoup() to write 1 page
        """
        response = requests.get(url, headers=headers)
        response.encoding = response.apparent_encoding  # adopt the detected encoding so Chinese text decodes correctly
        # print(response.status_code)
        soup = BeautifulSoup(response.text, 'html.parser')
        # the chapter title lives in <div class="article-detail"><h1>
        title = soup.find_all('div', class_="article-detail")[0].find_all('h1')[0].get_text()
        print(title)
        self.fp_file.write('## {}\n\n'.format(title))  # one level-2 heading per chapter
        print("--------")
        # the chapter body lives in <div class="text">; write each <p> as a Markdown paragraph
        text = soup.find_all('div', class_="text")[0]
        for p in text.find_all('p'):
            print(p.get_text())
            self.fp_file.write('{}\n\n'.format(p.get_text()))
            if p.find('img'):
                img_src = p.find('img')['src']
                # print(img_headers)
                img_response = requests.get(url=img_src, headers=img_headers)
                if img_response.status_code == 200:
                    # save the image under assets/ with a collision-free UUID name
                    file_name = str(uuid.uuid4()) + '.jpg'
                    if not os.path.exists("assets"):
                        os.mkdir('assets')
                    with open(os.path.join('assets/', file_name), 'wb') as fp:
                        fp.write(img_response.content)
                    print('![](assets/{})'.format(file_name))
                    # reference the local copy with a relative Markdown image link
                    self.fp_file.write('![](assets/{})\n\n'.format(file_name))
                else:
                    print('!!! img({}) not ok'.format(img_src))
            print('--')
        del soup
        return response.status_code


    def generate(self):
        # walk the page-id range and append every chapter to the output file;
        # the book id 113574 is baked into the URL template
        for index in range(self.page_begin, self.page_end + 1):
            tmp_url = 'https://m.dushu.com/showbook/113574/{}.html'.format(index)
            self.m_doshu_com_write_1_page(url=tmp_url)
            print(tmp_url)

        self.fp_file.close()


if __name__ == '__main__':
    book1 = m_doshu_com(url="https://m.dushu.com/showbook/113574/1412792.html", page_begin=1412787, page_end=1412816, filename='test.md')
    book1.generate()
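After a run, test.md contains one '## <chapter title>' heading per page followed by its paragraphs, and each downloaded image sits under assets/ with a UUID file name, so the file opens directly in Typora with the images resolving locally.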
