Python之爬取豆瓣---ONE

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2021/7/5 9:55 下午
# @Author  : 姚丹
# @Site    : 
# @File    : spider.py
# @Software: PyCharm


import bs4  # parse web pages and extract data
import re  # regular expressions, for text matching
import urllib  # specify the URL and fetch page data
import xlwt  # Excel operations
import sqlite3  # SQLite database operations

# Ignore HTTPS certificates
import ssl
import urllib.request

# NOTE(review): this swaps the ssl module's default HTTPS context factory for
# the unverified one, which applies to the whole process, not just this
# script's requests. Confirm that skipping certificate checks is acceptable.
ssl._create_default_https_context = ssl._create_unverified_context


def getData(baseUrl):
    """Download ten consecutive pages starting from ``baseUrl`` and print each.

    The page URL is ``baseUrl`` followed by the offsets 0, 25, ..., 225.
    Parsing is not implemented yet, so the returned list is always empty.
    """
    records = []
    for offset in range(0, 250, 25):
        page = askUrl(baseUrl + str(offset))
        print(page)
        # TODO: parse the page and append the extracted entries to records.
    return records


def askUrl(url):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    The request is sent with a desktop Chrome ``User-Agent`` header.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded body, or ``""`` when the request fails with a
        ``urllib.error.URLError`` (``HTTPError`` is a subclass of it).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
    }
    req = urllib.request.Request(url, headers=headers)
    try:
        # The with-block closes the response even if read/decode raises.
        with urllib.request.urlopen(req) as response:
            return response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # Catch the base class so connection and lookup failures are reported
        # like HTTP errors instead of aborting the whole crawl.
        if hasattr(e, "code"):  # only HTTPError carries a status code
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)

    return ""

# askUrl("https://movie.douban.com/top250?start=10")

上一篇:Python爬虫之urllib模拟登录及cookie的那点事


下一篇:Python标准库汇总介绍