#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/7/5 9:55 下午
# @Author : 姚丹
# @Site :
# @File : spider.py
# @Software: PyCharm
import bs4 # 网页解析,获取数据
import re # 正则表达式,进行文字匹配
import urllib # 制定url,获取网页数据
import xlwt # 进行excel操作
import sqlite3 # 进行sqllite数据库操作
# Ignore HTTPS certificate verification
import ssl
import urllib.request
# Swap ssl's private default-HTTPS-context factory for the unverified one so
# that HTTPS certificate checks are skipped.
# NOTE(review): this is a module-level monkeypatch of a private ssl attribute;
# it presumably affects every HTTPS request made through urllib in this
# process and removes protection against forged certificates -- confirm this
# is acceptable outside local experimentation.
ssl._create_default_https_context = ssl._create_unverified_context
def getData(baseUrl):
    """Download ten consecutive listing pages and print each page's HTML.

    The page offset (0, 25, 50, ..., 225) is appended to ``baseUrl`` to form
    each request URL, and the page is fetched with ``askUrl``.

    Args:
        baseUrl: URL prefix to which the numeric start offset is appended.

    Returns:
        The list of collected records. Parsing is not implemented yet, so the
        list is always empty.
    """
    records = []
    for offset in range(0, 250, 25):
        page_html = askUrl(baseUrl + str(offset))
        print(page_html)
        # Parse the data (not implemented yet).
    return records
def askUrl(url):
    """Fetch ``url`` and return the response body decoded as UTF-8.

    The request is sent with a desktop-browser ``User-Agent`` header.

    Args:
        url: Address to request.

    Returns:
        The decoded response text, or an empty string when the request fails
        with ``urllib.error.URLError`` (which includes ``HTTPError``). In that
        case the error's ``code`` (if any) and ``reason`` are printed.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36"
    }
    req = urllib.request.Request(url, headers=headers)
    try:
        # The context manager closes the response even if read/decode fails.
        with urllib.request.urlopen(req) as response:
            return response.read().decode("utf-8")
    except urllib.error.URLError as e:
        # URLError is the parent of HTTPError, so this also covers failures
        # that never produced an HTTP status (DNS errors, refused
        # connections, missing local files). Only HTTPError carries ``code``.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return ""
# askUrl("https://movie.douban.com/top250?start=10")