Day03 基础篇
文字类和图片类爬虫
1.煎蛋网爬虫
import requests
from lxml import etree

# Target: jandan.net front page.
target = 'http://jandan.net/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
}

# Without a browser User-Agent the server answers 403 (access denied).
resp = requests.get(target, headers=headers)
if resp.status_code == 200:
    # Parse the page and pull every post title: <h2><a> inside the post list.
    dom = etree.HTML(resp.text)
    post_titles = dom.xpath('//div[@class="post f list-post"]//h2/a/text()')
    for post_title in post_titles:
        print(post_title, end="\n")
2.网易新闻头部爬虫
import requests
from lxml import etree

# NetEase news front page.
url = 'https://news.163.com/'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}
# BUG FIX: the headers dict was defined but never passed to requests.get(),
# so the request went out with the default python-requests User-Agent
# (which some servers reject with 403 — see the note in section 1).
resp = requests.get(url, headers=headers)
if resp.status_code == 200:
    html = resp.text
    dom = etree.HTML(html)
    # Top headlines: <a> text inside the top-news <ul>.
    xpath_pattern = '//ul[@class="top_news_ul"]//li/a/text()'
    titles = dom.xpath(xpath_pattern)
    for t in titles:
        print(t, end="\n")
3.网易新闻热点排行爬虫
import requests
from lxml import etree

url = 'https://news.163.com/'
# Consistency fix: the sibling crawlers all send a browser User-Agent, and
# section 1 documents that a bare request can be rejected with 403 —
# this script sent no headers at all.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
}
resp = requests.get(url, headers=headers)
if resp.status_code == 200:
    html = resp.text
    dom = etree.HTML(html)
    # Hot-rank list: <a> text inside the hot-rank module.
    xpath_pattern = '//div[@class="mt35 mod_hot_rank clearfix"]//li/a/text()'
    titles = dom.xpath(xpath_pattern)
    for t in titles:
        print(t, end="\n")
4.os库
os库,全称为operating system,属于python内置库,能够通过python代码控制操作系统的一些功能。它的作用相当于Windows资源管理器,能够实现文件的创建、重命名和删除等操作
import os

# List every entry (files and folders) in the current working directory.
print(os.listdir())
# Show the current working directory.
print(os.getcwd())
# Change the working directory (os.chdir returns None, so this prints None).
print(os.chdir('../02爬虫入门'))
print(os.listdir())
# Check whether a file or directory exists.
print(os.path.exists('./aaa'))
# Create a directory ("make directory"); os.mkdir raises FileExistsError if it
# already exists, so guard with an existence check first.
if not os.path.exists('./aa'):
    os.mkdir('./aa')
# Directory containing this script; __file__ is the script's own path.
print(os.path.dirname(__file__))
# Build the full file path portably with os.path.join.
file_path = os.path.join(os.path.dirname(__file__), 'aa', 'test.jpg')
print(file_path)
# BUG FIX: f.write() was called with no argument, which raises TypeError —
# write() requires a bytes-like argument in 'wb' mode. Write empty bytes to
# create an empty placeholder file.
with open(file_path, 'wb') as f:
    f.write(b'')
# os also provides renaming, deletion, etc.
5.天堂图片网图片爬虫
import os
import requests
from lxml import etree

album_url = 'https://www.ivsky.com/tupian/lugui_v62472/'  # album page listing the thumbnails
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
}
# BUG FIX: the headers dict was defined but not sent with the album-page
# request, so it went out with the default python-requests User-Agent.
resp = requests.get(album_url, headers=headers)
status_code = resp.status_code
print(status_code)
album_html = resp.text
# Collect the URL of every thumbnail image in the album.
album_dom = etree.HTML(album_html)
title_pattern = '//h1/text()'
img_pattern = '//ul[@class="pli"]/li/div/a/img/@src'
album_title = album_dom.xpath(title_pattern)[0]
# The album title may carry trailing spaces; mkdir() drops them, so the later
# os.path.join would point at a different folder — strip explicitly.
album_title = album_title.strip()
img_src_list = album_dom.xpath(img_pattern)
print(album_title)
print(len(img_src_list), img_src_list)
# Create a folder named after the album.
if not os.path.exists('./' + album_title):
    os.mkdir('./' + album_title)
# Request each image and save it locally, numbered from 1.
for i, img_src in enumerate(img_src_list):
    # The src in the page is protocol-relative; prepend the scheme.
    img_src = 'https:' + img_src
    print(img_src)
    resp = requests.get(img_src, headers=headers)
    print(resp.status_code)
    img_content_bytes = resp.content
    # Write the raw image bytes into the album folder next to this script.
    img_path = os.path.join(os.path.dirname(__file__), album_title, f'{i+1}.jpg')
    print(img_path)
    with open(img_path, mode='wb') as f:
        f.write(img_content_bytes)
    print(f'第{i+1}张图片保存完毕,保存到了{img_path}')
6.天堂图片网封装
进行方法的封装,能方便扩展功能,避免代码增多复杂后难以维护。
import os
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}
home_url = 'https://www.ivsky.com/'
catalog_url = 'https://www.ivsky.com/bizhi/dongwu/'
album_url = "https://www.ivsky.com/tupian/lugui_v62472/"
# BUG FIX: the headers defined above were not sent with the request.
resp = requests.get(album_url, headers=headers)
if resp.status_code == 200:
    album_html = resp.text
    # print(album_html)
# 进行方法封装,方便扩展功能,以免代码和缩进增多后难以维护。
import os
import requests
from lxml import etree
# home_url = 'https://www.ivsky.com/'
def get_single_img(img_src, album_title, i):
    """Download one thumbnail and save it as <script_dir>/<album_title>/<i+1>.jpg."""
    # The src attribute on the page is protocol-relative; prepend the scheme.
    full_src = 'https:' + img_src
    print(full_src)
    img_resp = requests.get(full_src, headers=headers)
    print(img_resp.status_code)
    raw_bytes = img_resp.content
    # Persist the raw image bytes inside the album folder next to this script.
    save_path = os.path.join(os.path.dirname(__file__), album_title, f'{i + 1}.jpg')
    print(save_path)
    with open(save_path, mode='wb') as fh:
        fh.write(raw_bytes)
    print(f'第{i + 1}张图片保存完毕,保存到了{save_path}')
def get_single_album(album_url):
    """Fetch one album page, create a folder named after the album, and
    download every thumbnail it lists via get_single_img()."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
    }
    # BUG FIX: the local headers dict was defined but never passed to the
    # request, so the album page was fetched with the default User-Agent.
    resp = requests.get(album_url, headers=headers)
    status_code = resp.status_code
    print(status_code)
    album_html = resp.text
    print(album_html)
    # Extract the album title and every thumbnail URL.
    album_dom = etree.HTML(album_html)
    title_pattern = '//h1/text()'
    img_pattern = '//ul[@class="pli"]/li/div/a/img/@src'
    album_title = album_dom.xpath(title_pattern)[0]
    # Trailing spaces in the title would make mkdir() and os.path.join
    # disagree about the folder name — strip them.
    album_title = album_title.strip()
    img_src_list = album_dom.xpath(img_pattern)
    print(album_title)
    print(len(img_src_list), img_src_list)
    # One folder per album, named after the album title.
    if not os.path.exists('./' + album_title):
        os.mkdir('./' + album_title)
    # Download every thumbnail in order.
    for i, img_src in enumerate(img_src_list):
        get_single_img(img_src, album_title, i)
# Drive the crawl: fetch one catalog page and download every album it links.
catalog_url = 'https://www.ivsky.com/tupian/dongwutupian/index_2.html'
resp = requests.get(catalog_url, headers=headers)
html = resp.text
dom = etree.HTML(html)
# BUG FIX: the original pattern '/@href' selects an attribute of the document
# root and matches nothing. Album links on an ivsky catalog page sit inside
# <ul class="ali"> thumbnail cells — NOTE(review): confirm the class name
# against the live page markup.
album_href_list = dom.xpath('//ul[@class="ali"]/li/div/a/@href')
for album_href in album_href_list:
    # The hrefs are site-relative, so prepend the domain before requesting.
    get_single_album('https://www.ivsky.com' + album_href)
# Unfactored version of the album download (pre-refactor), kept for reference.
album_dom = etree.HTML(album_html)
img_pattern = '//ul[@class="pli"]/li/div/a/img/@src'
title_pattern = '//h1/text()'
img_src = album_dom.xpath(img_pattern)
album_title = album_dom.xpath(title_pattern)[0]
# Strip trailing spaces so mkdir() and os.path.join agree on the folder name.
album_title = album_title.strip()
# Create the album folder if it does not exist yet.
if not os.path.exists('./' + album_title):
    os.mkdir('./' + album_title)
# Download every image, numbered from 1.
for s, i in enumerate(img_src):
    # The request needs the full URL; the page's src is protocol-relative.
    i = 'https:' + i
    print(i)
    img_resp = requests.get(i, headers=headers)
    print(img_resp.status_code)
    # BUG FIX: this read `resp.content` — the *album page* response — so every
    # saved "image" would have been the same HTML bytes. Use the image response.
    img_content_byte = img_resp.content
    # Write the image bytes into the album folder next to this script.
    img_path = os.path.join(os.path.dirname(__file__), album_title, f'{s+1}.jpg')
    print(img_path)
    with open(img_path, mode="wb") as f:
        f.write(img_content_byte)
    print(f"第{s+1}张图片保存成功")