Python实训Day03笔记:京东评论数据分析与可视化项目

Day03 基础篇

文字类和图片类爬虫

1.煎蛋网爬虫

import requests
from lxml import etree

url = 'http://jandan.net/'

headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36',
}

# Without a User-Agent header the site answers 403 (access denied).
resp = requests.get(url, headers=headers)
if resp.status_code == 200:
    html = resp.text
    dom = etree.HTML(html)
    # Post titles: anchor text inside each post container's <h2>.
    xpath_pattern = '//div[@class="post f list-post"]//h2/a/text()'
    titles = dom.xpath(xpath_pattern)
    # BUG FIX: the print loop used to sit OUTSIDE the `if`, so a failed
    # request raised NameError on `title`. Print only on success.
    for t in titles:
        print(t)

2.网易新闻头部爬虫

import requests
from lxml import etree

url = 'https://news.163.com/'

headers = {
    'user-agent' : 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}

# BUG FIX: `headers` was defined but never passed to the request, so it
# had no effect; send it so the request carries a browser User-Agent.
resp = requests.get(url, headers=headers)
if resp.status_code == 200:
    html = resp.text

    dom = etree.HTML(html)
    # Top headlines: anchor text inside the top_news_ul list items.
    xpath_pattern = '//ul[@class="top_news_ul"]//li/a/text()'
    titles = dom.xpath(xpath_pattern)
    for t in titles:
        print(t)

3.网易新闻热点排行爬虫

import requests
from lxml import etree

url = 'https://news.163.com/'

resp = requests.get(url)
if resp.status_code == 200:
    # Parse the hot-ranking module and print each headline on its own line.
    dom = etree.HTML(resp.text)
    hot_rank_xpath = '//div[@class="mt35 mod_hot_rank clearfix"]//li/a/text()'
    for headline in dom.xpath(hot_rank_xpath):
        print(headline)

4.os库

os库,全称为operating system,属于python内置库,能够通过python代码控制操作系统的一些功能。它的作用相当于Windows资源管理器,能够实现文件的创建、重命名和删除等操作

import os

# List every entry in the current working directory (returns a list).
print(os.listdir())
# Show the current working directory.
print(os.getcwd())
# Change the working directory (chdir returns None, hence this prints None).
print(os.chdir('../02爬虫入门'))
print(os.listdir())
# Check whether a folder or file exists.
print(os.path.exists('./aaa'))
# Create a folder ("make directory"); mkdir raises if it already exists,
# so guard with an existence check first.
if not os.path.exists('./aa'):
    os.mkdir('./aa')
# Folder containing this script; the special variable __file__ is the
# script's own path.
print(os.path.dirname(__file__))
# Build a complete file path with os.path.join.
file_path = os.path.join(os.path.dirname(__file__), 'aa', 'test.jpg')
print(file_path)
# BUG FIX: file_path is resolved relative to the SCRIPT, while the 'aa'
# folder above was created relative to the (changed) working directory —
# make sure the folder exists where we are about to write.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
with open(file_path, 'wb') as f:
    # BUG FIX: write() requires an argument — the original `f.write()`
    # raised TypeError. Write empty bytes to just create the file.
    f.write(b'')

# Renaming, deletion, etc. are also available (os.rename, os.remove, ...).

# 重命名、删除等

5.天堂图片网图片爬虫

import os
import requests
from lxml import etree

album_url = 'https://www.ivsky.com/tupian/lugui_v62472/'    # album page holding the thumbnails

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
}

# Request the album page.
# BUG FIX: `headers` was defined but never sent with this request; pass it
# so the album page is fetched with a browser User-Agent too.
resp = requests.get(album_url, headers=headers)
status_code = resp.status_code
print(status_code)
album_html = resp.text
print(album_html)

# Extract the album title and every thumbnail URL from the page.
album_dom = etree.HTML(album_html)
title_pattern = '//h1/text()'
img_pattern = '//ul[@class="pli"]/li/div/a/img/@src'
album_title = album_dom.xpath(title_pattern)[0]
# The title may carry trailing spaces; mkdir() strips them, so the later
# path join would otherwise point at a different (nonexistent) folder.
album_title = album_title.strip()
img_src_list = album_dom.xpath(img_pattern)
print(album_title)
print(len(img_src_list), img_src_list)

# Create a folder named after the album.
if not os.path.exists('./' + album_title):
    os.mkdir('./' + album_title)

# Request each thumbnail and save it.
for i, img_src in enumerate(img_src_list):
    # src values are protocol-relative — prepend the scheme.
    img_src = 'https:' + img_src
    print(img_src)
    resp = requests.get(img_src, headers=headers)
    print(resp.status_code)
    img_content_bytes = resp.content

    # Write the binary payload to <script dir>/<album title>/<n>.jpg.
    img_path = os.path.join(os.path.dirname(__file__), album_title, f'{i+1}.jpg')
    print(img_path)
    with open(img_path, mode='wb') as f:
        f.write(img_content_bytes)
        print(f'第{i+1}张图片保存完毕,保存到了{img_path}')

6.天堂图片网封装

进行方法的封装,能方便扩展功能,避免代码增多复杂后难以维护。

# Refactored ivsky crawler: the download logic is wrapped into functions
# below so it can be reused and extended.
import os
import requests
from lxml import etree

headers = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}


home_url = 'https://www.ivsky.com/'
catalog_url = 'https://www.ivsky.com/bizhi/dongwu/'
album_url = "https://www.ivsky.com/tupian/lugui_v62472/"

resp = requests.get(album_url)
if resp.status_code ==200:
    album_html = resp.text
    #print(album_html)
    # Wrap the logic in functions to make it easier to extend, and to avoid
    # the code becoming hard to maintain as it grows and nesting deepens.
    # NOTE(review): the imports below duplicate the ones at the top of this
    # snippet — redundant but harmless.
    import os
    import requests
    from lxml import etree


    # home_url = 'https://www.ivsky.com/'
    def get_single_img(img_src, album_title, i):
        """Download one thumbnail and save it into the album's folder.

        img_src     -- protocol-relative image URL (starts with //)
        album_title -- name of the folder the file is written into
        i           -- zero-based index used to number the saved file
        """
        # Build the absolute URL and fetch the image bytes.
        full_url = 'https:' + img_src
        print(full_url)
        img_resp = requests.get(full_url, headers=headers)
        print(img_resp.status_code)

        # Persist the payload to <script dir>/<album title>/<i+1>.jpg.
        img_path = os.path.join(os.path.dirname(__file__), album_title, f'{i + 1}.jpg')
        print(img_path)
        with open(img_path, mode='wb') as f:
            f.write(img_resp.content)
            print(f'第{i + 1}张图片保存完毕,保存到了{img_path}')


    def get_single_album(album_url):
        """Fetch an album page, create a folder named after the album, and
        download every thumbnail listed on the page via get_single_img."""
        # NOTE: this local headers dict is not actually sent with the album
        # request (kept as in the original demo).
        ua_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
        }

        # Fetch the album page itself.
        page = requests.get(album_url)
        print(page.status_code)
        page_html = page.text
        print(page_html)

        # Parse out the album title and all thumbnail URLs.
        doc = etree.HTML(page_html)
        # strip(): a trailing space in the title would break the path join,
        # because mkdir() drops trailing spaces from the folder name.
        album_title = doc.xpath('//h1/text()')[0].strip()
        thumb_urls = doc.xpath('//ul[@class="pli"]/li/div/a/img/@src')
        print(album_title)
        print(len(thumb_urls), thumb_urls)

        # One folder per album, named after its title.
        album_dir = './' + album_title
        if not os.path.exists(album_dir):
            os.mkdir(album_dir)

        # Download every thumbnail in order.
        for idx, thumb in enumerate(thumb_urls):
            get_single_img(thumb, album_title, idx)


    catalog_url = 'https://www.ivsky.com/tupian/dongwutupian/index_2.html'
    resp = requests.get(catalog_url)
    html = resp.text
    dom = etree.HTML(html)
    # BUG FIX: the original pattern '/@href' selects the href attribute of
    # the document ROOT and always yields an empty list, so no album was
    # ever crawled. Select the album links on the catalog page instead.
    # NOTE(review): selector inferred from the site's album-page markup
    # above — confirm the class name against the live catalog page.
    album_url_list = dom.xpath('//div[@class="il_img"]/a/@href')
    for album_url in album_url_list:
        # Catalog hrefs are site-relative, so prefix the domain.
        get_single_album('https://www.ivsky.com' + album_url)

    # Re-parse the album page fetched at the top of this branch to recover
    # the thumbnail URLs and the (whitespace-stripped) album title for the
    # top-level download loop below. (Duplicates get_single_album's parsing;
    # kept for the demo.)
    album_dom = etree.HTML(album_html)
    img_src = album_dom.xpath('//ul[@class="pli"]/li/div/a/img/@src')
    album_title = album_dom.xpath('//h1/text()')[0].strip()

# Create the destination folder named after the album.
if not os.path.exists('./' + album_title):
    os.mkdir('./' + album_title)

# Crawl every image.
for s, i in enumerate(img_src):
    # Must use the full absolute URL (src values are protocol-relative).
    i = 'https:' + i
    print(i)
    img_resp = requests.get(i, headers=headers)
    print(img_resp.status_code)
    # BUG FIX: the original read `resp.content` — the album PAGE response —
    # so every saved .jpg was filled with HTML bytes. Read the image
    # response instead.
    img_content_byte = img_resp.content

    # Write the image bytes to <script dir>/<album title>/<s+1>.jpg.
    img_path = os.path.join(os.path.dirname(__file__), album_title, f'{s+1}.jpg')
    print(img_path)
    with open(img_path, mode="wb") as f:
        f.write(img_content_byte)
        print(f"第{s+1}张图片保存成功")

上一篇:爬取周杰伦歌曲信息分别保存为excel文件和存入MySQL数据库以及发送到你的邮箱中


下一篇:Codeforces 1207G Indie Album AC自动机