PY爬虫 | 爬取下厨房的本周最受欢迎

# CY3761 | 2021-10-27 20:03

# 爬取下厨房的本周最受欢迎-列表数据
import base64
import os.path
from urllib import request  # 下载图片

import openpyxl
from openpyxl.worksheet.hyperlink import Hyperlink  # 插入链接
from openpyxl.drawing.image import Image  # 插入图片
from pyquery import PyQuery as pq

# Listing page to scrape. The site answers 404 unless request headers are sent,
# so a desktop-browser User-Agent is supplied.
url = 'https://www.xiachufang.com/explore'
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/95.0.4638.54 Safari/537.36'
    ),
}
sep = '/'
# Everything before the last path segment of `url`; prefixed to the hrefs
# scraped from the page to build full recipe links.
urlSame = url.rpartition(sep)[0]

# print(urlSame)

# Fetch the listing page and select one node per recipe card.
pqa = pq(url=url, encoding='utf-8', headers=headers)
pqb = pqa('.normal-recipe-list li > div')

# One tuple per recipe: (image path, recipe link, name, ingredients, author).
items = []

# Directory the images are saved into; also reused later as the sheet title.
path = '下厨房的本周最受欢迎'

os.makedirs(path, exist_ok=True)

for k, node in enumerate(pqb):
    card = pq(node)
    img_src = card('img').attr('data-src')
    link = card('a').attr('href')

    # A card without an image URL or a link cannot be rendered in the sheet;
    # skip it instead of crashing on Request(None) / str + None.
    if not img_src or not link:
        continue

    recipe_url = urlSame + link
    recipe_name = card('p.name a').text()
    ingredients = card('p.ing').text()
    author = card('p.author a').text()

    # Save the picture as "<two-digit card index>.jpg" inside `path`.
    img_path = os.path.join(os.getcwd(), path, str(k).zfill(2) + '.jpg')

    # Send the same browser headers as the page request, and close the
    # response deterministically once the body has been read.
    with request.urlopen(request.Request(img_src, headers=headers)) as img_resp:
        img_bytes = img_resp.read()

    # An empty body means no file would be written; recording the item anyway
    # would hand a missing (or stale) file to the spreadsheet step.
    if not img_bytes:
        continue

    with open(img_path, 'wb') as f:
        f.write(img_bytes)

    items.append((img_path, recipe_url, recipe_name, ingredients, author))

# print(pqa.html())

# Create the workbook; the single sheet is titled after the image directory.
ow = openpyxl.Workbook()
sheet = ow.active
sheet.title = path

IMAGE_SCALE = 0.50  # shrink factor applied to every embedded picture
ROWS_PER_ITEM = 6  # five labelled rows plus one blank spacer row per recipe
# Spreadsheet column widths are measured in characters and row heights in
# points, while image sizes are in pixels. These are the usual approximate
# conversions for the default font at 96 DPI.
PIXELS_PER_CHAR = 7
POINTS_PER_PIXEL = 0.75

widest_px = 0  # widest scaled image so far; column B must fit all of them

for index, (img_path, href, name, meta, author) in enumerate(items):
    top = index * ROWS_PER_ITEM + 1  # 1-based row holding this recipe's picture

    img = Image(img_path)
    img.width = img.width * IMAGE_SCALE
    img.height = img.height * IMAGE_SCALE

    # Column A holds the label, column B the value. The picture row keeps an
    # empty value because the image floats over the cell rather than in it.
    rows = (
        ('图片', ''),
        ('菜名', name),
        ('作者', author),
        ('材料', meta),
        ('链接', '=HYPERLINK("' + href + '","点击访问")'),
    )
    for offset, (label, value) in enumerate(rows):
        sheet.cell(row=top + offset, column=1, value=label)
        sheet.cell(row=top + offset, column=2, value=value)

    # Row index must be an int here (a string key raises an error).
    sheet.row_dimensions[top].height = img.height * POINTS_PER_PIXEL
    widest_px = max(widest_px, img.width)

    sheet.add_image(img, 'B' + str(top))

# Size column B once, for the widest picture, instead of letting the last
# image processed overwrite the width.
if widest_px:
    sheet.column_dimensions['B'].width = widest_px / PIXELS_PER_CHAR

ow.save('下厨房的本周最受欢迎.xlsx')
ow.close()

PY爬虫 | 爬取下厨房的本周最受欢迎

上一篇:openpyxl 读取excel中某一行数据


下一篇:C++ 函数返回二维数组