# CY3761 | 2021-10-27 20:03
# 爬取下厨房的本周最受欢迎-列表数据
import base64
import os.path
from urllib import request # 下载图片
import openpyxl
from openpyxl.worksheet.hyperlink import Hyperlink # 插入链接
from openpyxl.drawing.image import Image # 插入图片
from pyquery import PyQuery as pq
# Listing page to scrape. The site needs request headers, otherwise it answers 404.
url = 'https://www.xiachufang.com/explore'

# Browser-like headers sent along with the page request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/95.0.4638.54 Safari/537.36'
    ),
}

# Everything before the last path separator of `url`; the relative hrefs found
# on the page are appended to this prefix to build absolute links.
sep = '/'
urlSame = url.rsplit(sep, 1)[0]
# Fetch the listing page with the configured headers and select one <div>
# per entry of the recipe list.
pqa = pq(url=url, encoding='utf-8', headers=headers)
pqb = pqa('.normal-recipe-list li > div')

# Collected recipes. Each entry is a tuple of
# (absolute image path, recipe URL, name, ingredients text, author).
items = []

# Directory (relative to the working directory) that receives the images.
path = '下厨房的本周最受欢迎'
os.makedirs(path, exist_ok=True)

for _index, _node in enumerate(pqb):
    _entry = pq(_node)
    _src = _entry('img').attr('data-src')
    _link = _entry('a').attr('href')
    if not _src or not _link:
        # Without an image URL or a link there is nothing usable to record;
        # skip the entry instead of failing on a None value.
        continue

    _href = urlSame + _link
    _name = _entry('p.name a').text()
    _meta = _entry('p.ing').text()
    _author = _entry('p.author a').text()

    # Images are stored as 00.jpg, 01.jpg, ... following the entry's position
    # in the list.
    _imgPath = os.path.join(os.getcwd(), path, str(_index).zfill(2) + '.jpg')

    # Download the image; the context manager closes the HTTP response even
    # when reading fails.
    with request.urlopen(request.Request(_src)) as _imgResp:
        _content = _imgResp.read()
    if not _content:
        # An empty body means no image file would exist for this entry.
        continue

    with open(_imgPath, 'wb') as f:
        f.write(_content)

    # Record the entry only once its image is on disk, because the spreadsheet
    # stage opens every recorded image path.
    items.append((_imgPath, _href, _name, _meta, _author))
# Create the workbook and name its active sheet after the output folder.
ow = openpyxl.Workbook()
sheet = ow.active
sheet.title = path

# Each recipe occupies six consecutive rows: five "label, value" rows followed
# by an empty spacer row. Column B of the first row temporarily holds the
# image path.
for _imgPath, _href, _name, _meta, _author in items:
    _rows = (
        ['图片', _imgPath],  # row 1: image
        ['菜名', _name],     # row 2: recipe name
        ['作者', _author],   # row 3: author
        ['材料', _meta],     # row 4: ingredients
        ['链接', _href],     # row 5: link
        [],                  # row 6: spacer
    )
    for _row in _rows:
        sheet.append(_row)
# Post-process column B of the six-row recipe groups: embed the picture on the
# first row of each group, turn the URL on the fifth row into a clickable
# formula, then save the workbook.
_col = 'B'
_scale = 0.50        # image scaling factor
# Image sizes are in pixels, but Excel column widths are in character units
# and row heights are in points, so the pixel sizes are converted first
# (approximate values for the default font at 96 DPI).
_pxPerChar = 7       # roughly 7 pixels per column-width unit
_ptPerPx = 0.75      # 72 points per inch / 96 pixels per inch

_colWidth = 0        # widest image seen so far, in column-width units
for _cell in sheet[_col]:
    _kind = _cell.row % 6
    if _kind == 1:  # image row
        _imgFile = _cell.value
        if not _imgFile or not os.path.isfile(_imgFile):
            # Empty sheet (column B still yields one blank cell) or missing
            # file: leave the cell untouched rather than crash.
            continue
        img = Image(_imgFile)
        img.width = img.width * _scale
        img.height = img.height * _scale
        # Size the cell to the picture; the row index must be an int.
        _colWidth = max(_colWidth, img.width / _pxPerChar)
        sheet.row_dimensions[_cell.row].height = img.height * _ptPerPx
        sheet.add_image(img, _cell.coordinate)
        _cell.value = ''  # the path text is no longer needed once embedded
    elif _kind == 5 and _cell.value:  # link row
        # Double any quote so the URL cannot terminate the formula string.
        _target = str(_cell.value).replace('"', '""')
        _cell.value = '=HYPERLINK("' + _target + '","点击访问")'

# Apply the column width once, using the widest picture instead of whichever
# image happened to be processed last.
if _colWidth:
    sheet.column_dimensions[_col].width = _colWidth

ow.save('下厨房的本周最受欢迎.xlsx')
ow.close()