爬虫下载校花网美女信息-lxml

# coding=utf-8
# !/usr/bin/env python
'''
author: dangxusheng
desc : 下载校花网上的个人信息:名字-学校-图片地址-点赞数
date : 2018-08-29
''' # 导入模块
import requests
from lxml import etree
import json #准备全局变量
# Base URL of the target site; also reused as the Referer below.
home_url = "http://www.xiaohuar.com/"

# Browser-like request headers so the site serves normal pages.
headers = {
    "Referer": home_url,
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
}
def one_page_info(page_index=0):
    """Scrape one list page of xiaohuar.com.

    Returns a list of dicts with keys 'name', 'school', 'img_url'
    and 'dianzan' (like count), one dict per person on the page.

    :param page_index: zero-based index of the list page to fetch.
    """
    url = home_url + "list-1-" + str(page_index) + ".html"
    r = requests.get(url, headers=headers)
    # The site serves GBK-encoded pages, not UTF-8.
    html = etree.HTML(r.content.decode('gbk'))
    info_list = []
    for div in html.xpath('//div[@class="item masonry_brick"]'):
        # xpath() returns a (possibly empty) list; indexing [0] on an
        # empty result would raise IndexError, so check emptiness and
        # fall back to the placeholder strings instead.
        name_nodes = div.xpath('.//span[@class="price"]/text()')
        name = name_nodes[0] if name_nodes else '暂无名字'
        school_nodes = div.xpath('.//a[@class="img_album_btn"]/text()')
        school = school_nodes[0] if school_nodes else '暂无学校'
        img_nodes = div.xpath('./div[1]/div[1]/a[1]/img[1]/@src')
        img_url = img_nodes[0] if img_nodes else '暂无图片'
        # Some srcs are site-relative, e.g. /d/file/20180907/xxx.jpg —
        # prefix the host (home_url minus its trailing slash).
        if str(img_url).find('http') == -1:
            img_url = home_url[0:-1] + img_url
        # Fix: extract the text itself; the original appended the raw
        # xpath result list, while the '' fallback shows a string was intended.
        dianz_nodes = div.xpath('.//em[1]/text()')
        dianz = dianz_nodes[0] if dianz_nodes else ''
        info_list.append({'name': name, 'school': school,
                          'img_url': img_url, 'dianzan': dianz})
    return info_list
# print(info_list) # 遍历列表并按照URL下载保存到文件
def donwload_jpg_2_file(info_list):
    """Download every entry's image to ./xiaohua/<name>.jpg.

    :param info_list: list of dicts as produced by one_page_info(),
                      each with at least 'img_url' and 'name' keys.
    """
    import os
    # Fix: open() fails if the target directory does not exist yet.
    os.makedirs('./xiaohua', exist_ok=True)
    for info in info_list:
        url = info['img_url']
        # Fix: with stream=True the connection stays open until the
        # response is closed — use it as a context manager.
        with requests.get(url, headers=headers, stream=True) as r:
            with open('./xiaohua/%s.jpg' % info['name'], 'wb') as file:
                # Write the body in 1 KiB chunks instead of all at once.
                for chunk in r.iter_content(1024):
                    file.write(chunk)
        print('%s 下载成功' % info['name'])
if __name__ == '__main__':
    # Crawl the first 50 list pages and download every image found.
    for i in range(50):
        ls = one_page_info(i)
        donwload_jpg_2_file(ls)
上一篇:XVIII Open Cup named after E.V. Pankratiev. Eastern Grand Prix


下一篇:(NO.00001)iOS游戏SpeedBoy Lite成形记(二十七)