# Python web scraper + data analysis + data visualization
import csv
import pandas as pd
import numpy as np
import asyncio
import aiohttp
from pandas import Series, DataFrame
# import matplotlib as mpl
import matplotlib.pyplot as plt
from lxml import etree
# Request headers sent with every page fetch; the User-Agent is a desktop
# browser identification string (note the trailing space is part of the value).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/84.0.4147.89 Safari/537.36 '
        'SLBrowser/7.0.0.4071 SLBChan/30 '
    ),
}
async def get_page(url):
    """Fetch one listing page and write one CSV row per listing.

    The page is downloaded with the module-level ``headers``, the listing
    fields are extracted with XPath, and each listing is written through the
    module-level ``writer`` (a ``csv.writer`` created in the ``__main__``
    block), in the same column order as the header row written there.

    Args:
        url: Address of the listing page to scrape.

    Returns:
        None. The rows are emitted as a side effect on ``writer``.
    """
    async with aiohttp.ClientSession() as session:
        # session.get() is itself an async context manager, so it is entered
        # directly instead of being awaited first.
        async with session.get(url=url, headers=headers) as response:
            # text() is a coroutine (decoded body); it must be awaited.
            page_text = await response.text()

    tree = etree.HTML(page_text)
    titles = tree.xpath('//div[@class="property-content-title"]/h3//text()')
    values = tree.xpath('//p[@class="property-price-total"]/span[1]/text()')

    # The layout of every listing arrives as many small text nodes. Drop the
    # single-space separators, concatenate everything, and cut the result into
    # fixed six-character pieces (this assumes each layout description is
    # exactly six characters long -- TODO confirm against the live markup).
    layouts = tree.xpath('//div[@class="property-content-info"]/p[1]//text()')
    layout_text = ''.join(text for text in layouts if text != ' ')
    layout = [layout_text[start:start + 6]
              for start in range(0, len(layout_text) // 6 * 6, 6)]

    mi = tree.xpath('//div[@class="property-content-info"]/p[2]//text()')
    location = tree.xpath('//div[@class="property-content-info"]/p[3]//text()')
    high = tree.xpath('//div[@class="property-content-info"]/p[4]//text()')
    build_times = tree.xpath('//div[@class="property-content-info"]/p[5]//text()')
    address = tree.xpath(
        '//div[@class="property-content-info property-content-info-comm"]/p[1]//text()')
    specific_address = tree.xpath(
        '//div[@class="property-content-info property-content-info-comm"]/p[2]//text()')

    # Every three consecutive text nodes form one detailed address, joined
    # with '-'. The previous implementation inserted the separators at a fixed
    # list index, which scrambled every address once the working list grew
    # past that index; grouping by three avoids the problem entirely.
    new_specific_address = ['-'.join(specific_address[start:start + 3])
                            for start in range(0, len(specific_address) // 3 * 3, 3)]

    name = tree.xpath('//div[@class="property-extra"]/span[1]/text()')
    grade = tree.xpath('//div[@class="property-extra"]/span[2]/text()')
    website = tree.xpath('//div[@class="property-extra"]/span[3]/text()')
    urls = tree.xpath('//div[@class="property"]/a[1]/@href')

    # Diagnostic output kept from the original implementation.
    print(len(build_times))
    print(len(titles))

    # zip() stops at the shortest column, so every complete listing is written
    # (the old ``range(len(titles) - 1)`` always skipped the last one) and a
    # short column ends the output instead of raising IndexError mid-page.
    prices = [value + '万' for value in values]
    for row in zip(titles, prices, layout, mi, location, high, build_times,
                   address, new_specific_address, name, grade, website, urls):
        writer.writerow(list(row))
async def main():
    """Scrape listing pages 1 through 8 concurrently.

    One ``get_page`` task is started per page URL and all of them are awaited
    together. A page that fails does not stop the others; its error is
    reported on standard output instead of being silently discarded.

    Returns:
        None.
    """
    url_template = 'https://bj.58.com/ershoufang/p%d/'
    urls = [url_template % page_num for page_num in range(1, 9)]
    tasks = [asyncio.create_task(get_page(page_url)) for page_url in urls]

    # return_exceptions=True collects each task's exception as its result, so
    # failures can be reported here rather than surfacing later as
    # "Task exception was never retrieved".
    results = await asyncio.gather(*tasks, return_exceptions=True)
    for page_url, result in zip(urls, results):
        if isinstance(result, BaseException):
            print(f'Failed to scrape {page_url}: {result!r}')
# ---------------------------------------------------------------------------
# Module-level data preparation. These statements run when the module is
# executed/imported, i.e. before the ``__main__`` block at the bottom.
# NOTE(review): this reads 'room01.csv' while the scraper writes 'room02.csv'
# -- presumably room01.csv is an earlier scrape; confirm.
# ---------------------------------------------------------------------------
# Do not truncate columns or rows when printing DataFrames.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Use the SimHei font so the Chinese labels render, and keep the minus sign
# drawable with that font.
plt.rcParams["font.sans-serif"] = [u"SimHei"]
plt.rcParams["axes.unicode_minus"] = False
data = pd.read_csv('room01.csv', encoding='gbk')
print(data.shape)
print(data.dtypes)
print(data.columns)
# Drop the rows whose build-time text has nothing at character positions
# 29..32 (the slice that is later parsed as the year).
index01 = data[data["建造时间"].str[29:33] == ''].index
data.drop(index01, inplace=True)
# New float column 'mi': the area text with its first 29 and last 26
# characters stripped (presumably padding/unit text from the scraped HTML --
# TODO confirm).
data['mi'] = data["房子面积"].str[29:-26].astype('double')
# New float column 'price': the price text without its last character.
data['price'] = data["¥价格"].str[:-1].astype('double')
# New int column 'year': characters 29..32 of the build-time text.
data['year'] = data["建造时间"].str[29:33].astype('int')
# print(data["建造时间"].str[29:33])
# print(data.dtypes)
# New column 'months': months in use, counted from the build year up to a
# hard-coded reference point (year 2021 plus 6 months).
data['months'] = (2021 - data['year']) * 12 + 6
# Drop the rows whose rating text does not have '分' as its fourth character.
index02 = data[data['评分'].str[3:4] != '分'].index
data.drop(index02, inplace=True)
# New float column 'grade': the rating text without its last character.
data['grade'] = data['评分'].str[:-1].astype('double')
def plot01():
    """Show a bar chart of how many listings fall into each price band.

    Prices from the module-level ``data`` frame are cut into bands whose
    edges are the minimum price, every multiple of 100 from 100 to 1000 that
    lies strictly inside the data range, and the maximum price. Each bar is
    annotated with its share of all listings. Intermediate results are
    printed to standard output and the figure is shown with ``plt.show()``.

    Returns:
        None.
    """
    prices = data['price']
    lowest, highest = prices.min(), prices.max()
    # Only keep fixed edges strictly between the extremes; otherwise pd.cut
    # rejects the bins as not monotonically increasing.
    inner_edges = [edge for edge in range(100, 1001, 100) if lowest < edge < highest]
    # pd.cut builds right-closed intervals, so without include_lowest=True the
    # listing(s) priced exactly at the minimum would belong to no band.
    price_cut = pd.cut(prices, bins=[lowest, *inner_edges, highest],
                       include_lowest=True)

    # Number of listings per band (value_counts orders by count, descending).
    price_count = price_cut.value_counts()
    shares = price_count / price_count.sum()

    # Diagnostic output: share of each band, then the band labels.
    for share in shares:
        print(share)
    print(price_count.index)

    # Bar positions and heights.
    X = np.arange(len(price_count))
    print(X)
    Y = price_count
    print(Y)

    plt.figure(figsize=(8, 6))
    plt.bar(X, Y, color='b', alpha=0.5)
    plt.title("二手房价格分布图")
    plt.xlabel("价格区间")
    plt.ylabel("数量")
    plt.xticks(np.arange(len(price_count)), price_count.index, rotation=30)
    # Head-room above the tallest bar for the percentage labels.
    plt.ylim([0, price_count.max() + 100])

    percents = [str(round(share * 100, 2)) + '%' for share in shares]
    for x, y, z in zip(X, Y, percents):
        plt.text(x - 0.3, y + 5, z)
    plt.show()
def plot02():
    """Plot the mean floor area of the listings in each price band.

    The bands are ``[min, 100)``, ``[100, 200)``, ..., ``[900, 1000)`` and
    ``[1000, max]``, computed from the module-level ``data`` frame. The mean
    of column ``mi`` in each band is truncated to an integer, drawn as a line
    chart and annotated next to its point. A band with no listings is drawn
    as a gap and gets no annotation.

    Returns:
        None.
    """
    prices = data['price']
    lowest, highest = prices.min(), prices.max()
    edges = [lowest, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, highest]
    final_band = len(edges) - 2

    means = []
    x = []
    for band, (lower, upper) in enumerate(zip(edges, edges[1:])):
        # The last band is closed on the right so the most expensive listing
        # is counted too (a strict '<' against the maximum would drop it).
        closed = band == final_band
        below_upper = prices <= upper if closed else prices < upper
        band_mean = data.loc[(prices >= lower) & below_upper, 'mi'].mean()
        # The mean of an empty band is NaN and int(NaN) raises; keep NaN so
        # matplotlib leaves a gap instead of the whole plot failing.
        means.append(float('nan') if np.isnan(band_mean) else int(band_mean))
        closing = ']' if closed else ')'
        x.append(f"[{lower},{upper}{closing}")

    X = np.arange(len(x))
    Y = means
    plt.figure(figsize=(8, 10))
    plt.plot(X, Y, '-..', color='b')
    plt.title('房子价格和面积之间的关系')
    plt.xlabel('价格区间')
    plt.ylabel('平均面积')
    plt.xticks(np.arange(len(X)), x, rotation=30)
    ax = plt.gca()
    for i, j in zip(X, Y):
        if np.isnan(j):
            continue  # nothing to annotate for an empty band
        ax.text(i + 0.2, j + 4, j, bbox=dict(facecolor='red', alpha=0.3))
    plt.grid(True)
    plt.show()
def plot03():
    """Scatter floor area against months in use, sized by price.

    Reads columns ``mi``, ``months`` and ``price`` from the module-level
    ``data`` frame; each marker's area is one tenth of the listing's price.
    The figure is displayed with ``plt.show()``.
    """
    marker_sizes = data['price'] / 10
    _, axes = plt.subplots(figsize=(10, 8))
    axes.scatter(data['mi'], data['months'], s=marker_sizes, c='r')
    axes.set_xlabel("面积")
    axes.set_ylabel("使用月份")
    plt.show()
def plot04():
    """Scatter floor area against rating, sized by price.

    Reads columns ``mi``, ``grade`` and ``price`` from the module-level
    ``data`` frame; each marker's area is one tenth of the listing's price.
    The figure is displayed with ``plt.show()``.
    """
    marker_sizes = data['price'] / 10
    _, axes = plt.subplots(figsize=(10, 8))
    axes.scatter(data['mi'], data['grade'], s=marker_sizes, c='r')
    axes.set_xlabel("面积")
    axes.set_ylabel("评分")
    plt.show()
if __name__ == '__main__':
    # Script entry point: scrape the listing pages into room02.csv, then show
    # the rating/area/price scatter plot.
    # Column titles, in the same order as the rows written by get_page().
    head = ['房子描述', '¥价格', '房子构造', '房子面积', '房子朝向', '楼房层数', '建造时间', '地址', '详细地址', '户主姓名', '评分', '发布公司', '网站地址']
    with open('room02.csv', 'a', encoding='gbk', newline='') as f:
        # get_page() looks this module-level writer up when it stores rows.
        writer = csv.writer(f)
        # The file is opened for appending, so the stream starts at its end:
        # offset 0 means the file is new or empty. Writing the header only in
        # that case keeps repeated runs from adding header rows between data.
        if f.tell() == 0:
            writer.writerow(head)
        # asyncio.run() creates and closes the event loop itself; calling
        # get_event_loop() with no running loop is deprecated.
        asyncio.run(main())
    # The with-statement has already closed the file; no explicit close().
    plot04()