Python异步爬虫+数据分析+数据可视化

Python爬虫+数据分析+数据可视化

import csv
import pandas as pd
import numpy as np
import asyncio
import aiohttp
from pandas import Series, DataFrame
# import matplotlib as mpl
import matplotlib.pyplot as plt
from lxml import etree

# Request headers sent with every page fetch; the User-Agent string is that of
# a desktop Chromium-based browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/84.0.4147.89 Safari/537.36 '
        'SLBrowser/7.0.0.4071 SLBChan/30 '
    ),
}


def _chunk_text(text, size):
    """Cut *text* into consecutive *size*-character pieces, dropping a short tail."""
    return [text[i * size:(i + 1) * size] for i in range(len(text) // size)]


def _join_in_groups(parts, size, sep):
    """Join every *size* consecutive items of *parts* with *sep*, dropping a short tail."""
    return [sep.join(parts[i * size:(i + 1) * size]) for i in range(len(parts) // size)]


async def get_page(url):
    """Download one listing page, extract its listings and write them as CSV rows.

    The page is fetched with the module-level ``headers`` and every extracted
    listing is written through the module-level ``writer`` (a ``csv.writer``
    that must exist before this coroutine runs).  Each row has thirteen
    fields: title, price (with a trailing '万'), layout, area, orientation,
    floor, build time, address, detailed address, owner name, rating,
    publishing company and listing URL.

    Args:
        url: Address of the listing page to scrape.
    """
    async with aiohttp.ClientSession() as session:
        # session.get() is itself usable as an async context manager; no
        # extra ``await`` is needed in front of it.
        async with session.get(url=url, headers=headers) as response:
            # text() must be awaited; it yields the decoded response body.
            page_text = await response.text()

    tree = etree.HTML(page_text)
    info = '//div[@class="property-content-info"]'
    comm = '//div[@class="property-content-info property-content-info-comm"]'
    extra = '//div[@class="property-extra"]'

    titles = tree.xpath('//div[@class="property-content-title"]/h3//text()')
    values = tree.xpath('//p[@class="property-price-total"]/span[1]/text()')

    # The layout arrives as many small text nodes; drop the single-space
    # nodes, concatenate the rest and cut the result into six-character
    # pieces.
    # NOTE(review): assumes every listing's layout text is exactly six
    # characters long - confirm against the live markup.
    layout_text = ''.join(node for node in tree.xpath(info + '/p[1]//text()') if node != ' ')
    layout = _chunk_text(layout_text, 6)

    mi = tree.xpath(info + '/p[2]//text()')
    location = tree.xpath(info + '/p[3]//text()')
    high = tree.xpath(info + '/p[4]//text()')
    build_times = tree.xpath(info + '/p[5]//text()')
    address = tree.xpath(comm + '/p[1]//text()')

    # Every three consecutive text nodes are combined into one 'a-b-c'
    # detailed address.  The previous insert()-at-fixed-index approach only
    # behaved like an append while the list was short; later separators
    # landed in the middle of the list and garbled the remaining addresses.
    specific_address = tree.xpath(comm + '/p[2]//text()')
    new_specific_address = _join_in_groups(specific_address, 3, '-')

    name = tree.xpath(extra + '/span[1]/text()')
    grade = tree.xpath(extra + '/span[2]/text()')
    website = tree.xpath(extra + '/span[3]/text()')
    urls = tree.xpath('//div[@class="property"]/a[1]/@href')

    # Diagnostic output: number of build-time nodes and number of titles.
    print(len(build_times))
    print(len(titles))

    # zip() stops at the shortest column, so a listing with a missing field
    # can no longer raise IndexError, and the last listing on the page is no
    # longer skipped (the old loop ran to len(titles) - 1).
    columns = zip(titles, values, layout, mi, location, high, build_times, address,
                  new_specific_address, name, grade, website, urls)
    for title, value, *rest in columns:
        writer.writerow([title, value + '万', *rest])


async def main():
    """Scrape listing pages 1 through 8 concurrently.

    One ``get_page`` task is started per page.  A page that fails does not
    stop the others; its exception is printed once all tasks have finished.
    """
    page_template = 'https://bj.58.com/ershoufang/p%d/'
    urls = [page_template % page_num for page_num in range(1, 9)]

    tasks = [asyncio.create_task(get_page(page_url)) for page_url in urls]

    # return_exceptions=True keeps one failing page from aborting the rest
    # while still handing the exception back, so failures are reported
    # instead of being silently discarded.
    results = await asyncio.gather(*tasks, return_exceptions=True)
    for page_url, result in zip(urls, results):
        if isinstance(result, BaseException):
            print(f'failed to scrape {page_url}: {result!r}')


# Show every row and every column when a DataFrame is printed.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Use the SimHei font for the plots and keep the minus sign rendering
# correctly alongside it.
plt.rcParams.update({
    "font.sans-serif": [u"SimHei"],
    "axes.unicode_minus": False,
})

data = pd.read_csv('room01.csv', encoding='gbk')

print(data.shape)
print(data.dtypes)
print(data.columns)

# Remove surplus rows: those whose build-time text has nothing at character
# positions 29..32.
index01 = data.loc[data["建造时间"].str.slice(29, 33) == ''].index
data.drop(index=index01, inplace=True)

# New float column "mi": the area text without its first 29 and last 26
# characters.
data['mi'] = data["房子面积"].str.slice(29, -26).astype('float64')

# New float column "price": the price text without its final character.
data['price'] = data["¥价格"].str.slice(stop=-1).astype('float64')

# New integer column "year": characters 29..32 of the build-time text.
data['year'] = data["建造时间"].str.slice(29, 33).astype(int)

# New column "months": how many months the property has been in use,
# counted from the build year up to mid-2021.
data['months'] = 12 * (2021 - data['year']) + 6

# Remove rows whose rating text does not have '分' as its fourth character.
index02 = data.loc[data['评分'].str.slice(3, 4) != '分'].index
data.drop(index=index02, inplace=True)

# New float column "grade": the rating text without its final character.
data['grade'] = data['评分'].str.slice(stop=-1).astype('float64')


def plot01():
    """Draw a bar chart of the number of listings in each price band.

    Prices from the module-level ``data`` frame are binned at every multiple
    of 100 between the observed minimum and maximum.  Each bar is labelled
    with its share of all listings, and the intermediate results are printed.
    Bars appear in ``value_counts`` order (most populated band first), not in
    ascending price order.
    """
    prices = data['price']
    lowest, highest = prices.min(), prices.max()

    # pd.cut requires strictly increasing edges, so only the 100..1000 edges
    # that fall strictly inside the observed range are used; a data set whose
    # cheapest listing costs 100 or more (or whose dearest costs 1000 or less)
    # would otherwise raise ValueError.
    inner_edges = [edge for edge in range(100, 1001, 100) if lowest < edge < highest]

    # include_lowest=True keeps the cheapest listing(s) in the first band;
    # with the default right-closed bins they would be left out as NaN.
    price_cut = pd.cut(prices, bins=[lowest, *inner_edges, highest], include_lowest=True)

    # Number of listings per price band, and each band's share of the total.
    price_count = price_cut.value_counts()
    shares = price_count / price_count.sum()

    for share in shares:
        print(share)

    print(price_count.index)

    X = np.arange(len(price_count))
    print(X)
    Y = price_count
    print(Y)

    plt.figure(figsize=(8, 6))
    plt.bar(X, Y, color='b', alpha=0.5)
    plt.title("二手房价格分布图")
    plt.xlabel("价格区间")
    plt.ylabel("数量")
    plt.xticks(X, price_count.index, rotation=30)
    # Leave headroom above the tallest bar for the percentage labels.
    plt.ylim([0, price_count.max() + 100])

    percents = [str(round(share * 100, 2)) + '%' for share in shares]
    for x, y, z in zip(X, Y, percents):
        plt.text(x - 0.3, y + 5, z)
    plt.show()


def plot02():
    """Plot the mean floor area of the listings in each price band.

    The bands are [min, 100), [100, 200), ..., [900, 1000) and [1000, max],
    taken from the ``price`` column of the module-level ``data`` frame; the
    plotted value is the truncated mean of the ``mi`` column within the band.
    A band with no listings is left as a gap in the line.
    """
    prices = data['price']
    edges = [prices.min(), *range(100, 1001, 100), prices.max()]
    last_band = len(edges) - 2

    means = []
    for band, (lower, upper) in enumerate(zip(edges, edges[1:])):
        # Every band is half-open except the last one, which is closed so
        # that the most expensive listing is counted as well.
        below_upper = prices <= upper if band == last_band else prices < upper
        mean_area = data.loc[(prices >= lower) & below_upper, 'mi'].mean()
        # An empty band has a NaN mean, which int() cannot convert.
        means.append(np.nan if pd.isna(mean_area) else int(mean_area))

    x = [f"[{data['price'].min()},100)", "[100,200)", "[200,300)", "[300,400)", "[400,500)", "[500,600)", "[600,700)",
         "[700,800)", "[800,900)", "[900,1000)", f"[1000,{data['price'].max()}]"]

    X = np.arange(len(x))
    Y = means

    plt.figure(figsize=(8, 10))
    plt.plot(X, Y, '-..', color='b')
    plt.title('房子价格和面积之间的关系')
    plt.xlabel('价格区间')
    plt.ylabel('平均面积')
    plt.xticks(X, x, rotation=30)
    ax = plt.gca()
    for i, j in zip(X, Y):
        # Empty bands have nothing to annotate.
        if not pd.isna(j):
            ax.text(i + 0.2, j + 4, j, bbox=dict(facecolor='red', alpha=0.3))
    plt.grid(True)

    plt.show()


def plot03():
    """Scatter months in use against floor area, with marker size scaled by price.

    Reads the ``mi``, ``months`` and ``price`` columns of the module-level
    ``data`` frame; each marker's size is the listing price divided by 10.
    """
    plt.figure(figsize=(10, 8))

    area, months_in_use = data['mi'], data['months']
    marker_sizes = data['price'] / 10
    plt.scatter(x=area, y=months_in_use, s=marker_sizes, c='r')

    plt.xlabel("面积")
    plt.ylabel("使用月份")
    plt.show()


def plot04():
    """Scatter rating against floor area, with marker size scaled by price.

    Reads the ``mi``, ``grade`` and ``price`` columns of the module-level
    ``data`` frame; each marker's size is the listing price divided by 10.
    """
    plt.figure(figsize=(10, 8))

    area, rating = data['mi'], data['grade']
    marker_sizes = data['price'] / 10
    plt.scatter(x=area, y=rating, s=marker_sizes, c='r')

    plt.xlabel("面积")
    plt.ylabel("评分")
    plt.show()


if __name__ == '__main__':
    # Script entry point: scrape the listing pages into room02.csv, then show
    # the rating/area scatter plot.  The plot uses the module-level ``data``
    # frame, which was loaded from room01.csv, not the rows scraped here.
    head = ['房子描述', '¥价格', '房子构造', '房子面积', '房子朝向', '楼房层数', '建造时间', '地址', '详细地址', '户主姓名', '评分', '发布公司', '网站地址']

    # newline='' lets the csv module control line endings itself.
    with open('room02.csv', 'a', encoding='gbk', newline='') as f:
        # get_page() writes its rows through this module-level writer.
        writer = csv.writer(f)
        # Append mode opens the file positioned at its end, so offset 0 means
        # the file is new or empty and still needs its header row; writing
        # the header unconditionally duplicated it on every run.
        if f.tell() == 0:
            writer.writerow(head)
        # asyncio.run() creates and closes the event loop itself; calling
        # asyncio.get_event_loop() with no running loop is deprecated.
        asyncio.run(main())
    # The with-block has already closed the file; no explicit close() needed.
    plot04()

上一篇:1321:【例6.3】删数问题(Noip1994)


下一篇:轻轻松松入门:Java 的基础知识