《利用python进行数据分析》第二版第14章-数据分析示例学习笔记1

2022-12-06 09:09:50

文章目录

一、从Bitly获取1.USA.gov数据
- 纯python下对时区进行计数
- 利用pandas对时区进行计数

import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from numpy.random import randn
np.random.seed(123)

一、从Bitly获取1.USA.gov数据

2011 年，短网址服务商 Bitly 与美国政府网站 USA.gov 合作，提供了一份匿名数据，这些数据收集自生成以 .gov 或 .mil 结尾的短网址的用户。以每小时快照为例，文件中每一行的格式均为 JSON（即 JavaScript Object Notation，一种常用的 Web 数据格式），该数据集共有十八个维度。若只读取某个文件中的第一行，所看到的结果如下：

path = 'datasets/bitly_usagov/example.txt'

# Read only the first line. The with-block closes the file handle promptly
# instead of leaving it open until garbage collection, and the explicit
# encoding keeps the read independent of the platform default.
with open(path, encoding='utf-8') as f:
    first_line = f.readline()
first_line
'''
'{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1, "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u": "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'
'''

# Parse the file line by line with json.loads(); each JSON line becomes a
# Python dict. The with-block guarantees the file is closed once all lines
# have been parsed, even if a line fails to decode.
import json
path = 'datasets/bitly_usagov/example.txt'
with open(path, encoding='utf-8') as f:
    records = [json.loads(line) for line in f]

# 查看加载的数据的第一行
records[0]
'''
{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
 'c': 'US',
 'nk': 1,
 'tz': 'America/New_York',
 'gr': 'MA',
 'g': 'A6qOVH',
 'h': 'wfLQtf',
 'l': 'orofrog',
 'al': 'en-US,en;q=0.8',
 'hh': '1.usa.gov',
 'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
 'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991',
 't': 1331923247,
 'hc': 1331822918,
 'cy': 'Danvers',
 'll': [42.576698, -70.954903]}
'''

纯python下对时区进行计数

找到数据集中最常出现的时区（tz字段）

# 用列表推导式提取时区列表
# 由于并不是所有的记录都有tz时区数据，故会报错
time_zones = [rec['tz'] for rec in records]
'''
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-5-f3fbbc37f129> in <module>
----> 1 time_zones = [rec['tz'] for rec in records]

<ipython-input-5-f3fbbc37f129> in <listcomp>(.0)
----> 1 time_zones = [rec['tz'] for rec in records]

KeyError: 'tz'
'''

# 处理以上报错，在列表推导式结尾添加条件
# 但却发现有些时区是空字符串，这些其实也可以过滤掉（这里不做此项处理）
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
time_zones[:10]
'''
['America/New_York',
 'America/Denver',
 'America/New_York',
 'America/Sao_Paulo',
 'America/New_York',
 'America/New_York',
 'Europe/Warsaw',
 '',
 '',
 '']
'''

纯python下通过定义函数来实现计数

# 定义函数，在遍历时区时用字典来存储计数
def get_counts(sequence):
    """Tally how many times each item occurs in ``sequence``.

    Returns a plain dict mapping every distinct item to its occurrence
    count; keys appear in order of first occurrence.
    """
    tally = {}
    for item in sequence:
        # dict.get supplies 0 the first time an item is seen.
        tally[item] = tally.get(item, 0) + 1
    return tally

# 以上函数的另一种实现方式
# defaultdict()的用法
from collections import defaultdict

def get_counts2(sequence):
    """Count occurrences of each item in ``sequence`` with a defaultdict.

    Returns a ``defaultdict(int)``: looking up a key that was never seen
    yields 0 (and inserts that key) instead of raising ``KeyError``.
    """
    tally = defaultdict(int)  # missing keys start at int() == 0
    for item in sequence:
        tally[item] = tally[item] + 1
    return tally

# 传递time_zones列表给刚刚的函数，得到字典
counts = get_counts(time_zones)

# 查看tz为'America/New_York'的计数
counts['America/New_York']
'''1251'''

# 查看tz字段非缺失值的总的计数，含空值
len(time_zones)
'''3440'''

# 定义函数，获取排名前10的时区及其计数
def top_counts(count_dict, n=10):
    """Return the ``n`` most frequent entries of ``count_dict``.

    Args:
        count_dict: Mapping from a key (e.g. a time zone name) to its count.
        n: Maximum number of entries to return.

    Returns:
        A list of ``(count, key)`` tuples sorted in ascending order, so the
        most frequent entry comes last. Entries with equal counts are
        ordered by key. An empty list is returned when ``n`` is zero or
        negative.
    """
    if n <= 0:
        # seq[-0:] is seq[0:], which would hand back every entry.
        return []
    value_key_pairs = sorted((count, tz) for tz, count in count_dict.items())
    return value_key_pairs[-n:]

# 传递存储时区和计数的字典给刚刚的函数，即可得到最常出现的前10个时区
top_counts(counts)
'''
[(33, 'America/Sao_Paulo'),
 (35, 'Europe/Madrid'),
 (36, 'Pacific/Honolulu'),
 (37, 'Asia/Tokyo'),
 (74, 'Europe/London'),
 (191, 'America/Denver'),
 (382, 'America/Los_Angeles'),
 (400, 'America/Chicago'),
 (521, ''),
 (1251, 'America/New_York')]
'''

纯python下利用标准库collections.Counter()类实现计数

from collections import Counter
# 传递时区列表time_zones给Counter()
counts = Counter(time_zones)
counts.most_common(10)
'''
[('America/New_York', 1251),
 ('', 521),
 ('America/Chicago', 400),
 ('America/Los_Angeles', 382),
 ('America/Denver', 191),
 ('Europe/London', 74),
 ('Asia/Tokyo', 37),
 ('Pacific/Honolulu', 36),
 ('Europe/Madrid', 35),
 ('America/Sao_Paulo', 33)]
'''

利用pandas对时区进行计数

# 将原始记录的列表传递给pd.DataFrame()生成DataFrame
frame = pd.DataFrame(records)
frame.info()
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3560 entries, 0 to 3559
Data columns (total 18 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   a            3440 non-null   object 
 1   c            2919 non-null   object 
 2   nk           3440 non-null   float64
 3   tz           3440 non-null   object 
 4   gr           2919 non-null   object 
 5   g            3440 non-null   object 
 6   h            3440 non-null   object 
 7   l            3440 non-null   object 
 8   al           3094 non-null   object 
 9   hh           3440 non-null   object 
 10  r            3440 non-null   object 
 11  u            3440 non-null   object 
 12  t            3440 non-null   float64
 13  hc           3440 non-null   float64
 14  cy           2919 non-null   object 
 15  ll           2919 non-null   object 
 16  _heartbeat_  120 non-null    float64
 17  kw           93 non-null     object 
dtypes: float64(4), object(14)
memory usage: 500.8+ KB
'''

# 利用索引切片查看前10行时区数据
frame['tz'][:10]
'''
0     America/New_York
1       America/Denver
2     America/New_York
3    America/Sao_Paulo
4     America/New_York
5     America/New_York
6        Europe/Warsaw
7                     
8                     
9                     
Name: tz, dtype: object
'''

# 对时区进行计数，用Series 的value_counts()
tz_counts = frame['tz'].value_counts()
tz_counts[:10]
'''
America/New_York       1251
                        521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35
America/Sao_Paulo        33
Name: tz, dtype: int64
'''

# 处理缺失值
clean_tz = frame['tz'].fillna('Missing')
# 处理空值
clean_tz[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()
tz_counts[:10]
'''
America/New_York       1251
Unknown                 521
America/Chicago         400
America/Los_Angeles     382
America/Denver          191
Missing                 120
Europe/London            74
Asia/Tokyo               37
Pacific/Honolulu         36
Europe/Madrid            35
Name: tz, dtype: int64
'''

# 对处理后的前10名数据进行可视化
import seaborn as sns
%matplotlib inline

subset = tz_counts[:10]
sns.barplot(y=subset.index, x=subset.values)

# a 字段含有执行 URL 短缩操作的浏览器、设备、应用程序的相关信息

frame['a'][1]
'''
'GoogleMaps/RochesterNY'
'''

frame['a'][50]
'''
'Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2'
'''

# 选取a字段第52行的数据的前50个字符
frame['a'][51][:50]  # long line
'''
'Mozilla/5.0 (Linux; U; Android 2.2.2; en-us; LG-P9'
'''

# 从字段a中解析出感兴趣的信息的做法
# 分离字符串中的第一个标记（大致对应于浏览器信息）
# x.split()[0]表示将a字段的信息遇到空白就分开，并选取第一个标记
results = pd.Series([x.split()[0] for x in frame.a.dropna()])
results[:5]
'''
0               Mozilla/5.0
1    GoogleMaps/RochesterNY
2               Mozilla/4.0
3               Mozilla/5.0
4               Mozilla/5.0
dtype: object
'''

results.value_counts()[:8]
'''
Mozilla/5.0                 2594
Mozilla/4.0                  601
GoogleMaps/RochesterNY       121
Opera/9.80                    34
TEST_INTERNET_AGENT           24
GoogleProducer                21
Mozilla/6.0                    5
BlackBerry8520/5.0.0.681       4
dtype: int64
'''

将计数最多的几个时区的记录按 Windows 和非 Windows 用户拆分，并统计同一时区内两类用户各自的占比

# Exclude records whose agent string is missing. Take an explicit copy so
# that the column added below lands on an independent DataFrame rather than
# on a slice of `frame` (avoids pandas' SettingWithCopyWarning).
cframe = frame[frame.a.notnull()].copy()

# Flag Windows users in a new 'os' column
cframe['os'] = np.where(cframe['a'].str.contains('Windows'), 'Windows', 'Not Windows')
cframe['os'][:5]
'''
0        Windows
1    Not Windows
2        Windows
3    Not Windows
4        Windows
Name: os, dtype: object
'''

# 根据时区列及新生成的操作系统列对数据分组
by_tz_os = cframe.groupby(['tz', 'os'])

# by_tz_os.size()计算每组的大小
agg_counts = by_tz_os.size().unstack().fillna(0)
agg_counts[:10]

os	Not Windows	Windows
tz
	245.0	276.0
Africa/Cairo	0.0	3.0
Africa/Casablanca	0.0	1.0
Africa/Ceuta	0.0	2.0
Africa/Johannesburg	0.0	1.0
Africa/Lusaka	0.0	1.0
America/Anchorage	4.0	1.0
America/Argentina/Buenos_Aires	1.0	0.0
America/Argentina/Cordoba	0.0	1.0
America/Argentina/Mendoza	0.0	1.0

# 得出总体计数最高的时区在原序列中的索引
# agg_counts.sum(axis=1)计算时区总数
# argsort()得出排序后的数据在原序列中的索引
indexer = agg_counts.sum(axis=1).argsort()
indexer[-10:]
'''
tz
Europe/Sofia        35
Europe/Stockholm    78
Europe/Uzhgorod     96
Europe/Vienna       59
Europe/Vilnius      77
Europe/Volgograd    15
Europe/Warsaw       22
Europe/Zurich       12
Pacific/Auckland     0
Pacific/Honolulu    29
dtype: int64
'''


# 用take()方法沿着指定轴返回给定索引处的元素，默认axis=0
count_subset = agg_counts.take(indexer[-10:])
count_subset

os	Not Windows	Windows
tz
America/Sao_Paulo	13.0	20.0
Europe/Madrid	16.0	19.0
Pacific/Honolulu	0.0	36.0
Asia/Tokyo	2.0	35.0
Europe/London	43.0	31.0
America/Denver	132.0	59.0
America/Los_Angeles	130.0	252.0
America/Chicago	115.0	285.0
	245.0	276.0
America/New_York	339.0	912.0

# 可以实现上述结果，但是返回的数据不是原序列中的格式
agg_counts.sum(1).nlargest(10)
'''
tz
America/New_York       1251.0
                        521.0
America/Chicago         400.0
America/Los_Angeles     382.0
America/Denver          191.0
Europe/London            74.0
Asia/Tokyo               37.0
Pacific/Honolulu         36.0
Europe/Madrid            35.0
America/Sao_Paulo        33.0
dtype: float64
'''

# 对绘图数据重新排列
count_subset = count_subset.stack()
count_subset
'''
tz                   os         
America/Sao_Paulo    Not Windows     13.0
                     Windows         20.0
Europe/Madrid        Not Windows     16.0
                     Windows         19.0
Pacific/Honolulu     Not Windows      0.0
                     Windows         36.0
Asia/Tokyo           Not Windows      2.0
                     Windows         35.0
Europe/London        Not Windows     43.0
                     Windows         31.0
America/Denver       Not Windows    132.0
                     Windows         59.0
America/Los_Angeles  Not Windows    130.0
                     Windows        252.0
America/Chicago      Not Windows    115.0
                     Windows        285.0
                     Not Windows    245.0
                     Windows        276.0
America/New_York     Not Windows    339.0
                     Windows        912.0
dtype: float64
'''

# 给列取名为'total'，因为此时前面的为层次化索引
count_subset.name = 'total'
# 剔除层次化索引
count_subset = count_subset.reset_index()
count_subset[:10]

	tz	os	total
0	America/Sao_Paulo	Not Windows	13.0
1	America/Sao_Paulo	Windows	20.0
2	Europe/Madrid	Not Windows	16.0
3	Europe/Madrid	Windows	19.0
4	Pacific/Honolulu	Not Windows	0.0
5	Pacific/Honolulu	Windows	36.0
6	Asia/Tokyo	Not Windows	2.0
7	Asia/Tokyo	Windows	35.0
8	Europe/London	Not Windows	43.0
9	Europe/London	Windows	31.0

# 每个时区分组中，windows用户和非windows用户的数量
sns.barplot(x='total', y='tz', hue='os',  data=count_subset)

# 定义函数，计算按时区分组中windows用户和非windows用户的比例；即将组百分比归一化为1
def norm_total(group):
    """Add a ``normed_total`` column with each row's share of the group total.

    Args:
        group: DataFrame with a numeric ``total`` column.

    Returns:
        A new DataFrame containing the columns of ``group`` plus
        ``normed_total`` (``total`` divided by the sum of ``total``).
        The input frame is left unmodified.
    """
    # assign() returns a copy; mutating the object handed over by
    # groupby.apply is documented by pandas as unsupported.
    return group.assign(normed_total=group.total / group.total.sum())

results = count_subset.groupby('tz').apply(norm_total)

sns.barplot(x='normed_total', y='tz', hue='os',  data=results)

# 以下也可用于将组百分比归一化为1的处理
g = count_subset.groupby('tz')
results2 = count_subset.total / g.total.transform('sum')

码农公寓

文章目录

一、从Bitly获取1.USA.gov数据

纯python下对时区进行计数

利用pandas对时区进行计数

相关文章