文章目录
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from numpy.random import randn
# Fix NumPy's global random seed so any random draws in this session are reproducible.
np.random.seed(123)
一、从Bitly获取1.USA.gov数据
2011 年,URL 缩短服务商 Bitly 与美国政府网站 USA.gov 合作,提供了一份匿名数据,这些数据收集自生成以 .gov 或 .mil 结尾的短网址的用户。以每小时快照为例,文件中各行的格式为 JSON(即 JavaScript Object Notation,一种常用的 Web 数据格式),该数据集共有十八个字段。若只读取某个文件中的第一行,所看到的结果如下:
path = 'datasets/bitly_usagov/example.txt'
# Read only the first line. The with-block closes the file handle as soon as
# the line has been read, and the explicit encoding avoids depending on the
# platform's default locale encoding.
with open(path, encoding='utf-8') as f:
    first_line = f.readline()
# Bare expression so the notebook displays the raw JSON line.
first_line
'''
'{ "a": "Mozilla\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\/535.11 (KHTML, like Gecko) Chrome\\/17.0.963.78 Safari\\/535.11", "c": "US", "nk": 1, "tz": "America\\/New_York", "gr": "MA", "g": "A6qOVH", "h": "wfLQtf", "l": "orofrog", "al": "en-US,en;q=0.8", "hh": "1.usa.gov", "r": "http:\\/\\/www.facebook.com\\/l\\/7AQEFzjSi\\/1.usa.gov\\/wfLQtf", "u": "http:\\/\\/www.ncbi.nlm.nih.gov\\/pubmed\\/22415991", "t": 1331923247, "hc": 1331822918, "cy": "Danvers", "ll": [ 42.576698, -70.954903 ] }\n'
'''
# Load the file line by line: json.loads() converts each JSON string into a
# Python object (here, a dict per line).
import json
path = 'datasets/bitly_usagov/example.txt'
# The with-block guarantees the file is closed once every line has been parsed,
# instead of leaving the handle open until garbage collection.
with open(path, encoding='utf-8') as f:
    records = [json.loads(line) for line in f]
# Inspect the first loaded record.
records[0]
'''
{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
'c': 'US',
'nk': 1,
'tz': 'America/New_York',
'gr': 'MA',
'g': 'A6qOVH',
'h': 'wfLQtf',
'l': 'orofrog',
'al': 'en-US,en;q=0.8',
'hh': '1.usa.gov',
'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991',
't': 1331923247,
'hc': 1331822918,
'cy': 'Danvers',
'll': [42.576698, -70.954903]}
'''
纯python下对时区进行计数
找到数据集中最常出现的时区(tz字段)
# Extract the list of time zones with a list comprehension.
# Not every record has a 'tz' field, so this line raises KeyError
# (the failure is intentional; see the traceback that follows).
time_zones = [rec['tz'] for rec in records]
'''
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-5-f3fbbc37f129> in <module>
----> 1 time_zones = [rec['tz'] for rec in records]
<ipython-input-5-f3fbbc37f129> in <listcomp>(.0)
----> 1 time_zones = [rec['tz'] for rec in records]
KeyError: 'tz'
'''
# Fix the KeyError by adding a membership condition at the end of the list comprehension.
# Some time zones turn out to be empty strings; those could be filtered out as well
# (not done here).
time_zones = [rec['tz'] for rec in records if 'tz' in rec]
time_zones[:10]
'''
['America/New_York',
'America/Denver',
'America/New_York',
'America/Sao_Paulo',
'America/New_York',
'America/New_York',
'Europe/Warsaw',
'',
'',
'']
'''
纯python下通过定义函数来实现计数
# 定义函数,在遍历时区时用字典来存储计数
def get_counts(sequence):
    """Count how often each distinct item occurs in ``sequence``.

    Args:
        sequence: Iterable of hashable items (here: time-zone strings).

    Returns:
        A plain ``dict`` mapping each distinct item to its number of
        occurrences. An empty input yields an empty dict.
    """
    counts = {}
    for x in sequence:
        if x in counts:
            counts[x] += 1
        else:
            # First time this item is seen: start its tally at 1.
            counts[x] = 1
    return counts
# 以上函数的另一种实现方式
# defaultdict()的用法
from collections import defaultdict
def get_counts2(sequence):
    """Count how often each distinct item occurs, using ``defaultdict``.

    Args:
        sequence: Iterable of hashable items.

    Returns:
        A ``defaultdict(int)`` mapping each distinct item to its number of
        occurrences; looking up an unseen key yields 0.
    """
    counts = defaultdict(int)  # missing keys are initialised to 0
    for x in sequence:
        counts[x] += 1
    return counts
# Pass the time_zones list to the counting function defined above to get a dict of counts.
counts = get_counts(time_zones)
# Look up the count for the 'America/New_York' time zone.
counts['America/New_York']
'''1251'''
# Total number of records that have a tz field (empty strings included).
len(time_zones)
'''3440'''
# 定义函数,获取排名前10的时区及其计数
def top_counts(count_dict, n=10):
    """Return the ``n`` most frequent entries of ``count_dict``.

    Args:
        count_dict: Mapping of key (here: time zone) to its count.
        n: Maximum number of entries to return. Defaults to 10.

    Returns:
        A list of ``(count, key)`` tuples sorted in ascending order, so the
        most frequent entry comes last. Fewer than ``n`` tuples are returned
        when the mapping is smaller; an empty list when ``n`` is not positive.
    """
    if n <= 0:
        # A slice of [-0:] would return the whole list rather than nothing.
        return []
    value_key_pairs = sorted((count, tz) for tz, count in count_dict.items())
    return value_key_pairs[-n:]
# Pass the dict of time-zone counts to top_counts() to get the 10 most common time zones.
top_counts(counts)
'''
[(33, 'America/Sao_Paulo'),
(35, 'Europe/Madrid'),
(36, 'Pacific/Honolulu'),
(37, 'Asia/Tokyo'),
(74, 'Europe/London'),
(191, 'America/Denver'),
(382, 'America/Los_Angeles'),
(400, 'America/Chicago'),
(521, ''),
(1251, 'America/New_York')]
'''
纯python下利用标准库collections.Counter()类实现计数
from collections import Counter
# Pass the time_zones list to Counter() and ask for the 10 most common entries.
counts = Counter(time_zones)
counts.most_common(10)
'''
[('America/New_York', 1251),
('', 521),
('America/Chicago', 400),
('America/Los_Angeles', 382),
('America/Denver', 191),
('Europe/London', 74),
('Asia/Tokyo', 37),
('Pacific/Honolulu', 36),
('Europe/Madrid', 35),
('America/Sao_Paulo', 33)]
'''
利用pandas对时区进行计数
# Build a DataFrame by passing the list of raw records to pd.DataFrame().
frame = pd.DataFrame(records)
# Print a summary of the columns, non-null counts and dtypes.
frame.info()
'''
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3560 entries, 0 to 3559
Data columns (total 18 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 a 3440 non-null object
1 c 2919 non-null object
2 nk 3440 non-null float64
3 tz 3440 non-null object
4 gr 2919 non-null object
5 g 3440 non-null object
6 h 3440 non-null object
7 l 3440 non-null object
8 al 3094 non-null object
9 hh 3440 non-null object
10 r 3440 non-null object
11 u 3440 non-null object
12 t 3440 non-null float64
13 hc 3440 non-null float64
14 cy 2919 non-null object
15 ll 2919 non-null object
16 _heartbeat_ 120 non-null float64
17 kw 93 non-null object
dtypes: float64(4), object(14)
memory usage: 500.8+ KB
'''
# Show the first 10 rows of the time-zone column using a slice.
frame['tz'][:10]
'''
0 America/New_York
1 America/Denver
2 America/New_York
3 America/Sao_Paulo
4 America/New_York
5 America/New_York
6 Europe/Warsaw
7
8
9
Name: tz, dtype: object
'''
# Count the time zones with Series.value_counts() and show the 10 most frequent.
tz_counts = frame['tz'].value_counts()
tz_counts[:10]
'''
America/New_York 1251
521
America/Chicago 400
America/Los_Angeles 382
America/Denver 191
Europe/London 74
Asia/Tokyo 37
Pacific/Honolulu 36
Europe/Madrid 35
America/Sao_Paulo 33
Name: tz, dtype: int64
'''
# Replace missing values in the tz column with the label 'Missing'.
clean_tz = frame['tz'].fillna('Missing')
# Replace empty-string time zones with 'Unknown' via boolean-mask assignment.
clean_tz[clean_tz == ''] = 'Unknown'
# Recount on the cleaned series and show the 10 most frequent values.
tz_counts = clean_tz.value_counts()
tz_counts[:10]
'''
America/New_York 1251
Unknown 521
America/Chicago 400
America/Los_Angeles 382
America/Denver 191
Missing 120
Europe/London 74
Asia/Tokyo 37
Pacific/Honolulu 36
Europe/Madrid 35
Name: tz, dtype: int64
'''
# Visualise the top 10 time zones from the cleaned counts as a bar chart.
import seaborn as sns
# IPython magic (notebook only, not plain Python): render figures inline.
%matplotlib inline
subset = tz_counts[:10]
# Time-zone names go on the y axis and their counts on the x axis.
sns.barplot(y=subset.index, x=subset.values)
# Field 'a' holds information about the browser, device or application
# that performed the URL shortening.
frame['a'][1]
'''
'GoogleMaps/RochesterNY'
'''
frame['a'][50]
'''
'Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2'
'''
# Take the first 50 characters of field 'a' in the 52nd row (index 51).
frame['a'][51][:50] # long line
'''
'Mozilla/5.0 (Linux; U; Android 2.2.2; en-us; LG-P9'
'''
# One way to extract the interesting part of field 'a':
# isolate the first token of the string (roughly corresponds to the browser).
# x.split()[0] splits the agent string on whitespace and keeps the first token;
# dropna() skips rows where 'a' is missing.
results = pd.Series([x.split()[0] for x in frame.a.dropna()])
results[:5]
'''
0 Mozilla/5.0
1 GoogleMaps/RochesterNY
2 Mozilla/4.0
3 Mozilla/5.0
4 Mozilla/5.0
dtype: object
'''
# Count the first tokens and show the 8 most frequent ones.
results.value_counts()[:8]
'''
Mozilla/5.0 2594
Mozilla/4.0 601
GoogleMaps/RochesterNY 121
Opera/9.80 34
TEST_INTERNET_AGENT 24
GoogleProducer 21
Mozilla/6.0 5
BlackBerry8520/5.0.0.681 4
dtype: int64
'''
将时区计数多的时区记录分解为Windows和非Windows用户,并统计相同时区下其占比
# Exclude records whose agent string is missing. The explicit .copy() makes
# cframe an independent DataFrame, so adding a column below neither triggers
# pandas' SettingWithCopyWarning nor depends on view-vs-copy behaviour.
cframe = frame[frame.a.notnull()].copy()
# Flag each record by whether its agent string contains 'Windows' and store
# the label in a new 'os' column.
cframe['os'] = np.where(cframe['a'].str.contains('Windows'), 'Windows', 'Not Windows')
cframe['os'][:5]
'''
0 Windows
1 Not Windows
2 Windows
3 Not Windows
4 Windows
Name: os, dtype: object
'''
# Group the data by the time-zone column and the newly created 'os' column.
by_tz_os = cframe.groupby(['tz', 'os'])
# size() gives the number of rows in each group; unstack() pivots 'os' into
# columns and fillna(0) fills the combinations that never occur.
agg_counts = by_tz_os.size().unstack().fillna(0)
agg_counts[:10]
os | Not Windows | Windows |
---|---|---|
tz | ||
'' | 245.0 | 276.0 |
Africa/Cairo | 0.0 | 3.0 |
Africa/Casablanca | 0.0 | 1.0 |
Africa/Ceuta | 0.0 | 2.0 |
Africa/Johannesburg | 0.0 | 1.0 |
Africa/Lusaka | 0.0 | 1.0 |
America/Anchorage | 4.0 | 1.0 |
America/Argentina/Buenos_Aires | 1.0 | 0.0 |
America/Argentina/Cordoba | 0.0 | 1.0 |
America/Argentina/Mendoza | 0.0 | 1.0 |
# Find the positions (within agg_counts) of the time zones with the highest overall counts.
# agg_counts.sum(axis=1) computes the total count per time zone;
# argsort() returns the positions that would sort those totals in ascending order.
indexer = agg_counts.sum(axis=1).argsort()
indexer[-10:]
'''
tz
Europe/Sofia 35
Europe/Stockholm 78
Europe/Uzhgorod 96
Europe/Vienna 59
Europe/Vilnius 77
Europe/Volgograd 15
Europe/Warsaw 22
Europe/Zurich 12
Pacific/Auckland 0
Pacific/Honolulu 29
dtype: int64
'''
# take() returns the rows at the given positions along an axis (axis=0 by default);
# the last 10 positions of the ascending sort are the 10 largest totals.
count_subset = agg_counts.take(indexer[-10:])
count_subset
os | Not Windows | Windows |
---|---|---|
tz | ||
America/Sao_Paulo | 13.0 | 20.0 |
Europe/Madrid | 16.0 | 19.0 |
Pacific/Honolulu | 0.0 | 36.0 |
Asia/Tokyo | 2.0 | 35.0 |
Europe/London | 43.0 | 31.0 |
America/Denver | 132.0 | 59.0 |
America/Los_Angeles | 130.0 | 252.0 |
America/Chicago | 115.0 | 285.0 |
'' | 245.0 | 276.0 |
America/New_York | 339.0 | 912.0 |
# nlargest() reaches the same top-10 time zones more directly, but it returns
# the per-time-zone totals as a Series rather than the rows of agg_counts.
# The axis is passed by keyword, matching the sum(axis=1) call used for the indexer.
agg_counts.sum(axis=1).nlargest(10)
'''
tz
America/New_York 1251.0
521.0
America/Chicago 400.0
America/Los_Angeles 382.0
America/Denver 191.0
Europe/London 74.0
Asia/Tokyo 37.0
Pacific/Honolulu 36.0
Europe/Madrid 35.0
America/Sao_Paulo 33.0
dtype: float64
'''
# Rearrange the data for plotting: stack() moves the 'os' columns into an
# inner index level, giving one value per (tz, os) pair.
count_subset = count_subset.stack()
count_subset
'''
tz os
America/Sao_Paulo Not Windows 13.0
Windows 20.0
Europe/Madrid Not Windows 16.0
Windows 19.0
Pacific/Honolulu Not Windows 0.0
Windows 36.0
Asia/Tokyo Not Windows 2.0
Windows 35.0
Europe/London Not Windows 43.0
Windows 31.0
America/Denver Not Windows 132.0
Windows 59.0
America/Los_Angeles Not Windows 130.0
Windows 252.0
America/Chicago Not Windows 115.0
Windows 285.0
Not Windows 245.0
Windows 276.0
America/New_York Not Windows 339.0
Windows 912.0
dtype: float64
'''
# Name the stacked values 'total'; at this point tz and os form a hierarchical index.
count_subset.name = 'total'
# Drop the hierarchical index by turning its levels back into ordinary columns.
count_subset = count_subset.reset_index()
count_subset[:10]
tz | os | total | |
---|---|---|---|
0 | America/Sao_Paulo | Not Windows | 13.0 |
1 | America/Sao_Paulo | Windows | 20.0 |
2 | Europe/Madrid | Not Windows | 16.0 |
3 | Europe/Madrid | Windows | 19.0 |
4 | Pacific/Honolulu | Not Windows | 0.0 |
5 | Pacific/Honolulu | Windows | 36.0 |
6 | Asia/Tokyo | Not Windows | 2.0 |
7 | Asia/Tokyo | Windows | 35.0 |
8 | Europe/London | Not Windows | 43.0 |
9 | Europe/London | Windows | 31.0 |
# Grouped bar chart: number of Windows and non-Windows users within each time zone.
sns.barplot(x='total', y='tz', hue='os', data=count_subset)
# 定义函数,计算按时区分组中windows用户和非windows用户的比例;即将组百分比归一化为1
def norm_total(group):
    """Add each row's share of the group's summed ``total``.

    Args:
        group: DataFrame with a numeric ``total`` column (one group of a
            ``groupby``).

    Returns:
        A new DataFrame equal to ``group`` plus a ``normed_total`` column
        holding ``total / total.sum()``, so the shares within the group sum
        to 1. The input frame is left unmodified.
    """
    # assign() builds a new frame; mutating the object handed over by
    # groupby.apply is discouraged and can give unexpected results.
    return group.assign(normed_total=group.total / group.total.sum())
# Apply norm_total to each 'tz' group so every row gets its share of that time zone's total.
results = count_subset.groupby('tz').apply(norm_total)
# Plot the normalised shares: Windows vs. non-Windows within each time zone.
sns.barplot(x='normed_total', y='tz', hue='os', data=results)
# Alternative way to normalise the group shares to 1:
# divide each row's total by its time zone's sum, obtained with transform('sum').
g = count_subset.groupby('tz')
results2 = count_subset.total / g.total.transform('sum')