第四章 pandas统计分析基础
4.1读/写不同数据源的数据
4.1.1读/写数据库数据
1、数据库数据读取
import MySQLdb
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
engine = create_engine("mysql+pymysql://root:123456@127.0.0.1/testdb?charset=utf8mb4")
session = sessionmaker(bind=engine)
print(engine)
print(session)
Engine(mysql+pymysql://root:***@127.0.0.1/testdb?charset=utf8mb4)
sessionmaker(class_='Session', bind=Engine(mysql+pymysql://root:***@127.0.0.1/testdb?charset=utf8mb4), autoflush=True, autocommit=False, expire_on_commit=True)
#使用read_sql_table、read_sql_query、read_sql函数读取数据库数据
import pandas as pd
formlist=pd.read_sql_query('show tables',con=engine)
print('testdb数据库数据表清单为:','\n',formlist)
testdb数据库数据表清单为:
Tables_in_testdb
0 meal_order_detail1
1 meal_order_detail2
2 meal_order_detail3
3 test1
C:\Users\ljy15\AppData\Roaming\Python\Python37\site-packages\pymysql\cursors.py:170: Warning: (1366, "Incorrect string value: '\\xD6\\xD0\\xB9\\xFA\\xB1\\xEA...' for column 'VARIABLE_VALUE' at row 484")
result = self._query(query)
# 使用read_sql_table读取订单详情表
detail1=pd.read_sql_table('meal_order_detail1',con=engine)
print('使用read_sql_table读取订单详情表的长度为',len(detail1))
使用read_sql_table读取订单详情表的长度为 2779
#使用read_sql读取订单详情表
detail2=pd.read_sql('select * from meal_order_detail2',con=engine)
print('使用read_sql函数+sql语句读取订单详情表的长度为',len(detail2))
detail3=pd.read_sql('select * from meal_order_detail3',con=engine)
print('使用read_sql函数+sql语句读取订单详情表的长度为',len(detail3))
使用read_sql函数+sql语句读取订单详情表的长度为 7294
使用read_sql函数+sql语句读取订单详情表的长度为 3611
2、数据库数据存储
注意
在进行这步操作前,记得直接将数据库的编码改为:utf8mb4_general_ci,同时Python代码里,连接数据库时用
charset="utf8mb4",并且将test1表的编码设为一致,在cmd端进入对应数据库中进行设置 alter table test1 convert to character set utf8mb4;
#使用to_sql方法写入数据
#使用to_sql存储orderData
detail1.to_sql('test1',con=engine,index=False,if_exists='replace')#test1代表写入的数据库表名,con接收数据库连接,index表示是否将行索引作为数据传入数据库,
#if_exists接收fail,replace,append。fail表示如果表名存在,则不执行写入操作,replace表示如果存在,则将原数据库表删除,再重新创建;append则表示原数据库表的基础上追加数据,默认为fail
#使用read_sql读取test表
formlist1=pd.read_sql_query('show tables',con=engine)
print(formlist1)
Tables_in_testdb
0 meal_order_detail1
1 meal_order_detail2
2 meal_order_detail3
3 test1
4.1.2 读/写文本文件
1、文本文件读取
#使用read_table和read_csv函数读取菜品订单信息表
order=pd.read_table('./data/meal_order_info.csv',sep=',',encoding='gbk')
print('使用read_table函数读取的菜品订单信息表的长度为',len(order))
使用read_table函数读取的菜品订单信息表的长度为 945
#使用read_csv函数读取菜品订单信息表
order=pd.read_csv('./data/meal_order_info.csv',sep=',',encoding='gbk')
print('使用read_csv函数读取的菜品订单信息表的长度为',len(order))
使用read_csv函数读取的菜品订单信息表的长度为 945
#更改参数读取菜品订单信息表
order2=pd.read_table('./data/meal_order_info.csv',sep=';',encoding='gbk')
print('分隔符为;时菜品订单信息表的长度为',order2.head())
分隔符为;时菜品订单信息表的长度为 info_id,"emp_id","number_consumers","mode","dining_table_id","dining_table_name","expenditure","dishes_count","accounts_payable","use_start_time","check_closed","lock_time","cashier_id","pc_id","order_number","org_id","print_doc_bill_num","lock_table_info","order_status","phone","name"
0 417,1442,4,NA,1501,1022,165,5,165,"2016/8/1 11...
1 301,1095,3,NA,1430,1031,321,6,321,"2016/8/1 11...
2 413,1147,6,NA,1488,1009,854,15,854,"2016/8/1 1...
3 415,1166,4,NA,1502,1023,466,10,466,"2016/8/1 1...
4 392,1094,10,NA,1499,1020,704,24,704,"2016/8/1 ...
#使用read_csv读取菜品订单信息表,header=None
order3=pd.read_table('./data/meal_order_info.csv',sep=',',encoding='gbk')
print('header为None时菜品订单信息表为',order3.iloc[:4,:5])
header为None时菜品订单信息表为 info_id emp_id number_consumers mode dining_table_id
0 417 1442 4 NaN 1501
1 301 1095 3 NaN 1430
2 413 1147 6 NaN 1488
3 415 1166 4 NaN 1502
# 使用UTF-8解析菜品订单信息表
order4=pd.read_table('./data/meal_order_info.csv',sep=',',header=None,encoding='utf-8')
#print('菜品订单信息表为',order4)
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_tokens()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_with_dtype()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._string_convert()
pandas\_libs\parsers.pyx in pandas._libs.parsers._string_box_utf8()
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc3 in position 0: invalid continuation byte
During handling of the above exception, another exception occurred:
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-11-4d3d9a20be71> in <module>
1 # 使用UTF-8解析菜品订单信息表
----> 2 order4=pd.read_table('./data/meal_order_info.csv',sep=',',header=None,encoding='utf-8')
3 #print('菜品订单信息表为',order4)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
683 )
684
--> 685 return _read(filepath_or_buffer, kwds)
686
687 parser_f.__name__ = name
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
461
462 try:
--> 463 data = parser.read(nrows)
464 finally:
465 parser.close()
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
1152 def read(self, nrows=None):
1153 nrows = _validate_integer("nrows", nrows)
-> 1154 ret = self._engine.read(nrows)
1155
1156 # May alter columns / col_dict
C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers.py in read(self, nrows)
2057 def read(self, nrows=None):
2058 try:
-> 2059 data = self._reader.read(nrows)
2060 except StopIteration:
2061 if self._first_chunk:
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.read()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._read_low_memory()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._read_rows()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_column_data()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_tokens()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._convert_with_dtype()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._string_convert()
pandas\_libs\parsers.pyx in pandas._libs.parsers._string_box_utf8()
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc3 in position 0: invalid continuation byte
4.1.3读/写Excel文件
1、Excel文件读取
#使用read_excel函数读取菜品订单信息
user=pd.read_excel('./data/users.xlsx')#读取文件
print('客户信息表长度:',len(user))
客户信息表长度: 734
2、Excel文件存储
import os
#使用to_excel函数将数据存储为excel文件
print('将客户信息表写入excel文件前,目录内文件列表为:\n',os.listdir('./tmp'))
user.to_excel('./tmp/userInfo.xlsx')
print('客户信息表写入excel文件后,目录内文件列表为:\n',os.listdir('./tmp'))
将客户信息表写入excel文件前,目录内文件列表为:
['userInfo.xlsx']
客户信息表写入excel文件后,目录内文件列表为:
['userInfo.xlsx']
4.1.4 任务实现
1、读取订单详情数据库数据
# Read the three order-detail tables from the testdb database.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker  # fix: sessionmaker is used below but was not imported in this snippet
import pandas as pd

# utf8mb4 charset avoids "Incorrect string value" warnings with Chinese text.
engine = create_engine("mysql+pymysql://root:123456@127.0.0.1/testdb?charset=utf8mb4")
session = sessionmaker(bind=engine)
# Read each monthly detail table by name and report its row count.
detail1 = pd.read_sql_table('meal_order_detail1', con=engine)
print('使用read_sql_table读取订单详情表1的长度为', len(detail1))
detail2 = pd.read_sql_table('meal_order_detail2', con=engine)
print('使用read_sql_table读取订单详情表2的长度为', len(detail2))
detail3 = pd.read_sql_table('meal_order_detail3', con=engine)
print('使用read_sql_table读取订单详情表3的长度为', len(detail3))
使用read_sql_table读取订单详情表1的长度为 2779
使用read_sql_table读取订单详情表2的长度为 7294
使用read_sql_table读取订单详情表3的长度为 3611
2、读取订单信息csv数据
#读取订单信息表
orderInfo=pd.read_table('./data/meal_order_info.csv',sep=',',encoding='gbk')
print('订单信息表的长度为',len(orderInfo))
订单信息表的长度为 945
3、读取客户信息Excel数据
#读取客户信息表
userInfo=pd.read_excel('./data/users.xlsx',sheet_name='users1')#Anaconda 3.7版本中 'sheetname' 命令,已更新为 'sheet_name' 。
#sheet_name代表Excel表内数据的分表位置;header表示将某行数据作为列名;names表示列名,默认为none
print('订单信息表的长度为',len(userInfo))
订单信息表的长度为 734
4.2掌握DataFrame的常用操作
4.2.1 查看DataFrame的常用属性
# Load the order-detail table so its four basic attributes can be inspected.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker  # fix: sessionmaker is used below but was not imported in this snippet
import pandas as pd

engine = create_engine("mysql+pymysql://root:123456@127.0.0.1/testdb?charset=utf8mb4")
session = sessionmaker(bind=engine)
detail1 = pd.read_sql_table('meal_order_detail1', con=engine)
print('订单信息表的索引为',detail1.index)
订单信息表的索引为 RangeIndex(start=0, stop=2779, step=1)
print('订单信息表的所有值为',detail1.values)
订单信息表的所有值为 [['2956' '417' '610062' ... 'NA' 'caipu/104001.jpg' '1442']
['2958' '417' '609957' ... 'NA' 'caipu/202003.jpg' '1442']
['2961' '417' '609950' ... 'NA' 'caipu/303001.jpg' '1442']
...
['6756' '774' '609949' ... 'NA' 'caipu/404005.jpg' '1138']
['6763' '774' '610014' ... 'NA' 'caipu/302003.jpg' '1138']
['6764' '774' '610017' ... 'NA' 'caipu/302006.jpg' '1138']]
print('订单信息表的列名为',detail1.columns)
订单信息表的列名为 Index(['detail_id', 'order_id', 'dishes_id', 'logicprn_name',
'parent_class_name', 'dishes_name', 'itemis_add', 'counts', 'amounts',
'cost', 'place_order_time', 'discount_amt', 'discount_reason',
'kick_back', 'add_inprice', 'add_info', 'bar_code', 'picture_file',
'emp_id'],
dtype='object')
print('订单信息表的数据类型为',detail1.dtypes)
订单信息表的数据类型为 detail_id object
order_id object
dishes_id object
logicprn_name object
parent_class_name object
dishes_name object
itemis_add object
counts float64
amounts float64
cost object
place_order_time datetime64[ns]
discount_amt object
discount_reason object
kick_back object
add_inprice object
add_info object
bar_code object
picture_file object
emp_id object
dtype: object
#size、ndim和shape属性的使用
print('订单信息表的元素个数为',detail1.size)
订单信息表的元素个数为 52801
print('订单信息表的维度数为',detail1.ndim)
订单信息表的维度数为 2
print('订单信息表的形状为',detail1.shape)
订单信息表的形状为 (2779, 19)
#使用T属性进行转置
print('订单信息表转置前形状为',detail1.shape)
print('订单信息表转置后形状为',detail1.T.shape)
订单信息表转置前形状为 (2779, 19)
订单信息表转置后形状为 (19, 2779)
4.2.2 查找增删DataFrame数据
1、查看访问DataFrame中的数据
通过字典访问内部数据的方式访问DataFrame单列数据
#通过字典访问内部数据的方式访问DataFrame单列数据
order_id=detail1['order_id']
print('订单信息表中order_id的形状为','\n',order_id.shape)
订单信息表中order_id的形状为
(2779,)
通过访问属性的方式访问DataFrame单列数据
#通过访问属性的方式访问DataFrame单列数据
dishes_name=detail1.dishes_name
print('订单信息表中dish_name的形状为','\n',dishes_name.shape)
订单信息表中dish_name的形状为
(2779,)
DataFrame单列多行数据获取
#DataFrame单列多行数据获取
dishes_name5=detail1['dishes_name'][:5]
print('订单信息表中dish_name前五行数据为','\n',dishes_name5)
订单信息表中dish_name前五行数据为
0 蒜蓉生蚝
1 蒙古烤羊腿\r\n\r\n\r\n
2 大蒜苋菜
3 芝麻烤紫菜
4 蒜香包
Name: dishes_name, dtype: object
访问DataFrame多列的多行数据
#访问DataFrame多列的多行数据
orderDish=detail1[['order_id','dishes_name']][:5]
print('订单信息表中order_id和dish_name前五行数据为','\n',orderDish)
订单信息表中order_id和dish_name前五行数据为
order_id dishes_name
0 417 蒜蓉生蚝
1 417 蒙古烤羊腿\r\n\r\n\r\n
2 417 大蒜苋菜
3 417 芝麻烤紫菜
4 417 蒜香包
#访问DataFrame多行数据
order5=detail1[:][1:6]
print('订单信息表中1-6行元素为','\n',order5)
订单信息表中1-6行元素为
detail_id order_id dishes_id logicprn_name parent_class_name \
1 2958 417 609957 NA NA
2 2961 417 609950 NA NA
3 2966 417 610038 NA NA
4 2968 417 610003 NA NA
5 1899 301 610019 NA NA
dishes_name itemis_add counts amounts cost place_order_time \
1 蒙古烤羊腿\r\n\r\n\r\n 0 1.0 48.0 NA 2016-08-01 11:07:00
2 大蒜苋菜 0 1.0 30.0 NA 2016-08-01 11:07:00
3 芝麻烤紫菜 0 1.0 25.0 NA 2016-08-01 11:11:00
4 蒜香包 0 1.0 13.0 NA 2016-08-01 11:11:00
5 白斩鸡 0 1.0 88.0 NA 2016-08-01 11:15:00
discount_amt discount_reason kick_back add_inprice add_info bar_code \
1 NA NA NA 0 NA NA
2 NA NA NA 0 NA NA
3 NA NA NA 0 NA NA
4 NA NA NA 0 NA NA
5 NA NA NA 0 NA NA
picture_file emp_id
1 caipu/202003.jpg 1442
2 caipu/303001.jpg 1442
3 caipu/105002.jpg 1442
4 caipu/503002.jpg 1442
5 caipu/204002.jpg 1095
#使用DataFrame的head和tail方法获取多行数据
print('订单信息表中前5行数据为','\n',detail1.head())
订单信息表中前5行数据为
detail_id order_id dishes_id logicprn_name parent_class_name \
0 2956 417 610062 NA NA
1 2958 417 609957 NA NA
2 2961 417 609950 NA NA
3 2966 417 610038 NA NA
4 2968 417 610003 NA NA
dishes_name itemis_add counts amounts cost place_order_time \
0 蒜蓉生蚝 0 1.0 49.0 NA 2016-08-01 11:05:00
1 蒙古烤羊腿\r\n\r\n\r\n 0 1.0 48.0 NA 2016-08-01 11:07:00
2 大蒜苋菜 0 1.0 30.0 NA 2016-08-01 11:07:00
3 芝麻烤紫菜 0 1.0 25.0 NA 2016-08-01 11:11:00
4 蒜香包 0 1.0 13.0 NA 2016-08-01 11:11:00
discount_amt discount_reason kick_back add_inprice add_info bar_code \
0 NA NA NA 0 NA NA
1 NA NA NA 0 NA NA
2 NA NA NA 0 NA NA
3 NA NA NA 0 NA NA
4 NA NA NA 0 NA NA
picture_file emp_id
0 caipu/104001.jpg 1442
1 caipu/202003.jpg 1442
2 caipu/303001.jpg 1442
3 caipu/105002.jpg 1442
4 caipu/503002.jpg 1442
print('订单信息表中后5行数据为','\n',detail1.tail())
订单信息表中后5行数据为
detail_id order_id dishes_id logicprn_name parent_class_name dishes_name \
2774 6750 774 610011 NA NA 白饭/大碗
2775 6742 774 609996 NA NA 牛尾汤
2776 6756 774 609949 NA NA 意文柠檬汁
2777 6763 774 610014 NA NA 金玉良缘
2778 6764 774 610017 NA NA 酸辣藕丁
itemis_add counts amounts cost place_order_time discount_amt \
2774 0 1.0 10.0 NA 2016-08-10 21:56:00 NA
2775 0 1.0 40.0 NA 2016-08-10 21:56:00 NA
2776 0 1.0 13.0 NA 2016-08-10 22:01:00 NA
2777 0 1.0 30.0 NA 2016-08-10 22:03:00 NA
2778 0 1.0 33.0 NA 2016-08-10 22:04:00 NA
discount_reason kick_back add_inprice add_info bar_code \
2774 NA NA 0 NA NA
2775 NA NA 0 NA NA
2776 NA NA 0 NA NA
2777 NA NA 0 NA NA
2778 NA NA 0 NA NA
picture_file emp_id
2774 caipu/601005.jpg 1138
2775 caipu/201006.jpg 1138
2776 caipu/404005.jpg 1138
2777 caipu/302003.jpg 1138
2778 caipu/302006.jpg 1138
(2)DataFrame的loc、iloc访问方式
.loc[行索引名称或条件,列索引名称]
.iloc[行索引位置,列索引位置]
#使用loc和iloc实现单列切片
dishes_name1=detail1.loc[:,'dishes_name']
print('使用loc提取dishes_name列的size为',dishes_name1.size)
使用loc提取dishes_name列的size为 2779
# Select a single column by integer position with iloc.
# NOTE(review): position 3 is 'logicprn_name' per the columns listing above, not dishes_name — variable name kept from the original.
dishes_name2=detail1.iloc[:,3]
print('使用iloc提取第三列的size为',dishes_name2.size)  # fix: message said "loc" but the code uses iloc
使用loc提取第三列的size为 2779
#使用loc和iloc实现多列切片
orderDish1=detail1.loc[:,['order_id','dishes_name']]
print('使用loc提取order_id和dishes_name列的size为',orderDish1.size)
使用loc提取order_id和dishes_name列的size为 5558
orderDish2=detail1.iloc[:,[1,3]]
print('使用iloc提取第1和第3列的size为',orderDish2.size)
使用iloc提取第1和第3列的size为 5558
#使用loc、iloc实现花式切片
print('列名为order_id和dishes_name的列名为3的数据为:\n',detail1.loc[3,['order_id','dishes_name']])
列名为order_id和dishes_name的列名为3的数据为:
order_id 417
dishes_name 芝麻烤紫菜
Name: 3, dtype: object
print('列名为order_id和dishes_name的行名为2、3、4、5、6的数据为:\n',detail1.loc[2:6,['order_id','dishes_name']])
列名为order_id和dishes_name的行名为2、3、4、5、6的数据为:
order_id dishes_name
2 417 大蒜苋菜
3 417 芝麻烤紫菜
4 417 蒜香包
5 301 白斩鸡
6 301 香烤牛排\r\n
print('列位置为1和3的行位置为3的数据为:\n',detail1.iloc[3,[1,3]])
列位置为1和3的行位置为3的数据为:
order_id 417
logicprn_name NA
Name: 3, dtype: object
print('列位置为1和3的行位置为2,3,4,5,6的数据为:\n',detail1.iloc[2:7,[1,3]])
列位置为1和3的行位置为2,3,4,5,6的数据为:
order_id logicprn_name
2 417 NA
3 417 NA
4 417 NA
5 301 NA
6 301 NA
#使用loc和iloc实现条件切片
print('detail中order_id为458的dishes_name为:\n',detail1.loc[detail1['order_id']=='458',['order_id','dishes_name']])
detail中order_id为458的dishes_name为:
order_id dishes_name
145 458 蒜香辣花甲
146 458 剁椒鱼头
147 458 凉拌蒜蓉西兰花
148 458 木须豌豆
149 458 辣炒鱿鱼
150 458 酸辣藕丁
151 458 炝炒大白菜
152 458 香菇鸡肉粥
153 458 干锅田鸡
154 458 桂圆枸杞鸽子汤
155 458 五香酱驴肉\r\n\r\n\r\n
156 458 路易拉菲红酒干红
157 458 避风塘炒蟹
158 458 白饭/大碗
#使用iloc实现条件切片
print('detail中order_id为458的第1、5列数据为:\n',detail1.iloc[(detail1['order_id']=='458').values,[1,5]])
detail中order_id为458的第1、5列数据为:
order_id dishes_name
145 458 蒜香辣花甲
146 458 剁椒鱼头
147 458 凉拌蒜蓉西兰花
148 458 木须豌豆
149 458 辣炒鱿鱼
150 458 酸辣藕丁
151 458 炝炒大白菜
152 458 香菇鸡肉粥
153 458 干锅田鸡
154 458 桂圆枸杞鸽子汤
155 458 五香酱驴肉\r\n\r\n\r\n
156 458 路易拉菲红酒干红
157 458 避风塘炒蟹
158 458 白饭/大碗
#使用loc,iloc,ix实现切片比较
print('列名为dishes_name行名为2,3,4,5,6的数据为:\n',detail1.loc[2:6,'dishes_name'])
列名为dishes_name行名为2,3,4,5,6的数据为:
2 大蒜苋菜
3 芝麻烤紫菜
4 蒜香包
5 白斩鸡
6 香烤牛排\r\n
Name: dishes_name, dtype: object
print('列位置为5,行位置为2-6的数据为:\n',detail1.iloc[2:6,5])
列位置为5,行位置为2-6的数据为:
2 大蒜苋菜
3 芝麻烤紫菜
4 蒜香包
5 白斩鸡
Name: dishes_name, dtype: object
print('列位置为5,行位置为2-6的数据为:\n',detail1.ix[2:6,5])
列位置为5,行位置为2-6的数据为:
2 大蒜苋菜
3 芝麻烤紫菜
4 蒜香包
5 白斩鸡
6 香烤牛排\r\n
Name: dishes_name, dtype: object
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
"""Entry point for launching an IPython kernel.
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py:961: FutureWarning:
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing
See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ix-indexer-is-deprecated
return getattr(section, self.name)[new_key]
2、更改DataFrame中的数据
#更改DataFrame中的数据
#将order_id为458的变换为45800
detail1.loc[detail1['order_id']=='458','order_id']='45800'
print('更改后detail1中order_id为458的order_id为:\n',detail1.loc[detail1['order_id']=='458','order_id'])
print('更改后detail1中order_id为45800的order_id为:\n',detail1.loc[detail1['order_id']=='45800','order_id'])
更改后detail1中order_id为458的order_id为:
Series([], Name: order_id, dtype: object)
更改后detail1中order_id为45800的order_id为:
145 45800
146 45800
147 45800
148 45800
149 45800
150 45800
151 45800
152 45800
153 45800
154 45800
155 45800
156 45800
157 45800
158 45800
Name: order_id, dtype: object
3、为DataFrame增添数据
#为DataFrame新增一列非定值
detail1['payment']=detail1['counts']*detail1['amounts']
print('detail新增列payment的前五行为:','\n',detail1['payment'].head())
detail新增列payment的前五行为:
0 49.0
1 48.0
2 30.0
3 25.0
4 13.0
Name: payment, dtype: float64
#DataFrame新增一列定值
detail1['pay_way']='现金支付'
print('detail新增列pay_way的前五行为:','\n',detail1['pay_way'].head())
detail新增列pay_way的前五行为:
0 现金支付
1 现金支付
2 现金支付
3 现金支付
4 现金支付
Name: pay_way, dtype: object
4、删除某列或某行数据
#删除DataFrame某列
print('删除pay_way前detail的列索引为:','\n',detail1.columns)
detail1.drop(labels='pay_way',axis=1,inplace=True)#label代表的是删除的列名,inplace代表的是是否对原数据生效
print('删除pay_wayde后tail的列索引为:','\n',detail1.columns)
删除pay_way前detail的列索引为:
Index(['detail_id', 'order_id', 'dishes_id', 'logicprn_name',
'parent_class_name', 'dishes_name', 'itemis_add', 'counts', 'amounts',
'cost', 'place_order_time', 'discount_amt', 'discount_reason',
'kick_back', 'add_inprice', 'add_info', 'bar_code', 'picture_file',
'emp_id', 'payment', 'pay_way'],
dtype='object')
删除pay_wayde后tail的列索引为:
Index(['detail_id', 'order_id', 'dishes_id', 'logicprn_name',
'parent_class_name', 'dishes_name', 'itemis_add', 'counts', 'amounts',
'cost', 'place_order_time', 'discount_amt', 'discount_reason',
'kick_back', 'add_inprice', 'add_info', 'bar_code', 'picture_file',
'emp_id', 'payment'],
dtype='object')
# Delete several rows (index labels 1 through 10) from the DataFrame.
print('删除1-10前detail的长度为:','\n',len(detail1))
detail1.drop(labels=range(1,11),axis=0,inplace=True)# with axis=0, labels are ROW index labels (1..10), not column names; inplace=True mutates detail1 in place
print('删除1-10行后detail的长度为:','\n',len(detail1))
删除1-10前detail的长度为:
2779
删除1-10行后detail的长度为:
2769
4.2.3描述分析DataFrame数据
1、数值型特征的描述性统计
#使用np.mean函数计算平均价格
import numpy as np
print('订单详情表中amount(价格)的平均值为',np.mean(detail1['amounts']))
订单详情表中amount(价格)的平均值为 45.343084145901045
#通过pandas实现销量和价格的协方差矩阵计算
print('订单详情表中amount(价格)的平均值为',detail1['amounts'].mean())
订单详情表中amount(价格)的平均值为 45.343084145901045
#使用describe方法实现数值型特征的描述性统计
print('订单详情表中counts和amounts两列的描述性统计为:\n',detail1[['counts','amounts']].describe())
订单详情表中counts和amounts两列的描述性统计为:
counts amounts
count 2769.000000 2769.000000
mean 1.111593 45.343084
std 0.626521 36.841316
min 1.000000 1.000000
25% 1.000000 25.000000
50% 1.000000 35.000000
75% 1.000000 56.000000
max 10.000000 178.000000
2、类别型特征的描述性统计
#对菜品名称频数统计
print('订单详情表中dishes_name频数统计结果前10为:\n',detail1['dishes_name'].value_counts()[0:10])
订单详情表中dishes_name频数统计结果前10为:
白饭/大碗 91
凉拌菠菜 77
谷稻小庄 72
麻辣小龙虾 65
白饭/小碗 60
五色糯米饭(七色) 58
芝士烩波士顿龙虾 55
焖猪手 55
辣炒鱿鱼 53
水煮鱼 47
Name: dishes_name, dtype: int64
#将object数据强制转换为category类型
detail1['dishes_name']=detail1['dishes_name'].astype('category')
print('订单详情表中dishes_name列转变数据类型为:\n',detail1['dishes_name'].dtypes)
订单详情表中dishes_name列转变数据类型为:
category
#category类型特征的描述性统计
print('订单详情表中dishes_name的描述统计结果为:\n',detail1['dishes_name'].describe())
订单详情表中dishes_name的描述统计结果为:
count 2769
unique 154
top 白饭/大碗
freq 91
Name: dishes_name, dtype: object
4.2.4任务实现
1、查看餐饮数据的大小与维度
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
import pandas as pd
engine = create_engine("mysql+pymysql://root:123456@127.0.0.1/testdb?charset=utf8mb4")
session = sessionmaker(bind=engine)
detail=pd.read_sql_table('meal_order_detail1',con=engine)
order=pd.read_table('./data/meal_order_info.csv',sep=',',encoding='gbk')
user=pd.read_excel('./data/users.xlsx')#读取文件
print('订单详情表的维度为',detail.ndim)
print('订单信息表的维度为',order.ndim)
print('客户信息表的维度为',user.ndim)
订单详情表的维度为 2
订单信息表的维度为 2
客户信息表的维度为 2
print('订单详情表的形状为',detail.shape)
print('订单信息表的形状为',order.shape)
print('客户信息表的形状为',user.shape)
订单详情表的形状为 (2779, 19)
订单信息表的形状为 (945, 21)
客户信息表的形状为 (734, 37)
print('订单详情表的元素个数为',detail.size)
print('订单信息表的元素个数为',order.size)
print('客户信息表的元素个数为',user.size)
订单详情表的元素个数为 52801
订单信息表的元素个数为 19845
客户信息表的元素个数为 27158
2、统计餐饮菜品销售状况
#餐饮菜品销量的描述性统计
print('订单详情表列名为counts和amounts两列的描述性统计为:\n',detail.loc[:,['counts','amounts']].describe())
订单详情表列名为counts和amounts两列的描述性统计为:
counts amounts
count 2779.000000 2779.000000
mean 1.111191 45.337172
std 0.625428 36.808550
min 1.000000 1.000000
25% 1.000000 25.000000
50% 1.000000 35.000000
75% 1.000000 56.000000
max 10.000000 178.000000
detail['order_id']=detail['order_id'].astype('category')
detail['dishes_name']=detail['dishes_name'].astype('category')
print('订单详情表列名为order_id和dishes_name两列的描述性统计为:\n',detail[['order_id','dishes_name']].describe())
订单详情表列名为order_id和dishes_name两列的描述性统计为:
order_id dishes_name
count 2779 2779
unique 278 154
top 392 白饭/大碗
freq 24 92
3、剔除全为空值或者所有元素取值相同的列
#剔除餐饮菜品中整列为空或者取值完全相同的列
#定义一个函数剔除全为空值的列和标准差为0的列
def dropNullStd(data):
    """Drop, in place, every numeric column that is entirely null or whose
    values are all identical (standard deviation == 0), then print how many
    columns were removed and the resulting shape.

    Parameters
    ----------
    data : pandas.DataFrame
        Modified in place via DataFrame.drop(..., inplace=True).

    Note: describe() only summarises numeric columns, so non-numeric
    columns are never considered for removal — same as the original.
    """
    beforelen = data.shape[1]
    # count == 0 means the column holds no non-null values at all.
    colisNull = data.describe().loc['count'] == 0
    data.drop(columns=colisNull.index[colisNull], inplace=True)
    # std == 0 means every value in the column is identical.
    stdisZero = data.describe().loc['std'] == 0
    data.drop(columns=stdisZero.index[stdisZero], inplace=True)
    afterlen = data.shape[1]
    print('剔除的列的数目为:', beforelen - afterlen)
    print('剔除后数据的形状为:', data.shape)
dropNullStd(detail)
剔除的列的数目为: 0
剔除后数据的形状为: (2779, 19)
#使用dropNullStd函数对订单信息表操作
dropNullStd(order)
剔除的列的数目为: 7
剔除后数据的形状为: (945, 14)
#使用dropNullStd函数对客户信息表操作
dropNullStd(user)
剔除的列的数目为: 13
剔除后数据的形状为: (734, 24)
任务4.3 转换与处理时间序列数据
4.3.1转换字符串时间为标准时间
Timestamp是时间类中最基础的,也是最常用的,在多数情况下,会将与时间相关的字符串转换为Timestamp,pandas提供了to_datetime函数实现
import pandas as pd
order=pd.read_table('./data/meal_order_info.csv',sep=',',encoding='gbk')
print('进行转换前订单信息表lock_time的类型为',order['lock_time'].dtypes)
order['lock_time']=pd.to_datetime(order['lock_time'])
print('进行转换后订单信息表lock_time的类型为',order['lock_time'].dtypes)
进行转换前订单信息表lock_time的类型为 object
进行转换后订单信息表lock_time的类型为 datetime64[ns]
Timestamp类型的时间是有限制的,最早只能表示至1677年9月21日,最晚只能表示至2262年4月11日
#Timestamp的最小时间和最大时间
print('最小时间为',pd.Timestamp.min)
最小时间为 1677-09-21 00:12:43.145225
print('最大时间为',pd.Timestamp.max)
最大时间为 2262-04-11 23:47:16.854775807
#时间字符串转换为DatetimeIndex和PeriodIndex
dateIndex=pd.DatetimeIndex(order['lock_time'])
print('进行转换后订单信息表lock_time的类型为',type(dateIndex))
进行转换后订单信息表lock_time的类型为 <class 'pandas.core.indexes.datetimes.DatetimeIndex'>
# print(order["lock_time"])
periods = pd.PeriodIndex(lock_time=order["lock_time"],freq="S")
print('进行转换后订单信息表lock_time的类型为',type(periods))
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-134-f5c0575edfc2> in <module>
1 # print(order["lock_time"])
----> 2 periods = pd.PeriodIndex(lock_time=order["lock_time"],freq="S")
3 print('进行转换后订单信息表lock_time的类型为',type(periods))
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexes\period.py in __new__(cls, data, ordinal, freq, start, end, periods, tz, dtype, copy, name, **fields)
208 raise TypeError(
209 "__new__() got an unexpected keyword argument {}".format(
--> 210 list(set(fields) - valid_field_set)[0]
211 )
212 )
TypeError: __new__() got an unexpected keyword argument lock_time
4.3.2提取时间序列数据信息
order['lock_time']=pd.DatetimeIndex(order['lock_time'])#需要先把i改为timestamp类型
#提取datetime数据中的时间序列数据
year1=[i.year for i in order['lock_time']]
print('lock_time中的年份数据前5个为:',year1[:5])
month1=[i.month for i in order['lock_time']]
print('lock_time中的月份数据前5个为:',month1[:5])
day1=[i.day for i in order['lock_time']]
print('lock_time中的日期数据前5个为:',day1[:5])
weekday1=[i.weekday_name for i in order['lock_time']]
print('lock_time中的星期名称数据前5个为:',weekday1[:5])
lock_time中的年份数据前5个为: [2016, 2016, 2016, 2016, 2016]
lock_time中的月份数据前5个为: [8, 8, 8, 8, 8]
lock_time中的日期数据前5个为: [1, 1, 1, 1, 1]
lock_time中的星期名称数据前5个为: ['Monday', 'Monday', 'Monday', 'Monday', 'Monday']
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:9: FutureWarning: `weekday_name` is deprecated and will be removed in a future version. Use `day_name` instead
if __name__ == '__main__':
#提取DatetimeIndex和PeriodIndex中的数据
print('dateIndex中的星期名称数据前5个为:',dateIndex.weekday_name[:5])
print('dateIndex中的星期标号数据前5个为:',dateIndex.weekday[:5])
dateIndex中的星期名称数据前5个为: Index(['Monday', 'Monday', 'Monday', 'Monday', 'Monday'], dtype='object', name='lock_time')
dateIndex中的星期标号数据前5个为: Float64Index([0.0, 0.0, 0.0, 0.0, 0.0], dtype='float64', name='lock_time')
4.3.3加减时间数据
# Use Timedelta to perform addition on datetime data:
# shift lock_time forward by one day.
time1 = order['lock_time'] + pd.Timedelta(days=1)
print('lock_time加上一天前前5行数据为:\n', order['lock_time'][:5])
print('lock_time加上一天后前5行数据为:\n', time1[:5])  # fix: this shows the shifted series, so the label says "后" (after), not "前"
lock_time加上一天前前5行数据为:
0 2016-08-01 11:11:46
1 2016-08-01 11:31:55
2 2016-08-01 12:54:37
3 2016-08-01 13:08:20
4 2016-08-01 13:07:16
Name: lock_time, dtype: datetime64[ns]
lock_time加上一天前前5行数据为:
0 2016-08-02 11:11:46
1 2016-08-02 11:31:55
2 2016-08-02 12:54:37
3 2016-08-02 13:08:20
4 2016-08-02 13:07:16
Name: lock_time, dtype: datetime64[ns]
#使用Timedelta实现时间数据的减运算
timeDelta=order['lock_time']-pd.to_datetime('2017-1-1')
print('lock_time减去2017年1月1日0点0时0分后的数据:\n',timeDelta[:5])
print('lock_time减去time1后的数据类型为:\n',timeDelta.dtypes)
lock_time减去2017年1月1日0点0时0分后的数据:
0 -153 days +11:11:46
1 -153 days +11:31:55
2 -153 days +12:54:37
3 -153 days +13:08:20
4 -153 days +13:07:16
Name: lock_time, dtype: timedelta64[ns]
lock_time减去time1后的数据类型为:
timedelta64[ns]
#订单信息表时间数据转换
import pandas as pd
order=pd.read_table('./data/meal_order_info.csv',sep=',',encoding='gbk')
print('进行转换前订单信息表user_start_time和lock_time的类型为:\n',order[['use_start_time','lock_time']].dtypes)
order['use_start_time']=pd.to_datetime(order['use_start_time'])
order['lock_time']=pd.to_datetime(order['lock_time'])
print('进行转换后订单信息表user_start_time和lock_time的类型为:\n',order[['use_start_time','lock_time']].dtypes)
进行转换前订单信息表user_start_time和lock_time的类型为:
use_start_time object
lock_time object
dtype: object
进行转换后订单信息表user_start_time和lock_time的类型为:
use_start_time datetime64[ns]
lock_time datetime64[ns]
dtype: object
2、提取菜品数据中的年月日和星期信息
#订单信息表时间信息提取
year=[i.year for i in order['lock_time']]#提取年份信息
month=[i.month for i in order['lock_time']]#提取月份信息
day=[i.day for i in order['lock_time']]#提取日期信息
week=[i.week for i in order['lock_time']]#提取周信息
weekday=[i.weekday() for i in order['lock_time']]#提取星期信息
#提取星期名称信息
weekname=[i.weekday_name for i in order['lock_time']]
print('订单详情表的前5条数据的年份信息为:',year[:5])
print('订单详情表的前5条数据的月份信息为:',month[:5])
print('订单详情表的前5条数据的日期信息为:',day[:5])
print('订单详情表的前5条数据的周信息为:',week[:5])
print('订单详情表的前5条数据的星期信息为:',weekday[:5])
print('订单详情表的前5条数据的星期名称信息为:',weekname[:5])
订单详情表的前5条数据的年份信息为: [2016, 2016, 2016, 2016, 2016]
订单详情表的前5条数据的月份信息为: [8, 8, 8, 8, 8]
订单详情表的前5条数据的日期信息为: [1, 1, 1, 1, 1]
订单详情表的前5条数据的周信息为: [31, 31, 31, 31, 31]
订单详情表的前5条数据的星期信息为: [0, 0, 0, 0, 0]
订单详情表的前5条数据的星期名称信息为: ['Monday', 'Monday', 'Monday', 'Monday', 'Monday']
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:8: FutureWarning: `weekday_name` is deprecated and will be removed in a future version. Use `day_name` instead
3、查看订单信息表时间统计信息
#查看订单信息表时间统计信息
timemin=order['lock_time'].min()
timemax=order['lock_time'].max()
print('订单最早的时间为:',timemin)
print('订单最晚的时间为:',timemax)
print('订单持续的时间为:',timemax-timemin)
订单最早的时间为: 2016-08-01 11:11:46
订单最晚的时间为: 2016-08-31 21:56:12
订单持续的时间为: 30 days 10:44:26
checkTime=order['lock_time']-order['use_start_time']
print('平均点餐时间为:',checkTime.mean())
print('最短点餐时间为:',checkTime.min())
print('最长点餐时间为:',checkTime.max())
平均点餐时间为: 0 days 01:12:10.326923
最短点餐时间为: -1 days +00:05:03
最长点餐时间为: 16 days 00:08:00
任务4.4 使用分组聚合进行组内计算
4.4.1 使用groupby方法拆分数据
#对菜品订单详情表依据订单编号分组
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
engine = create_engine("mysql+pymysql://root:123456@127.0.0.1/testdb?charset=utf8mb4")
session = sessionmaker(bind=engine)
detail=pd.read_sql_table('meal_order_detail1',con=engine)
detailGroup=detail[['order_id','counts','amounts']].groupby(by='order_id')
print('分组后的订单详情表为',detailGroup)
分组后的订单详情表为 <pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002376A1830C8>
#GroupBy类求均值、标准差、中位数
print('订单详情表分组后前5组每组的均值为:\n',detailGroup.mean().head())
订单详情表分组后前5组每组的均值为:
counts amounts
order_id
1002 1.0000 32.000
1003 1.2500 30.125
1004 1.0625 43.875
1008 1.0000 63.000
1011 1.0000 57.700
print('订单详情表分组后前5组每组的标准差为:\n',detailGroup.std().head())
订单详情表分组后前5组每组的标准差为:
counts amounts
order_id
1002 0.00000 16.000000
1003 0.46291 21.383822
1004 0.25000 31.195886
1008 0.00000 64.880660
1011 0.00000 50.077828
print('订单详情表分组后前5组每组的大小为:\n',detailGroup.size().head())
订单详情表分组后前5组每组的大小为:
order_id
1002 7
1003 8
1004 16
1008 5
1011 10
dtype: int64
4.4.2使用agg方法聚合数据
#使用agg求出当前数据对应的统计量
print('订单详情表的菜品销量与售价的和与均值为:\n',detail[['counts','amounts']].agg([np.sum,np.mean]))
订单详情表的菜品销量与售价的和与均值为:
counts amounts
sum 3088.000000 125992.000000
mean 1.111191 45.337172
#使用agg分布求字段的不同统计量
print('订单详情表的菜品销量总和与售价的总和与均值为:\n',detail.agg({'counts':np.sum,'amounts':[np.mean,np.sum]}))
订单详情表的菜品销量总和与售价的总和与均值为:
counts amounts
mean NaN 45.337172
sum 3088.0 125992.000000
#在agg方法中使用自定义函数
#自定义函数求两倍的和
def DoubleSum(data):
    """Return twice the sum of *data*, using the object's own sum() method."""
    return data.sum() * 2
print('菜品订单详情表的菜品销量两倍总和为:','\n',detail.agg({'counts':DoubleSum},axis=0))
菜品订单详情表的菜品销量两倍总和为:
counts 6176.0
dtype: float64
#agg方法中使用的自定义函数含NumPy中的函数
#自定义函数求两倍的和
def DoubleSum1(data):
    """Return np.sum(data) doubled (NumPy-based variant of DoubleSum)."""
    return 2 * np.sum(data)
print('菜品订单详情表的菜品销量两倍总和为:','\n',detail.agg({'counts':DoubleSum1},axis=0).head())
菜品订单详情表的菜品销量两倍总和为:
counts
0 2.0
1 2.0
2 2.0
3 2.0
4 2.0
print('订单详情表分组后前三组每组的均值为:\n',detailGroup.agg(np.mean).head(3))
订单详情表分组后前三组每组的均值为:
counts amounts
order_id
1002 1.0000 32.000
1003 1.2500 30.125
1004 1.0625 43.875
print('订单详情表分组后前三组每组的标准差为:\n',detailGroup.agg(np.std).head(3))
订单详情表分组后前三组每组的标准差为:
counts amounts
order_id
1002 0.00000 16.000000
1003 0.46291 21.383822
1004 0.25000 31.195886
#使用agg方法对分组数据使用不同的聚合函数
print('订单详情表分组后前三组每组菜品总数和售价均值为:\n',detailGroup.agg({'counts':np.sum,'amounts':np.mean}).head(3))
订单详情表分组后前三组每组菜品总数和售价均值为:
counts amounts
order_id
1002 7.0 32.000
1003 10.0 30.125
1004 17.0 43.875
4.4.3使用apply方法聚合数据
#apply方法的基础用法
print('订单详情表的菜品销量与售价的均值为:\n',detail[['counts','amounts']].apply(np.mean))
订单详情表的菜品销量与售价的均值为:
counts 1.111191
amounts 45.337172
dtype: float64
#使用apply方法进行聚合操作
print('订单详情表分组前3组的均值为:\n',detailGroup.apply(np.mean).head(3))
订单详情表分组前3组的均值为:
order_id counts amounts
order_id
1002 1.431572e+26 1.0000 32.000
1003 1.253875e+30 1.2500 30.125
1004 6.275628e+61 1.0625 43.875
print('订单详情表分组前3组的标准差:\n',detailGroup.apply(np.std).head(3))
订单详情表分组前3组的标准差:
counts amounts
order_id
1002 0.000000 14.813122
1003 0.433013 20.002734
1004 0.242061 30.205287
4.4.4 使用transform方法聚合数据
#使用transform方法将销量和售价翻倍
print('订单详情表的菜品销量与售价的两倍为:\n',detail[['counts','amounts']].transform(lambda x:x*2).head(4))
订单详情表的菜品销量与售价的两倍为:
counts amounts
0 2.0 98.0
1 2.0 96.0
2 2.0 60.0
3 2.0 50.0
#使用transform实现组内离差标准化
print('订单详情表分组后实现组内离差标准化后前5行为:\n',detailGroup.transform(lambda x:(x.mean()-x.min())/(x.max()-x.min())).head())
---------------------------------------------------------------------------
ZeroDivisionError Traceback (most recent call last)
<ipython-input-92-a8b0b34b746b> in <module>
1 #使用transform实现组内离差标准化
----> 2 print('订单详情表分组后实现组内离差标准化后前5行为:\n',detailGroup.transform(lambda x:(x.mean()-x.min())/(x.max()-x.min())).head())
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\groupby\generic.py in transform(self, func, *args, **kwargs)
582 result = getattr(self, func)(*args, **kwargs)
583 else:
--> 584 return self._transform_general(func, *args, **kwargs)
585
586 # a reduction transform
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\groupby\generic.py in _transform_general(self, func, *args, **kwargs)
529 # Try slow path and fast path.
530 try:
--> 531 path, res = self._choose_path(fast_path, slow_path, group)
532 except TypeError:
533 return self._transform_item_by_item(obj, fast_path)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\groupby\generic.py in _choose_path(self, fast_path, slow_path, group)
631 def _choose_path(self, fast_path, slow_path, group):
632 path = slow_path
--> 633 res = slow_path(group)
634
635 # if we make it here, test if we can use the fast path
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\groupby\generic.py in <lambda>(group)
625 fast_path = lambda group: func(group, *args, **kwargs)
626 slow_path = lambda group: group.apply(
--> 627 lambda x: func(x, *args, **kwargs), axis=self.axis
628 )
629 return fast_path, slow_path
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\frame.py in apply(self, func, axis, broadcast, raw, reduce, result_type, args, **kwds)
6911 kwds=kwds,
6912 )
-> 6913 return op.get_result()
6914
6915 def applymap(self, func):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in get_result(self)
184 return self.apply_raw()
185
--> 186 return self.apply_standard()
187
188 def apply_empty_result(self):
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_standard(self)
290
291 # compute the result using the series generator
--> 292 self.apply_series_generator()
293
294 # wrap results
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\apply.py in apply_series_generator(self)
319 try:
320 for i, v in enumerate(series_gen):
--> 321 results[i] = self.f(v)
322 keys.append(v.name)
323 except Exception as e:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\groupby\generic.py in <lambda>(x)
625 fast_path = lambda group: func(group, *args, **kwargs)
626 slow_path = lambda group: group.apply(
--> 627 lambda x: func(x, *args, **kwargs), axis=self.axis
628 )
629 return fast_path, slow_path
<ipython-input-92-a8b0b34b746b> in <lambda>(x)
1 #使用transform实现组内离差标准化
----> 2 print('订单详情表分组后实现组内离差标准化后前5行为:\n',detailGroup.transform(lambda x:(x.mean()-x.min())/(x.max()-x.min())).head())
ZeroDivisionError: ('float division by zero', 'occurred at index counts')
4.4.5 任务实现
1、按照时间对菜品订单详情表进行拆分
#订单详情表按照日期分组
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
engine = create_engine("mysql+pymysql://root:123456@127.0.0.1/testdb?charset=utf8mb4")
session = sessionmaker(bind=engine)
detail=pd.read_sql_table('meal_order_detail1',con=engine)
detail['place_order_time']=pd.to_datetime(detail['place_order_time'])
detail['date']=[i.date() for i in detail['place_order_time']]
detailGroup=detail[['date','counts','amounts']].groupby(by='date')
print('订单详情表前5组每组的数目为:\n',detailGroup.size().head())
订单详情表前5组每组的数目为:
date
2016-08-01 217
2016-08-02 138
2016-08-03 157
2016-08-04 144
2016-08-05 193
dtype: int64
2、使用agg方法计算单日菜品销售的平均单价和售价中位数
#求分组后的订单详情表每日菜品销售的均价、中位数
dayMean=detailGroup.agg({'amounts':np.mean})
print('订单详情表前5组单日菜品销售均价为:\n',dayMean.head())
订单详情表前5组单日菜品销售均价为:
amounts
date
2016-08-01 43.161290
2016-08-02 44.384058
2016-08-03 43.885350
2016-08-04 52.423611
2016-08-05 44.927461
dayMedian=detailGroup.agg({'amounts':np.median})
print('订单详情表前5组单日菜品销售中位数为:\n',dayMedian.head())
订单详情表前5组单日菜品销售中位数为:
amounts
date
2016-08-01 33.0
2016-08-02 35.0
2016-08-03 38.0
2016-08-04 39.0
2016-08-05 37.0
3、使用apply方法统计单日菜品销售数目
#求取订单详情表中每日菜品总销量
daySaleSum=detailGroup.apply(np.sum)['counts']
print('订单详情表前5组单日菜品售出数目为:\n',daySaleSum.head())
订单详情表前5组单日菜品售出数目为:
date
2016-08-01 233.0
2016-08-02 151.0
2016-08-03 192.0
2016-08-04 169.0
2016-08-05 224.0
Name: counts, dtype: float64
任务4.5 创建透视表与交叉表
4.5.1 使用pivot_table函数创建透视表
#使用订单号作为透视表索引制作透视图
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
engine = create_engine("mysql+pymysql://root:123456@127.0.0.1/testdb?charset=utf8mb4")
session = sessionmaker(bind=engine)
detail=pd.read_sql_table('meal_order_detail1',con=engine)
detailPivot=pd.pivot_table(detail[['order_id','counts','amounts']],index='order_id')
print('以order_id作为分组键创建的订单透视表:\n',detailPivot.head())
以order_id作为分组键创建的订单透视表:
amounts counts
order_id
1002 32.000 1.0000
1003 30.125 1.2500
1004 43.875 1.0625
1008 63.000 1.0000
1011 57.700 1.0000
#修改聚合函数后的透视表
detailPivot1=pd.pivot_table(detail[['order_id','counts','amounts']],index='order_id',aggfunc=np.sum)#当不指定聚合函数aggfunc时,会默认numpy.mean进行聚合计算
print('以order_id作为分组键创建的订单销量与售价总和透视表:\n',detailPivot1.head())
以order_id作为分组键创建的订单销量与售价总和透视表:
amounts counts
order_id
1002 224.0 7.0
1003 241.0 10.0
1004 702.0 17.0
1008 315.0 5.0
1011 577.0 10.0
#使用订单号和菜品名称作为索引的透视表
detailPivot2=pd.pivot_table(detail[['order_id','dishes_name','counts','amounts']],index=['order_id','dishes_name'],aggfunc=np.sum)#当不指定聚合函数aggfunc时,会默认numpy.mean进行聚合计算
print('以order_id和dishes_name作为分组键创建的订单销量与售价总和透视表:\n',detailPivot2.head())
以order_id和dishes_name作为分组键创建的订单销量与售价总和透视表:
amounts counts
order_id dishes_name
1002 凉拌菠菜 27.0 1.0
南瓜枸杞小饼干 19.0 1.0
焖猪手 58.0 1.0
独家薄荷鲜虾牛肉卷\r\n\r\n\r\n 45.0 1.0
白胡椒胡萝卜羊肉汤 35.0 1.0
#指定菜品名称为列分组键的透视表
detailPivot3=pd.pivot_table(detail[['order_id','dishes_name','counts','amounts']],index='order_id',columns='dishes_name',aggfunc=np.sum)#当不指定聚合函数aggfunc时,会默认numpy.mean进行聚合计算
print('以order_id和dishes_name作为分组键创建的订单销量与售价总和透视表:\n',detailPivot3.iloc[:5,:4])
以order_id和dishes_name作为分组键创建的订单销量与售价总和透视表:
amounts
dishes_name 42度海之蓝 北冰洋汽水 38度剑南春 50度古井贡酒
order_id
1002 NaN NaN NaN NaN
1003 NaN NaN NaN NaN
1004 NaN NaN NaN NaN
1008 NaN NaN NaN NaN
1011 99.0 NaN NaN NaN
#指定某些列制作透视表
detailPivot4=pd.pivot_table(detail[['order_id','dishes_name','counts','amounts']],index='order_id',values='counts',aggfunc=np.sum)#values用于指定要聚合的数据字段名,默认使用全部数据
print('以order_id作为行分组键counts作为值创建的透视表前五行为:\n',detailPivot4.head())
以order_id作为行分组键counts作为值创建的透视表前五行为:
counts
order_id
1002 7.0
1003 10.0
1004 17.0
1008 5.0
1011 10.0
#对透视表的缺失值进行填充
detailPivot5=pd.pivot_table(detail[['order_id','dishes_name','counts','amounts']],index='order_id',columns='dishes_name',aggfunc=np.sum,fill_value=0)#fill_value接收标量,用于填充透视表中的缺失值,默认为None
print('空值填0后以order_id和dishes_name为行列分组键创建透视表前5行4列为:\n',detailPivot5.iloc[:5,:4])
空值填0后以order_id和dishes_name为行列分组键创建透视表前5行4列为:
amounts
dishes_name 42度海之蓝 北冰洋汽水 38度剑南春 50度古井贡酒
order_id
1002 0 0 0 0
1003 0 0 0 0
1004 0 0 0 0
1008 0 0 0 0
1011 99 0 0 0
#在透视表中添加汇总数据
detailPivot6=pd.pivot_table(detail[['order_id','dishes_name','counts','amounts']],index='order_id',columns='dishes_name',aggfunc=np.sum,fill_value=0,margins=True)
#margins表示汇总功能的开关,设置为True后,结果会出现名为All的汇总行与列,默认为False
print('空值填0后以order_id和dishes_name为行列分组键创建透视表前5行4列为:\n',detailPivot6.iloc[:5,-4:])
空值填0后以order_id和dishes_name为行列分组键创建透视表前5行4列为:
counts
dishes_name 黄油曲奇饼干 黄花菜炒木耳 黑米恋上葡萄 All
order_id
1002 0 0 0 7.0
1003 0 0 0 10.0
1004 0 1 0 17.0
1008 0 0 0 5.0
1011 0 0 0 10.0
4.5.2使用crosstab函数创建交叉表
#使用crosstab函数制定交叉表
detailCross=pd.crosstab(index=detail['order_id'],columns=detail['dishes_name'],values=detail['counts'],aggfunc=np.sum)
print('以order_id和dishes_name为分组键,以counts为值的透视表前5行5列为:\n',detailCross.iloc[:5,:5])
以order_id和dishes_name为分组键,以counts为值的透视表前5行5列为:
dishes_name 42度海之蓝 北冰洋汽水 38度剑南春 50度古井贡酒 52度泸州老窖
order_id
1002 NaN NaN NaN NaN NaN
1003 NaN NaN NaN NaN NaN
1004 NaN NaN NaN NaN NaN
1008 NaN NaN NaN NaN NaN
1011 1.0 NaN NaN NaN NaN
4.5.3 任务实现
1、创建单日菜单成交总额与总数均价透视表
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
engine = create_engine("mysql+pymysql://root:123456@127.0.0.1/testdb?charset=utf8mb4")
session = sessionmaker(bind=engine)
detail=pd.read_sql_table('meal_order_detail1',con=engine)
detail['place_order_time']=pd.to_datetime(detail['place_order_time'])
detail['date']=[i.date() for i in detail['place_order_time']]
PivotDetail=pd.pivot_table(detail[['date','dishes_name','counts','amounts']],index='date',aggfunc=np.sum,margins=True)
print('订单详情表单日菜品成交总额与总数透视表前5行为:\n',PivotDetail.head())
订单详情表单日菜品成交总额与总数透视表前5行为:
amounts counts
date
2016-08-01 9366.0 233.0
2016-08-02 6125.0 151.0
2016-08-03 6890.0 192.0
2016-08-04 7549.0 169.0
2016-08-05 8671.0 224.0
2、创建单个菜品单日成交总额透视表
#订单详情表单个菜品单日成交总额透视表
CrossDetail=pd.crosstab(index=detail['date'],columns=detail['dishes_name'],values=detail['amounts'],aggfunc=np.sum,margins=True)
print('订单详情表单个菜品单日成交总额交叉表后5行5列为:\n',CrossDetail.iloc[-5:,-5:])
订单详情表单个菜品单日成交总额交叉表后5行5列为:
dishes_name 黄尾袋鼠西拉子红葡萄酒 黄油曲奇饼干 黄花菜炒木耳 黑米恋上葡萄 All
date
2016-08-07 230.0 32.0 105.0 99.0 31306.0
2016-08-08 46.0 NaN NaN 33.0 6532.0
2016-08-09 138.0 NaN 35.0 99.0 7155.0
2016-08-10 46.0 NaN 70.0 33.0 10231.0
All 736.0 80.0 525.0 561.0 125992.0
实训
实训1 读取并查看P2P网络贷款数据主表的基本信息
1、查看数据的大小、维度和占用内存
import pandas as pd
import numpy as np
#使用read_csv读取P2P网络贷款数据主表的基本信息数据
order1=pd.read_csv('./data/Training_Master.csv',encoding='gbk')
print('P2P网络贷款数据主表的基本信息数据的维度是:\n',order1.ndim)
print('P2P网络贷款数据主表的基本信息数据的大小是:\n',order1.shape)
print('P2P网络贷款数据主表的基本信息数据的占用内存为:\n',order1.memory_usage())
P2P网络贷款数据主表的基本信息数据的维度是:
2
P2P网络贷款数据主表的基本信息数据的大小是:
(30000, 228)
P2P网络贷款数据主表的基本信息数据的占用内存为:
Index 128
Idx 240000
UserInfo_1 240000
UserInfo_2 240000
UserInfo_3 240000
...
SocialNetwork_15 240000
SocialNetwork_16 240000
SocialNetwork_17 240000
target 240000
ListingInfo 240000
Length: 229, dtype: int64
2、对数据进行描述性统计
#使用describe方法进行描述性统计,并剔除值相同或全为空的列
# print(order1.columns)
#用户第三方平台信息
print('P2P网络贷款数据主表ThirdParty_Info_Period1_13和ThirdParty_Info_Period1_14两列的描述性统计为:\n',order1.loc[:,['ThirdParty_Info_Period1_13','ThirdParty_Info_Period1_14']].describe())
P2P网络贷款数据主表ThirdParty_Info_Period1_13和ThirdParty_Info_Period1_14两列的描述性统计为:
ThirdParty_Info_Period1_13 ThirdParty_Info_Period1_14
count 3.000000e+04 3.000000e+04
mean 1.509768e+04 1.324802e+04
std 2.407289e+04 2.137895e+04
min -1.000000e+00 -1.000000e+00
25% 3.817750e+03 3.626750e+03
50% 9.263500e+03 8.799000e+03
75% 1.897300e+04 1.736425e+04
max 1.120334e+06 1.271271e+06
order1['Idx']=order1['Idx'].astype('category')#用户标识
order1['UserInfo_2']=order1['UserInfo_2'].astype('category')#用户基本信息
print('''P2P网络贷款数据主表Idx与UserInfo_2的描述性统计结果为:''','\n',order1[['Idx','UserInfo_2']].describe())
P2P网络贷款数据主表Idx与UserInfo_2的描述性统计结果为:
Idx UserInfo_2
count 30000 29698
unique 30000 327
top 91703 深圳
freq 1 746
3、剔除全为空值或者所有元素取值相同的列
#剔除餐饮菜品中整列为空或者取值完全相同的列
#定义一个函数剔除全为空值的列和标准差为0的列
def dropNullStd(data):
    """Drop, in place, columns that are entirely null or have zero variance.

    Parameters
    ----------
    data : pandas.DataFrame
        Mutated in place via ``drop(..., inplace=True)``.

    Prints the number of removed columns and the resulting shape;
    returns None (in-place convention).

    Only columns covered by ``describe()`` (i.e. numeric columns) are
    examined, matching the original loop-based implementation, but each
    pass now selects the doomed columns vectorized and drops them with a
    single call instead of one ``drop`` per column.
    """
    beforelen = data.shape[1]
    # Pass 1: all-NaN numeric columns have a 'count' statistic of 0.
    stats = data.describe()
    null_cols = stats.columns[stats.loc['count'] == 0]
    data.drop(columns=list(null_cols), inplace=True)
    # Pass 2: recompute stats, then remove constant columns (std == 0).
    stats = data.describe()
    const_cols = stats.columns[stats.loc['std'] == 0]
    data.drop(columns=list(const_cols), inplace=True)
    afterlen = data.shape[1]
    print('剔除的列的数目为:', beforelen-afterlen)
    print('剔除后数据的形状为:',data.shape)
dropNullStd(order1)
剔除的列的数目为: 2
剔除后数据的形状为: (30000, 226)
实训2 提取用户信息更新表和登录信息表的时间信息
1、转换字符串时间为标准时间
import pandas as pd
order2=pd.read_table('./data/Training_LogInfo.csv',sep=',',encoding='gbk')
order3=pd.read_table('./data/Training_Userupdate.csv',sep=',',encoding='gbk')
print('进行转换前登录信息表LogInfo3的类型为:',order2['LogInfo3'].dtypes)
print('进行转换前用户信息更新表ListingInfo1和UserUpdateInfo_2的类型为:',order3[['ListingInfo1','UserupdateInfo2']].dtypes)
order2['LogInfo3']=pd.to_datetime(order2['LogInfo3'])
dtd=order3[['ListingInfo1','UserupdateInfo2']]
dtd.apply(lambda x: pd.to_datetime(x,errors = 'coerce', format = '%Y-%m-%d'))
#dateIndex=pd.to_datetime(order3[['ListingInfo1','UserupdateInfo2']])
print('进行转换后登录信息表LogInfo3的类型为:',order2['LogInfo3'].dtypes)
print('进行转换后用户信息更新表ListingInfo1和UserupdateInfo2的类型为:',dtd.dtypes)
进行转换前登录信息表LogInfo3的类型为: object
进行转换前用户信息更新表ListingInfo1和UserUpdateInfo_2的类型为: ListingInfo1 object
UserupdateInfo2 object
dtype: object
进行转换后登录信息表LogInfo3的类型为: datetime64[ns]
进行转换后用户信息更新表ListingInfo1和UserupdateInfo2的类型为: ListingInfo1 object
UserupdateInfo2 object
dtype: object
2、 使用year、month、week等方法提取用户信息表和登录信息表中的时间信息
#提取登录信息表的时间信息
year1=[i.year for i in order2['LogInfo3']]
print('LogInfo3中的年份数据前5个为:',year1[:5])
month1=[i.month for i in order2['LogInfo3']]
print('LogInfo3中的月份数据前5个为:',month1[:5])
day1=[i.day for i in order2['LogInfo3']]
print('LogInfo3中的日期数据前5个为:',day1[:5])
weekday1=[i.weekday_name for i in order2['LogInfo3']]
print('LogInfo3中的星期名称数据前5个为:',weekday1[:5])
LogInfo3中的年份数据前5个为: [2014, 2014, 2014, 2014, 2014]
LogInfo3中的月份数据前5个为: [2, 2, 2, 2, 2]
LogInfo3中的日期数据前5个为: [20, 23, 24, 25, 27]
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:8: FutureWarning: `weekday_name` is deprecated and will be removed in a future version. Use `day_name` instead
LogInfo3中的星期名称数据前5个为: ['Thursday', 'Sunday', 'Monday', 'Tuesday', 'Thursday']
#提取用户信息表的时间信息
order3['ListingInfo1']=pd.to_datetime(order3['ListingInfo1'])
year2=[i.year for i in order3['ListingInfo1']]
print('ListingInfo1中的年份数据前5个为:',year2[:5])
month2=[i.month for i in order3['ListingInfo1']]
print('ListingInfo1中的月份数据前5个为:',month2[:5])
day2=[i.day for i in order3['ListingInfo1']]
print('ListingInfo1中的日期数据前5个为:',day2[:5])
weekday2=[i.weekday_name for i in order3['ListingInfo1']]
print('ListingInfo1中的星期名称数据前5个为:',weekday2[:5])
ListingInfo1中的年份数据前5个为: [2014, 2014, 2014, 2014, 2014]
ListingInfo1中的月份数据前5个为: [3, 3, 3, 3, 3]
ListingInfo1中的日期数据前5个为: [5, 5, 5, 5, 5]
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:9: FutureWarning: `weekday_name` is deprecated and will be removed in a future version. Use `day_name` instead
if __name__ == '__main__':
ListingInfo1中的星期名称数据前5个为: ['Wednesday', 'Wednesday', 'Wednesday', 'Wednesday', 'Wednesday']
order3['UserupdateInfo2']=pd.to_datetime(order3['UserupdateInfo2'])
year3=[i.year for i in order3['UserupdateInfo2']]
print('UserupdateInfo2中的年份数据前5个为:',year3[:5])
month3=[i.month for i in order3['UserupdateInfo2']]
print('UserupdateInfo2中的月份数据前5个为:',month3[:5])
day3=[i.day for i in order3['UserupdateInfo2']]
print('UserupdateInfo2中的日期数据前5个为:',day3[:5])
weekday3=[i.weekday_name for i in order3['UserupdateInfo2']]
print('UserupdateInfo2中的星期名称数据前5个为:',weekday3[:5])
UserupdateInfo2中的年份数据前5个为: [2014, 2014, 2014, 2014, 2014]
UserupdateInfo2中的月份数据前5个为: [2, 2, 2, 2, 2]
UserupdateInfo2中的日期数据前5个为: [20, 20, 20, 20, 20]
C:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:8: FutureWarning: `weekday_name` is deprecated and will be removed in a future version. Use `day_name` instead
UserupdateInfo2中的星期名称数据前5个为: ['Thursday', 'Thursday', 'Thursday', 'Thursday', 'Thursday']
3、计算用户信息表和登录信息表中两时间的差,分别以日、小时、分钟计算
import numpy as np
dayDelta=order3['ListingInfo1']-order2['LogInfo3']#以日计算
print("计算时间差以日为单位:\n",dayDelta.head())
def TransformhourDelta(data):
    """Convert each Timedelta in *data* to a count of hours, in place.

    Mutates the passed Series element by element (so the caller's object
    is changed too) and returns it for convenience.
    """
    position = 0
    while position < len(data):
        data[position] = data[position].total_seconds() / 3600
        position += 1
    return data
print("计算时间差以小时为单位:\n",TransformhourDelta(dayDelta).head())
def TransformDayIntoMinute(data):
    """Convert each Timedelta in *data* to a count of minutes, in place.

    Mutates the passed Series element by element and returns it.
    """
    for k in range(len(data)):
        seconds = data[k].total_seconds()
        data[k] = seconds / 60
    return data
timeDeltaUserupdate=order3["ListingInfo1"]-order2['LogInfo3']
print("计算时间差以分钟为单位:\n",TransformDayIntoMinute(timeDeltaUserupdate).head())
计算时间差以日为单位:
0 13 days
1 10 days
2 9 days
3 8 days
4 6 days
dtype: timedelta64[ns]
计算时间差以小时为单位:
0 312
1 240
2 216
3 192
4 144
dtype: object
计算时间差以分钟为单位:
0 18720
1 14400
2 12960
3 11520
4 8640
dtype: object
实训3 使用分组聚合方法进一步分析用户信息更新表和登录信息表
1、使用groupby方法拆分数据
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
engine = create_engine("mysql+pymysql://root:123456@127.0.0.1/testdb?charset=utf8mb4")
session = sessionmaker(bind=engine)
order2=pd.read_table('./data/Training_LogInfo.csv',sep=',',encoding='gbk')
order3=pd.read_table('./data/Training_Userupdate.csv',sep=',',encoding='gbk')
order2Group=order2[['Idx','Listinginfo1','LogInfo1','LogInfo2','LogInfo3']].groupby(by='Idx')
print('分组后的登录信息表为',order2Group)
分组后的登录信息表为 <pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002376A1B3508>
order3Group=order3[['Idx','ListingInfo1','UserupdateInfo1','UserupdateInfo2']].groupby(by='Idx')
print('分组后的用户信息更新表为',order3Group)
分组后的用户信息更新表为 <pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002376A1B3488>
print('登录信息表分组后前5组每组的均值为:\n',order2Group.mean().head())
登录信息表分组后前5组每组的均值为:
LogInfo1 LogInfo2
Idx
3 -1.230769 5.153846
5 -1.272727 3.909091
8 2.096000 5.920000
12 5.864322 3.889447
16 -2.066667 4.333333
print('登录信息表分组后前5组每组的标准差为:\n',order2Group.std().head())
登录信息表分组后前5组每组的标准差为:
LogInfo1 LogInfo2
Idx
3 3.421201 2.166884
5 4.221159 3.910359
8 23.749168 1.860454
12 21.852737 2.049351
16 2.914659 2.468854
print('登录信息表分组后前5组每组的大小为:\n',order2Group.size().head())
登录信息表分组后前5组每组的大小为:
Idx
3 26
5 11
8 125
12 199
16 15
dtype: int64
print('用户信息更新表分组后前5组每组的均值为:\n',order3Group.mean().head())
---------------------------------------------------------------------------
DataError Traceback (most recent call last)
<ipython-input-121-2c7f344e9e2d> in <module>
----> 1 print('用户信息更新表分组后前5组每组的均值为:\n',order3Group.mean().head())
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\groupby\groupby.py in mean(self, *args, **kwargs)
1203 try:
1204 return self._cython_agg_general(
-> 1205 "mean", alt=lambda x, axis: Series(x).mean(**kwargs), **kwargs
1206 )
1207 except GroupByError:
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\groupby\generic.py in _cython_agg_general(self, how, alt, numeric_only, min_count)
126 def _cython_agg_general(self, how, alt=None, numeric_only=True, min_count=-1):
127 new_items, new_blocks = self._cython_agg_blocks(
--> 128 how, alt=alt, numeric_only=numeric_only, min_count=min_count
129 )
130 return self._wrap_agged_blocks(new_items, new_blocks)
C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\groupby\generic.py in _cython_agg_blocks(self, how, alt, numeric_only, min_count)
186
187 if len(new_blocks) == 0:
--> 188 raise DataError("No numeric types to aggregate")
189
190 # reset the locs in the blocks to correspond to our
DataError: No numeric types to aggregate
print('用户信息更新表分组后前5组每组的均值为:\n',order3Group.std().head())
用户信息更新表分组后前5组每组的均值为:
Empty DataFrame
Columns: []
Index: []
print('用户信息更新表分组后前5组每组的大小为:\n',order3Group.size().head())
用户信息更新表分组后前5组每组的大小为:
Idx
3 13
5 13
8 14
12 14
16 13
dtype: int64
2、使用agg方法求取分组后的最早和最晚更新及登录时间
print('分组后的最早登录时间和最晚登录时间为:\n',order2Group['LogInfo3'].agg([np.min,np.max]))
分组后的最早登录时间和最晚登录时间为:
amin amax
Idx
3 2013-08-30 2013-11-01
5 2013-10-24 2013-11-06
8 2013-10-25 2013-11-06
12 2012-12-08 2013-11-01
16 2013-10-27 2013-11-04
... ... ...
91689 2014-10-26 2014-10-29
91693 2014-10-26 2014-11-05
91695 2014-09-26 2014-10-30
91702 2014-10-26 2014-10-28
91703 2014-10-26 2014-11-03
[28987 rows x 2 columns]
print('分组后的最早更新时间和最晚更新时间为:\n',order3Group['UserupdateInfo2'].agg([np.min,np.max]))
分组后的最早更新时间和最晚更新时间为:
amin amax
Idx
3 2013/08/30 2013/08/30
5 2013/10/24 2013/10/24
8 2013/10/25 2013/11/04
12 2012/12/08 2013/10/02
16 2013/10/27 2013/11/05
... ... ...
91689 2014/10/26 2014/10/26
91693 2014/10/26 2014/10/28
91695 2014/09/26 2014/10/26
91702 2014/10/26 2014/10/26
91703 2014/10/26 2014/11/02
[29995 rows x 2 columns]
3、使用size方法求取分组后的数据的信息更新次数与登录次数
#print(order3Group['Idx'].head())
print('分组后的数据信息登录次数为:\n',order2Group['LogInfo3'].size().head())
print('分组后的数据信息更新次数为:\n',order3Group['UserupdateInfo2'].size().head())
分组后的数据信息登录次数为:
Idx
3 26
5 11
8 125
12 199
16 15
Name: LogInfo3, dtype: int64
分组后的数据信息更新次数为:
Idx
3 13
5 13
8 14
12 14
16 13
Name: UserupdateInfo2, dtype: int64
实训4 对用户信息更新表和登录信息表进行长宽表转换
1、使用pivot_table函数进行长宽表转换
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
engine = create_engine("mysql+pymysql://root:123456@127.0.0.1/testdb?charset=utf8mb4")
session = sessionmaker(bind=engine)
order2=pd.read_table('./data/Training_LogInfo.csv',sep=',',encoding='gbk')
order3=pd.read_table('./data/Training_Userupdate.csv',sep=',',encoding='gbk')
order2Pivot=pd.pivot_table(order2[['Idx','Listinginfo1','LogInfo1','LogInfo2','LogInfo3']],index='Idx',aggfunc=np.sum)
print('以Idx为分键组创建的登录信息表为:\n',order2Pivot.head())
order3Pivot=pd.pivot_table(order3[['Idx','ListingInfo1','UserupdateInfo1','UserupdateInfo2']],index='Idx',aggfunc=np.sum)
print('以Idx为分键组创建的信息更新表为:\n',order3Pivot.head())
以Idx为分键组创建的登录信息表为:
LogInfo1 LogInfo2
Idx
3 -32 134
5 -14 43
8 262 740
12 1167 774
16 -31 65
以Idx为分键组创建的信息更新表为:
ListingInfo1 \
Idx
3 2013/11/052013/11/052013/11/052013/11/052013/1...
5 2013/11/072013/11/072013/11/072013/11/072013/1...
8 2013/11/062013/11/062013/11/062013/11/062013/1...
12 2013/11/012013/11/012013/11/012013/11/012013/1...
16 2013/11/052013/11/052013/11/052013/11/052013/1...
UserupdateInfo1 \
Idx
3 _EducationId_HasBuyCar_IdNumber_LastUpdateDate...
5 _EducationId_HasBuyCar_IdNumber_LastUpdateDate...
8 _EducationId_HasBuyCar_IdNumber_LastUpdateDate...
12 _age_Age_EducationId_gender_HasBuyCar_idNumber...
16 _EducationId_HasBuyCar_IdNumber_LastUpdateDate...
UserupdateInfo2
Idx
3 2013/08/302013/08/302013/08/302013/08/302013/0...
5 2013/10/242013/10/242013/10/242013/10/242013/1...
8 2013/10/252013/10/252013/10/252013/10/252013/1...
12 2012/12/082013/10/022013/10/022012/12/082013/1...
16 2013/10/272013/10/272013/10/272013/10/272013/1...
#使用用户标识符和登录时间作为索引的透视表
order2Pivot=pd.pivot_table(order2[['Idx','Listinginfo1','LogInfo1','LogInfo2','LogInfo3']],index=['Idx','Listinginfo1'],aggfunc=np.sum)
print('以Idx和Listinginfo1为分键组创建的登录信息表为:\n',order2Pivot.head())
order3Pivot=pd.pivot_table(order3[['Idx','ListingInfo1','UserupdateInfo1','UserupdateInfo2']],index=['Idx','ListingInfo1'],aggfunc=np.sum)
print('以Idx和ListingInfo1为分键组创建的信息更新表为:\n',order3Pivot.head())
以Idx和Listinginfo1为分键组创建的登录信息表为:
LogInfo1 LogInfo2
Idx Listinginfo1
3 2013-11-05 -32 134
5 2013-11-07 -14 43
8 2013-11-06 262 740
12 2013-11-01 1167 774
16 2013-11-05 -31 65
以Idx和ListingInfo1为分键组创建的信息更新表为:
UserupdateInfo1 \
Idx ListingInfo1
3 2013/11/05 _EducationId_HasBuyCar_IdNumber_LastUpdateDate...
5 2013/11/07 _EducationId_HasBuyCar_IdNumber_LastUpdateDate...
8 2013/11/06 _EducationId_HasBuyCar_IdNumber_LastUpdateDate...
12 2013/11/01 _age_Age_EducationId_gender_HasBuyCar_idNumber...
16 2013/11/05 _EducationId_HasBuyCar_IdNumber_LastUpdateDate...
UserupdateInfo2
Idx ListingInfo1
3 2013/11/05 2013/08/302013/08/302013/08/302013/08/302013/0...
5 2013/11/07 2013/10/242013/10/242013/10/242013/10/242013/1...
8 2013/11/06 2013/10/252013/10/252013/10/252013/10/252013/1...
12 2013/11/01 2012/12/082013/10/022013/10/022012/12/082013/1...
16 2013/11/05 2013/10/272013/10/272013/10/272013/10/272013/1...
order2Pivot=pd.pivot_table(order2[['Idx','Listinginfo1','LogInfo1','LogInfo2','LogInfo3']],index='Idx',columns='LogInfo1',aggfunc=np.sum,fill_value=0)
print('以Idx和LogInfo1为行列分键组创建的登录信息表为:\n',order2Pivot.head())
order3Pivot=pd.pivot_table(order3[['Idx','ListingInfo1','UserupdateInfo1','UserupdateInfo2']],index='Idx',columns='UserupdateInfo1',aggfunc=np.sum,fill_value=0)
print('以Idx和UserupdateInfo1为行列分键组创建的信息更新表为:\n',order3Pivot.head())
以Idx和LogInfo1为行列分键组创建的登录信息表为:
LogInfo2 ... \
LogInfo1 -10 -4 0 1 2 3 4 5 6 8 ...
Idx ...
3 0 90 0 7 8 0 29 0 0 0 ...
5 13 24 0 3 1 1 1 0 0 0 ...
8 13 654 0 7 22 1 1 0 0 0 ...
12 13 210 0 29 446 1 15 0 0 0 ...
16 0 60 0 3 1 0 1 0 0 0 ...
LogInfo1 302 303 304 305 307 310 1000 2000 3000 3001
Idx
3 0 0 0 0 0 0 0 0 0 0
5 0 0 0 0 0 0 0 0 0 0
8 0 0 0 0 0 0 0 0 0 0
12 0 0 0 0 0 0 0 0 0 0
16 0 0 0 0 0 0 0 0 0 0
[5 rows x 35 columns]
以Idx和UserupdateInfo1为行列分键组创建的信息更新表为:
ListingInfo1 \
UserupdateInfo1 _Age _BussinessAddress _ByUserId _CityId
Idx
3 0 0 0 0
5 0 0 0 0
8 0 0 0 0
12 2013/11/01 0 0 0
16 0 0 0 0
\
UserupdateInfo1 _CompanyAddress _CompanyName _CompanyPhone _CompanySizeId
Idx
3 0 0 0 0
5 0 0 0 0
8 0 0 0 0
12 0 0 0 0
16 0 0 0 0
... UserupdateInfo2 \
UserupdateInfo1 _CompanyTypeId _CreationDate ... _relationshipId
Idx ...
3 0 0 ... 0
5 0 0 ... 0
8 0 0 ... 0
12 0 0 ... 0
16 0 0 ... 0
\
UserupdateInfo1 _residenceAddress _residencePhone _residenceTypeId
Idx
3 0 0 0
5 0 0 0
8 0 0 0
12 0 0 0
16 0 0 0
\
UserupdateInfo1 _residenceYears _schoolName _turnover _userId _webShopUrl
Idx
3 0 0 0 0 0
5 0 0 0 0 0
8 0 0 0 0 0
12 0 0 0 0 0
16 0 0 0 0 0
UserupdateInfo1 _workYears
Idx
3 0
5 0
8 0
12 0
16 0
[5 rows x 172 columns]
2、使用crosstab方法进行长宽表转换
order2Pivot=pd.crosstab(index=order2['Idx'],columns=order2['LogInfo1'],values=order2['LogInfo2'],aggfunc=np.sum)
print('以Idx和LogInfo1为行列分键组创建的登录信息表为:\n',order2Pivot.head())
以Idx和LogInfo1为行列分键组创建的登录信息表为:
LogInfo1 -10 -4 0 1 2 3 4 5 6 \
Idx
3 NaN 90.0 NaN 7.0 8.0 NaN 29.0 NaN NaN
5 13.0 24.0 NaN 3.0 1.0 1.0 1.0 NaN NaN
8 13.0 654.0 NaN 7.0 22.0 1.0 1.0 NaN NaN
12 13.0 210.0 NaN 29.0 446.0 1.0 15.0 NaN NaN
16 NaN 60.0 NaN 3.0 1.0 NaN 1.0 NaN NaN
LogInfo1 8 ... 302 303 304 305 307 310 1000 2000 \
Idx ...
3 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN
5 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN
8 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN
12 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN
16 NaN ... NaN NaN NaN NaN NaN NaN NaN NaN
LogInfo1 3000 3001
Idx
3 NaN NaN
5 NaN NaN
8 NaN NaN
12 NaN NaN
16 NaN NaN
[5 rows x 35 columns]
order3Pivot=pd.crosstab(index=order3['Idx'],columns=order3['UserupdateInfo1'],values=order3['Idx'],aggfunc=np.sum)
print('以Idx和UserupdateInfo1为行列分键组创建的登录信息表为:\n',order3Pivot.head())
以Idx和UserupdateInfo1为行列分键组创建的登录信息表为:
UserupdateInfo1 _Age _BussinessAddress _ByUserId _CityId _CompanyAddress \
Idx
3 NaN NaN NaN NaN NaN
5 NaN NaN NaN NaN NaN
8 NaN NaN NaN NaN NaN
12 12.0 NaN NaN NaN NaN
16 NaN NaN NaN NaN NaN
UserupdateInfo1 _CompanyName _CompanyPhone _CompanySizeId _CompanyTypeId \
Idx
3 NaN NaN NaN NaN
5 NaN NaN NaN NaN
8 NaN NaN NaN NaN
12 NaN NaN NaN NaN
16 NaN NaN NaN NaN
UserupdateInfo1 _CreationDate ... _relationshipId _residenceAddress \
Idx ...
3 NaN ... NaN NaN
5 NaN ... NaN NaN
8 NaN ... NaN NaN
12 NaN ... NaN NaN
16 NaN ... NaN NaN
UserupdateInfo1 _residencePhone _residenceTypeId _residenceYears \
Idx
3 NaN NaN NaN
5 NaN NaN NaN
8 NaN NaN NaN
12 NaN NaN NaN
16 NaN NaN NaN
UserupdateInfo1 _schoolName _turnover _userId _webShopUrl _workYears
Idx
3 NaN NaN NaN NaN NaN
5 NaN NaN NaN NaN NaN
8 NaN NaN NaN NaN NaN
12 NaN NaN NaN NaN NaN
16 NaN NaN NaN NaN NaN
[5 rows x 86 columns]