pandas中的数值计算及统计基础

 import pandas as pd
import numpy as np df = pd.DataFrame({
'key1': [4, 5, 3, np.nan, 2],
'key2': [1, 2, np.nan, 4, 5],
'key3': [1, 2, 3, 'j', 'k']
}, index=['a', 'b', 'c', 'd', 'e'])
print(df)
print(df['key1'].dtype,df['key2'].dtype,df['key3'].dtype)
print('-------')
'''
key1 key2 key3
a 4.0 1.0 1
b 5.0 2.0 2
c 3.0 NaN 3
d NaN 4.0 j
e 2.0 5.0 k
float64 float64 object
-------
'''
# 计算每一列的均值 df.mean()
# 只统计数字列,默认忽略nan。
print(df.mean())
'''
key1 3.5
key2 3.0
dtype: float64
'''
# 不忽略nan值计算均值
# skipna默认为True,如果为False,有NaN的列统计结果仍为NaN
m3 = df.mean(skipna=False)
print(m3)
'''
key1 NaN
key2 NaN
dtype: float64
'''
# 计算单一列的均值
print('计算单一列的均值',df['key2'].mean())
'''
计算单一列的均值 3.0
''' df2 = pd.DataFrame({
'key1': [1, 3, 5],
'key2': [2, 4, 6],
'key3': [3, 5, 7]
}, index=['a', 'b', 'c'])
# print(df2)
# print('--------df2')
# 计算df2每一行的均值并将其结果添加到新的列
df2['mean'] = df2.mean(axis=1)
print(df2)
'''
key1 key2 key3 mean
a 1 2 3 2.0
b 3 4 5 4.0
c 5 6 7 6.0
''' # 统计非NaN值的数量 count()
print(df)
print('-'*6)
print(df.count())
'''
key1 key2 key3
a 4.0 1.0 1
b 5.0 2.0 2
c 3.0 NaN 3
d NaN 4.0 j
e 2.0 5.0 k
------
key1 4
key2 4
key3 5
dtype: int64
''' # 统计
print(df)
print('-' * 6)
print('df的最小值',df.min())
print('df的最大值',df.max())
print('df的key2列的最大值',df['key2'].max())
print('统计df的分位数,参数q确定位置',df.quantile(q=0.75))
print('对df求和',df.sum())
print('求df的中位数,median(),50%分位数',df.median())
print('求df的标准差,std()',df.std())
print('求df的方差,var()',df.var())
print('求skew样本的偏度,skew()',df.skew())
print('求kurt样本的峰度,kurt()',df.kurt())
print('df累计求和,cumsum()',df['key2'].cumsum())
print('df累计求积,cumprod()',df['key2'].cumprod())
print('求df的累计最大值,cummax()', df['key2'].cummax())
print('求df的累计最小值,cummin()', df['key2'].cummin())
'''
key1 key2 key3
a 4.0 1.0 1
b 5.0 2.0 2
c 3.0 NaN 3
d NaN 4.0 j
e 2.0 5.0 k
------
df的最小值 key1 2.0
key2 1.0
dtype: float64
df的最大值 key1 5.0
key2 5.0
dtype: float64
df的key2列的最大值 5.0
统计df的分位数,参数q确定位置 key1 4.25
key2 4.25
Name: 0.75, dtype: float64
对df求和 key1 14.0
key2 12.0
dtype: float64
求df的中位数,median(),50%分位数 key1 3.5
key2 3.0
dtype: float64
求df的标准差,std() key1 1.290994
key2 1.825742
dtype: float64
求df的方差,var() key1 1.666667
key2 3.333333
dtype: float64
求skew样本的偏度,skew() key1 0.0
key2 0.0
dtype: float64
求kurt样本的峰度,kurt() key1 -1.2
key2 -3.3
dtype: float64
df累计求和,cumsum() a 1.0
b 3.0
c NaN
d 7.0
e 12.0
Name: key2, dtype: float64
df累计求积,cumprod() a 1.0
b 2.0
c NaN
d 8.0
e 40.0
Name: key2, dtype: float64
求df的累计最大值,cummax() a 1.0
b 2.0
c NaN
d 4.0
e 5.0
Name: key2, dtype: float64
求df的累计最小值,cummin() a 1.0
b 1.0
c NaN
d 1.0
e 1.0
Name: key2, dtype: float64
''' # 唯一值 :unique()
s = pd.Series(list('kjdhsakjdhjfh'))
sq = s.unique()
print(s)
print(sq)
print('sq的类型:',type(sq))
print('对sq进行重新排序:',pd.Series(sq).sort_values())
'''
0 k
1 j
2 d
3 h
4 s
5 a
6 k
7 j
8 d
9 h
10 j
11 f
12 h
dtype: object
['k' 'j' 'd' 'h' 's' 'a' 'f']
sq的类型: <class 'numpy.ndarray'>
对sq进行重新排序: 5 a
2 d
6 f
3 h
1 j
0 k
4 s
dtype: object
'''
# 对某一列进行值的计数,只能对一列,不能对Dataframe
print(df['key2'].value_counts()) # 判断Dataframe中的每个元素是否都是在某个列表中
print(df)
df_isin = df.isin([1,3])
print(df_isin)
'''
key1 key2 key3
a 4.0 1.0 1
b 5.0 2.0 2
c 3.0 NaN 3
d NaN 4.0 j
e 2.0 5.0 k key1 key2 key3
a False True True
b False False False
c True False True
d False False False
e False False False
'''
上一篇:PCA主成分分析 ICA独立成分分析 LDA线性判别分析 SVD性质


下一篇:何为Web App,何为Hybird App