numpy 与pandas
In [46]:
import numpy as np
import pandas as pd
np.add()/df.add() 加法运算 +
np.subtract()/df.subtract() 减法运算 -
np.negative() 负数运算 -
np.multiply() 乘法运算 *
np.divide() 除法运算 /
np.floor_divide() 向下整除 //
np.power() 指数运算 **
np.mod() 求余数 %
np.abs() 求绝对值
np.sin() ,cos(), tan() 求正弦 余弦 正切
np.exp(x) e为底的x次方 np.exp2(x),2为底的x次方
np.log(x) 以e为底x的对数（自然对数） np.log2(x) 以2为底x的对数
np.sum() min() max() 求和 求最小值 求最大值 axis=0 列 axis=1 行
np.prod() 计算元素的积
np.mean() 计算元素的平均值
np.std() 计算元素的标准差
np.var() 计算元素的方差
np.argmin()找出最小值的索引
np.median()计算元素的中位数
np.any() 验证是否存在元素为真
np.all() 验证所有元素是否为真
Out[46]:
3.0
In [2]:
# A Series is a one-dimensional array whose values carry an index.
# No index is given here, so pandas supplies the default integer one.
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0])
data
Out[2]:
0 0.25
1 0.50
2 0.75
3 1.00
dtype: float64
In [3]:
# The index and the values of a Series can be read back separately; this cell shows the index.
data.index
Out[3]:
RangeIndex(start=0, stop=4, step=1)
In [4]:
# The values of the Series, returned as a NumPy array.
data.values
Out[4]:
array([0.25, 0.5 , 0.75, 1. ])
In [5]:
# Element access with square brackets (this is a single lookup, not a slice).
# `data` still has its default integer index here, so 0 refers to the first element.
data[0]
Out[5]:
0.25
In [6]:
# The index can be replaced with custom labels; this rebinds the labels on `data` in place.
data.index=["a","b","c","d"]
data
Out[6]:
a 0.25
b 0.50
c 0.75
d 1.00
dtype: float64
In [7]:
# Build a Series from a dict: the dict keys become the index labels.
# NOTE(review): "tllinois" looks like a typo for "illinois"; it is kept unchanged
# because the recorded outputs show this exact label.
# NOTE(review): these figures are identical to area_dict in the next cell, so they
# look like copy-pasted areas rather than population counts -- confirm.
population_dic = {
    "california": 423967,
    "texas": 695663,
    "new york": 141297,
    "florida": 170312,
    "tllinois": 149995,
}
population = pd.Series(data=population_dic)
population
Out[7]:
california 423967
florida 170312
new york 141297
texas 695663
tllinois 149995
dtype: int64
In [8]:
# Second Series built the same way, holding the area per state.
# NOTE(review): `aeea` looks like a typo for `area`; the name is kept because the
# following cells reference it both as a variable and as a column label.
area_dict = {
    "california": 423967,
    "texas": 695663,
    "new york": 141297,
    "florida": 170312,
    "tllinois": 149995,
}
aeea = pd.Series(data=area_dict)
aeea
Out[8]:
california 423967
florida 170312
new york 141297
texas 695663
tllinois 149995
dtype: int64
In [9]:
# A DataFrame can be thought of as a two-dimensional dict: each column label maps
# to one column of data, so it is created here from a dict of Series.
states = pd.DataFrame(data={"population": population, "aeea": aeea})
states
Out[9]:
|
aeea |
population |
california |
423967 |
423967 |
florida |
170312 |
170312 |
new york |
141297 |
141297 |
texas |
695663 |
695663 |
tllinois |
149995 |
149995 |
In [10]:
# Row labels of the DataFrame.
states.index
Out[10]:
Index(['california', 'florida', 'new york', 'texas', 'tllinois'], dtype='object')
In [11]:
# Column labels of the DataFrame.
states.columns
Out[11]:
Index(['aeea', 'population'], dtype='object')
In [12]:
# Dict-style access with a column label returns that column as a Series.
states["aeea"]
Out[12]:
california 423967
florida 170312
new york 141297
texas 695663
tllinois 149995
Name: aeea, dtype: int64
In [13]:
# Create a single-column DataFrame from one Series, naming the column explicitly.
pd.DataFrame(population,columns=["population"])
Out[13]:
|
population |
california |
423967 |
florida |
170312 |
new york |
141297 |
texas |
695663 |
tllinois |
149995 |
In [14]:
# A list of dicts is read as a list of rows: the keys become the column labels.
data = [{"a": n, "b": n * 2} for n in range(5)]
pd.DataFrame(data=data)
Out[14]:
|
a |
b |
0 |
0 |
0 |
1 |
1 |
2 |
2 |
2 |
4 |
3 |
3 |
6 |
4 |
4 |
8 |
In [15]:
# Build a DataFrame from a 2-D NumPy array with explicit row labels and column names.
# The values come from a seeded RandomState (the same generator type this notebook
# uses for `rng` further down) so that re-running the cell reproduces the same table.
# The original drew from the unseeded global np.random.rand, so the previously
# recorded output cannot be reproduced and will be replaced on the next run.
random_frame = pd.DataFrame(
    np.random.RandomState(0).rand(3, 2),
    index=["a", "b", "c"],
    columns=["foo", "bar"],
)
random_frame
Out[15]:
|
foo |
bar |
a |
0.196654 |
0.866753 |
b |
0.175826 |
0.056842 |
c |
0.356369 |
0.272122 |
In [16]:
# A Series is a one-dimensional array with an index; here the labels are given explicitly.
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0], index=["a", "b", "c", "d"])
data
Out[16]:
a 0.25
b 0.50
c 0.75
d 1.00
dtype: float64
In [17]:
# Assigning to a label that does not exist yet adds a new element (mutates `data` in place).
data["e"]=1.25
data
Out[17]:
a 0.25
b 0.50
c 0.75
d 1.00
e 1.25
dtype: float64
In [18]:
# Slicing.
# With explicit labels the end point is included: "a":"c" returns a, b and c.
data["a":"c"]
Out[18]:
a 0.25
b 0.50
c 0.75
dtype: float64
In [19]:
# With integer positions the end point is excluded: 0:2 returns the first two elements.
data[0:2]
Out[19]:
a 0.25
b 0.50
dtype: float64
In [20]:
# Boolean mask: keep only the elements strictly between 0.3 and 0.8.
data[(data>0.3)&(data<0.8)]
Out[20]:
b 0.50
c 0.75
dtype: float64
In [21]:
# Integer labels that do not match the positions (1, 3, 5), used in the next cells
# to contrast label-based .loc with position-based .iloc.
data = pd.Series(data=["a", "b", "c"], index=[1, 3, 5])
data
Out[21]:
1 a
3 b
5 c
dtype: object
In [22]:
# .loc uses the explicit index, i.e. the labels shown in the output: label 1 -> 'a'.
data.loc[1]
Out[22]:
'a'
In [23]:
# A .loc slice is label-based and includes the end label: labels 1 through 3.
data.loc[1:3]
Out[23]:
1 a
3 b
dtype: object
In [24]:
# .iloc uses the implicit index, i.e. the zero-based position: position 1 -> 'b'.
data.iloc[1]
Out[24]:
'b'
In [25]:
# An .iloc slice is positional and excludes the end: positions 1 and 2.
data.iloc[1:3]
Out[25]:
3 b
5 c
dtype: object
In [26]:
# Data selection in a DataFrame.
# Two Series that share the same state labels are combined into one frame with
# the columns "area" and "pop".
area = pd.Series(
    data={
        "california": 423967,
        "texas": 695663,
        "new york": 141297,
        "florida": 170312,
        "tllinois": 149995,
    }
)
pop = pd.Series(
    data={
        "california": 33323967,
        "texas": 33395663,
        "new york": 11111297,
        "florida": 22220312,
        "tllinois": 22229995,
    }
)
data = pd.DataFrame(data={"area": area, "pop": pop})
data
Out[26]:
|
area |
pop |
california |
423967 |
33323967 |
florida |
170312 |
22220312 |
new york |
141297 |
11111297 |
texas |
695663 |
33395663 |
tllinois |
149995 |
22229995 |
In [27]:
# Dict-style column access: returns the "area" column as a Series.
data["area"]
Out[27]:
california 423967
florida 170312
new york 141297
texas 695663
tllinois 149995
Name: area, dtype: int64
In [28]:
# Attribute-style access to the same column.
# NOTE: this shortcut only works when the column label is a valid identifier that does
# not clash with a DataFrame attribute; e.g. `data.pop` is the DataFrame.pop method,
# not the "pop" column, which is why this notebook writes data["pop"].
data.area
Out[28]:
california 423967
florida 170312
new york 141297
texas 695663
tllinois 149995
Name: area, dtype: int64
In [29]:
# Add a column: element-wise division of two existing columns (mutates `data` in place).
data["density"]=data["pop"]/data["area"]
data
Out[29]:
|
area |
pop |
density |
california |
423967 |
33323967 |
78.600379 |
florida |
170312 |
22220312 |
130.468270 |
new york |
141297 |
11111297 |
78.637883 |
texas |
695663 |
33395663 |
48.005518 |
tllinois |
149995 |
22229995 |
148.204907 |
In [30]:
# Inspect the raw data as a two-dimensional NumPy array.
data.values
Out[30]:
array([[4.23967000e+05, 3.33239670e+07, 7.86003793e+01],
[1.70312000e+05, 2.22203120e+07, 1.30468270e+02],
[1.41297000e+05, 1.11112970e+07, 7.86378833e+01],
[6.95663000e+05, 3.33956630e+07, 4.80055185e+01],
[1.49995000e+05, 2.22299950e+07, 1.48204907e+02]])
In [31]:
# Transpose: rows become columns and columns become rows.
data.T
Out[31]:
|
california |
florida |
new york |
texas |
tllinois |
area |
4.239670e+05 |
1.703120e+05 |
1.412970e+05 |
6.956630e+05 |
1.499950e+05 |
pop |
3.332397e+07 |
2.222031e+07 |
1.111130e+07 |
3.339566e+07 |
2.223000e+07 |
density |
7.860038e+01 |
1.304683e+02 |
7.863788e+01 |
4.800552e+01 |
1.482049e+02 |
In [32]:
# .loc with a boolean row mask and a list of column labels:
# rows whose density exceeds 100, restricted to the "pop" and "density" columns.
data.loc[data.density>100,["pop","density"]]
Out[32]:
|
pop |
density |
florida |
22220312 |
130.468270 |
tllinois |
22229995 |
148.204907 |
In [33]:
# Modify data by position: row 0, column 2 is set to 90 (mutates `data` in place).
# In the recorded output this cell is california's density.
data.iloc[0,2]=90
data
Out[33]:
|
area |
pop |
density |
california |
423967 |
33323967 |
90.000000 |
florida |
170312 |
22220312 |
130.468270 |
new york |
141297 |
11111297 |
78.637883 |
texas |
695663 |
33395663 |
48.005518 |
tllinois |
149995 |
22229995 |
148.204907 |
In [34]:
# Mask filtering: a boolean Series inside [] selects rows, here those with density > 100.
data[data.density>100]
Out[34]:
|
area |
pop |
density |
florida |
170312 |
22220312 |
130.468270 |
tllinois |
149995 |
22229995 |
148.204907 |
In [35]:
# Numerical operations in pandas.
# NOTE(review): numpy and pandas are already imported in the first cell of this notebook,
# so this repeated import is redundant.
import numpy as np
import pandas as pd
In [36]:
# Seeded generator so the random examples below are reproducible.
rng = np.random.RandomState(42)
# Four random integers drawn from [0, 10).
ser = pd.Series(data=rng.randint(0, 10, size=4))
ser
Out[36]:
0 6
1 3
2 7
3 4
dtype: int32
In [38]:
# 3x4 table of random integers in [0, 10), drawn from the shared `rng`, with columns A-D.
df = pd.DataFrame(
    data=rng.randint(0, 10, size=(3, 4)),
    columns=["A", "B", "C", "D"],
)
df
Out[38]:
|
A |
B |
C |
D |
0 |
1 |
7 |
5 |
1 |
1 |
4 |
0 |
9 |
5 |
2 |
8 |
0 |
9 |
2 |
In [39]:
# A NumPy ufunc applied to a Series returns a Series that keeps the original index.
np.exp(ser)
Out[39]:
0 403.428793
1 20.085537
2 1096.633158
3 54.598150
dtype: float64
In [40]:
# The same holds for a DataFrame: the result keeps the row index and column labels.
# Each entry is scaled by pi/4 before taking the sine.
np.sin(df*np.pi/4)
Out[40]:
|
A |
B |
C |
D |
0 |
7.071068e-01 |
-0.707107 |
-0.707107 |
0.707107 |
1 |
1.224647e-16 |
0.000000 |
0.707107 |
-0.707107 |
2 |
-2.449294e-16 |
0.000000 |
0.707107 |
1.000000 |
In [45]:
# Element-wise floor division of the Series by 2.
ser//2
Out[45]:
0 3
1 1
2 3
3 2
dtype: int32