pandas 模块

numpy 与pandas In [46]:  
import numpy as np
import pandas as pd
np.add()/df.add() 加法运算    +
np.subtract()/df.subtract()  减法运算  -
np.negative() 取负(相反数)运算 -
np.multiply()  乘法运算 *
np.divide()    除法运算 /
np.floor_divide()  向下整除 //
np.power()     指数运算 **
np.mod()      求余数 %
np.abs()    求绝对值
np.sin() ,cos(), tan()  求正弦  余弦 正切
np.exp(x) e为底的x次方  np.exp2(x),2为底的x次方
np.log(x) 以e为底的x的对数(自然对数)  np.log2(x) 以2为底的x的对数
np.sum() min()  max() 求和   求最小值  求最大值   axis=0 沿行方向聚合(每列得到一个结果)   axis=1 沿列方向聚合(每行得到一个结果)
np.prod() 计算元素的积
np.mean() 计算元素的平均值
np.std()  计算元素的标准差
np.var() 计算元素的方差
np.argmin()找出最小值的索引
np.median()计算元素的中位数
np.any()  验证是否存在元素为真
np.all()    验证所有元素是否为真

Out[46]:

3.0
In [2]:
# A Series is a one-dimensional array whose values carry an index.
data=pd.Series([0.25,0.5,0.75,1.0])
data
Out[2]:
0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64
In [3]:
# The index and the values of a Series can each be read as attributes; this is the index.
data.index
Out[3]:
RangeIndex(start=0, stop=4, step=1)
In [4]:
# The values of the Series as a NumPy array.
data.values
Out[4]:
array([0.25, 0.5 , 0.75, 1.  ])
In [5]:
# Look up a single value by its index label (label 0 of the default integer index).
data[0]
Out[5]:
0.25
In [6]:
# The index can be replaced with custom labels.
data.index=["a","b","c","d"]
data
Out[6]:
a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
In [7]:
# Build a Series straight from a dict: the dict keys become the index labels.
population_dic = {
    "california": 423967,
    "texas": 695663,
    "new york": 141297,
    "florida": 170312,
    "tllinois": 149995,
}
population = pd.Series(population_dic)
population
  Out[7]:
california    423967
florida       170312
new york      141297
texas         695663
tllinois      149995
dtype: int64
In [8]:
# Second Series built from a dict, used as the other column of the DataFrame created next.
# NOTE(review): `aeea` looks like a typo for `area`, and these figures are identical to the
# population dict in the previous cell -- confirm the intended data.
area_dict={"california":423967,"texas":695663,"new york":141297,"florida":170312,"tllinois":149995}
aeea=pd.Series(area_dict)
aeea
Out[8]:
california    423967
florida       170312
new york      141297
texas         695663
tllinois      149995
dtype: int64
In [9]:
# A DataFrame behaves like a two-dimensional dict: each column label maps to one column
# of data, so it can be built from a dict of Series.
states=pd.DataFrame({"population":population,"aeea":aeea})
states
  Out[9]:
  aeea population
california 423967 423967
florida 170312 170312
new york 141297 141297
texas 695663 695663
tllinois 149995 149995
In [10]:
# Row labels of the DataFrame.
states.index
Out[10]:
Index(['california', 'florida', 'new york', 'texas', 'tllinois'], dtype='object')
In [11]:
# Column labels of the DataFrame.
states.columns
Out[11]:
Index(['aeea', 'population'], dtype='object')
In [12]:
# Dict-style access by column label returns that column as a Series.
states["aeea"]
Out[12]:
california    423967
florida       170312
new york      141297
texas         695663
tllinois      149995
Name: aeea, dtype: int64
In [13]:
# Build a single-column DataFrame from one Series.
pd.DataFrame(population,columns=["population"])

Out[13]:

  population
california 423967
florida 170312
new york 141297
texas 695663
tllinois 149995
In [14]:
# Build a DataFrame from a list of dicts: each dict is one row and its keys are the column labels.
# The list gets its own name so it does not rebind `data`, which holds a Series up to this point.
records = [{"a": i, "b": i * 2} for i in range(5)]
pd.DataFrame(records)
Out[14]:
  a b
0 0 0
1 1 2
2 2 4
3 3 6
4 4 8
In [15]:
# Build a DataFrame from a 2-D NumPy array, supplying the row and column labels explicitly.
# NOTE(review): no seed is set in the visible cells, so these values change on every run.
pd.DataFrame(np.random.rand(3,2),index=(["a","b","c"]),columns=["foo","bar"])
Out[15]:
  foo bar
a 0.196654 0.866753
b 0.175826 0.056842
c 0.356369 0.272122
In [16]:
# A Series is a one-dimensional array with an index; here the labels are given explicitly.
data=pd.Series([0.25,0.5,0.75,1.0],index=["a","b","c","d"])
data
  Out[16]:
a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
In [17]:
# Assigning to a new label adds an entry, dict-style (this mutates `data` in place).
data["e"]=1.25
data
Out[17]:
a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64
In [18]:
# Slicing with explicit index labels: the end label "c" is included in the result.
data["a":"c"]
Out[18]:
a    0.25
b    0.50
c    0.75
dtype: float64
In [19]:
# Slicing with integer positions: the end position 2 is excluded from the result.
data[0:2]
Out[19]:
a    0.25
b    0.50
dtype: float64
In [20]:
# Boolean-mask selection: keep the entries strictly between 0.3 and 0.8.
data[(data>0.3)&(data<0.8)]

Out[20]:

b    0.50
c    0.75
dtype: float64
In [21]:
# Integer labels (1, 3, 5) that differ from the positions (0, 1, 2), so that label-based
# and position-based access give visibly different results.
data=pd.Series(["a","b","c"],index=[1,3,5])
data
Out[21]:
1    a
3    b
5    c
dtype: object
In [22]:
# .loc looks up by the explicit index label: label 1 -> 'a'.
data.loc[1]
Out[22]:
'a'
In [23]:
# Label-based slice: the end label 3 is included.
data.loc[1:3]
Out[23]:
1    a
3    b
dtype: object
In [24]:
# .iloc looks up by implicit integer position: position 1 -> 'b'.
data.iloc[1]

Out[24]:

'b'
In [25]:
# Position-based slice: the end position 3 is excluded.
data.iloc[1:3]
Out[25]:
3    b
5    c
dtype: object
In [26]:
# Data-selection methods on a DataFrame: first build a small frame to experiment with.
area = pd.Series(
    {
        "california": 423967,
        "texas": 695663,
        "new york": 141297,
        "florida": 170312,
        "tllinois": 149995,
    }
)
pop = pd.Series(
    {
        "california": 33323967,
        "texas": 33395663,
        "new york": 11111297,
        "florida": 22220312,
        "tllinois": 22229995,
    }
)
# Each Series becomes one column; rows are aligned on the shared index labels.
data = pd.DataFrame(dict(area=area, pop=pop))
data
Out[26]:
  area pop
california 423967 33323967
florida 170312 22220312
new york 141297 11111297
texas 695663 33395663
tllinois 149995 22229995
In [27]:
# Dict-style access by column label returns the column as a Series.
data["area"]

Out[27]:

california    423967
florida       170312
new york      141297
texas         695663
tllinois      149995
Name: area, dtype: int64
In [28]:
# Attribute-style access returns the same column as data["area"].
# Caution: this does not work for every column -- `data.pop` resolves to the
# DataFrame.pop method rather than the "pop" column, so prefer data["pop"].
data.area
Out[28]:
california    423967
florida       170312
new york      141297
texas         695663
tllinois      149995
Name: area, dtype: int64
In [29]:
# Add a derived column: element-wise ratio of two existing columns (mutates `data` in place).
data["density"]=data["pop"]/data["area"]

data
Out[29]:
  area pop density
california 423967 33323967 78.600379
florida 170312 22220312 130.468270
new york 141297 11111297 78.637883
texas 695663 33395663 48.005518
tllinois 149995 22229995 148.204907
In [30]:
# The underlying data as a two-dimensional NumPy array.
data.values
Out[30]:
array([[4.23967000e+05, 3.33239670e+07, 7.86003793e+01],
       [1.70312000e+05, 2.22203120e+07, 1.30468270e+02],
       [1.41297000e+05, 1.11112970e+07, 7.86378833e+01],
       [6.95663000e+05, 3.33956630e+07, 4.80055185e+01],
       [1.49995000e+05, 2.22299950e+07, 1.48204907e+02]])
In [31]:
# Transpose: rows and columns are swapped.
data.T

Out[31]:

  california florida new york texas tllinois
area 4.239670e+05 1.703120e+05 1.412970e+05 6.956630e+05 1.499950e+05
pop 3.332397e+07 2.222031e+07 1.111130e+07 3.339566e+07 2.223000e+07
density 7.860038e+01 1.304683e+02 7.863788e+01 4.800552e+01 1.482049e+02
In [32]:
# Combine a boolean row mask with a list of column labels in a single .loc call.
data.loc[data.density>100,["pop","density"]]
Out[32]:
  pop density
florida 22220312 130.468270
tllinois 22229995 148.204907
In [33]:
# Overwrite one cell by position (row 0, column 2); this mutates `data` in place.
data.iloc[0,2]=90
data

 

Out[33]:
  area pop density
california 423967 33323967 90.000000
florida 170312 22220312 130.468270
new york 141297 11111297 78.637883
texas 695663 33395663 48.005518
tllinois 149995 22229995 148.204907
In [34]:
# Boolean-mask filtering: keep only the rows whose density exceeds 100.
data[data.density>100]

Out[34]:

  area pop density
florida 170312 22220312 130.468270
tllinois 149995 22229995 148.204907
In [35]:
#pandas 的数值运算方法
import numpy as np
import pandas as pd
In [36]:
# Seeded legacy generator, so the random integers below are reproducible from a fresh kernel.
rng = np.random.RandomState(seed=42)
# Four integers drawn from the half-open range [0, 10).
ser = pd.Series(rng.randint(low=0, high=10, size=4))
ser
 

Out[36]:

0    6
1    3
2    7
3    4
dtype: int32
In [38]:
# 3x4 frame of integers drawn from [0, 10) with the shared generator `rng`.
# NOTE(review): the values depend on how many draws `rng` has already produced, so
# re-running this cell without re-creating `rng` yields different numbers.
df = pd.DataFrame(
    rng.randint(low=0, high=10, size=(3, 4)),
    columns=["A", "B", "C", "D"],
)
df
  Out[38]:
  A B C D
0 1 7 5 1
1 4 0 9 5
2 8 0 9 2
In [39]:
# A NumPy ufunc applied to a Series works element-wise and keeps the index.
np.exp(ser)
Out[39]:
0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64
In [40]:
# Arithmetic and ufuncs on a DataFrame are element-wise and keep the row/column labels.
np.sin(df*np.pi/4)
Out[40]:
  A B C D
0 7.071068e-01 -0.707107 -0.707107 0.707107
1 1.224647e-16 0.000000 0.707107 -0.707107
2 -2.449294e-16 0.000000 0.707107 1.000000
In [45]:
# Floor division by 2, applied to every element.
ser//2
Out[45]:
0    3
1    1
2    3
3    2
dtype: int32
上一篇:Pytorch之Tensor与NumPy数据类型转换


下一篇:NumPy-ndarray 的数据类型