pandas 模块

2023-11-14 16:21:58

numpy 与pandas In [46]:

import numpy as np
import pandas as pd
np.add()/df.add() 加法运算    +
np.aubtract()/df.aubtract()  减法运算  -
np.megative() 负数运算 -
np.multiply()  乘法运算 *
np.divide()    除法运算 /
np.floor_dicide()  向下整除 //
np.power()     指数运算 **
np.mod()      求余数 %
np.abs()    求绝对值
np.sin() ,cos(), tan()  求正弦  余弦 正切
np.exp(x) e为底的x次方  np.exp2(x),2为底的x次方
np.log(x) e为底对x开根号  np.log2（x） 2为底
np.sum（） min（）  max（） 求和   求最小值  求最大值   axis=0 列   axis=1  行
np.prod() 计算元素的积
np.mean() 计算元素的平均值
np.std()  计算元素的标准差
np.var（） 计算元素的方差
np.argmin（）找出最小值的索引
np.median（）计算元素的中位数
np.any（）  验证是否存在元素为真
np.all()    验证所有元素是否为真

Out[46]:

3.0

In [2]:

data=pd.Series([0.25,0.5,0.75,1.0])#Series是一个带索引的一维数组
data

Out[2]:

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [3]:

data.index#可获取索引与值

Out[3]:

RangeIndex(start=0, stop=4, step=1)

In [4]:

data.values

Out[4]:

array([0.25, 0.5 , 0.75, 1.  ])

In [5]:

data[0]#可切片

Out[5]:

0.25

In [6]:

data.index=["a","b","c","d"]#可定义索引
data

Out[6]:

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [7]:

population_dic={"california":423967,"texas":695663,"new york":141297,"florida":170312,"tllinois":149995}
population=pd.Series(population_dic)
population

Out[7]:

california    423967
florida       170312
new york      141297
texas         695663
tllinois      149995
dtype: int64

In [8]:

area_dict={"california":423967,"texas":695663,"new york":141297,"florida":170312,"tllinois":149995}
aeea=pd.Series(area_dict)
aeea

Out[8]:

california    423967
florida       170312
new york      141297
texas         695663
tllinois      149995
dtype: int64

In [9]:

states=pd.DataFrame({"population":population,"aeea":aeea})#DataFrom是特殊的二维字典，一个标题行对应一列数据，创建时用字典格式创建
states

Out[9]:

	aeea	population
california	423967	423967
florida	170312	170312
new york	141297	141297
texas	695663	695663
tllinois	149995	149995

In [10]:

states.index

Out[10]:

Index(['california', 'florida', 'new york', 'texas', 'tllinois'], dtype='object')

In [11]:

states.columns

Out[11]:

Index(['aeea', 'population'], dtype='object')

In [12]:

states["aeea"]

Out[12]:

california    423967
florida       170312
new york      141297
texas         695663
tllinois      149995
Name: aeea, dtype: int64

In [13]:

pd.DataFrame(population,columns=["population"])#创建单列

Out[13]:

	population
california	423967
florida	170312
new york	141297
texas	695663
tllinois	149995

In [14]:

data=[{"a":i,"b":i*2}for i in range(5)]
pd.DataFrame(data)

Out[14]:

	a	b
0	0	0
1	1	2
2	2	4
3	3	6
4	4	8

In [15]:

pd.DataFrame(np.random.rand(3,2),index=(["a","b","c"]),columns=["foo","bar"])

Out[15]:

	foo	bar
a	0.196654	0.866753
b	0.175826	0.056842
c	0.356369	0.272122

In [16]:

data=pd.Series([0.25,0.5,0.75,1.0],index=["a","b","c","d"])#Series是一个带索引的一维数组
data

Out[16]:

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [17]:

data["e"]=1.25#添加数据
data

Out[17]:

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

In [18]:

#切片应用

data["a":"c"]

Out[18]:

a    0.25
b    0.50
c    0.75
dtype: float64

In [19]:

data[0:2]

Out[19]:

a    0.25
b    0.50
dtype: float64

In [20]:

data[(data>0.3)&(data<0.8)]

Out[20]:

b    0.50
c    0.75
dtype: float64

In [21]:

data=pd.Series(["a","b","c"],index=[1,3,5])
data

Out[21]:

1    a
3    b
5    c
dtype: object

In [22]:

data.loc[1]#看得见的索引

Out[22]:

'a'

In [23]:

data.loc[1:3]

Out[23]:

1    a
3    b
dtype: object

In [24]:

data.iloc[1]#看不见的索引

Out[24]:

'b'

In [25]:

data.iloc[1:3]

Out[25]:

3    b
5    c
dtype: object

In [26]:

#DataFrame的数据选择方法
area=pd.Series({"california":423967,"texas":695663,"new york":141297,"florida":170312,"tllinois":149995})
pop=pd.Series({"california":33323967,"texas":33395663,"new york":11111297,"florida":22220312,"tllinois":22229995})
data=pd.DataFrame({"area":area,"pop":pop})
data

Out[26]:

	area	pop
california	423967	33323967
florida	170312	22220312
new york	141297	11111297
texas	695663	33395663
tllinois	149995	22229995

In [27]:

data["area"]

Out[27]:

california    423967
florida       170312
new york      141297
texas         695663
tllinois      149995
Name: area, dtype: int64

In [28]:

data.area

Out[28]:

california    423967
florida       170312
new york      141297
texas         695663
tllinois      149995
Name: area, dtype: int64

In [29]:

data["density"]=data["pop"]/data["area"]#增加一列

data

Out[29]:

	area	pop	density
california	423967	33323967	78.600379
florida	170312	22220312	130.468270
new york	141297	11111297	78.637883
texas	695663	33395663	48.005518
tllinois	149995	22229995	148.204907

In [30]:

data.values#查看数据

Out[30]:

array([[4.23967000e+05, 3.33239670e+07, 7.86003793e+01],
       [1.70312000e+05, 2.22203120e+07, 1.30468270e+02],
       [1.41297000e+05, 1.11112970e+07, 7.86378833e+01],
       [6.95663000e+05, 3.33956630e+07, 4.80055185e+01],
       [1.49995000e+05, 2.22299950e+07, 1.48204907e+02]])

In [31]:

data.T#转置

Out[31]:

	california	florida	new york	texas	tllinois
area	4.239670e+05	1.703120e+05	1.412970e+05	6.956630e+05	1.499950e+05
pop	3.332397e+07	2.222031e+07	1.111130e+07	3.339566e+07	2.223000e+07
density	7.860038e+01	1.304683e+02	7.863788e+01	4.800552e+01	1.482049e+02

In [32]:

data.loc[data.density>100,["pop","density"]]

Out[32]:

	pop	density
florida	22220312	130.468270
tllinois	22229995	148.204907

In [33]:

data.iloc[0,2]=90#修改数据
data

Out[33]:

	area	pop	density
california	423967	33323967	90.000000
florida	170312	22220312	130.468270
new york	141297	11111297	78.637883
texas	695663	33395663	48.005518
tllinois	149995	22229995	148.204907

In [34]:

data[data.density>100]#掩码过滤

Out[34]:

	area	pop	density
florida	170312	22220312	130.468270
tllinois	149995	22229995	148.204907

In [35]:

#pandas 的数值运算方法
import numpy as np
import pandas as pd

In [36]:

rng=np.random.RandomState(42)
ser=pd.Series(rng.randint(0,10,4))
ser

Out[36]:

0    6
1    3
2    7
3    4
dtype: int32

In [38]:

df=pd.DataFrame(rng.randint(0,10,(3,4)),columns=["A","B","C","D"])
df

Out[38]:

	A	B	C	D
0	1	7	5	1
1	4	0	9	5
2	8	0	9	2

In [39]:

np.exp(ser)

Out[39]:

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [40]:

np.sin(df*np.pi/4)

Out[40]:

	A	B	C	D
0	7.071068e-01	-0.707107	-0.707107	0.707107
1	1.224647e-16	0.000000	0.707107	-0.707107
2	-2.449294e-16	0.000000	0.707107	1.000000

In [45]:

ser//2

Out[45]:

0    3
1    1
2    3
3    2
dtype: int32

码农公寓

相关文章