- 需求:
- 导入文件,查看原始数据
- 将人口数据和各州简称数据进行合并
- 将合并的数据中重复的abbreviation列进行删除
- 查看存在缺失数据的列
- 找到有哪些state/region使得state的值为NaN,进行去重操作
- 为找到的这些state/region的state项补上正确的值,从而去除掉state这一列的所有NaN
- 合并各州面积数据areas
- 我们会发现area(sq.mi)这一列有缺失数据,找出是哪些行
- 去除含有缺失数据的行
- 找出2010年的全民人口数据
- 计算各州的人口密度
- 排序,并找出人口密度最高的五个州 df.sort_values()
In [1]:
import numpy as np
from pandas import DataFrame,Series
import pandas as pd
In [3]:
abb = pd.read_csv('./data/state-abbrevs.csv')
pop = pd.read_csv('./data/state-population.csv')
area = pd.read_csv('./data/state-areas.csv')
In [8]:
#将人口数据和各州简称数据进行合并
display(abb.head(1),pop.head(1))
abb_pop = pd.merge(abb,pop,left_on='abbreviation',right_on='state/region',how='outer')
abb_pop.head()
|
state |
abbreviation |
0 |
Alabama |
AL |
|
state/region |
ages |
year |
population |
0 |
AL |
under18 |
2012 |
1117489.0 |
Out[8]:
|
state |
abbreviation |
state/region |
ages |
year |
population |
0 |
Alabama |
AL |
AL |
under18 |
2012 |
1117489.0 |
1 |
Alabama |
AL |
AL |
total |
2012 |
4817528.0 |
2 |
Alabama |
AL |
AL |
under18 |
2010 |
1130966.0 |
3 |
Alabama |
AL |
AL |
total |
2010 |
4785570.0 |
4 |
Alabama |
AL |
AL |
under18 |
2011 |
1125763.0 |
In [9]:
#将合并的数据中重复的abbreviation列进行删除
abb_pop.drop(labels='abbreviation',axis=1,inplace=True)
abb_pop.head()
Out[9]:
|
state |
state/region |
ages |
year |
population |
0 |
Alabama |
AL |
under18 |
2012 |
1117489.0 |
1 |
Alabama |
AL |
total |
2012 |
4817528.0 |
2 |
Alabama |
AL |
under18 |
2010 |
1130966.0 |
3 |
Alabama |
AL |
total |
2010 |
4785570.0 |
4 |
Alabama |
AL |
under18 |
2011 |
1125763.0 |
In [10]:
#查看存在缺失数据的列
abb_pop.isnull().any(axis=0)
Out[10]:
state True
state/region False
ages False
year False
population True
dtype: bool
In [11]:
#找到有哪些state/region使得state的值为NaN,进行去重操作
In [13]:
#1.检测state列中的空值
abb_pop['state'].isnull()
Out[13]:
0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 False
8 False
9 False
10 False
11 False
12 False
13 False
14 False
15 False
16 False
17 False
18 False
19 False
20 False
21 False
22 False
23 False
24 False
25 False
26 False
27 False
28 False
29 False
...
2514 True
2515 True
2516 True
2517 True
2518 True
2519 True
2520 True
2521 True
2522 True
2523 True
2524 True
2525 True
2526 True
2527 True
2528 True
2529 True
2530 True
2531 True
2532 True
2533 True
2534 True
2535 True
2536 True
2537 True
2538 True
2539 True
2540 True
2541 True
2542 True
2543 True
Name: state, Length: 2544, dtype: bool
In [15]:
#2.将1的返回值作用的state_region这一列中
abb_pop['state/region'][abb_pop['state'].isnull()]
Out[15]:
2448 PR
2449 PR
2450 PR
2451 PR
2452 PR
2453 PR
2454 PR
2455 PR
2456 PR
2457 PR
2458 PR
2459 PR
2460 PR
2461 PR
2462 PR
2463 PR
2464 PR
2465 PR
2466 PR
2467 PR
2468 PR
2469 PR
2470 PR
2471 PR
2472 PR
2473 PR
2474 PR
2475 PR
2476 PR
2477 PR
...
2514 USA
2515 USA
2516 USA
2517 USA
2518 USA
2519 USA
2520 USA
2521 USA
2522 USA
2523 USA
2524 USA
2525 USA
2526 USA
2527 USA
2528 USA
2529 USA
2530 USA
2531 USA
2532 USA
2533 USA
2534 USA
2535 USA
2536 USA
2537 USA
2538 USA
2539 USA
2540 USA
2541 USA
2542 USA
2543 USA
Name: state/region, Length: 96, dtype: object
In [29]:
#3.去重
abb_pop['state/region'][abb_pop['state'].isnull()].unique()
Out[29]:
array([], dtype=object)
In [19]:
#为找到的这些state/region的state项补上正确的值,从而去除掉state这一列的所有NaN
abb_pop['state/region'] == 'USA'
Out[19]:
0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 False
8 False
9 False
10 False
11 False
12 False
13 False
14 False
15 False
16 False
17 False
18 False
19 False
20 False
21 False
22 False
23 False
24 False
25 False
26 False
27 False
28 False
29 False
...
2514 True
2515 True
2516 True
2517 True
2518 True
2519 True
2520 True
2521 True
2522 True
2523 True
2524 True
2525 True
2526 True
2527 True
2528 True
2529 True
2530 True
2531 True
2532 True
2533 True
2534 True
2535 True
2536 True
2537 True
2538 True
2539 True
2540 True
2541 True
2542 True
2543 True
Name: state/region, Length: 2544, dtype: bool
In [23]:
indexs = abb_pop['state'][abb_pop['state/region'] == 'USA'].index
In [24]:
abb_pop.loc[indexs,'state'] = 'United State'
In [27]:
pr_index = abb_pop['state'][abb_pop['state/region'] == 'PR'].index
In [28]:
abb_pop.loc[pr_index,'state'] = 'PPPRRR'
合并各州面积数据areas 我们会发现area(sq.mi)这一列有缺失数据,找出是哪些行 去除含有缺失数据的行 找出2010年的全民人口数据 计算各州的人口密度 排序,并找出人口密度最高的五个州 df.sort_values()
In [31]:
#合并各州面积数据areas
abb_pop_area = pd.merge(abb_pop,area,how='outer')
abb_pop_area.head()
Out[31]:
|
state |
state/region |
ages |
year |
population |
area (sq. mi) |
0 |
Alabama |
AL |
under18 |
2012.0 |
1117489.0 |
52423.0 |
1 |
Alabama |
AL |
total |
2012.0 |
4817528.0 |
52423.0 |
2 |
Alabama |
AL |
under18 |
2010.0 |
1130966.0 |
52423.0 |
3 |
Alabama |
AL |
total |
2010.0 |
4785570.0 |
52423.0 |
4 |
Alabama |
AL |
under18 |
2011.0 |
1125763.0 |
52423.0 |
In [32]:
#我们会发现area(sq.mi)这一列有缺失数据,找出是哪些行
abb_pop_area['area (sq. mi)'].isnull()
Out[32]:
0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 False
8 False
9 False
10 False
11 False
12 False
13 False
14 False
15 False
16 False
17 False
18 False
19 False
20 False
21 False
22 False
23 False
24 False
25 False
26 False
27 False
28 False
29 False
...
2515 True
2516 True
2517 True
2518 True
2519 True
2520 True
2521 True
2522 True
2523 True
2524 True
2525 True
2526 True
2527 True
2528 True
2529 True
2530 True
2531 True
2532 True
2533 True
2534 True
2535 True
2536 True
2537 True
2538 True
2539 True
2540 True
2541 True
2542 True
2543 True
2544 False
Name: area (sq. mi), Length: 2545, dtype: bool
In [35]:
a_index = abb_pop_area.loc[abb_pop_area['area (sq. mi)'].isnull()].index
In [36]:
#去除含有缺失数据的行
abb_pop_area.drop(labels=a_index,axis=0,inplace=True)
In [38]:
#找出2010年的全民人口数据
abb_pop_area.query('year == 2010 & ages == "total"')
Out[38]:
|
state |
state/region |
ages |
year |
population |
area (sq. mi) |
3 |
Alabama |
AL |
total |
2010.0 |
4785570.0 |
52423.0 |
91 |
Alaska |
AK |
total |
2010.0 |
713868.0 |
656425.0 |
101 |
Arizona |
AZ |
total |
2010.0 |
6408790.0 |
114006.0 |
189 |
Arkansas |
AR |
total |
2010.0 |
2922280.0 |
53182.0 |
197 |
California |
CA |
total |
2010.0 |
37333601.0 |
163707.0 |
283 |
Colorado |
CO |
total |
2010.0 |
5048196.0 |
104100.0 |
293 |
Connecticut |
CT |
total |
2010.0 |
3579210.0 |
5544.0 |
379 |
Delaware |
DE |
total |
2010.0 |
899711.0 |
1954.0 |
389 |
District of Columbia |
DC |
total |
2010.0 |
605125.0 |
68.0 |
475 |
Florida |
FL |
total |
2010.0 |
18846054.0 |
65758.0 |
485 |
Georgia |
GA |
total |
2010.0 |
9713248.0 |
59441.0 |
570 |
Hawaii |
HI |
total |
2010.0 |
1363731.0 |
10932.0 |
581 |
Idaho |
ID |
total |
2010.0 |
1570718.0 |
83574.0 |
666 |
Illinois |
IL |
total |
2010.0 |
12839695.0 |
57918.0 |
677 |
Indiana |
IN |
total |
2010.0 |
6489965.0 |
36420.0 |
762 |
Iowa |
IA |
total |
2010.0 |
3050314.0 |
56276.0 |
773 |
Kansas |
KS |
total |
2010.0 |
2858910.0 |
82282.0 |
858 |
Kentucky |
KY |
total |
2010.0 |
4347698.0 |
40411.0 |
869 |
Louisiana |
LA |
total |
2010.0 |
4545392.0 |
51843.0 |
954 |
Maine |
ME |
total |
2010.0 |
1327366.0 |
35387.0 |
965 |
Montana |
MT |
total |
2010.0 |
990527.0 |
147046.0 |
1050 |
Nebraska |
NE |
total |
2010.0 |
1829838.0 |
77358.0 |
1061 |
Nevada |
NV |
total |
2010.0 |
2703230.0 |
110567.0 |
1146 |
New Hampshire |
NH |
total |
2010.0 |
1316614.0 |
9351.0 |
1157 |
New Jersey |
NJ |
total |
2010.0 |
8802707.0 |
8722.0 |
1242 |
New Mexico |
NM |
total |
2010.0 |
2064982.0 |
121593.0 |
1253 |
New York |
NY |
total |
2010.0 |
19398228.0 |
54475.0 |
1338 |
North Carolina |
NC |
total |
2010.0 |
9559533.0 |
53821.0 |
1349 |
North Dakota |
ND |
total |
2010.0 |
674344.0 |
70704.0 |
1434 |
Ohio |
OH |
total |
2010.0 |
11545435.0 |
44828.0 |
1445 |
Oklahoma |
OK |
total |
2010.0 |
3759263.0 |
69903.0 |
1530 |
Oregon |
OR |
total |
2010.0 |
3837208.0 |
98386.0 |
1541 |
Maryland |
MD |
total |
2010.0 |
5787193.0 |
12407.0 |
1626 |
Massachusetts |
MA |
total |
2010.0 |
6563263.0 |
10555.0 |
1637 |
Michigan |
MI |
total |
2010.0 |
9876149.0 |
96810.0 |
1722 |
Minnesota |
MN |
total |
2010.0 |
5310337.0 |
86943.0 |
1733 |
Mississippi |
MS |
total |
2010.0 |
2970047.0 |
48434.0 |
1818 |
Missouri |
MO |
total |
2010.0 |
5996063.0 |
69709.0 |
1829 |
Pennsylvania |
PA |
total |
2010.0 |
12710472.0 |
46058.0 |
1914 |
Rhode Island |
RI |
total |
2010.0 |
1052669.0 |
1545.0 |
1925 |
South Carolina |
SC |
total |
2010.0 |
4636361.0 |
32007.0 |
2010 |
South Dakota |
SD |
total |
2010.0 |
816211.0 |
77121.0 |
2021 |
Tennessee |
TN |
total |
2010.0 |
6356683.0 |
42146.0 |
2106 |
Texas |
TX |
total |
2010.0 |
25245178.0 |
268601.0 |
2117 |
Utah |
UT |
total |
2010.0 |
2774424.0 |
84904.0 |
2202 |
Vermont |
VT |
total |
2010.0 |
625793.0 |
9615.0 |
2213 |
Virginia |
VA |
total |
2010.0 |
8024417.0 |
42769.0 |
2298 |
Washington |
WA |
total |
2010.0 |
6742256.0 |
71303.0 |
2309 |
West Virginia |
WV |
total |
2010.0 |
1854146.0 |
24231.0 |
2394 |
Wisconsin |
WI |
total |
2010.0 |
5689060.0 |
65503.0 |
2405 |
Wyoming |
WY |
total |
2010.0 |
564222.0 |
97818.0 |
In [40]:
#计算各州的人口密度
abb_pop_area['midu'] = abb_pop_area['population'] / abb_pop_area['area (sq. mi)']
abb_pop_area.head()
Out[40]:
|
state |
state/region |
ages |
year |
population |
area (sq. mi) |
midu |
0 |
Alabama |
AL |
under18 |
2012.0 |
1117489.0 |
52423.0 |
21.316769 |
1 |
Alabama |
AL |
total |
2012.0 |
4817528.0 |
52423.0 |
91.897221 |
2 |
Alabama |
AL |
under18 |
2010.0 |
1130966.0 |
52423.0 |
21.573851 |
3 |
Alabama |
AL |
total |
2010.0 |
4785570.0 |
52423.0 |
91.287603 |
4 |
Alabama |
AL |
under18 |
2011.0 |
1125763.0 |
52423.0 |
21.474601 |
In [43]:
#排序,并找出人口密度最高的五个州 df.sort_values()
abb_pop_area.sort_values(by='midu',axis=0,ascending=False).head()
Out[43]:
|
state |
state/region |
ages |
year |
population |
area (sq. mi) |
midu |
391 |
District of Columbia |
DC |
total |
2013.0 |
646449.0 |
68.0 |
9506.602941 |
385 |
District of Columbia |
DC |
total |
2012.0 |
633427.0 |
68.0 |
9315.102941 |
387 |
District of Columbia |
DC |
total |
2011.0 |
619624.0 |
68.0 |
9112.117647 |
431 |
District of Columbia |
DC |
total |
1990.0 |
605321.0 |
68.0 |
8901.779412 |
389 |
District of Columbia |
DC |
total |
2010.0 |
605125.0 |
68.0 |
8898.897059 |