6 pandas人口分析案例

  • 需求:
    • 导入文件,查看原始数据
    • 将人口数据和各州简称数据进行合并
    • 将合并的数据中重复的abbreviation列进行删除
    • 查看存在缺失数据的列
    • 找到有哪些state/region使得state的值为NaN,进行去重操作
    • 为找到的这些state/region的state项补上正确的值,从而去除掉state这一列的所有NaN
    • 合并各州面积数据areas
    • 我们会发现area(sq.mi)这一列有缺失数据,找出是哪些行
    • 去除含有缺失数据的行
    • 找出2010年的全民人口数据
    • 计算各州的人口密度
    • 排序,并找出人口密度最高的州
import numpy as np
import pandas as pd
from pandas import DataFrame
#导入文件,查看原始数据
abb = pd.read_csv('./data/state-abbrevs.csv') #state(州的全称)abbreviation(州的简称)
area = pd.read_csv('./data/state-areas.csv') #state州的全称,area (sq. mi)州的面积
pop = pd.read_csv('./data/state-population.csv')#state/region简称,ages年龄,year时间,population人口数量
#将人口数据和各州简称数据进行合并
abb_pop = pd.merge(abb,pop,left_on='abbreviation',right_on='state/region',how='outer')
abb_pop.head()
state abbreviation state/region ages year population
0 Alabama AL AL under18 2012 1117489.0
1 Alabama AL AL total 2012 4817528.0
2 Alabama AL AL under18 2010 1130966.0
3 Alabama AL AL total 2010 4785570.0
4 Alabama AL AL under18 2011 1125763.0
#将合并的数据中重复的abbreviation列进行删除
abb_pop.drop(labels='abbreviation',axis=1,inplace=True)
abb_pop.head()
state state/region ages year population
0 Alabama AL under18 2012 1117489.0
1 Alabama AL total 2012 4817528.0
2 Alabama AL under18 2010 1130966.0
3 Alabama AL total 2010 4785570.0
4 Alabama AL under18 2011 1125763.0
#查看存在缺失数据的列
#方式1:isnull,notll,any,all
abb_pop.isnull().any(axis=0)
#state,population这两列中是存在空值
state            True
state/region    False
ages            False
year            False
population       True
dtype: bool
#方式2:
abb_pop.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2544 entries, 0 to 2543
Data columns (total 5 columns):
state           2448 non-null object
state/region    2544 non-null object
ages            2544 non-null object
year            2544 non-null int64
population      2524 non-null float64
dtypes: float64(1), int64(1), object(3)
memory usage: 119.2+ KB
#找到有哪些state/region使得state的值为NaN,进行去重操作(将state中的空值对应的简称找到,且对简称进行去重)
abb_pop.head()
state state/region ages year population
0 Alabama AL under18 2012 1117489.0
1 Alabama AL total 2012 4817528.0
2 Alabama AL under18 2010 1130966.0
3 Alabama AL total 2010 4785570.0
4 Alabama AL under18 2011 1125763.0
#思路:可以将state这一列中的空值对应的行数据取出,从该行数据中就可以取出简称的值
#1.将state中的空值定位到
abb_pop['state'].isnull()
#2.将上述的布尔值作为源数据的行索引
abb_pop.loc[abb_pop['state'].isnull()]#将state中空对应的行数据取出
#3.将简称取出
abb_pop.loc[abb_pop['state'].isnull()]['state/region']
#4.对简称去重
abb_pop.loc[abb_pop['state'].isnull()]['state/region'].unique()

#结论:只有PR和USA对应的全称数据为空值
array([], dtype=object)
#为找到的这些state/region的state项补上正确的值,从而去除掉state这一列的所有NaN
#思考:填充该需求中的空值可不可以使用fillna?
    # - 不可以。fillna可以使用空的紧邻值做填充。fillna(value='xxx')使用指定的值填充空值
    # 使用给元素赋值的方式进行填充!
#1.先给USA的全称对应的空值进行批量赋值
#1.1将USA对应的行数据找出(行数据中就存在state的空值)
abb_pop['state/region'] == 'USA'
abb_pop.loc[abb_pop['state/region'] == 'USA']#将usa对应的行数据取出
state state/region ages year population
2496 NaN USA under18 1990 64218512.0
2497 NaN USA total 1990 249622814.0
2498 NaN USA total 1991 252980942.0
2499 NaN USA under18 1991 65313018.0
2500 NaN USA under18 1992 66509177.0
2501 NaN USA total 1992 256514231.0
2502 NaN USA total 1993 259918595.0
2503 NaN USA under18 1993 67594938.0
2504 NaN USA under18 1994 68640936.0
2505 NaN USA total 1994 263125826.0
2506 NaN USA under18 1995 69473140.0
2507 NaN USA under18 1996 70233512.0
2508 NaN USA total 1995 266278403.0
2509 NaN USA total 1996 269394291.0
2510 NaN USA total 1997 272646932.0
2511 NaN USA under18 1997 70920738.0
2512 NaN USA under18 1998 71431406.0
2513 NaN USA total 1998 275854116.0
2514 NaN USA under18 1999 71946051.0
2515 NaN USA total 2000 282162411.0
2516 NaN USA under18 2000 72376189.0
2517 NaN USA total 1999 279040181.0
2518 NaN USA total 2001 284968955.0
2519 NaN USA under18 2001 72671175.0
2520 NaN USA total 2002 287625193.0
2521 NaN USA under18 2002 72936457.0
2522 NaN USA total 2003 290107933.0
2523 NaN USA under18 2003 73100758.0
2524 NaN USA total 2004 292805298.0
2525 NaN USA under18 2004 73297735.0
2526 NaN USA total 2005 295516599.0
2527 NaN USA under18 2005 73523669.0
2528 NaN USA total 2006 298379912.0
2529 NaN USA under18 2006 73757714.0
2530 NaN USA total 2007 301231207.0
2531 NaN USA under18 2007 74019405.0
2532 NaN USA total 2008 304093966.0
2533 NaN USA under18 2008 74104602.0
2534 NaN USA under18 2013 73585872.0
2535 NaN USA total 2013 316128839.0
2536 NaN USA total 2009 306771529.0
2537 NaN USA under18 2009 74134167.0
2538 NaN USA under18 2010 74119556.0
2539 NaN USA total 2010 309326295.0
2540 NaN USA under18 2011 73902222.0
2541 NaN USA total 2011 311582564.0
2542 NaN USA under18 2012 73708179.0
2543 NaN USA total 2012 313873685.0
#1.2将USA对应的全称空对应的行索引取出
indexs = abb_pop.loc[abb_pop['state/region'] == 'USA'].index
abb_pop.iloc[indexs]
abb_pop.loc[indexs,'state'] = 'United States'
#2.可以将PR的全称进行赋值
abb_pop['state/region'] == 'PR'
abb_pop.loc[abb_pop['state/region'] == 'PR'] #PR对应的行数据
indexs = abb_pop.loc[abb_pop['state/region'] == 'PR'].index
abb_pop.loc[indexs,'state'] = 'PPPRRR'
#合并各州面积数据areas
abb_pop_area = pd.merge(abb_pop,area,how='outer')
#我们会发现area(sq.mi)这一列有缺失数据,找出是哪些行
abb_pop_area['area (sq. mi)'].isnull()
abb_pop_area.loc[abb_pop_area['area (sq. mi)'].isnull()] #空对应的行数据
indexs = abb_pop_area.loc[abb_pop_area['area (sq. mi)'].isnull()].index
#去除含有缺失数据的行
abb_pop_area.drop(labels=indexs,axis=0,inplace=True)
#找出2010年的全民人口数据(基于df做条件查询)
abb_pop_area.query('ages == "total" & year == 2010')
state state/region ages year population area (sq. mi)
3 Alabama AL total 2010.0 4785570.0 52423.0
91 Alaska AK total 2010.0 713868.0 656425.0
101 Arizona AZ total 2010.0 6408790.0 114006.0
189 Arkansas AR total 2010.0 2922280.0 53182.0
197 California CA total 2010.0 37333601.0 163707.0
283 Colorado CO total 2010.0 5048196.0 104100.0
293 Connecticut CT total 2010.0 3579210.0 5544.0
379 Delaware DE total 2010.0 899711.0 1954.0
389 District of Columbia DC total 2010.0 605125.0 68.0
475 Florida FL total 2010.0 18846054.0 65758.0
485 Georgia GA total 2010.0 9713248.0 59441.0
570 Hawaii HI total 2010.0 1363731.0 10932.0
581 Idaho ID total 2010.0 1570718.0 83574.0
666 Illinois IL total 2010.0 12839695.0 57918.0
677 Indiana IN total 2010.0 6489965.0 36420.0
762 Iowa IA total 2010.0 3050314.0 56276.0
773 Kansas KS total 2010.0 2858910.0 82282.0
858 Kentucky KY total 2010.0 4347698.0 40411.0
869 Louisiana LA total 2010.0 4545392.0 51843.0
954 Maine ME total 2010.0 1327366.0 35387.0
965 Montana MT total 2010.0 990527.0 147046.0
1050 Nebraska NE total 2010.0 1829838.0 77358.0
1061 Nevada NV total 2010.0 2703230.0 110567.0
1146 New Hampshire NH total 2010.0 1316614.0 9351.0
1157 New Jersey NJ total 2010.0 8802707.0 8722.0
1242 New Mexico NM total 2010.0 2064982.0 121593.0
1253 New York NY total 2010.0 19398228.0 54475.0
1338 North Carolina NC total 2010.0 9559533.0 53821.0
1349 North Dakota ND total 2010.0 674344.0 70704.0
1434 Ohio OH total 2010.0 11545435.0 44828.0
1445 Oklahoma OK total 2010.0 3759263.0 69903.0
1530 Oregon OR total 2010.0 3837208.0 98386.0
1541 Maryland MD total 2010.0 5787193.0 12407.0
1626 Massachusetts MA total 2010.0 6563263.0 10555.0
1637 Michigan MI total 2010.0 9876149.0 96810.0
1722 Minnesota MN total 2010.0 5310337.0 86943.0
1733 Mississippi MS total 2010.0 2970047.0 48434.0
1818 Missouri MO total 2010.0 5996063.0 69709.0
1829 Pennsylvania PA total 2010.0 12710472.0 46058.0
1914 Rhode Island RI total 2010.0 1052669.0 1545.0
1925 South Carolina SC total 2010.0 4636361.0 32007.0
2010 South Dakota SD total 2010.0 816211.0 77121.0
2021 Tennessee TN total 2010.0 6356683.0 42146.0
2106 Texas TX total 2010.0 25245178.0 268601.0
2117 Utah UT total 2010.0 2774424.0 84904.0
2202 Vermont VT total 2010.0 625793.0 9615.0
2213 Virginia VA total 2010.0 8024417.0 42769.0
2298 Washington WA total 2010.0 6742256.0 71303.0
2309 West Virginia WV total 2010.0 1854146.0 24231.0
2394 Wisconsin WI total 2010.0 5689060.0 65503.0
2405 Wyoming WY total 2010.0 564222.0 97818.0
#计算各州的人口密度(人口除以面积)
abb_pop_area['midu'] = abb_pop_area['population'] / abb_pop_area['area (sq. mi)']
abb_pop_area
state state/region ages year population area (sq. mi) midu
0 Alabama AL under18 2012.0 1117489.0 52423.0 21.316769
1 Alabama AL total 2012.0 4817528.0 52423.0 91.897221
2 Alabama AL under18 2010.0 1130966.0 52423.0 21.573851
3 Alabama AL total 2010.0 4785570.0 52423.0 91.287603
4 Alabama AL under18 2011.0 1125763.0 52423.0 21.474601
5 Alabama AL total 2011.0 4801627.0 52423.0 91.593900
6 Alabama AL total 2009.0 4757938.0 52423.0 90.760506
7 Alabama AL under18 2009.0 1134192.0 52423.0 21.635389
8 Alabama AL under18 2013.0 1111481.0 52423.0 21.202163
9 Alabama AL total 2013.0 4833722.0 52423.0 92.206131
10 Alabama AL total 2007.0 4672840.0 52423.0 89.137211
11 Alabama AL under18 2007.0 1132296.0 52423.0 21.599222
12 Alabama AL total 2008.0 4718206.0 52423.0 90.002594
13 Alabama AL under18 2008.0 1134927.0 52423.0 21.649410
14 Alabama AL total 2005.0 4569805.0 52423.0 87.171757
15 Alabama AL under18 2005.0 1117229.0 52423.0 21.311810
16 Alabama AL total 2006.0 4628981.0 52423.0 88.300574
17 Alabama AL under18 2006.0 1126798.0 52423.0 21.494344
18 Alabama AL total 2004.0 4530729.0 52423.0 86.426359
19 Alabama AL under18 2004.0 1113662.0 52423.0 21.243767
20 Alabama AL total 2003.0 4503491.0 52423.0 85.906778
21 Alabama AL under18 2003.0 1113083.0 52423.0 21.232722
22 Alabama AL total 2001.0 4467634.0 52423.0 85.222784
23 Alabama AL under18 2001.0 1120409.0 52423.0 21.372470
24 Alabama AL total 2002.0 4480089.0 52423.0 85.460370
25 Alabama AL under18 2002.0 1116590.0 52423.0 21.299620
26 Alabama AL under18 1999.0 1121287.0 52423.0 21.389218
27 Alabama AL total 1999.0 4430141.0 52423.0 84.507583
28 Alabama AL total 2000.0 4452173.0 52423.0 84.927856
29 Alabama AL under18 2000.0 1122273.0 52423.0 21.408027
... ... ... ... ... ... ... ...
2419 Wyoming WY under18 2003.0 124182.0 97818.0 1.269521
2420 Wyoming WY total 2004.0 509106.0 97818.0 5.204625
2421 Wyoming WY under18 2004.0 123974.0 97818.0 1.267395
2422 Wyoming WY total 2002.0 500017.0 97818.0 5.111707
2423 Wyoming WY under18 2002.0 125495.0 97818.0 1.282944
2424 Wyoming WY total 2001.0 494657.0 97818.0 5.056912
2425 Wyoming WY under18 2001.0 126212.0 97818.0 1.290274
2426 Wyoming WY total 2000.0 494300.0 97818.0 5.053262
2427 Wyoming WY under18 2000.0 128774.0 97818.0 1.316465
2428 Wyoming WY total 1999.0 491780.0 97818.0 5.027500
2429 Wyoming WY under18 1999.0 130793.0 97818.0 1.337106
2430 Wyoming WY total 1997.0 489452.0 97818.0 5.003701
2431 Wyoming WY under18 1997.0 134328.0 97818.0 1.373244
2432 Wyoming WY under18 1998.0 132602.0 97818.0 1.355599
2433 Wyoming WY total 1998.0 490787.0 97818.0 5.017349
2434 Wyoming WY under18 1996.0 135698.0 97818.0 1.387250
2435 Wyoming WY total 1996.0 488167.0 97818.0 4.990564
2436 Wyoming WY total 1995.0 485160.0 97818.0 4.959823
2437 Wyoming WY under18 1995.0 136785.0 97818.0 1.398362
2438 Wyoming WY under18 1994.0 137733.0 97818.0 1.408054
2439 Wyoming WY total 1994.0 480283.0 97818.0 4.909965
2440 Wyoming WY under18 1992.0 137308.0 97818.0 1.403709
2441 Wyoming WY total 1992.0 466251.0 97818.0 4.766515
2442 Wyoming WY total 1993.0 473081.0 97818.0 4.836339
2443 Wyoming WY under18 1993.0 137458.0 97818.0 1.405242
2444 Wyoming WY total 1991.0 459260.0 97818.0 4.695046
2445 Wyoming WY under18 1991.0 136720.0 97818.0 1.397698
2446 Wyoming WY under18 1990.0 136078.0 97818.0 1.391135
2447 Wyoming WY total 1990.0 453690.0 97818.0 4.638103
2544 Puerto Rico NaN NaN NaN NaN 3515.0 NaN

2449 rows × 7 columns

#排序,并找出人口密度最高的州
abb_pop_area.sort_values(by='midu',axis=0,ascending=False).iloc[0]['state']
'District of Columbia'
上一篇:Java面试 - 重载(Overload)和重写(Override)的区别?


下一篇:java-调用超类方法而不是子类方法