美国2012年总统候选人政治献金数据分析
导入包
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
为方便后续操作，先定义月份缩写到数字的映射、重点关注的候选人列表，以及各候选人所属的政党
# Three-letter month abbreviation -> month number (JAN=1 ... DEC=12).
_MONTH_ABBREVIATIONS = (
    'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
    'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC',
)
months = dict(zip(_MONTH_ABBREVIATIONS, range(1, 13)))

# Candidates singled out for closer analysis.
of_interest = [
    'Obama, Barack',
    'Romney, Mitt',
    'Santorum, Rick',
    'Paul, Ron',
    'Gingrich, Newt',
]

# Candidate name (as it appears in the cand_nm column) -> political party.
parties = {
    'Bachmann, Michelle': 'Republican',
    'Romney, Mitt': 'Republican',
    'Obama, Barack': 'Democrat',
    "Roemer, Charles E. 'Buddy' III": 'Reform',
    'Pawlenty, Timothy': 'Republican',
    'Johnson, Gary Earl': 'Libertarian',
    'Paul, Ron': 'Republican',
    'Santorum, Rick': 'Republican',
    'Cain, Herman': 'Republican',
    'Gingrich, Newt': 'Republican',
    'McCotter, Thaddeus G': 'Republican',
    'Huntsman, Jon': 'Republican',
    'Perry, Rick': 'Republican',
}
# Load the contribution records.
# Column 6 (contbr_zip) holds mixed-type values, which makes the default
# chunked type inference emit a DtypeWarning and turns ZIP codes into floats.
# Reading it as text keeps ZIP codes intact and silences the warning.
df = pd.read_csv('./data/usa_election.txt', dtype={'contbr_zip': str})
df.head()
C:\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:2728: DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
|
cmte_id |
cand_id |
cand_nm |
contbr_nm |
contbr_city |
contbr_st |
contbr_zip |
contbr_employer |
contbr_occupation |
contb_receipt_amt |
contb_receipt_dt |
receipt_desc |
memo_cd |
memo_text |
form_tp |
file_num |
0 |
C00410118 |
P20002978 |
Bachmann, Michelle |
HARVEY, WILLIAM |
MOBILE |
AL |
3.6601e+08 |
RETIRED |
RETIRED |
250.0 |
20-JUN-11 |
NaN |
NaN |
NaN |
SA17A |
736166 |
1 |
C00410118 |
P20002978 |
Bachmann, Michelle |
HARVEY, WILLIAM |
MOBILE |
AL |
3.6601e+08 |
RETIRED |
RETIRED |
50.0 |
23-JUN-11 |
NaN |
NaN |
NaN |
SA17A |
736166 |
2 |
C00410118 |
P20002978 |
Bachmann, Michelle |
SMITH, LANIER |
LANETT |
AL |
3.68633e+08 |
INFORMATION REQUESTED |
INFORMATION REQUESTED |
250.0 |
05-JUL-11 |
NaN |
NaN |
NaN |
SA17A |
749073 |
3 |
C00410118 |
P20002978 |
Bachmann, Michelle |
BLEVINS, DARONDA |
PIGGOTT |
AR |
7.24548e+08 |
NONE |
RETIRED |
250.0 |
01-AUG-11 |
NaN |
NaN |
NaN |
SA17A |
749073 |
4 |
C00410118 |
P20002978 |
Bachmann, Michelle |
WARDENBURG, HAROLD |
HOT SPRINGS NATION |
AR |
7.19016e+08 |
NONE |
RETIRED |
300.0 |
20-JUN-11 |
NaN |
NaN |
NaN |
SA17A |
736166 |
# Add a 'party' column by looking up each row's candidate name (cand_nm)
# in the parties mapping, then preview the first rows.
df['party'] = df['cand_nm'].map(parties)
df.head()
|
cmte_id |
cand_id |
cand_nm |
contbr_nm |
contbr_city |
contbr_st |
contbr_zip |
contbr_employer |
contbr_occupation |
contb_receipt_amt |
contb_receipt_dt |
receipt_desc |
memo_cd |
memo_text |
form_tp |
file_num |
party |
0 |
C00410118 |
P20002978 |
Bachmann, Michelle |
HARVEY, WILLIAM |
MOBILE |
AL |
3.6601e+08 |
RETIRED |
RETIRED |
250.0 |
20-JUN-11 |
NaN |
NaN |
NaN |
SA17A |
736166 |
Republican |
1 |
C00410118 |
P20002978 |
Bachmann, Michelle |
HARVEY, WILLIAM |
MOBILE |
AL |
3.6601e+08 |
RETIRED |
RETIRED |
50.0 |
23-JUN-11 |
NaN |
NaN |
NaN |
SA17A |
736166 |
Republican |
2 |
C00410118 |
P20002978 |
Bachmann, Michelle |
SMITH, LANIER |
LANETT |
AL |
3.68633e+08 |
INFORMATION REQUESTED |
INFORMATION REQUESTED |
250.0 |
05-JUL-11 |
NaN |
NaN |
NaN |
SA17A |
749073 |
Republican |
3 |
C00410118 |
P20002978 |
Bachmann, Michelle |
BLEVINS, DARONDA |
PIGGOTT |
AR |
7.24548e+08 |
NONE |
RETIRED |
250.0 |
01-AUG-11 |
NaN |
NaN |
NaN |
SA17A |
749073 |
Republican |
4 |
C00410118 |
P20002978 |
Bachmann, Michelle |
WARDENBURG, HAROLD |
HOT SPRINGS NATION |
AR |
7.19016e+08 |
NONE |
RETIRED |
300.0 |
20-JUN-11 |
NaN |
NaN |
NaN |
SA17A |
736166 |
Republican |
# List the distinct values that occur in the party column.
df['party'].unique()
array(['Republican', 'Democrat', 'Reform', 'Libertarian'], dtype=object)
# Count how many rows fall under each party value.
df['party'].value_counts()
Democrat 292400
Republican 237575
Reform 5364
Libertarian 702
Name: party, dtype: int64
# Total contribution amount (contb_receipt_amt) received by each party.
df.groupby(by='party')['contb_receipt_amt'].sum()
party
Democrat 8.105758e+07
Libertarian 4.132769e+05
Reform 3.390338e+05
Republican 1.192255e+08
Name: contb_receipt_amt, dtype: float64
# Total contribution amount received by each party on each receipt date.
# NOTE: contb_receipt_dt is still the raw 'DD-MON-YY' text at this point, so
# the groups come out in string order rather than chronological order.
df.groupby(by=['contb_receipt_dt','party'])['contb_receipt_amt'].sum()
contb_receipt_dt party
01-APR-11 Reform 50.00
Republican 12635.00
01-AUG-11 Democrat 175281.00
Libertarian 1000.00
Reform 1847.00
Republican 234598.46
01-DEC-11 Democrat 651532.82
Libertarian 725.00
Reform 875.00
Republican 486405.96
01-FEB-11 Republican 250.00
01-JAN-11 Republican 8600.00
01-JAN-12 Democrat 58098.80
Reform 515.00
Republican 75704.72
01-JUL-11 Democrat 165961.00
Libertarian 2000.00
Reform 100.00
Republican 115848.72
01-JUN-11 Democrat 145459.00
Libertarian 500.00
Reform 50.00
Republican 433109.20
01-MAR-11 Republican 1000.00
01-MAY-11 Democrat 82644.00
Reform 480.00
Republican 28663.87
01-NOV-11 Democrat 122529.87
Libertarian 3000.00
Reform 1792.00
...
30-OCT-11 Reform 3910.00
Republican 43913.16
30-SEP-11 Democrat 3373517.24
Libertarian 550.00
Reform 2050.00
Republican 4886331.76
31-AUG-11 Democrat 374387.44
Libertarian 10750.00
Reform 450.00
Republican 1017735.02
31-DEC-11 Democrat 3553072.57
Reform 695.00
Republican 1094376.72
31-JAN-11 Republican 6000.00
31-JAN-12 Democrat 1418410.31
Reform 150.00
Republican 869890.41
31-JUL-11 Democrat 20305.00
Reform 966.00
Republican 12781.02
31-MAR-11 Reform 200.00
Republican 62475.00
31-MAY-11 Democrat 351705.66
Libertarian 250.00
Reform 100.00
Republican 301339.80
31-OCT-11 Democrat 204996.87
Libertarian 4250.00
Reform 3105.00
Republican 734601.83
Name: contb_receipt_amt, Length: 1183, dtype: float64
# Convert the table's 'DD-MON-YY' dates into 'yyyy-mm-dd' strings.
def transformDate(d, month_numbers=None):
    """Convert a 'DD-MON-YY' date string into zero-padded 'yyyy-mm-dd' form.

    Args:
        d: Date text such as '20-JUN-11' (day, upper-case three-letter month
            abbreviation, two-digit year, separated by '-').
        month_numbers: Optional mapping from month abbreviation to month
            number. Defaults to the module-level ``months`` table.

    Returns:
        The date as 'yyyy-mm-dd', e.g. '2011-06-20'. The two-digit year is
        always prefixed with '20'.

    Raises:
        ValueError: If ``d`` does not split into exactly three '-' parts.
        KeyError: If the month abbreviation is not in the mapping.
    """
    if month_numbers is None:
        month_numbers = months
    day, month, year = d.split('-')
    # Zero-pad month and day so the result is a true yyyy-mm-dd string that
    # sorts chronologically when compared as text.
    month_text = str(month_numbers[month]).zfill(2)
    return '20' + year + '-' + month_text + '-' + day.zfill(2)
# Rewrite every receipt date in place with transformDate, then preview the
# first rows to check the new format.
df['contb_receipt_dt'] = df['contb_receipt_dt'].apply(transformDate)
df.head()
|
cmte_id |
cand_id |
cand_nm |
contbr_nm |
contbr_city |
contbr_st |
contbr_zip |
contbr_employer |
contbr_occupation |
contb_receipt_amt |
contb_receipt_dt |
receipt_desc |
memo_cd |
memo_text |
form_tp |
file_num |
party |
0 |
C00410118 |
P20002978 |
Bachmann, Michelle |
HARVEY, WILLIAM |
MOBILE |
AL |
3.6601e+08 |
RETIRED |
RETIRED |
250.0 |
2011-6-20 |
NaN |
NaN |
NaN |
SA17A |
736166 |
Republican |
1 |
C00410118 |
P20002978 |
Bachmann, Michelle |
HARVEY, WILLIAM |
MOBILE |
AL |
3.6601e+08 |
RETIRED |
RETIRED |
50.0 |
2011-6-23 |
NaN |
NaN |
NaN |
SA17A |
736166 |
Republican |
2 |
C00410118 |
P20002978 |
Bachmann, Michelle |
SMITH, LANIER |
LANETT |
AL |
3.68633e+08 |
INFORMATION REQUESTED |
INFORMATION REQUESTED |
250.0 |
2011-7-05 |
NaN |
NaN |
NaN |
SA17A |
749073 |
Republican |
3 |
C00410118 |
P20002978 |
Bachmann, Michelle |
BLEVINS, DARONDA |
PIGGOTT |
AR |
7.24548e+08 |
NONE |
RETIRED |
250.0 |
2011-8-01 |
NaN |
NaN |
NaN |
SA17A |
749073 |
Republican |
4 |
C00410118 |
P20002978 |
Bachmann, Michelle |
WARDENBURG, HAROLD |
HOT SPRINGS NATION |
AR |
7.19016e+08 |
NONE |
RETIRED |
300.0 |
2011-6-20 |
NaN |
NaN |
NaN |
SA17A |
736166 |
Republican |
# Which candidate do disabled veterans (by contributor occupation) support
# most, i.e. which candidate received the largest total from them?
# 1. Select the rows whose occupation is exactly 'DISABLED VETERAN'.
old_bing = df.loc[df['contbr_occupation'] == 'DISABLED VETERAN']
# 2. Group those rows by candidate and total the contribution amounts.
old_bing.groupby(by='cand_nm')['contb_receipt_amt'].sum()
cand_nm
Cain, Herman 300.00
Obama, Barack 4205.00
Paul, Ron 2425.49
Santorum, Rick 250.00
Name: contb_receipt_amt, dtype: float64
# Largest single value in the contribution-amount column.
df['contb_receipt_amt'].max()
1944042.43
# Show the full row(s) of the largest contribution, including the contributor's
# occupation and the amount, using query().
# The maximum is computed here rather than pasted in as a float literal, so the
# filter stays correct if the data changes.
largest_amount = df['contb_receipt_amt'].max()
df.query('contb_receipt_amt == @largest_amount')
|
cmte_id |
cand_id |
cand_nm |
contbr_nm |
contbr_city |
contbr_st |
contbr_zip |
contbr_employer |
contbr_occupation |
contb_receipt_amt |
contb_receipt_dt |
receipt_desc |
memo_cd |
memo_text |
form_tp |
file_num |
party |
176127 |
C00431445 |
P80003338 |
Obama, Barack |
OBAMA VICTORY FUND 2012 - UNITEMIZED |
CHICAGO |
IL |
60680 |
NaN |
NaN |
1944042.43 |
2011-12-31 |
NaN |
X |
* |
SA18 |
763233 |
Democrat |