美国2012年总统候选人政治献金数据分析
导入包
In [1]:
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
方便大家操作,将月份和参选人以及所在政党进行定义
In [2]:
months = {'JAN' : 1, 'FEB' : 2, 'MAR' : 3, 'APR' : 4, 'MAY' : 5, 'JUN' : 6,
'JUL' : 7, 'AUG' : 8, 'SEP' : 9, 'OCT': 10, 'NOV': 11, 'DEC' : 12}
of_interest = ['Obama, Barack', 'Romney, Mitt', 'Santorum, Rick',
'Paul, Ron', 'Gingrich, Newt']
parties = {
'Bachmann, Michelle': 'Republican',
'Romney, Mitt': 'Republican',
'Obama, Barack': 'Democrat',
"Roemer, Charles E. 'Buddy' III": 'Reform',
'Pawlenty, Timothy': 'Republican',
'Johnson, Gary Earl': 'Libertarian',
'Paul, Ron': 'Republican',
'Santorum, Rick': 'Republican',
'Cain, Herman': 'Republican',
'Gingrich, Newt': 'Republican',
'McCotter, Thaddeus G': 'Republican',
'Huntsman, Jon': 'Republican',
'Perry, Rick': 'Republican'
}
读取文件
In [3]:
table = pd.read_csv('data/usa_election.txt')
table.head()
C:\jupyter\lib\site-packages\IPython\core\interactiveshell.py:2785: DtypeWarning: Columns (6) have mixed types. Specify dtype option on import or set low_memory=False.
interactivity=interactivity, compiler=compiler, result=result)
Out[3]:
|
cmte_id |
cand_id |
cand_nm |
contbr_nm |
contbr_city |
contbr_st |
contbr_zip |
contbr_employer |
contbr_occupation |
contb_receipt_amt |
contb_receipt_dt |
receipt_desc |
memo_cd |
memo_text |
form_tp |
file_num |
0 |
C00410118 |
P20002978 |
Bachmann, Michelle |
HARVEY, WILLIAM |
MOBILE |
AL |
3.6601e+08 |
RETIRED |
RETIRED |
250.0 |
20-JUN-11 |
NaN |
NaN |
NaN |
SA17A |
736166 |
1 |
C00410118 |
P20002978 |
Bachmann, Michelle |
HARVEY, WILLIAM |
MOBILE |
AL |
3.6601e+08 |
RETIRED |
RETIRED |
50.0 |
23-JUN-11 |
NaN |
NaN |
NaN |
SA17A |
736166 |
2 |
C00410118 |
P20002978 |
Bachmann, Michelle |
SMITH, LANIER |
LANETT |
AL |
3.68633e+08 |
INFORMATION REQUESTED |
INFORMATION REQUESTED |
250.0 |
05-JUL-11 |
NaN |
NaN |
NaN |
SA17A |
749073 |
3 |
C00410118 |
P20002978 |
Bachmann, Michelle |
BLEVINS, DARONDA |
PIGGOTT |
AR |
7.24548e+08 |
NONE |
RETIRED |
250.0 |
01-AUG-11 |
NaN |
NaN |
NaN |
SA17A |
749073 |
4 |
C00410118 |
P20002978 |
Bachmann, Michelle |
WARDENBURG, HAROLD |
HOT SPRINGS NATION |
AR |
7.19016e+08 |
NONE |
RETIRED |
300.0 |
20-JUN-11 |
NaN |
NaN |
NaN |
SA17A |
736166 |
In [8]:
#使用map函数+字典,新建一列各个候选人所在党派party
table['party'] = table['cand_nm'].map(parties)
table.head()
Out[8]:
|
cmte_id |
cand_id |
cand_nm |
contbr_nm |
contbr_city |
contbr_st |
contbr_zip |
contbr_employer |
contbr_occupation |
contb_receipt_amt |
contb_receipt_dt |
receipt_desc |
memo_cd |
memo_text |
form_tp |
file_num |
party |
0 |
C00410118 |
P20002978 |
Bachmann, Michelle |
HARVEY, WILLIAM |
MOBILE |
AL |
3.6601e+08 |
RETIRED |
RETIRED |
250.0 |
20-JUN-11 |
NaN |
NaN |
NaN |
SA17A |
736166 |
Republican |
1 |
C00410118 |
P20002978 |
Bachmann, Michelle |
HARVEY, WILLIAM |
MOBILE |
AL |
3.6601e+08 |
RETIRED |
RETIRED |
50.0 |
23-JUN-11 |
NaN |
NaN |
NaN |
SA17A |
736166 |
Republican |
2 |
C00410118 |
P20002978 |
Bachmann, Michelle |
SMITH, LANIER |
LANETT |
AL |
3.68633e+08 |
INFORMATION REQUESTED |
INFORMATION REQUESTED |
250.0 |
05-JUL-11 |
NaN |
NaN |
NaN |
SA17A |
749073 |
Republican |
3 |
C00410118 |
P20002978 |
Bachmann, Michelle |
BLEVINS, DARONDA |
PIGGOTT |
AR |
7.24548e+08 |
NONE |
RETIRED |
250.0 |
01-AUG-11 |
NaN |
NaN |
NaN |
SA17A |
749073 |
Republican |
4 |
C00410118 |
P20002978 |
Bachmann, Michelle |
WARDENBURG, HAROLD |
HOT SPRINGS NATION |
AR |
7.19016e+08 |
NONE |
RETIRED |
300.0 |
20-JUN-11 |
NaN |
NaN |
NaN |
SA17A |
736166 |
Republican |
In [10]:
#party这一列中有哪些元素
table['party'].unique()
Out[10]:
array(['Republican', 'Democrat', 'Reform', 'Libertarian'], dtype=object)
In [ ]:
#使用value_counts()函数,统计party列中各个元素出现次数,value_counts()是Series中的,无参,返回一个带有每个元素出现次数的Series
In [11]:
table['party'].value_counts()
Out[11]:
Democrat 292400
Republican 237575
Reform 5364
Libertarian 702
Name: party, dtype: int64
In [12]:
#使用groupby()函数,查看各个党派收到的政治献金总数contb_receipt_amt
table.groupby(by='party')['contb_receipt_amt'].sum()
Out[12]:
party
Democrat 8.105758e+07
Libertarian 4.132769e+05
Reform 3.390338e+05
Republican 1.192255e+08
Name: contb_receipt_amt, dtype: float64
In [13]:
#查看具体每天各个党派收到的政治献金总数contb_receipt_amt 。使用groupby([多个分组参数])
table.groupby(by=['party','contb_receipt_dt'])['contb_receipt_amt'].sum()
Out[13]:
party contb_receipt_dt
Democrat 01-AUG-11 175281.00
01-DEC-11 651532.82
01-JAN-12 58098.80
01-JUL-11 165961.00
01-JUN-11 145459.00
01-MAY-11 82644.00
01-NOV-11 122529.87
01-OCT-11 148977.00
01-SEP-11 403297.62
02-AUG-11 164510.11
02-DEC-11 216056.96
02-JAN-12 89743.60
02-JUL-11 17105.00
02-JUN-11 422453.00
02-MAY-11 396675.00
02-NOV-11 147183.81
02-OCT-11 62605.62
02-SEP-11 137948.41
03-AUG-11 147053.02
03-DEC-11 81304.02
03-JAN-12 87406.97
03-JUL-11 5982.00
03-JUN-11 320176.20
03-MAY-11 261819.11
03-NOV-11 119304.56
03-OCT-11 363061.02
03-SEP-11 45598.00
04-APR-11 640235.12
04-AUG-11 598784.23
04-DEC-11 72795.10
...
Republican 29-AUG-11 941769.23
29-DEC-11 428501.42
29-JAN-11 750.00
29-JAN-12 75220.02
29-JUL-11 233423.35
29-JUN-11 1340704.29
29-MAR-11 38875.00
29-MAY-11 8363.20
29-NOV-11 407322.64
29-OCT-11 81924.01
29-SEP-11 1612794.52
30-APR-11 43004.80
30-AUG-11 915548.58
30-DEC-11 492470.45
30-JAN-12 255204.80
30-JUL-11 12249.04
30-JUN-11 2744932.63
30-MAR-11 50240.00
30-MAY-11 17803.60
30-NOV-11 809014.83
30-OCT-11 43913.16
30-SEP-11 4886331.76
31-AUG-11 1017735.02
31-DEC-11 1094376.72
31-JAN-11 6000.00
31-JAN-12 869890.41
31-JUL-11 12781.02
31-MAR-11 62475.00
31-MAY-11 301339.80
31-OCT-11 734601.83
Name: contb_receipt_amt, Length: 1183, dtype: float64
In [14]:
def trasform_date(d):
day,month,year = d.split('-')
month = months[month]
return "20"+year+'-'+str(month)+'-'+day
In [17]:
#将表中日期格式转换为'yyyy-mm-dd'。日期格式,通过函数加map方式进行转换
table['contb_receipt_dt'] = table['contb_receipt_dt'].apply(trasform_date)
In [18]:
table.head()
Out[18]:
|
cmte_id |
cand_id |
cand_nm |
contbr_nm |
contbr_city |
contbr_st |
contbr_zip |
contbr_employer |
contbr_occupation |
contb_receipt_amt |
contb_receipt_dt |
receipt_desc |
memo_cd |
memo_text |
form_tp |
file_num |
party |
0 |
C00410118 |
P20002978 |
Bachmann, Michelle |
HARVEY, WILLIAM |
MOBILE |
AL |
3.6601e+08 |
RETIRED |
RETIRED |
250.0 |
2011-6-20 |
NaN |
NaN |
NaN |
SA17A |
736166 |
Republican |
1 |
C00410118 |
P20002978 |
Bachmann, Michelle |
HARVEY, WILLIAM |
MOBILE |
AL |
3.6601e+08 |
RETIRED |
RETIRED |
50.0 |
2011-6-23 |
NaN |
NaN |
NaN |
SA17A |
736166 |
Republican |
2 |
C00410118 |
P20002978 |
Bachmann, Michelle |
SMITH, LANIER |
LANETT |
AL |
3.68633e+08 |
INFORMATION REQUESTED |
INFORMATION REQUESTED |
250.0 |
2011-7-05 |
NaN |
NaN |
NaN |
SA17A |
749073 |
Republican |
3 |
C00410118 |
P20002978 |
Bachmann, Michelle |
BLEVINS, DARONDA |
PIGGOTT |
AR |
7.24548e+08 |
NONE |
RETIRED |
250.0 |
2011-8-01 |
NaN |
NaN |
NaN |
SA17A |
749073 |
Republican |
4 |
C00410118 |
P20002978 |
Bachmann, Michelle |
WARDENBURG, HAROLD |
HOT SPRINGS NATION |
AR |
7.19016e+08 |
NONE |
RETIRED |
300.0 |
2011-6-20 |
NaN |
NaN |
NaN |
SA17A |
736166 |
Republican |
In [19]:
#查看老兵(捐献者职业)DISABLED VETERAN主要支持谁 :查看老兵们捐赠给谁的钱最多
table['contbr_occupation'] == 'DISABLED VETERAN'
Out[19]:
0 False
1 False
2 False
3 False
4 False
5 False
6 False
7 False
8 False
9 False
10 False
11 False
12 False
13 False
14 False
15 False
16 False
17 False
18 False
19 False
20 False
21 False
22 False
23 False
24 False
25 False
26 False
27 False
28 False
29 False
...
536011 False
536012 False
536013 False
536014 False
536015 False
536016 False
536017 False
536018 False
536019 False
536020 False
536021 False
536022 False
536023 False
536024 False
536025 False
536026 False
536027 False
536028 False
536029 False
536030 False
536031 False
536032 False
536033 False
536034 False
536035 False
536036 False
536037 False
536038 False
536039 False
536040 False
Name: contbr_occupation, Length: 536041, dtype: bool
In [21]:
old_bing_df = table.loc[table['contbr_occupation'] == 'DISABLED VETERAN']
In [22]:
old_bing_df.groupby(by='cand_nm')['contb_receipt_amt'].sum()
Out[22]:
cand_nm
Cain, Herman 300.00
Obama, Barack 4205.00
Paul, Ron 2425.49
Santorum, Rick 250.00
Name: contb_receipt_amt, dtype: float64
In [23]:
table['contb_receipt_amt'].max()
Out[23]:
1944042.43
In [24]:
#找出候选人的捐赠者中,捐赠金额最大的人的职业以及捐献额 .通过query("查询条件来查找捐献人职业")
table.query('contb_receipt_amt == 1944042.43')
Out[24]:
|
cmte_id |
cand_id |
cand_nm |
contbr_nm |
contbr_city |
contbr_st |
contbr_zip |
contbr_employer |
contbr_occupation |
contb_receipt_amt |
contb_receipt_dt |
receipt_desc |
memo_cd |
memo_text |
form_tp |
file_num |
party |
176127 |
C00431445 |
P80003338 |
Obama, Barack |
OBAMA VICTORY FUND 2012 - UNITEMIZED |
CHICAGO |
IL |
60680 |
NaN |
NaN |
1944042.43 |
2011-12-31 |
NaN |
X |
* |
SA18 |
763233 |
Democrat |