01.Series
# -*- coding: utf-8 -*-
"""
Series 객체 특징
- pandas 제공 1차원 자료구성
- DataFrame 칼럼 구성요소
- 수학/통계 관련 함수 제공
- 범위 수정, 블럭 연산
- indexing/slicing(list 동일)
- 시계열 데이터 처리
"""
import pandas as pd #pd.Series()
from pandas import Series
# 1. Creating a Series
# 1) From a list
# The plain Python list is called `prices`, not `list`: rebinding the name
# `list` would hide the builtin type and break any later `list(...)` call.
prices = [4000, 3000, 2000, 3500]
print(prices * 2)  # list repetition: [4000, 3000, 2000, 3500, 4000, 3000, 2000, 3500]

price = Series(prices)
print(price * 2)  # element-wise arithmetic, unlike the list above
# Sample output:
# 0    8000
# 1    6000
# 2    4000
# 3    7000
# dtype: int64

print("index=", price.index)   # index= RangeIndex(start=0, stop=4, step=1)
print("value=", price.values)  # value= [4000 3000 2000 3500]
print(prices[0], price[0])     # 4000 4000

# 2) From a dict: keys become the index, values become the data
person = pd.Series({'name': '홍길동', 'age': 35, 'addr': '서울시'})
print(person)
# Sample output recorded by the author (keys shown sorted; newer pandas
# releases are expected to keep the dict's insertion order instead):
# addr    서울시
# age      35
# name    홍길동
# dtype: object
# 2. Indexing (same bracket syntax as a list)
ser_data = pd.Series([4, 4.5, 6, 8, 10.5])
print(ser_data[0])  # 4.0

# Slices behave like list slices: [:3], [3:] and [:] in turn.
for _window in (slice(None, 3), slice(3, None), slice(None)):
    print(ser_data[_window])
# Sample output:
# 0    4.0
# 1    4.5
# 2    6.0
# dtype: float64
# 3     8.0
# 4    10.5
# dtype: float64
# 0     4.0
# 1     4.5
# 2     6.0
# 3     8.0
# 4    10.5
# dtype: float64

# Unlike a list, a negative index such as ser_data[-1] cannot be used here.

# Boolean condition: keep only the elements that satisfy it
at_least_five = ser_data >= 5
print(ser_data[at_least_five])
# Sample output:
# 2     6.0
# 3     8.0
# 4    10.5
# dtype: float64
# 3. Combining Series and handling missing values (NA)
data1 = Series(data=[4000, None, 3000, 2000], index=['a', 'm', 'o', 'k'])
data2 = Series(data=[4000, 3000, 3500, 2000], index=['a', 'o', 'k', 'm'])

# Arithmetic joins the two operands on their index labels, not on position.
resualt = data1.add(data2)  # block (vectorised) operation
print(resualt)
# Sample output:
# a    8000.0
# k    5500.0
# m       NaN   <- number + missing value = NaN
# o    6000.0
# dtype: float64
print(type(resualt))  # <class 'pandas.core.series.Series'>

# NA handling: replace with 0 or with the mean
result2 = resualt.fillna(0)  # replace with 0
print(result2)
# Sample output:
# a    8000.0
# k    5500.0
# m       0.0
# o    6000.0
# dtype: float64

mean_of_present = resualt.mean()  # the mean skips the missing entry
result3 = resualt.fillna(mean_of_present)  # replace with the mean
print(result3)
# Sample output:
# a    8000.0
# k    5500.0
# m    6500.0
# o    6000.0
# dtype: float64

print(resualt.notna())
# Sample output:
# a     True
# k     True
# m    False
# o     True
# dtype: bool

# Subset without the missing entries
subset = resualt[resualt.notna()]
print(subset)
# Sample output:
# a    8000.0
# k    5500.0
# o    6000.0
# dtype: float64
# 4. Series operations
print(ser_data)
# Sample output:
# 0     4.0
# 1     4.5
# 2     6.0
# 3     8.0
# 4    10.5
# dtype: float64

# 1) Block assignment: overwrite a whole slice with one value
ser_data[1:4] = 50
print(ser_data)
# Sample output:
# 0     4.0
# 1    50.0
# 2    50.0
# 3    50.0
# 4    10.5
# dtype: float64

# 2) Math / statistics helpers, printed in the order sum, mean, max, min
for _reduce in (ser_data.sum, ser_data.mean, ser_data.max, ser_data.min):
    print(_reduce())
# Sample output: 164.5, 32.9, 50.0, 4.0

# 3) Broadcasting: vector (1-D) * scalar (0-D)
print(ser_data.mul(0.5))
# Sample output:
# 0     2.00
# 1    25.00
# 2    25.00
# 3    25.00
# 4     5.25
# dtype: float64
02.DataFrame
# -*- coding: utf-8 -*-
"""
Created on Sat Feb 9 12:34:12 2019
@author: 502-03
DataFrame 객체 특징
- Pandas제공 2차원 행렬구조 (table 구조 동일)
- 칼럼 단위 상이한 자료형 제공
- DataFrame 구성요소
-> Series : 1 차원 (vector)
"""
import pandas as pd
from pandas import DataFrame
# 1. Creating a DataFrame
name = ['홍길동', '이순신', '강감찬', '유관순']
age = [35, 45, 55, 25]
pay = [350, 450, 550, 250]

# One list per column; `columns` fixes the column order explicitly.
emp = pd.DataFrame(
    dict(name=name, age=age, pay=pay),
    columns=['name', 'age', 'pay'],
)
print(emp)
# Sample output:
#   name  age  pay
# 0  홍길동   35  350
# 1  이순신   45  450
# 2  강감찬   55  550
# 3  유관순   25  250

# 1) Adding a column from a Series object
gender = pd.Series(['M', 'M', 'M', 'F'])
emp['gender'] = gender
print(emp)
# Sample output:
#   name  age  pay gender
# 0  홍길동   35  350      M
# 1  이순신   45  450      M
# 2  강감찬   55  550      M
# 3  유관순   25  250      F
# 2) Building a DataFrame from a NumPy array
import numpy as np

# 0..11 laid out as 3 rows x 4 columns
frame = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    columns=['a', 'b', 'c', 'd'],
)
print(frame)
# Sample output:
#    a  b   c   d
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11

# Row / column statistics
print(frame.mean())  # default: mean of each column
# Sample output:
# a    4.0
# b    5.0
# c    6.0
# d    7.0
# dtype: float64

# axis=0 -> mean of each column (same as the default); axis=1 -> mean of each row
for _axis in (0, 1):
    print(frame.mean(axis=_axis))
# Sample output:
# a    4.0
# b    5.0
# c    6.0
# d    7.0
# dtype: float64
# 0    1.5
# 1    5.5
# 2    9.5
# dtype: float64
# 2. Working with the index
print(frame.index)  # RangeIndex(start=0, stop=3, step=1)
print(frame.values)
# Sample output:
# [[ 0  1  2  3]
#  [ 4  5  6  7]
#  [ 8  9 10 11]]
print(frame.columns)
# Sample output:
# Index(['a', 'b', 'c', 'd'], dtype='object')

# 1) Promote column 'a' to be the index
set_index = frame.set_index(keys='a')
print(set_index)
# Sample output:
#    b   c   d
# a
# 0  1   2   3
# 4  5   6   7
# 8  9  10  11

# 2) Reset the index: 'a' goes back to being an ordinary column
reset_index = set_index.reset_index()
print(reset_index)
# Sample output:
#    a  b   c   d
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11
# 3. Referencing DataFrame columns
# 1) A single column, in two equivalent spellings
a_col1 = frame.a     # attribute style: DF.column
a_col2 = frame['a']  # key style: DF['column']
for _column in (a_col1, a_col2):
    print(_column)
# Sample output (printed once per spelling):
# 0    0
# 1    4
# 2    8
# Name: a, dtype: int32

# One element: row label 2 of column 'a' (same value as frame['a'][2])
a_col2 = frame.loc[2, 'a']

# 2) Several columns: pass a list of names (slice syntax ['a':'c'] is not allowed)
print(frame.loc[:, ['a', 'c']])
# Sample output:
#    a   c
# 0  0   2
# 1  4   6
# 2  8  10
cols = ['a', 'b']
frame[cols]  # bare expression; only echoes a result in an interactive console
# 4. Making subsets
# 1) Excluding specific columns (keep every column except 'b')
print('subset1')
subset_df = frame[['a', 'c', 'd']]
print(subset_df)
# Sample output:
#    a   c   d
# 0  0   2   3
# 1  4   6   7
# 2  8  10  11

# 2) Excluding specific rows
print('drop')
print(frame.drop(0))  # without the 1st row
# Sample output:
#    a  b   c   d
# 1  4  5   6   7
# 2  8  9  10  11
print(frame.drop(1))  # without the 2nd row
# Sample output:
#    a  b   c   d
# 0  0  1   2   3
# 2  8  9  10  11

# drop() returns a new object with the element removed; `frame` is unchanged.
a_col = frame['a']  # DataFrame (2-D) -> Series (1-D)
print(type(a_col))  # <class 'pandas.core.series.Series'>

# Remove rows based on the values in column 'a'
sunbset_df2 = frame.copy()  # a real copy, so the name is not just an alias
print(sunbset_df2)
# Sample output:
#    a  b   c   d
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11

# Iterate over (index label, value) pairs. drop() expects labels, so the
# positional counter from enumerate() is only right for a default 0..n-1 index.
for i, c in a_col.items():
    print('i=', i, 'c=', c)
    if c < 5:
        sunbset_df2 = sunbset_df2.drop(i)
# Sample output:
# i= 0 c= 0
# i= 1 c= 4
# i= 2 c= 8
print(sunbset_df2)
# Sample output:
#    a  b   c   d
# 2  8  9  10  11
# 3) When there are many columns
iris = pd.read_csv("../data/iris.csv")
print(iris.info())  # info() prints its report and returns None, so "None" is printed too
# Sample output:
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 150 entries, 0 to 149
# Data columns (total 5 columns):
# Sepal.Length    150 non-null float64
# Sepal.Width     150 non-null float64
# Petal.Length    150 non-null float64
# Petal.Width     150 non-null float64
# Species         150 non-null object
# dtypes: float64(4), object(1)
# memory usage: 5.9+ KB
# None
print(type(iris))  # DataFrame
print(iris.columns)
# Sample output:
# Index(['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width',
#        'Species'],
#       dtype='object')

# Extract the column names as a plain Python list.
# (Written without calling list(), an alternative the author had commented out.)
cols = [column for column in iris.columns]
print(cols)
# ['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width', 'Species']

first_name, last_name = cols[0], cols[-1]
print(iris[first_name])  # first column
# Sample output:
# 0      5.1
# 1      4.9
# ...
# 148    6.2
# 149    5.9
print(iris[last_name])  # last column
# Sample output:
# 0         setosa
# 1         setosa
# ...
# 148    virginica
# 149    virginica

# Columns 1-3, spelled out by name
print(iris[['Sepal.Length', 'Sepal.Width', 'Petal.Length']])
# Sample output:
# 0    5.1  3.5  1.4
# 1    4.9  3.0  1.4
# ...
# 149  5.9  3.0  5.1

# Same selection by slicing the name list (recommended)
leading_three = cols[:3]
print(iris[leading_three])
# Sample output:
# 0    5.1  3.5  1.4
# ...
# 149  5.9  3.0  5.1
# [150 rows x 3 columns]

print(iris.head())
# Sample output:
#    Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species
# 0           5.1          3.5           1.4          0.2  setosa
# 1           4.9          3.0           1.4          0.2  setosa
# 2           4.7          3.2           1.3          0.2  setosa
# 3           4.6          3.1           1.5          0.2  setosa
# 4           5.0          3.6           1.4          0.2  setosa

# Columns 1-4 -> x, column 5 -> y
iris_x = iris[cols[:4]]
iris_y = iris[last_name]
print(iris_x.shape)  # (150, 4)  two-dimensional
print(iris_y.shape)  # (150,)    one-dimensional
# 5. Selecting rows and columns together (cf. R: df[row, col1:col3])
#
# The original used DataFrame.ix[row, col], which accepted either positions or
# labels on each axis (label-based whenever the labels were numbers). That
# accessor no longer exists in pandas 1.0+, so each lookup now uses the
# explicit accessor:
#   - DF.loc[row label, column label]      label-based
#   - DF.iloc[row position, col position]  position-based
# Both accept ':' for contiguous ranges.
print('frame')
print(frame)
# Sample output:
#    a  b   c   d
# 0  0  1   2   3
# 1  4  5   6   7
# 2  8  9  10  11

print(frame.loc[1])  # a single key selects a row
# Sample output:
# a    4
# b    5
# c    6
# d    7
# Name: 1, dtype: int32

# The column names are strings, so the integer 2 was a column *position*.
print(frame.iloc[1, 2])  # 2nd row, 3rd column -> 6
print(frame.loc[:, 'd'])  # all of column d
# Sample output:
# 0     3
# 1     7
# 2    11
# Name: d, dtype: int32
print(frame.loc[:, 'b':'c'])  # columns b through c (label slices include both ends)
# Sample output:
#    b   c
# 0  1   2
# 1  5   6
# 2  9  10

print(len(iris))  # number of observations: 150

import numpy as np

# Sampling without replacement: 5 distinct values out of 0..9
idx = np.random.choice(10, 5, replace=False)
print(idx)  # e.g. [3 4 9 0 6]

# 70% of the row positions of iris, again without replacement
idx = np.random.choice(len(iris), int(len(iris) * 0.7),
                       replace=False)
print(idx, len(idx))  # 105 random positions, then the count 105

# idx holds row positions, hence iloc
train_set = iris.iloc[idx, :]
print(train_set.shape)  # (105, 5)
03.Descriptive
# -*- coding: utf-8 -*-
"""
1. DataFrame 요약통계량
2. 변수 간의 상관성 분석
"""
import pandas as pd
product = pd.read_csv('../data/product.csv')
print(product.info())

# Descriptive statistics
summary = product.describe()
print(summary)

# Row / column statistics: axis=0 gives column totals, axis=1 gives row totals
for _axis in (0, 1):
    print(product.sum(axis=_axis))
# Sample output for the column totals:
# a    773
# b    827
# c    817

# Dispersion
print(product.var())  # variance
print(product.std())  # standard deviation

# Frequency of each value in column 'a'
a_cnt = product['a'].value_counts()
print(a_cnt)
# Sample output:
# 3    126
# 4     64
# 2     37
# 1     30
# 5      7

# Distinct values in column 'b'
b_uni = product['b'].unique()
print(b_uni)  # [4 3 2 5 1]

# Correlation between variables (-1 <= r <= 1)
p_corr = product.corr()
print(p_corr)
# Sample output:
#           a         b         c
# a  1.000000  0.499209  0.467145
# b  0.499209  1.000000  0.766853
# c  0.467145  0.766853  1.000000

# Correlation of one pair of columns
ac_corr = product.a.corr(product.c)
print(ac_corr)  # 0.4671449836008965
# Exercise) correlation analysis (r) on columns 1-4 of iris
# This script never loads iris itself, so read it here rather than relying on
# a variable left over from another session.
# NOTE(review): path assumed to sit next to product.csv - confirm.
iris = pd.read_csv('../data/iris.csv')
cols = iris.columns.tolist()  # does not depend on the builtin name `list`
print(cols)  # list of the 5 column names
iris_sub = iris[cols[:4]]
print(iris_sub.corr())
04.merge
# -*- coding: utf-8 -*-
"""
DataFrame merge
"""
import pandas as pd
wdbc = pd.read_csv("../data/wdbc_data.csv")
print(wdbc.info())
# Sample output:
# RangeIndex: 569 entries, 0 to 568
# Data columns (total 32 columns):

cols = wdbc.columns.tolist()
print(cols)

df1 = wdbc[cols[:16]]  # columns 1-16
sid = wdbc['id']       # the id column
# Take an independent copy before adding a column: assigning into a bare
# slice of wdbc triggers pandas' SettingWithCopyWarning.
df2 = wdbc[cols[16:]].copy()  # columns 17-32
df2['id'] = sid
print(df1.shape)  # (569, 16)
print(df2.shape)  # (569, 17)

# 1. Merge the two frames on the id column.
# 'id' is the only column the two halves share, so naming it explicitly
# matches the default behaviour and documents the join key.
df_merge = pd.merge(df1, df2, on='id', how='inner')
print(df_merge.info())
# Sample output:
# <class 'pandas.core.frame.DataFrame'>
# Int64Index: 569 entries, 0 to 568
# Data columns (total 32 columns):

# 2. Glue the frames together column-wise
df1 = wdbc[cols[:16]]  # columns 1-16
df2 = wdbc[cols[16:]]  # columns 17-32
df_merge2 = pd.concat([df1, df2], axis=1)  # side-by-side concatenation
print(df_merge2.info())
# Sample output:
# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 569 entries, 0 to 568
# Data columns (total 32 columns):
05.timeSeries
# -*- coding: utf-8 -*-
"""
시계열 데이터 시각화
1. 날짜형식 수정(다국어 -> 한국어)
2. 시계열 시각화
3. 이동평균 기능
"""
import pandas as pd
from datetime import datetime # 날짜형식 수정
cospi = pd.read_csv("../data/cospi.csv")
print(cospi.info())
# Sample output:
# RangeIndex: 247 entries, 0 to 246
# Data columns (total 6 columns):
# Date      247 non-null object
# Open      247 non-null int64
# High      247 non-null int64
# Low       247 non-null int64
# Close     247 non-null int64
# Volume    247 non-null int64
print(cospi.head())
# Sample first row: 0  26-Feb-16  1180000  1187000  1172000  1172000  176906

# 1. Convert the date format, e.g. 26-Feb-16 -> 2016-02-26
Date = cospi['Date']  # same as cospi.Date
# Parse every "day-abbreviated month-2 digit year" string into a datetime.
kDate = [datetime.strptime(d, "%d-%b-%y") for d in Date]
print(kDate[:10])
cospi['Date'] = kDate  # replace the text column with the parsed dates
print(cospi.head())
# 2. Time-series visualisation
import matplotlib.pyplot as plt

# Trend line for a single column
high_only = cospi.High
high_only.plot(title="Trend line of High column")
plt.show()

# Trend lines for two columns on the same axes
high_and_low = cospi.loc[:, ['High', 'Low']]
high_and_low.plot(title="Trend line of High vs Low")
plt.show()
# 2. Changing the index
print(cospi.index)
# RangeIndex(start=0, stop=247, step=1)

# Use the Date column as the index
new_cospi = cospi.set_index('Date')
print(new_cospi.head())

# Row lookups go through .loc: selecting rows with a partial date string via
# plain df['...'] is not supported by current pandas releases.

# Lookup by year
# NOTE(review): the two statements here were truncated to `'])` in the source;
# reconstructed as year lookups - confirm that these years exist in the data.
print(new_cospi.loc['2016'])
print(new_cospi.loc['2015'])

# Lookup by month
print(new_cospi.loc['2016-02'])

# Lookup by range
print(new_cospi.loc['2016-02':'2016-01'])

new_cospi_HL = new_cospi[['High', 'Low']]
# NOTE(review): the key on this line was also truncated; '2016' is assumed.
new_cospi_HL.loc['2016'].plot(title="title")
plt.show()
new_cospi_HL.loc['2016-02'].plot(title="title")
plt.show()
# 3. Moving averages over 5, 10 and 20 rows of the High column
high_prices = new_cospi.High

roll_mean5 = high_prices.rolling(window=5, center=False).mean()
print(roll_mean5)
roll_mean10 = high_prices.rolling(window=10, center=False).mean()
roll_mean20 = high_prices.rolling(window=20, center=False).mean()

# Plot the raw series and the three moving averages on the same axes
high_prices.plot(color='orange', label='High column')
roll_mean5.plot(color='red', label='5day rolling mean')
roll_mean10.plot(color='green', label='10day rolling mean')
roll_mean20.plot(color='blue', label='20day rolling mean')
plt.legend(loc='best')
plt.show()