注:可直接看方法解析和应用部分,其余部分为笔者的推导详解。
目录
方法解析
主成分分析就是把多个指标转化为少数几个指标,这少数几个指标能尽可能反映原来指标的作用,以达到降维的统计目的。
有p个指标X1,X2,...Xp,用k个指标Y1,Y2,...Yk来代替它们,其中,Yi=a1*X1+...+ap*Xp,并且,Y的方差要尽可能大,以尽可能最大限度反映原来指标作用:
,
且限制向量a的长度为1,若不限制,用增加a的长度来增大方差没有意义。
可看出上述方差等于单位特征向量下的特征值,可通过求X协方差矩阵V正交变换下的特征值矩阵得到,特征值越大,方差越大,而a等于特征值对应的单位特征向量,且各不同特征值对应的特征向量线性无关,即可得到线性无关的新指标Y。
把特征值按从大到小顺序排列,计算主成分的累计贡献率:(也可以用方差来求累计贡献率)
其中m<=k
取前m个主成分,且累计贡献率超过85%即可。
在实际运用中,若遇到p个指标的量纲不尽相同或取值彼此差异很大的问题,可将各指标进行标准化,即:
且此时的协方差矩阵为相关矩阵(可省去数据标准化过程)
可得主成分分析的步骤为:
1.数据标准化(本文采用样本标准差进行标准化)
2.求协方差矩阵及特征值矩阵和特征向量正交矩阵
3.修剪得到累积贡献率超过85%的特征向量矩阵
4.修剪后的特征向量矩阵转置与原始数据相乘得到降维后的数据
python实现
数据模拟
import pandas as pd
import numpy as np
import random
english=np.linspace(1,1,50)
math=np.linspace(1,1,50)
insurance=np.linspace(1,1,50)
politics=np.linspace(1,1,50)
for i in range(len(english)):
english[i]=random.randint(70,90)
math[i]=random.randint(100,150)
insurance[i] = random.randint(70, 90)
politics[i] = random.randint(100, 150)
names=["english","math","insurance","politics"]
data1=pd.DataFrame([english,math,insurance,politics],index=names)
data=pd.DataFrame(data1.values.T,columns=names)
data.to_excel('D:/CSDN/主成分分析/python/pythondata.xlsx',
index=None)
数据标准化
data=pd.read_excel('D:/CSDN/主成分分析/python/pythondata.xlsx')
def standard(data):
p = np.shape(data)[0] # 求行数
n = np.shape(data)[1] # 求列数
t = np.ones((p, n)) # 生成数组盒子
for i in range(n):
a = data.iloc[:, i] # 取第i列
m = np.mean(a) # 求平均值
s = np.std(a, ddof=1) # 求样本标准差
for k in range(len(a)):
t[k][i] = (a[k] - m) / s
names = data.columns.values.tolist() # 生成列名称的列表
y = pd.DataFrame(t, columns=names) # 列表整理
return y
d = standard(data)
d.to_excel('D:/CSDN/主成分分析/python/standarddata.xlsx',
index=None)
求协方差矩阵及特征值和特征向量正交矩阵
d = pd.read_excel('D:/CSDN/主成分分析/python/standarddata.xlsx')
def eigs(d):
covmatrix = np.cov(d, rowvar=False) # 求协方差矩阵
eig1, eigvec = np.linalg.eig(covmatrix) # 求特征值和特征向量的矩阵
eig = np.mat(eig1) # 数组转矩阵
p = np.shape(eigvec)[0] # 求行数
n = np.shape(eigvec)[1] # 求列数
t = np.ones((p, n)) # 生成数组盒子
for i in range(n): # 特征向量矩阵正交化
s = np.linspace(0, 0, p + 1) # 生成数组盒子
for k in range(p): # 求得向量平方和
s[k + 1] = s[k] + eigvec[k][i] ** 2
sq = np.sqrt(s[p])
for j in range(p): # 求得正交矩阵
t[j][i] = eigvec[j][i] / sq
eigvectors = np.mat(t)
return covmatrix, eig, eigvectors
covmatrix, eig, eigvectors = eigs(d)
print(covmatrix)
print(eig)
print(eigvectors)
可得,协方差矩阵为:
[[ 1. -0.09610392 0.06312812 -0.13389414]
[-0.09610392 1. 0.0368002 -0.18039632]
[ 0.06312812 0.0368002 1. 0.02086925]
[-0.13389414 -0.18039632 0.02086925 1. ]]
特征值向量为:
[0.70794822 1.18923252 1.09923045 1.00358881]
特征向量正交矩阵为:
[[-0.51220391 -0.26305615 0.79945362 -0.17123822]
[-0.56908617 -0.60052403 -0.51339355 0.22790103]
[ 0.22547272 -0.12236595 0.30093053 0.91849303]
[-0.60245343 0.74511672 -0.08210838 0.27406049]]
修剪得到累积贡献率超过85%的特征值向量和特征向量矩阵
def cut(eig, eigvectors):
e = np.array(eig)[0] # 向量转为一维数组
w = sorted(e,reverse=True) # 降序排列
l = np.linspace(0, 0, len(w) + 1)
for j in range(len(w)):
l[j + 1] = l[j] + w[j] / sum(w)
for k in range(len(w) + 1):
if l[k] >= 0.85:
q = int(k) # 获得需要留下的特征向量个数
break
ma = np.linspace(0, 0, q)
for c in range(q):
ma[c] = w[c]
eigcut = np.mat(ma) # 获得需要的特征值向量
r = np.argsort(-e) # 得到降序排列的下标
h = np.array(eigvectors) # 矩阵转数组
p = np.shape(h)[0] # 求行数
t = np.ones((p, q)) # 生成数组盒子
f = 0
for i in r:
a = h[:, i] # 取数组第i列
for x in range(len(a)):
t[x][f] = a[x]
f = f + 1
if f == q:
break
eigvectorscut = np.mat(t) # 获得需要的特征向量矩阵
return eigcut, eigvectorscut
eigcut, eigvectorscut = cut(eig, eigvectors)
print(eigcut)
print(eigvectorscut)
修剪后的特征值向量为:
[[1.18923252 1.09923045 1.00358881 0.70794822]]
修剪后的特征向量矩阵为:
[[-0.26305615 0.79945362 -0.17123822 -0.51220391]
[-0.60052403 -0.51339355 0.22790103 -0.56908617]
[-0.12236595 0.30093053 0.91849303 0.22547272]
[ 0.74511672 -0.08210838 0.27406049 -0.60245343]]
修剪后的特征向量与原始数据相乘得到降维后的数据
def mult(eigvectorscut,d):
q = np.shape(eigvectorscut)[1] # 求列数
p = np.shape(d)[0] # 求行数
t = np.ones((p, q)) # 生成数组盒子
dm = np.mat(d) # 列表转矩阵
for i in range(q):
a = eigvectorscut[:, i] # 取特征向量矩阵第i列
g = np.array(dm * a) # 获得第i个主成分并转化为数组
for j in range(len(g)):
t[j][i] = g[j]
new = pd.DataFrame(t)
return new
newdata = mult(eigvectorscut, d)
newdata.to_excel('D:/CSDN/主成分分析/python/newdata.xlsx',
index=None)
完整代码
import pandas as pd
import numpy as np
import random
english=np.linspace(1,1,50)
math=np.linspace(1,1,50)
insurance=np.linspace(1,1,50)
politics=np.linspace(1,1,50)
for i in range(len(english)):
english[i]=random.randint(70,90)
math[i]=random.randint(100,150)
insurance[i] = random.randint(70, 90)
politics[i] = random.randint(100, 150)
names=["english","math","insurance","politics"]
data1=pd.DataFrame([english,math,insurance,politics],index=names)
data=pd.DataFrame(data1.values.T,columns=names)
data.to_excel('D:/CSDN/主成分分析/python/pythondata.xlsx',
index=None)
data=pd.read_excel('D:/CSDN/主成分分析/python/pythondata.xlsx')
def standard(data):
p = np.shape(data)[0] # 求行数
n = np.shape(data)[1] # 求列数
t = np.ones((p, n)) # 生成数组盒子
for i in range(n):
a = data.iloc[:, i] # 取列表第i列
m = np.mean(a) # 求平均值
s = np.std(a, ddof=1) # 求样本标准差
for k in range(len(a)):
t[k][i] = (a[k] - m) / s
names = data.columns.values.tolist() # 生成列名称的列表
y = pd.DataFrame(t, columns=names) # 列表整理
return y
d = standard(data)
d.to_excel('D:/CSDN/主成分分析/python/standarddata.xlsx',
index=None)
d = pd.read_excel('D:/CSDN/主成分分析/python/standarddata.xlsx')
def eigs(d):
covmatrix = np.cov(d, rowvar=False) # 求协方差矩阵
eig1, eigvec = np.linalg.eig(covmatrix) # 求特征值和特征向量的矩阵
eig=np.mat(eig1)#数组转矩阵
p = np.shape(eigvec)[0] # 求行数
n = np.shape(eigvec)[1] # 求列数
t = np.ones((p, n)) # 生成数组盒子
for i in range(n): # 特征向量矩阵正交化
s = np.linspace(0, 0, p + 1) # 生成数组盒子
for k in range(p): # 求得向量平方和
s[k + 1] = s[k] + eigvec[k][i] ** 2
sq = np.sqrt(s[p])
for j in range(p): # 求得正交矩阵
t[j][i] = eigvec[j][i] / sq
eigvectors = np.mat(t)
return covmatrix, eig, eigvectors#分别为协方差矩阵,特征值矩阵,特征向量矩阵
covmatrix, eig, eigvectors = eigs(d)
print(covmatrix)
print(eig)
print(eigvectors)
def cut(eig, eigvectors):
e = np.array(eig)[0] # 向量转为一维数组
w = sorted(e,reverse=True) # 降序排列
l = np.linspace(0, 0, len(w) + 1)
for j in range(len(w)):
l[j + 1] = l[j] + w[j] / sum(w)
for k in range(len(w) + 1):
if l[k] >= 0.85:
q = int(k) # 获得需要留下的特征向量个数
break
ma = np.linspace(0, 0, q)
for c in range(q):
ma[c] = w[c]
eigcut = np.mat(ma) # 获得需要的特征值向量
r = np.argsort(-e) # 得到降序排列的下标
h = np.array(eigvectors) # 矩阵转数组
p = np.shape(h)[0] # 求行数
t = np.ones((p, q)) # 生成数组盒子
f = 0
for i in r:
a = h[:, i] # 取数组第i列
for x in range(len(a)):
t[x][f] = a[x]
f = f + 1
if f == q:
break
eigvectorscut = np.mat(t) # 获得需要的特征向量矩阵
return eigcut, eigvectorscut
eigcut, eigvectorscut = cut(eig, eigvectors)
print(eigcut)
print(eigvectorscut)
def mult(eigvectorscut,d):
q = np.shape(eigvectorscut)[1] # 求列数
p = np.shape(d)[0] # 求行数
t = np.ones((p, q)) # 生成数组盒子
dm = np.mat(d) # 列表转矩阵
for i in range(q):
a = eigvectorscut[:, i] # 取特征向量矩阵第i列
g = np.array(dm * a) # 获得第i个主成分并转化为数组
for j in range(len(g)):
t[j][i] = g[j]
new = pd.DataFrame(t)
return new
newdata = mult(eigvectorscut, d)
newdata.to_excel('D:/CSDN/主成分分析/python/newdata.xlsx',
index=None)
print(newdata)
应用
from sklearn.decomposition import PCA
import pandas as pd
data=pd.read_excel('D:/CSDN/主成分分析/python/pythondata.xlsx')
m=PCA(n_components=0.99)#数值0~0.9999表示主成分方差占比,数值大于等于1表示主成分个数
b=m.fit_transform(data)#导入训练数据,并导出降维后的数据
print(b)
a=m.explained_variance_ratio_#导出各主成分方差占比
print(a)
k=m.components_#导出特征向量矩阵
print(k)
关于sklearn包的PCA具体用法,可以CTRL+点击PCA或参考:
Bobo老师机器学习笔记第七课-sklearn中PCA的用法
PCA降维原理及其代码实现(附加 sklearn PCA用法参数详解)
R实现
数据模拟
english=floor(runif(50,70,90))
math=floor(runif(50,100,150))
insurance=floor(runif(50,70,90))
politicals=floor(runif(50,70,90))
m=cbind(english,math,insurance,politicals)
write.csv(m,"D:/CSDN/主成分分析/r/rdata.csv",row.names=FALSE)
数据标准化
dat=read.csv("D:/CSDN/主成分分析/r/rdata.csv")
standard=function(dat){
p=dim(dat)[1]
q=dim(dat)[2]
h=array(NA,dim = c(p,q))
for (i in 1:q){
a=dat[,i]
n=length(a)
me=mean(a)
std=sqrt(var(english)*n/(n-1))
for (j in 1:n){
h[j,i]=(a[j]-me)/std
}
}
y=data.frame(h)
names(y)=colnames(dat)
return(y)
}
m1=standard(dat)
write.csv(m1,"D:/CSDN/主成分分析/r/standarddata.csv",row.names=FALSE)
求协方差矩阵及特征值和特征向量正交矩阵
dat1=read.csv("D:/CSDN/主成分分析/r/standarddata.csv")
eigs=function(dat1){
covmatrix=cov(dat1)
eig=eigen(covmatrix)$values
eigvec=eigen(covmatrix)$vectors
return(list("covmatrix"=covmatrix,
"eig"=eig,
"eigvec"=eigvec))
}
eigs(dat1)
可得协方差矩阵为:
english math insurance politicals
english 0.98000000 -0.19243790 -0.01920151 -0.25995686
math -0.19243790 8.14783791 0.07170565 0.09153449
insurance -0.01920151 0.07170565 1.10495985 -0.01587398
politicals -0.25995686 0.09153449 -0.01587398 1.17423804
特征值为:
8.155126 1.349682 1.105989 0.796239
特征向量正交矩阵为:
[,1] [,2] [,3] [,4]
[1,] -0.02734422 0.56491144 0.07279386 0.82147941
[2,] 0.99947450 0.02691342 0.01175447 0.01371976
[3,] 0.01020816 0.01704532 -0.99683690 0.07695097
[4,] 0.01410031 -0.82453635 0.02964943 0.56485565
修剪得到累积贡献率超过85%的特征值向量和特征向量矩阵
w=eigs(dat1)
eig=w$eig
eigvec=w$eigvec
cutd=function(eig,eigvec){
a=sort(eig,decreasing = TRUE)
h=array(0,dim = c(1,length(a)+1))
for (i in 1:length(a)){
h[i+1]=h[i]+a[i]/sum(a)
}
for (k in 2:length(h)){
if (h[k]>=0.85){
r=k-1
break
}
}
ma=matrix(NA,nrow = 1,ncol = r)
for (c in 1:r){
ma[c]=a[c]
}
z=order(eig,decreasing = TRUE)
n=dim(eigvec)[1]
t=matrix(NA,nrow = n,ncol = r)
f=1
for (j in z){
b=eigvec[,j]
for (x in 1:length(b)){
t[x,f]=b[x]
}
f=f+1
if (f>r){
break
}
}
return(list("eigcut"=ma,
"eigveccut"=t))
}
cutd(eig,eigvec)
修剪后的特征值向量为:
8.155126 1.349682 1.105989
修剪后的特征向量矩阵为:
[,1] [,2] [,3]
[1,] -0.02734422 0.56491144 0.07279386
[2,] 0.99947450 0.02691342 0.01175447
[3,] 0.01020816 0.01704532 -0.99683690
[4,] 0.01410031 -0.82453635 0.02964943
修剪后的特征向量与原始数据相乘得到降维后的数据
l=cutd(eig,eigvec)
eigveccut=l$eigveccut
mult=function(eigveccut,dat1){
s=dim(dat1)[2]
d=apply(dat1,2,as.numeric)#把数据框中类型为数值型的对象按列抽离出来
dat1m=matrix(d,ncol = s)
p=dim(dat1)[1]
q=dim(eigveccut)[2]
t=array(NA,dim = c(p,q))
for (i in 1:q){
a=matrix(eigveccut[,i],ncol=1)
g=dat1m %*% a
for (j in 1:length(g)){
t[j,i]=g[j]
}
}
new=data.frame(t)
return(list("newdata"=new))
}
k=mult(eigveccut,dat1)
newdata=k$newdata
write.csv(newdata,"D:/CSDN/主成分分析/r/newdata.csv",row.names=FALSE)
完整代码
english=floor(runif(50,70,90))
math=floor(runif(50,100,150))
insurance=floor(runif(50,70,90))
politicals=floor(runif(50,70,90))
m=cbind(english,math,insurance,politicals)
write.csv(m,"D:/CSDN/主成分分析/r/rdata.csv",row.names=FALSE)
dat=read.csv("D:/CSDN/主成分分析/r/rdata.csv")
standard=function(dat){
p=dim(dat)[1]
q=dim(dat)[2]
h=array(NA,dim = c(p,q))
for (i in 1:q){
a=dat[,i]
n=length(a)
me=mean(a)
std=sqrt(var(english)*n/(n-1))
for (j in 1:n){
h[j,i]=(a[j]-me)/std
}
}
y=data.frame(h)
names(y)=colnames(dat)
return(y)
}
m1=standard(dat)
write.csv(m1,"D:/CSDN/主成分分析/r/standarddata.csv",row.names=FALSE)
dat1=read.csv("D:/CSDN/主成分分析/r/standarddata.csv")
eigs=function(dat1){
covmatrix=cov(dat1)
eig=eigen(covmatrix)$values
eigvec=eigen(covmatrix)$vectors
return(list("covmatrix"=covmatrix,
"eig"=eig,
"eigvec"=eigvec))
}
w=eigs(dat1)
eig=w$eig
eigvec=w$eigvec
cutd=function(eig,eigvec){
a=sort(eig,decreasing = TRUE)
h=array(0,dim = c(1,length(a)+1))
for (i in 1:length(a)){
h[i+1]=h[i]+a[i]/sum(a)
}
for (k in 2:length(h)){
if (h[k]>=0.85){
r=k-1
break
}
}
ma=matrix(NA,nrow = 1,ncol = r)
for (c in 1:r){
ma[c]=a[c]
}
z=order(eig,decreasing = TRUE)
n=dim(eigvec)[1]
t=matrix(NA,nrow = n,ncol = r)
f=1
for (j in z){
b=eigvec[,j]
for (x in 1:length(b)){
t[x,f]=b[x]
}
f=f+1
if (f>r){
break
}
}
return(list("eigcut"=ma,
"eigveccut"=t))
}
l=cutd(eig,eigvec)
eigveccut=l$eigveccut
mult=function(eigveccut,dat1){
s=dim(dat1)[2]
d=apply(dat1,2,as.numeric)#把数据框中类型为数值型的对象按列抽离出来
dat1m=matrix(d,ncol = s)
p=dim(dat1)[1]
q=dim(eigveccut)[2]
t=array(NA,dim = c(p,q))
for (i in 1:q){
a=matrix(eigveccut[,i],ncol=1)
g=dat1m %*% a
for (j in 1:length(g)){
t[j,i]=g[j]
}
}
new=data.frame(t)
return(list("newdata"=new))
}
k=mult(eigveccut,dat1)
newdata=k$newdata
write.csv(newdata,"D:/CSDN/主成分分析/r/newdata.csv",row.names=FALSE)
应用
dat=read.csv("D:/CSDN/主成分分析/r/rdata.csv")
ac=princomp(dat,cor=TRUE)#用相关矩阵进行主成分分析
summary(ac)#显示各主成分方差及占比
summary(ac,loadings = TRUE)#显示各主成分方差及占比和特征向量矩阵
predict(ac)#显示变换之后的数据
ao=prcomp(dat)
summary(ao)
predict(ao)
prcomp和princomp都能实现PCA,用法和区别可以直接help或者参考: