ML实战:手动实现PCA降维算法
- 在此手动实现PCA算法,并且应用到神经网络的手写数字识别加速上
代码实现
PCA类
import numpy as np
from sklearn.preprocessing import Normalizer
import sys
class PCA:
    """Hand-written PCA-style dimensionality reduction built on an SVD.

    The training set is first passed through
    ``sklearn.preprocessing.Normalizer`` (default settings).  The projection
    basis is then taken from the left singular vectors of ``x.T @ x / m``,
    where ``m`` is the number of training samples.

    Exactly one of ``features`` / ``rate`` must be supplied:

    * ``features`` -- keep a fixed number of components.
    * ``rate`` -- keep the smallest number of components whose retention
      ratio reaches ``rate``.

    NOTE(review): the data is not mean-centred before the decomposition, so
    the decomposed matrix is a second-moment matrix rather than a covariance
    matrix -- confirm this is intended.

    Attributes:
        x: Normalised training set.
        features: Number of retained components (set by ``fit_rate`` when
            the instance was created with ``rate``).
        rate: Required retention ratio, or ``None``.
        variance: Retention ratio reached by the last fit (0 before any fit).
        u: Projection basis, available after fitting.
        z: Training set projected onto ``u``, available after fitting.
        xapprox: Training set reconstructed from ``z``, available after
            fitting.
    """

    def __init__(self, x, features=None, rate=None):
        """Store the normalised training set and the stopping criterion.

        :param x: training set, one sample per row
        :param features: number of components to keep
        :param rate: minimum retention ratio, in (0, 1]
        :raises ValueError: if neither or both of ``features`` and ``rate``
            are given, or if ``rate`` lies outside (0, 1]
        """
        # Raise instead of print + sys.exit(0): a library class must not
        # terminate the host process, least of all with a success status.
        if features is None and rate is None:
            raise ValueError('You should input a standard!')
        if features is not None and rate is not None:
            raise ValueError('You should not input two standards!')
        if rate is not None and not 0 < rate <= 1:
            raise ValueError('rate must be in the interval (0, 1]')
        transfer = Normalizer()
        self.x = transfer.fit_transform(x)
        self.features = features
        self.rate = rate
        self.variance = 0

    def _basis(self):
        """Return the left singular vectors of ``x.T @ x / m``."""
        m = len(self.x)
        cov = np.matmul(self.x.T, self.x) / m
        U, _, _ = np.linalg.svd(cov)
        return U

    def _project(self, U, features):
        """Project the training set onto the first ``features`` columns of U.

        Updates ``u``, ``z``, ``xapprox`` and ``variance``.
        """
        self.u = U[:, :features]
        self.z = np.matmul(self.u.T, self.x.T)
        self.xapprox = np.matmul(self.u, self.z).T
        residual = self.x - self.xapprox
        # Retention ratio: 1 - (squared reconstruction error / total energy).
        self.variance = 1 - np.sum(residual * residual) / np.sum(self.x * self.x)

    def fit_features(self, features):
        """Fit the reduction keeping a fixed number of components."""
        self._project(self._basis(), features)

    def fit_rate(self):
        """Fit the reduction keeping just enough components to reach ``rate``.

        Components are added one at a time, starting from one, until the
        retention ratio reaches ``self.rate`` or every component is in use.
        """
        # The decomposition does not depend on the component count, so it is
        # computed once instead of once per candidate count.
        U = self._basis()
        max_features = U.shape[1]
        self.features = 0
        # Reset so that a repeated call does not skip the search.
        self.variance = 0
        # The upper bound guarantees termination when ``rate`` is unreachable.
        while self.variance < self.rate and self.features < max_features:
            self.features += 1
            self._project(U, self.features)

    def cost(self, x):
        """Return the retention ratio obtained when reducing data set ``x``.

        ``x`` is normalised in the same way as the training set, projected
        onto ``u`` and reconstructed; the result is one minus the ratio of
        squared reconstruction error to the squared norm of the normalised
        data.  Requires a prior fit.
        """
        transfer = Normalizer()
        x = transfer.fit_transform(x)
        z = np.matmul(self.u.T, x.T)
        xapprox = np.matmul(self.u, z).T
        residual = x - xapprox
        return 1 - np.sum(residual * residual) / np.sum(x * x)

    def predict(self, x):
        """Return data set ``x`` reduced with the fitted basis.

        ``x`` is normalised in the same way as the training set; the result
        has one row per sample.  Requires a prior fit.
        """
        transfer = Normalizer()
        x = transfer.fit_transform(x)
        return np.matmul(self.u.T, x.T).T

    def fit(self):
        """Fit using whichever criterion the instance was created with."""
        if self.features is not None:
            self.fit_features(self.features)
        else:
            self.fit_rate()
主函数
# Use the hand-written PCA on digit recognition with a neural network
from PCA_class import PCA
from sklearn import datasets
import numpy as np
import sys
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
# Demo script: reduce the scikit-learn digits data set with the hand-written
# PCA class, train an MLP classifier on the reduced data, and save a scatter
# plot comparing the predicted and the real labels of the test set.
np.set_printoptions(suppress=True)

# Load the data (once) and split off a small test set.
digits = datasets.load_digits()
x = digits.data
y = digits.target
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.03, random_state=np.random.randint(0, 30))
# 1-based index of each test sample, used as the x axis of the plot.
X = np.arange(1, len(x_test) + 1)

# Fit the PCA parameters on the training set.
pca = PCA(x_train, rate=0.99)
pca.fit()

# Reduce both the training set and the test set.
x_train_pca = pca.predict(x_train)
x_test_pca = pca.predict(x_test)

# Report how the PCA step performed.
print('The original features:', str(len(x_train[0])))
print('After PCA, the features:', str(pca.features))
# cost() returns a fraction; scale it to match the '%' suffix.
print('After using PCA on x_test, x_test also have save ' + str(round(pca.cost(x_test) * 100, 2)) + '%')

# Train the neural network on the reduced data.
# (50,) is a real one-element tuple; (50) is just the integer 50.
clf = MLPClassifier(hidden_layer_sizes=(50,), activation='logistic', solver='adam',
                    learning_rate_init=0.0001, max_iter=3000)
clf.fit(x_train_pca, y_train)
y_predict = clf.predict(x_test_pca)

# Visualise the predictions against the real labels.
plt.figure(figsize=(30, 8), dpi=80)
plt.scatter(X, y_test, label='real', marker='s', color='blue')
plt.scatter(X, y_predict, label='predict', marker='x', color='red')
plt.legend(loc=[1, 0])
plt.grid(True, linestyle='--', alpha=0.5)
plt.yticks(y_test)
plt.xticks(X)
plt.xlabel('index of tests')
plt.ylabel('target')
# Raw string: the backslashes are path separators, not escape sequences.
plt.savefig(r'E:\python\ml\ml by myself\PCA\PCA_myself_on_NeuralNetwork')
sys.exit(0)
结果
PCA性能
应用到神经网络加速后