from numpy import *
from numpy.linalg import linalg
from numpy.ma import mean, argsort, shape
import pandas as pd
import numpy as np
def loadDataSet(fileName, delim='\t'):
fr = open(fileName)
stringArr = [line.strip().split(delim) for line in fr.readlines()]#
datArr = [list(map(float, line)) for line in stringArr]
return mat(datArr)#将datArr转换为矩阵
def pca(dataMat, topNfeat=9999999):
meanVals = mean(list(dataMat), axis=0)#计算平均值
meanRemoved = dataMat - meanVals#减去原始数据的平均值
covMat = cov(meanRemoved, rowvar=0)#计算协方差矩阵
eigVals,eigVects = linalg.eig(mat(covMat))#获得特征值和特征向量
eigValInd = argsort(eigVals)#对特征值进行从小到大的排序
eigValInd = eigValInd[:-(topNfeat+1):-1]
redEigVects = eigVects[:,eigValInd]
lowDDataMat = meanRemoved * redEigVects#将数据转换到新空间
reconMat = (lowDDataMat * redEigVects.T) + meanVals
return lowDDataMat, reconMat
dataMat = loadDataSet('testSet.txt')
lowDMat, reconMAT = pca(dataMat, 1)
print(shape(lowDMat))
报错:ValueError: operands could not be broadcast together with shapes (1000,2) (2,1)
操作数不能与形如(1000,2)(2,1)一起广播
与书里面的对照看了下,可能是函数导入错误,重新导了下函数,顺利通过。
import numpy as np
def loadDataSet(fileName, delim='\t'):
fr = open(fileName)
stringArr = [line.strip().split(delim) for line in fr.readlines()]#
datArr = [list(map(float, line)) for line in stringArr]
return np.mat(datArr)#将datArr转换为矩阵
def pca(dataMat, topNfeat=9999999):
meanVals = np.mean(list(dataMat), axis=0)#计算平均值
meanRemoved = dataMat - meanVals#减去原始数据的平均值
covMat = np.cov(meanRemoved, rowvar=0)#计算协方差矩阵
eigVals,eigVects = np.linalg.eig(np.mat(covMat))#获得特征值和特征向量
eigValInd = np.argsort(eigVals)#对特征值进行从小到大的排序
eigValInd = eigValInd[:-(topNfeat+1):-1]
redEigVects = eigVects[:,eigValInd]
lowDDataMat = meanRemoved * redEigVects#将数据转换到新空间
reconMat = (lowDDataMat * redEigVects.T) + meanVals
return lowDDataMat, reconMat
dataMat = loadDataSet('testSet.txt')
lowDMat, reconMAT = pca(dataMat, 1)
print(np.shape(lowDMat))
输出:(1000, 1)