# Load the required packages
library(ggplot2)
library(cluster)
library(reshape)
library(clusterCrit)
# Data preprocessing and exploration: the Aggregation dataset
dataAggregatione <- read.csv("http://cs.joensuu.fi/sipu/datasets/Aggregation.txt",
                             sep = "\t", header = FALSE)
# Drop the label column (V3) and standardise the two coordinates
dataAggregationeScaled <- scale(dataAggregatione[, -3])
dataAggregatione <- data.frame(dataAggregationeScaled,
                               name = as.character(c(1:nrow(dataAggregationeScaled))))
rownames(dataAggregatione) <- dataAggregatione$name
ggplot(dataAggregatione, aes(V1, V2)) + geom_point()
# The Spiral dataset, prepared in the same way
dataSpiral <- read.csv("http://cs.joensuu.fi/sipu/datasets/spiral.txt",
                       sep = "\t", header = FALSE)
dataSpiralScaled <- scale(dataSpiral[, -3])
dataSpiral <- data.frame(dataSpiralScaled,
                         name = as.character(c(1:nrow(dataSpiralScaled))))
rownames(dataSpiral) <- dataSpiral$name
ggplot(dataSpiral, aes(V1, V2)) + geom_point()
How do we choose the number of clusters K when applying k-means? Here we try K from 2 to 50 and score each partition with three internal validation indices: Dunn, Calinski-Harabasz and Silhouette.
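Before looping over all candidate K, it helps to see the clusterCrit API in isolation: intCriteria() scores a single partition with one or more internal indices, and bestCriterion() returns the position of the best value in a vector of scores for a given criterion. A minimal sketch (cl3 is only an illustrative name):

# Score one k-means partition with all three indices at once
cl3 <- kmeans(dataAggregatione[, c(1, 2)], centers = 3)
intCriteria(as.matrix(dataAggregatione[, c(1, 2)]), cl3$cluster,
            c("Dunn", "Calinski_Harabasz", "Silhouette"))
# bestCriterion() knows whether each index should be maximised or minimised,
# e.g. bestCriterion(c(0.41, 0.55, 0.32), "Silhouette") returns 2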
# Choose the best K for each validation criterion: the Aggregation dataset
set.seed(1234)
# Initialise a matrix that will hold one row of scores per candidate K
vals <- matrix(rep(NA, 49 * 3), ncol = 3,
               dimnames = list(c(), c("Dunn", "Calinski_Harabasz", "Silhouette")))
# Run k-means for every K and score the resulting partition
for (k in 2:50) {
    cl <- kmeans(dataAggregatione[, c(1, 2)], k)
    vals[(k - 1), 1] <- as.numeric(intCriteria(as.matrix(dataAggregatione[, c(1, 2)]),
                                               cl$cluster, "Dunn"))
    vals[(k - 1), 2] <- as.numeric(intCriteria(as.matrix(dataAggregatione[, c(1, 2)]),
                                               cl$cluster, "Calinski_Harabasz"))
    vals[(k - 1), 3] <- as.numeric(intCriteria(as.matrix(dataAggregatione[, c(1, 2)]),
                                               cl$cluster, "Silhouette"))
}
vals <- data.frame(K = c(2:50), vals)
# Pick the K with the best value of each criterion
choosen_k <- matrix(c(vals[bestCriterion(vals[, 2], "Dunn"), "K"],
                      vals[bestCriterion(vals[, 3], "Calinski_Harabasz"), "K"],
                      vals[bestCriterion(vals[, 4], "Silhouette"), "K"]),
                    ncol = 3,
                    dimnames = list(c("Aggregation"),
                                    c("Dunn", "Calinski_Harabasz", "Silhouette")))
choosen_k
Notice that different validation criteria select different values of K.
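One way to see this is to plot all three criteria against K. The reshape package loaded at the start can melt the vals data frame from the chunk above into long format for a faceted plot; a quick sketch (valsLong is an illustrative name):

# Melt the scores into long format and draw one panel per criterion
valsLong <- melt(vals, id.vars = "K")
ggplot(valsLong, aes(K, value)) +
    geom_line() +
    facet_wrap(~ variable, scales = "free_y")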
# The same procedure for the Spiral dataset
set.seed(1234)
vals <- matrix(rep(NA, 49 * 3), ncol = 3,
               dimnames = list(c(), c("Dunn", "Calinski_Harabasz", "Silhouette")))
for (k in 2:50) {
    cl <- kmeans(dataSpiral[, c(1, 2)], k)
    vals[(k - 1), 1] <- as.numeric(intCriteria(as.matrix(dataSpiral[, c(1, 2)]),
                                               cl$cluster, "Dunn"))
    vals[(k - 1), 2] <- as.numeric(intCriteria(as.matrix(dataSpiral[, c(1, 2)]),
                                               cl$cluster, "Calinski_Harabasz"))
    vals[(k - 1), 3] <- as.numeric(intCriteria(as.matrix(dataSpiral[, c(1, 2)]),
                                               cl$cluster, "Silhouette"))
}
vals <- data.frame(K = c(2:50), vals)
choosen_k <- matrix(c(vals[bestCriterion(vals[, 2], "Dunn"), "K"],
                      vals[bestCriterion(vals[, 3], "Calinski_Harabasz"), "K"],
                      vals[bestCriterion(vals[, 4], "Silhouette"), "K"]),
                    ncol = 3,
                    dimnames = list(c("Spiral"),
                                    c("Dunn", "Calinski_Harabasz", "Silhouette")))
choosen_k
# Visualise the k-means result on the Aggregation dataset (K = 3)
kmeansResultsAggreation <- kmeans(x = dataAggregatione[, c(1, 2)], centers = 3)$cluster
dataAggregatione$clusterSimpleKmeans <- as.character(kmeansResultsAggreation)
ggplot(dataAggregatione, aes(V1, V2)) +
    geom_point(aes(colour = clusterSimpleKmeans)) +
    theme(legend.position = "none")
As the plot shows, k-means fails to separate some of the groups (the region marked with a red circle in the figure).
# Visualise the k-means result on the Spiral dataset (K = 37)
kmeansResultsSpiral <- kmeans(x = dataSpiral[, c(1, 2)], centers = 37)$cluster
dataSpiral$clusterSimpleKmeans <- as.character(kmeansResultsSpiral)
ggplot(dataSpiral, aes(V1, V2)) +
    geom_point(aes(colour = clusterSimpleKmeans)) +
    theme(legend.position = "none")
The Evidence Accumulation Clustering (EAC) algorithm. EAC is an ensemble method: run k-means many times with a randomly chosen K, record for every pair of points how often they land in the same cluster (the co-association matrix), and then cut a hierarchical clustering of that matrix to obtain the final partition.
# The ensemble approach: build a co-association matrix from many k-means runs
createCoAssocMatrix <- function(Iter, rangeK, dataSet) {
    nV <- dim(dataSet)[1]
    CoAssoc <- matrix(rep(0, nV * nV), nrow = nV)
    for (j in 1:Iter) {
        # each run uses a K drawn at random from rangeK
        jK <- sample(c(rangeK[1]:rangeK[2]), 1, replace = FALSE)
        jSpecCl <- kmeans(x = dataSet, centers = jK)$cluster
        CoAssoc_j <- matrix(rep(0, nV * nV), nrow = nV)
        for (i in unique(jSpecCl)) {
            indVenues <- which(jSpecCl == i)
            # points placed in the same cluster gain 1/Iter of co-association
            CoAssoc_j[indVenues, indVenues] <- CoAssoc_j[indVenues, indVenues] + (1/Iter)
        }
        CoAssoc <- CoAssoc + CoAssoc_j
    }
    return(CoAssoc)
}

eac <- function(Iter, rangeK, dataset, hcMethod = "single") {
    CoAssocSim <- createCoAssocMatrix(Iter, rangeK, dataset)
    # transform the similarity matrix into a distance matrix
    CoAssocDist <- 1 - CoAssocSim
    hclustM <- hclust(as.dist(CoAssocDist), method = hcMethod)
    # cut the dendrogram at the largest jump between merge heights
    cutValue <- hclustM$height[which.max(diff(hclustM$height))]
    return(cutree(hclustM, h = cutValue))
}
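To build intuition for what createCoAssocMatrix() returns, here is a quick sanity check on a made-up toy dataset of two well-separated blobs (toy and coassoc are illustrative names, not part of the analysis above):

# Two blobs of 10 points each
set.seed(1)
toy <- rbind(matrix(rnorm(20, mean = 0), ncol = 2),
             matrix(rnorm(20, mean = 5), ncol = 2))
coassoc <- createCoAssocMatrix(Iter = 50, rangeK = c(2, 6), dataSet = toy)
# Pairs from the same blob co-occur in almost every k-means run (values near 1),
# pairs from different blobs almost never do (values near 0)
round(coassoc[c(1, 2, 11, 12), c(1, 2, 11, 12)], 2)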
# EAC on the Aggregation dataset
set.seed(1234)
EACResults_Aggregatione <- eac(Iter = 200, rangeK = c(2, 50),
                               dataset = dataAggregatione[, c(1, 2)], hcMethod = "single")
table(EACResults_Aggregatione)
dataAggregatione$clusterEAC <- as.character(EACResults_Aggregatione)
ggplot(dataAggregatione, aes(V1, V2)) +
    geom_point(aes(colour = clusterEAC)) +
    theme(legend.position = "none")
# EAC on the Spiral dataset
set.seed(1234)
EACResults_Spiral <- eac(Iter = 200, rangeK = c(2, 50),
                         dataset = dataSpiral[, c(1, 2)], hcMethod = "single")
table(EACResults_Spiral)
dataSpiral$clusterEAC <- as.character(EACResults_Spiral)
ggplot(dataSpiral, aes(V1, V2)) +
    geom_point(aes(colour = clusterEAC)) +
    theme(legend.position = "none")
The plots above show that the EAC algorithm outperforms plain k-means and partitions both datasets much more naturally.
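Because both source files carry ground-truth labels in their third column (dropped during preprocessing), the visual impression can also be checked with an external index such as the Rand index from clusterCrit::extCriteria. A sketch for the Spiral data, assuming the rows are still in the original file order:

# Re-read the true labels and cross-tabulate them against the EAC partition
trueSpiral <- read.csv("http://cs.joensuu.fi/sipu/datasets/spiral.txt",
                       sep = "\t", header = FALSE)$V3
table(EAC = EACResults_Spiral, truth = trueSpiral)
extCriteria(as.integer(EACResults_Spiral), as.integer(trueSpiral), "Rand")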