【转载】Data Clustering Using Evidence Accumulation

#加载包
library(ggplot2)
library(cluster)
library(reshape)
library(clusterCrit)
【转载】Data Clustering Using Evidence Accumulation
# Data preprocessing and exploration: Aggregation data set
# Read the tab-separated file without a header row, so columns get the default names V1, V2, ...
dataAggregatione <- read.csv("http://cs.joensuu.fi/sipu/datasets/Aggregation.txt", sep = "\t", header = FALSE)
# Drop the third column (presumably the reference label -- confirm against the data set)
# and standardise the remaining columns with scale().
dataAggregationeScaled <- scale(dataAggregatione[, -3])
# Add a character id per row; seq_len() avoids the 1:0 pitfall of 1:nrow() on an empty table.
dataAggregatione <- data.frame(dataAggregationeScaled, name = as.character(seq_len(nrow(dataAggregationeScaled))))
rownames(dataAggregatione) <- dataAggregatione$name
# Scatter plot of the two scaled coordinates.
ggplot(dataAggregatione, aes(V1, V2)) + geom_point()
【转载】Data Clustering Using Evidence Accumulation

【转载】Data Clustering Using Evidence Accumulation

# Data preprocessing and exploration: Spiral data set
# Read the tab-separated file without a header row, so columns get the default names V1, V2, ...
dataSpiral <- read.csv("http://cs.joensuu.fi/sipu/datasets/spiral.txt", sep = "\t", header = FALSE)
# Drop the third column (presumably the reference label -- confirm against the data set)
# and standardise the remaining columns with scale().
dataSpiralScaled <- scale(dataSpiral[, -3])
# Add a character id per row; seq_len() avoids the 1:0 pitfall of 1:nrow() on an empty table.
dataSpiral <- data.frame(dataSpiralScaled, name = as.character(seq_len(nrow(dataSpiralScaled))))
rownames(dataSpiral) <- dataSpiral$name
# Scatter plot of the two scaled coordinates.
ggplot(dataSpiral, aes(V1, V2)) + geom_point()

【转载】Data Clustering Using Evidence Accumulation

应用 k-means 算法进行聚类时,如何确定聚类簇数 K?这里让 K 在 2–50 之间取值,并选用三种内部评价指标来衡量聚类质量:Dunn、Calinski-Harabasz 和 Silhouette。

【转载】Data Clustering Using Evidence Accumulation
# Pick the best K according to internal clustering criteria
# Aggregation data set
set.seed(1234)
# The two coordinate columns as a matrix, converted once and reused for every K.
aggregationXY <- as.matrix(dataAggregatione[, c(1, 2)])
# Result table: one row per candidate K (2..50), one column per criterion.
vals <- matrix(NA, nrow = 49, ncol = 3,
               dimnames = list(NULL, c("Dunn", "Calinski-Harabasz", "Silhouette")))
# Cluster with every K and record the three criteria (row k - 1 holds the scores for K = k).
for (k in 2:50) {
    cl <- kmeans(aggregationXY, k)
    vals[k - 1, "Dunn"] <- intCriteria(aggregationXY, cl$cluster, "Dunn")[[1]]
    vals[k - 1, "Calinski-Harabasz"] <- intCriteria(aggregationXY, cl$cluster, "Calinski_Harabasz")[[1]]
    vals[k - 1, "Silhouette"] <- intCriteria(aggregationXY, cl$cluster, "Silhouette")[[1]]
}
# data.frame() rewrites the hyphenated column name to "Calinski.Harabasz".
vals <- data.frame(K = 2:50, vals)
# For each criterion, look up the K whose score bestCriterion() selects.
choosen_k <- matrix(
    c(vals$K[bestCriterion(vals$Dunn, "Dunn")],
      vals$K[bestCriterion(vals$Calinski.Harabasz, "Calinski_Harabasz")],
      vals$K[bestCriterion(vals$Silhouette, "Silhouette")]),
    ncol = 3,
    dimnames = list("Aggregation", c("Dunn", "Calinski_Harabasz", "Silhouette"))
)
choosen_k
【转载】Data Clustering Using Evidence Accumulation

【转载】Data Clustering Using Evidence Accumulation

可以发现,采用不同的评价标准,选出的最优 K 值并不一样。

【转载】Data Clustering Using Evidence Accumulation
# Spiral data set
set.seed(1234)
# The two coordinate columns as a matrix, converted once and reused for every K.
spiralXY <- as.matrix(dataSpiral[, c(1, 2)])
# Result table: one row per candidate K (2..50), one column per criterion.
vals <- matrix(NA, nrow = 49, ncol = 3,
               dimnames = list(NULL, c("Dunn", "Calinski-Harabasz", "Silhouette")))
# Cluster with every K and record the three criteria (row k - 1 holds the scores for K = k).
for (k in 2:50) {
    cl <- kmeans(spiralXY, k)
    vals[k - 1, "Dunn"] <- intCriteria(spiralXY, cl$cluster, "Dunn")[[1]]
    vals[k - 1, "Calinski-Harabasz"] <- intCriteria(spiralXY, cl$cluster, "Calinski_Harabasz")[[1]]
    vals[k - 1, "Silhouette"] <- intCriteria(spiralXY, cl$cluster, "Silhouette")[[1]]
}
# data.frame() rewrites the hyphenated column name to "Calinski.Harabasz".
vals <- data.frame(K = 2:50, vals)
# For each criterion, look up the K whose score bestCriterion() selects.
choosen_k <- matrix(
    c(vals$K[bestCriterion(vals$Dunn, "Dunn")],
      vals$K[bestCriterion(vals$Calinski.Harabasz, "Calinski_Harabasz")],
      vals$K[bestCriterion(vals$Silhouette, "Silhouette")]),
    ncol = 3,
    dimnames = list("Spiral", c("Dunn", "Calinski_Harabasz", "Silhouette"))
)
choosen_k
【转载】Data Clustering Using Evidence Accumulation

【转载】Data Clustering Using Evidence Accumulation

# Visualise the plain k-means result on the Aggregation data (3 centers).
kmeansResultsAggreation <- kmeans(x = dataAggregatione[, c(1, 2)], centers = 3)$cluster
# Store the labels as character so the colour aesthetic is treated as discrete.
dataAggregatione$clusterSimpleKmeans <- as.character(kmeansResultsAggreation)
# opts() has been removed from ggplot2; theme() is its replacement for hiding the legend.
ggplot(dataAggregatione, aes(V1, V2)) + geom_point(aes(colour = clusterSimpleKmeans)) + theme(legend.position = "none")

从结果可以看到,k-means 没有把各个簇正确地划分开(见红色圆圈标注处)。

【转载】Data Clustering Using Evidence Accumulation

# Visualise the plain k-means result on the Spiral data (37 centers).
kmeansResultsSpiral <- kmeans(x = dataSpiral[, c(1, 2)], centers = 37)$cluster
# Store the labels as character so the colour aesthetic is treated as discrete.
dataSpiral$clusterSimpleKmeans <- as.character(kmeansResultsSpiral)
# opts() has been removed from ggplot2; theme() is its replacement for hiding the legend.
ggplot(dataSpiral, aes(V1, V2)) + geom_point(aes(colour = clusterSimpleKmeans)) + theme(legend.position = "none")

【转载】Data Clustering Using Evidence Accumulation

Evidence Accumulation Clustering算法

【转载】Data Clustering Using Evidence Accumulation
# Ensemble approach
# Build the co-association (similarity) matrix of an ensemble of k-means runs.
#
# Arguments:
#   Iter    - number of k-means runs in the ensemble (at least 1).
#   rangeK  - c(first, last): the number of clusters of each run is drawn at
#             random from first:last.
#   dataSet - matrix or data frame with one observation per row, as accepted
#             by kmeans().
#
# Returns an nV x nV matrix (nV = number of rows of dataSet) whose [a, b]
# entry is the fraction of runs in which observations a and b received the
# same cluster label.
createCoAssocMatrix <- function(Iter, rangeK, dataSet) {
    stopifnot(is.numeric(Iter), length(Iter) == 1, Iter >= 1)
    stopifnot(is.numeric(rangeK), length(rangeK) >= 2)
    nV <- nrow(dataSet)
    candidateK <- rangeK[1]:rangeK[2]
    togetherCount <- matrix(0, nrow = nV, ncol = nV)
    for (j in seq_len(Iter)) {
        # Index into candidateK instead of calling sample(candidateK, 1):
        # sample() treats a single number k as 1:k, so rangeK = c(5, 5)
        # would otherwise draw K from 1..5 instead of always using 5.
        jK <- candidateK[sample.int(length(candidateK), 1)]
        # unname() keeps row names out of the result's dimnames.
        jSpecCl <- unname(kmeans(x = dataSet, centers = jK)$cluster)
        # TRUE wherever two observations share a label in this run.
        togetherCount <- togetherCount + outer(jSpecCl, jSpecCl, "==")
    }
    # Dividing once at the end keeps the diagonal at exactly 1.
    return(togetherCount / Iter)
}

# Evidence Accumulation Clustering: hierarchical clustering of the k-means
# co-association matrix.
#
# Arguments:
#   Iter, rangeK, dataset - forwarded unchanged to createCoAssocMatrix().
#   hcMethod              - passed to hclust() as `method` (default "single").
#
# Returns the cutree() assignment obtained by cutting the tree at the merge
# height located directly below the largest jump between consecutive heights.
eac <- function(Iter, rangeK, dataset, hcMethod = "single") {
    # Co-association is a similarity; 1 - similarity turns it into a distance.
    dissimilarity <- as.dist(1 - createCoAssocMatrix(Iter, rangeK, dataset))
    tree <- hclust(dissimilarity, method = hcMethod)
    mergeHeights <- tree$height
    # Position of the widest gap between successive merge heights.
    widestGap <- which.max(diff(mergeHeights))
    cutree(tree, h = mergeHeights[widestGap])
}
【转载】Data Clustering Using Evidence Accumulation
【转载】Data Clustering Using Evidence Accumulation
# EAC on the Aggregation data set
set.seed(1234)
# 200 k-means runs with K drawn from 2..50, combined through single-linkage clustering.
EACResults_Aggregatione <- eac(Iter = 200, rangeK = c(2, 50), dataset = dataAggregatione[, c(1, 2)], hcMethod = "single")
# Cluster sizes.
table(EACResults_Aggregatione)
# Store the labels as character so the colour aesthetic is treated as discrete.
dataAggregatione$clusterEAC <- as.character(EACResults_Aggregatione)
# opts() has been removed from ggplot2; theme() is its replacement for hiding the legend.
ggplot(dataAggregatione, aes(V1, V2)) + geom_point(aes(colour = clusterEAC)) + theme(legend.position = "none")
【转载】Data Clustering Using Evidence Accumulation

【转载】Data Clustering Using Evidence Accumulation

【转载】Data Clustering Using Evidence Accumulation
# EAC on the Spiral data set
set.seed(1234)
# 200 k-means runs with K drawn from 2..50, combined through single-linkage clustering.
EACResults_Spiral <- eac(Iter = 200, rangeK = c(2, 50), dataset = dataSpiral[, c(1, 2)], hcMethod = "single")
# Cluster sizes.
table(EACResults_Spiral)
# Store the labels as character so the colour aesthetic is treated as discrete.
dataSpiral$clusterEAC <- as.character(EACResults_Spiral)
# opts() has been removed from ggplot2; theme() is its replacement for hiding the legend.
ggplot(dataSpiral, aes(V1, V2)) + geom_point(aes(colour = clusterEAC)) + theme(legend.position = "none")
【转载】Data Clustering Using Evidence Accumulation

【转载】Data Clustering Using Evidence Accumulation

从上图可以看出,EAC(Evidence Accumulation Clustering)算法的效果比 k-means 更好,能够较好地对数据进行划分。

【转载】Data Clustering Using Evidence Accumulation

上一篇:点点守护为什么建议安卓手机别开root权限?


下一篇:大数据与云计算方向路线图