【问题标题】:Calculating optimal number of clusters with NbClust()(使用 NbClust() 计算最佳簇数)
【发布时间】:2020-06-28 14:25:34
【问题描述】:

我想计算大型数据集的最佳聚类数:17 列和 >80.000 行。

这是我的代码:

1. 路径的定义

setwd("C:/Users/A/Documents/Master BWL/Masterarbeit")

2. 加载所需的包

library(factoextra); library(cluster); library(skmeans); library(mclust); 
library(fpc); library(psda); library(simEd); library (ggpubr);
library(dbscan); library(clustertend); library(MASS); library(devtools);
library(ggbiplot);library(NbClust)

3. 导入 csv 文件

# Read the semicolon-separated CSV (with a header row); keep strings as character.
WKA_ohneJB <- read.csv("WKA_ohneJB_PCA.csv", header=TRUE, sep = ";", stringsAsFactors = FALSE)

# Centre and scale every column using scale()'s defaults.
# NOTE(review): in the posted sample BASKETS_NZ and LOGON_CUST_STEP2 are
# constant; scaling a zero-variance column yields NaN -- confirm on the full data.
WKA_ohneJB_scaled <- scale(WKA_ohneJB)

# Run NbClust with k-means and Manhattan distance for 2 to 7 clusters.
nb <- NbClust(WKA_ohneJB_scaled , distance = "manhattan", min.nc = 2, max.nc = 7, method = "kmeans")
dput(rbind(head(WKA_ohneJB, 10), tail(WKA_ohneJB, 10)))
structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 
821039L, 821040L, 821041L, 821042L, 821043L, 821044L, 821045L, 
821046L, 821047L, 821048L), BASKETS_NZ = c(1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
    LOGONS = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), PIS = c(71L, 39L, 50L, 4L, 
    13L, 4L, 30L, 65L, 13L, 31L, 111L, 33L, 3L, 46L, 11L, 8L, 
    17L, 68L, 65L, 15L), PIS_AP = c(14L, 2L, 4L, 0L, 0L, 0L, 
    1L, 0L, 2L, 1L, 13L, 0L, 0L, 2L, 1L, 0L, 3L, 8L, 0L, 1L), 
    PIS_DV = c(3L, 19L, 4L, 1L, 0L, 0L, 6L, 2L, 2L, 3L, 38L, 
    8L, 0L, 5L, 2L, 0L, 1L, 0L, 3L, 2L), PIS_PL = c(0L, 5L, 8L, 
    2L, 0L, 0L, 0L, 24L, 0L, 6L, 32L, 8L, 0L, 0L, 4L, 0L, 0L, 
    0L, 0L, 0L), PIS_SDV = c(18L, 0L, 11L, 0L, 0L, 0L, 0L, 0L, 
    0L, 1L, 6L, 0L, 0L, 13L, 0L, 0L, 1L, 15L, 1L, 0L), PIS_SHOPS = c(3L, 
    24L, 13L, 3L, 0L, 0L, 6L, 28L, 2L, 11L, 71L, 16L, 2L, 5L, 
    6L, 0L, 1L, 0L, 3L, 2L), PIS_SR = c(19L, 0L, 14L, 0L, 0L, 
    0L, 2L, 23L, 0L, 3L, 6L, 0L, 0L, 20L, 0L, 0L, 3L, 32L, 1L, 
    0L), QUANTITY = c(13L, 2L, 18L, 1L, 14L, 1L, 4L, 2L, 5L, 
    1L, 5L, 2L, 2L, 4L, 1L, 3L, 2L, 8L, 17L, 8L), WKA = c(1L, 
    1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 
    0L, 0L, 1L, 1L), NEW_CUST = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), EXIST_CUST = c(1L, 
    1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 
    1L, 1L, 1L, 1L), WEB_CUST = c(1L, 0L, 0L, 0L, 1L, 1L, 0L, 
    1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), MOBILE_CUST = c(0L, 
    1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    1L, 0L, 1L, 0L), TABLET_CUST = c(0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L), 
    LOGON_CUST_STEP2 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 
    0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(1L, 
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 821039L, 821040L, 821041L, 
821042L, 821043L, 821044L, 821045L, 821046L, 821047L, 821048L
), class = "data.frame")

错误:na.omit(jeu1) 中的错误:找不到对象“多边形”

【问题讨论】:

  • dput(rbind(head(WKA_ohneJB, 10), tail(WKA_ohneJB, 10))) 提供部分数据集可能有助于其他人重现您遇到的问题。
  • @knytt 我提供了部分数据集。感谢您的建议。
  • 使用scale函数的缩放在某些列中创建NaNs,您确定要使用所有列进行聚类吗?
  • @knytt 是的,我想使用所有列。由于这是一个大型数据集,我是否必须在运行 NbClust 之前降低维度?

标签: r cluster-analysis k-means


【解决方案1】:

确定聚类数量的一种简单方法是:在组内平方和图和/或平均轮廓宽度图中寻找“肘部”(拐点)。下面的代码会生成用于检查这两者的简单图形。

为了进行聚类,需要解决缩放后NaNs的问题...

# Drop the columns that must not be scaled before standardising:
# column 1 (X, a row index) and columns 2 and 18 (BASKETS_NZ and
# LOGON_CUST_STEP2), which are constant in the posted sample, so scale()
# would turn them into NaN. The data frame is the question's `WKA_ohneJB`;
# the bare name `data` is not defined here and resolves to a base function.
WKA_ohneJB_scaled <- as.matrix(scale(WKA_ohneJB[, c(-1, -2, -18)]))

#' Draw an elbow (scree) plot of the k-means within-cluster sum of squares.
#'
#' Runs `kmeans()` once for every cluster count from 1 to `max_i` and plots
#' the total within-cluster sum of squares against the number of clusters.
#'
#' @param x Numeric matrix or data frame to cluster; must not contain
#'   NA/NaN/Inf (e.g. NaN columns produced by scaling constant columns).
#' @param max_i Largest number of clusters to try (default 10).
#' @param nstart Number of random starts passed to `kmeans()` (default 20).
#' @return Invisibly, a numeric vector with one `tot.withinss` value per k.
plot_scree_clusters <- function(x, max_i = 10, nstart = 20) {
  x <- as.matrix(x)
  if (!is.numeric(x) || !all(is.finite(x))) {
    stop("`x` must be numeric and free of NA/NaN/Inf values", call. = FALSE)
  }
  stopifnot(length(max_i) == 1, max_i >= 1)

  ks <- seq_len(max_i)
  # One k-means fit per candidate k; vapply avoids growing a vector in a loop.
  wss <- vapply(
    ks,
    function(k) kmeans(x, centers = k, nstart = nstart)$tot.withinss,
    numeric(1)
  )

  plot(ks, wss, type = "b",
       xlab = "Number of Clusters",
       ylab = "Within groups sum of squares")
  invisible(wss)
}

plot_scree_clusters(WKA_ohneJB_scaled)

#' Plot the average silhouette width of PAM clusterings for k = 2..max_i.
#'
#' Fits `cluster::pam()` on `x` for every cluster count from 2 to `max_i`
#' and plots the average silhouette width against the number of clusters.
#' The original body clustered an undefined global (`pc_comp$x`) instead of
#' the argument; the function now uses `x`.
#'
#' @param x Numeric matrix or data frame (or dissimilarity) accepted by
#'   `cluster::pam()`.
#' @param max_i Largest number of clusters to try (default 10); at least 2.
#' @return Invisibly, a numeric vector of average silhouette widths, one per
#'   k in `2:max_i`.
#'
#' NOTE(review): pam() works on pairwise dissimilarities, so it may be
#' impractical for very large inputs; consider `cluster::clara()` there --
#' confirm against the cluster package documentation.
plot_sil_width <- function(x, max_i = 10) {
  stopifnot(length(max_i) == 1, max_i >= 2)

  ks <- seq.int(2, max_i)
  # Silhouette is undefined for a single cluster, hence k starts at 2.
  sw <- vapply(
    ks,
    function(k) cluster::pam(x = x, k = k)$silinfo$avg.width,
    numeric(1)
  )

  plot(ks, sw, type = "b",
       xlab = "Number of Clusters",
       ylab = "Average silhouette width")
  invisible(sw)
}

plot_sil_width(WKA_ohneJB_scaled)

【讨论】:

    【解决方案2】:

    使用 knytt 提到的肘部方法。以下是描述该技术的几个参考资料。

    https://www.r-bloggers.com/finding-optimal-number-of-clusters/

    https://uc-r.github.io/kmeans_clustering#elbow

    另外,请考虑使用 Affinity Propagation(近邻传播)库 apcluster。AP 算法会自动为您确定最佳簇数。请查看下面的简单示例。

    # Install apcluster only when it is missing, so re-running the script
    # does not reinstall the package every time.
    if (!requireNamespace("apcluster", quietly = TRUE)) {
      install.packages("apcluster")
    }

    library("apcluster")

    # Two 2-D Gaussian clouds of 30 points each. The second rnorm() call of
    # each pair previously read rnorm(30.7, .4) / rnorm(30.4, .5): a comma
    # typed as a dot, which silently meant n = 30 with sd = 1.
    c1 <- cbind(rnorm(30, .3, .5), rnorm(30, .7, .4))
    c2 <- cbind(rnorm(30, .7, .4), rnorm(30, .4, .5))
    x1 <- rbind(c1, c2)

    plot(x1, xlab = "", ylab = "", pch = 19, cex = .8)

    # Variant 1: pass the similarity function and let apcluster() build the
    # negative squared-distance matrix from the data itself.
    apresia <- apcluster(negDistMat(r = 2), x1)

    # Variant 2: build the similarity matrix explicitly, then cluster it.
    s1 <- negDistMat(x1, r = 2)
    apres1b <- apcluster(s1)

    # Print the result; the number of clusters is chosen by the algorithm.
    apresia

    # Show the clusters and their exemplars on top of the data.
    plot(apresia, x1)
    

    资源:

    https://cran.r-project.org/web/packages/apcluster/vignettes/apcluster.pdf

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 2017-04-10
      • 2021-06-13
      • 2019-10-26
      • 1970-01-01
      • 2013-05-19
      • 2021-06-26
      • 2019-11-20
      • 1970-01-01
      相关资源
      最近更新 更多