nTrain <- 150
data <- iris[1:nTrain, ]

## Fit k-means for k = 1..kMax and record the total within-cluster sum of squares.
data.kmeans <- list()
kMax <- 20
withinss <- rep(NA, kMax)
for (k in 1:kMax) {
  data.kmeans[[k]] <- kmeans(data[, -5], k, nstart = 30)
  withinss[k] <- data.kmeans[[k]]$tot.withinss
}

## Compare the true labels with the k = 3 clustering.
plot(data[, -5], col = data$Species)
plot(data[, -5], col = data.kmeans[[3]]$cluster)

## Mean squared distance of each point to its nearest center, summed over clusters.
intraclassVariance <- function(C, data) {
  nCenters <- dim(C)[1]
  nRows <- dim(data)[1]
  d <- rep(NA, nCenters)
  SS <- rep(0, nCenters)
  N <- rep(0.0001, nCenters)   # small constant avoids division by zero for empty clusters
  for (t in 1:nRows) {
    x <- data[t, ]
    for (j in 1:nCenters) {
      d[j] <- norm(as.matrix(x - C[j, ]), type = "F")
    }
    c <- which.min(d)          # assign the point to its nearest center
    SS[c] <- SS[c] + min(d)^2
    N[c] <- N[c] + 1
  }
  variance <- sum(SS / N)
  return(variance)
}

## Crude likelihood of the data under a uniform mixture over the centers.
## Note: exp() of a large negative sum of distances can underflow to 0 for many rows.
mixtureLikelihood <- function(C, data) {
  nCenters <- dim(C)[1]
  nRows <- dim(data)[1]
  L <- 0
  for (j in 1:nCenters) {
    ## probability of the data for each center
    logLc <- 0
    for (t in 1:nRows) {
      x <- data[t, ]
      d <- norm(as.matrix(x - C[j, ]), type = "F")
      logLc <- logLc - d
    }
    L <- L + exp(logLc) / nCenters
  }
  return(L)
}

## use a hold-out set
## perturbedData <- iris[(nTrain+1):150, ]
## generate some random noise to make a new dataset instead
nRows <- 150
nColumns <- 4
perturbedData <- iris
perturbedData[, -5] <- perturbedData[, -5] + matrix(0.1 * rnorm(nRows * nColumns), nRows, nColumns)

## Training vs hold-out intra-class variance as a function of k.
trainingVariance <- rep(NA, kMax)
holdoutVariance <- rep(NA, kMax)
for (k in 1:kMax) {
  trainingVariance[k] <- sum(data.kmeans[[k]]$withinss / data.kmeans[[k]]$size)
  C <- data.kmeans[[k]]$centers
  holdoutVariance[k] <- intraclassVariance(C, perturbedData[, -5])
}

## Note that the out-of-sample SS *also* decreases a lot, since our centers now cover more of the space.
oldPar <- par(mfrow = c(2, 2))
plot(withinss, type = "b")
## plot(oowithinss)   # `oowithinss` is never computed in this script; see the sketch at the end
plot(trainingVariance, type = "b")
plot(holdoutVariance, type = "b")
par(oldPar)

## Alternative criterion (kept commented out): score each k by the mixture likelihood.
## The centers must be taken from data.kmeans[[k]] inside the loop.
# trainingLL <- rep(NA, kMax)
# holdoutLL <- rep(NA, kMax)
# for (k in 1:kMax) {
#   C <- data.kmeans[[k]]$centers
#   trainingLL[k] <- mixtureLikelihood(C, data[, -5])
#   holdoutLL[k] <- mixtureLikelihood(C, perturbedData[, -5])
# }
# plot(trainingLL, main = "training")
# plot(holdoutLL, main = "holdout")
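
## The plotting block above refers to `oowithinss`, which this script never
## defines. The sketch below is one guess at what was intended: the total
## out-of-sample within-cluster sum of squares of the perturbed data under
## each set of training centers. The name `oowithinss` comes from the original
## plot call; the nearest-center assignment rule here is an assumption.
oowithinss <- rep(NA, kMax)
for (k in 1:kMax) {
  C <- data.kmeans[[k]]$centers
  X <- as.matrix(perturbedData[, -5])
  ss <- 0
  for (t in 1:nrow(X)) {
    ## squared Euclidean distance from point t to every training center
    d2 <- rowSums((C - matrix(X[t, ], nrow(C), ncol(C), byrow = TRUE))^2)
    ss <- ss + min(d2)   # accumulate the distance to the nearest center
  }
  oowithinss[k] <- ss
}
plot(oowithinss, type = "b", main = "hold-out within-SS")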