## -*- Mode: R -*-
## A small KNN example, comparing it with a kernel classifier.
##
## The syntax of k-nearest neighbour classification is
##
##   knn(train, test, cl, k = 1, l = 0, prob = FALSE, use.all = TRUE)
##
##   train: the training set (without labels)
##   test:  the test set
##   cl:    the class labels
##   k:     the number of neighbours
##
## We define our own kernel classifier with arguments
##
##   KernelClassifier(train, test, cl, alpha)
##
##   train: the training set (without labels)
##   test:  the test set
##   cl:    the class labels
##   alpha: the kernel scaling constant
##
## We use the compiler package to speed up KernelClassifier.
require("compiler")
enableJIT(3)

## First, we select the data to optimise on.
training.filename <- "clusterTraining2.txt" # change 2 to 10 or 100 for more features
training.Data <- read.table(training.filename)

## Then, we select the data to test on.
testing.filename <- "clusterTesting2.txt" # change 2 to 10 or 100 for more features
testing.Data <- read.table(testing.filename)

## From the training data, we select the amount of data to use for estimating parameters.
nTraining <- 100 # must be <= 4000
X <- training.Data[1:nTraining, ]

## Let's use the last 1000 rows of the data as a holdout set to tune the hyper-parameters.
Y <- training.Data[4001:5000, ]

## Handling the labels
label <- "label" # the name of the label
## Magic that gives us the correct column
labelColumn <- names(X) %in% c(label) # select the corresponding column

## A function to calculate the classification error.
ClassificationError <- function(labels, prediction) {
    return (mean(labels != prediction))
}

################### Code for Kernel Classifier ########################

## Distance function to use; methods other than Euclidean are possible.
EuclideanDistance <- function(x, y) {
    return (dist(rbind(x, y), method = "euclidean"))
}

## Classify a single point x. This function assumes that the labels
## are 0, 1, ..., nClasses - 1.
ClassifyKernel <- function(Data, x, labels, alpha, nClasses) {
    ## get the number of rows and columns
    nData <- dim(Data)[1]
    nFeatures <- dim(Data)[2]
    ## calculate squared Euclidean distance and exponentiate [faster this way]:
    ## delta <- exp(-alpha * rowSums(Data - rep(x, nFeatures, 1)))^2
    ## set up the array containing the weights
    w <- rep(0, nClasses)
    ## add a weight to each label
    for (t in 1:nData) {
        delta <- Data[t, ] - x
        w[1 + labels[t]] <- w[1 + labels[t]] + exp(-alpha * sum(delta * delta))
    }
    ## return the label with the maximum weight
    return (which.max(w) - 1)
}
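## The commented-out line inside ClassifyKernel hints at a vectorised
## version. Below is a minimal sketch of that idea (an illustrative
## addition, not part of the original script): compute all squared
## distances to x at once, then sum the kernel weights per class.
ClassifyKernelVectorised <- function(Data, x, labels, alpha, nClasses) {
    ## subtract x from every row of the data matrix
    delta <- sweep(as.matrix(Data), 2, unlist(x))
    ## kernel weight of each training point: exp(-alpha * squared distance)
    kw <- exp(-alpha * rowSums(delta * delta))
    ## total weight per class; labels are assumed to be 0, ..., nClasses - 1
    w <- tapply(kw, factor(labels, levels = 0:(nClasses - 1)), sum, default = 0)
    ## return the label with the maximum weight
    return (which.max(w) - 1)
}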
KernelClassifier <- function(training.data, testing.data, training.labels, alpha) {
    ## get the number of testing rows
    nData <- dim(testing.data)[1]
    prediction <- rep(0, nData)
    ## get the number of distinct labels
    nClasses <- length(unique(training.labels))
    for (t in 1:nData) {
        prediction[t] <- ClassifyKernel(training.data, testing.data[t, ],
                                        training.labels, alpha, nClasses)
    }
    return (prediction)
}

##################### END KernelClassifier #####################################

## Convenient for splitting the data into features and labels
train.features <- X[, !labelColumn]
holdout.features <- Y[, !labelColumn]
train.labels <- X[, labelColumn]
holdout.labels <- Y[, labelColumn]

### Run a loop that plots the train and holdout error for KNN.
if (TRUE) {
    print("Running KNN")
    maxNeighbours <- 100
    knn.train.error <- rep(-1, maxNeighbours)
    knn.holdout.error <- rep(-1, maxNeighbours)
    for (num.neighbours in 1:maxNeighbours) {
        train.prediction <- class::knn(train.features, train.features,
                                       train.labels, k = num.neighbours)
        holdout.prediction <- class::knn(train.features, holdout.features,
                                         train.labels, k = num.neighbours)
        knn.train.error[num.neighbours] <- ClassificationError(train.labels, train.prediction)
        knn.holdout.error[num.neighbours] <- ClassificationError(holdout.labels, holdout.prediction)
    }
    ## Write the graph to a file.
    output.filename <- paste0("KNN", training.filename, "nTrain", nTraining,
                              "maxK", maxNeighbours, ".pdf")
    pdf(output.filename, width = 7, height = 5)
    ## plot the data
    plot(knn.train.error, type = "l", col = "blue", lty = 1, lwd = 2, ylim = c(0, 0.5))
    lines(knn.holdout.error, type = "l", col = "red", lty = 2, lwd = 2)
    ## information about the graph
    legend("topleft", legend = c("training", "validation"),
           lty = c(1, 2), col = c("blue", "red"), lwd = c(2, 2))
    title(paste("KNN with", nTraining, "training examples on", training.filename))
    ## visualise the minimum-error point
    minerr <- min(knn.holdout.error)
    argminerr <- (1:maxNeighbours)[knn.holdout.error == minerr]
    text(argminerr[1], minerr + 0.02, as.character(minerr))
    points(argminerr[1], minerr, type = "p")
    ## close the PDF device
    dev.off()
}

print("Running kernel")
## Run a loop that plots the train and holdout error for KernelClassifier.
Alphas <- seq(0.1, 10, 0.1)
## Alphas <- c(0.1, 0.5, 1, 2, 10)
kernel.holdout.error <- rep(-1, length(Alphas))
kernel.train.error <- rep(-1, length(Alphas))
for (k in 1:length(Alphas)) {
    print(paste("Testing alpha", Alphas[k], ":", k, "/", length(Alphas)))
    ## predictions on the training set and on the holdout set
    train.prediction <- KernelClassifier(train.features, train.features,
                                         train.labels, Alphas[k])
    holdout.prediction <- KernelClassifier(train.features, holdout.features,
                                           train.labels, Alphas[k])
    kernel.train.error[k] <- ClassificationError(train.labels, train.prediction)
    kernel.holdout.error[k] <- ClassificationError(holdout.labels, holdout.prediction)
    print(kernel.holdout.error[k])
}

## Write the graph to a file.
output.filename <- paste0("Kernel", training.filename, "nTrain", nTraining, ".pdf")
pdf(output.filename, width = 7, height = 5)
## plot the data
plot(kernel.train.error, type = "l", col = "blue", lty = 1, lwd = 2, ylim = c(0, 0.5))
lines(kernel.holdout.error, type = "l", col = "red", lty = 2, lwd = 2)
## information about the graph
legend("topleft", legend = c("training", "validation"),
       lty = c(1, 2), col = c("blue", "red"), lwd = c(2, 2))
title(paste("Kernel with", nTraining, "training examples on", training.filename))
## visualise the minimum-error point; the x-axis is the index into Alphas
minerr <- min(kernel.holdout.error)
argminerr <- (1:length(Alphas))[kernel.holdout.error == minerr]
text(argminerr[1], minerr + 0.02, as.character(minerr))
points(argminerr[1], minerr, type = "p")
## close the PDF device
dev.off()
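## A rough timing check (an illustrative addition, not part of the original
## script): one KernelClassifier call per alpha dominates the cost of the
## sweep above, so timing a single call estimates the whole loop. Disabled
## by default, in the style of the if (TRUE) block used for KNN.
if (FALSE) {
    one.alpha.seconds <- system.time(
        KernelClassifier(train.features, holdout.features, train.labels, 1.0)
    )["elapsed"]
    print(paste("One alpha takes about", round(one.alpha.seconds, 1),
                "seconds; the full sweep costs roughly",
                length(Alphas), "times that."))
}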
#### Plot both holdout errors.
## Write the graph to a file.
output.filename <- paste0("KNNvsKernel", training.filename, "nTrain", nTraining, ".pdf")
pdf(output.filename, width = 7, height = 5)
## plot the data
plot(knn.holdout.error, type = "l", col = "blue", lty = 1, lwd = 2, ylim = c(0.2, 0.5))
lines(kernel.holdout.error, type = "l", col = "red", lty = 2, lwd = 2)
## information about the graph
legend("topleft", legend = c("KNN", "kernel"),
       lty = c(1, 2), col = c("blue", "red"), lwd = c(2, 2))
title(paste("Kernel vs KNN with", nTraining, "training examples on", training.filename))
## visualise the minimum-error point of each classifier
minerr <- min(knn.holdout.error)
argminerr <- (1:maxNeighbours)[knn.holdout.error == minerr]
text(argminerr[1], minerr - 0.02, as.character(minerr))
points(argminerr[1], minerr, type = "p")
minerr <- min(kernel.holdout.error)
argminerr <- (1:length(Alphas))[kernel.holdout.error == minerr]
text(argminerr[1], minerr + 0.02, as.character(minerr))
points(argminerr[1], minerr, type = "p")
## close the PDF device
dev.off()
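#### Final check on the held-out test set.
## The testing data loaded at the top is not used above. A minimal sketch of
## a final evaluation (an illustrative addition, not part of the original
## script), assuming testing.Data has the same columns as the training data.
## The hyper-parameters are simply the holdout arg-minima found above.
test.features <- testing.Data[, !labelColumn]
test.labels <- testing.Data[, labelColumn]
best.k <- which.min(knn.holdout.error)
best.alpha <- Alphas[which.min(kernel.holdout.error)]
knn.test.error <- ClassificationError(
    test.labels,
    class::knn(train.features, test.features, train.labels, k = best.k))
kernel.test.error <- ClassificationError(
    test.labels,
    KernelClassifier(train.features, test.features, train.labels, best.alpha))
print(paste("KNN test error with k =", best.k, ":", knn.test.error))
print(paste("Kernel test error with alpha =", best.alpha, ":", kernel.test.error))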