## A simple classification tree with rpart.
## Fits trees with increasing minsplit and reports, for each setting, the
## mean predicted probability assigned to the true class on the training
## and holdout sets.

## Alternative datasets (kept for reference):
## X <- read.table("http://www.cse.chalmers.se/~chrdimi/downloads/fouille/geneTraining.txt")
## Y <- read.table("http://www.cse.chalmers.se/~chrdimi/downloads/fouille/geneTesting.txt")
## X <- read.table("http://www.cse.chalmers.se/~chrdimi/downloads/fouille/cancer.txt")
## Y <- read.table("http://www.cse.chalmers.se/~chrdimi/downloads/fouille/cancerHoldout.txt")
X <- read.table("cancer.txt")        # training set
Y <- read.table("cancerHoldout.txt") # holdout set, evaluated as "test" below
Z <- read.table("cancerTest.txt")    # NOTE(review): read but never used — confirm intent

## Load the tree library
library("rpart")

n_splits <- 20
acc.train <- rep(1, n_splits)
acc.test <- rep(1, n_splits)

## Mean probability that a fitted rpart classification tree assigns to the
## true class, over the rows of `data`. Assumes `data$cancer` is coded 0/1,
## so column `1 + cancer` of the probability matrix is the true class.
prob_accuracy <- function(tree, data) {
  ## type = "prob" returns an n x k matrix of class probabilities.
  ## (The original code passed the nonexistent argument `tree = "class"`,
  ## which predict() silently ignored via `...`, falling back to this same
  ## default — we now request it explicitly.)
  probs <- predict(tree, data, type = "prob")
  ## Two-column index matrix picks probs[i, true_class(i)] for every row.
  mean(probs[cbind(seq_len(nrow(data)), 1 + data$cancer)])
}

for (split in seq_len(n_splits)) {
  ## Settings for the tree fitting. These are not necessary, but here we use
  ## them to make the algorithm behave like ID3.
  fit.params <- list(split = "information")   # use information gain to split
  fit.ctrl <- rpart.control(minsplit = split) # increase to simplify the tree

  ## Fit a classification tree.
  ## NOTE(review): SSN looks like a unique identifier — using it as a
  ## predictor invites overfitting; confirm it is intentional.
  fit.tree <- rpart(cancer ~ smoker + age + sex + SSN, data = X,
                    method = "class", parms = fit.params, control = fit.ctrl)

  acc.train[split] <- prob_accuracy(fit.tree, X)
  print(acc.train[split])
  acc.test[split] <- prob_accuracy(fit.tree, Y)
}

print("Classification accuracy in training:")
print(acc.train)
print("Classification accuracy in testing:")
print(acc.test)