## A simple classification tree with rpart

## Alternative datasets: commented-out calls that read the data from remote tables instead of local files.
##X <- read.table("http://www.cse.chalmers.se/~chrdimi/downloads/fouille/geneTraining.txt")
##Y <- read.table("http://www.cse.chalmers.se/~chrdimi/downloads/fouille/geneTesting.txt")

##X <- read.table("http://www.cse.chalmers.se/~chrdimi/downloads/fouille/cancer.txt")
##Y <- read.table("http://www.cse.chalmers.se/~chrdimi/downloads/fouille/cancerHoldout.txt")

## Read the local datasets: X = training set, Y = holdout set used as the
## "test" set in the sweep below, Z = an additional test set.
## NOTE(review): Z is read here but never used anywhere in this script --
## confirm whether it is needed or can be dropped.
X <- read.table("cancer.txt")
Y <- read.table("cancerHoldout.txt")
Z <- read.table("cancerTest.txt")


## load the tree library (recursive partitioning / classification trees)
library("rpart")

## Mean predicted probability assigned to the true class.
## `probs` is an n x 2 matrix of class probabilities (one row per
## observation, columns ordered by class level, as returned by
## predict.rpart with type = "prob"); `labels` is assumed to be coded
## 0/1 so that column `1 + label` is the probability of the observed
## class -- TODO confirm the coding of `cancer` in the data files.
prob_accuracy <- function(probs, labels) {
  mean(probs[cbind(seq_along(labels), 1 + labels)])
}

n.settings <- 20
acc.train <- rep(1, n.settings)
acc.test <- rep(1, n.settings)

## ID3-like settings: use information gain to choose splits.
## This is loop-invariant, so build it once outside the sweep.
fit.params <- list(split = "information")

for (split in seq_len(n.settings)) {

  ## Larger minsplit means a node needs more observations before it is
  ## split, yielding simpler trees as `split` grows.
  fit.ctrl <- rpart.control(minsplit = split)

  ## Fit a classification tree.
  ## NOTE(review): SSN looks like a unique identifier; using it as a
  ## predictor invites memorization/overfitting -- confirm it is included
  ## deliberately (e.g. as a teaching example).
  fit.tree <- rpart(cancer ~ smoker + age + sex + SSN, data = X,
                    method = "class", parms = fit.params, control = fit.ctrl)

  ## predict() with type = "prob" returns an n x 2 matrix of class
  ## probabilities. (The original code passed `tree = "class"`, which is
  ## not an argument of predict.rpart and was silently absorbed by `...`;
  ## the probability matrix was only returned because it is the default
  ## `type` for a class-method tree. `type = "prob"` makes that explicit.)
  fit.train <- predict(fit.tree, X, type = "prob")
  acc.train[split] <- prob_accuracy(fit.train, X$cancer)
  print(acc.train[split])

  fit.test <- predict(fit.tree, Y, type = "prob")
  acc.test[split] <- prob_accuracy(fit.test, Y$cancer)
}


## Report the accuracy curves over the 20 minsplit settings.
## Note: these are mean predicted probabilities of the true class
## (soft accuracy), not hard 0/1 classification accuracy.
print("Classification accuracy in training:")
print(acc.train)
print("Classification accuracy in testing:")
print(acc.test)