# Example script for the MTS calculations used in the paper:
# Dimitris Liparas, Lefteris Angelis, Robert Feldt, "Applying the
# Mahalanobis-Taguchi Strategy for Software Defect Diagnosis", 2011
#
# Script version: 1.0, June 8th 2011
#
# Copyright (c) 2011 Dimitris Liparas, dliparas@csd.auth.gr
#
# The latest version of this script can be found here:
# http://www.cse.chalmers.se/~feldt/publications/liparas_2011_mts_sw_defects/mts.r
#
# Before the use of the MTS script, the following package must be installed
# and loaded in the R environment: DoE.base (for the use of orthogonal designs
# in STEP 3)

# STEP 1: CONSTRUCTION OF THE MEASUREMENT SCALE AND
# STEP 2: VALIDATION OF THE SCALE

# Specify the name of the file that contains the data of the healthy group's
# variables (semicolon-separated, no header row).
b <- read.csv(file = "filename_healthy.csv", header = FALSE, sep = ";")

# Specify the name of the file that contains the data of the abnormal group's
# variables (same layout as the healthy file).
c <- read.csv(file = "filename_abnormal.csv", header = FALSE, sep = ";")

# Reference statistics of the measurement scale; all of them are taken from
# the healthy group only.
h <- ncol(b)      # number of variables
e <- colMeans(b)  # mean value of every healthy-group variable
d <- cov(b)       # covariance matrix of the healthy-group variables

# Mahalanobis distance values of the healthy cases. `inverted = FALSE` tells
# mahalanobis() that `d` is the covariance matrix itself, not its inverse.
f <- mahalanobis(b, e, d, inverted = FALSE)

# Mahalanobis distance values of the abnormal cases. The mean values and the
# covariance matrix of the healthy group are used here as well, so that the
# abnormal cases are measured on the scale built from the healthy cases.
g <- mahalanobis(c, e, d, inverted = FALSE)

# Divide the Mahalanobis distance values (both for the healthy and for the
# abnormal cases) by the number of variables.
f <- f / h
g <- g / h

# STEP 3: IDENTIFICATION OF THE SUBSET OF USEFUL VARIABLES USING ORTHOGONAL
# ARRAYS AND SIGNAL-TO-NOISE RATIOS

# The command oa.design() is included in package DoE.base.
# oa.design() creates an orthogonal design with the number of runs, factors
# and levels that you specify. Each row of the orthogonal design represents an
# experimental run and contains the levels of various factors in order to
# study the effects of the factors on a prespecified response variable. Each
# column of the orthogonal design represents a factor of the experiment.
# Consider the inclusion or exclusion of each variable as a factor with two
# levels (level 1 = variable included, level 2 = variable excluded). For each
# one of these runs, the MD values are calculated for the abnormal cases, but
# using only the specified variables.
#
# Inputs expected from STEP 1/2: `b` (healthy data), `c` (abnormal data) and
# `g` (scaled MD values of the abnormal cases using all variables).
# Outputs: `a` (the design), `k`/`l` (its number of runs/factors) and `m`
# (one row of scaled abnormal MD values per run).

# One two-level factor per variable. `nruns = NULL` lets oa.design() choose
# the array size; set it explicitly if a specific array is required.
# `randomize = FALSE` keeps the array in standard order so that overwriting
# the first run below does not replace an arbitrary, randomly placed run.
# NOTE(review): this presumes the standard-order first run is already all
# level 1, which makes the overwrite a no-op -- confirm for the chosen array.
a <- DoE.base::oa.design(
  nruns = NULL,
  nfactors = ncol(b),
  nlevels = 2,
  randomize = FALSE
)

# The first run must use every variable, because its response is `g`.
for (j in seq_len(ncol(a))) {
  a[1, j] <- 1
}

k <- nrow(a)
l <- ncol(a)

# Design columns are matched to data columns by position, so the counts must
# agree for the column selection below to be meaningful.
stopifnot(l >= 1, l == ncol(b), ncol(b) == ncol(c))

healthy <- as.matrix(b)
abnormal <- as.matrix(c)

# include[i, j] is TRUE when run i uses variable j. Comparing as character
# works both for factor-coded and for numeric-coded designs.
include <- do.call(
  cbind,
  lapply(seq_len(l), function(j) as.character(a[[j]]) == "1")
)
include <- matrix(include, nrow = k, ncol = l)

# Scaled MD values of the abnormal cases: one row per run.
m <- matrix(NA_real_, nrow = k, ncol = nrow(c))
m[1, ] <- g

# seq_len(k)[-1] is empty for a one-run design, where 2:k would count down.
for (i in seq_len(k)[-1]) {
  use <- include[i, ]

  # Logical indexing with drop = FALSE keeps a proper matrix even when the
  # run selects all variables (where `-which(...)` would select nothing) or
  # only a single variable (where the result would collapse to a vector).
  healthy_sub <- healthy[, use, drop = FALSE]
  abnormal_sub <- abnormal[, use, drop = FALSE]

  md <- mahalanobis(
    abnormal_sub,
    colMeans(healthy_sub),
    cov(healthy_sub),
    inverted = FALSE
  )
  m[i, ] <- md / ncol(healthy_sub)
}
# SIGNAL TO NOISE RATIOS
# (When the MD values of the abnormal cases are calculated for each run of
# the orthogonal design, use these MD values to calculate the value of a S/N
# ratio, which is the response variable for each different run. The set of
# most useful variables is determined by computing and evaluating the gain in
# the values of the S/N ratios.)
#
# Inputs expected from the previous step: `a` (orthogonal design with levels
# 1 = include, 2 = exclude) and `m` (scaled abnormal MD values, one row per
# run).

# Larger-the-better S/N ratio of every run: -10 * log10(mean(1 / MD)).
# log10 gives the ratio in decibels; using the natural logarithm instead only
# rescales every value by a positive constant, so the sign of each gain (and
# therefore the selected variables) is the same either way.
t <- -10 * log10(rowMeans(1 / m))

k <- nrow(a)
l <- ncol(a)

# w[1, i]: mean S/N ratio over the runs that include variable i (level 1).
# w[2, i]: mean S/N ratio over the runs that exclude variable i (level 2).
# The runs are picked by logical indexing, so a factor whose runs all share
# one level no longer yields an empty selection through `-which(integer(0))`.
# Missing S/N values are ignored, as they were by the NA-row filtering.
w <- matrix(NA_real_, nrow = 2, ncol = l)
for (i in seq_len(l)) {
  level <- as.character(a[[i]])
  w[1, i] <- mean(t[level == "1"], na.rm = TRUE)
  w[2, i] <- mean(t[level == "2"], na.rm = TRUE)
}

# Gain of every variable: S/N when included minus S/N when excluded.
n <- matrix(w[1, ] - w[2, ], nrow = 1, ncol = l)

# STEP 4: RECONSTRUCTION OF THE SCALE
# Reconstruct the measurement scale using only the "optimal" subset of
# variables found in STEP 3. Use the reconstructed scale to compute the MD
# values for any unknown cases, in order to take any corrective actions, if
# necessary.

# Specify the name of the file that contains the data of the healthy group's
# variables. ("filename.csv" is a placeholder; it should point to the healthy
# data used in STEP 1.)
b_optimal <- read.csv(file = "filename.csv", header = FALSE, sep = ";")

# Specify the name of the file that contains the data of the abnormal group's
# variables. ("filename.csv" is a placeholder; it should point to the abnormal
# data used in STEP 1.)
c_optimal <- read.csv(file = "filename.csv", header = FALSE, sep = ";")

# Variables with a negative gain are removed. which() skips undefined gains,
# so they neither abort the script nor cause a removal. Selecting the columns
# to keep (instead of deleting by `-which(...)`) also preserves every column
# when no gain is negative.
negative_gain <- which(n[1, ] < 0)
keep <- setdiff(seq_len(ncol(b_optimal)), negative_gain)
if (length(keep) == 0) {
  stop(
    "Every variable has a negative S/N gain; the scale cannot be rebuilt.",
    call. = FALSE
  )
}

# drop = FALSE keeps a data frame even if a single variable remains.
b_optimal <- b_optimal[, keep, drop = FALSE]
c_optimal <- c_optimal[, keep, drop = FALSE]

# Rebuild the scale from the healthy group, restricted to the kept variables.
d2 <- cov(b_optimal)
e2 <- colMeans(b_optimal)
h2 <- ncol(b_optimal)

# Scaled MD values of the healthy and abnormal cases on the rebuilt scale.
f_final <- mahalanobis(b_optimal, e2, d2, inverted = FALSE) / h2
g_final <- mahalanobis(c_optimal, e2, d2, inverted = FALSE) / h2