setwd("C:/Users/kere9654/Desktop/Share with Balaji Class") #**************************************************************************************** # Load packages #**************************************************************************************** #For Trees library(mclust) install.packages("mclust") install.packages("tree") install.packages("randomForest") install.packages("maps") install.packages("akima") install.packages("fields") #Function to prune tree source("PruneTree.r") library(mclust) library(tree) library(randomForest) #For PCA library(maps) library(akima) library(fields) #Function to prune tree source("PruneTree.r") #Load this only when plotting a randomforest tree because #it masks necssary commands: #source("RandomForestPlot.R") #Load Data - monthly. 2008 to 2015, generally months 4-11 data=read.csv("DATA_FC_PNF_lags_monthly.csv") #Remove unnessary water quality parameters, the dependent variable (TOC), and dates drop<-c("Month", "Year","TOC","Turb","WaterTemp","Alk","Hardness", "NH3","pH") Xvar=data[,!(names(data)%in% drop)] Y=data$TOC data=as.data.frame(cbind(Y,Xvar)) myTree <- tree(Y ~ ., data = data, model = T) #Perform CV on tree object cvTree <- cv.tree(myTree) optTree <- which.min(cvTree$dev) bestTree <- cvTree$size[optTree] #Prune tree based on CV results pruneTree <- prune.tree(myTree, best = bestTree) #Use tree to predict original data treePred <- predict(pruneTree) treeResid <- resid(pruneTree) myRange <- range(treePred, data$Y) #Plot Unpruned Tree plot(myTree) text(myTree, cex = .75) title(main = c("Unpruned Tree for TOC")) #Plot CV plot(cvTree$size, cvTree$dev, type = "b", main = c("Cross Validation for TOC")) #Plot Prunned Tree plot(pruneTree) text(pruneTree, cex = .75) title(main =c("Pruned Tree for TOC")) #Plot Observed vs Predicted plot(data$Y, treePred, xlim = myRange, ylim = myRange, xlab="Observed Turbidity (NTU)", ylab="Modeled Turbidity (NTU)", main = c("Pred vs True for ToC")) lines(data$Y, data$Y, col = "black") #Fit tree tree.T = tree(Y ~ ., data =data) summary(tree.T) #Use treeFun to prune the tree, predict from the tree, and get plots treeFit = treeFun(data, toPlot=T, title="TOC (mg/L)") summary(treeFit) #How good are predictions? -had to place treeFun code in here to do #R2 R2 <- (cor(data$Y,treePred))^2 R2 #RMSE RMSE <- sqrt(mean((data$Y -treePred)^2)) RMSE #**************************************************************************************** # Random Forest #**************************************************************************************** #Fit random forest and plot forest.T = randomForest(Y ~ ., data = data) summary(forest.T) print(forest.T) forest.T.Pred = predict(forest.T) forest.T.res = data$Y - forest.T.Pred # no residual command so use: res = y - yhat forest.myRange <- range(forest.T.Pred, data$Y) #Plot error VS trees plot(forest.T, main="Random Forest for TOC (mg/L)") #Plot observed VS predicted plot(data$Y, forest.T.Pred, xlim = forest.myRange, ylim = forest.myRange, xlab="Observed TOC Concentration (mg/L)", ylab="Modeled TOC Concentration (mg/L)", main = ("Pred vs True for Random Forest of TOC (mg/L)")) lines(data$Y, data$Y, col = "black") #How good are predictions? # R2 R2 <- (cor(data$Y,forest.T.Pred))^2 R2 ##RMSE RMSE <- sqrt(mean((data$Y -forest.T.Pred)^2)) RMSE #Importance of the different variables #Dotchart of variable importance as measured by a Random Forest #Shows total decrease in node impurities from splitting on the variable, averaged over #all trees. For classification, the node impurity is measured by the Gini index. #For regression, it is measured by residual sum of squares. Variables are ordered #by importance were the top most variable is the most important. #You can use this chart to determine which variables to include in a PCA by looking #for a large break between the variables, which will help you to decide how many important #vairables to choose. varImpPlot(forest.T, pch = 20, main = "Importance of Variables") #Can also see importance of the variables as table importance(forest.T) #Tree visuallization - look at tree number - load source above #Can plot a given tree number in the random forest: tree_num = 350 x=tree_func(forest.T,tree_num) #Can use getTree to to also visualize tree number 350, since the Error VS #trees plot had stabilized by then. getTree(forest.T, k=tree_num, labelVar=TRUE)