CART

Pick a location in Arizona and fit a CART model to:

  1. Its seasonal precipitation, using the first 3 PCs of the winter SSTs over the Pacific as predictors. Compare the CART estimates with the observed values. Perform a drop-10% cross validation and boxplot the correlation and RMSE.

  2. Fit a random forest model and compare its performance with (1) above.

########### HW2, Problem 3
#### CLEAR MEMORY
rm(list=ls())
#### Prevent Warnings
options(warn=-1)

#### SOURCE LIBRARIES (suppress package loading messages) AND SET WORKING DIRECTORY
mainDir="C:/Users/DELL/Dropbox/Boulder/Courses/Advance data analysis/HW/HW2"
setwd(mainDir)
suppressPackageStartupMessages(source("Lib_HW2.R"))

Lat - Long grid creation

### Lat - Long grid..
# information obtained from: http://iridl.ldeo.columbia.edu/SOURCES/.KAPLAN/.EXTENDED/.v2/ 

ygrid=seq(-87.5,87.5,by=5)
ny=length(ygrid) # number of latitudinal locations

xgrid=seq(27.5,382.5,by=5)
nx=length(xgrid) # number of longitudinal locations
nglobe = nx*ny
# creation of spatial grid
xygrid=matrix(0,nrow=nx*ny,ncol=2)
i=0
for(iy in 1:ny){
  for(ix in 1:nx){
    i=i+1
    xygrid[i,1]=ygrid[iy]
    xygrid[i,2]=xgrid[ix]
  }
}
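
The same lat-long grid can be built without the explicit double loop; a small equivalent sketch using rep(), which reproduces the same row ordering (latitude in column 1, longitude in column 2):

# equivalent vectorized construction of the lat-long grid
# latitude varies slowest, matching the nested loop above
xygrid = cbind(rep(ygrid, each = nx),    # column 1: latitude
               rep(xgrid, times = ny))   # column 2: longitude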

Read the seasonal precipitation data and the winter SSTs over the Pacific

nyrs = 118    #Nov-Mar 1901 - Nov-Mar 2018   
setwd("./data") #move to the data directory
data=readBin("Kaplan-SST-NDJFM1900-NDJFM2018.r4",what="numeric", n=( nx * ny * nyrs), size=4,endian="swap")

data <- array(data = data, dim=c( nx, ny, nyrs) )
data1=data[,,1]

# the lat -long data grid..

index=1:(nx*ny)

index1=index[data1 < 20 & !is.nan(data1)]   # only non-missing data.
xygrid1=xygrid[index1,]

nsites=length(index1)
data2=data1[index1]

### SST data matrix - rows are years (one seasonal value per year)
## and columns are locations
sstannavg=matrix(NA,nrow=nyrs, ncol=nsites)

for(i in 1:nyrs){
  data1=data[,,i]
  index1=index[data1 < 20 & !is.nan(data1)]
  data2=data1[index1]
  sstannavg[i,]=data2
}  

indexgrid = index1
rm("data")  #remove the object data to clear up space
## Index of locations corresponding to Pacific 
sstdata=sstannavg
xlongs = xygrid1[,2]
ylats = xygrid1[,1]
indexpac = indexgrid[ ylats >= -20 & xlongs >= 105 & xlongs <= 290]


xlong = xlongs[ylats >= -20 &  xlongs >= 105 & xlongs <= 290]
ylat = ylats[ylats >= -20 &  xlongs >= 105 & xlongs <= 290]

index1=1:length(xlongs)
index = index1[ ylats >= -20 & xlongs >= 105 & xlongs <= 290]


sstannavg = sstdata[,index]
indexgrid = indexpac

N = 113 #Nov-Mar 1901 - Nov-Mar 2013 
sstannavg=sstannavg[1:N,]
wuplocs = matrix(scan("WesternUS-coords.txt"), ncol=5,byrow=T)
nlocs = dim(wuplocs)[1]
xlats = wuplocs[,3]
xlongs = -wuplocs[,4]

nlocs1=nlocs+1
winterprecip = matrix(scan("western-US-winterprecip-1901-2014.txt"),ncol=nlocs1,byrow=T)
years = winterprecip[,1]
winterprecip = winterprecip[,2:nlocs1]      #first column is year

Perform a PCA on the winter Pacific SST anomalies

#get variance matrix..
## scale the data
sstscale = scale(sstannavg)
zs=var(sstscale)

#do an Eigen decomposition..
zsvd=svd(zs)

#Principal Components...
pcs=sstscale %*%zsvd$u
#Eigen Values.. - fraction variance 
lambdas=(zsvd$d/sum(zsvd$d))
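
As a quick added check (not part of the original script), the fraction of variance captured by the leading modes can be inspected to justify using only the first three PCs as predictors:

# added check: variance fractions of the leading PCs and a simple scree plot
round(lambdas[1:5], 3)
plot(lambdas[1:10], type = "b", xlab = "PC", ylab = "Fraction of variance")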

Pick a location in Arizona

loc=which(xlats<35 & xlats>34.5) ## chosen according to the spatial maps of problem 3
prec=winterprecip[,loc]
sprintf("The Arizona's location chosen is: E%s and N%s",xlongs[loc], xlats[loc])
## [1] "The Arizona's location chosen is: E-112.5 and N34.6"

Fit a CART model to the data and visualize

#create the names of PCs
name=""
for (i in 1:dim(pcs)[2]) {
  name[i]=paste("PC",i,sep = "")
}
pcs=as.data.frame(pcs)
names(pcs)=name
pcs1=cbind(prec,pcs)
pcs1 = data.frame(pcs1)
tree.precip = tree(prec ~PC1+PC2+PC3, data = pcs1, model = T)
summary(tree.precip)
## 
## Regression tree:
## tree(formula = prec ~ PC1 + PC2 + PC3, data = pcs1, model = T)
## Number of terminal nodes:  13 
## Residual mean deviance:  0.355 = 35.5 / 100 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.54900 -0.34330 -0.04525  0.00000  0.32170  1.75700
### treeFun (sourced from Lib_HW2.R) prunes the fitted tree and optionally plots it
treeFit = treeFun(tree.precip, toPlot=T, title="Winter Precipitation Tree")

summary(treeFit)
## 
## Regression tree:
## snip.tree(tree = myTree, nodes = c(29L, 8L))
## Number of terminal nodes:  8 
## Residual mean deviance:  0.3843 = 40.35 / 105 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.54900 -0.39820 -0.04525  0.00000  0.35180  1.75700
#Use tree to predict original data
treePred <- predict(treeFit)
#treeResid <- resid(treeFit)
myRange <- range(treePred, prec)
skill_tree = c(sqrt(mean((prec-treePred)^2)),cor(prec, treePred))
names(skill_tree)=c("rmse","Correlation")
#Plot observed vs predicted
P1=ggplot() +
 geom_point(mapping = aes(prec, treePred), shape = 21, size = 3, color = "gray30", fill ="cadetblue1")+
  geom_abline(intercept = 0, slope = 1) +
  coord_cartesian(xlim = myRange, ylim = myRange) + 
 labs(title = "CART",x = "Oberved Precipitation",y = "Estimated Prediction")+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
show(P1)  
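
treeFun is a helper sourced from Lib_HW2.R and is not reproduced here; the summary above shows that it returns a pruned (snipped) tree. A minimal sketch of the pruning idea, assuming a cv.tree/prune.tree approach (the function pruneTreeSketch and its details are illustrative, not the author's actual code):

# illustrative sketch of cost-complexity pruning with the tree package
pruneTreeSketch = function(myTree) {
  cvres = cv.tree(myTree)                       # cross-validated deviance for each tree size
  bestsize = cvres$size[which.min(cvres$dev)]   # tree size with the lowest deviance
  prune.tree(myTree, best = bestsize)           # return the pruned tree
}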

Drop-10% cross validation with boxplots of correlation and RMSE
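
Drop_10_pred also comes from Lib_HW2.R. A minimal sketch of a drop-10% cross-validation loop, assuming it repeatedly holds out a random 10% of the years, refits the tree on the remaining 90%, and stores the RMSE (column 1) and correlation (column 2) of the held-out predictions; the function and argument names below are illustrative:

# illustrative drop-10% cross-validation loop (not the actual Drop_10_pred)
# requires library(tree), loaded via Lib_HW2.R
drop10_sketch = function(dat, nsim = 100) {
  n = nrow(dat)
  skill = matrix(NA, nrow = nsim, ncol = 2)     # column 1: RMSE, column 2: correlation
  for (k in 1:nsim) {
    drop = sample(1:n, size = round(0.1 * n))   # hold out ~10% of the years
    fit = tree(prec ~ PC1 + PC2 + PC3, data = dat[-drop, ])
    pred = predict(fit, newdata = dat[drop, ])
    skill[k, 1] = sqrt(mean((dat$prec[drop] - pred)^2))
    skill[k, 2] = cor(dat$prec[drop], pred)
  }
  skill
}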

Skill = Drop_10_pred(pcs1, treeFit$besTree, "tree")
f1 = ggplot() +
  geom_boxplot(mapping = aes(x = "Correlation", y = Skill[,2])) +
  geom_point(mapping = aes(x = "Correlation", y = skill_tree[2]),
             color = "red", size = 3) +
  labs(title = "CART", x = "", y = "") +
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
myrange1 = range(Skill[,1])
f2 = ggplot() +
  geom_boxplot(mapping = aes(x = "RMSE", y = Skill[,1])) +
  geom_point(mapping = aes(x = "RMSE", y = skill_tree[1]),
             color = "red", size = 3) +
  labs(title = "", x = "", y = "") +
  coord_cartesian(ylim = myrange1)   # common y range for the RMSE boxplots
multiplot(f1, f2, cols = 2)
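
multiplot() is not defined in this script (presumably the familiar R Cookbook layout helper in Lib_HW2.R); if it is unavailable, gridExtra gives an equivalent side-by-side layout, assuming that package is installed:

# alternative layout without the multiplot helper
library(gridExtra)
grid.arrange(f1, f2, ncol = 2)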

(2) Fit a random forest model to the data and visualize

forest.precip = randomForest(prec ~ PC1+PC2+PC3, data = pcs1)
summary(forest.precip)
##                 Length Class  Mode     
## call              3    -none- call     
## type              1    -none- character
## predicted       113    -none- numeric  
## mse             500    -none- numeric  
## rsq             500    -none- numeric  
## oob.times       113    -none- numeric  
## importance        3    -none- numeric  
## importanceSD      0    -none- NULL     
## localImportance   0    -none- NULL     
## proximity         0    -none- NULL     
## ntree             1    -none- numeric  
## mtry              1    -none- numeric  
## forest           11    -none- list     
## coefs             0    -none- NULL     
## y               113    -none- numeric  
## test              0    -none- NULL     
## inbag             0    -none- NULL     
## terms             3    terms  call
plot(forest.precip)

rforestpred = predict(forest.precip)
myRange = range(rforestpred, prec)

skill_tree_forest = c(sqrt(mean((prec-rforestpred)^2)),cor(prec, rforestpred))
names(skill_tree_forest)=c("rmse","Correlation")
#Plot observed vs predicted
P2=ggplot() +
 geom_point(mapping = aes(prec, rforestpred), shape = 21, size = 3, color = "gray30", fill ="cadetblue1")+
  geom_abline(intercept = 0, slope = 1) +
  coord_cartesian(xlim = myRange, ylim = myRange) + 
 labs(title = "Random Forest",x = "Oberved Precipitation",y = "Estimated Prediction")+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
show(P2)
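
The relative importance of the three predictors can be read directly from the fitted forest; a short added check using the randomForest package's built-in importance measures:

# added check: variable importance from the fitted random forest
importance(forest.precip)    # increase in node purity attributable to each PC
varImpPlot(forest.precip)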

Drop-10% cross validation with boxplots of correlation and RMSE

Skill_forest = Drop_10_pred(pcs1, 1, "Random Forest")
f3 = ggplot() +
  geom_boxplot(mapping = aes(x = "Correlation", y = Skill_forest[,2])) +
  geom_point(mapping = aes(x = "Correlation", y = skill_tree_forest[2]),
             color = "red", size = 3) +
  labs(title = "Random Forest", x = "", y = "") +
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))

f4 = ggplot() +
  geom_boxplot(mapping = aes(x = "RMSE", y = Skill_forest[,1])) +
  geom_point(mapping = aes(x = "RMSE", y = skill_tree_forest[1]),
             color = "red", size = 3) +
  labs(title = "", x = "", y = "") +
  coord_cartesian(ylim = myrange1)   # same y range as the CART RMSE boxplot
multiplot(f1, f2, f3, f4, cols = 2)

 multiplot(P1,P2, cols = 2) 

Final Comments

  • The location picked in Arizona is at 112.5°W (-112.5°E) and 34.6°N.

  • According to CART, the most important predictors are PC1, PC2, and PC3, in that order of importance. Pruning reduces the winter precipitation tree from 13 terminal nodes to 8.

  • CART and random forest give predictions between roughly 0.9 and 2.5, even though some observed values fall outside this range. This was expected for CART, since the method predicts the same value (the mean of the corresponding terminal node) for many different observations, which limits its prediction accuracy.

  • Although CART predicts over a wider range (0.9-2.5) than the random forest (0.9-2.2), the random forest predictions are more continuously distributed, whereas CART is restricted to 8 possible values (one per terminal node).

  • The drop-10% cross validation shows a higher correlation and a lower RMSE for the random forest than for CART, although the fitted-model values (red points) indicate better in-sample performance for CART than for the random forest.