library(kohonen)
library(raster)
library(rasterVis)
library(maptools)
library(RColorBrewer)
data("wrld_simpl")
library(tree)
library(randomForest)
library(ggplot2)
set.seed(1)
# multiplot helper (adapted from the Cookbook for R): lay out several
# ggplot objects on one page
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
  library(grid)
  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  numPlots = length(plots)
  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                     ncol = cols, nrow = ceiling(numPlots/cols))
  }
  if (numPlots == 1) {
    print(plots[[1]])
  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}
Fit a CART model to the leading 4 PCs of the winter 3-day precipitation extremes, using the four leading PCs of winter SSTs as covariates.
- Fit a CART model for each PC separately.
- With the CART estimates of the PCs, estimate the 'model precipitation' at all the locations by multiplying the PC estimates with the eigenvectors.
(a) Compare the CART model precipitation with the historic record. Also perform a drop-10% cross-validation and boxplot the correlation and RMSE.
(b) Repeat (a) above with a random forest model.
Compare and summarize your findings.
ygrid=seq(-87.5,87.5,by=5)
ny=length(ygrid) # number of latitudinal locations
xgrid=seq(27.5,382.5,by=5)
nx=length(xgrid) # number of longitudinal locations
nglobe = nx*ny
# creation of spatial grid
xygrid=matrix(0,nrow=nx*ny,ncol=2)
i=0
for(iy in 1:ny){
for(ix in 1:nx){
i=i+1
xygrid[i,1]=ygrid[iy]
xygrid[i,2]=xgrid[ix]
}
}
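# Equivalent vectorized construction (a sketch; expand.grid varies its first
# argument fastest, which matches the inner ix loop above):
# xygrid <- as.matrix(expand.grid(x = xgrid, y = ygrid)[, c("y", "x")])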
nyrs = 55 # number of winter (DJFM) seasons read from the binary file
data=readBin("Kaplan-SST-DJFM1900-DJFM2018.r4",what="numeric", n=( nx * ny * nyrs), size=4,endian="swap")
data = array(data = data, dim=c( nx, ny, nyrs ) )
data1=data[,,1]
# the lat -long data grid..
index=1:(nx*ny)
index1=index[data1 < 20 & !is.na(data1)] # only non-missing data.
xygrid1=xygrid[index1,]
nsites=length(index1)
data2=data1[index1]
### SSTdata matrix - rows are seasonal (i.e. one value per year)
## and columns are locations
sstannavg=matrix(NA,nrow=nyrs, ncol=nsites)
for(i in 1:nyrs){
data1=data[,,i]
index1=index[data1 < 20 & !is.na(data1)]
data2=data1[index1]
sstannavg[i,]=data2
}
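# The year loop above can be collapsed to one line (a sketch, assuming the
# ocean mask index1 is the same in every year, which the loop already relies
# on so that each row has the same number of sites):
# sstannavg <- t(apply(data, 3, function(field) field[index1]))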
indexgrid = index1
rm("data") #remove the object data to clear up space
## write out the grid locations..
sstdata=sstannavg
xlongs = xygrid1[,2]
ylats = xygrid1[,1]
indexpac = indexgrid[ ylats >= -20 & xlongs >= 105 & xlongs <= 290] # Pacific domain: north of 20S, 105E to 290E
xlong = xlongs[ylats >= -20 & xlongs >= 105 & xlongs <= 290]
ylat = ylats[ylats >= -20 & xlongs >= 105 & xlongs <= 290]
index1=1:length(xlongs)
index = index1[ ylats >= -20 & xlongs >= 105 & xlongs <= 290]
sstannavg = sstdata[,index]
indexgrid = indexpac
#get variance matrix..
## scale the data
sstscale = scale(sstannavg)
zs=var(sstscale)
#do an Eigen decomposition..
zsvd=svd(zs)
#Principal Components...
pcs=sstscale %*%zsvd$u
#Eigen Values.. - fraction variance
lambdas=(zsvd$d/sum(zsvd$d))
sprintf("First PC explained %s percent of the variance",round(sum(lambdas[1])*100))
sprintf("First 2 PC explained %s percent of the variance",round(sum(lambdas[1:2])*100))
sprintf("First 3 PCs explained %s percent of the variance",round(sum(lambdas[1:3])*100))
sprintf("First 4 PCs explained %s percent of the variance",round(sum(lambdas[1:4])*100))
3-day winter precipitation extremes
N = 55 # number of winter seasons used for the precipitation analysis
sstannavg=sstannavg[1:N,]
wuplocs = matrix(scan("Precipitaton_data - Copy.txt"),ncol =4,byrow=T)
#wuplocs
nlocs = dim(wuplocs)[1]
xlats = wuplocs[,2]
xlongs = wuplocs[,1]
#xlongs
nlocs1=nlocs+1
winterprecip = matrix(scan("Max_Winter_Seas_Prec - Copy.txt"),ncol=nlocs1,byrow=T)
years = winterprecip[,1]
winterprecip = winterprecip[,2:nlocs1] #first column is year
winterprecip= scale(winterprecip)
zs=var(winterprecip)
#do an Eigen decomposition..
zsvd=svd(zs)
#Eigen vectors
Preceof=zsvd$u
#Principal Components...
precippcs=winterprecip %*% zsvd$u
#Eigen Values.. - fraction variance
lambdas1=(zsvd$d/sum(zsvd$d))
sprintf("First PC explained %s percent of the variance",round(sum(lambdas[1])*100))
sprintf("First 2 PC explained %s percent of the variance",round(sum(lambdas[1:2])*100))
sprintf("First 3 PCs explained %s percent of the variance",round(sum(lambdas[1:3])*100))
sprintf("First 4 PCs explained %s percent of the variance",round(sum(lambdas[1:4])*100))
#create the names of the PCs
name = paste0("PC", 1:dim(pcs)[2])
pcs=as.data.frame(pcs)
names(pcs)=name
pcs1=cbind(precippcs,pcs)
pcs1 = data.frame(pcs1)
tree.precip1 = tree(X1 ~PC1+PC2+PC3+PC4, data = pcs1, model = T)
summary(tree.precip1)
tree.precip2 = tree(X2 ~PC1+PC2+PC3+PC4, data = pcs1, model = T)
summary(tree.precip2)
tree.precip3 = tree(X3 ~PC1+PC2+PC3+PC4, data = pcs1, model = T)
summary(tree.precip3)
tree.precip4 = tree(X4 ~PC1+PC2+PC3+PC4, data = pcs1, model = T)
summary(tree.precip4)
Regression tree:
tree(formula = X1 ~ PC1 + PC2 + PC3 + PC4, data = pcs1, model = T)
Variables actually used in tree construction:
[1] "PC2" "PC4" "PC3"
Number of terminal nodes: 7
Residual mean deviance: 8.701 = 417.7 / 48
Distribution of residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-8.7090 -1.3360 -0.3201 0.0000 1.8950 5.8700
Regression tree:
tree(formula = X2 ~ PC1 + PC2 + PC3 + PC4, data = pcs1, model = T)
Variables actually used in tree construction:
[1] "PC3" "PC4"
Number of terminal nodes: 9
Residual mean deviance: 4.491 = 206.6 / 46
Distribution of residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-4.34900 -1.09600 -0.04789 0.00000 1.10400 4.61100
Regression tree:
tree(formula = X3 ~ PC1 + PC2 + PC3 + PC4, data = pcs1, model = T)
Number of terminal nodes: 9
Residual mean deviance: 2.873 = 132.2 / 46
Distribution of residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-3.3030 -1.1450 -0.3934 0.0000 1.0170 5.9580
Regression tree:
tree(formula = X4 ~ PC1 + PC2 + PC3 + PC4, data = pcs1, model = T)
Number of terminal nodes: 9
Residual mean deviance: 3.331 = 153.2 / 46
Distribution of residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-3.32000 -1.22200 0.02581 0.00000 0.96170 4.92300
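# The four fits above share one pattern; a compact alternative (a sketch)
# builds them in a loop so the response name is not hard-coded:
# tree.list <- lapply(paste0("X", 1:4), function(resp)
#   tree(as.formula(paste(resp, "~ PC1 + PC2 + PC3 + PC4")),
#        data = pcs1, model = TRUE))
# lapply(tree.list, summary)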
treeFun = function(myTree, toPlot = F, title = ""){
  # Perform CV on the tree object
  cvTree <- cv.tree(myTree)
  optTree <- which.min(cvTree$dev)
  bestTree <- cvTree$size[optTree]
  # Prune the tree based on the CV results
  pruneTree <- prune.tree(myTree, best = bestTree)
  # If plotting is selected
  if(toPlot){
    # Plot the unpruned tree
    plot(myTree)
    text(myTree, cex = .75)
    title(main = paste("Unpruned Tree for", title))
    # Plot the CV deviance against tree size
    plot(cvTree$size, cvTree$dev, type = "b",
         main = paste("Cross Validation for", title))
    # Plot the pruned tree
    plot(pruneTree)
    text(pruneTree, cex = .75)
    title(main = paste("Pruned Tree for", title))
  }
  pruneTree$bestTree = bestTree
  return(pruneTree)
}
treeFit1 = treeFun(tree.precip1, toPlot=T, title="pc1 Tree")
treeFit2 = tree.precip2 # PC2 tree kept unpruned
treeFit3 = treeFun(tree.precip3, toPlot=T, title="pc3 Tree")
treeFit4 = tree.precip4 # PC4 tree kept unpruned
#Use tree to predict original data
treePred1 <- predict(treeFit1)
#treeResid <- resid(treeFit)
myRange <- range(treePred1, precippcs[,1])
skill_tree1 = c(sqrt(mean((precippcs[,1]-treePred1)^2)),cor(precippcs[,1], treePred1))
names(skill_tree1)=c("rmse","Correlation")
#Plot observed vs predicted
P1=ggplot() +
geom_point(mapping = aes(precippcs[,1], treePred1), shape = 21, size = 3, color = "gray30", fill ="cadetblue1")+
geom_abline(intercept = 0, slope = 1) +
coord_cartesian(xlim = myRange, ylim = myRange) +
labs(title = "CART",x = "Oberved Precipitation",y = "Estimated Prediction")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
show(P1)
Drop_10_pred = function(X,bestTree,type) {
mod_data = X
N = nrow(X)
drop = 10
nsample = 572
i_full = 1:N
# initialize skill score vectors
skill_rmse = vector(mode="numeric", length=nsample)
skill_cor = vector(mode="numeric", length=nsample)
for (i in 1:nsample){
if (type=="tree"){
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
myTree =tree(X1 ~ PC1+PC2+PC3+PC4, data = drop_dt, model = T)
drop_mod= prune.tree(myTree, best = bestTree)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data[i_drop,1]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}else{
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
drop_mod=randomForest(X1 ~ PC1+PC2+PC3+PC4, data = drop_dt)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data[i_drop,1]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}
}
CV=as.data.frame(cbind(skill_rmse,skill_cor))
names(CV)=c("rmse","cor")
return(CV)
}
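# Drop_10_pred is re-defined below for each PC, with only the response name,
# the pruning size, and the model type changing. A single generalized version
# (a sketch; `resp` is a response column name such as "X1") avoids the
# repetition:
Drop_10_gen = function(X, resp, bestTree, type, nsample = 572, drop = 10) {
  form = as.formula(paste(resp, "~ PC1 + PC2 + PC3 + PC4"))
  N = nrow(X)
  skill_rmse = numeric(nsample)
  skill_cor = numeric(nsample)
  for (i in 1:nsample) {
    i_drop = sample(1:N, N * drop / 100) # hold out 10% of the years
    drop_dt = X[-i_drop, ]
    if (type == "tree") {
      drop_mod = prune.tree(tree(form, data = drop_dt, model = TRUE),
                            best = bestTree)
    } else {
      drop_mod = randomForest(form, data = drop_dt)
    }
    drop_pred = predict(drop_mod, newdata = X[i_drop, ])
    drop_actual = X[i_drop, resp]
    skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
    skill_cor[i] = cor(drop_actual, drop_pred)
  }
  data.frame(rmse = skill_rmse, cor = skill_cor)
}
# e.g. Skill = Drop_10_gen(pcs1, "X1", treeFit1$bestTree, "tree")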
Skill=Drop_10_pred(pcs1,treeFit1$bestTree,"tree")
f1_1=ggplot() +
  geom_boxplot(mapping = aes(x = "Correlation", y = Skill[,2])) +
  geom_point(mapping = aes(x = "Correlation", y = skill_tree1[2]),
             color = "red", size = 3)+
  labs(title="CART",x = "",y = "")+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
myrange1=range(Skill[,1])
f2_1=ggplot() +
  geom_boxplot(mapping = aes(x = "RMSE", y = Skill[,1])) +
  geom_point(mapping = aes(x = "RMSE", y = skill_tree1[1]),
             color = "red", size = 3)+
  labs(title="",x = "",y = "")+
  coord_cartesian(ylim = myrange1)
multiplot(f1_1, f2_1, cols = 2)
Warning message: "Removed 17 rows containing non-finite values (stat_boxplot)."
#Use tree to predict original data
treePred2 <- predict(treeFit2)
#treeResid <- resid(treeFit)
myRange <- range(treePred2, precippcs[,2])
skill_tree2 = c(sqrt(mean((precippcs[,2]-treePred2)^2)),cor(precippcs[,2], treePred2))
names(skill_tree2)=c("rmse","Correlation")
#Plot observed vs predicted
P1=ggplot() +
geom_point(mapping = aes(precippcs[,2], treePred2), shape = 21, size = 3, color = "gray30", fill ="cadetblue1")+
geom_abline(intercept = 0, slope = 1) +
coord_cartesian(xlim = myRange, ylim = myRange) +
labs(title = "CART",x = "Oberved Precipitation",y = "Estimated Prediction")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
show(P1)
Drop_10_pred = function(X,bestTree,type) {
mod_data = X
N = nrow(X)
drop = 10
nsample = 572
i_full = 1:N
# initialize skill score vectors
skill_rmse = vector(mode="numeric", length=nsample)
skill_cor = vector(mode="numeric", length=nsample)
for (i in 1:nsample){
if (type=="tree"){
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
myTree =tree(X2 ~ PC1+PC2+PC3+PC4, data = drop_dt, model = T)
drop_mod= prune.tree(myTree, best = bestTree)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data$X2[i_drop]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}else{
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
drop_mod=randomForest(X2 ~ PC1+PC2+PC3+PC4, data = drop_dt)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data$X2[i_drop]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}
}
CV=as.data.frame(cbind(skill_rmse,skill_cor))
names(CV)=c("rmse","cor")
return(CV)
}
Skill=Drop_10_pred(pcs1, 2, "tree") # prune the CV trees to 2 terminal nodes
f1_2=ggplot() +
  geom_boxplot(mapping = aes(x = "Correlation", y = Skill[,2])) +
  geom_point(mapping = aes(x = "Correlation", y = skill_tree2[2]),
             color = "red", size = 3)+
  labs(title="CART",x = "",y = "")+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
myrange1=range(Skill[,1])
f2_2=ggplot() +
  geom_boxplot(mapping = aes(x = "RMSE", y = Skill[,1])) +
  geom_point(mapping = aes(x = "RMSE", y = skill_tree2[1]),
             color = "red", size = 3)+
  labs(title="",x = "",y = "")+
  coord_cartesian(ylim = myrange1)
multiplot(f1_2, f2_2, cols = 2)
Warning message: "Removed 3 rows containing non-finite values (stat_boxplot)."
#Use tree to predict original data
treePred3 <- predict(treeFit3)
#treeResid <- resid(treeFit)
myRange <- range(treePred3, precippcs[,3])
skill_tree3 = c(sqrt(mean((precippcs[,3]-treePred3)^2)),cor(precippcs[,3], treePred3))
names(skill_tree3)=c("rmse","Correlation")
#Plot observed vs predicted
P1=ggplot() +
geom_point(mapping = aes(precippcs[,3], treePred3), shape = 21, size = 3, color = "gray30", fill ="cadetblue1")+
geom_abline(intercept = 0, slope = 1) +
coord_cartesian(xlim = myRange, ylim = myRange) +
labs(title = "CART",x = "Oberved Precipitation",y = "Estimated Prediction")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
show(P1)
Drop_10_pred = function(X,bestTree,type) {
mod_data = X
N = nrow(X)
drop = 10
nsample = 572
i_full = 1:N
# initialize skill score vectors
skill_rmse = vector(mode="numeric", length=nsample)
skill_cor = vector(mode="numeric", length=nsample)
for (i in 1:nsample){
if (type=="tree"){
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
myTree =tree(X3 ~ PC1+PC2+PC3+PC4, data = drop_dt, model = T)
drop_mod= prune.tree(myTree, best = bestTree)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data$X3[i_drop]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}else{
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
drop_mod=randomForest(X3 ~ PC1+PC2+PC3+PC4, data = drop_dt)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data$X3[i_drop]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}
}
CV=as.data.frame(cbind(skill_rmse,skill_cor))
names(CV)=c("rmse","cor")
return(CV)
}
Skill=Drop_10_pred(pcs1, treeFit3$bestTree,"tree")
f1_3=ggplot() +
  geom_boxplot(mapping = aes(x = "Correlation", y = Skill[,2])) +
  geom_point(mapping = aes(x = "Correlation", y = skill_tree3[2]),
             color = "red", size = 3)+
  labs(title="CART",x = "",y = "")+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
myrange1=range(Skill[,1])
f2_3=ggplot() +
  geom_boxplot(mapping = aes(x = "RMSE", y = Skill[,1])) +
  geom_point(mapping = aes(x = "RMSE", y = skill_tree3[1]),
             color = "red", size = 3)+
  labs(title="",x = "",y = "")+
  coord_cartesian(ylim = myrange1)
multiplot(f1_3, f2_3, cols = 2)
Warning message: "Removed 91 rows containing non-finite values (stat_boxplot)."
#Use tree to predict original data
treePred4 <- predict(treeFit4)
#treeResid <- resid(treeFit)
myRange <- range(treePred4, precippcs[,4])
skill_tree4 = c(sqrt(mean((precippcs[,4]-treePred4)^2)),cor(precippcs[,4], treePred4))
names(skill_tree4)=c("rmse","Correlation")
#Plot observed vs predicted
P1=ggplot() +
geom_point(mapping = aes(precippcs[,4], treePred4), shape = 21, size = 3, color = "gray30", fill ="cadetblue1")+
geom_abline(intercept = 0, slope = 1) +
coord_cartesian(xlim = myRange, ylim = myRange) +
labs(title = "CART",x = "Oberved Precipitation",y = "Estimated Prediction")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
show(P1)
Drop_10_pred = function(X,bestTree,type) {
mod_data = X
N = nrow(X)
drop = 10
nsample = 572
i_full = 1:N
# initialize skill score vectors
skill_rmse = vector(mode="numeric", length=nsample)
skill_cor = vector(mode="numeric", length=nsample)
for (i in 1:nsample){
if (type=="tree"){
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
myTree =tree(X4 ~ PC1+PC2+PC3+PC4, data = drop_dt, model = T)
drop_mod= prune.tree(myTree, best = bestTree)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data$X4[i_drop]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}else{
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
drop_mod=randomForest(X4 ~ PC1+PC2+PC3+PC4, data = drop_dt)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data$X4[i_drop]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}
}
CV=as.data.frame(cbind(skill_rmse,skill_cor))
names(CV)=c("rmse","cor")
return(CV)
}
Skill=Drop_10_pred(pcs1, 2, "tree") # prune the CV trees to 2 terminal nodes
f1_4=ggplot() +
  geom_boxplot(mapping = aes(x = "Correlation", y = Skill[,2])) +
  geom_point(mapping = aes(x = "Correlation", y = skill_tree4[2]),
             color = "red", size = 3)+
  labs(title="CART",x = "",y = "")+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
myrange1=range(Skill[,1])
f2_4=ggplot() +
  geom_boxplot(mapping = aes(x = "RMSE", y = Skill[,1])) +
  geom_point(mapping = aes(x = "RMSE", y = skill_tree4[1]),
             color = "red", size = 3)+
  labs(title="",x = "",y = "")+
  coord_cartesian(ylim = myrange1)
multiplot(f1_4, f2_4, cols = 2)
Warning message in cor(drop_actual, drop_pred): "the standard deviation is zero"
Warning message in cor(drop_actual, drop_pred): "the standard deviation is zero"
Warning message in cor(drop_actual, drop_pred): "the standard deviation is zero"
Warning message: "Removed 3 rows containing non-finite values (stat_boxplot)."
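# The "standard deviation is zero" warnings appear when the heavily pruned
# tree predicts a single constant for every held-out year, so cor() returns
# NA; stat_boxplot then drops those rows ("Removed 3 rows ..."). A guard
# (a sketch) records NA explicitly without the warning:
# skill_cor[i] = if (sd(drop_pred) > 0) cor(drop_actual, drop_pred) else NA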
forest.precip_1 = randomForest(X1 ~ PC1+PC2+PC3+PC4, data = pcs1)
summary(forest.precip_1)
plot(forest.precip_1)
                Length Class  Mode
call              3    -none- call
type              1    -none- character
predicted        55    -none- numeric
mse             500    -none- numeric
rsq             500    -none- numeric
oob.times        55    -none- numeric
importance        4    -none- numeric
importanceSD      0    -none- NULL
localImportance   0    -none- NULL
proximity         0    -none- NULL
ntree             1    -none- numeric
mtry              1    -none- numeric
forest           11    -none- list
coefs             0    -none- NULL
y                55    -none- numeric
test              0    -none- NULL
inbag             0    -none- NULL
terms             3    terms  call
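# Beyond the OOB error trace from plot(), the fitted forest reports variable
# importance; a quick look (a sketch) at which SST PCs drive the splits:
# importance(forest.precip_1)   # IncNodePurity for each of PC1-PC4
# varImpPlot(forest.precip_1)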
Drop_10_pred = function(X,bestTree,type) {
mod_data = X
N = nrow(X)
drop = 10
nsample = 572
i_full = 1:N
# initialize skill score vectors
skill_rmse = vector(mode="numeric", length=nsample)
skill_cor = vector(mode="numeric", length=nsample)
for (i in 1:nsample){
if (type=="tree"){
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
myTree =tree(X1 ~ PC1+PC2+PC3+PC4, data = drop_dt, model = T)
drop_mod= prune.tree(myTree, best = bestTree)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data[i_drop,1]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}else{
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
drop_mod=randomForest( X1 ~ PC1+PC2+PC3 +PC4, data = drop_dt)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data[i_drop,1]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}
}
CV=as.data.frame(cbind(skill_rmse,skill_cor))
names(CV)=c("rmse","cor")
return(CV)
}
rforestpred = predict(forest.precip_1)
myRange = range(rforestpred, precippcs[,1])
skill_tree_forest = c(sqrt(mean((precippcs[,1]-rforestpred)^2)),cor(precippcs[,1], rforestpred))
names(skill_tree_forest)=c("rmse","Correlation")
#Plot observed vs predicted
P2=ggplot() +
geom_point(mapping = aes(precippcs[,1], rforestpred), shape = 21, size = 3, color = "gray30", fill ="cadetblue1")+
geom_abline(intercept = 0, slope = 1) +
coord_cartesian(xlim = myRange, ylim = myRange) +
labs(title = "Random Forest",x = "Oberved Precipitation",y = "Estimated Prediction")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
show(P2)
Skill_forest = Drop_10_pred(pcs1,1,'random forest') # second argument is unused on the random-forest branch
f3=ggplot() +
  geom_boxplot(mapping = aes(x = "Correlation", y = Skill_forest[,2])) +
  geom_point(mapping = aes(x = "Correlation", y = skill_tree_forest[2]),
             color = "red", size = 3)+
  labs(title="Random Forest",x = "",y = "")+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
f4=ggplot() +
  geom_boxplot(mapping = aes(x = "RMSE", y = Skill_forest[,1])) +
  geom_point(mapping = aes(x = "RMSE", y = skill_tree_forest[1]),
             color = "red", size = 3)+
  labs(title="",x = "",y = "")+
  coord_cartesian(ylim = range(Skill_forest[,1]))
multiplot(f1_1, f2_1,f3,f4, cols = 2)
Warning message: "Removed 3 rows containing non-finite values (stat_boxplot)."
forest.precip_2 = randomForest(X2 ~ PC1+PC2+PC3+PC4, data = pcs1)
summary(forest.precip_2)
plot(forest.precip_2)
(summary output identical in structure to forest.precip_1 above)
rforestpred = predict(forest.precip_2)
myRange = range(rforestpred, precippcs[,2])
skill_tree_forest = c(sqrt(mean((precippcs[,2]-rforestpred)^2)),cor(precippcs[,2], rforestpred))
names(skill_tree_forest)=c("rmse","Correlation")
#Plot observed vs predicted
P2=ggplot() +
geom_point(mapping = aes(precippcs[,2], rforestpred), shape = 21, size = 3, color = "gray30", fill ="cadetblue1")+
geom_abline(intercept = 0, slope = 1) +
coord_cartesian(xlim = myRange, ylim = myRange) +
labs(title = "Random Forest",x = "Oberved Precipitation",y = "Estimated Prediction")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
show(P2)
Drop_10_pred = function(X,bestTree,type) {
mod_data = X
N = nrow(X)
drop = 10
nsample = 572
i_full = 1:N
# initialize skill score vectors
skill_rmse = vector(mode="numeric", length=nsample)
skill_cor = vector(mode="numeric", length=nsample)
for (i in 1:nsample){
if (type=="tree"){
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
myTree =tree(X2 ~ PC1+PC2+PC3+PC4, data = drop_dt, model = T)
drop_mod= prune.tree(myTree, best = bestTree)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data$X2[i_drop]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}else{
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
drop_mod=randomForest( X2 ~ PC1+PC2+PC3 +PC4, data = drop_dt)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data$X2[i_drop]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}
}
CV=as.data.frame(cbind(skill_rmse,skill_cor))
names(CV)=c("rmse","cor")
return(CV)
}
Skill_forest = Drop_10_pred(pcs1,1,'random forest')
f3=ggplot() +
  geom_boxplot(mapping = aes(x = "Correlation", y = Skill_forest[,2])) +
  geom_point(mapping = aes(x = "Correlation", y = skill_tree_forest[2]),
             color = "red", size = 3)+
  labs(title="Random Forest",x = "",y = "")+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
f4=ggplot() +
  geom_boxplot(mapping = aes(x = "RMSE", y = Skill_forest[,1])) +
  geom_point(mapping = aes(x = "RMSE", y = skill_tree_forest[1]),
             color = "red", size = 3)+
  labs(title="",x = "",y = "")+
  coord_cartesian(ylim = range(Skill_forest[,1]))
multiplot(f1_2, f2_2,f3,f4, cols = 2)
Warning message: "Removed 3 rows containing non-finite values (stat_boxplot)."
forest.precip_3 = randomForest(X3 ~ PC1+PC2+PC3+PC4, data = pcs1)
summary(forest.precip_3)
plot(forest.precip_3)
(summary output identical in structure to forest.precip_1 above)
rforestpred = predict(forest.precip_3)
myRange = range(rforestpred, precippcs[,3])
skill_tree_forest = c(sqrt(mean((precippcs[,3]-rforestpred)^2)),cor(precippcs[,3], rforestpred))
names(skill_tree_forest)=c("rmse","Correlation")
#Plot observed vs predicted
P2=ggplot() +
geom_point(mapping = aes(precippcs[,3], rforestpred), shape = 21, size = 3, color = "gray30", fill ="cadetblue1")+
geom_abline(intercept = 0, slope = 1) +
coord_cartesian(xlim = myRange, ylim = myRange) +
labs(title = "Random Forest",x = "Oberved Precipitation",y = "Estimated Prediction")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
show(P2)
Drop_10_pred = function(X,bestTree,type) {
mod_data = X
N = nrow(X)
drop = 10
nsample = 572
i_full = 1:N
# initialize skill score vectors
skill_rmse = vector(mode="numeric", length=nsample)
skill_cor = vector(mode="numeric", length=nsample)
for (i in 1:nsample){
if (type=="tree"){
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
myTree =tree(X3 ~ PC1+PC2+PC3+PC4, data = drop_dt, model = T)
drop_mod= prune.tree(myTree, best = bestTree)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data$X3[i_drop]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}else{
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
drop_mod=randomForest( X3 ~ PC1+PC2+PC3 +PC4, data = drop_dt)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data$X3[i_drop]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}
}
CV=as.data.frame(cbind(skill_rmse,skill_cor))
names(CV)=c("rmse","cor")
return(CV)
}
Skill_forest = Drop_10_pred(pcs1,1,'random forest')
f3=ggplot() +
  geom_boxplot(mapping = aes(x = "Correlation", y = Skill_forest[,2])) +
  geom_point(mapping = aes(x = "Correlation", y = skill_tree_forest[2]),
             color = "red", size = 3)+
  labs(title="Random Forest",x = "",y = "")+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
f4=ggplot() +
  geom_boxplot(mapping = aes(x = "RMSE", y = Skill_forest[,1])) +
  geom_point(mapping = aes(x = "RMSE", y = skill_tree_forest[1]),
             color = "red", size = 3)+
  labs(title="",x = "",y = "")+
  coord_cartesian(ylim = range(Skill_forest[,1]))
multiplot(f1_3, f2_3,f3,f4, cols = 2)
Warning message: "Removed 3 rows containing non-finite values (stat_boxplot)."
forest.precip_4 = randomForest(X4 ~ PC1+PC2+PC3+PC4, data = pcs1)
summary(forest.precip_4)
plot(forest.precip_4)
(summary output identical in structure to forest.precip_1 above)
rforestpred = predict(forest.precip_4)
myRange = range(rforestpred, precippcs[,4])
skill_tree_forest = c(sqrt(mean((precippcs[,4]-rforestpred)^2)),cor(precippcs[,4], rforestpred))
names(skill_tree_forest)=c("rmse","Correlation")
#Plot observed vs predicted
P2=ggplot() +
geom_point(mapping = aes(precippcs[,4], rforestpred), shape = 21, size = 3, color = "gray30", fill ="cadetblue1")+
geom_abline(intercept = 0, slope = 1) +
coord_cartesian(xlim = myRange, ylim = myRange) +
labs(title = "Random Forest",x = "Oberved Precipitation",y = "Estimated Prediction")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
show(P2)
Drop_10_pred = function(X,bestTree,type) {
mod_data = X
N = nrow(X)
drop = 10
nsample = 572
i_full = 1:N
# initialize skill score vectors
skill_rmse = vector(mode="numeric", length=nsample)
skill_cor = vector(mode="numeric", length=nsample)
for (i in 1:nsample){
if (type=="tree"){
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
myTree =tree(X4 ~ PC1+PC2+PC3+PC4, data = drop_dt, model = T)
drop_mod= prune.tree(myTree, best = bestTree)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data$X4[i_drop]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}else{
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
drop_mod=randomForest( X4 ~ PC1+PC2+PC3 +PC4, data = drop_dt)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data$X4[i_drop]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}
}
CV=as.data.frame(cbind(skill_rmse,skill_cor))
names(CV)=c("rmse","cor")
return(CV)
}
Skill_forest = Drop_10_pred(pcs1,1,'random forest')
f3=ggplot() +
  geom_boxplot(mapping = aes(x = "Correlation", y = Skill_forest[,2])) +
  geom_point(mapping = aes(x = "Correlation", y = skill_tree_forest[2]),
             color = "red", size = 3)+
  labs(title="Random Forest",x = "",y = "")+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
f4=ggplot() +
  geom_boxplot(mapping = aes(x = "RMSE", y = Skill_forest[,1])) +
  geom_point(mapping = aes(x = "RMSE", y = skill_tree_forest[1]),
             color = "red", size = 3)+
  labs(title="",x = "",y = "")+
  coord_cartesian(ylim = range(Skill_forest[,1]))
multiplot(f1_4, f2_4,f3,f4, cols = 2)
Warning message: "Removed 3 rows containing non-finite values (stat_boxplot)."
multiplot(P1,P2, cols = 2)
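# The prompt also asks for the 'model precipitation' at all locations,
# obtained by multiplying the modeled PCs by the eigenvectors. A sketch of
# that reconstruction and the station-wise comparison with the historic
# record (assuming Preceof holds the precipitation eigenvectors and using
# the attributes stored by scale() to undo the standardization):
cart_pcs = cbind(treePred1, treePred2, treePred3, treePred4)
rf_pcs   = cbind(predict(forest.precip_1), predict(forest.precip_2),
                 predict(forest.precip_3), predict(forest.precip_4))
unscale  = function(z) sweep(sweep(z, 2, attr(winterprecip, "scaled:scale"), "*"),
                             2, attr(winterprecip, "scaled:center"), "+")
precip_cart = unscale(cart_pcs %*% t(Preceof[, 1:4]))
precip_rf   = unscale(rf_pcs %*% t(Preceof[, 1:4]))
precip_hist = unscale(winterprecip)
# station-wise skill of the two reconstructions against the historic record
cor_cart = sapply(1:nlocs, function(j) cor(precip_hist[, j], precip_cart[, j]))
cor_rf   = sapply(1:nlocs, function(j) cor(precip_hist[, j], precip_rf[, j]))
summary(cor_cart)
summary(cor_rf)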