library(kohonen)
library(raster)
library(rasterVis)
library(maptools)
library(RColorBrewer)
data("wrld_simpl")
library(tree)
library(randomForest)
library(ggplot2)
set.seed(1)
# multiplot helper (adapted from the Cookbook for R): lay out several
# ggplot objects on one page
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
  library(grid)
  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  numPlots = length(plots)
  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: Number of columns of plots
    # nrow: Number of rows needed, calculated from # of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                     ncol = cols, nrow = ceiling(numPlots/cols))
  }
  if (numPlots == 1) {
    print(plots[[1]])
  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}
Fit a CART model to the leading 4 PCs of the winter 3-day precipitation extremes, using the four leading PCs of winter SSTs as covariates.
- Fit a CART model for each PC separately.
- With the CART estimates of the PCs, estimate the 'model precipitation' at all the locations by multiplying the PC estimates with the eigenvectors.
(a) Compare the CART model precipitation with the historic record. Also perform a drop-10% cross-validation and boxplot the correlation and RMSE.
(b) Repeat (a) above with a random forest model.
Compare and summarize your findings.
ygrid=seq(-87.5,87.5,by=5)
ny=length(ygrid) # number of latitudinal locations
xgrid=seq(27.5,382.5,by=5)
nx=length(xgrid) # number of longitudinal locations
nglobe = nx*ny
# creation of spatial grid
xygrid=matrix(0,nrow=nx*ny,ncol=2)
i=0
for(iy in 1:ny){
for(ix in 1:nx){
i=i+1
xygrid[i,1]=ygrid[iy]
xygrid[i,2]=xgrid[ix]
}
}
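# Equivalent vectorized construction (a sketch; expand.grid varies its first
# argument fastest, which matches the inner ix loop above):
# xygrid <- as.matrix(expand.grid(x = xgrid, y = ygrid)[, c("y", "x")])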
nyrs = 55 # number of winter (DJFM) seasons read from the binary file
data=readBin("Kaplan-SST-DJFM1900-DJFM2018.r4",what="numeric", n=( nx * ny * nyrs), size=4,endian="swap")
data = array(data = data, dim=c( nx, ny, nyrs ) )
data1=data[,,1]
# the lat -long data grid..
index=1:(nx*ny)
index1=index[data1 < 20 & !is.na(data1)] # only non-missing data.
xygrid1=xygrid[index1,]
nsites=length(index1)
data2=data1[index1]
### SSTdata matrix - rows are seasonal (i.e. one value per year)
## and columns are locations
sstannavg=matrix(NA,nrow=nyrs, ncol=nsites)
for(i in 1:nyrs){
data1=data[,,i]
index1=index[data1 < 20 & !is.na(data1)]
data2=data1[index1]
sstannavg[i,]=data2
}
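# The year loop above can be collapsed to one line (a sketch, assuming the
# ocean mask index1 is the same in every year, which the loop already relies
# on so that each row has the same number of sites):
# sstannavg <- t(apply(data, 3, function(field) field[index1]))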
indexgrid = index1
rm("data") #remove the object data to clear up space
## write out the grid locations..
sstdata=sstannavg
xlongs = xygrid1[,2]
ylats = xygrid1[,1]
indexpac = indexgrid[ ylats >= -20 & xlongs >= 105 & xlongs <= 290] # Pacific domain: north of 20S, 105E to 290E
xlong = xlongs[ylats >= -20 & xlongs >= 105 & xlongs <= 290]
ylat = ylats[ylats >= -20 & xlongs >= 105 & xlongs <= 290]
index1=1:length(xlongs)
index = index1[ ylats >= -20 & xlongs >= 105 & xlongs <= 290]
sstannavg = sstdata[,index]
indexgrid = indexpac
#get variance matrix..
## scale the data
sstscale = scale(sstannavg)
zs=var(sstscale)
#do an Eigen decomposition..
zsvd=svd(zs)
#Principal Components...
pcs=sstscale %*%zsvd$u
#Eigen Values.. - fraction variance
lambdas=(zsvd$d/sum(zsvd$d))
sprintf("First PC explained %s percent of the variance",round(sum(lambdas[1])*100))
sprintf("First 2 PC explained %s percent of the variance",round(sum(lambdas[1:2])*100))
sprintf("First 3 PCs explained %s percent of the variance",round(sum(lambdas[1:3])*100))
sprintf("First 4 PCs explained %s percent of the variance",round(sum(lambdas[1:4])*100))
3-day winter precipitation extremes
N = 55 # number of winter seasons used for the precipitation analysis
sstannavg=sstannavg[1:N,]
wuplocs = matrix(scan("Precipitaton_data - Copy.txt"),ncol =4,byrow=T)
#wuplocs
nlocs = dim(wuplocs)[1]
xlats = wuplocs[,2]
xlongs = wuplocs[,1]
#xlongs
nlocs1=nlocs+1
winterprecip = matrix(scan("Max_Winter_Seas_Prec - Copy.txt"),ncol=nlocs1,byrow=T)
years = winterprecip[,1]
winterprecip = winterprecip[,2:nlocs1] #first column is year
winterprecip= scale(winterprecip)
zs=var(winterprecip)
#do an Eigen decomposition..
zsvd=svd(zs)
#Eigen vectors
Preceof=zsvd$u
#Principal Components...
precippcs=winterprecip %*% zsvd$u
#Eigen Values.. - fraction variance
lambdas1=(zsvd$d/sum(zsvd$d))
sprintf("First PC explained %s percent of the variance",round(sum(lambdas[1])*100))
sprintf("First 2 PC explained %s percent of the variance",round(sum(lambdas[1:2])*100))
sprintf("First 3 PCs explained %s percent of the variance",round(sum(lambdas[1:3])*100))
sprintf("First 4 PCs explained %s percent of the variance",round(sum(lambdas[1:4])*100))
#create the names of the PCs
name = paste0("PC", 1:dim(pcs)[2])
pcs=as.data.frame(pcs)
names(pcs)=name
pcs1=cbind(precippcs,pcs)
pcs1 = data.frame(pcs1)
tree.precip1 = tree(X1 ~PC1+PC2+PC3+PC4, data = pcs1, model = T)
summary(tree.precip1)
tree.precip2 = tree(X2 ~PC1+PC2+PC3+PC4, data = pcs1, model = T)
summary(tree.precip2)
tree.precip3 = tree(X3 ~PC1+PC2+PC3+PC4, data = pcs1, model = T)
summary(tree.precip3)
tree.precip4 = tree(X4 ~PC1+PC2+PC3+PC4, data = pcs1, model = T)
summary(tree.precip4)
Regression tree:
tree(formula = X1 ~ PC1 + PC2 + PC3 + PC4, data = pcs1, model = T)
Variables actually used in tree construction:
[1] "PC2" "PC4" "PC3"
Number of terminal nodes: 7
Residual mean deviance: 8.701 = 417.7 / 48
Distribution of residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-8.7090 -1.3360 -0.3201 0.0000 1.8950 5.8700
Regression tree:
tree(formula = X2 ~ PC1 + PC2 + PC3 + PC4, data = pcs1, model = T)
Variables actually used in tree construction:
[1] "PC3" "PC4"
Number of terminal nodes: 9
Residual mean deviance: 4.491 = 206.6 / 46
Distribution of residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-4.34900 -1.09600 -0.04789 0.00000 1.10400 4.61100
Regression tree:
tree(formula = X3 ~ PC1 + PC2 + PC3 + PC4, data = pcs1, model = T)
Number of terminal nodes: 9
Residual mean deviance: 2.873 = 132.2 / 46
Distribution of residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-3.3030 -1.1450 -0.3934 0.0000 1.0170 5.9580
Regression tree:
tree(formula = X4 ~ PC1 + PC2 + PC3 + PC4, data = pcs1, model = T)
Number of terminal nodes: 9
Residual mean deviance: 3.331 = 153.2 / 46
Distribution of residuals:
Min. 1st Qu. Median Mean 3rd Qu. Max.
-3.32000 -1.22200 0.02581 0.00000 0.96170 4.92300
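# The four fits above share one pattern; a compact alternative (a sketch)
# builds them in a loop so the response name is not hard-coded:
# tree.list <- lapply(paste0("X", 1:4), function(resp)
#   tree(as.formula(paste(resp, "~ PC1 + PC2 + PC3 + PC4")),
#        data = pcs1, model = TRUE))
# lapply(tree.list, summary)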
treeFun = function(myTree, toPlot = F, title = ""){
  # Perform CV on the tree object
  cvTree <- cv.tree(myTree)
  optTree <- which.min(cvTree$dev)
  bestTree <- cvTree$size[optTree]
  # Prune the tree based on the CV results
  pruneTree <- prune.tree(myTree, best = bestTree)
  # If plotting is selected
  if(toPlot){
    # Plot the unpruned tree
    plot(myTree)
    text(myTree, cex = .75)
    title(main = paste("Unpruned Tree for", title))
    # Plot the CV deviance against tree size
    plot(cvTree$size, cvTree$dev, type = "b",
         main = paste("Cross Validation for", title))
    # Plot the pruned tree
    plot(pruneTree)
    text(pruneTree, cex = .75)
    title(main = paste("Pruned Tree for", title))
  }
  pruneTree$bestTree = bestTree
  return(pruneTree)
}
treeFit1 = treeFun(tree.precip1, toPlot=T, title="pc1 Tree")
treeFit2 = tree.precip2 # PC2 tree kept unpruned
treeFit3 = treeFun(tree.precip3, toPlot=T, title="pc3 Tree")
treeFit4 = tree.precip4 # PC4 tree kept unpruned
#Use tree to predict original data
treePred1 <- predict(treeFit1)
#treeResid <- resid(treeFit)
myRange <- range(treePred1, precippcs[,1])
skill_tree1 = c(sqrt(mean((precippcs[,1]-treePred1)^2)),cor(precippcs[,1], treePred1))
names(skill_tree1)=c("rmse","Correlation")
#Plot observed vs predicted
P1=ggplot() +
geom_point(mapping = aes(precippcs[,1], treePred1), shape = 21, size = 3, color = "gray30", fill ="cadetblue1")+
geom_abline(intercept = 0, slope = 1) +
coord_cartesian(xlim = myRange, ylim = myRange) +
labs(title = "CART",x = "Oberved Precipitation",y = "Estimated Prediction")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
show(P1)
Drop_10_pred = function(X,bestTree,type) {
mod_data = X
N = nrow(X)
drop = 10
nsample = 572
i_full = 1:N
# initialize skill score vectors
skill_rmse = vector(mode="numeric", length=nsample)
skill_cor = vector(mode="numeric", length=nsample)
for (i in 1:nsample){
if (type=="tree"){
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
myTree =tree(X1 ~ PC1+PC2+PC3+PC4, data = drop_dt, model = T)
drop_mod= prune.tree(myTree, best = bestTree)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data[i_drop,1]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}else{
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
drop_mod=randomForest(X1 ~ PC1+PC2+PC3+PC4, data = drop_dt)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data[i_drop,1]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}
}
CV=as.data.frame(cbind(skill_rmse,skill_cor))
names(CV)=c("rmse","cor")
return(CV)
}
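# Drop_10_pred is re-defined below for each PC, with only the response name,
# the pruning size, and the model type changing. A single generalized version
# (a sketch; `resp` is a response column name such as "X1") avoids the
# repetition:
Drop_10_gen = function(X, resp, bestTree, type, nsample = 572, drop = 10) {
  form = as.formula(paste(resp, "~ PC1 + PC2 + PC3 + PC4"))
  N = nrow(X)
  skill_rmse = numeric(nsample)
  skill_cor = numeric(nsample)
  for (i in 1:nsample) {
    i_drop = sample(1:N, N * drop / 100) # hold out 10% of the years
    drop_dt = X[-i_drop, ]
    if (type == "tree") {
      drop_mod = prune.tree(tree(form, data = drop_dt, model = TRUE),
                            best = bestTree)
    } else {
      drop_mod = randomForest(form, data = drop_dt)
    }
    drop_pred = predict(drop_mod, newdata = X[i_drop, ])
    drop_actual = X[i_drop, resp]
    skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
    skill_cor[i] = cor(drop_actual, drop_pred)
  }
  data.frame(rmse = skill_rmse, cor = skill_cor)
}
# e.g. Skill = Drop_10_gen(pcs1, "X1", treeFit1$bestTree, "tree")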
Skill=Drop_10_pred(pcs1,treeFit1$bestTree,"tree")
f1_1=ggplot() +
  geom_boxplot(mapping = aes(x = "Correlation", y = Skill[,2])) +
  geom_point(mapping = aes(x = "Correlation", y = skill_tree1[2]),
             color = "red", size = 3)+
  labs(title="CART",x = "",y = "")+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
myrange1=range(Skill[,1])
f2_1=ggplot() +
  geom_boxplot(mapping = aes(x = "RMSE", y = Skill[,1])) +
  geom_point(mapping = aes(x = "RMSE", y = skill_tree1[1]),
             color = "red", size = 3)+
  labs(title="",x = "",y = "")+
  coord_cartesian(ylim = myrange1)
multiplot(f1_1, f2_1, cols = 2)
Warning message: "Removed 17 rows containing non-finite values (stat_boxplot)."
#Use tree to predict original data
treePred2 <- predict(treeFit2)
#treeResid <- resid(treeFit)
myRange <- range(treePred2, precippcs[,2])
skill_tree2 = c(sqrt(mean((precippcs[,2]-treePred2)^2)),cor(precippcs[,2], treePred2))
names(skill_tree2)=c("rmse","Correlation")
#Plot observed vs predicted
P1=ggplot() +
geom_point(mapping = aes(precippcs[,2], treePred2), shape = 21, size = 3, color = "gray30", fill ="cadetblue1")+
geom_abline(intercept = 0, slope = 1) +
coord_cartesian(xlim = myRange, ylim = myRange) +
labs(title = "CART",x = "Oberved Precipitation",y = "Estimated Prediction")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
show(P1)
Drop_10_pred = function(X,bestTree,type) {
mod_data = X
N = nrow(X)
drop = 10
nsample = 572
i_full = 1:N
# initialize skill score vectors
skill_rmse = vector(mode="numeric", length=nsample)
skill_cor = vector(mode="numeric", length=nsample)
for (i in 1:nsample){
if (type=="tree"){
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
myTree =tree(X2 ~ PC1+PC2+PC3+PC4, data = drop_dt, model = T)
drop_mod= prune.tree(myTree, best = bestTree)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data$X2[i_drop]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}else{
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
drop_mod=randomForest(X2 ~ PC1+PC2+PC3+PC4, data = drop_dt)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data$X2[i_drop]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}
}
CV=as.data.frame(cbind(skill_rmse,skill_cor))
names(CV)=c("rmse","cor")
return(CV)
}
Skill=Drop_10_pred(pcs1, 2, "tree") # prune the CV trees to 2 terminal nodes
f1_2=ggplot() +
  geom_boxplot(mapping = aes(x = "Correlation", y = Skill[,2])) +
  geom_point(mapping = aes(x = "Correlation", y = skill_tree2[2]),
             color = "red", size = 3)+
  labs(title="CART",x = "",y = "")+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
myrange1=range(Skill[,1])
f2_2=ggplot() +
  geom_boxplot(mapping = aes(x = "RMSE", y = Skill[,1])) +
  geom_point(mapping = aes(x = "RMSE", y = skill_tree2[1]),
             color = "red", size = 3)+
  labs(title="",x = "",y = "")+
  coord_cartesian(ylim = myrange1)
multiplot(f1_2, f2_2, cols = 2)
Warning message: "Removed 3 rows containing non-finite values (stat_boxplot)."
#Use tree to predict original data
treePred3 <- predict(treeFit3)
#treeResid <- resid(treeFit)
myRange <- range(treePred3, precippcs[,3])
skill_tree3 = c(sqrt(mean((precippcs[,3]-treePred3)^2)),cor(precippcs[,3], treePred3))
names(skill_tree3)=c("rmse","Correlation")
#Plot observed vs predicted
P1=ggplot() +
geom_point(mapping = aes(precippcs[,3], treePred3), shape = 21, size = 3, color = "gray30", fill ="cadetblue1")+
geom_abline(intercept = 0, slope = 1) +
coord_cartesian(xlim = myRange, ylim = myRange) +
labs(title = "CART",x = "Oberved Precipitation",y = "Estimated Prediction")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
show(P1)
Drop_10_pred = function(X,bestTree,type) {
mod_data = X
N = nrow(X)
drop = 10
nsample = 572
i_full = 1:N
# initialize skill score vectors
skill_rmse = vector(mode="numeric", length=nsample)
skill_cor = vector(mode="numeric", length=nsample)
for (i in 1:nsample){
if (type=="tree"){
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
myTree =tree(X3 ~ PC1+PC2+PC3+PC4, data = drop_dt, model = T)
drop_mod= prune.tree(myTree, best = bestTree)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data$X3[i_drop]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}else{
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
drop_mod=randomForest(X3 ~ PC1+PC2+PC3+PC4, data = drop_dt)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data$X3[i_drop]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}
}
CV=as.data.frame(cbind(skill_rmse,skill_cor))
names(CV)=c("rmse","cor")
return(CV)
}
Skill=Drop_10_pred(pcs1, treeFit3$bestTree,"tree")
f1_3=ggplot() +
  geom_boxplot(mapping = aes(x = "Correlation", y = Skill[,2])) +
  geom_point(mapping = aes(x = "Correlation", y = skill_tree3[2]),
             color = "red", size = 3)+
  labs(title="CART",x = "",y = "")+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
myrange1=range(Skill[,1])
f2_3=ggplot() +
  geom_boxplot(mapping = aes(x = "RMSE", y = Skill[,1])) +
  geom_point(mapping = aes(x = "RMSE", y = skill_tree3[1]),
             color = "red", size = 3)+
  labs(title="",x = "",y = "")+
  coord_cartesian(ylim = myrange1)
multiplot(f1_3, f2_3, cols = 2)
Warning message: "Removed 91 rows containing non-finite values (stat_boxplot)."
#Use tree to predict original data
treePred4 <- predict(treeFit4)
#treeResid <- resid(treeFit)
myRange <- range(treePred4, precippcs[,4])
skill_tree4 = c(sqrt(mean((precippcs[,4]-treePred4)^2)),cor(precippcs[,4], treePred4))
names(skill_tree4)=c("rmse","Correlation")
#Plot observed vs predicted
P1=ggplot() +
geom_point(mapping = aes(precippcs[,4], treePred4), shape = 21, size = 3, color = "gray30", fill ="cadetblue1")+
geom_abline(intercept = 0, slope = 1) +
coord_cartesian(xlim = myRange, ylim = myRange) +
labs(title = "CART",x = "Oberved Precipitation",y = "Estimated Prediction")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
show(P1)
Drop_10_pred = function(X,bestTree,type) {
mod_data = X
N = nrow(X)
drop = 10
nsample = 572
i_full = 1:N
# initialize skill score vectors
skill_rmse = vector(mode="numeric", length=nsample)
skill_cor = vector(mode="numeric", length=nsample)
for (i in 1:nsample){
if (type=="tree"){
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
myTree =tree(X4 ~ PC1+PC2+PC3+PC4, data = drop_dt, model = T)
drop_mod= prune.tree(myTree, best = bestTree)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data$X4[i_drop]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}else{
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
drop_mod=randomForest(X4 ~ PC1+PC2+PC3+PC4, data = drop_dt)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data$X4[i_drop]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}
}
CV=as.data.frame(cbind(skill_rmse,skill_cor))
names(CV)=c("rmse","cor")
return(CV)
}
Skill=Drop_10_pred(pcs1, 2, "tree") # prune the CV trees to 2 terminal nodes
f1_4=ggplot() +
  geom_boxplot(mapping = aes(x = "Correlation", y = Skill[,2])) +
  geom_point(mapping = aes(x = "Correlation", y = skill_tree4[2]),
             color = "red", size = 3)+
  labs(title="CART",x = "",y = "")+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
myrange1=range(Skill[,1])
f2_4=ggplot() +
  geom_boxplot(mapping = aes(x = "RMSE", y = Skill[,1])) +
  geom_point(mapping = aes(x = "RMSE", y = skill_tree4[1]),
             color = "red", size = 3)+
  labs(title="",x = "",y = "")+
  coord_cartesian(ylim = myrange1)
multiplot(f1_4, f2_4, cols = 2)
Warning message in cor(drop_actual, drop_pred): "the standard deviation is zero"
Warning message in cor(drop_actual, drop_pred): "the standard deviation is zero"
Warning message in cor(drop_actual, drop_pred): "the standard deviation is zero"
Warning message: "Removed 3 rows containing non-finite values (stat_boxplot)."
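# The "standard deviation is zero" warnings appear when the heavily pruned
# tree predicts a single constant for every held-out year, so cor() returns
# NA; stat_boxplot then drops those rows ("Removed 3 rows ..."). A guard
# (a sketch) records NA explicitly without the warning:
# skill_cor[i] = if (sd(drop_pred) > 0) cor(drop_actual, drop_pred) else NA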
forest.precip_1 = randomForest(X1 ~ PC1+PC2+PC3+PC4, data = pcs1)
summary(forest.precip_1)
plot(forest.precip_1)
                Length Class  Mode
call              3    -none- call
type              1    -none- character
predicted        55    -none- numeric
mse             500    -none- numeric
rsq             500    -none- numeric
oob.times        55    -none- numeric
importance        4    -none- numeric
importanceSD      0    -none- NULL
localImportance   0    -none- NULL
proximity         0    -none- NULL
ntree             1    -none- numeric
mtry              1    -none- numeric
forest           11    -none- list
coefs             0    -none- NULL
y                55    -none- numeric
test              0    -none- NULL
inbag             0    -none- NULL
terms             3    terms  call
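# Beyond the OOB error trace from plot(), the fitted forest reports variable
# importance; a quick look (a sketch) at which SST PCs drive the splits:
# importance(forest.precip_1)   # IncNodePurity for each of PC1-PC4
# varImpPlot(forest.precip_1)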
Drop_10_pred = function(X,bestTree,type) {
mod_data = X
N = nrow(X)
drop = 10
nsample = 572
i_full = 1:N
# initialize skill score vectors
skill_rmse = vector(mode="numeric", length=nsample)
skill_cor = vector(mode="numeric", length=nsample)
for (i in 1:nsample){
if (type=="tree"){
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
myTree =tree(X1 ~ PC1+PC2+PC3+PC4, data = drop_dt, model = T)
drop_mod= prune.tree(myTree, best = bestTree)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data[i_drop,1]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}else{
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
drop_mod=randomForest( X1 ~ PC1+PC2+PC3 +PC4, data = drop_dt)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data[i_drop,1]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}
}
CV=as.data.frame(cbind(skill_rmse,skill_cor))
names(CV)=c("rmse","cor")
return(CV)
}
rforestpred = predict(forest.precip_1)
myRange = range(rforestpred, precippcs[,1])
skill_tree_forest = c(sqrt(mean((precippcs[,1]-rforestpred)^2)),cor(precippcs[,1], rforestpred))
names(skill_tree_forest)=c("rmse","Correlation")
#Plot observed vs predicted
P2=ggplot() +
geom_point(mapping = aes(precippcs[,1], rforestpred), shape = 21, size = 3, color = "gray30", fill ="cadetblue1")+
geom_abline(intercept = 0, slope = 1) +
coord_cartesian(xlim = myRange, ylim = myRange) +
labs(title = "Random Forest",x = "Oberved Precipitation",y = "Estimated Prediction")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
show(P2)
Skill_forest = Drop_10_pred(pcs1,1,'random forest') # second argument is unused on the random-forest branch
f3=ggplot() +
  geom_boxplot(mapping = aes(x = "Correlation", y = Skill_forest[,2])) +
  geom_point(mapping = aes(x = "Correlation", y = skill_tree_forest[2]),
             color = "red", size = 3)+
  labs(title="Random Forest",x = "",y = "")+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
f4=ggplot() +
  geom_boxplot(mapping = aes(x = "RMSE", y = Skill_forest[,1])) +
  geom_point(mapping = aes(x = "RMSE", y = skill_tree_forest[1]),
             color = "red", size = 3)+
  labs(title="",x = "",y = "")+
  coord_cartesian(ylim = range(Skill_forest[,1]))
multiplot(f1_1, f2_1,f3,f4, cols = 2)
Warning message: "Removed 3 rows containing non-finite values (stat_boxplot)."
forest.precip_2 = randomForest(X2 ~ PC1+PC2+PC3+PC4, data = pcs1)
summary(forest.precip_2)
plot(forest.precip_2)
(summary output identical in structure to forest.precip_1 above)
rforestpred = predict(forest.precip_2)
myRange = range(rforestpred, precippcs[,2])
skill_tree_forest = c(sqrt(mean((precippcs[,2]-rforestpred)^2)),cor(precippcs[,2], rforestpred))
names(skill_tree_forest)=c("rmse","Correlation")
#Plot observed vs predicted
P2=ggplot() +
geom_point(mapping = aes(precippcs[,2], rforestpred), shape = 21, size = 3, color = "gray30", fill ="cadetblue1")+
geom_abline(intercept = 0, slope = 1) +
coord_cartesian(xlim = myRange, ylim = myRange) +
labs(title = "Random Forest",x = "Oberved Precipitation",y = "Estimated Prediction")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
show(P2)
Drop_10_pred = function(X,bestTree,type) {
mod_data = X
N = nrow(X)
drop = 10
nsample = 572
i_full = 1:N
# initialize skill score vectors
skill_rmse = vector(mode="numeric", length=nsample)
skill_cor = vector(mode="numeric", length=nsample)
for (i in 1:nsample){
if (type=="tree"){
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
myTree =tree(X2 ~ PC1+PC2+PC3+PC4, data = drop_dt, model = T)
drop_mod= prune.tree(myTree, best = bestTree)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data$X2[i_drop]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}else{
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
drop_mod=randomForest( X2 ~ PC1+PC2+PC3 +PC4, data = drop_dt)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data$X2[i_drop]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}
}
CV=as.data.frame(cbind(skill_rmse,skill_cor))
names(CV)=c("rmse","cor")
return(CV)
}
Skill_forest = Drop_10_pred(pcs1,1,'random forest')
f3=ggplot() +
  geom_boxplot(mapping = aes(x = "Correlation", y = Skill_forest[,2])) +
  geom_point(mapping = aes(x = "Correlation", y = skill_tree_forest[2]),
             color = "red", size = 3)+
  labs(title="Random Forest",x = "",y = "")+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
f4=ggplot() +
  geom_boxplot(mapping = aes(x = "RMSE", y = Skill_forest[,1])) +
  geom_point(mapping = aes(x = "RMSE", y = skill_tree_forest[1]),
             color = "red", size = 3)+
  labs(title="",x = "",y = "")+
  coord_cartesian(ylim = range(Skill_forest[,1]))
multiplot(f1_2, f2_2,f3,f4, cols = 2)
Warning message: "Removed 3 rows containing non-finite values (stat_boxplot)."
forest.precip_3 = randomForest(X3 ~ PC1+PC2+PC3+PC4, data = pcs1)
summary(forest.precip_3)
plot(forest.precip_3)
(summary output identical in structure to forest.precip_1 above)
rforestpred = predict(forest.precip_3)
myRange = range(rforestpred, precippcs[,3])
skill_tree_forest = c(sqrt(mean((precippcs[,3]-rforestpred)^2)),cor(precippcs[,3], rforestpred))
names(skill_tree_forest)=c("rmse","Correlation")
#Plot observed vs predicted
P2=ggplot() +
geom_point(mapping = aes(precippcs[,3], rforestpred), shape = 21, size = 3, color = "gray30", fill ="cadetblue1")+
geom_abline(intercept = 0, slope = 1) +
coord_cartesian(xlim = myRange, ylim = myRange) +
labs(title = "Random Forest",x = "Oberved Precipitation",y = "Estimated Prediction")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
show(P2)
Drop_10_pred = function(X,bestTree,type) {
mod_data = X
N = nrow(X)
drop = 10
nsample = 572
i_full = 1:N
# initialize skill score vectors
skill_rmse = vector(mode="numeric", length=nsample)
skill_cor = vector(mode="numeric", length=nsample)
for (i in 1:nsample){
if (type=="tree"){
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
myTree =tree(X3 ~ PC1+PC2+PC3+PC4, data = drop_dt, model = T)
drop_mod= prune.tree(myTree, best = bestTree)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data$X3[i_drop]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}else{
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
drop_mod=randomForest( X3 ~ PC1+PC2+PC3 +PC4, data = drop_dt)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data$X3[i_drop]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}
}
CV=as.data.frame(cbind(skill_rmse,skill_cor))
names(CV)=c("rmse","cor")
return(CV)
}
Skill_forest = Drop_10_pred(pcs1,1,'random forest')
f3=ggplot() +
  geom_boxplot(mapping = aes(x = "Correlation", y = Skill_forest[,2])) +
  geom_point(mapping = aes(x = "Correlation", y = skill_tree_forest[2]),
             color = "red", size = 3)+
  labs(title="Random Forest",x = "",y = "")+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
f4=ggplot() +
  geom_boxplot(mapping = aes(x = "RMSE", y = Skill_forest[,1])) +
  geom_point(mapping = aes(x = "RMSE", y = skill_tree_forest[1]),
             color = "red", size = 3)+
  labs(title="",x = "",y = "")+
  coord_cartesian(ylim = range(Skill_forest[,1]))
multiplot(f1_3, f2_3,f3,f4, cols = 2)
Warning message: "Removed 3 rows containing non-finite values (stat_boxplot)."
forest.precip_4 = randomForest(X4 ~ PC1+PC2+PC3+PC4, data = pcs1)
summary(forest.precip_4)
plot(forest.precip_4)
(summary output identical in structure to forest.precip_1 above)
rforestpred = predict(forest.precip_4)
myRange = range(rforestpred, precippcs[,4])
skill_tree_forest = c(sqrt(mean((precippcs[,4]-rforestpred)^2)),cor(precippcs[,4], rforestpred))
names(skill_tree_forest)=c("rmse","Correlation")
#Plot observed vs predicted
P2=ggplot() +
geom_point(mapping = aes(precippcs[,4], rforestpred), shape = 21, size = 3, color = "gray30", fill ="cadetblue1")+
geom_abline(intercept = 0, slope = 1) +
coord_cartesian(xlim = myRange, ylim = myRange) +
labs(title = "Random Forest",x = "Oberved Precipitation",y = "Estimated Prediction")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
show(P2)
Drop_10_pred = function(X,bestTree,type) {
mod_data = X
N = nrow(X)
drop = 10
nsample = 572
i_full = 1:N
# initialize skill score vectors
skill_rmse = vector(mode="numeric", length=nsample)
skill_cor = vector(mode="numeric", length=nsample)
for (i in 1:nsample){
if (type=="tree"){
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
myTree =tree(X4 ~ PC1+PC2+PC3+PC4, data = drop_dt, model = T)
drop_mod= prune.tree(myTree, best = bestTree)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data$X4[i_drop]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}else{
i_drop = sample(i_full,N*drop/100) # can add argument replace=TRUE
drop_dt = mod_data[-i_drop,] # drop 10% precip value
drop_mod=randomForest( X4 ~ PC1+PC2+PC3 +PC4, data = drop_dt)
drop_pred=predict(drop_mod,newdata=mod_data[i_drop,])
drop_actual = mod_data$X4[i_drop]
skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
skill_cor[i] = cor(drop_actual,drop_pred)
}
}
CV=as.data.frame(cbind(skill_rmse,skill_cor))
names(CV)=c("rmse","cor")
return(CV)
}
Skill_forest = Drop_10_pred(pcs1,1,'random forest')
f3=ggplot() +
  geom_boxplot(mapping = aes(x = "Correlation", y = Skill_forest[,2])) +
  geom_point(mapping = aes(x = "Correlation", y = skill_tree_forest[2]),
             color = "red", size = 3)+
  labs(title="Random Forest",x = "",y = "")+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
f4=ggplot() +
  geom_boxplot(mapping = aes(x = "RMSE", y = Skill_forest[,1])) +
  geom_point(mapping = aes(x = "RMSE", y = skill_tree_forest[1]),
             color = "red", size = 3)+
  labs(title="",x = "",y = "")+
  coord_cartesian(ylim = range(Skill_forest[,1]))
multiplot(f1_4, f2_4,f3,f4, cols = 2)
Warning message: "Removed 3 rows containing non-finite values (stat_boxplot)."
multiplot(P1,P2, cols = 2)
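# The prompt also asks for the 'model precipitation' at all locations,
# obtained by multiplying the modeled PCs by the eigenvectors. A sketch of
# that reconstruction and the station-wise comparison with the historic
# record (assuming Preceof holds the precipitation eigenvectors and using
# the attributes stored by scale() to undo the standardization):
cart_pcs = cbind(treePred1, treePred2, treePred3, treePred4)
rf_pcs   = cbind(predict(forest.precip_1), predict(forest.precip_2),
                 predict(forest.precip_3), predict(forest.precip_4))
unscale  = function(z) sweep(sweep(z, 2, attr(winterprecip, "scaled:scale"), "*"),
                             2, attr(winterprecip, "scaled:center"), "+")
precip_cart = unscale(cart_pcs %*% t(Preceof[, 1:4]))
precip_rf   = unscale(rf_pcs %*% t(Preceof[, 1:4]))
precip_hist = unscale(winterprecip)
# station-wise skill of the two reconstructions against the historic record
cor_cart = sapply(1:nlocs, function(j) cor(precip_hist[, j], precip_cart[, j]))
cor_rf   = sapply(1:nlocs, function(j) cor(precip_hist[, j], precip_rf[, j]))
summary(cor_cart)
summary(cor_rf)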