library(sm)            # for sm.density in diagnostics
library(leaps)         # to provide predictor combinations
library(MPV)           # for calculating objective functions
library(akima)         # for interp to smooth for plotting
library(fields)        # for quilt.plot / surface plots
library(locfit)        # for fitting local polynomials
library(MASS)
library(VGAM)
library(nnet)
library(verification)
library(latex2exp)
library(maps)
# library(rgdal)       # for converting projection of coordinates
library(data.table)    # to use melt
library(ggplot2)       # to plot; map_data, scale_fill_gradientn, etc.
library(reshape2)
library(scales)        # for visualizing spatial maps
library(spBayes)
library(geoR)

# For trees
library(mclust)
library(tree)          # classification and regression trees
library(randomForest)  # random forests
library(RColorBrewer)  # color schemes for maps
library(Hmisc)         # minor tick marks
library(wesanderson)
library(cluster)
library(kohonen)
library(parallel)

#### GLM Fitting Function
GLM_fit = function(prec_obs, X, family) {
  if (family == "gamma") {
    links = c("log", "inverse", "identity")
    # Clean the data: the Gamma family needs strictly positive responses,
    # so replace zeros with a small random value
    prec_obs = ifelse(prec_obs <= 0, runif(1, 0.0001, 0.001), prec_obs)
  } else if (family == "binomial") {
    links = c("logit")
  } else if (family == "gaussian") {
    links = c("identity")
  }
  N = length(prec_obs)

  # Get up to 25 predictor combinations for each number of predictors
  combs = leaps(X, prec_obs, nbest = 25)
  combos = combs$which
  ncombos = length(combos[, 1])

  glm_val = vector(length = length(links))
  bestcombo = vector(length = length(links))
  obj_func = 1:ncombos
  xmse = 1:ncombos

  for (j in 1:length(links)) {
    for (i in 1:ncombos) {
      xx = X[, combos[i, ]]
      xx = as.data.frame(xx)
      names(xx) = names(X[combos[i, ]])
      if (family == "gamma") {
        zz = glm(prec_obs ~ ., data = xx, family = Gamma(link = links[j]))
      } else if (family == "binomial") {
        zz = glm(prec_obs ~ ., data = xx, family = binomial(link = links[j]))
      } else if (family == "gaussian") {
        zz = glm(prec_obs ~ ., data = xx, family = gaussian(link = links[j]))
      }
      obj_func[i] = AIC(zz)
      xmse[i] = sum(residuals(zz)^2) / (N - length(coef(zz)))
    }
    # Score each link by the minimum AIC over all predictor combinations
    glm_val[j] = min(obj_func)
    bestcombo[j] = which.min(obj_func)
  }

  press_df = data.frame(glm_val)
  rownames(press_df) = links
  print("Results of AIC for best-fit GLM")
  print(press_df)
  print(sprintf("Choosing the GLM which minimizes AIC: %s family and %s link function.",
                family, links[which.min(glm_val)]))

  # Refit the best model: the best link and its best predictor combination
  xx = X[, combos[bestcombo[which.min(glm_val)], ]]
  xx = data.frame(xx)
  names(xx) = names(X[combos[bestcombo[which.min(glm_val)], ]])
  if (family == "gamma") {
    bestmod = glm(prec_obs ~ ., data = xx,
                  family = Gamma(link = links[which.min(glm_val)]))
  } else if (family == "binomial") {
    bestmod = glm(prec_obs ~ ., data = xx,
                  family = binomial(link = links[which.min(glm_val)]))
  } else if (family == "gaussian") {
    bestmod = glm(prec_obs ~ ., data = xx,
                  family = gaussian(link = links[which.min(glm_val)]))
  } else {
    stop("Unsupported family!")
  }

  # Stash the selection results on the fitted object for later inspection
  bestmod$Call$LINK = links[which.min(glm_val)]
  bestmod$Call$AIC = AIC(bestmod)
  bestmod$Call$combo = combos[bestcombo[which.min(glm_val)], ]
  return(bestmod)
}
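# Example usage of GLM_fit -- a minimal sketch on synthetic data. All `demo_*`
# names are illustrative and not used elsewhere in this script; the gaussian
# family is chosen here because it exercises only the identity link.
set.seed(1)
demo_X = data.frame(elev = runif(60, 0, 3000),
                    lon  = runif(60, -109, -102),
                    lat  = runif(60, 37, 41))
demo_prec = 5 + 0.002 * demo_X$elev + rnorm(60)  # synthetic "precipitation"
demo_glm  = GLM_fit(demo_prec, demo_X, family = "gaussian")
demo_glm$Call$LINK   # selected link function
demo_glm$Call$combo  # selected predictor combination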
#### Quilt Plotting Function
Quilt_plotting = function(lon, lat, ypred, lon1, lat1, yob, type = "") {
  if (type == "binary") {
    quilt.plot(lon, lat, ypred$fit, xlab = "Longitude (m)", ylab = "Latitude (m)",
               main = 'Predicted Binary Precipitation on DEM Grid')
    US(add = TRUE, col = "black", lwd = 2)
    quilt.plot(lon1, lat1, yob, xlab = "Longitude (m)", ylab = "Latitude (m)",
               main = 'Actual Binary Precipitation')
    US(add = TRUE, col = "black", lwd = 2)
    quilt.plot(lon, lat, ypred$se.fit, xlab = "Longitude (m)", ylab = "Latitude (m)",
               main = 'Predicted Standard Error on DEM Grid')
    US(add = TRUE, col = "black", lwd = 2)
  } else {
    par(mfrow = c(2, 1))
    quilt.plot(lon, lat, ypred$fit, xlab = "Longitude (m)", ylab = "Latitude (m)",
               main = 'Predicted Precipitation on DEM Grid (mm)')
    US(add = TRUE, col = "black", lwd = 2)
    quilt.plot(lon1, lat1, yob, xlab = "Longitude (m)", ylab = "Latitude (m)",
               main = 'Actual Precipitation (mm)')
    US(add = TRUE, col = "black", lwd = 2)
    quilt.plot(lon, lat, ypred$se.fit, xlab = "Longitude (m)", ylab = "Latitude (m)",
               main = 'Predicted Standard Error on DEM Grid (mm)')
    US(add = TRUE, col = "black", lwd = 2)
  }
}

#### Drop-10% Cross-Validation Function
Drop_10_pred = function(X, bestTree, type) {
  mod_data = X
  N = nrow(X)
  drop = 10       # percent of observations withheld per sample
  nsample = 500   # number of cross-validation samples
  i_full = 1:N

  # initialize skill-score vectors
  skill_rmse = vector(mode = "numeric", length = nsample)
  skill_cor  = vector(mode = "numeric", length = nsample)

  for (i in 1:nsample) {
    i_drop  = sample(i_full, N * drop / 100)  # can add argument replace=TRUE
    drop_dt = mod_data[-i_drop, ]             # drop 10% of the precip values
    if (type == "tree") {
      myTree   = tree(prec ~ PC1 + PC2 + PC3 + PC4, data = drop_dt, model = TRUE)
      drop_mod = prune.tree(myTree, best = bestTree)
    } else {
      drop_mod = randomForest(prec ~ PC1 + PC2 + PC3 + PC4, data = drop_dt)
    }
    drop_pred   = predict(drop_mod, newdata = mod_data[i_drop, ])
    drop_actual = mod_data[i_drop, 1]
    skill_rmse[i] = sqrt(mean((drop_actual - drop_pred)^2))
    skill_cor[i]  = cor(drop_actual, drop_pred)
  }

  CV = as.data.frame(cbind(skill_rmse, skill_cor))
  names(CV) = c("rmse", "cor")
  return(CV)
}

#### Logit-to-Probability Function
logit2prob = function(logit) {
  odds <- exp(logit)
  prob <- odds / (1 + odds)
  return(prob)
}

#### Pruning Function
treeFun = function(myTree, toPlot = FALSE, title = "") {
  # Perform cross-validation on the tree object
  cvTree  <- cv.tree(myTree)
  optTree <- which.min(cvTree$dev)
  bestTree <- cvTree$size[optTree]
  # Guard against pruning to the root: fall back to the full tree size
  if (bestTree == 1) {
    bestTree = max(cvTree$size)
  }

  # Prune the tree based on the CV results
  pruneTree <- prune.tree(myTree, best = bestTree)

  # If plotting is selected
  if (toPlot) {
    # Plot the unpruned tree
    plot(myTree)
    text(myTree, cex = .75)
    title(main = paste("Unpruned Tree for", title))
    # Plot the CV deviance profile
    plot(cvTree$size, cvTree$dev, type = "b",
         main = paste("Cross Validation for", title))
    # Plot the pruned tree
    plot(pruneTree)
    text(pruneTree, cex = .75)
    title(main = paste("Pruned Tree for", title))
  }

  pruneTree$bestTree = bestTree
  return(pruneTree)
}

#### Extreme Clustering Function
pam_fmado_ll = function(x, k, ll) {
  # x  - design matrix; typically columns are stations, rows are time
  # k  - number of clusters
  # ll - two-column matrix of lat/long points (preferably projected) with N rows
  N = ncol(x)  # number of stations
  T = nrow(x)  # number of time points

  # Compute the F-madogram distance
  V = array(NaN, dim = c(T, N))
  for (p in 1:N) {
    x.vec = as.vector(x[, p])
    Femp = ecdf(x.vec)(x.vec)
    V[, p] = Femp
  }
  DD = dist(t(V), method = "manhattan", diag = TRUE, upper = TRUE) / (2 * T)

  # Weight by physical distance, rescaled to the range of the F-madogram distances
  DDll = dist(ll, method = 'manhattan')
  DDw = as.matrix(DD) +
    t(t(as.matrix(DDll)) / apply(as.matrix(DDll), 2, max)) * max(as.matrix(DD))

  # Do the clustering
  output = pam(DDw, k, diss = TRUE, medoids = NULL)
  return(output)
}
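# Example usage of pam_fmado_ll -- a minimal sketch on synthetic data. The
# `demo_*` names are illustrative: 12 fake "stations" with 100 time steps each,
# clustered into 3 groups.
set.seed(7)
demo_x  = matrix(rgamma(100 * 12, shape = 2), nrow = 100, ncol = 12)  # time x stations
demo_ll = cbind(lon = runif(12), lat = runif(12))
demo_pam = pam_fmado_ll(demo_x, k = 3, ll = demo_ll)
table(demo_pam$clustering)  # cluster sizes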
#### multiplot Function
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
  library(grid)

  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  numPlots = length(plots)

  # If layout is NULL, then use 'cols' to determine the layout
  if (is.null(layout)) {
    # Make the panel
    # ncol: number of columns of plots
    # nrow: number of rows needed, calculated from the number of cols
    layout <- matrix(seq(1, cols * ceiling(numPlots / cols)),
                     ncol = cols, nrow = ceiling(numPlots / cols))
  }

  if (numPlots == 1) {
    print(plots[[1]])
  } else {
    # Set up the page
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))

    # Make each plot, in the correct location
    for (i in 1:numPlots) {
      # Get the i,j matrix positions of the region that contains this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}
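# Example usage of multiplot -- a minimal sketch; the two ggplot objects built
# from the built-in mtcars data are illustrative.
demo_p1 = ggplot(mtcars, aes(wt, mpg)) + geom_point()
demo_p2 = ggplot(mtcars, aes(factor(cyl))) + geom_bar()
multiplot(demo_p1, demo_p2, cols = 2)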
#### Local Polynomial Fitting
# Search for the best alpha over a grid of alpha values between 0 and 1
locpoly_fit = function(prec_obs, X, family = "", link = "", glm = FALSE, plot = FALSE) {
  nvar = ncol(X)  # number of variables
  N = nrow(X)     # number of data points

  if (glm == TRUE) {
    if (family == "gamma") {
      # The Gamma family needs strictly positive responses
      prec_obs = ifelse(prec_obs <= 0, runif(1, 0.0001, 0.001), prec_obs)
    }
    minalpha = 0.6
    alpha_grid  = seq(minalpha, 1.0, by = 0.05)  # candidate alphas for degree 1
    n = length(alpha_grid)
    alpha1_grid = seq(minalpha, 1.0, by = 0.05)  # candidate alphas for degree 2
    alpha2_grid = c(alpha_grid, alpha1_grid)

    # Get the GCV values for all the alpha values for polynomial orders 1 and 2.
    # kern="bisq" uses the bisquare kernel to compute the weights of the
    # neighbors, which are then used in the weighted least squares.
    gcv_deg1 = gcvplot(prec_obs ~ ., data = X, maxk = 100000, alpha = alpha_grid,
                       deg = 1, kern = "bisq", ev = dat(), scale = TRUE,
                       family = family, link = link)
    gcv_deg2 = gcvplot(prec_obs ~ ., data = X, maxk = 100000, alpha = alpha1_grid,
                       deg = 2, kern = "bisq", ev = dat(), scale = TRUE,
                       family = family, link = link)

    # Pick the alpha and polynomial degree that give the least GCV
    z2 = order(c(gcv_deg1$values, gcv_deg2$values))
    bestdeg = 1
    if (z2[1] > n) bestdeg = 2
    best_alpha = alpha2_grid[z2[1]]
    best_gcv = c(gcv_deg1$values, gcv_deg2$values)[z2[1]]

    # Now fit the LOCFIT model using the best alpha and degree obtained above.
    # ev=dat() evaluates the fit only at the data points, which is cheaper but
    # prevents plotting over a grid.
    if (plot == FALSE) {
      bestmod = locfit(prec_obs ~ ., data = X, alpha = best_alpha, maxk = 10000,
                       deg = bestdeg, kern = "bisq", scale = TRUE,
                       family = family, link = link, ev = dat())
    } else {
      bestmod = locfit(prec_obs ~ ., data = X, alpha = best_alpha, maxk = 10000,
                       deg = bestdeg, kern = "bisq", scale = TRUE,
                       family = family, link = link)
    }
    # Stash the chosen parameters on the fitted object
    bestmod$call$alpha  = best_alpha
    bestmod$call$deg    = bestdeg
    bestmod$call$family = family
    bestmod$call$link   = link
    bestmod$call$gcv    = best_gcv
  } else {
    porder = 1
    minalpha = 2 * (nvar * porder + 1) / N  # smallest alpha with enough neighbors for degree 1
    alpha_grid = seq(minalpha, 1.0, by = 0.05)
    n = length(alpha_grid)
    porder = 2
    minalpha = 2 * (nvar * porder + 1) / N  # smallest alpha with enough neighbors for degree 2
    alpha1_grid = seq(minalpha, 1.0, by = 0.05)
    alpha2_grid = c(alpha_grid, alpha1_grid)

    # Get the GCV values for all the alpha values for polynomial orders 1 and 2,
    # again using the bisquare kernel for the neighbor weights.
    gcv_deg1 = gcvplot(prec_obs ~ ., data = X, maxk = 100000, alpha = alpha_grid,
                       deg = 1, kern = "bisq", ev = dat(), scale = TRUE)
    gcv_deg2 = gcvplot(prec_obs ~ ., data = X, maxk = 100000, alpha = alpha1_grid,
                       deg = 2, kern = "bisq", ev = dat(), scale = TRUE)

    # Pick the alpha and polynomial degree that give the least GCV
    z2 = order(c(gcv_deg1$values, gcv_deg2$values))
    bestdeg = 1
    if (z2[1] > n) bestdeg = 2
    best_alpha = alpha2_grid[z2[1]]
    best_gcv = c(gcv_deg1$values, gcv_deg2$values)[z2[1]]

    # Now fit the LOCFIT model using the best alpha and degree obtained above
    if (plot == FALSE) {
      bestmod = locfit(prec_obs ~ ., data = X, alpha = best_alpha, maxk = 10000,
                       deg = bestdeg, kern = "bisq", ev = dat())
    } else {
      bestmod = locfit(prec_obs ~ ., data = X, alpha = best_alpha, maxk = 10000,
                       deg = bestdeg, kern = "bisq")
    }
    # Stash the chosen parameters on the fitted object
    bestmod$call$alpha = best_alpha
    bestmod$call$deg   = bestdeg
    bestmod$call$gcv   = best_gcv
  }
  return(bestmod)
}

#### F-test for Local Polynomial Fitting
loc_Ftest = function(prec_obs, prec_hat, X, bestmod) {
  N = length(prec_obs)  # number of data points
  RSS1 = sum((prec_obs - prec_hat$fit)^2)
  nu1 = bestmod$dp[6]   # trace(L)     [lk]
  nu2 = bestmod$dp[7]   # trace(L^T L) [df1]
  nu11 = N - 2 * nu1 + nu2

  # Linear regression (the null model)
  X = as.matrix(X)
  zzLin = lm(prec_obs ~ X)
  XX = cbind(rep(1, N), X)
  # Compute the hat matrix
  hatm = XX %*% solve(t(XX) %*% XX) %*% t(XX)
  II = diag(N)
  delta0 = t(II - hatm) %*% (II - hatm)  # Equation 9.2
  nu00 = sum(diag(delta0))
  RSS0 = sum(residuals(zzLin)^2)

  Fdata = (RSS0 - RSS1) / (nu00 - nu11)
  Fdata = Fdata / (RSS1 / nu11)
  Ftheor = qf(0.95, (nu00 - nu11), nu11)  # 95% confidence level

  # Fdata > Ftheor: reject the null, i.e., the data favor the local polynomial
  if (Fdata > Ftheor) {
    print("F-test:")
    print(sprintf("Reject the null: F statistic = %0.2f > %0.2f = critical value, so the local polynomial beats the linear model.",
                  Fdata, Ftheor))
  } else {
    print("F-test:")
    print(sprintf("Fail to reject the null: F statistic = %0.2f <= %0.2f = critical value, so the linear model suffices.",
                  Fdata, Ftheor))
  }
}
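# Example usage of locpoly_fit and loc_Ftest -- a minimal sketch on synthetic
# data with a nonlinear signal, so the F-test should favor the local
# polynomial. The `demo_*` names are illustrative; small alpha values in the
# search grid may generate locfit warnings on data this small.
set.seed(11)
demo_X = data.frame(x1 = runif(200), x2 = runif(200))
demo_y = sin(2 * pi * demo_X$x1) + demo_X$x2 + rnorm(200, sd = 0.2)
demo_loc = locpoly_fit(demo_y, demo_X)
demo_hat = predict(demo_loc, newdata = demo_X, se.fit = TRUE)
loc_Ftest(demo_y, demo_hat, demo_X, demo_loc)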