library(randomForest)
library(verification)
library(ordinalForest)
library(rpartScore)
library(rpart)        # rpart() is called directly below
library(rpart.plot)
library(party)

#######################
### Combined Model - Nasik + Pune + Ahmadnagar + Thane + Solapur
data1=read.table("Nasik-cat-data.txt",header=TRUE)
data3=read.table("Pune-cat-data.txt",header=TRUE)
data4=read.table("Thane-cat-data.txt",header=TRUE)
data5=read.table("Ahmadnagar-cat1-data.txt",header=TRUE)
data6=read.table("Solapur-cat-data.txt",header=TRUE)

migdata1=rbind(data1[,2:11],data3[,2:11],data4[,2:11],data5[,2:11],
               data6[,2:11])   # exclude Year

### Fit Random Forest
### Use the ordinalForest package (or the regular randomForest package).
migdata1$MIG=factor(migdata1$MIG)

## With perffunction="probability" the model predicts categorical probabilities;
## if you remove the perffunction argument, it predicts categories instead.
ordforres <- ordfor(depvar="MIG", data=migdata1, nsets=1000, ntreeperdiv=100,
                    ntreefinal=5000, perffunction=c("probability"))

preds=predict(ordforres,migdata1)
ypred = preds$classprobs
table('true'=migdata1$MIG, 'predicted'=preds$ypred)

## Variable importance
sort(ordforres$varimp, decreasing=TRUE)

### RPSS of the fitted random forest model
### (only for categorical probability predictions)
acc2 = migdata1$MIG
N = length(acc2)
p1 <- length(acc2[acc2 == 1])/N   # climatological probability of each category
p2 <- length(acc2[acc2 == 2])/N
p3 <- length(acc2[acc2 == 3])/N
climo <- c(p1, p2, p3)
rps(acc2, ypred, baseline=climo)$rpss

########################
### LOOCV: leave out one record, fit the random forest model on the rest,
### and predict the dropped point
migdata1=rbind(data1[,2:11],data3[,2:11],data4[,2:11],data5[,2:11],
               data6[,2:11])   # exclude Year
migdata1$MIG=factor(migdata1$MIG)

## Cross-validation estimate and variable importance
### Note that this can take a while, since the data points
### have to be dropped one at a time.
N = nrow(migdata1)
ypredcv = c()
varimpcv = c()
for(i in 1:N){
  migdatacv = migdata1[-i,]
  ordforres <- ordfor(depvar="MIG", data=migdatacv, nsets=1000, ntreeperdiv=100,
                      ntreefinal=5000, perffunction=c("probability"))
  preds=predict(ordforres,migdata1)
  ypred = preds$classprobs[i,]       # probability forecast for the dropped point
  ypredcv = rbind(ypredcv,ypred)
  varimpcv = rbind(varimpcv,ordforres$varimp)
}

### RPSS of the LOOCV predictions
rps(acc2, ypredcv, baseline=climo)$rpss

### Plot the probability forecasts vs the historical category
n = nrow(ypredcv)
plot(1:n, ypredcv[,1],xlab="Obs",ylab="Probs",ylim=c(0,1))
points(1:n, ypredcv[,2],col='blue')
points(1:n, ypredcv[,3],col='red')
par(new=TRUE)
plot(1:n, acc2, xaxt="n", yaxt="n", xlab="", ylab="", type="p",pch=15)
axis(side=4,at=1:3)
mtext("Hist Category", side=4)
grid()
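#####
### Optional helper (a sketch, not part of the original analysis): the
### climatology + RPSS block is repeated several times in this script, so it
### can be wrapped once. The function name rpss_climo is ours; it relies only
### on verification::rps() as already used above.
rpss_climo <- function(obs, probs){
  obs <- as.numeric(as.character(obs))              # factor categories -> 1,2,3
  climo <- sapply(1:3, function(k) mean(obs == k))  # climatological base rates
  rps(obs, probs, baseline=climo)$rpss
}
## e.g. rpss_climo(migdata1$MIG, ypredcv) reproduces the LOOCV RPSS above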
#####
### Variable importance: plot the variable importance of each variable
### from the LOOCV as a boxplot
boxplot(varimpcv)
grid()

#####################
### Best Tree
migdata1=rbind(data1[,2:11],data3[,2:11],data4[,2:11],data5[,2:11],
               data6[,2:11])   # exclude Year
xgroups <- sample(rep(1:10, length = nrow(migdata1)), nrow(migdata1),
                  replace = FALSE)  # 10-fold CV group labels (not used below)

### Build a large tree and then prune using rpart
## minsplit = 9: minimum number of obs in a node for a split (~5% of data)
migtree = rpart(MIG ~ FOOD+SOCSTR+ENVSTR+RAIN+WAT+WATxRAIN+DSTMGT+NFE+PORT,
                data = migdata1, method="class",
                control=rpart.control(minsplit=9))

## Plot the CP vs cross-validated error
plotcp(migtree)
## Print the CP table and pick the CP corresponding to the minimum x-error
printcp(migtree)

## Fit the pruned tree (the cp value is read off the CP table and may change
## between runs, since the cross-validation folds are random)
migtreepruned = prune(migtree, cp=0.01333)
rpart.plot(migtreepruned)

### Predict and RPSS
ypred = predict(migtreepruned)
acc2 = migdata1$MIG
N = length(acc2)
p1 <- length(acc2[acc2 == 1])/N
p2 <- length(acc2[acc2 == 2])/N
p3 <- length(acc2[acc2 == 3])/N
climo <- c(p1, p2, p3)
rps(acc2, ypred, baseline=climo)$rpss

#####################
### Leave Thane out and predict using Random Forest.
### You can do the same with the tree as well.
migdata2=rbind(data1[,2:11],data3[,2:11],data5[,2:11],data6[,2:11],
               data4[,2:11])   # exclude Year & add Thane at the end
migdata3 = migdata2[1:160,]    # training set: the four non-Thane districts
migdata3$MIG=factor(migdata3$MIG)

ordforres <- ordfor(depvar="MIG", data=migdata3, nsets=1000, ntreeperdiv=100,
                    ntreefinal=5000, perffunction=c("probability"))
preds=predict(ordforres,migdata2)
ypredcvthane = preds$classprobs[161:175,]   # predictions for the Thane rows
varimpcvthane = ordforres$varimp

### RPSS of the Thane predictions (climatology from the training set)
accth = migdata3$MIG
Nth = length(accth)
p1 <- length(accth[accth == 1])/Nth
p2 <- length(accth[accth == 2])/Nth
p3 <- length(accth[accth == 3])/Nth
climo <- c(p1, p2, p3)
rps(migdata2$MIG[161:175], ypredcvthane, baseline=climo)$rpss

### Plot the probability forecasts vs the historical category
nth = nrow(ypredcvthane)
plot(1:nth, ypredcvthane[,1],xlab="Obs",ylab="Probs",ylim=c(0,1))
points(1:nth, ypredcvthane[,2],col='blue')
points(1:nth, ypredcvthane[,3],col='red')
par(new=TRUE)
plot(1:nth, migdata2$MIG[161:175], xaxt="n", yaxt="n", xlab="", ylab="",
     type="p",pch=15)
axis(side=4,at=1:3)
mtext("Hist Category", side=4)
grid()

#####
### Best tree on the training set (Thane left out)
## minsplit = 9: minimum number of obs in a node for a split (~5% of data)
migtree = rpart(MIG ~ FOOD+SOCSTR+ENVSTR+RAIN+WAT+WATxRAIN+DSTMGT+NFE+PORT,
                data = migdata3, method="class",
                control=rpart.control(minsplit=9))
## Plot the CP vs cross-validated error
plotcp(migtree)
## Print the CP table and pick the CP corresponding to the minimum x-error
printcp(migtree)
## The minimum is at cp = 0.0238 (for this run)

## Fit the pruned tree and plot
migtreepruned = prune(migtree, cp=0.0238)
rpart.plot(migtreepruned)

### Predict and RPSS
ypred = predict(migtreepruned)
acc2 = migdata3$MIG
N = length(acc2)
p1 <- length(acc2[acc2 == 1])/N
p2 <- length(acc2[acc2 == 2])/N
p3 <- length(acc2[acc2 == 3])/N
climo <- c(p1, p2, p3)
rps(acc2, ypred, baseline=climo)$rpss
##
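#####
### Optional (a sketch, not part of the original analysis): pick the pruning
### CP programmatically from rpart's CP table instead of reading it off
### printcp(). cptable is a matrix with columns CP, nsplit, rel error,
### xerror, and xstd, so the minimum-xerror row can be selected directly.
cpbest <- migtree$cptable[which.min(migtree$cptable[, "xerror"]), "CP"]
migtreepruned <- prune(migtree, cp=cpbest)
rpart.plot(migtreepruned)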