Cluster Analysis - Extremes
For the winter seasonal maximum precipitation perform clustering of the Extremes using the method proposed in Bernard et al. (2013) and used in Bracken et al. (2015) - these papers are linked on the class page. Comment on the cluster patterns of the average and maxima. [data at http://civil.colorado.edu/~bracken/western-usextremes-data/].
#### CLEAR MEMORY
rm(list=ls())
#### Prevent Warnings
options(warn=-1)
#data required:
save=TRUE # True for saving figure as pdf file;
#### SOURCE LIBRARIES (suppress package loading messages) AND SET WORKING DIRECTORY
mainDir="C:/Users/DELL/Dropbox/Boulder/Courses/Advance data analysis/HW/HW2"
setwd(mainDir)
suppressPackageStartupMessages(source("Lib_HW2.R"))
source("Lib_HW2.R")
setwd("./data") #move to the data directory
## Read data as table (data frame)
data=read.csv("ghcn_3day_max_west_us.csv")
# save code of stations in a list
aux_station=as.vector(data$station[1])
N=dim(data)[1]
count=0
for (i in 2:N) {
if (data$station[i]!=data$station[i-1]){
count=count+1
aux_station[count]=as.vector(data$station[i])
}
}
# find the temporal extension of each station
L_data=as.data.frame(array(NaN, dim = c(length(aux_station), 4)))
names(L_data)=c("Station","N_Data","year_init","year_fin")
for (i in 1:length(aux_station)) {
cc=data[which(data$station==aux_station[i]),c(2,4)]
if(length(which(cc[,1]=="winter"))>0){
aux_2=cc[which(cc[,1]=="winter"),2]
L_data[i,1]=i
L_data[i,2]=length(aux_2)
L_data[i,3]=aux_2[1]
L_data[i,4]=aux_2[length(aux_2)]
}
}
L_data=na.omit(L_data) # listwise deletion of missing
years=seq(min(L_data[,3]),max(L_data[,4]),by=1)
a=which(L_data[,2]>49)#filtering by stations with more than 50 years of data
#data frame with precipitation data (rows: time points; columns: Stations)
precip_w=as.data.frame(array(NaN, dim = c(length(years),length(a))))
# data frame with with geographic data
grid=as.data.frame(array(NaN, dim = c(length(a), 3)))
names(grid)=c("long","lat","elev")
name_station = list()
for (i in 1:length(a)) {
ind=which(data$station==aux_station[L_data[a[i],1]])
aux=data[ind,c(2,4,5)]
grid[i,]=data[ind[1],c(8,7,9)]
if(length(which(aux[,1]=="winter"))>0){
aux_2=aux[which(aux[,1]=="winter"),2:3]
name_station[i]=aux_station[L_data[a[i],1]]
}
for (j in 1:length(aux_2[,1])) {
ind1=which(years==aux_2[j,1])
precip_w[ind1,i]=aux_2[j,2]
}
}
names(precip_w)=as.vector(name_station)
#save data in csv files
write.csv(precip_w, file = "PAM_western_us.csv",row.names=FALSE, na="")
write.csv(grid[,1:2], file = "grid_western_us.csv",row.names=FALSE, na="")
rm("data")
ks = 2:20
# Do the clustering for all k values
cluster_list = list()
for(k in ks){
#message('Trying ',k,' clusters')
cluster_list[[which(ks == k)]] = pam_fmado_ll(precip_w,k,grid[,1:2])
}
# find the average silhouette value (smaller is better)
sil = sapply(cluster_list,function(x)x$silinfo$avg.width)
# plot the ave silhouette value versus k, look for a maximum
ggplot() +
geom_line(mapping = aes(ks,sil)) +
geom_point(mapping = aes(ks,sil), shape = 21, size = 3, color = "gray30", fill ="cadetblue1") +
labs(title = "Silhouette width",x = 'Number of clusters',y = 'Silhouette width')+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
# find the optimal k
optimal_k = ks[which.max(sil)]
message('The optimal number of clusters is: ',optimal_k)
## The optimal number of clusters is: 3
## for optimal number of clusters
clusters = cbind(grid,cluster=cluster_list[[which(ks==optimal_k)]]$clustering)
p_map = ggplot(clusters)+
geom_point(aes(long,lat,color=factor(cluster)))+
labs(title = paste("Clustering of the Extremes spatial map with K=",optimal_k," (optimal)",sep = ""),
x = "Longitude", y = "Latitude")+
scale_color_manual('Cluster',values=tim.colors(optimal_k))+
coord_quickmap(xlim=c(-125,-105),ylim=c(31,50))+
theme_minimal()+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))+
borders('state')
print(p_map)
## for k=8
clusters = cbind(grid,cluster=cluster_list[[which(ks==8)]]$clustering)
p_map1 = ggplot(clusters)+
ggtitle("Clustering of the Extremes spatial map with K=8") +
geom_point(aes(long,lat,color=factor(cluster)))+
scale_color_manual('Cluster',values=tim.colors(8))+
coord_quickmap(xlim=c(-125,-105),ylim=c(31,50))+
theme_minimal()+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))+
borders('state')
print(p_map1)
Comments
The dataset was filtered in order to only consider stations with more than 50 years of data. The silhouette plot was used to choose the optimal number of clusters, which corresponds to the maximum silhouette value. In this case, optimal number of clusters is 3.
In the spatial maps of the optimal numbers of clustering is possible to see that the first cluster corresponds to the west coast covering the north of California, north west of Nevada, Oregon and Washington. The second cluster corresponds to the north west covering states of Montana, Wyoming, Idaho, north east of Nevada, and west of Utah and Colorado.
In addition, the spatial maps considering K=8 was plotting in order to compare results with that obtained Bracken et al. (2015). It is possible to see that clusters are similar but not the same. A explanation of this, it could be the filter applied to the dataset.
Finally, Kmeans clustering was applied to the mean of winter seasonal maximum precipitation. It is observed: an increase of the size of west coast cluster and the south west cluster. In the case of the west coast cluster, it reaches up to the south of the California coast, while the west south cluster of Kmeans covers completely Colorado and Utah state, and a significant part of Wyoming (the same cluster for extreme clustering only covers the south part of Colorado and Utah). These results are less realistic because is unlikely that extreme precipitation of the south coast of California be similar to that of the north west coast (north California, Oregon, and Washington). The explanation is the same for the cluster 3.