7. Cluster Analysis - Kmeans
(i) Cluster the winter 3-day maximum precipitation average - use latitude, longitude, and elevation. The clustering involves:
(a) identify the best number of clusters, say, Kbest
Select a desired number of clusters, say, j; cluster the data and compute the WSS; repeat for j = 1,10; plot j vs. WSS; and select the best number of clusters Kbest, where the WSS starts to saturate.
(b) cluster the data into Kbest clusters and display them spatially.
(ii) Also employ any other clustering method - K-medoid clustering or Hierarchical Clustering
Compare and summarize your findings
#### CLEAR MEMORY
rm(list=ls())
#### Prevent Warnings
options(warn=-1)
#### Initialize Graphics
graphics.off()
.pardefault <- par()
#### Source Libraries and set working directory
mainDir="/Users/annastar/OneDrive - UCB-O365/CVEN 6833 - Advanced Data Analysis Techniques/Homework/HW 02/"
setwd(mainDir)
suppressPackageStartupMessages(source("hw2_library.R"))
source("hw2_library.R")
# Load winter precipitation data
data = read.table(paste(mainDir, "/data/Winter_temporal/Max_Winter_Seas_Prec.txt", sep = ""), header = T)
# Omit extreme values
non_values <- which(data[,-1] > 1000, arr.ind = T)
data[non_values] = NaN
data = na.omit(data)
prec_avg = colMeans(data[,-1]) # Mean of location columns
# Load location data
loc = read.table(paste(mainDir, "/data/Precipitaton_data.txt", sep = ""), header = T)
## Create full matrix
precip = loc[,1:3]
lon = precip[,1]
lat = precip[,2]
names(precip) = c("Long","Lat","Elev")
precip$Prec = prec_avg
precip_scale = scale(precip) # Standardize variables
Niter=50
WSS = matrix(nrow=Niter, ncol=15)
for (j in 1:Niter) {
WSS[j,1]= (nrow(precip_scale)-1)*sum(apply(precip_scale,2,var))#with elev
for (i in 2:15) {
WSS[j,i] = sum(kmeans(precip_scale, centers=i)$withinss)
}}
wss = apply(WSS, 2, FUN = mean)
g1 = ggplot() +
geom_line(mapping = aes(1:15, wss)) +
geom_point(mapping = aes(1:15, wss), shape = 21, size = 3, color = "gray30", fill ="cadetblue1")+
labs(title = "WSS vs. # of Clusters",x = "Number of Clusters",y = "WSS (Within-Group Sum of Squares)")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
show(g1)
slope=wss[2:length(wss)]-wss[1:(length(wss)-1)]
kbest = which(slope== max(slope)) # optimal number of clusters
print(sprintf("The optimal number of clusters is %s", kbest))
## [1] "The optimal number of clusters is 14"
fit <- kmeans(precip_scale, kbest)
# get cluster means
means = aggregate(precip,by=list(fit$cluster),FUN=mean)
# append cluster assignment
Cluster=factor(fit$cluster)
data_1 = data.frame(precip,Cluster)
MainStates <- map_data("state")
xlim = c(-115,-101.3)
ylim = c(29.5,42.5)
g2 = ggplot() +
geom_polygon(data=MainStates, aes(x=long, y=lat, group=group), color="gray20", fill="transparent") +
coord_sf(xlim = xlim, ylim = ylim,label_axes = list(bottom = "E", left = "N"), expand =FALSE) +
theme_bw() +
geom_point(data.frame(lon, lat), mapping = aes(x=lon, y=lat, colour = data_1$Cluster), name = "Cluster") +
geom_point(data = means, aes(x = means$Long, y = means$Lat), shape = 6, size = 2, stroke = 2) +
geom_text(aes(x = means$Long, y = means$Lat, label = means$Group.1), vjust = 2.25, size = 3) +
labs(title = "Clustering of Southwest U.S. 3-Day Winter Maximum Precipitation", x = "Longitude", y = "Latitude", color = "Cluster")
show(g2)
# Install package
library(fpc)
fit <- pamk(precip_scale, krange = 2:15, scaling = F)
print(sprintf("The optimal number of clusters is %s",fit$nc))
## [1] "The optimal number of clusters is 9"
# get cluster means
means = aggregate(precip, by = list(fit$pamobject$cluster), FUN = mean)
# append cluster assignment
Cluster=factor(fit$pamobject$clustering)
data_2 = data.frame(precip,Cluster)
g2 = ggplot() +
geom_point(data.frame(lon, lat), mapping = aes(x=lon, y=lat, colour = data_2$Cluster)) +
geom_point(data = means, aes(x = means$Long, y = means$Lat), shape = 6, size = 2, stroke = 2) +
geom_text(aes(x = means$Long, y = means$Lat, label = means$Group.1), vjust = 2.25, size = 3) +
geom_polygon( data=MainStates, aes(x=long, y=lat, group=group), color="gray20", fill="transparent") +
theme_bw() +
coord_sf(xlim = xlim, ylim = ylim,label_axes = list(bottom = "E", left = "N"), expand =FALSE) +
labs(title = "Clustering of Southwest U.S. 3-Day Winter Maximum Precipitation", x = "Longitude", y = "Latitude", color = "Cluster")
show(g2)