HW2_Prob7

7. Cluster Analysis - Kmeans
(i) Cluster the winter 3-day maximum precipitation average - use latitude, longitude, and elevation. The clustering involves:
(a) identify the best number of clusters, say, Kbest
Select a desired number of clusters, say, j; cluster the data and compute the WSS; repeat for j = 1,10; plot j vs. WSS; and select the best number of clusters Kbest, where the WSS starts to saturate.
(b) cluster the data into Kbest clusters and display them spatially.
(ii) Also employ any other clustering method - K-medoid clustering or Hierarchical Clustering
Compare and summarize your findings

#### CLEAR MEMORY
rm(list=ls())
#### Prevent Warnings
options(warn=-1)
#### Initialize Graphics
graphics.off()
.pardefault <- par()

#### Source Libraries and set working directory
mainDir="/Users/annastar/OneDrive - UCB-O365/CVEN 6833 - Advanced Data Analysis Techniques/Homework/HW 02/"
setwd(mainDir)
suppressPackageStartupMessages(source("hw2_library.R"))
source("hw2_library.R")

Read data

# Load winter precipitation data
data = read.table(paste(mainDir, "/data/Winter_temporal/Max_Winter_Seas_Prec.txt", sep = ""), header = T)
# Omit extreme values
non_values <- which(data[,-1] > 1000, arr.ind = T)
data[non_values] = NaN
data = na.omit(data)
prec_avg = colMeans(data[,-1])  # Mean of location columns
# Load location data
loc = read.table(paste(mainDir, "/data/Precipitaton_data.txt", sep = ""), header = T)

## Create full matrix
precip = loc[,1:3]
lon = precip[,1]
lat = precip[,2]
names(precip) = c("Long","Lat","Elev")
precip$Prec = prec_avg
precip_scale = scale(precip)  # Standardize variables

(i) Cluster the winter 3-day maximum precipitation average

Niter=50
WSS = matrix(nrow=Niter, ncol=15)
for (j in 1:Niter) {
  WSS[j,1]= (nrow(precip_scale)-1)*sum(apply(precip_scale,2,var))#with elev
  for (i in 2:15) {
    WSS[j,i] = sum(kmeans(precip_scale, centers=i)$withinss)
  }}
wss = apply(WSS, 2, FUN = mean)

g1 = ggplot() +
  geom_line(mapping = aes(1:15, wss)) +
  geom_point(mapping = aes(1:15, wss), shape = 21, size = 3, color = "gray30", fill ="cadetblue1")+
  labs(title = "WSS vs. # of Clusters",x = "Number of Clusters",y = "WSS (Within-Group Sum of Squares)")+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
show(g1)

(a) Identify the best number of clusters Kbest

slope=wss[2:length(wss)]-wss[1:(length(wss)-1)]
kbest = which(slope== max(slope))     # optimal number of clusters
print(sprintf("The optimal number of clusters is %s", kbest))

## [1] "The optimal number of clusters is 14"

(b) Cluster the data into Kbest clusters and display them spatially

fit <- kmeans(precip_scale, kbest)
# get cluster means
means = aggregate(precip,by=list(fit$cluster),FUN=mean)
# append cluster assignment
Cluster=factor(fit$cluster)
data_1 = data.frame(precip,Cluster) 

MainStates <- map_data("state")
xlim = c(-115,-101.3)
ylim = c(29.5,42.5)
g2 = ggplot() + 
  geom_polygon(data=MainStates, aes(x=long, y=lat, group=group), color="gray20", fill="transparent") +
  coord_sf(xlim = xlim, ylim = ylim,label_axes = list(bottom = "E", left = "N"), expand =FALSE) +
  theme_bw() +
  geom_point(data.frame(lon, lat), mapping = aes(x=lon, y=lat, colour = data_1$Cluster), name = "Cluster") +
  geom_point(data = means, aes(x = means$Long, y = means$Lat), shape = 6, size = 2, stroke = 2) +
  geom_text(aes(x = means$Long, y = means$Lat, label = means$Group.1), vjust = 2.25, size = 3) +
  labs(title = "Clustering of Southwest U.S. 3-Day Winter Maximum Precipitation", x = "Longitude", y = "Latitude", color = "Cluster")
show(g2)

(ii) Repeat with a different clustering method - K-medoid clustering

# Install package
library(fpc)

Identify the best number of clusters Kbest

fit <- pamk(precip_scale, krange = 2:15, scaling = F)
print(sprintf("The optimal number of clusters is %s",fit$nc))

## [1] "The optimal number of clusters is 9"

Cluster the data into Kbest clusters and display them spatially

# get cluster means
means = aggregate(precip, by = list(fit$pamobject$cluster), FUN = mean)
# append cluster assignment 
Cluster=factor(fit$pamobject$clustering)
data_2 = data.frame(precip,Cluster) 

g2 = ggplot() + 
  geom_point(data.frame(lon, lat), mapping = aes(x=lon, y=lat, colour = data_2$Cluster)) +
  geom_point(data = means, aes(x = means$Long, y = means$Lat), shape = 6, size = 2, stroke = 2) +
  geom_text(aes(x = means$Long, y = means$Lat, label = means$Group.1), vjust = 2.25, size = 3) +
  geom_polygon( data=MainStates, aes(x=long, y=lat, group=group), color="gray20", fill="transparent") +
  theme_bw() +
  coord_sf(xlim = xlim, ylim = ylim,label_axes = list(bottom = "E", left = "N"), expand =FALSE) +
  labs(title = "Clustering of Southwest U.S. 3-Day Winter Maximum Precipitation", x = "Longitude", y = "Latitude", color = "Cluster")
show(g2)

Compare and summarize findings

The optimal number of clusters using a K-means clustering algorithm is determined by finding the shallowest slope of the WSS plot. Given 15 maximum clusters, the optimal number of clusters is found to be between 12-14. The WSS plot shows that WSS continues to decrease as a function of the number of clusters
The optimal number of clusters using a K-medoids clustering algorithm is determined within the pamk() function based on the optimum average silhouette width. Given 15 maximum clusters, the optimal number of clusters is found to be 9.
The algorithms result in similar clusters, with better agreement at southern latitudes