Cluster Analysis - Kmeans
Cluster the January and July precipitation over Colorado - use latitude, longitude and elevation. The clustering involves:
identify the best number of clusters, say, Kbest Select a desired number of clusters, say, j; cluster the data and computer the WSS; repeat for j=1,10; plot j versus WSS; and select the best number of clusters Kbest, where the WSS starts to saturate.
cluster the data into Kbest clusters and display them. Try with and without elevation in the clusters and summarize the results.
#### CLEAR MEMORY
rm(list=ls())
#### Prevent Warnings
options(warn=-1)
#data required:
save=TRUE # True for saving figure as pdf file;
#### SOURCE LIBRARIES (suppress package loading messages) AND SET WORKING DIRECTORY
mainDir="C:/Users/DELL/Dropbox/Boulder/Courses/Advance data analysis/HW/HW2"
setwd(mainDir)
suppressPackageStartupMessages(source("Lib_HW2.R"))
source("Lib_HW2.R")
## Read data as table (data frame)
setwd("./data") #move to the data directory
data = read.table("colo_monthly_precip_01.dat") #data January
#assign names for columns/variables:
names(data)=c("Lat","Long","Elev","Pm")
# Prepare Data
non_values<- which(data[,4]<0)#filtering the -999 values
data[non_values,4]=NaN
data=na.omit(data) # listwise deletion of missing
data1=data[,c(1,2,4)] # without elevation
datas=scale(data) # standardize variables with elev
data1s=scale(data1) # standardize variables without elev
data2 = read.table("colo_monthly_precip_07.dat") #data July
#assign names for columns/variables:
names(data2)=c("Lat","Long","Elev","Pm")
# Prepare Data
non_values<- which(data2[,4]<0)#filtering the -999 values
data2[non_values,4]=NaN
data2=na.omit(data) # listwise deletion of missing
data3=data2[,c(1,2,4)] # without elevation
data2s=scale(data2) # standardize variables with elev
data3s=scale(data3) # standardize variables without elev
# Determine number of clusters by iterations
Niter=50
WSS = matrix(nrow=Niter, ncol=15)
WSS1 = matrix(nrow=Niter, ncol=15)
for (j in 1:Niter) {
WSS[j,1]= (nrow(datas)-1)*sum(apply(datas,2,var))#with elev
WSS1[j,1]= (nrow(data1s)-1)*sum(apply(data1s,2,var))#without elev
for (i in 2:15) {
WSS[j,i] = sum(kmeans(datas, centers=i)$withinss)
WSS1[j,i]= sum(kmeans(data1s, centers=i)$withinss)
}}
wss=apply(WSS, 2, FUN = mean)
wss1=apply(WSS1, 2, FUN = mean)
P1=ggplot() +
geom_line(mapping = aes(1:15, wss)) +
geom_point(mapping = aes(1:15, wss), shape = 21, size = 3, color = "gray30", fill ="cadetblue1")+
labs(title = "Considering elevation",x = "Number of Clusters",y = "Within groups sum of squares")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
P2=ggplot() +
geom_line(mapping = aes(1:15, wss1)) +
geom_point(mapping = aes(1:15, wss1), shape = 21, size = 3, color = "gray30", fill ="cadetblue1")+
labs(title = "Without considering elevation",x = "Number of Clusters",y = "Within groups sum of squares")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
multiplot(P1,P2, cols = 2)
## find the number of clusters for each case
#Considering elevation
slope=wss[2:length(wss)]-wss[1:(length(wss)-1)]
ind=which(slope== max(slope))#optimal number of clusters
#Without considering elevation
slope=wss1[2:length(wss1)]-wss1[1:(length(wss1)-1)]
ind1=which(slope== max(slope))#optimal number of clusters
N_cluster=c(ind,ind1); # numbers cluster considered column1: W\Elev; Column2:Wo\Elev
# K-Means Cluster Analysis
fit <- kmeans(datas, N_cluster[1]) #W\Elev
fit1 <- kmeans(data1s, N_cluster[2]) #Wo\Elev
# get cluster means
means=aggregate(data,by=list(fit$cluster),FUN=mean)
means1=aggregate(data1,by=list(fit1$cluster),FUN=mean)
# append cluster assignment
Cluster=factor(fit$cluster)
data_1 = data.frame(data,Cluster)
Cluster=factor(fit1$cluster)
data1_1 = data.frame(data1,Cluster)
N_cluster=c(6,6); # numbers cluster considered column1: W\Elev; Column2:Wo\Elev
# K-Means Cluster Analysis
fit <- kmeans(datas, N_cluster[1]) #W\Elev
fit1 <- kmeans(data1s, N_cluster[2]) #Wo\Elev
# get cluster means
means=aggregate(data,by=list(fit$cluster),FUN=mean)
means1=aggregate(data1,by=list(fit1$cluster),FUN=mean)
# append cluster assignment
Cluster=factor(fit$cluster)
data_6 = data.frame(data,Cluster)
Cluster=factor(fit1$cluster)
data1_6 = data.frame(data1,Cluster)
# Determine number of clusters by iterations
WSS = matrix(nrow=Niter, ncol=15)
WSS1 = matrix(nrow=Niter, ncol=15)
for (j in 1:Niter) {
WSS[j,1]= (nrow(data2s)-1)*sum(apply(data2s,2,var))#with elev
WSS1[j,1]= (nrow(data3s)-1)*sum(apply(data3s,2,var))#without elev
for (i in 2:15) {
WSS[j,i] =sum(kmeans(data2s, centers=i)$withinss)
WSS1[j,i] = sum(kmeans(data3s, centers=i)$withinss)
}}
wss2=apply(WSS, 2, FUN = mean)
wss3=apply(WSS1, 2, FUN = mean)
P3=ggplot() +
geom_line(mapping = aes(1:15, wss2)) +
geom_point(mapping = aes(1:15, wss2), shape = 21, size = 3, color = "gray30", fill ="cadetblue1")+
labs(title = "Considering elevation",x = "Number of Clusters",y = "Within groups sum of squares")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
P4=ggplot() +
geom_line(mapping = aes(1:15, wss3)) +
geom_point(mapping = aes(1:15, wss3), shape = 21, size = 3, color = "gray30", fill ="cadetblue1")+
labs(title = "Without considering elevation",x = "Number of Clusters",y = "Within groups sum of squares")+
theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))
multiplot(P3,P4, cols = 2)
#Considering elevation
slope=wss2[2:length(wss2)]-wss2[1:(length(wss2)-1)]
ind2=which(slope== max(slope))#optimal number of clusters
#Without considering elevation
slope=wss3[2:length(wss3)]-wss3[1:(length(wss3)-1)]
ind3=which(slope== max(slope))#optimal number of clusters
N_cluster=c(ind2,ind3); # numbers cluster considered column1: W\Elev; Column2:Wo\Elev
# K-Means Cluster Analysis
fit2 <- kmeans(data2s, N_cluster[1]) #W\Elev
fit3 <- kmeans(data3s, N_cluster[2]) #Wo\Elev
# get cluster means
means=aggregate(data2,by=list(fit2$cluster),FUN=mean)
means1=aggregate(data3,by=list(fit3$cluster),FUN=mean)
# append cluster assignment
Cluster=factor(fit2$cluster)
data_2 = data.frame(data2,Cluster)
Cluster=factor(fit3$cluster)
data_3 = data.frame(data3,Cluster)
N_cluster=c(6,6); # numbers cluster considered column1: W\Elev; Column2:Wo\Elev
# K-Means Cluster Analysis
fit2 <- kmeans(data2s, N_cluster[1]) #W\Elev
fit3 <- kmeans(data3s, N_cluster[2]) #Wo\Elev
# get cluster means
means=aggregate(data2,by=list(fit2$cluster),FUN=mean)
means1=aggregate(data3,by=list(fit3$cluster),FUN=mean)
# append cluster assignment
Cluster=factor(fit2$cluster)
data2_6 = data.frame(data2,Cluster)
Cluster=factor(fit3$cluster)
data3_6 = data.frame(data3,Cluster)
Using the same algorithm which was used for January, Kbest for July precipitation were 14 and 14 for the cases with and without considering elevation, respectively.
By incluiding the elevation, Kbest is increase from 14 to 14. However, as in the csae of January, it is not possible to see differences betweeen clusters defined for two cases. it seems that these number of cluster is so much.
Considering K=6, plots of both cases are seen are not really different. it is only possible to see differences between south-west clusters.
Comments
The optimal number of clusters (Kbest) for both cases (with and without elevation) were found by an iterative algorithm, which runs kmeans() function n times, and then, it computes the wws average. According to this, the optimal number of cluster (Kbest) was found the maximum slope (positive or nearly to zero) of the of wws (kink plots). Thus, Kbest for January precipitation were 13 and 14 for the cases with and without considering elevation, respectively.
By incluiding the elevation, Kbest is reduced from 13 to 14. However, it is not possible to see differences betweeen clusters defined for two cases. it seems that these number of cluster is so much.
Considering K=6, plots are so much clear, and clear differences between cluster of both cases are seen. for example, cluster 5 of the case which considers elevation for clustering correspond a zone of low elevation while in the other case, this zone is divided in two clusters.