PCA

The data library at IRI, Columbia University is a wealth of data http://iridl.ldeo.columbia.edu/ (feel free to explore it). Gridded monthly global sea surface temperature (SST) anomalies for the period 1856 - present, also known as ‘Kaplan SST’ can be obtained from http://iridl.ldeo.columbia.edu/SOURCES/.KAPLAN/.EXTENDED/.v2/.ssta/ (Follow the link to ‘Data Files’ and download this monthly data as a binary direct access file. You can also do the NETCDF format as well). The reference to the paper describing this data set (Kaplan et al., 1998) is at the bottom of the above link. R-commands to read the binary data file and other commands are in R-Session#2

You wish to identify the space-time patterns of variability of the winter season (Dec - Mar) average SST for the Nov 1900 - Mar 2018 period (118 yrs). To this end,

  1. Perform a PCA on the winter global SST anomalies
  1. Perform a rotated PCA (rotate the first 6 PCs) and plot the leading 4 spatial and temporal modes of variability. Compare the results with (i) above.

  2. Repeat (i) and (ii) on the western US winter precipitation (Nov1900 - Mar2013) 113 yrs

  3. Perform (i) just for Pacific Ocean domain and correlate the first four PCs of winter SSTs with that of the western US precipitation and show correlation maps.

###########HW2,Problem 3
#### CLEAR MEMORY
rm(list=ls())
#### Prevent Warnings
options(warn=-1)

#### SOURCE LIBRARIES (suppress package loading messages) AND SET WORKING DIRECTORY
mainDir="C:/Users/DELL/Dropbox/Boulder/Courses/Advance data analysis/HW/HW2"
setwd(mainDir)
suppressPackageStartupMessages(source("Lib_HW2.R"))
source("Lib_HW2.R")
# unload packages that create a conflict with map("world2",add=T)
detach(package:mclust)
detach(package:sm)
detach(package:kohonen)

Lat - Long grid creation

### Lat - Long grid..
# information obtained from: http://iridl.ldeo.columbia.edu/SOURCES/.KAPLAN/.EXTENDED/.v2/ 

ygrid=seq(-87.5,87.5,by=5)
ny=length(ygrid) # number of latitudinal locations

xgrid=seq(27.5,382.5,by=5)
nx=length(xgrid) # number of longitudinal locations

nglobe = nx*ny
# creation of spatial grid
xygrid=matrix(0,nrow=nx*ny,ncol=2)
i=0
for(iy in 1:ny){
  for(ix in 1:nx){
    i=i+1
    xygrid[i,1]=ygrid[iy]
    xygrid[i,2]=xgrid[ix]
  }
}

Read Kaplan SST data..

nyrs = 118    #Nov-Mar 1901 - Nov-Mar 2018   
setwd("./data") #move to the data directory
data=readBin("Kaplan-SST-NDJFM1900-NDJFM2018.r4",what="numeric", n=( nx * ny * nyrs), size=4,endian="swap")

data = array(data = data, dim=c( nx, ny, nyrs ) )
data1=data[,,1]

# the lat -long data grid..

index=1:(nx*ny)

index1=index[data1 < 20 & data1 != "NaN"]   # only non-missing data.
xygrid1=xygrid[index1,]

nsites=length(index1)
data2=data1[index1]

### SSTdata matrix - rows are seasonal (i.e. one value per year)
## and columns are locations
sstannavg=matrix(NA,nrow=nyrs, ncol=nsites)

for(i in 1:nyrs){
  data1=data[,,i]
  index1=index[data1 < 20 & data1 != "NaN"]
  data2=data1[index1]
  sstannavg[i,]=data2
}  

indexgrid = index1
rm("data")  #remove the object data to clear up space

## write out the grid locations..
write(t(xygrid1),file="kaplan-sst-locs.txt",ncol=2)

I) Perform a PCA on the winter global SST anomalies and Eigen spectrum plot

#get variance matrix..
## scale the data
sstscale = scale(sstannavg)
zs=var(sstscale)

#do an Eigen decomposition..
zsvd=svd(zs)

#Principal Components...
pcs=sstscale %*%zsvd$u
#Eigen Values.. - fraction variance 
lambdas=(zsvd$d/sum(zsvd$d))
#Plot the eigen variance spectrum for the first 25 modes
#Eigen spectrum
ggplot() +
 geom_line(mapping = aes(c(1:25), lambdas[1:25])) +
 geom_point(mapping = aes(c(1:25), lambdas[1:25]), shape = 21, size = 3, color = "gray30", fill ="cadetblue1") +
 labs(title = "Eigen Spectrum",x = "Modes",y = "Frac. Var. explained")+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))

sprintf("First 3 PCs explained %s percent of the variance",round(sum(lambdas[1:3])*100))
## [1] "First 3 PCs explained 48 percent of the variance"
sprintf("First 4 PCs explained %s percent of the variance",round(sum(lambdas[1:4])*100))
## [1] "First 4 PCs explained 52 percent of the variance"

Plot the leading 4 spatial component or Eigen Vector pattern and the leading 4 PCs

xlong = sort(unique(xygrid[,2]))
ylat = sort(unique(xygrid[,1]))

par(mfrow = c(4, 2))
par(mar = c(3, 4, 2, 1))
for(i in 1:4){
  zfull = rep(NaN,nglobe)   #also equal 72*36
  zfull[indexgrid]=zsvd$u[,i]
  zmat = matrix(zfull,nrow=nx,ncol=ny)
  image.plot(xlong,ylat,zmat,ylim=range(-40,70),ann=FALSE)
  mtext(paste("EOF no.",i,sep=""), side = 3, line = 0.2, cex = 0.8)
  contour(xlong,ylat,(zmat),ylim=range(-40,70),add=TRUE,nlev=6,lwd=2)
  map("world2",add=T)
  box()
  
  plot(1901:2018, scale(pcs[,i]),type="l",xlab="Year",axes=FALSE,ann=FALSE)
  axis(2)
  axis(1)
  mtext(paste("PC no.",i,sep=""), side = 3, line = 0.2, cex = 0.8)
}

Plot PC2 and ENSO

## read ENSO index
nino4=scan("http://iridl.ldeo.columbia.edu/SOURCES/.Indices/.nino/.EXTENDED/.NINO4/T/(Nov%201899)/(Mar%202018)/RANGE/T/(Nov-Mar)/seasonalAverage/data.ch")
## PC2 will be ENSO
print(bquote("the correlation between PC no.2 and ENSO index is "~ R^2 == .(round(cor(scale(pcs[,2]),nino4)*1000)/1000)))
## "the correlation between PC no.2 and ENSO index is " ~ R^2 == 
##     0.796
ggplot() +
 geom_line(mapping = aes(1901:2018, scale(pcs[,2]),colour="gray50")) +
  geom_line(mapping = aes(1901:2018, nino4,colour="red")) +
   scale_color_discrete(name = "Time series", labels = c("PC no.2", "ENSO index"))+
  geom_point(mapping = aes(1901:2018, scale(pcs[,2])), shape = 21, size = 3, color = "gray50", fill = "white") +
 labs(title="PC2 and  ENSO index",x = "Year",y = "PC no.2")+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))

II) rotate the first 6 PCs and plot the leading 4 spatial and temporal modes of variability

##########################################################################################################
####### ii rotate the first 6 PCs and plot the leading 4 spatial and temporal modes of variability #####
##########################################################################################################

zrot = varimax(zsvd$u[,1:6],normalize=FALSE)
rotpcs=sstannavg %*% zrot$loadings
xlong = sort(unique(xygrid[,2]))
ylat = sort(unique(xygrid[,1]))
par(mfrow = c(4, 2))
par(mar = c(3, 4, 2, 1))

for(i in 1:4){
  zfull = rep(NaN,nglobe)   #also equal 72*36
  zfull[indexgrid]=zrot$loadings[,i]
  zmat = matrix(zfull,nrow=nx,ncol=ny)
  image.plot(xlong,ylat,zmat,ylim=range(-50,70),ann=FALSE)
  mtext(paste("Rotated EOF no.",i,sep=""), side = 3, line = 0.2, cex = 0.8)
  contour(xlong,ylat,(zmat),ylim=range(-50,70),add=TRUE,nlev=6,lwd=2)
  map("world2",add=T)
  box()
  
  plot(1901:2018, scale(rotpcs[,i]),type="l",xlab="Year",axes=FALSE,ann=FALSE)
  axis(2)
  axis(1)
  mtext(paste("Rotated PC no.",i,sep=""), side = 3, line = 0.2, cex = 0.8)
}

## Plot rotated PC2 and ENSO

print(bquote("the correlation between rotated PC no.2 and ENSO index is "~ R^2 == .(round(cor(scale(rotpcs[,2]),nino4)*1000)/1000)))
## "the correlation between rotated PC no.2 and ENSO index is " ~ 
##     R^2 == 0.898
ggplot() +
 geom_line(mapping = aes(1901:2018, scale(rotpcs[,2]),colour="gray50")) +
  geom_line(mapping = aes(1901:2018, nino4,colour="red")) +
   scale_color_discrete(name = "Time series", labels = c("PC no.2", "ENSO index"))+
  geom_point(mapping = aes(1901:2018, scale(rotpcs[,2])), shape = 21, size = 3, color = "gray50", fill = "white") +
 labs(title="Rotated PC no.2 and  ENSO index",x = "Year",y = "PC no.2")+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))

Comments

SST PCA

  • According to the Eigen spectrum, the first 3 PCs would be an adequate choice since they explained 48% of the SST variance. However, due to the problem’ instruction, the first 4 PCs were analyzed which can explained 52% of the variance.
  • The first PC looks like a trend in the time plot, but it is not possible to identify what is the trend in the spatial map (EOF no.1).
  • The second PC seems as ENSO. This can be seen in both spatial map and time plot, and it is confirmed in the time plot of both ENSO index and PC2.
  • In the case of PCs 3 and 4, both show signatures in the north Pacific as well as in the Atlantic Ocean. However, their temporal plots do not show any trend, they look like random noise.

Rotated SST PCA

  • Spatial pattern become more evident (hot zones become stronger). this verified by the correlation between PC2 and ENSO index. In this case, \(R^2=0.898\) and for the original PC2 the correlation is lower \(R^2=0.796\). However, as in the original PCs, the PC2 does not show any trend.

  • Temporal plot of PC3 shows a king of cycling pattern of roughly 30 years of period.

III) Repeat (i) and (ii) on the western US winter precipitation (Nov1900 - Mar2013) 113 yrs

Read Western US grid and winter precipitation data

setwd("./data") #move to the data directory
N = 113 #Nov-Mar 1901 - Nov-Mar 2013 
wuplocs = matrix(scan("WesternUS-coords.txt"), ncol=5,byrow=T)
nlocs = dim(wuplocs)[1]
xlats = wuplocs[,3]
xlongs = -wuplocs[,4]

nlocs1=nlocs+1
winterprecip = matrix(scan("western-US-winterprecip-1901-2014.txt"),ncol=nlocs1,byrow=T)
years = winterprecip[,1]
winterprecip = winterprecip[,2:nlocs1]      #first column is year

Perform a PCA on the winter global SST anomalies

### Perform PCA

#get variance matrix..
winterprecip1 = scale(winterprecip)
zs=var(winterprecip1)

#do an Eigen decomposition..
zsvd=svd(zs)
#Eigen vectors
Preceof=zsvd$u
#Principal Components...
precippcs=winterprecip1 %*% zsvd$u

#Eigen Values.. - fraction variance 
lambdas=(zsvd$d/sum(zsvd$d))

#Plot the eigen variance spectrum for the first 25 modes
#Eigen spectrum
ggplot() +
 geom_line(mapping = aes(c(1:25), lambdas[1:25])) +
 geom_point(mapping = aes(c(1:25), lambdas[1:25]), shape = 21, size = 3, color = "gray30", fill ="cadetblue1") +
 labs(title = "Eigen Spectrum",x = "Modes",y = "Frac. Var. explained")+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))

sprintf("First 3 PCs explained %s percent of the variance",round(sum(lambdas[1:3])*100))
## [1] "First 3 PCs explained 68 percent of the variance"
sprintf("First 4 PCs explained %s percent of the variance",round(sum(lambdas[1:4])*100))
## [1] "First 4 PCs explained 73 percent of the variance"

Plot the leading 4 spatial component or Eigen Vector pattern and the leading 4 PCs..

par(mfrow = c(4, 2))
par(mar = c(3, 4, 2, 1))
for(i in 1:4){
  quilt.plot(xlongs, xlats, zsvd$u[,i],xlim=range(-125,-100),ann=FALSE)
  mtext(paste("EOF no.",i,sep=""), side = 3, line = 0.2, cex = 0.8)
  grid(col="gray70",lty=2)
  US(add=TRUE, col="gray40", lwd=1,xlim=range(-125,-100))
  box()
  
  plot(1901:2013, precippcs[,i],type="l",axes=FALSE,ann=FALSE)
  axis(2)
  axis(1)
  mtext(paste("PC no.",i,sep=""), side = 3, line = 0.2, cex = 0.8)
}

Rotate the first 6 PCs and plot the leading 4 spatial and temporal modes of variability

zrot = varimax(zsvd$u[,1:6],normalize=FALSE)
rotpcs=t(t(zrot$loadings) %*% t(winterprecip1))
## plot the first 4 rotated spatial mode and the rotated PCs
par(mfrow = c(4, 2))
par(mar = c(3, 4, 2, 1))
for(i in 1:4){
  quilt.plot(xlongs, xlats, zrot$loadings[,i],xlim=range(-125,-100),ann=FALSE)
  mtext(paste("Rotated EOF no.",i,sep=""), side = 3, line = 0.2, cex = 0.8)
  grid(col="gray70",lty=2)
  US(add=TRUE, col="gray40", lwd=1,xlim=range(-125,-100))
  box()
  
  plot(1901:2013, scale(rotpcs[,i]),type="l",axes=FALSE,ann=FALSE)
  axis(2)
  axis(1)
  mtext(paste("Rotated PC no.",i,sep=""), side = 3, line = 0.2, cex = 0.8)
}

Pgrid=cbind(xlongs, xlats)

Comments

Western US precipitation PCA

  • According to the Eigen spectrum, the first 3 PCs would be an adequate choice since they explained 68% of the SST variance. However, due to the problem’ instruction, the first 4 PCs were analyzed which can explained 73% of the variance.
  • In spatial map of EOF 1 and 2 it is possible to see a contrast between north and south of US, while in the case of EOF 3 and 4 a contrast between east and western US is seen. In the case of plot of PCs, temporal pattern of PCs is not evident. It is only possible to see a negative trend for PC3

Rotated western US precipitation PCA

  • Spatial patterns of EOF 2 and 4 become stronger, but for EOF 1 and 3 pattern are less evident.

  • Although it is possible to see the negative temporal trend of PC3, this is less clear than in the original PCs. In the case of the other PCs, it still difficult to identify any pattern.

IV) Perform (i) just for Pacific Ocean domain. Only period ov-Mar 1901 - Nov-Mar 2013 is considered

## Index of locations corresponding to Pacific 
sstdata=sstannavg
xlongs = xygrid1[,2]
ylats = xygrid1[,1]
indexpac = indexgrid[ ylats >= -20 & xlongs >= 105 & xlongs <= 290]


xlong = xlongs[ylats >= -20 &  xlongs >= 105 & xlongs <= 290]
ylat = ylats[ylats >= -20 &  xlongs >= 105 & xlongs <= 290]

index1=1:length(xlongs)
index = index1[ ylats >= -20 & xlongs >= 105 & xlongs <= 290]


sstannavg = sstdata[1:N,index] #SSTs Nov-Mar 1901 - Nov-Mar 2013
indexgrid = indexpac

Perform a PCA on the winter Pacific SST anomalies

#get variance matrix..
## scale the data
sstscale = scale(sstannavg)
zs=var(sstscale)

#do an Eigen decomposition..
zsvd=svd(zs)

#Principal Components...
pcs=sstscale %*%zsvd$u
#Eigen Values.. - fraction variance 
lambdas=(zsvd$d/sum(zsvd$d))

#Plot the eigen variance spectrum for the first 25 modes
#Eigen spectrum
ggplot() +
 geom_line(mapping = aes(c(1:25), lambdas[1:25])) +
 geom_point(mapping = aes(c(1:25), lambdas[1:25]), shape = 21, size = 3, color = "gray30", fill ="cadetblue1") +
 labs(title = "Eigen Spectrum",x = "Modes",y = "Frac. Var. explained")+
  theme(plot.title = element_text(size = 13, face = "bold", hjust = 0.5))

sprintf("First 3 PCs explained %s percent of the variance",round(sum(lambdas[1:3])*100))
## [1] "First 3 PCs explained 56 percent of the variance"
sprintf("First 4 PCs explained %s percent of the variance",round(sum(lambdas[1:4])*100))
## [1] "First 4 PCs explained 62 percent of the variance"

Plot the leading 4 spatial component or Eigen Vector pattern and the leading 4 PCs

# the data is on a grid so fill the entire globaal grid with NaN and then populate the ocean grids with
# the Eigen vector
xlong = sort(unique(xygrid[,2]))
ylat = sort(unique(xygrid[,1]))
par(mfrow = c(4, 2))
par(mar = c(3, 4, 2, 1))
for(i in 1:4){
  zfull = rep(NaN,nglobe)   #also equal 72*36
  zfull[indexgrid]=zsvd$u[,i]
  zmat = matrix(zfull,nrow=nx,ncol=ny)
  image.plot(xlong,ylat,zmat,ylim=range(-40,70),ann=FALSE)
  mtext(paste("EOF no.",i,sep=""), side = 3, line = 0.2, cex = 0.8)
  contour(xlong,ylat,(zmat),ylim=range(-40,70),add=TRUE,nlev=6,lwd=2)
  map("world2",add=T)
  box()
  
  plot(1901:2013, pcs[,i],type="l",xlab="Year",axes=FALSE,ann=FALSE)
  axis(2)
  axis(1)
  mtext(paste("PC no.",i,sep=""), side = 3, line = 0.2, cex = 0.8)
}

Correlate the first four PCs of winter SSTs with that of the western US precipitation

cor_pc=cor(precippcs[,1:4],pcs[,1:4])# correlation between PCS
par(mfrow = c(4, 4))
par( oma=c(2,0,4,0)) # save some room for the legend
par(mar = c(4, 4, 2, 1))
for (i in 1:4) {
  for (j in 1:4) {
    if(i==1 & j==1){
      plot(pcs[,j],precippcs[,i],type="p",xlab=paste("R2=",round(cor_pc[i,j]*1000)/1000,sep = ""),ylab=as.character(i),xlim=c(-20,20),ylim=c(-10,10),main=as.character(j),axes=FALSE,font.lab=2)
      axis(2)
      axis(1)
    }else if(i==1){
      plot(pcs[,j],precippcs[,i],type="p",xlab=paste("R2=",round(cor_pc[i,j]*1000)/1000,sep = ""),ylab="",xlim=c(-20,20),ylim=c(-10,10),main=as.character(j),axes=FALSE,font.lab=2 )
      axis(2)
      axis(1)
    }else if(j==1){
      plot(pcs[,j],precippcs[,i],type="p",xlab=paste("R2=",round(cor_pc[i,j]*1000)/1000,sep = ""),ylab=as.character(i),xlim=c(-20,20),ylim=c(-10,10),axes=FALSE,font.lab=2 )
      axis(2)
      axis(1)
    }
    else {
      plot(pcs[,j],precippcs[,i],type="p",xlab=paste("R2=",round(cor_pc[i,j]*1000)/1000,sep = ""),ylab="",xlim=c(-20,20),ylim=c(-10,10),axes=FALSE,font.lab=2 )
      axis(2)
      axis(1)
    }}}
mtext("Correlation between four PCs of winter SSTs with that of the western US precipitation", outer = TRUE, side = 3, cex = 1.2, line = 1)

predict western US precipitation based in correlaction betweeen PCS

fit1 <- lm(precippcs[,1] ~ pcs[,c(1,2,3)])## Linear fitting for Precip PCy1=f(PCx1,PCx2,PCx4) 
fit2 <- lm(precippcs[,2] ~ pcs[,c(1,2,4)])## Linear fitting for Precip PCy2=f(PCx1,PCx2,PCx4) 
fit3 <- lm(precippcs[,3] ~ pcs[,c(1,2,4)])## Linear fitting for PrecipPCy3=f(PCx1,PCx2,PCx4)
fit4 <- lm(precippcs[,4] ~ pcs[,4])## Linear fitting for Precip PCy4=f(PCx4) 
## estimation of PCy based on Local polynomial
PrecPC_pred=matrix(1,nrow=dim(precippcs)[1],ncol=dim(precippcs)[2])
PrecPC_pred[,1]=predict(fit1,newdata=as.data.frame(pcs[,c(1,2,3)]),se.fit=F)
PrecPC_pred[,2]=predict(fit2,newdata=as.data.frame(pcs[,c(1,2,4)]),se.fit=F)
PrecPC_pred[,3]=predict(fit3,newdata=as.data.frame(pcs[,c(1,2,4)]),se.fit=F)
PrecPC_pred[,4]=predict(fit4,newdata=as.data.frame(pcs[,4]),se.fit=F)
# for the rest of PCy their mean will consider as estimator
for (i in 5:dim(precippcs)[2]) {
  PrecPC_pred[,i]=PrecPC_pred[,i]*mean(precippcs[,i])
}

#Principal Components...
precPred=PrecPC_pred %*% t(Preceof)

### correlate the forecasted PCs with the historical PCs
pccor = diag(cor(PrecPC_pred[,1:4],precippcs[,1:4]))
print("the correlation between observed and predicted PCs is:")
## [1] "the correlation between observed and predicted PCs is:"
print(round(pccor*1000)/1000)
## [1] 0.468 0.385 0.443 0.214
## correlate the forecasted rainfall with the historical rainfall
preccor = diag(cor(precPred,winterprecip1))

#plot of R2 between the observed and predicted winter precipitation at each grid point
nlevel=11
coltab=rev(brewer.pal(nlevel-1,'RdBu'))
zr=range(preccor)
break_a=round(seq(zr[1],zr[2],by=((zr[2]-zr[1])/(nlevel-1)))*100)/100
quilt.plot(Pgrid[,1], Pgrid[,2],preccor ,xlim=range(-125,-100),col=coltab,main = bquote(~ R^2 ~ "between the observed and predicted winter precipitation using first 4 SST PCs"),xlab="Longitude",ylab="Latitude",zlim=zr,lab.breaks=break_a)
grid(col="gray70",lty=2)
US(add=TRUE, col="gray50", lwd=2,xlim=range(-125,-100))
box()

Comments

  • According to the Eigen spectrum, the first 3 PCs would be an adequate choice since they explained 56% of the SST variance. However, due to the problem’ instruction, the first 4 PCs were analyzed which can explained 62% of the variance.
  • The first spatial map (EOF 1) displays a central N Pacific signature. However, there is a clear temporal pattern (temporal plot of PC1), it is only possible to see a mild negative trend.
  • As in the case of global SST, the second PC correspond to ENSO, and the negative temporal trend is really clear.
  • PC3 and 4 show signatures in the north pacific, but temporal pattern are not clear.

  • In regards to the correlation between winter SST PCs (SPC) and PCs of winter western US precipitation (PPC), First PC of precipitation (row 1 of correlation plot) are linearly correlated (\(R^2 >0.1\)) with PC1, 2, and 3 of SST. PC2 of precipitation is linearly correlated with P1, 2, and 4. Finally, PC 3 and 4 of precipitation are linearly correlated with PC1 and 2 and PC1 respectively.

  • The information of correlation between winter western US precipitation and SST was used to fit linear model for precipitation PCs, and then, estimate precipitation in the western US. As it can see in the correlation spatial map, prediction were good in the north and south states, but really poor in the middle of the US.