# CVEN 5454 - Lecture 2, 01/16/2020
# Part I

# In this code, we'll go over some introductory concepts for using R.
# Then we'll implement the following statistical measures:
# Mean, median, variance, standard deviation, IQR, skewness.
# We'll also show how to plot boxplots, histograms, and scatterplots
# in base R.

################################################################################

# First, we'll build a small numeric vector by combining a series of values.
# (These are fixed, hand-picked numbers rather than randomly generated ones.)
x1 <- c(2, 4, 8, 9, 11, 11, 120) # note, c() combines values into a vector in R.

# check out the data, make sure it's what we think it is
x1 # print the vector
class(x1) # check the data type
length(x1) # find the length of the data
dim(x1) # a plain vector has no dim attribute, so this returns NULL

# Data can also be read straight from a URL instead of being typed in by hand.
rain <- read.table(
  "http://civil.colorado.edu/~balajir/r-session-files/aismr.txt"
)
View(rain) # open the table in the data viewer
help(read.table) # get help for a certain command
class(rain) # check what kind of object read.table() produced

# Name the columns: the year followed by the twelve month abbreviations.
# month.abb is R's built-in vector "Jan", "Feb", ..., "Dec".
names(rain) <- c("Year", month.abb)

# Indexing columns of a dataframe
View(rain$Year) # use the '$' to index a single column of a dataframe

# Turn the rain dataframe into a matrix and see what happens.
# Note: as.matrix() for data frames has no `dimnames` argument (it would be
# silently ignored), so none is passed; the matrix keeps the column names.
rain_matrix <- as.matrix(rain)
class(rain_matrix)

# uh oh, we can't use the '$' to index a matrix: this line raises an error.
# try() still shows the error message, but lets the rest of the script keep
# running when the file is executed with source() or Rscript.
try(rain_matrix$Year)

# Indexing a matrix uses [row, column]
years <- rain_matrix[, 1] # indexes the first column
year_1872 <- rain_matrix[2, ] # indexes the second row

# What about importing data from a file on your own computer?
# Determine your working directory
getwd()

# Try to read in data from file. A bare file name is looked up in the working
# directory, so this fails unless aismr.txt is already there. try() reports
# the error without aborting the rest of the script.
try(rain2 <- read.table("aismr.txt"))

# Set your working directory to the folder that holds aismr.txt.
# Edit this path for your own machine; the guard avoids a hard error when
# the folder does not exist on the computer running the script.
data_dir <- "C:/Users/mdero/Documents/CVEN_5454"
if (dir.exists(data_dir)) {
  setwd(data_dir)
} else {
  message("Directory not found, working directory unchanged: ", data_dir)
}

# Try to read in data again, now relative to the new working directory
try(rain2 <- read.table("aismr.txt"))


################################################################################
# Now we'll show the R commands for some simple statistics

x1 <- c(
  2, 4, 8, 9, 11, 11, 120
)

# Measures of centre, using R's built-in functions
x1_mean <- mean(x1) # arithmetic mean
x1_median <- median(x1) # middle value of the sorted data

# Measures of spread
x1_var <- var(x1) # sample variance
x1_stdev <- sd(x1) # sample standard deviation
x1_IQR <- IQR(x1) # interquartile range

# The standard deviation should equal the square root of the variance,
# so these two printed values should match.
print(x1_stdev)
print(x1_var^0.5)

# calculate the skew; skewness() is not a base R function, so we first load an
# add-on package (e1071) to get one.
# If the package is not installed yet, run this once:
# install.packages("e1071")
library(e1071)
x1_skew <- skewness(x1)

################################################################################
# Next we'll introduce histograms in base R:

# Made-up measurements of total organic carbon (TOC), in ppm
toc <- c(
  21, 18, 43, 21, 11, 18, 22, 19, 24,
  25, 15, 20, 26, 23, 22, 18, 27
)

# A histogram shows the distribution of a variable by sorting the values
# into bins and drawing one bar per bin.
hist(toc) # very basic histogram in 'base R', all defaults

# Same data, with a suggested number of breaks, an x-axis label, and no title
hist(toc, breaks = 5, main = "", xlab = "TOC (ppm)")

################################################################################
# Now, we'll do boxplots in base R:
# A boxplot is another method for representing a distribution of a variable
# (more condensed than a histogram)
# Boxplot notes:
# - middle line is the median
# - box spans the 25th to 75th percentiles of the data (the IQR)
# - whiskers extend to the most extreme data points within 1.5 * IQR of the box
# - any points beyond the whiskers are plotted individually as outliers

boxplot(toc) # plain default boxplot; resize the plot before using it elsewhere
# Same plot with a y-axis label and a title (x-axis label left blank)
boxplot(toc, main = "Boxplot of TOC", xlab = "", ylab = "TOC (ppm)")

################################################################################
# Last, we'll do a scatterplot in base R

# Draw two independent samples of 10 values each from the standard normal
# distribution (rnorm() defaults to mean = 0 and sd = 1).
x <- rnorm(10)
y <- rnorm(10)

# Simple scatterplot of y against x; pch = 16 draws filled circles
plot(x, y, pch = 16)

# Add a red reference line with an intercept of zero and a slope of one
abline(a = 0, b = 1, col = "red")