## Cleaning, Visualization Options, Machine Learning Examples
###############################
##
## Data Visualization
##
## Example 1: The Data Science
## Life Cycle -
## Visually for RECORD DATA
## with smaller dimensionality.
##
## Methods: cleaning, EDA, ML
##
## Gates
##
## Great reference:
## http://r-statistics.co/Top50-Ggplot2-Visualizations-MasterList-R-Code.html
##
## ASK YOURSELF THIS QUESTION:
##
## Can you answer a question with a vis?
###################################
## Close any open graphics device (skip if none is open)
if (length(dev.list()) > 0) dev.off()
library(ggplot2)
##
## DATA SET
##
## https://drive.google.com/file/d/1KtnccHzbms1NGz2DzQVPM7W4U9s1s3bS/view?usp=sharing
##
##
Myfile="SummerStudentAdmissions3_.csv"
## USE YOUR OWN PATH - this is my path!
setwd("C:/Users/profa/Documents/RStudioFolder_1/DrGExamples/ANLY503")
MyData <- read.csv(Myfile)
MyData
############################################
## Part 1: Cleaning the data
## using data vis - ggplot
##
## EDA is Exploratory Data Analysis
## Clean and explore...
################################################
## LOOK AT Each Variable.
str(MyData)
## Notice that there are 9 variables.
## Variables are also called features, attributes, or columns.
(MyVarNames<-names(MyData))
MyVarNames[1]
MyData[MyVarNames[1]]
(NumColumns <-ncol(MyData))
##############################
## Column 1: Decision
###################################
## This is NOT part of the data!
## It is the LABEL of the data.
## Dataset labels should be of type factor
str(MyData$Decision)
table(MyData$Decision)
## VISUALIZE to SEE what/where the errors are
theme_set(theme_classic())
MyBasePlot1 <- ggplot(MyData)
(MyBasePlot1<-MyBasePlot1 +
geom_bar(aes(Decision, fill = Decision)) +
ggtitle("Decision Label"))
## OK - I have problems.
## 1) I have a blank level - likely from a missing value.
## 2) I have a label called banana - which is wrong.
## Let's fix these.
## To fix factor data, first convert it to char.
str(MyData$Decision)
nrow(MyData)
MyData$Decision
## View some of the data
subset(MyData, select=c("Decision", "GPA"))
##Convert it to char.
MyData$Decision <- as.character(MyData$Decision)
## Keep only rows that are "Admit", "Decline", or "Waitlist"
MyData <- MyData[(MyData$Decision == "Admit" |
MyData$Decision == "Decline" |
MyData$Decision == "Waitlist"),]
nrow(MyData)
## Check it again
(MyPlot1<-ggplot(MyData, aes(x=Decision, fill=Decision)) +
geom_bar()+
geom_text(stat='count',aes(label=..count..),vjust=2)+
ggtitle("Student Dataset Labels"))
## Good! Now we can see (and show others) that the
## label in the dataset is clean and balanced.
## NOTE that we have color, a title, an x-axis label
## and labeled bars. We also have a legend.
## We are not done!!
str(MyData$Decision)
## This needs to be changed to type: factor
MyData$Decision<-as.factor(MyData$Decision)
## Check it
table(MyData$Decision)
str(MyData$Decision)
## Good! We now have factor data with 3 levels.
#################################################
## The next variable to look at is Gender
## Like Decision, Gender is also qualitative.
## Let's use a pie chart to look at it...
#################################################
str(MyData$Gender)
NumRows=nrow(MyData)
(TempTable <- table(MyData$Gender))
(MyLabels <- paste(names(TempTable), ":",
round(TempTable/NumRows,2) ,sep=""))
pie(TempTable, labels = MyLabels,
main="Pie Chart of Gender")
#install.packages("plotrix")
library(plotrix)
pie3D(TempTable,labels=MyLabels,explode=0.3,
main="Pie Chart of Gender ")
table(MyData$Gender)
## We have one problem.
## We have a blank or NA in the data - or something like this....
## We need to fix this.
(sum(is.na(MyData$Gender))) ## This confirms that it is not NA
## Let's look at str
str(MyData$Gender)
## This shows that we have blank and not NA....
## FIX - change to char, correct, change back to factor
## Keep track of what you are removing from the dataset
## We can look because our dataset is very small....
MyData
nrow(MyData)
MyData$Gender <- as.character(MyData$Gender)
## Keep only rows that are Male or Female
MyData <- MyData[(MyData$Gender == "Male" |
MyData$Gender == "Female") ,]
nrow(MyData)
## Turn back to factor
MyData$Gender<- as.factor(MyData$Gender)
str(MyData$Gender)
table(MyData$Gender)
(TempTable <- table(MyData$Gender))
(MyLabels <- paste(names(TempTable), ":",
round(TempTable/NumRows,2) ,sep=""))
pie(TempTable, labels = MyLabels,
main="Pie Chart of Gender")
############################################
## Next variable is: DateSub
#############################################
#names(MyData)
## Check format
str(MyData$DateSub) ## It is incorrect.
## Check for NAs
(sum(is.na(MyData$DateSub)))
## Check the table
table(MyData$DateSub)
## The dates look ok - but the format is wrong and
## needs to be DATE
(MyData$DateSub <- as.Date(MyData$DateSub, "%m/%d/%Y") )
str(MyData$DateSub)
## Now that we have dates, we can visualize them with
## a time series vis option.
ggplot(data = MyData, aes(x = DateSub, y = GPA))+
geom_line(color = "#00AFBB", size = 2)
## We have a problem!
## The GPA should never be above 4.0.
ggplot(MyData, aes(x = DateSub, y = GPA)) +
geom_area(aes(color = Gender, fill = Gender),
alpha = 0.5, position = position_dodge(0.8)) +
scale_color_manual(values = c("#00AFBB", "#E7B800")) +
scale_fill_manual(values = c("#00AFBB", "#E7B800"))
## We can already SEE many things.
## We can see that Males applied a bit early and a bit later.
## We can see that we have an error in at least one GPA
## value that we will need to fix.
## We can see that Female and Male application times and GPAs
## do not appear significantly different - but we can investigate this further.
#####################################################
##
## Let's look at GPA and then dates with it
####################################################
str(MyData$GPA)
MyData$GPA<-as.numeric(MyData$GPA)
table(MyData$GPA)
## Are there NAs?
(sum(is.na(MyData$GPA)))
## Fix the missing GPA first
## Find it
(MissingGPA <- MyData[is.na(MyData$GPA),])
## OK - it's a Female/Admit. We can replace the missing GPA
## with the median of all Female Admits.
## CREATE a temp dataframe with all the Females and Admits...
(Temp<-MyData[MyData$Decision=="Admit" & MyData$Gender=="Female",])
## The median for Female Admits is:
(MyMed<-median(Temp$GPA, na.rm=TRUE))
## NOW - replace the missing GPA with this Median
MyData$GPA[is.na(MyData$GPA)] <- MyMed
## Check to assure the missing value was updated...
(sum(is.na(MyData$GPA)))
table(MyData$GPA)
## While I will use plyr below - it is good to
## know how to use functions as well...
##-------------------------------------------
## Create a function to get medians
GetMedian <- function(x){
out<-median(x)
return(out)
}
## Check and use the function
(MaleMed<-GetMedian(MyData$GPA[which(MyData$Gender=="Male")]))
(FemaleMed<-GetMedian(MyData$GPA[which(MyData$Gender=="Female")]))
##---------------------------------------------
library(plyr)
## Create a table using the dataset
## This table is BY Gender
## The method is summarize
## A new column is med and is the median for GPA
(TEMPmeds <- ddply(MyData, .(Gender), summarize,
med = median(GPA)))
## Next, we have an incorrect value....let's SEE IT
(MyV1 <- ggplot(MyData, aes(x=Gender, y=GPA, fill=Gender)) +
geom_violin(trim=TRUE)+ geom_boxplot(width=0.1)+
geom_text(data = TEMPmeds,
aes(x = Gender, y = med, label = med),
size = 3, vjust = -1.5,hjust=-1)+
ggtitle("GPA and Gender")+
geom_jitter(shape=16, position=position_jitter(0.02)))
## Now we can SEE the issue. There is at least one GPA
## that is out of range. Let's fix this.
## Let's replace the out-of-range GPA by finding the median
## for the ADMITS in that Gender group
## FIND the row with GPA > 4
(WrongGPAs <- MyData[(MyData$GPA<0 | MyData$GPA >4),])
##
## We have a Male Admit with a GPA of 6.
## Fix it by using the Male Admit GPA median
(Temp<-MyData[MyData$Decision=="Admit" & MyData$Gender=="Male",])
## The median for Male Admits is:
(MyMed<-median(Temp$GPA, na.rm=TRUE))
## NOW - replace the incorrect GPA with this median
MyData$GPA[MyData$GPA>4] <- MyMed
## NOW VISUALIZE IT AGAIN:
(TEMPmeds <- ddply(MyData, .(Gender), summarize,
med = round(median(GPA),2)))
## Now plot it again to confirm the fix...
(MyV1 <- ggplot(MyData, aes(x=Gender, y=GPA, fill=Gender)) +
geom_violin(trim=TRUE)+ geom_boxplot(width=0.1)+
geom_text(data = TEMPmeds,
aes(x = Gender, y = med, label = med),
size = 4, vjust = -1.2,hjust=-1.1)+
ggtitle("GPA and Gender")+
geom_jitter(shape=16, position=position_jitter(0.02)))
## That's better!
table(MyData$GPA)
## LOOKS GOOD!
#############################################
##
## Let's look at State next
############################################
#names(MyData)
str(MyData$State)
## Let's use a BAR to look
BaseGraph <- ggplot(MyData)
(MyG3<-BaseGraph +
geom_bar(aes(State, fill = Gender), position="dodge")+
ggtitle("States and Gender"))
## UGLY!!
## Let's make this nicer so we can READ THE X AXIS
(MyG3<-BaseGraph +
geom_bar(aes(State, fill = Gender), position="dodge")+
ggtitle("States and Gender")+
theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)))
## MUCH BETTER!
## Now we can SEE that we have problems :)
## First, we have poor balance. It might be necessary to
## collect all the low-count states, such as Alabama, Mississippi,
## etc., into a group called OTHER. However, we will not do this here.
## If you want to see how - look at this other tutorial
## http://drgates.georgetown.domains/SummerClassificationRMarkdown.html
## Also - We have two Virginias - we need to combine them:
MyData$State[MyData$State == "virginia"] <- "Virginia"
table(MyData$State)
## Now - we need to drop the unused level "virginia"
MyData$State<-as.character(MyData$State)
table(MyData$State)
MyData$State<-as.factor(MyData$State)
str(MyData$State)
## Check it
(MyG4<-ggplot(MyData) +
geom_bar(aes(State, fill = Gender), position="stack")+
ggtitle("States and Gender")+
theme(axis.text.x=element_text(angle=90,hjust=1,vjust=0.5)))
## Even better!
#########################################
##
## Now let's look at WorkExp
#######################################
#names(MyData)
(sum(is.na(MyData$WorkExp)))
str(MyData$WorkExp)
## Let's look
theme_set(theme_classic())
# Scatter plot of two continuous (numeric) variables
(MyS3 <- ggplot(MyData,aes(x=WorkExp, y=GPA, color=Decision)) +
geom_point() +
scale_color_manual(values = c('blue',"red", "green")))
## This helps in many ways. We can see that we have no outliers
## or odd values.
## However, let's check it with a box plot.
(MyL1<-ggplot(MyData, aes(x=Decision, y=WorkExp))+
geom_boxplot()+
geom_jitter(position=position_jitter(.01), aes(color=Gender))+
ggtitle("Work Experience, Admissions, and Gender"))
## This looks good and it also starts to tell us that people
## were not penalized or preferred based on work experience.
#####################################################
##
## Let's look at TestScore and Writing Score
##
#######################################################
(sum(is.na(MyData$TestScore)))
(sum(is.na(MyData$WritingScore)))
str(MyData)
## Box plots are great to look for odd values
(MyL2<-ggplot(MyData, aes(x=Decision, y=TestScore))+
geom_boxplot()+
geom_jitter(position=position_jitter(.01), aes(color=Gender))+
ggtitle("Test Score, Admissions, and Gender"))
## Interesting!! This mostly makes sense except for the 800
## in the Admit group. However, it is not an outlier - it is
## just interesting.
(MyL3<-ggplot(MyData, aes(x=Decision, y=WritingScore))+
geom_boxplot()+
geom_jitter(position=position_jitter(.01), aes(color=Gender))+
ggtitle("Writing Score, Admissions, and Gender"))
## Hmmm - most of this looks OK, BUT, we have some very strange
## values for the Admit group.
## *** Let's look at these:
(Temp <- subset(MyData, Decision=="Admit",
select=c(Decision,WritingScore)) )
table(Temp$WritingScore)
## OK - we can see that two scores seem incorrect.
## The 1 and the 11 are not likely for an Admit.
## Let's replace them with the median
(Temp3<-MyData[MyData$Decision=="Admit",])
## The median for Admits is:
(MyMed2<-median(Temp3$WritingScore, na.rm=TRUE))
## NOW - replace the incorrect Admit scores with this median
MyData$WritingScore[MyData$Decision=="Admit" & MyData$WritingScore<85] <- MyMed2
## check again
(MyL4<-ggplot(MyData, aes(x=Decision, y=WritingScore))+
geom_boxplot()+
geom_jitter(position=position_jitter(.01), aes(color=Gender))+
ggtitle("Writing Score, Admissions, and Gender"))
## MUCH BETTER!
## We can also look using density area plots...
# Use semi-transparent fill
(MyPlot4<-ggplot(MyData, aes(x=WritingScore, fill=Decision)) +
geom_area(stat ="bin", binwidth=2, alpha=0.5) +
theme_classic())
## Here - using density - we can get a deeper look
MyPlot5 <- ggplot(MyData, aes(WritingScore))
MyPlot5 + geom_density(aes(fill=factor(Decision)), alpha=0.5) +
labs(title="Density plot",
subtitle="Decisions Based on Writing Scores")
### Hmmm - does it seem like WritingScore is really
## related to Admissions?
## Let's run an ANOVA test to see
MyANOVA_WS_Adm <- aov(WritingScore ~ Decision, data = MyData)
# Summary of the analysis
summary(MyANOVA_WS_Adm) ## The test IS significant!
plot(MyANOVA_WS_Adm, 1)
## The above shows we can assume the homogeneity of variances.
plot(MyANOVA_WS_Adm, 2) ## Close to normal
library("ggpubr")
ggboxplot(MyData, x = "Decision", y = "WritingScore",
color = "Decision", palette = c("#00AFBB", "#E7B800","green"),
ylab = "WritingScore", xlab = "Decision")
## Let's add labels...
(TheMean <- ddply(MyData, .(Decision), summarize,
mean2 = round( mean(WritingScore) ,2 )))
## Another View...
(MyV2 <- ggplot(MyData, aes(x=Decision, y=WritingScore, fill=Decision)) +
geom_violin(trim=TRUE)+ geom_boxplot(width=0.1)+
geom_text(data = TheMean,
aes(x = Decision, y = mean2, label = mean2),
size = 3, vjust = -1.5,hjust=-1)+
ggtitle("Writing Score and Admissions Decision")+
geom_jitter(shape=16, position=position_jitter(0.2)))
###########################################
## The last variable is VolunteerLevel
##
##############################################
str(MyData$VolunteerLevel)
## This should NOT be an int
## Correct it to factor
MyData$VolunteerLevel <- as.factor(MyData$VolunteerLevel)
table(MyData$VolunteerLevel)
(MyG1<-ggplot(MyData) +
geom_bar(aes(VolunteerLevel, fill = Decision)) +
ggtitle("Decision by Volunteer Level")+
coord_flip())
## Everything looks good here and we can SEE what we have.
####################################################
##
## Now for the Visual EDA - exploring
##
####################################################
## Let's look at variables with respect to each other
## Is there a relationship between Admission and Gender?
#install.packages("GGally")
library(GGally)
names(MyData)
ggpairs(data=MyData, columns=c(1,2,5),
mapping=ggplot2::aes(color = Decision))
table(MyData$Gender)
# Stacked - because we want to see the Decision counts within each Gender
ggplot(MyData, aes(x=Gender, fill=Decision)) +
geom_bar(position="stack")
##############################
## Let's look at Gender
## and the other variables
#################################
names(MyData)
## Gender and Decision
ggpairs(data=MyData, columns=c(1,2), title="Gender and Admissions",
mapping=ggplot2::aes(color = Decision),
lower=list(combo=wrap("facethist",binwidth=1)))
## Let's look at Gender and GPA with Decision
ggpairs(data=MyData, columns=c(1,2,5), title="Gender, GPA, and Admissions",
mapping=ggplot2::aes(color = Decision),
lower=list(combo=wrap("facethist",binwidth=1)))
## Can we do more with ggpairs? YES!
(MyGG<-ggpairs(MyData,
columns = c(1,2,5),
title = "Decision",
labeller = "label_value",
upper = list(continuous = "density", combo = "box_no_facet"),
lower = list(continuous = "points", combo = "dot_no_facet"),
mapping=ggplot2::aes(color = Decision)
))
## Heat maps for correlations
ggcorr(MyData[,c(5,6,7,8)], low = "yellow", mid = "orange", high = "darkred")
#names(MyData)
###########################################
## Questions we can ask about States
## 1) Are there gender differences between the states?
## 2) Are there Admissions diff between states?
## 3) Are there GPA diff among states?
## etc.
###################################
library(gridExtra)
p1 <- ggplot(MyData) +
geom_bar(aes(x = State, y = stat(count), fill = Gender))+
theme(axis.text.x=element_text(angle=90,hjust=1))+
geom_text(stat='count',aes(State, label=..count..),vjust=-1)
p2 <- ggplot(MyData) +
geom_col(aes(x = State, y = GPA, fill = Decision),
position = "dodge") +
theme(axis.text.x=element_text(angle=90,hjust=1))
grid.arrange(p1, p2, nrow = 2)
p2+ facet_wrap(~ Gender)
####################################
#
# Let's look at quant measures
# with respect to Admissions and Gender
## Such as: GPA, WorkExp, TestScore, WritingScore
##
#############################################
#names(MyData)
## Are there correlations between our
## quantitative variables?
(MyGG<-ggpairs(MyData,
columns = c(5:8),
title = "Decision",
labeller = "label_value",
upper = list(continuous = "density", combo = "box_no_facet"),
lower = list(continuous = "points", combo = "dot_no_facet"),
mapping=ggplot2::aes(color = Decision)
))
## Is Work Experience related to GPA?
ggplot(MyData, aes(WorkExp, GPA)) +
geom_point(shape = 21,
color = "lightblue",
fill = "white",
size = 2,
stroke = 2)
## Let's add a regression line
ggplot(MyData,aes(x=WorkExp, y=GPA)) +
geom_point(shape = 21,
color = "lightblue",
fill = "white",
size = 2,
stroke = 2) +
geom_smooth(method='lm', formula= y~x)
## Correlation
cor(MyData$WorkExp,MyData$GPA)
## Nope! There is no real correlation here.
### What about fitting a quadratic to it??
ggplot(MyData,aes(x=WorkExp, y=GPA)) +
geom_point(shape = 21,
color = "lightblue",
fill = "white",
size = 2,
stroke = 2) +
stat_smooth(method = "lm",
formula = y ~ x + I(x^2),
size = 1)
## -----------------------
## How about GPA and TestScore
ggplot(MyData,aes(x=TestScore, y=GPA)) +
geom_point(shape = 21,
color = "lightgreen",
fill = "white",
size = 2,
stroke = 2) +
stat_smooth(method = "lm",
formula = y ~ x + I(x^2),
size = 1)
## Yes! Here we have a correlation for sure!
cor(MyData$TestScore,MyData$GPA) ## r value > .70
##--------------------------------------------------------
###### Is there is a relationship between GPA and Decision?
## Using functions
## Labeling properly
GetMeasures <- function(x){
#out <- quantile(x, probs = c(0.25,0.5,0.75))
out<-median(x)
#names(out) <- c("ymin","y","ymax")
return(out)
}
ggplot(MyData, aes(Decision,y=GPA, fill=Decision))+
geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
labs(title="GPA by Admissions")+
stat_summary(fun.y=GetMeasures, hjust = -2,
vjust=-4,
geom="text",
aes(label=sprintf("%1.1f", ..y..)),
position=position_nudge(y=-.1),
colour="black", size=3)+
geom_boxplot(width=0.1)+
theme_classic()
#########################
## Is there a relationship between GPA and Gender?
ggplot(MyData, aes(Gender,y=GPA, fill=Gender))+
geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
labs(title="GPA by Gender")+
stat_summary(fun.y=GetMeasures, hjust = -1,
vjust=-3,
geom="text",
aes(label=sprintf("%1.1f", ..y..)),
position=position_nudge(y=-.3),
colour="black", size=2)+
geom_boxplot(width=0.1)+
theme_classic()
####### LABEL THE QUARTILES ----------------------
ggplot(MyData, aes(x=factor(Decision), y=GPA, fill=factor(Decision))) +
geom_boxplot(width=0.6) +
stat_summary(geom="text", fun.y=quantile,
hjust = -.2,
vjust=0,
aes(label=sprintf("%1.1f", ..y..), color=Decision),
position=position_nudge(x=0.3), size=3) +
theme_bw()
#########################################
## Is there a relationship between GPA and
## Writing Score?
#####
ggplot(data = MyData, aes(x = GPA, y = WritingScore)) +
geom_point(aes(color = Decision, size=GPA))
ggplot(data=MyData,
aes(x=GPA, color=Decision,
y=WritingScore,
size = VolunteerLevel)) +
geom_point(alpha=0.5)
######################################
## Are Volunteer Level and Admissions related?
#############
## NOTE: These are both qualitative.
library(ggpubr)
ggballoonplot(MyData, x = "Decision", y = "VolunteerLevel", size = "WorkExp",
fill = "Decision", facet.by = "Gender",
ggtheme = theme_bw())
############### Which States have more Admits?
## Is there a relationship with Test Score and/or Gender?
###
ggballoonplot(MyData, x = "Decision", y = "State", size = "TestScore",
fill = "Decision", facet.by = "Gender",
ggtheme = theme_bw())
#####################
# What are the correlations (r values)
## For all the quant. variable?
library(corrplot)
## Create a correlation matrix
MyData_correlationMatrix <-
cor(MyData[,c(5,6,7,8)],use="complete.obs")
## Visualize the correlation matrix
c1<-corrplot(MyData_correlationMatrix,method="number")
c2<-corrplot(MyData_correlationMatrix,method="ellipse")
c3<-corrplot(MyData_correlationMatrix,method="circle")
c4<-corrplot(MyData_correlationMatrix,method="color")
c1 ## this will give you the correlations as a table
corrplot.mixed(MyData_correlationMatrix, upper="ellipse")
#############################################
## Let's look at some time series.....
###############################################
library(scales)
ggplot(aes(x = DateSub, y = GPA), data = MyData) +
geom_point()
ggplot(aes(x = DateSub, y = GPA), data = MyData) +
geom_line()
ggplot(MyData, aes(DateSub, GPA)) +
geom_point(aes(color = Decision, size = GPA, shape=VolunteerLevel)) +
ggtitle("GPA Over Time with Decision and Gender") +
xlab("Date") + ylab("GPA")+
facet_wrap(~Gender,nrow=2)
### How about Admission over time
ggplot(MyData, aes(DateSub, Decision)) +
geom_point(aes(color = Decision, size = GPA)) +
ggtitle("Decisions Over Time - Does the WHEN Matter?") +
xlab("Date") + ylab("")+
facet_wrap(~Gender,nrow=2)
#### Consider all quant variables over time - compared.
## Will need to normalize the data first. Why?
(Temp<-MyData[,c(5,6,7,8)])
#(Temp_scaled <- as.data.frame(scale(Temp)))
## OR - even better
## install.packages("clusterSim")
library(clusterSim)
(Temp_Norm<-data.Normalization(Temp,type="n4",normalization="column"))
## RE:####################################################
##n1 - standardization ((x-mean)/sd)
##n2 - positional standardization ((x-median)/mad)
##n3 - unitization ((x-mean)/range)
##n3a - positional unitization ((x-median)/range)
##n4 - unitization with zero minimum ((x-min)/range)
######################################################
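## A minimal sketch (illustrative only) of the n1 and n4 formulas by hand on one
## column, to compare with the data.Normalization() output above:
( (Temp$GPA - mean(Temp$GPA)) / sd(Temp$GPA) )                    ## n1: (x-mean)/sd
( (Temp$GPA - min(Temp$GPA)) / (max(Temp$GPA) - min(Temp$GPA)) )  ## n4: (x-min)/range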
## Now make your new DF
(NewDF<-cbind(Temp_Norm, DateSub=MyData$DateSub))
ggplot(NewDF, aes(x=DateSub))+
geom_line(aes(y=GPA, color="Blue"))+
geom_point(aes(y=TestScore, color="Green"))
#geom_line(data=df2,aes(color="Second line"))+
#geom_line(data=df2,aes(color="Second line"))+
#labs(color="Legend text")
## ---------------------
## Create Time Series Data....
## -------------------------------------
##The ts() function will convert a numeric vector
##into an R time series object.
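## A minimal sketch of ts() itself (illustrative only - the name ts_sketch is
## just for this example; the xts approach is what we actually use below):
(ts_sketch <- ts(MyData$GPA, start = 1, frequency = 1))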
library(xts)
library(zoo)
library(lubridate)
## TS Objects MUST have only numeric columns.
## I am using columns 5, 6, 7, and 8 in the TS dataset.
(My_TS_Data <- xts(x = MyData[,c(5,6,7,8)], order.by = MyData$DateSub))
plot(My_TS_Data$GPA)
plot(My_TS_Data[,c(1,2)], plot.type="s")
plot(My_TS_Data[,c(1,2)], type="o", col=4)
## Using ggplot without a TS object...
ggplot(data = MyData, aes(DateSub, GPA) ) +
ylab(' GPA') +
geom_line(col="blue") +
geom_point(col="magenta", pch=1)+
ggtitle("GPA Over Time")
## Using ggplot and two numeric variables over time
## without a TS object
ggplot(data = MyData, aes(x=DateSub) ) +
ylab('Scores') +
geom_line(aes(y=TestScore/150 , col='Test Score - normalized'), size=1, alpha=.5) +
geom_line(aes(y=WritingScore/10, col='Writing Score - norm'), size=1, alpha=.5) +
ggtitle("Test Scores and Writing Scores")+
theme(legend.position=c(.1,.85))
################################################
##################################################
## Machine Learning Methods and Visualization
##
## Clustering
## Association Rule Mining
##
## Naive Bayes
## Decision Trees
## SVMs
##
##################################################
## PART 1: Unsupervised - Clustering
## k-means and hierarchical
##-----------------------------------------------
#################################
#
# HUGELY IMPORTANT
#
####### FOr any model or method
####### Make sure the data is
####### in the right (and a smart)
####### format. For example - for
####### clustering - you must REMOVE
####### the labels!
#
#####################################
MyData
## To cluster - remove the label
## and remove any non-numeric variables
MyClusterData<-MyData[,c(5,6,7,8)]
head(MyClusterData)
## Next - normalize!
## This is especially important if you
## plan to use cosine sim as a distance
## measure.
## MIN - MAX Function
normalize <- function(x) {
return ((x - min(x)) / (max(x) - min(x)))
}
## Apply the min-max normalization column-by-column
Norm_Data<-as.data.frame(lapply(MyClusterData, normalize))
head(Norm_Data)
## OK! Now we have a dataframe that is
## appropriate for clustering
library(mclust)
library(e1071)
library(cluster)
ClusFIT1 <- Mclust(Norm_Data,G=3)
summary(ClusFIT1)
plot(ClusFIT1, what = "classification")
## Since we know that GPA and TestScore
## are most related to decision - let's
## LOOK at just those
ClusFIT2 <- Mclust(Norm_Data[,c(1,3)],G=3)
summary(ClusFIT2)
plot(ClusFIT2, what = "classification")
## Let's also look at this without
## normalized data
ClusFIT3 <- Mclust(MyClusterData[,c(1,3)],G=3)
summary(ClusFIT3)
plot(ClusFIT3, what = "classification")
## What does this show?
##
## Here, we SEE that we have clear clusters
## for Admit, Waitlist, and Decline.
################
## Example 2
C_Data<-MyClusterData[,c(3,1)]
## Add row names for the cluster vis
## Create "unique" but useful row names...
rownames(C_Data) <- paste((as.character(MyClusterData[,1])),
"_",
rownames(C_Data), sep="")
str(C_Data)
kmeansFIT <- kmeans(C_Data,3)
(kmeansFIT)
## This shows the cluster centroid means
kmeansFIT$centers
plot(kmeansFIT$cluster)
plot(kmeansFIT$centers)
##########################################
library(factoextra)
fviz_cluster(list
(data = C_Data,
cluster = kmeansFIT$cluster,
show.clust.cent = TRUE,
labelsize = 1,
main = "Cluster of Admissions Data",
ellipse.type = "norm",
outlier.shape = 10,
outlier.pointsize = 5,
outlier.labelsize = 5,
pointsize = .5,
repel = TRUE
))
####################
## Hierarchical clustering
##############################################
d_E <- dist(C_Data, method = "euclidean") # distance matrix
fit1 <- hclust(d_E, method="ward.D")
plot(fit1) # display dendrogram
groups <- cutree(fit1, k=3) # cut tree into 3 clusters
# draw dendrogram with red borders around the 3 clusters
rect.hclust(fit1, k=3, border="red")
# Hierarchical clustering using Complete Linkage
fit2 <- hclust(d_E, method = "complete" )
# Plot the obtained dendrogram
plot(fit2, cex = 0.6, hang = -1)
fit3 <- agnes(C_Data, method = "ward")
pltree(fit3, cex = 0.6,
hang = -1, main = "Dendrogram of Data")
plot(fit1, cex = 0.6)
rect.hclust(fit1, k = 3, border = 2:5)
####################
## Determine optimal number of clusters....
##---------------------------------------------
fviz_nbclust(MyClusterData, FUN = hcut, method = "wss")
## We can see that we are right to use k = 3
##################################################
##
## Association Rule Mining
##
##################################################
##
## We first need to format the data - BUT -
## we need to know what we are trying to associate??
head(MyData)
## With ARM - the idea is to see whether certain items (words/labels)
## appear together more often than expected by chance.
## We CANNOT do this with numeric data and so we must
## discretize or remove all numeric or date data.
## We must end up with ONLY categorical data.
names(MyData)
## Columns that are fine as-is are Decision, Gender, and
## VolunteerLevel. All other columns must be discretized.
sort(MyData$DateSub)
## New DF
MyARM_Data<-data.frame()
str(MyARM_Data)
library(dplyr)
library(lubridate)
## First, convert all dates to a month
(Temp4<-MyData %>% mutate(month = month(DateSub)))
## Then, cut by month
(Temp5 <-
cut(Temp4$month,
breaks=c(0, 1, 3, 11, 12),
labels=c("Jan","Feb_Mar", "Nov","Dec")))
## Next, convert the GPA
(Temp6 <-
cut(MyData$GPA,
breaks=c(0, 3.1, 3.4, 3.59, 3.8, 4),
labels=c("C_B-", "B","A-", "A", "A+")))
(MyARM_Data<-cbind(as.data.frame(Temp6), as.data.frame(Temp5)))
names(MyARM_Data)<-c("GPALevel","SubDate")
MyARM_Data
## Next, discretize the others...
(Temp7 <-
cut(MyData$TestScore,
breaks=c(0, 700, 800, 900, 1000),
labels=c("Low", "Medium","High", "VeryHigh")))
(MyARM_Data<-cbind(as.data.frame(Temp7), MyARM_Data))
names(MyARM_Data)<-c("TestScore", "GPALevel","SubDate")
MyARM_Data
## Let's include other categorical variables as well
names(MyData)
(MyARM_Data<-cbind(MyARM_Data, MyData$Decision, MyData$Gender, MyData$VolunteerLevel))
MyARM_Data
## OK! Now we have a dataset such that
##
## EACH ROW IS A TRANSACTION containing meaningful words/labels
## Apply ARM
library(arules)
library(arulesViz)
## IF ERROR - use detach and then install the library
## detach("package:arulesViz", unload=TRUE)
## detach("package:arules", unload=TRUE)
## then - run
## library(arules)
## library(arulesViz)
##again
MY_rules <- arules::apriori(MyARM_Data,
parameter = list(supp = 0.25, conf = 0.25,
target = "rules", minlen=2))
inspect(MY_rules[1:10])
SortedRules_by_conf <- sort(MY_rules, by="confidence", decreasing=TRUE)
inspect(SortedRules_by_conf[1:10])
SortedRules_by_sup <- sort(MY_rules, by="support", decreasing=TRUE)
inspect(SortedRules_by_sup[1:10])
####### Visualize the results
## Uses arulesViz
plot (SortedRules_by_conf,
method="graph",
engine='interactive',
shading="confidence")
plot (SortedRules_by_sup,
method="graph",
engine='interactive',
shading="confidence")
#######################################################
######## Using NetworkD3 To View Results ###########
#######################################################
# install.packages("tcltk")
# install.packages("igraph")
# install.packages("networkD3")
library(networkD3)
library(tcltk)
library(igraph)
## Build node and edge data in the proper format
## Build the edgeList which will have:
## ---- SourceName
## ---- TargetName
## ---- Weight,
## ---- SourceID
## ---- TargetID
## Convert the RULES to a DATAFRAME
Rules_DF2<-DATAFRAME(SortedRules_by_conf, separate = TRUE)
(head(Rules_DF2))
str(Rules_DF2)
## Convert to char
Rules_DF2$LHS<-as.character(Rules_DF2$LHS)
Rules_DF2$RHS<-as.character(Rules_DF2$RHS)
## Remove all {}
Rules_DF2[] <- lapply(Rules_DF2, gsub, pattern='[{]', replacement='')
Rules_DF2[] <- lapply(Rules_DF2, gsub, pattern='[}]', replacement='')
head(Rules_DF2)
## Other options for the following
#Rules_Lift<-Rules_DF2[c(1,2,5)]
#Rules_Conf<-Rules_DF2[c(1,2,4)]
#names(Rules_Lift) <- c("SourceName", "TargetName", "Weight")
#names(Rules_Conf) <- c("SourceName", "TargetName", "Weight")
#head(Rules_Lift)
#head(Rules_Conf)
###########################################
###### Do for SUp, Conf, and Lift #######
###########################################
## Remove the sup, conf, and count
## USING LIFT
Rules_L<-Rules_DF2[c(1,2,5)] ## Note that 1 is LHS, 2 is RHS, and 5 is lift
names(Rules_L) <- c("SourceName", "TargetName", "Weight")
head(Rules_L,30)
## USING SUP
Rules_S<-Rules_DF2[c(1,2,3)]
names(Rules_S) <- c("SourceName", "TargetName", "Weight")
head(Rules_S,30)
## USING CONF
Rules_C<-Rules_DF2[c(1,2,4)]
names(Rules_C) <- c("SourceName", "TargetName", "Weight")
head(Rules_C,30)
## !!!!!!!!!!!!!!!!!!!!!!!!!
## CHoose and set
## !!!!!!!!!!!!!!!!!!!!!!!
#Rules_Sup<-Rules_C
Rules_Sup<-Rules_S ## Here - I will use weight as Support for my edges
#Rules_Sup<-Rules_L
###########################################################################
############# Build a NetworkD3 edgeList and nodeList ############
###########################################################################
## Just using igraph..........
edgeList<-Rules_Sup
# Create a graph. Use simplify to ensure that there are no duplicated edges or self loops
MyGraph <- igraph::simplify(igraph::graph.data.frame(edgeList, directed=TRUE))
plot(MyGraph)
## ................................
############################### BUILD THE NODES & EDGES ####################################
(edgeList<-Rules_Sup)
## Create an igraph object
MyGraph <- igraph::simplify(igraph::graph.data.frame(edgeList, directed=TRUE))
## Create the properly formatted node list
nodeList <- data.frame(ID = c(0:(igraph::vcount(MyGraph) - 1)),
# because networkD3 library requires IDs to start at 0
nName = igraph::V(MyGraph)$name)
## Node Degree
(nodeList <- cbind(nodeList, nodeDegree=igraph::degree(MyGraph,
v = igraph::V(MyGraph), mode = "all")))
## Determine and include Betweenness
BetweenNess <- igraph::betweenness(MyGraph,
v = igraph::V(MyGraph),
directed = TRUE)
(nodeList <- cbind(nodeList, nodeBetweenness=BetweenNess))
## For scaling (FYI), you could divide by the following:
## RE:https://en.wikipedia.org/wiki/Betweenness_centrality
##/ ((igraph::vcount(MyGraph) - 1) * (igraph::vcount(MyGraph)-2))
## For undirected / 2)
## Min-Max Normalization
##BetweenNess.norm <- (BetweenNess - min(BetweenNess))/(max(BetweenNess) - min(BetweenNess))
## Node Degree
###################################################################################
########## BUILD THE EDGES #####################################################
#############################################################
# Recall that ...
# edgeList<-Rules_Sup
getNodeID <- function(x){
which(x == igraph::V(MyGraph)$name) - 1 #IDs start at 0
}
## These are the node names...
igraph::V(MyGraph)$name
## How to retrieve a node ID - you need to know the node names to do this
(getNodeID("MyData$Decision=Admit"))
edgeList <- plyr::ddply(
Rules_Sup, .variables = c("SourceName", "TargetName" , "Weight"),
function (x) data.frame(SourceID = getNodeID(x$SourceName),
TargetID = getNodeID(x$TargetName)))
head(edgeList)
nrow(edgeList)
########################################################################
############## Dice Sim ################################################
###########################################################################
#Calculate Dice similarities between all pairs of nodes
#The Dice similarity coefficient of two vertices is twice
#the number of common neighbors divided by the sum of the degrees
#of the vertices. Method dice calculates the pairwise Dice similarities
#for some (or all) of the vertices.
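## A minimal sketch (illustrative only) of that formula by hand for the first
## two nodes, to compare with similarity.dice() below:
## 2 * |common neighbors| / (degree(v1) + degree(v2))
n1 <- igraph::neighbors(MyGraph, 1, mode = "all")
n2 <- igraph::neighbors(MyGraph, 2, mode = "all")
2 * length(intersect(as.numeric(n1), as.numeric(n2))) /
  (igraph::degree(MyGraph, 1, mode = "all") + igraph::degree(MyGraph, 2, mode = "all"))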
DiceSim <- igraph::similarity.dice(MyGraph, vids = igraph::V(MyGraph), mode = "all")
head(DiceSim)
#Create data frame that contains the Dice similarity between any two vertices
F1 <- function(x) {data.frame(diceSim = DiceSim[x$SourceID +1, x$TargetID + 1])}
#Place a new Dice Sim column in edgeList
head(edgeList)
edgeList <- plyr::ddply(edgeList,
.variables=c("SourceName", "TargetName", "Weight",
"SourceID", "TargetID"),
function(x) data.frame(F1(x)))
head(edgeList)
## NetworkD3 Object
#https://www.rdocumentation.org/packages/networkD3/versions/0.4/topics/forceNetwork
## First - check everything:
head(edgeList)
head(nodeList)
D3_network_Adm <- networkD3::forceNetwork(
Links = edgeList, # data frame that contains info about edges
Nodes = nodeList, # data frame that contains info about nodes
Source = "SourceID", # ID of source node
Target = "TargetID", # ID of target node
Value = "Weight", # value from the edge list (data frame) that will be used to value/weight relationship amongst nodes
NodeID = "nName", # value from the node list (data frame) that contains node description we want to use (e.g., node name)
Nodesize = "nodeDegree", ## or nodeBetweenness", # value from the node list (data frame) that contains value we want to use for a node size
Group = "nodeDegree", # value from the node list (data frame) that contains value we want to use for node color
height = 600, # Size of the plot (vertical)
width = 600, # Size of the plot (horizontal)
fontSize = 12, # Font size
linkDistance = networkD3::JS("function(d) { return d.value*100; }"), # Function to determine distance between any two nodes, uses variables already defined in forceNetwork function (not variables from a data frame)
linkWidth = networkD3::JS("function(d) { return d.value; }"),# Function to determine link/edge thickness, uses variables already defined in forceNetwork function (not variables from a data frame)
opacity = 0.9, # opacity
zoom = TRUE, # ability to zoom when click on the node
opacityNoHover = 0.9, # opacity of labels when static
linkColour = "red" # edge colors
)
# Plot network
D3_network_Adm
# Save network as html file
networkD3::saveNetwork(D3_network_Adm,
"NetD3_Adm.html", selfcontained = TRUE)
####################################################################
##
## Supervised Methods: Decision Trees
##
#######################################################################
library(e1071)
library(caret)
library(rpart) ## For DT
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.1.0 Copyright (c) 2006-2017 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
library(rpart.plot)
library(RColorBrewer)
library(Cairo)
## To perform supervised methods - we need to format the data
## and we need to create Training and Testing sets from the dataset.
head(MyData)
## Make sure all types are correct and Decision is type: FACTOR
## Decision should have 3 levels - Gender 2 levels - etc.
str(MyData)
## double-check for NAs
## Note that we already cleaned the data above - so we will not repeat that here
(sum(is.na(MyData)))
## -----------------------------
## Create the Train and Test sets
## ----------------------------------------
## First - remove the date column from the dataset
names(MyData)
head(MyData2<-MyData[,-3]) ## -3 because this is the date column
nrow(MyData2)
set.seed(1234)
(MySample <- sample(nrow(MyData2),nrow(MyData2)*.3))
Testing_Set <- MyData2[MySample,]
Training_Set <- MyData2[-MySample,]
head(Testing_Set)
head(Training_Set)
## Check the label balance! If it's not excellent - re-sample.
## If you set a seed each time, you can keep the one you like.
## Testing Set
TestG<-ggplot(Testing_Set) +
geom_bar(aes(x = Decision, y = stat(count), fill = Decision))+
theme(axis.text.x=element_text(angle=90,hjust=1))+
geom_text(stat='count',aes(Decision, label=..count..),vjust=2)
TrainG<-ggplot(Training_Set) +
geom_bar(aes(x = Decision, y = stat(count), fill = Decision))+
theme(axis.text.x=element_text(angle=90,hjust=1))+
geom_text(stat='count',aes(Decision, label=..count..),vjust=2)
grid.arrange(TestG, TrainG, nrow = 2)
##-------------------------------------------------------
## HUGE !!!!!!!!!!!
##
## Now- remove the labels from the Test Data and Keep them
##--------------------------------------------------------
head(Testing_Set)
MyTestLabels<-Testing_Set[,1]
head(MyTestLabels)
Testing_Set_No_Labels<-Testing_Set[,-1]
head(Testing_Set_No_Labels)
###############################
## Train and Test the Tree
##
## Visualize each tree as you
## update.
##############################################
## Train the tree
## Recall that the tree you get depends on the (randomly sampled)
## training data, and that there are a very large number of
## possible trees.
##
## By making updates to the data - such as selecting
## specific columns, using feature generation,
## or using discretization - different trees can be
## created and visualized.
#################################################
##
MyTree1 <- rpart(Decision ~ ., data = Training_Set, method="class")
summary(MyTree1)
## What was the most important variable? (TestScore)
## What was the second most important? (GPA)
## Which was least important? (Gender)
##--------------------Predictions...................
## Check your model on the Test data
## Notice you MUST use the Test set with NO LABELS
MyModelPrediction= predict(MyTree1,Testing_Set_No_Labels, type="class")
(MyResults <- data.frame(Predicted=MyModelPrediction,Actual=MyTestLabels))
## Basic Confusion Matrix
(MyTable<-table(MyModelPrediction,MyTestLabels))
str(MyTable)
## Create a DF from the table to use in the heat map below...
(MyTable_DF<-as.data.frame(MyTable))
str(MyTable_DF)
## Alternative - this offer sensitivity, specificity, accuracy, etc.
#(MyConf_Mat <- confusionMatrix(MyModelPrediction,MyTestLabels))
## To see the table:
#MyConf_Mat$table
#str(MyConf_Mat)
## Nicer Confusion Matrix - but only works on 2x2
## fourfoldplot(MyConf_Mat$table)
## Using ggplot heatmap to build a confusion matrix
ggplot(MyTable_DF, aes(x=MyModelPrediction, y=MyTestLabels, fill=Freq)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "steelblue")+
geom_text(aes(label=Freq))
#################################
##
## Now - let's build some trees!
##
##########################################
fancyRpartPlot(MyTree1)
## Since TestScore is taking over - let's
## build a tree without it...
MyTree2 <- rpart(Decision ~ GPA+Gender, data = Training_Set, method="class")
#summary(MyTree2)
fancyRpartPlot(MyTree2)
## How about WritingScore and VolunteerLevel?
MyTree3 <- rpart(Decision ~ WritingScore + VolunteerLevel,
data = Training_Set, method="class")
#summary(MyTree3)
fancyRpartPlot(MyTree3)
##Setting cp to a negative amount
## ensures that the tree will be fully grown.
MyTree4 <- rpart(Decision ~ WritingScore +GPA,
data = Training_Set, method="class", cp=-1)
#summary(MyTree4)
fancyRpartPlot(MyTree4)
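## As mentioned above, discretization is another way to get a different tree.
## A minimal sketch (the GPA bands and the names Training_Set_Disc, GPA_Band,
## and MyTree5 are illustrative assumptions, not part of the original script):
Training_Set_Disc <- Training_Set
Training_Set_Disc$GPA_Band <- cut(Training_Set_Disc$GPA,
                                  breaks=c(0, 3.3, 3.7, 4),
                                  labels=c("Low", "Mid", "High"))
MyTree5 <- rpart(Decision ~ GPA_Band + WritingScore,
                 data = Training_Set_Disc, method="class")
## (If your sample happens to give a tree with no splits,
##  fancyRpartPlot will complain - re-run or adjust the bands.)
fancyRpartPlot(MyTree5)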
######################### There are so many options to try!
#-----------------------------------------
########## Information: Top Attributes ###
#-----------------------------------------
library(FSelector)
(My_FSelector <- FSelector::information.gain(Decision ~ .,data=Training_Set))
####################################################
##
## Random Forest
##
###################################################
##install.packages("randomForest")
## Save and restart if you need to - SAVE FIRST
library(randomForest)
My_RF <- randomForest(Decision ~ .,data=Training_Set)
RF_Pred= predict(My_RF,Testing_Set_No_Labels, type="class")
## Basic Confusion Matrix
(My_RF_Table<-table(RF_Pred,MyTestLabels))
## Create a DF from the table to use in the heat map below...
(My_RF_Table_DF<-as.data.frame(My_RF_Table))
## Using ggplot heatmap to build a confusion matrix
ggplot(My_RF_Table_DF,
aes(x=RF_Pred, y=MyTestLabels, fill=Freq)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "steelblue")+
geom_text(aes(label=Freq))
## Other options
My_RF$confusion
My_RF$importance ## Includes GINI
##My_RF$forest - UGLY
## Using party ###############
library(party)
## Important ##
## Random Forest is RANDOM!
## So you will get different answers each time - AND -
## if it does not run right - just run it again
##
CF_RF<-cforest(Decision ~ .,
data=Training_Set,
controls=cforest_control(mtry=2, mincriterion=0))
#CF_RF@data
## Note - the "@" accessor is used with S4 objects the way $ is used with data frames
MyParty<-party:::prettytree(CF_RF@ensemble[[1]],
names(CF_RF@data@get("input")))
MyNewTree <- new("BinaryTree")
MyNewTree@tree <- MyParty
MyNewTree@data <- CF_RF@data
MyNewTree@responses <- CF_RF@responses
MyNewTree
plot(MyNewTree)
### caret has a good RF model tool as well
## "rf" is random forest
## This is nice as it offers Accuracy and Kappa
library(caret)
RF_caret <- caret::train(Decision~ ., method="rf", data=Training_Set)
RF_caret
############# Using ggplot and caret to see more---------
# Save the variable importance values from our model
## object generated from caret.
(ImportantVariables<-varImp(RF_caret, scale = FALSE))
# Get the row names of the variable importance data
rownames(ImportantVariables$importance)
# Convert the variable importance data into a dataframe
Import_DF <- data.frame(rownames(ImportantVariables$importance),
ImportantVariables$importance$Overall)
head(Import_DF)
# Relabel the data
names(Import_DF)<-c('Platform', 'Importance')
# Order the data from greatest importance to least important
Import_DF <- transform(Import_DF,
Platform = reorder(Platform, Importance))
(Import_DF)
# Plot the data with ggplot.
(MyPlot<-ggplot(data=Import_DF, aes(x=Platform, y=Importance)) +
geom_bar(stat = 'identity',colour = "lightgreen",
fill = "lightblue") +
geom_text(aes(label=round(Importance,2), hjust=-.5))+
coord_flip())
###################################################
## Saving Plots
###################################################
## if you wish...
## setwd("/Users/SomeName/Desktop/etc")
## Options:
#ggsave("HorizBar.jpeg")
#ggsave(MyPlot, file="HorizBar.jpeg")
ggsave(MyPlot, file="HorizBar.jpeg", width=4, height=8)
######################################################
##
## Naive Bayes
##
######################################################
library(e1071)
NBStudentclassfier <- naiveBayes(Decision ~.,
data=Training_Set,
na.action = na.pass)
NBStudentClassifier_Prediction <- predict(NBStudentclassfier,
Testing_Set_No_Labels)
## Basic Confusion Matrix
table(NBStudentClassifier_Prediction,MyTestLabels)
## This creates excellent output
## It is a good idea to think about how to BUILD
## a pretty figure with this output
NBStudentclassfier
## To the figure, you can also include other vis.
## !!!!!!!! Different libraries offer different vis options
## Above, I like the output that e1071 gives.
## But- below - I will use Caret and the vis options...
library(caret)
library(klaR)
## NB in caret
caret_NB = train(Training_Set[,-1], ## data and NOT label
Training_Set[,1], ## the label ONLY
'nb',
trControl=trainControl(method='cv',number=100))
caret_NB
Predict <- predict(caret_NB,Testing_Set )
Predict
table(Predict,MyTestLabels)
# Variable importance - this is a fun plot!
VarImp_NB <- caret::varImp(caret_NB)
plot(VarImp_NB)
##
## Using klaR ----------------------
klaR_NB <- NaiveBayes(Decision ~ ., data = Training_Set)
plot(klaR_NB)
##################################################
##
## Support Vector Machines
##
########################################################
##
head(Training_Set) ## Notice that column 1 is the label
## Notice that columns 4, 5, 6, and 7 are numeric.
head(Testing_Set_No_Labels)
head(MyTestLabels)
str(MyData)
#names(MyData)
## Create NEW DFs with just the numeric data
SVM_Training_Data<-Training_Set[,c(1,4,6)]
SVM_Test_Data_noLabel<-Testing_Set_No_Labels[,c(3,5)] ## Why 3 and 5??
## Because in this test set, those are the column positions of GPA and TestScore
## Recall that Support Vector Machines ONLY WORK ON NUMERIC data
## Consider our LABEL - this is "Decision"
## This must be type factor - and it is - which is good.
## Next, like all other goals in R - there are always
## many different library options.
library(tidyverse) # data manipulation and visualization
library(kernlab) # SVM methodology
library(e1071) # SVM methodology
#install.packages("ISLR")
library(ISLR) # contains example data set "Khan"
library(RColorBrewer) # customized coloring of plots
## Let's start with a fun example
## Let's use only two of our numeric variables and let's
## SEE what the SVM linear predictor looks like
### NOTES --------------------------------------
## The "x" Points are the support vectors
## The "o" points are the other points
## which do not affect the calculation of the linear sep
##----------------------------------------------------
##-------
## plot the dataset
##-----------------------------
# plot the complete dataset
ggplot(data= SVM_Training_Data,
aes(x=GPA,
y=TestScore)) +
geom_point() +
geom_point(aes(color=Decision))+
scale_shape_manual(values=c(1,3)) +
ggtitle("Scatter plot of the complete dataset")
## from e1071
SVM_fit1 <- svm(Decision~., data = SVM_Training_Data, kernel = "linear", scale = FALSE)
SVM_fit1
# Plot Results
plot(SVM_fit1, SVM_Training_Data)
## This worked really well! We can SEE the separating lines.
## Notice that R used TWO (2) SVMS!! WHY??
### Let's see a confusion matrix
SVM_Pred1 <- predict(SVM_fit1, SVM_Test_Data_noLabel)
## Basic Confusion Matrix
table(SVM_Pred1,MyTestLabels)
##---------------------------------------------
## Pretty confusion matrix:
##-------------------------------------------------------
(MyResults2 <- data.frame(Predicted=SVM_Pred1,Actual=MyTestLabels))
## Basic Confusion Matrix
(MyTable2<-table(SVM_Pred1,MyTestLabels))
str(MyTable2)
## Create a DF from the table to use in the heat map below...
(MyTable_DF<-as.data.frame(MyTable2))
str(MyTable_DF)
## BE CAREFUL - you need the steps above to REFORMAT
## So that you can create the heat map.
## Using ggplot heatmap to build a confusion matrix
ggplot(MyTable_DF, aes(x=SVM_Pred1, y=MyTestLabels, fill=Freq)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "steelblue")+
geom_text(aes(label=Freq))
########################################
## Tuning - finding the best cost C
########################################
# find optimal cost of misclassification
BestCost <- tune(svm, Decision~., data = SVM_Training_Data, kernel = "linear",
ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)))
(BEST <- BestCost$best.model)
###################################
## Other kernels....include radial, polynomial, etc.
##########################################################
### IT IS VERY IMPORTANT TO NORMALIZE DATA
## WHEN USING AN SVM!! (Why??)
SVM_fit2 <- svm(Decision~., data = SVM_Training_Data,
kernel = "radial",
scale = TRUE,
cost=10,
gamma=1)
SVM_fit2
plot(SVM_fit2, SVM_Training_Data)
# -------------------------------------POLYNOMIAL ---------
SVM_fit3 <- svm(Decision~., data = SVM_Training_Data,
kernel = "polynomial",
scale = TRUE,
cost=10,
gamma=1)
SVM_fit3
# Plot Results
plot(SVM_fit3, SVM_Training_Data)
#############################################################
## A look at other options for prediction and confusion matrices
################################################################
##################################################################
### Running cross validation, and tuning with cost
## to create the best model.............
##################################################################
BestCost2 <- tune(svm, Decision~., data = SVM_Training_Data, kernel = "linear",
ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)))
summary(BestCost2)
SVM_pred3 <- predict(BestCost2$best.model, SVM_Test_Data_noLabel)
confusionMatrix(SVM_pred3, MyTestLabels)
plot(BestCost2$best.model, data=SVM_Training_Data)
##########################################################
## Using the kernlab library for SVMs
########################################################
#library(kernlab)
SVM_fit4 <- ksvm(Decision ~ ., data = SVM_Training_Data,
kernel = "vanilladot", scaled=TRUE)
#vanilladot is the Euclidean inner product kernel
# option kernel="rbfdot"
## The plot function for ksvm only works for 2-class problems
## Supports only BINARY classification.
## https://rdrr.io/cran/kernlab/man/ksvm.html
## plot(SVM_fit4, data=SVM_Training_Data)
####################################################
## Using *train* to look at parameters
####################################################
(SVM5 <- train(Decision ~., SVM_Training_Data,
method="svmPoly"))
## Visualize Accuracy
plot(SVM5)
###################################################
##
## Cross validation
##
######################################################
CrossVal<-trainControl(method="repeatedcv", repeats=5, classProbs = TRUE)
## Include trying different costs
MyCostGrid<-expand.grid(C=c(.01, .1, 1, 10, 100))
(MySVM6<-train(Decision~.,
data = SVM_Training_Data,
method="svmLinear",
preProc=c("center","scale"),
tuneGrid=MyCostGrid,
metric="Accuracy"))
plot(MySVM6)
###############################################################
## SVM Tutorials
###############################################################
## references for SVMs and math
## https://www.eric-kim.net/eric-kim-net/posts/1/kernel_trick.html
## http://datascienceandme.com/topics/RSupportVectorMachine.html
## https://medium.com/nyu-a3sr-data-science-team/support-vector-machines-and-wine-cef59ad38b41
## https://cran.r-project.org/web/packages/kernlab/kernlab.pdf
## https://www.isical.ac.in/~arnabc/multi/lecsvm4.html
## https://rpubs.com/ryankelly/svm
## #################################################################
################################################################
##
## Regression - Linear and non-linear
##
#################################################################
## Linear regression is used to PREDICT a value
## of a variable (called dependent or outcome or response)
## The goal is to create a linear equation
## that best estimates the values of the
## response.
## Example: Y = b +b1X + error
## Multiple Linear Regression: Y=b+b1X1+b2x2+...
## The "b" values are the coefficients
head(MyData)
## Recall that regression is math
## Therefore, it only works on numeric data.
(MyX<-MyData[,5]) ## Let's use just two variables for now
(MyY<-MyData[,7])
scatter.smooth(x=MyX, y=MyY, main="GPA and TestScores") # scatterplot
## Checking for outliers and looking at the data
par(mfrow=c(1, 2)) # subplot graph area in 2 columns
boxplot(MyX, main="GPA", col="lightblue")
## These are the points that are "far" from the median.
## However, they are not outliers!
(Stats<-boxplot.stats(MyX)$out)
## $out gives the data points beyond the whiskers (potential outliers); see:
#https://www.rdocumentation.org/packages/grDevices/versions/3.6.2/topics/boxplot.stats
boxplot(MyY, main="TestScore", col="lightgreen")
########-------------- DENSITY ------------
#library(e1071)
par(mfrow=c(1, 2)) # 1 row and 2 columns
plot(density(MyX), main="Density Plot: GPA",
ylab="Frequency",
sub=paste("Skewness:", round(e1071::skewness(MyX), 2)))
# density plot for GPA
polygon(density(MyX), col="lightblue")
plot(density(MyY), main="Density Plot: TestScores",
ylab="Frequency",
sub=paste("Skewness:", round(e1071::skewness(MyY), 2)))
# density plot for testscores
polygon(density(MyY), col="lightgreen")
############----------------correlation------------
cor(MyX, MyY) ## Strong Positive!
##---------------------------------------------------
##############-----------Build the LINEAR MODEL------
##-------------------------------------------------------
MyLinearModel1 <- lm(MyX ~ MyY)
# build linear regression model on full data
## Can also do this: MyLinearModel1 <- lm(Var1 ~ Var2, data = yourdata)
print(MyLinearModel1)
## What does this mean??
## It means this:
## GPA = 1.11925 + .00274*TestScore (the response Y here is GPA; the predictor X is TestScore).
summary(MyLinearModel1)
## Pr(>|t|) is the significance of the t-test and it is VERY significant in this case
## YES! We CAN predict GPA with TestScore.
## Read more:
## http://r-statistics.co/Linear-Regression.html
############-----------------------------------
##
## Creating PREDICTIVE linear models
##
###########------------------------------------------
## Here - we need testing and training data.
##
(MySample<-sample(1:nrow(MyData), 0.8*nrow(MyData)))
RegrTEST<-MyData[-MySample,c(5,7)]
head(RegrTEST)
## This gives 20% test data for GPA and TestScore
RegrTRAIN<-MyData[MySample,c(5,7)]
head(RegrTRAIN)
# Build MODEL on training data ...
(Linear_Pred_Model <- lm(GPA ~ TestScore, data=RegrTRAIN) )
## This is the MODEL that was created:
## GPA = 1.288393 + .002558*TestScore
(Linear_Pred <- predict(Linear_Pred_Model, RegrTEST) )
## These are the predictions for GPA per the TestScore for each row.
## Build a data frame to compare the true values
## to the predicted values....
True_ValuesDF <- data.frame(cbind(actual=RegrTEST$GPA,
predicted=Linear_Pred))
correlation_accuracy <- cor(True_ValuesDF)
head(True_ValuesDF)
## Min-Max Accuracy of the prediction
(min_max_accuracy <-
mean(apply(True_ValuesDF, 1, min) / apply(True_ValuesDF, 1, max)) )
## Impressive!! 96.5% accuracy.
################-------------MultiLinear Regression ---
###############-----------------------------------------------
# Multiple Linear Regression Example
#head(MyData)
ML_Reg <- lm(GPA ~ TestScore + WorkExp, data=MyData)
summary(ML_Reg)
## What is the result??
## It is this:
## GPA = 1.125 + .0027*TestScore - .0030*WorkExp
## P value is nearly 0 at .00000682 - very sig!
# Other Options.......
coefficients(ML_Reg) # model coefficients - we see these above.
confint(ML_Reg, level=0.95) # Confidence Intervals for model parameters
fitted(ML_Reg) # predicted values for each row GPA
residuals(ML_Reg) # residuals
anova(ML_Reg) # anova table - the F-test in this case is significant!
vcov(ML_Reg) # covariance matrix for model parameters
# Plot the Model Diagnostics
layout(matrix(c(1,2,3,4),2,2)) # 4 graphs and 2X 2 graph grid
plot(ML_Reg, col=MyData$Decision)
## Read More:
## https://www.statmethods.net/stats/regression.html
###################################################################
##
## Neural Networks and Deep Learning
##
###################################################################
## NNs learn by example like all other supervised learning ML
## methods. ANN stands for artificial neural network.
## NNs try to mimic human brain neurons.
## NNs are non-linear, parallel, adaptive (learning), and complex
## NN "adapt" by changing the weights of internal nodes to better
## meet the needs of the problem.
## Interestingly - NNs are used to solve problems that are normally
## easier for humans - but harder for machines....like knowing
## if someone is sad or finding a sunset in a picture.
## HOW DOES IT WORK
##
## A NN can have a vector of input (many inputs).
## It can weight each input in the vector and can update the
## weights. Each input vector (or data row) is LABELED and so has
## a desired output goal. Like all supervised learning methods, we
## use LABELED TRAINING data to *train* the NN. Then, we use a NON-LABELED
## TEST set to see how well our NN can determine the correct label.
## This idea is the same for all supervised learning methods.
##
## THE FUNCTION OF NN
##
## Y = sum(weight(i)*input(i))+bias
##
## Suppose you have a numeric dataset with variables AGE, WEIGHT, HEIGHT
## Suppose a row (vector) in the dataset is [29, 120, 60].
## Suppose we know the LABEL and it is "Female".
## Suppose a different row in the dataset is [34, 210, 75] and
## the label is "Male".
##
## Then, Y is the label - either Male or Female - but because Y
## will be a number, we will use a THRESHOLD to determine which numbers
## are classified as "Male" and which as "Female".
##
## A possible example (these are made up numbers) might be:
## Y = (29*.10 + 120*.43 + 60*.67) = 94.7 <120 so Female
## Y = (34*.10 + 210*.43 + 75*.67) = 143.95 >120 so Male
##
## What?? WHy 120??
## Answer:
## This is just an example. The 120 is a threshold. Normally, a sigmoid
## is used to create a threshold.
##
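## A minimal sketch of the made-up example above (illustrative only - these
## weights and the 120 threshold are NOT what nnet computes internally):
sigmoid <- function(z) 1 / (1 + exp(-z))
w <- c(.10, .43, .67)              ## made-up weights for AGE, WEIGHT, HEIGHT
row_female <- c(29, 120, 60)
row_male   <- c(34, 210, 75)
sum(w * row_female)                ## 94.7   < 120 -> "Female"
sum(w * row_male)                  ## 143.95 > 120 -> "Male"
## With a sigmoid, we shift by the threshold and cut at 0.5 instead:
sigmoid(sum(w * row_female) - 120) ## close to 0 -> "Female"
sigmoid(sum(w * row_male)   - 120) ## close to 1 -> "Male"
##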
## FIRST (and ignore the warnings)
#install.packages("caret")
#install.packages("nnet")
library(caret)
library(nnet)
## Let's use the SVM datasets because NN also need to be numeric.
(MyTestLabels)
head(SVM_Training_Data)
(SVM_Test_Data_noLabel)
ggplot(SVM_Training_Data, aes(x = GPA, y = TestScore, colour = Decision)) +
geom_point(size=3) +
ggtitle("Admissions")
## Train the NN
## https://www.rdocumentation.org/packages/nnet/versions/7.3-12/topics/nnet
MyNN <- nnet(Decision ~ GPA+TestScore, data=SVM_Training_Data,
size=10, ## 10 nodes in the hidden layer
decay=1.0e-2, ## Changing this can affect results
maxit=100,
linout=TRUE)
MyNN # a 2-10-3 network: 2 inputs (GPA and TestScore), 10 hidden nodes (one hidden layer), and 3 outputs
(Prediction<-predict(MyNN, SVM_Test_Data_noLabel, type="class"))
table(Prediction, MyTestLabels)