Twitter Mining and ARM with R – Gates Bolton Analytics

####### Twitter in R
## Gates
##
## NOTES
##
## Hi There - this is my code :)
## If you use it, please reference me.
##
## In addition, you may need to update it and/or edit it. 

## I created this code for a Conference Talk I gave for DCR 2019
#########################################


## To use this code and/or to get Tweets via a
## Twitter API, you must - GET A TWITTER DEV ACCOUNT
## RE:
## https://drive.google.com/file/d/10uwqGiGLQmi8r3LcK4GlnbDYvvf5xYyr/view?usp=sharing


#########################################
#  Consumer API keys
#  Access token & access token secret

## NOTE: 
## I have created a text file that contains the
## consumerKey, the comsumerSecret, the access_Token, and the access_Secret
## In Twitter - these have other names and things change a lot in Twitter.
## In Twitter:
## consumerKey  is now called the  API key
## comsumerSecret  is now called the    API key secret
## access_Token  is now called the whatever it is called now...its the first hidden one
## access_Secret  is now called the whatever it is called now...its the second hidden one


## IT DOES NOT MATTER WHAT YOU NAME THEM :) As long as the order is the same


## They are comma separated. 
## The name of my file is TwitterConKey_ConSec_AccTok_AccSec.txt
#Insert your consumerKey and consumerSecret below


############################
## GET A TWITTER ACCOUNT
##
##  https://developer.twitter.com/en
##  Choose DEVELOPER PORTAL
##  https://developer.twitter.com/en/dashboard
##
###########################################
#setwd("C:/Users/profa/Documents/R/RStudioFolder_1/DCR2019Networks/")
## Set your own wd

## This is where I am reading in the 4 Twitter codes.
## This code will not work for you unless you get a Twitter
## Dev Account and the 4 codes....
#TwitterCodesFile="C:/users/profa/desktop/TwitterCodesFileCopy.txt"
TwitterCodesFile="C:/Users/profa/Documents/RStudioFolder_1/DCR2019Networks/TwitterConKey_ConSec_AccTok_AccSec_10_10_21.txt"
tokens<-read.csv(TwitterCodesFile, header=T, sep=",")

## Reading in my Twitter codes from a file I created on my computer
## You will need to update this
consumerKey=as.character(tokens$consumerKey)
consumerSecret=as.character(tokens$consumerSecret)
access_Token=as.character(tokens$access_Token)
access_Secret=as.character(tokens$access_Secret)

## Include this
requestURL='https://api.twitter.com/oauth/request_token'
accessURL='https://api.twitter.com/oauth/access_token'
authURL='https://api.twitter.com/oauth/authorize'


########################
## NOTES:
## !!!!!!!!!!!!!!!!!!!!
## READ ME
##
##
## I have commented out many things
## in this code. I have also added a lot
## of commented notes.
##
## !!!! LEAVE things commented as commented out unless you have
## issues. Then, you can try to see what
## will solve the issue. 
## Nothing works forever :)
###################################################
#oauth_endpoint(authorize = "https://api.twitter.com/oauth",
#               access = "https://api.twitter.com/oauth/access_token")

#connect to API
#download.file(url ='http://curl.haxx.se/ca/cacert.pem', destfile ='cacert.pem')


### NOTES: rtweet is another excellent option
## https://mkearney.github.io/blog/2017/06/01/intro-to-rtweet/
### https://rtweet.info/

#install.packages("devtools")
#install.packages("rlang")
library(rlang)
library(usethis)
library(devtools)
#install.packages("base64enc")
library(base64enc)
#install.packages("RCurl")
library(RCurl)

#devtools::install_version("httr", version="0.6.0", repos="http://cran.us.r-project.org")
#devtools::install_version("twitteR", version="1.1.8", repos="http://cran.us.r-project.org")
#devtools::install_github("jrowen/twitteR", ref = "oauth_httr_1_0")

library(httr)
library(twitteR)

### Install the needed packages...
#install.packages("twitteR")
#install.packages("ROAuth")
# install.packages("rtweet")
library(ROAuth)

library(networkD3)
## If trouble use detach and then install and
## do library
library(arules)
library(rtweet)


library(jsonlite)
#install.packages("streamR")
library(streamR)
#install.packages("rjson")
library(rjson)
#install.packages("tokenizers")
library(tokenizers)
library(tidyverse)
library(plyr)
library(dplyr)
library(ggplot2)
#install.packages("syuzhet")  ## sentiment analysis
library(syuzhet)
library(stringr)
library(arulesViz)
library(igraph)

library(httpuv)
library(openssl)

##############  Using twittR ##########################################################
## NOTE - One day - out of no where - the following line of code stopped working
## It worked for 4 years and then stopped :)
## I Google-Searched the error and found some suggestions.
## One was to install base64enc
## This solution did not work...
## The next option was to run
## devtools::install_version("httr", version="0.6.0", repos="http://cran.us.r-project.org")
## and then - that failed - so I tried...
## # Installing the devtools package
##install.packages("devtools")
## AND then
# Using devtools to install an older version
## devtools::install_version("httr", version="0.6.0", repos="http://cran.us.r-project.org")
## and 
## devtools::install_version("twitteR", version="1.1.8", repos="http://cran.us.r-project.org")
## This all took 1.5 hours! Just FYI
############################################---> this happens in open source languages...

# my_oauth <- OAuthFactory$new(consumerKey = consumerKey,
#                              consumerSecret = consumerSecret,
#                              requestURL = requestURL,
#                              accessURL = accessURL,
#                              authURL = authURL)
# 


#install.packages("base64enc")

#Search

setup_twitter_oauth(consumerKey,consumerSecret,access_Token, access_Secret)

(Search<-twitteR::searchTwitter("#football", n=5, lang="en"))


##https://www.rdocumentation.org/packages/twitteR/versions/1.1.9/topics/searchTwitter
# (Search2 <- searchTwitter("football", 
#                          n = 5, 
#                          lang = "en"
#                          #since='2021-03-01', ## need special account
#                          #until='2021-03-02'  ## need special account
#                          ) )
# 



(Search_DF <- twitteR::twListToDF(Search))
TransactionTweetsFile = "TweetResults.csv"
(Search_DF$text[3])

#make data frame
## do.call is a list that holds all arguments in a function
## https://www.stat.berkeley.edu/~s133/Docall.html
##(Search2_DF <- do.call("rbind", lapply(Search2, as.data.frame)))
## OR
#tokenize_tweets(x, lowercase = TRUE, stopwords = NULL, strip_punct = TRUE, 
#                 strip_url = FALSE, simplify = FALSE)

#tokenize_tweets(Search2_DF$text[1],stopwords = stopwords::stopwords("en"), 
#               lowercase = TRUE,  strip_punct = TRUE, 
#               strip_url = TRUE, simplify = TRUE)

## Start the file
Trans <- file(TransactionTweetsFile)
## Tokenize to words 
Tokens<-tokenizers::tokenize_words(
  Search_DF$text[1],stopwords = stopwords::stopwords("en"), 
  lowercase = T,  strip_punct = T, strip_numeric = T,
  simplify = T)

## Write tokens
cat(unlist(Tokens), "\n", file=Trans, sep=",")
close(Trans)

## Append remaining lists of tokens into file
## Recall - a list of tokens is the set of words from a Tweet
Trans <- file(TransactionTweetsFile, open = "a")
for(i in 2:nrow(Search_DF)){
  Tokens<-tokenize_words(Search_DF$text[i],
                         stopwords = stopwords::stopwords("en"), 
                         lowercase = T,  
                         strip_punct = T, 
                         simplify = T)
  
  cat(unlist(Tokens), "\n", file=Trans, sep=",")
  cat(unlist(Tokens))
}

close(Trans)


######### Read in the tweet transactions
# TweetTrans <- read.transactions(TransactionTweetsFile,
#                                 rm.duplicates = FALSE, 
#                                 format = "basket",
#                                 sep=","
#                                 ## cols = 
# )
#inspect(TweetTrans)
## See the words that occur the most
#Sample_Trans <- sample(TweetTrans, 3)
#summary(Sample_Trans)

## Read the transactions data into a dataframe
TweetDF <- read.csv(TransactionTweetsFile, 
                    header = FALSE, sep = ",")
head(TweetDF)
(str(TweetDF))

## Convert all columns to char 
TweetDF<-TweetDF %>%
  mutate_all(as.character)
(str(TweetDF))
# We can now remove certain words
TweetDF[TweetDF == "t.co"] <- ""
TweetDF[TweetDF == "rt"] <- ""
TweetDF[TweetDF == "http"] <- ""
TweetDF[TweetDF == "https"] <- ""

## Check it so far....
TweetDF

## Clean with grepl - every row in each column
MyDF<-NULL
MyDF2<-NULL
for (i in 1:ncol(TweetDF)){
  MyList=c() 
  MyList2=c() # each list is a column of logicals ...
  MyList=c(MyList,grepl("[[:digit:]]", TweetDF[[i]]))
  MyDF<-cbind(MyDF,MyList)  ## create a logical DF
  MyList2=c(MyList2,(nchar(TweetDF[[i]])<4 | nchar(TweetDF[[i]])>11))
  MyDF2<-cbind(MyDF2,MyList2) 
  ## TRUE is when a cell has a word that contains digits
}
## For all TRUE, replace with blank
TweetDF[MyDF] <- ""
TweetDF[MyDF2] <- ""
(head(TweetDF,10))



# Now we save the dataframe using the write table command 
write.table(TweetDF, file = "UpdatedTweetFile.csv", col.names = FALSE, 
            row.names = FALSE, sep = ",")
TweetTrans <- read.transactions("UpdatedTweetFile.csv", sep =",", 
                                format("basket"),  rm.duplicates = TRUE)
#inspect(TweetTrans)

### Clean up the datafile

## For this ERror
## Error in length(obj) : Method length not implemented for class rules 
## DO THIS: (1) detach("package:arulesViz", unload=TRUE)
## (2) detach("package:arules", unload=TRUE)
## (3) library(arules)

## optional after TweetTrans
## Takes LONG time to run
###gsub("[{}]", "", as.character(inspect(TweetTrans_rules@lhs)$items))
#TweetTrans_rules@rhs <- gsub("[{}]", "", as.character(inspect(TweetTrans_rules@rhs)$items))

############ Create the Rules  - Relationships ###########
TweetTrans_rules = arules::apriori(TweetTrans, 
        parameter = list(support=.06, conf=1, minlen=2))
        #maxlen
        #appearance = list (default="lhs",rhs="milk")
inspect(TweetTrans_rules[1:20])
##  SOrt by Conf
SortedRules_conf <- sort(TweetTrans_rules, by="confidence", decreasing=TRUE)
inspect(SortedRules_conf[1:20])
## Sort by Sup
SortedRules_sup <- sort(TweetTrans_rules, by="support", decreasing=TRUE)
inspect(SortedRules_sup[1:20])
## Sort by Lift
SortedRules_lift <- sort(TweetTrans_rules, by="lift", decreasing=TRUE)

####################################################
### HERE - you can affect which rules are used
###  - the top for conf, or sup, or lift...
####################################################
TweetTrans_rules<-SortedRules_lift[1:50]
inspect(TweetTrans_rules)

####################################################
## IF ERROR
## RUN THIS CODE
## detach("package:arulesViz", unload=TRUE)
## detach("package:arules", unload=TRUE)
## library(arules)
## library(arulesViz) ## After arules works
#####################################################

#(SortedRules_sup[1:30])
#plot(SortedRules_sup[1:30],method="graph",engine='interactive', shading="confidence") 
#plot(SortedRules_conf[1:50],method="graph",engine='interactive',shading="confidence") 

#######################################################
########  Using NetworkD3 To View Results   ###########
#######################################################

## Build node and egdes properly formatted data files
## Build the edgeList which will have SourceName, TargetName
##                                    Weight, SourceID, and
##                                    TargetID

#Rules_DF<-as(TweetTrans_rules, "data.frame")
#(head(Rules_DF))

## Convert the RULES to a DATAFRAME
Rules_DF2<-DATAFRAME(TweetTrans_rules, separate = TRUE)
(head(Rules_DF2))
str(Rules_DF2)
## Convert to char
Rules_DF2$LHS<-as.character(Rules_DF2$LHS)
Rules_DF2$RHS<-as.character(Rules_DF2$RHS)

## Remove all {}
Rules_DF2[] <- lapply(Rules_DF2, gsub, pattern='[{]', replacement='')
Rules_DF2[] <- lapply(Rules_DF2, gsub, pattern='[}]', replacement='')

Rules_DF2

## Other options for the following
#Rules_Lift<-Rules_DF2[c(1,2,5)]
#Rules_Conf<-Rules_DF2[c(1,2,4)]
#names(Rules_Lift) <- c("SourceName", "TargetName", "Weight")
#names(Rules_Conf) <- c("SourceName", "TargetName", "Weight")
#head(Rules_Lift)
#head(Rules_Conf)

###########################################
###### Do for SUp, Conf, and Lift   #######
###########################################
## Remove the sup, conf, and count
## USING LIFT
Rules_L<-Rules_DF2[c(1,2,5)]
names(Rules_L) <- c("SourceName", "TargetName", "Weight")
head(Rules_L,30)

## USING SUP
Rules_S<-Rules_DF2[c(1,2,3)]
names(Rules_S) <- c("SourceName", "TargetName", "Weight")
head(Rules_S,30)

## USING CONF
Rules_C<-Rules_DF2[c(1,2,4)]
names(Rules_C) <- c("SourceName", "TargetName", "Weight")
head(Rules_C,30)

## CHoose and set
#Rules_Sup<-Rules_C
Rules_Sup<-Rules_L
#Rules_Sup<-Rules_S

###########################################################################
#############       Build a NetworkD3 edgeList and nodeList    ############
###########################################################################

#edgeList<-Rules_Sup
# Create a graph. Use simplyfy to ensure that there are no duplicated edges or self loops
#MyGraph <- igraph::simplify(igraph::graph.data.frame(edgeList, directed=TRUE))
#plot(MyGraph)

############################### BUILD THE NODES & EDGES ####################################
(edgeList<-Rules_Sup)
(MyGraph <- igraph::simplify(igraph::graph.data.frame(edgeList, directed=TRUE)))

nodeList <- data.frame(ID = c(0:(igraph::vcount(MyGraph) - 1)), 
                       # because networkD3 library requires IDs to start at 0
                       nName = igraph::V(MyGraph)$name)
## Node Degree
(nodeList <- cbind(nodeList, nodeDegree=igraph::degree(MyGraph, 
                    v = igraph::V(MyGraph), mode = "all")))

## Betweenness
BetweenNess <- igraph::betweenness(MyGraph, 
      v = igraph::V(MyGraph), 
      directed = TRUE) 

(nodeList <- cbind(nodeList, nodeBetweenness=BetweenNess))

## This can change the BetweenNess value if needed
#BetweenNess<-BetweenNess/100



## For scaling...divide by 
## RE:https://en.wikipedia.org/wiki/Betweenness_centrality
##/ ((igraph::vcount(MyGraph) - 1) * (igraph::vcount(MyGraph)-2))
## For undirected / 2)
## Min-Max Normalization
##BetweenNess.norm <- (BetweenNess - min(BetweenNess))/(max(BetweenNess) - min(BetweenNess))


## Node Degree


###################################################################################
########## BUILD THE EDGES #####################################################
#############################################################
# Recall that ... 
# edgeList<-Rules_Sup
getNodeID <- function(x){
  which(x == igraph::V(MyGraph)$name) - 1  #IDs start at 0
}
## UPDATE THIS !! depending on # choice
(getNodeID("salary")) 

edgeList <- plyr::ddply(
  Rules_Sup, .variables = c("SourceName", "TargetName" , "Weight"), 
  function (x) data.frame(SourceID = getNodeID(x$SourceName), 
                          TargetID = getNodeID(x$TargetName)))

head(edgeList)
nrow(edgeList)

########################################################################
##############  Dice Sim ################################################
###########################################################################
#Calculate Dice similarities between all pairs of nodes
#The Dice similarity coefficient of two vertices is twice 
#the number of common neighbors divided by the sum of the degrees 
#of the vertices. Method dice calculates the pairwise Dice similarities 
#for some (or all) of the vertices. 
DiceSim <- igraph::similarity.dice(MyGraph, vids = igraph::V(MyGraph), mode = "all")
head(DiceSim)

#Create  data frame that contains the Dice similarity between any two vertices
F1 <- function(x) {data.frame(diceSim = DiceSim[x$SourceID +1, x$TargetID + 1])}
#Place a new column in edgeList with the Dice Sim
head(edgeList)
edgeList <- plyr::ddply(edgeList,
                        .variables=c("SourceName", "TargetName", "Weight", 
                                               "SourceID", "TargetID"), 
                        function(x) data.frame(F1(x)))
head(edgeList)

##################################################################################
##################   color #################################################
######################################################
# COLOR_P <- colorRampPalette(c("#00FF00", "#FF0000"), 
#                             bias = nrow(edgeList), space = "rgb", 
#                             interpolate = "linear")
# COLOR_P
# (colCodes <- COLOR_P(length(unique(edgeList$diceSim))))
# edges_col <- sapply(edgeList$diceSim, 
#                     function(x) colCodes[which(sort(unique(edgeList$diceSim)) == x)])
# nrow(edges_col)

## NetworkD3 Object
#https://www.rdocumentation.org/packages/networkD3/versions/0.4/topics/forceNetwork

D3_network_Tweets <- networkD3::forceNetwork(
  Links = edgeList, # data frame that contains info about edges
  Nodes = nodeList, # data frame that contains info about nodes
  Source = "SourceID", # ID of source node 
  Target = "TargetID", # ID of target node
  Value = "Weight", # value from the edge list (data frame) that will be used to value/weight relationship amongst nodes
  NodeID = "nName", # value from the node list (data frame) that contains node description we want to use (e.g., node name)
  Nodesize = "nodeBetweenness",  # value from the node list (data frame) that contains value we want to use for a node size
  Group = "nodeDegree",  # value from the node list (data frame) that contains value we want to use for node color
  height = 700, # Size of the plot (vertical)
  width = 900,  # Size of the plot (horizontal)
  fontSize = 20, # Font size
  linkDistance = networkD3::JS("function(d) { return d.value*1000; }"), # Function to determine distance between any two nodes, uses variables already defined in forceNetwork function (not variables from a data frame)
  linkWidth = networkD3::JS("function(d) { return d.value*5; }"),# Function to determine link/edge thickness, uses variables already defined in forceNetwork function (not variables from a data frame)
  opacity = 5, # opacity
  zoom = TRUE, # ability to zoom when click on the node
  opacityNoHover = 5, # opacity of labels when static
  linkColour = "red"   ###"edges_col"red"# edge colors
) 

# Plot network
#D3_network_Tweets

# Save network as html file
networkD3::saveNetwork(D3_network_Tweets, 
                       "NetD3_DCR2019_worldNewsL_2021.html", selfcontained = TRUE)