## This code offers examples for text and record datasets
## AND for Naive Bayes, Bernoulli, Decision Trees, and SVMs
###############
## Supervised Learning Example Code
## This code has two examples -
## One for Student Record Data
## One for text data in two corpora: DOG and HIKE - text
## Using a pretend, hand-made Student Dataset - record
## Dataset can be found HERE:
### https://drive.google.com/file/d/1JwbqQxSR3rnVXYlprUoAY-GvOO63v8Vm/view?usp=sharing
## The second dataset is for text mining and has two folders: DOG and HIKE
##
## DOG
## https://drive.google.com/drive/folders/1gN-NLwbelmtd-KPWBuNr3wY7DTirFWcR?usp=sharing
## HIKE
## https://drive.google.com/drive/folders/14E2Drdj6ahLv4y3yzAnhFRxfbnpPIR7c?usp=sharing
######################################
## Text Mining Naive Bayes Example
import nltk
import pandas as pd
import sklearn
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
## For Stemming
from nltk.tokenize import sent_tokenize  ## word_tokenize is already imported above
import os
#from nltk.stem import WordNetLemmatizer
#LEMMER = WordNetLemmatizer()
from nltk.stem.porter import PorterStemmer
STEMMER=PorterStemmer()
print(STEMMER.stem("fishings"))
#words_34 = re.sub(r"[^A-Za-z\-]", " ", "I love my DOG !!!").lower().split()
#print(words_34)
# Use NLTK's PorterStemmer in a function
def MY_STEMMER(str_input):
    ## Keep only letters and hyphens, lowercase, split on whitespace,
    ## then stem each word with the Porter stemmer
    words = re.sub(r"[^A-Za-z\-]", " ", str_input).lower().split()
    words = [STEMMER.stem(word) for word in words]
    return words
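## Quick sanity check of the custom tokenizer
## (same sentence as the commented-out words_34 example above):
print(MY_STEMMER("I love my DOG !!!"))  ## -> ['i', 'love', 'my', 'dog']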
import string
import numpy as np
##import spacy
## https://spacy.io/usage/spacy-101
# create a spaCy tokenizer
###########################
## Stemming and Lemmatization
## Stemming differs from lemmatization both in the
## approach it uses to produce root forms of words
## and in the words it produces.
##
## !!! Stemming can result in tokens
## that are not actually words:
## trouble, troubling, troubled, troubles ....
## all become "troubl".
## Lemmatization is the process of grouping together
## the different inflected forms of a word so they can
## be analysed as a single item. Lemmatization is similar
## to stemming, but it brings context to the words, so it
## links words with similar meaning to one word.
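## A minimal demo of the difference (a sketch - it assumes the NLTK
## 'wordnet' data is available; run nltk.download('wordnet') once if not):
#nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
DEMO_LEMMER = WordNetLemmatizer()
for w in ["trouble", "troubling", "troubled", "troubles"]:
    ## e.g. "troubling" -> stem "troubl" (not a real word), lemma "trouble"
    print(w, "stem:", STEMMER.stem(w), "| lemma:", DEMO_LEMMER.lemmatize(w, pos="v"))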
#####################################################################
MyVect_STEM=CountVectorizer(input='filename',
                            analyzer='word',
                            stop_words='english',
                            ##stop_words=["and", "or", "but"],
                            #token_pattern='(?u)[a-zA-Z]+',
                            tokenizer=MY_STEMMER,
                            #strip_accents='unicode',
                            lowercase=True
                            )
MyVect_TFIDF=TfidfVectorizer(input='filename',
                             analyzer='word',
                             stop_words='english',
                             lowercase=True,
                             #binary=True
                             )
MyVect_TFIDF_STEM=TfidfVectorizer(input='filename',
                                  analyzer='word',
                                  stop_words='english',
                                  tokenizer=MY_STEMMER,
                                  #strip_accents='unicode',
                                  lowercase=True,
                                  #binary=True
                                  )
MyVect_Bernoulli=CountVectorizer(input='filename',
                                 analyzer='word',
                                 stop_words='english',
                                 #tokenizer=MY_STEMMER,
                                 #strip_accents='unicode',
                                 lowercase=True,
                                 binary=True
                                 )
#
## We will be creating four new data frames - one per vectorizer
## (counts+stemming, TFIDF, TFIDF+stemming, and binary for Bernoulli).
## These are the new and currently empty DFs
FinalDF_STEM=pd.DataFrame()
FinalDF_TFIDF=pd.DataFrame()
FinalDF_TFIDF_STEM=pd.DataFrame()
FinalDF_Bernoulli=pd.DataFrame()
## You will need to know where things are on your computer.
## This code assumes that it is in the same folder/location
## as the folders DOG and HIKE. It will loop through the files in
## these two folders and will build the list needed to use
## CountVectorizer.
## NOTICE: My loop has a path in it. This is for MY computer - not yours!
## You will need to adjust the path.
for name in ["DOG", "HIKE"]:
    #print(name)
    path="C:\\Users\\profa\\Documents\\Python Scripts\\TextMining\\Week4_7\\"+name
    #print(path)
    FileList=[]
    for item in os.listdir(path):
        #print(path + "\\" + item)
        next_file = path + "\\" + item  ## renamed from "next" to avoid shadowing the built-in
        FileList.append(next_file)
    print("full list...")
    #print(FileList)
    ## Do this for all four vectorizers:
    ## MyVect_STEM, MyVect_TFIDF, MyVect_TFIDF_STEM, and MyVect_Bernoulli
    X1=MyVect_STEM.fit_transform(FileList)
    X2=MyVect_TFIDF.fit_transform(FileList)
    X3=MyVect_TFIDF_STEM.fit_transform(FileList)
    X4=MyVect_Bernoulli.fit_transform(FileList)
    ## NOTE: get_feature_names() was removed in newer sklearn;
    ## get_feature_names_out() is the current name
    ColumnNames1=MyVect_STEM.get_feature_names_out()
    ColumnNames2=MyVect_TFIDF.get_feature_names_out()
    ColumnNames3=MyVect_TFIDF_STEM.get_feature_names_out()
    ColumnNames4=MyVect_Bernoulli.get_feature_names_out()
    #print("Column names: ", ColumnNames2)
    ## Build a labeled DataFrame for each vectorizer
    builderS=pd.DataFrame(X1.toarray(),columns=ColumnNames1)
    builderT=pd.DataFrame(X2.toarray(),columns=ColumnNames2)
    builderTS=pd.DataFrame(X3.toarray(),columns=ColumnNames3)
    builderB=pd.DataFrame(X4.toarray(),columns=ColumnNames4)
    ## Add the Label column
    #print("Adding new column....")
    builderS["Label"]=name
    builderT["Label"]=name
    builderTS["Label"]=name
    builderB["Label"]=name
    #print(builderS)
    ## NOTE: DataFrame.append was removed in pandas 2.0 - use pd.concat
    FinalDF_STEM= pd.concat([FinalDF_STEM, builderS], ignore_index=True)
    FinalDF_TFIDF= pd.concat([FinalDF_TFIDF, builderT], ignore_index=True)
    FinalDF_TFIDF_STEM= pd.concat([FinalDF_TFIDF_STEM, builderTS], ignore_index=True)
    FinalDF_Bernoulli= pd.concat([FinalDF_Bernoulli, builderB], ignore_index=True)
print(FinalDF_STEM.head())
## Replace the NaN with 0 because it actually
## means none in this case
FinalDF_STEM=FinalDF_STEM.fillna(0)
FinalDF_TFIDF=FinalDF_TFIDF.fillna(0)
FinalDF_TFIDF_STEM=FinalDF_TFIDF_STEM.fillna(0)
FinalDF_Bernoulli=FinalDF_Bernoulli.fillna(0)
###### REMOVE number columns ------------------
## Remove columns with number from this one
## You can do this for all 4 DFs if you wish
###############################################
MyList=[]
for col in FinalDF_TFIDF.columns:
    #print(col)
    LogR = col.isdigit()  ## True if the column name is all digits
    if LogR:
        #print(col)
        MyList.append(str(col))
print(MyList)
FinalDF_TFIDF.drop(MyList, axis=1, inplace=True)
##########################################################
## View all
print("FinalDF_STEM") ## These print statements help you to see where you are
print(FinalDF_STEM)
print("\nFinalDF_TFIDF")
print(FinalDF_TFIDF)
print("\nFinalDF_TFIDF_STEMF")
print(FinalDF_TFIDF_STEM)
print("\nFinalDF_Bernoulli")
print(FinalDF_Bernoulli)
## Create the testing set - grab a sample from the training set.
## Be careful. Notice that right now, our train set is sorted by label.
## If your train set is large enough, you can take a random sample.
from sklearn.model_selection import train_test_split
import random as rd
#rd.seed(1234)
TrainDF1, TestDF1 = train_test_split(FinalDF_STEM, test_size=0.3)
#print(FinalDF_STEM)
#print(TrainDF1)
#print(TestDF1)
TrainDF2, TestDF2 = train_test_split(FinalDF_TFIDF, test_size=0.3)
TrainDF3, TestDF3 = train_test_split(FinalDF_TFIDF_STEM, test_size=0.3)
TrainDF4, TestDF4 = train_test_split(FinalDF_Bernoulli, test_size=0.3)
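## OPTIONAL (a sketch): with a small corpus, a plain random split can
## put too few of one label in the test set. Passing stratify keeps
## the DOG/HIKE proportions the same in train and test:
TrainDF1s, TestDF1s = train_test_split(FinalDF_STEM, test_size=0.3,
                                       stratify=FinalDF_STEM["Label"])
print(TrainDF1s["Label"].value_counts())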
### OK - at this point we have Train and Test data for the text data
## in DOG and HIKE.
## Of course, this can be updated to work with sentiment labels (like POS and NEG)
## and can be updated for multiple folders or one folder.
###############################################
## For all four DFs - separate LABELS
#################################################
## IMPORTANT - YOU CANNOT LEAVE LABELS ON THE TEST SET
## Save labels
### TEST ---------------------
Test1Labels=TestDF1["Label"]
print(Test1Labels)
Test2Labels=TestDF2["Label"]
Test3Labels=TestDF3["Label"]
print(Test2Labels)
Test4Labels=TestDF4["Label"]
## remove labels
TestDF1 = TestDF1.drop(["Label"], axis=1)
TestDF2 = TestDF2.drop(["Label"], axis=1)
TestDF3 = TestDF3.drop(["Label"], axis=1)
print(TestDF1)
TestDF4 = TestDF4.drop(["Label"], axis=1)
## TRAIN ----------------------------
Train1Labels=TrainDF1["Label"]
Train2Labels=TrainDF2["Label"]
Train3Labels=TrainDF3["Label"]
Train4Labels=TrainDF4["Label"]
print(Train2Labels)
## remove labels
TrainDF1 = TrainDF1.drop(["Label"], axis=1)
TrainDF2 = TrainDF2.drop(["Label"], axis=1)
TrainDF3 = TrainDF3.drop(["Label"], axis=1)
print(TrainDF3)
TrainDF4 = TrainDF4.drop(["Label"], axis=1)
## RECALL that "4" is for Bernoulli
####################################################################
########################### Naive Bayes ############################
####################################################################
from sklearn.naive_bayes import MultinomialNB
#https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB.fit
#Create the modeler
MyModelNB1= MultinomialNB()
MyModelNB2= MultinomialNB()
MyModelNB3= MultinomialNB()
## When you look up this model, you learn that it wants the
## data (DF) separate from the labels.
## Run on all three DFs.................
MyModelNB1.fit(TrainDF1, Train1Labels)
MyModelNB2.fit(TrainDF2, Train2Labels)
MyModelNB3.fit(TrainDF3, Train3Labels)
Prediction1 = MyModelNB1.predict(TestDF1)
Prediction2 = MyModelNB2.predict(TestDF2)
Prediction3 = MyModelNB3.predict(TestDF3)
print("\nThe prediction from NB is:")
print(Prediction1)
print("\nThe actual labels are:")
print(Test1Labels)
print("\nThe prediction from NB is:")
print(Prediction2)
print("\nThe actual labels are:")
print(Test2Labels)
print("\nThe prediction from NB is:")
print(Prediction3)
print("\nThe actual labels are:")
print(Test3Labels)
## confusion matrix
from sklearn.metrics import confusion_matrix
## The confusion matrix is square and is labels X labels
## We have two labels, so ours will be 2X2
## The matrix shows:
## rows are the true labels
## columns are the predicted labels
## the order is alphabetical
## The numbers are counts
cnf_matrix1 = confusion_matrix(Test1Labels, Prediction1)
print("\nThe confusion matrix is:")
print(cnf_matrix1)
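## OPTIONAL (a sketch): wrap the matrix in a DataFrame so the
## row (true) / column (predicted) meaning is explicit. The class
## order comes from the model, which sorts labels alphabetically.
print(pd.DataFrame(cnf_matrix1,
                   index=["true:" + c for c in MyModelNB1.classes_],
                   columns=["pred:" + c for c in MyModelNB1.classes_]))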
cnf_matrix2 = confusion_matrix(Test2Labels, Prediction2)
print("\nThe confusion matrix is:")
print(cnf_matrix2)
cnf_matrix3 = confusion_matrix(Test3Labels, Prediction3)
print("\nThe confusion matrix is:")
print(cnf_matrix3)
### prediction probabilities
## columns are the labels in alphabetical order
## The decimals in the matrix are the probabilities of being
## that label
print(np.round(MyModelNB1.predict_proba(TestDF1),2))
print(np.round(MyModelNB2.predict_proba(TestDF2),2))
print(np.round(MyModelNB3.predict_proba(TestDF3),2))
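## To confirm which probability column is which label, check the
## model's classes_ attribute (the column order of predict_proba):
print(MyModelNB1.classes_)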
#######################################################
### Bernoulli #########################################
#######################################################
### NOTE TO CLASS: Bernoulli NB should use the binary (0/1) data.
## That is DF 4: TrainDF4, TestDF4
from sklearn.naive_bayes import BernoulliNB
BernModel = BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
BernModel.fit(TrainDF4, Train4Labels)
print("\nBernoulli prediction:\n", BernModel.predict(TestDF4))
print("\nActual:")
print(Test4Labels)
#
bn_matrix = confusion_matrix(Test4Labels, BernModel.predict(TestDF4))
print("\nThe confusion matrix for text Bernoulli is:")
print(bn_matrix)
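## OPTIONAL (a sketch): a single-number summary to go with the
## confusion matrix - the fraction of test docs labeled correctly.
from sklearn.metrics import accuracy_score
print("Bernoulli accuracy:",
      accuracy_score(Test4Labels, BernModel.predict(TestDF4)))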
#############################################
########### SVM ############################
#############################################
from sklearn.svm import LinearSVC
SVM_Model=LinearSVC(C=10)
SVM_Model.fit(TrainDF1, Train1Labels)
print("SVM prediction:\n", SVM_Model.predict(TestDF1))
print("Actual:")
print(Test1Labels)
SVM_matrix = confusion_matrix(Test1Labels, SVM_Model.predict(TestDF1))
print("\nThe confusion matrix for basic linear SVC is:")
print(SVM_matrix)
print("\n\n")
#############################################
########### SVM ############################
#############################################
#from sklearn.svm import LinearSVC
###
### SVMs do not run on qualitative data.
### ALWAYS remove the Labels from the Test and Train data.
###
### Here is what we have from above:
### TrainDF1 (labels removed), Train1Labels
### TestDF1 (labels removed), Test1Labels
#################################
## Here - rather than creating three
## copies of everything -
## you can set the variables TRAIN, TRAIN_Labels,
## TEST, and TEST_Labels
## to whichever DF and labels you wish.
######################################################
TRAIN= TrainDF1
TRAIN_Labels= Train1Labels
TEST= TestDF1
TEST_Labels= Test1Labels
SVM_Model1=LinearSVC(C=50)
SVM_Model1.fit(TRAIN, TRAIN_Labels)
print("SVM prediction:\n", SVM_Model1.predict(TEST))
print("Actual:")
print(TEST_Labels)
SVM_matrix = confusion_matrix(TEST_Labels, SVM_Model1.predict(TEST))
print("\nThe confusion matrix for Linear SVC C=50 is:")
print(SVM_matrix)
print("\n\n")
#--------------other kernels
## RBF
SVM_Model2=sklearn.svm.SVC(C=10, kernel='rbf',
verbose=True, gamma="auto")
SVM_Model2.fit(TRAIN, TRAIN_Labels)
print("SVM prediction:\n", SVM_Model2.predict(TEST))
print("Actual:")
print(TEST_Labels)
SVM_matrix = confusion_matrix(TEST_Labels, SVM_Model2.predict(TEST))
print("\nThe confusion matrix for rbf SVM is:")
print(SVM_matrix)
print("\n\n")
## POLY
SVM_Model3=sklearn.svm.SVC(C=10, kernel='poly',degree=2,
gamma="auto", verbose=True)
print(SVM_Model3)
SVM_Model3.fit(TRAIN, TRAIN_Labels)
print("SVM prediction:\n", SVM_Model3.predict(TEST))
print("Actual:")
print(TEST_Labels)
SVM_matrix = confusion_matrix(TEST_Labels, SVM_Model3.predict(TEST))
print("\nThe confusion matrix for SVM poly d=2 is:")
print(SVM_matrix)
print("\n\n")
###################################################
##
## Visualizing the top features
## Then Visualizing the margin with the top 2 in 2D
##
##########################################################
import matplotlib.pyplot as plt
## Credit: https://medium.com/@aneesha/visualising-top-features-in-linear-svm-with-scikit-learn-and-matplotlib-3454ab18a14d
## Define a function to visualize the TOP words (variables)
def plot_coefficients(MODEL=SVM_Model, COLNAMES=TrainDF1.columns, top_features=10):
    ## The MODEL must be a linear SVM (e.g., LinearSVC) so that coef_ exists
    coef = MODEL.coef_.ravel()
    top_positive_coefficients = np.argsort(coef, axis=0)[-top_features:]
    top_negative_coefficients = np.argsort(coef, axis=0)[:top_features]
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
    # create plot
    plt.figure(figsize=(15, 5))
    colors = ["red" if c < 0 else "blue" for c in coef[top_coefficients]]
    plt.bar(x=np.arange(2 * top_features), height=coef[top_coefficients], width=.5, color=colors)
    feature_names = np.array(COLNAMES)
    plt.xticks(np.arange(0, (2 * top_features)), feature_names[top_coefficients], rotation=60, ha="right")
    ## Save BEFORE plt.show() - show() clears the current figure,
    ## so saving afterwards writes a blank file
    plt.savefig('KeyWords.pdf')
    plt.show()

plot_coefficients()
#########################################################
## Using the top 2 features from above
## Let's look at the margin of the SVM
##################################################################
from sklearn.svm import SVC
X = np.array([TRAIN["dog"], TRAIN["hike"]])
X = X.transpose()
print(X)
#The classes of the training data
y = TRAIN_Labels
print(y)
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()
y=lb.fit_transform(y)
y = np.array(y)
y = y.ravel() ## to make it the right 1D array type
print(y)
## Here - we need to make y into 0 or 1 so it will plot
#TRAIN
#Define the model with SVC
# Fit SVM with training data
clf = SVC(C=1, kernel="linear")
clf.fit(X, y)
margin = 2 / np.sqrt(np.sum(clf.coef_ ** 2))
# get the separating hyperplane
#The weights vector w
w = clf.coef_[0]
#print("The weight vector ", w)
#The slope of the SVM sep line
a = -w[0] / w[1]
#print("The slope of the SVM sep line is ", a)
#Create a variable xx: 50 evenly spaced values between 0 and 10
xx = np.linspace(0, 10)
#Equation of sep line in 2D
# x1 = - b/w1 - (w0/w1 )(x0)
## Note that clf_intercept_[0] is "b"
## Note that a = -w0/w1 and xx are a bunch of x values
## This is the y values for the main sep line
yy = a * xx - (clf.intercept_[0]) / w[1]
## These plot the two parallel margin lines
# plot the lines parallel to the separating hyperplane
# that pass through the support vectors and note the margin
#margin = 2 / np.sqrt(np.sum(clf.coef_ ** 2))
# translate the location of the center sep line by
# adding or subtracting a fraction of the margin
yy_down = yy + .5*margin
yy_up = yy - .5*margin
# plot the line, the points, and the nearest vectors to the plane
#plt.figure(fignum, figsize=(4, 3))
plt.clf()
plt.plot(xx, yy, 'r-')
plt.plot(xx, yy_down, 'k--')
plt.plot(xx, yy_up, 'k--')
plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=10,
facecolors='none', zorder=5)
#cmap is the color map
plt.scatter(X[:, 0], X[:, 1], c=y, zorder=5, cmap=plt.cm.Paired)
plt.axis('tight')
plt.show()
##################################################################
############# PART 2 Using the Student Dataset #################
##################################################################
## The student dataset is not text data and so does not need to be
## vectorized.
## Also, the student dataset is clean. This will not normally
## be the case.
#####################################################################
## THE DATA IS HERE
## https://drive.google.com/file/d/18dJPOiiO9ogqOibJppc0lsDiQ2-bQs0f/view?usp=sharing
##
##
###############################################################
## Read the data into a dataframe
filename="C:/Users/profa/Documents/Python Scripts/ANLY503/DATA/StudentSummerProgramData.csv"
StudentDF=pd.read_csv(filename)
#print(StudentDF.head())
#from sklearn.model_selection import train_test_split
StudentTrainDF, StudentTestDF = train_test_split(StudentDF, test_size=0.3)
### OK - at this point we have Train and Test data for the
## student record data.
##-----------------------------------------------------------------
##
## Now we have a training set and a testing set.
#print("\nThe training set is:")
#print(StudentTrainDF)
#print("\nThe testing set is:")
#print(StudentTestDF)
## IMPORTANT - YOU CANNOT LEAVE LABELS ON THE TEST SET
## Save labels
StudentTestLabels=StudentTestDF["Decision"]
#print(StudentTestLabels)
## remove labels
StudentTestDF = StudentTestDF.drop(["Decision"], axis=1)
#print(StudentTestDF)
## Set up the training data so the models get what they expect
StudentTrainDF_nolabels=StudentTrainDF.drop(["Decision"], axis=1)
#print(StudentTrainDF_nolabels)
StudentTrainLabels=StudentTrainDF["Decision"]
#print(StudentTrainLabels)
#------------------------
## Some models do not run on qualitative data.....
## So, we will need to remove the variables: Gender and State
StudentTrainDF_nolabels_quant=StudentTrainDF_nolabels.drop(["Gender"], axis=1)
StudentTrainDF_nolabels_quant=StudentTrainDF_nolabels_quant.drop(["State"], axis=1)
StudentTestDF_quant=StudentTestDF.drop(["Gender"], axis=1)
StudentTestDF_quant=StudentTestDF_quant.drop(["State"], axis=1)
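## OPTIONAL (a sketch): instead of dropping Gender and State, you can
## one-hot encode them with pd.get_dummies so the models can still use
## them. (Column names assume the CSV headers are "Gender" and "State".)
StudentTrain_encoded = pd.get_dummies(StudentTrainDF_nolabels,
                                      columns=["Gender", "State"])
print(StudentTrain_encoded.head())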
#------------------------------
####################################################################
########################### Naive Bayes ############################
####################################################################
#from sklearn.naive_bayes import MultinomialNB
#https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB.fit
#Create the modeler
MyModelNB= MultinomialNB()
## When you look up this model, you learn that it wants the
## DF separate from the labels
MyModelNB.fit(StudentTrainDF_nolabels_quant, StudentTrainLabels)
Prediction = MyModelNB.predict(StudentTestDF_quant)
print("\nThe prediction from NB is:")
print(Prediction)
print("\nThe actual labels are:")
print(StudentTestLabels)
## confusion matrix
#from sklearn.metrics import confusion_matrix
## The confusion matrix is square and is labels X labels
## We have two labels, so ours will be 2X2
## The matrix shows:
## rows are the true labels
## columns are the predicted labels
## the order is alphabetical
## The numbers are counts
cnf_matrix = confusion_matrix(StudentTestLabels, Prediction)
print("\nThe confusion matrix is:")
print(cnf_matrix)
### prediction probabilities
## columns are the labels in alphabetical order
## The decimals in the matrix are the probabilities of being
## that label
print(np.round(MyModelNB.predict_proba(StudentTestDF_quant),2))
#######################################################
### Bernoulli #########################################
#######################################################
### NOTE TO CLASS: Bernoulli NB expects binary features; by default it binarizes the data at 0.0.
from sklearn.naive_bayes import BernoulliNB
BernModel = BernoulliNB()
BernModel.fit(StudentTrainDF_nolabels_quant, StudentTrainLabels)
#BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
print("\nBernoulli prediction:\n", BernModel.predict(StudentTestDF_quant))
print("\nActual:")
print(StudentTestLabels)
bn_matrix = confusion_matrix(StudentTestLabels, BernModel.predict(StudentTestDF_quant))
print("\nThe confusion matrix for Bernoulli is:")
print(bn_matrix)
#############################################
########### SVM ############################
#############################################
#from sklearn.svm import LinearSVC
### NOTE - We CANNOT use SVM directly on the data.
### SVMs do not run on qualitative data.
SVM_Model1=LinearSVC(C=50)
SVM_Model1.fit(StudentTrainDF_nolabels_quant, StudentTrainLabels)
print("SVM 1 prediction:\n", SVM_Model1.predict(StudentTestDF_quant))
print("Actual:")
print(StudentTestLabels)
SVM_matrix = confusion_matrix(StudentTestLabels, SVM_Model1.predict(StudentTestDF_quant))
print("\nThe confusion matrix for Linear SVM is:")
print(SVM_matrix)
print("\n\n")
#--------------other kernels
## RBF
SVM_Model2=sklearn.svm.SVC(C=1.0, kernel='rbf', degree=3, gamma="auto")
SVM_Model2.fit(StudentTrainDF_nolabels_quant, StudentTrainLabels)
print("SVM prediction:\n", SVM_Model2.predict(StudentTestDF_quant))
print("Actual:")
print(StudentTestLabels)
SVM_matrix = confusion_matrix(StudentTestLabels, SVM_Model2.predict(StudentTestDF_quant))
print("\nThe confusion matrix for rbf SVM is:")
print(SVM_matrix)
print("\n\n")
## POLY
SVM_Model3=sklearn.svm.SVC(C=1.0, kernel='poly', degree=3, gamma="auto")
SVM_Model3.fit(StudentTrainDF_nolabels_quant, StudentTrainLabels)
print("SVM prediction:\n", SVM_Model3.predict(StudentTestDF_quant))
print("Actual:")
print(StudentTestLabels)
SVM_matrix = confusion_matrix(StudentTestLabels, SVM_Model3.predict(StudentTestDF_quant))
print("\nThe confusion matrix for poly p = 3 SVM is:")
print(SVM_matrix)
print("\n\n")