The following Python code demonstrates several supervised machine learning methods - Naive Bayes (Multinomial and Bernoulli), Decision Trees, Random Forest, Support Vector Machines, and a simple neural network - along with a variety of visualizations. If a visualization prints poorly to the console, save it to a file instead.
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
###############
## Supervised Learning Example Code
##
## Naive Bayes (Multinomial and Bernoulli), Decision Trees,
## Random Forest, SVMs, and a simple Neural Network
##
## Note 1: It is better to make your own data and update
## the code for YOUR data.
##
## Note 2: This code focuses on text data but can easily
## be updated for record data.
##
## Datasets (corpus) can be found HERE:
## DOG
## https://drive.google.com/drive/folders/1UZSCzKyTKOOKqD2dPrmjIxS_IA9nnsDi?usp=sharing
##
## HIKE
## https://drive.google.com/drive/folders/141yq5AosVWiQSgk0fixcOn9RWIWmj2DH?usp=sharing
##
## Note that the other datasets for Record data are below in that section....
######################################
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
## Textmining Naive Bayes Example
import nltk
from sklearn import preprocessing
import pandas as pd
import sklearn
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
## For Stemming
from nltk.tokenize import sent_tokenize, word_tokenize
import os
from sklearn.model_selection import train_test_split
import random as rd
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
## conda install python-graphviz
## restart kernel (click the little red x next to the Console)
import graphviz
import numpy as np
from sklearn.tree import plot_tree
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA
#from mpl_toolkits.mplot3d import Axes3D
from sklearn.tree import export_graphviz
#from io import StringIO  ## (sklearn.externals.six was removed from newer scikit-learn)
from IPython.display import Image
## conda install pydotplus
import pydotplus
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
#from nltk.stem import WordNetLemmatizer
#LEMMER = WordNetLemmatizer()
from nltk.stem.porter import PorterStemmer
##############################################################
##
## A Stemmer is a function that truncates and combines
## words such as hikings and hikers.
##
###############################################################
STEMMER=PorterStemmer()
print(STEMMER.stem("fishings"))
# Use NLTK's PorterStemmer in a function
def MY_STEMMER(str_input):   ## example input: "I like dogs a lot111 !!"
    ## keep only letters and hyphens, lowercase, and split into words
    words = re.sub(r"[^A-Za-z\-]", " ", str_input).lower().split()
    words = [STEMMER.stem(w) for w in words]
    return words
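## Quick check of the stemmer function on a sample sentence
## (exact stems can vary slightly by NLTK version):
print(MY_STEMMER("I like hiking and hikers a lot111 !!"))
## e.g. ['i', 'like', 'hike', 'and', 'hiker', 'a', 'lot']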
import string
import numpy as np
##import spacy
## https://spacy.io/usage/spacy-101
# create a spaCy tokenizer
###########################
## Stemming and Lemming
## Stemming differs from Lemmatization in the approach
## it uses to produce root forms of words and in the
## words produced.
##
## !!! Stemming can result in words
## That are not actually words.
## trouble, troubling, troubled, troubles ....
## all become troubl
##Lemmatization is the process of grouping together
##the different inflected forms of a word so they can
## be analysed as a single item. Lemmatization is similar
## to stemming but it brings context to the words. So it
## links words with similar meaning to one word.
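## A minimal side-by-side sketch (commented out - it assumes the NLTK
## wordnet data has been downloaded, e.g. via nltk.download('wordnet')):
## from nltk.stem import WordNetLemmatizer
## LEMMER = WordNetLemmatizer()
## print(STEMMER.stem("troubling"))              ## 'troubl'  - not a real word
## print(LEMMER.lemmatize("troubling", pos="v")) ## 'trouble' - a real word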
#####################################################################
##
## Here, I am creating a few vectorizers - some with stemming
## and some without. One for Bernoulli, and some that normalize
## using Tf-Idf
##
#####################################################################
MyVect_STEM=CountVectorizer(input='filename',
analyzer = 'word',
stop_words='english',
##stop_words=["and", "or", "but"],
#token_pattern='(?u)[a-zA-Z]+',
#token_pattern=pattern,
tokenizer=MY_STEMMER,
#strip_accents = 'unicode',
lowercase = True
)
MyVect_STEM_Bern=CountVectorizer(input='filename',
analyzer = 'word',
stop_words='english',
##stop_words=["and", "or", "but"],
#token_pattern='(?u)[a-zA-Z]+',
#token_pattern=pattern,
tokenizer=MY_STEMMER,
#strip_accents = 'unicode',
lowercase = True,
binary=True # 0 if the word is not in the doc and a 1 if it is
)
MyVect_IFIDF=TfidfVectorizer(input='filename',
analyzer = 'word',
stop_words='english',
lowercase = True,
#binary=True
)
MyVect_IFIDF_STEM=TfidfVectorizer(input='filename',
analyzer = 'word',
stop_words='english',
tokenizer=MY_STEMMER,
#strip_accents = 'unicode',
lowercase = True,
#binary=True
)
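## A tiny in-memory illustration of the difference between raw counts,
## binary (Bernoulli-style) presence/absence, and tf-idf weights. These
## toy vectorizers use input='content' since the text is not in files:
toy_corpus = ["the dog saw the dog", "we hike and hike and hike"]
print(CountVectorizer().fit_transform(toy_corpus).toarray())             ## counts
print(CountVectorizer(binary=True).fit_transform(toy_corpus).toarray())  ## 0/1
print(np.round(TfidfVectorizer().fit_transform(toy_corpus).toarray(), 2))## weights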
#
## We will be creating new data frames - one per vectorizer.
## These are the new and currently empty DFs.
## Notice there are FOUR (4) of these because I have 4
## vectorizers.
FinalDF_STEM=pd.DataFrame()
FinalDF_STEM_Bern=pd.DataFrame()
FinalDF_TFIDF=pd.DataFrame()
FinalDF_TFIDF_STEM=pd.DataFrame()
## You will need to know where things are on your computer.
## This code assumes that it is in the same folder/location
## as the folders DOG and HIKE. It will loop through the files in
## these two folders and will build the file list needed by
## CountVectorizer.
## NOTICE: My loop has a path in it. This is for MY computer - not yours!
## You will need to adjust the path.
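## (A more portable way to build the same file list - a sketch assuming
## the DOG and HIKE folders sit next to this script:
## import pathlib
## FileList = [str(p) for p in pathlib.Path(name).iterdir()] )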
for name in ["DOG", "HIKE"]:
builder=name+"DF" #DOGDF
#print(builder)
builderB=name+"DFB"
path="C:\\Users\\profa\\Documents\\Python Scripts\\TextMining\\Week4_7\\"+name
FileList=[]
for item in os.listdir(path):
#print(path+ "\\" + item)
next1=path+ "/" + item
FileList.append(next1)
print("full list...")
#print(FileList)
## Do for all three
## MyVect_STEM and MyVect_IFIDF and MyVect_IFIDF_STEM
X1=MyVect_STEM.fit_transform(FileList)
X2=MyVect_IFIDF.fit_transform(FileList)
X3=MyVect_IFIDF_STEM.fit_transform(FileList)
XB=MyVect_STEM_Bern.fit_transform(FileList)
ColumnNames1=MyVect_STEM.get_feature_names()
NumFeatures1=len(ColumnNames1)
ColumnNames2=MyVect_IFIDF.get_feature_names()
NumFeatures2=len(ColumnNames2)
ColumnNames3=MyVect_IFIDF_STEM.get_feature_names()
NumFeatures3=len(ColumnNames3)
ColumnNamesB=MyVect_STEM_Bern.get_feature_names()
NumFeatures4=len(ColumnNamesB)
#print("Column names: ", ColumnNames2)
#Create a name
builderS=pd.DataFrame(X1.toarray(),columns=ColumnNames1)
builderT=pd.DataFrame(X2.toarray(),columns=ColumnNames2)
builderTS=pd.DataFrame(X3.toarray(),columns=ColumnNames3)
builderB=pd.DataFrame(XB.toarray(),columns=ColumnNamesB)
## Add column
#print("Adding new column....")
builderS["Label"]=name
builderT["Label"]=name
builderTS["Label"]=name
builderB["Label"]=name
#print(builderS)
FinalDF_STEM= FinalDF_STEM.append(builderS)
FinalDF_STEM_Bern= FinalDF_STEM_Bern.append(builderB)
FinalDF_TFIDF= FinalDF_TFIDF.append(builderT)
FinalDF_TFIDF_STEM= FinalDF_TFIDF_STEM.append(builderTS)
#print(FinalDF_STEM.head())
## Replace the NaN with 0 because it actually
## means none in this case
FinalDF_STEM=FinalDF_STEM.fillna(0)
FinalDF_STEM_Bern=FinalDF_STEM_Bern.fillna(0)
FinalDF_TFIDF=FinalDF_TFIDF.fillna(0)
FinalDF_TFIDF_STEM=FinalDF_TFIDF_STEM.fillna(0)
###### REMOVE number columns
## Remove columns with number from this one
##-------------------------------------------------------------------
####### Create a function that removes columns that are/contain nums
##-------------------------------------------------------------------
def RemoveNums(SomeDF):
    print("Running Remove Numbers function....\n")
    temp=SomeDF
    MyList=[]
    for col in temp.columns:
        ## isalpha() is True only when every character in the column
        ## name is a letter; names with digits, hyphens, etc. fail
        Logical2=str.isalpha(col)
        if(Logical2==False):
            MyList.append(str(col))
    #print(MyList)
    temp.drop(MyList, axis=1, inplace=True)
    return temp
##########################################################
## Call the function ....
FinalDF_STEM=RemoveNums(FinalDF_STEM)
FinalDF_STEM_Bern=RemoveNums(FinalDF_STEM_Bern)
FinalDF_TFIDF=RemoveNums(FinalDF_TFIDF)
FinalDF_TFIDF_STEM=RemoveNums(FinalDF_TFIDF_STEM)
## Have a look:
## These print statements help you to see where you are
#print(FinalDF_STEM)
## Remove columns that contain "-" HOW TO....
#cols = [c for c in FinalDF_STEM.columns if "-" in c[:] ]
#FinalDF_STEM=FinalDF_STEM.drop(cols, axis = 1)
print(FinalDF_STEM) #1
print(FinalDF_STEM_Bern) #4
print(FinalDF_TFIDF) #2
print(FinalDF_TFIDF_STEM) #3
##################################################
##
## Now we have 4 labeled dataframes!
##
## Let's model them.....
##
######################################################
## Create TRAIN and TEST sets by splitting each labeled dataframe.
## Be careful. Notice that right now, our data is sorted by label,
## so the split must be a random (shuffled) sample - which
## train_test_split does by default.
from sklearn.model_selection import train_test_split
import random as rd
#rd.seed(1234)
TrainDF1, TestDF1 = train_test_split(FinalDF_STEM, test_size=0.3)
print(TrainDF1)
print(TestDF1)
TrainDF2, TestDF2 = train_test_split(FinalDF_TFIDF, test_size=0.3)
TrainDF3, TestDF3 = train_test_split(FinalDF_TFIDF_STEM, test_size=0.3)
TrainDF4, TestDF4 = train_test_split(FinalDF_STEM_Bern, test_size=0.4)
print(TestDF4)
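## If the classes are imbalanced or the data is small, a stratified
## split keeps the DOG/HIKE proportions equal in TRAIN and TEST
## (a sketch - uncomment to use):
## TrainDF1, TestDF1 = train_test_split(FinalDF_STEM, test_size=0.3,
##                                      stratify=FinalDF_STEM["Label"])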
### OK - at this point we have Train and Test data for the text data
## in DOG and HIKE.
## Of course, this can be updated to work with sentiment labels (like POS and NEG)
## and can be updated for multiple folders or one folder.
###############################################
## For all FOUR DFs - separate LABELS
#################################################
## IMPORTANT - YOU CANNOT LEAVE LABELS ON THE TEST SET
## Save labels
### TEST ---------------------
Test1Labels=TestDF1["Label"]
print(Test1Labels)
Test2Labels=TestDF2["Label"]
Test3Labels=TestDF3["Label"]
Test4Labels=TestDF4["Label"]
print(Test2Labels)
## remove labels
TestDF1 = TestDF1.drop(["Label"], axis=1)
TestDF2 = TestDF2.drop(["Label"], axis=1)
TestDF3 = TestDF3.drop(["Label"], axis=1)
TestDF4 = TestDF4.drop(["Label"], axis=1)
print(TestDF1)
## TRAIN ----------------------------
Train1Labels=TrainDF1["Label"]
Train2Labels=TrainDF2["Label"]
Train3Labels=TrainDF3["Label"]
Train4Labels=TrainDF4["Label"]
print(Train3Labels)
## remove labels
TrainDF1 = TrainDF1.drop(["Label"], axis=1)
TrainDF2 = TrainDF2.drop(["Label"], axis=1)
TrainDF3 = TrainDF3.drop(["Label"], axis=1)
TrainDF4 = TrainDF4.drop(["Label"], axis=1)
print(TrainDF3)
####################################################################
########################### Naive Bayes ############################
####################################################################
from sklearn.naive_bayes import MultinomialNB
#https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB.fit
#Create the modeler
MyModelNB= MultinomialNB()
## Run on all three Dfs.................
NB1=MyModelNB.fit(TrainDF1, Train1Labels)
Prediction1 = MyModelNB.predict(TestDF1)
print(np.round(MyModelNB.predict_proba(TestDF1),2))
NB2=MyModelNB.fit(TrainDF2, Train2Labels)
Prediction2 = MyModelNB.predict(TestDF2)
print(np.round(MyModelNB.predict_proba(TestDF2),2))
NB3=MyModelNB.fit(TrainDF3, Train3Labels)
Prediction3 = MyModelNB.predict(TestDF3)
print(np.round(MyModelNB.predict_proba(TestDF3),2))
NB4=MyModelNB.fit(TrainDF4, Train4Labels)
Prediction4 = MyModelNB.predict(TestDF4)
print(np.round(MyModelNB.predict_proba(TestDF4),2))
print("\nThe prediction from NB is:")
print(Prediction1)
print("\nThe actual labels are:")
print(Test1Labels)
print("\nThe prediction from NB is:")
print(Prediction2)
print("\nThe actual labels are:")
print(Test2Labels)
print("\nThe prediction from NB is:")
print(Prediction3)
print("\nThe actual labels are:")
print(Test3Labels)
print("\nThe prediction from NB is:")
print(Prediction4)
print("\nThe actual labels are:")
print(Test4Labels)
## confusion matrix
from sklearn.metrics import confusion_matrix
## The confusion matrix is square and is labels X labels
## We have two labels, so ours will be 2X2
## The matrix shows:
## rows are the true labels
## columns are the predicted labels
## both in alphabetical order
## The numbers are counts
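## A tiny worked example of reading the matrix (hypothetical labels):
print(confusion_matrix(["DOG","DOG","HIKE"], ["DOG","HIKE","HIKE"]))
## [[1 1]   row 1 = true DOG:  1 predicted DOG, 1 predicted HIKE
##  [0 1]]  row 2 = true HIKE: 0 predicted DOG, 1 predicted HIKE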
cnf_matrix1 = confusion_matrix(Test1Labels, Prediction1)
print("\nThe confusion matrix is:")
print(cnf_matrix1)
cnf_matrix2 = confusion_matrix(Test2Labels, Prediction2)
print("\nThe confusion matrix is:")
print(cnf_matrix2)
cnf_matrix3 = confusion_matrix(Test3Labels, Prediction3)
print("\nThe confusion matrix is:")
print(cnf_matrix3)
cnf_matrix4 = confusion_matrix(Test4Labels, Prediction4)
print("\nThe confusion matrix is:")
print(cnf_matrix4)
#######################################################
### Bernoulli #########################################
#######################################################
from sklearn.naive_bayes import BernoulliNB
BernModel = BernoulliNB()
BernModel.fit(TrainDF4, Train4Labels)
##BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
print("\nBernoulli prediction:\n")
Prediction=BernModel.predict(TestDF4)
print("\nActual:")
print(Test4Labels)
print("\The prediction\n")
print(Prediction)
#
bn_matrix = confusion_matrix(Test4Labels, Prediction)
print("\nThe confusion matrix is:")
print(bn_matrix)
#########################################################
############# Decision Trees #######################
#########################################################
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import matplotlib.pyplot as plt
## conda install python-graphviz
## restart kernel (click the little red x next to the Console)
import graphviz
from sklearn.metrics import confusion_matrix
#https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
MyDT=DecisionTreeClassifier(criterion='entropy', ##"entropy" or "gini"
splitter='best', ## or "random" or "best"
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
min_weight_fraction_leaf=0.0,
max_features=None,
random_state=None,
max_leaf_nodes=None,
min_impurity_decrease=0.0,
class_weight=None)   ## (min_impurity_split was removed from newer scikit-learn)
## ------------------------------
## This for loop will fit and predict Decision Trees for
## all 4 of the dataframes. Notice that this uses dynamic variables
## and eval
##--------------------------
##
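## (A safer alternative to eval - a sketch: keep the splits in a dict
## and loop over its items, so no variable names are built as strings.
## splits = {1:(TrainDF1, Train1Labels, TestDF1, Test1Labels),
##           2:(TrainDF2, Train2Labels, TestDF2, Test2Labels), ...}
## for i, (TrainD, TrainL, TestD, TestL) in splits.items():
##     MyDT.fit(TrainD, TrainL) )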
print(TrainDF1)
for i in [1,2,3,4]:
    temp1=str("TrainDF"+str(i))          ## TrainDF1
    temp2=str("Train"+str(i)+"Labels")   ## Train1Labels
    temp3=str("TestDF"+str(i))           ## TestDF1
    temp4=str("Test"+str(i)+"Labels")    ## Test1Labels
    ## fit the DT
    MyDT.fit(eval(temp1), eval(temp2))
    ## plot the tree
    tree.plot_tree(MyDT)
    plt.savefig(temp1)
    ## TrainDF.columns (labels already removed) are the feature names
    feature_names=eval(str(temp1+".columns"))
    dot_data = tree.export_graphviz(MyDT, out_file=None,
                        feature_names=feature_names,
                        filled=True, rounded=True,
                        special_characters=True)
    graph = graphviz.Source(dot_data)
    ## Create a dynamic graph name
    tempname=str("Graph" + str(i))
    graph.render(tempname)
    ## Show the predictions from the DT on the test set
    print("\nActual for DataFrame: ", i, "\n")
    print(eval(temp2))
    print("Prediction\n")
    DT_pred=MyDT.predict(eval(temp3))
    print(DT_pred)
    ## Show the confusion matrix
    bn_matrix = confusion_matrix(eval(temp4), DT_pred)
    print("\nThe confusion matrix is:")
    print(bn_matrix)
    FeatureImp=MyDT.feature_importances_
    indices = np.argsort(FeatureImp)[::-1]
    ## print out the important features for the current dataframe.....
    for f in range(eval(temp1).shape[1]):
        if FeatureImp[indices[f]] > 0:
            print("%d. feature %d (%f)" % (f + 1, indices[f], FeatureImp[indices[f]]))
            print("feature name: ", feature_names[indices[f]])
## FYI for small datasets you can zip features....
## print(dict(zip(iris_pd.columns, clf.feature_importances_)))
#####################################################
## Visualize Decision Trees plotting paired surfaces
##
####################################################
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
f1=TrainDF1.columns.get_loc("dog")
f2=TrainDF1.columns.get_loc("hike")
f3=TrainDF1.columns.get_loc("workout")
f4=TrainDF1.columns.get_loc("happi")
n_classes =2
plot_colors = "ryb"
plot_step = 0.02
for pairidx, pair in enumerate([[f1, f2], [f1, f3], [f1, f4],
                                [f2, f3], [f3, f4]]):
    #print(TrainDF1.iloc[:,pair])
    X = TrainDF1.iloc[:, pair]
    ## Because we are plotting, our DOG and HIKE labels will not work -
    ## we need to change them to 0 and 1
    y = Train1Labels
    print(y)
    oldy=y
    y=y.replace("DOG", 1)
    y=y.replace("HIKE", 0)
    print(y)
    # Train
    DT = DecisionTreeClassifier().fit(X, y)
    # Plot the decision boundary
    plt.subplot(2, 3, pairidx + 1)
    x_min, x_max = X.iloc[:, 0].min() - 1, X.iloc[:, 0].max() + 1
    print(x_min)
    y_min, y_max = X.iloc[:, 1].min() - 1, X.iloc[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))
    #print(yy)
    plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)
    Z = DT.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    print(Z)
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.RdYlBu)
    plt.scatter(X.iloc[:, 0], X.iloc[:, 1], s=30, label=oldy, edgecolor='black',
                cmap=plt.cm.RdYlBu)
###---------------------------end for loop ----------------------------------
#plt.suptitle("Decision surface of a decision tree using paired features")
#plt.legend(loc='lower right', borderpad=0, handletextpad=0)
#plt.axis("tight")
#
#plt.figure()
#########################################################
##
## Random Forest for Text Data
##
#################################################################
RF = RandomForestClassifier()
RF.fit(TrainDF1, Train1Labels)
RF_pred=RF.predict(TestDF1)
bn_matrix_RF_text = confusion_matrix(Test1Labels, RF_pred)
print("\nThe confusion matrix is:")
print(bn_matrix_RF_text)
################# VIS RF---------------------------------
## FEATURE NAMES...................
FeaturesT=TrainDF1.columns
#Targets=StudentTestLabels_Num
figT, axesT = plt.subplots(nrows = 1,ncols = 1,figsize = (4,4), dpi=800)
tree.plot_tree(RF.estimators_[0],
feature_names = FeaturesT,
#class_names=Targets,
filled = True)
##save it
figT.savefig('RF_Tree_Text') ## creates png
#####------------------> View estimator Trees in RF
figT2, axesT2 = plt.subplots(nrows = 1,ncols = 5,figsize = (10,2), dpi=900)
for index in range(0, 5):
    tree.plot_tree(RF.estimators_[index],
                   feature_names = FeaturesT,
                   filled = True,
                   ax = axesT2[index])
    axesT2[index].set_title('Estimator: ' + str(index), fontsize = 11)
## Save it
figT2.savefig('FIVEtrees_RF.png')
#################-------------------------->
## Feature importance in RF
##-----------------------------------------
## Recall that FeaturesT are the columns names - the words in this case.
######
FeatureImpRF=RF.feature_importances_
indicesRF = np.argsort(FeatureImpRF)[::-1]
## print out the important features.....
for f2 in range(TrainDF1.shape[1]):  ## TrainDF1.shape[1] is the number of columns
    if FeatureImpRF[indicesRF[f2]] >= 0.01:
        print("%d. feature %d (%.2f)" % (f2 + 1, indicesRF[f2], FeatureImpRF[indicesRF[f2]]))
        print("feature name: ", FeaturesT[indicesRF[f2]])
## PLOT THE TOP 10 FEATURES...........................
top_ten_arg = indicesRF[:10]
#print(top_ten_arg)
plt.title('Feature Importances Dog and Hike')
plt.barh(range(len(top_ten_arg)), FeatureImpRF[top_ten_arg], color='b', align='center')
plt.yticks(range(len(top_ten_arg)), [FeaturesT[i] for i in top_ten_arg])
plt.xlabel('Relative Importance')
plt.show()
#####################################################################
##
## NN
##
#########################################################################
from sklearn.neural_network import MLPClassifier
MyNN = MLPClassifier(solver='lbfgs',
#solver='adam',
#solver='sgd',
alpha=1,
hidden_layer_sizes=(1, 2), random_state=1)
MyNN.fit(TrainDF1, Train1Labels)
NNPrediction = MyNN.predict(TestDF1)
print("confusion matrix for NN\n")
NN_matrix = confusion_matrix(Test1Labels, NNPrediction)
print(NN_matrix)
#############################################
########### SVM ############################
#############################################
#from sklearn.svm import LinearSVC
SVM_Model=LinearSVC(C=1)
SVM_Model.fit(TrainDF1, Train1Labels)
#print("SVM prediction:\n", SVM_Model.predict(TestDF1))
#print("Actual:")
#print(Test1Labels)
SVM_matrix = confusion_matrix(Test1Labels, SVM_Model.predict(TestDF1))
print("\nThe confusion matrix is:")
print(SVM_matrix)
print("\n\n")
#############################################
########### SVM ############################
#############################################
#from sklearn.svm import LinearSVC
###
### SVMs do not run on qualitative data.
### ALWAYS remove the Labels from the Test and Train data
###
### Here is what we have from above:
## TrainDF1..TrainDF4 (labels removed) with Train1Labels..Train4Labels
## TestDF1..TestDF4 (labels removed) with Test1Labels..Test4Labels
##############################
## Rather than creating copies of everything
##
### !!!!!!!!!!!!!!!!!
## You can set the variables: TRAIN, TRAIN_Labels
## TEST and TEST_Labels
## to whatever you wish
######################################################
TRAIN= TrainDF1 ## As noted above - this can also be TrainDF2, etc.
print(TRAIN)
TRAIN_Labels= Train1Labels
print(TRAIN_Labels)
TEST= TestDF1
TEST_Labels= Test1Labels
SVM_Model1=LinearSVC(C=1)
SVM_Model1.fit(TRAIN, TRAIN_Labels)
#print("SVM prediction:\n", SVM_Model1.predict(TEST))
#print("Actual:")
#print(TEST_Labels)
SVM_matrix = confusion_matrix(TEST_Labels, SVM_Model1.predict(TEST))
print("\nThe confusion matrix is:")
print(SVM_matrix)
print("\n\n")
#--------------other kernels
## RBF------------------------------------------
##------------------------------------------------------
SVM_Model2=sklearn.svm.SVC(C=1, kernel='rbf',
verbose=True, gamma="auto")
SVM_Model2.fit(TRAIN, TRAIN_Labels)
#print("SVM prediction:\n", SVM_Model2.predict(TEST))
#print("Actual:")
#print(TEST_Labels)
print("RBF :\n")
SVM_matrix2 = confusion_matrix(TEST_Labels, SVM_Model2.predict(TEST))
print("\nThe confusion matrix is:")
print(SVM_matrix2)
print("\n\n")
##-----------------------------------------
## POLY
##_--------------------------------------------------
SVM_Model3=sklearn.svm.SVC(C=100, kernel='poly',degree=3,
gamma="auto", verbose=True)
#print(SVM_Model3)
SVM_Model3.fit(TRAIN, TRAIN_Labels)
#print("SVM prediction:\n", SVM_Model3.predict(TEST))
#print("Actual:")
#print(TEST_Labels)
print("POLY Degree 2:\n")
SVM_matrix3 = confusion_matrix(TEST_Labels, SVM_Model3.predict(TEST))
print("\nThe confusion matrix is:")
print(SVM_matrix3)
print("\n\n")
###################################################
##
## Visualizing the top features
## Then Visualizing the margin with the top 2 in 2D
##
##########################################################
import matplotlib.pyplot as plt
## Credit: https://medium.com/@aneesha/visualising-top-features-in-linear-svm-with-scikit-learn-and-matplotlib-3454ab18a14d
## Define a function to visualize the TOP words (variables)
def plot_coefficients(MODEL=SVM_Model, COLNAMES=TrainDF1.columns, top_features=10):
    ## If using an SVM, the model MUST be linear (e.g., SVM_Model=LinearSVC(C=10))
    ## so that it has a coef_ attribute
    coef = MODEL.coef_.ravel()
    top_positive_coefficients = np.argsort(coef, axis=0)[-top_features:]
    print(top_positive_coefficients)
    top_negative_coefficients = np.argsort(coef, axis=0)[:top_features]
    print(top_negative_coefficients)
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
    # create plot
    plt.figure(figsize=(15, 5))
    colors = ["red" if c < 0 else "blue" for c in coef[top_coefficients]]
    plt.bar(x=np.arange(2 * top_features), height=coef[top_coefficients], width=.5, color=colors)
    feature_names = np.array(COLNAMES)
    plt.xticks(np.arange(0, (2*top_features)), feature_names[top_coefficients], rotation=60, ha="right")
    plt.show()
plot_coefficients()
#########################################################
## Using the top 2 features from above
## Let's look at the margin of the SVM
##################################################################
from sklearn.svm import SVC
X = np.array([TRAIN["dog"], TRAIN["hike"]])
X = X.transpose()
print(X)
#The classes of the training data
y = TRAIN_Labels
print(y)
from sklearn.preprocessing import LabelBinarizer
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()
y=lb.fit_transform(y)
y = np.array(y)
y = y.ravel() ## to make it the right 1D array type
print(y)
## Here - we need to make y into 0 or 1 so it will plot
#TRAIN
#Define the model with SVC
# Fit SVM with training data
clf = SVC(C=1, kernel="linear")
clf.fit(X, y)
margin = 2 / np.sqrt(np.sum(clf.coef_ ** 2))
# get the separating hyperplane
#The weights vector w
w = clf.coef_[0]
#print("The weight vector ", w)
#The slope of the SVM sep line
a = -w[0] / w[1]
#print("The slope of the SVM sep line is ", a)
#Create a variable xx of values between 0 and 10
xx = np.linspace(0, 10)
#Equation of sep line in 2D
# x1 = - b/w1 - (w0/w1 )(x0)
## Note that clf_intercept_[0] is "b"
## Note that a = -w0/w1 and xx are a bunch of x values
## This is the y values for the main sep line
yy = a * xx - (clf.intercept_[0]) / w[1]
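## Sanity check (optional): every point on the separating line should
## satisfy w0*x0 + w1*x1 + b = 0, up to floating point error
print(np.allclose(w[0]*xx + w[1]*yy + clf.intercept_[0], 0))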
##These plot the two parallel margin lines
# plot the lines parallel to the separating hyperplane
# that pass through the support vectors and show the margin.
# The margin is measured perpendicular to the line, so the
# vertical offset between the lines is (margin/2)*sqrt(1+a^2)
offset = 0.5 * margin * np.sqrt(1 + a ** 2)
yy_down = yy - offset
yy_up = yy + offset
# plot the line, the points, and the nearest vectors to the plane
#plt.figure(fignum, figsize=(4, 3))
plt.clf()
plt.plot(xx, yy, 'r-')
plt.plot(xx, yy_down, 'k--')
plt.plot(xx, yy_up, 'k--')
plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=10,
facecolors='none', zorder=5)
#cmap is the color map
plt.scatter(X[:, 0], X[:, 1], c=y, zorder=5, cmap=plt.cm.Paired)
plt.axis('tight')
plt.show()
##@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
##################################################################
############# PART 2 Using the Student Dataset #################
##################################################################
##@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
## The student dataset is not text data and so does not need to be
## vectorized.
## Also, the student dataset is clean. This will not normally
## be the case.
#####################################################################
## Read the data into a dataframe
## DATA: Just numeric and record labeled data
## https://drive.google.com/file/d/1uXtDBIP-dTbFNXbZC0DcCKxIXjocW3xF/view?usp=sharing
## ## There is also another dataset for which the labels are numbers and not words...
## https://drive.google.com/file/d/1g0go050nV02Fibk_9RGpBRGnMwIGQZu5/view?usp=sharing
########################################################################################
filenameNum="C:/Users/profa/Documents/Python Scripts/ANLY503/DATA/StudentSummerProgramData_Numeric_Labeled.csv"
#filenameNum="C:/Users/profa/Documents/Python Scripts/ANLY503/DATA/StudentSummerProgramData_Numeric_NumLabeled.csv"
StudentDF_Num=pd.read_csv(filenameNum)
print(StudentDF_Num.head())
################# Sklearn methods such as these do not run on mixed type data...
#filenameMixed="C:/Users/profa/Documents/Python Scripts/ANLY503/DATA/StudentSummerProgramData_Mixed_Labeled.csv"
#StudentDF_Mix=pd.read_csv(filenameMixed)
#print(StudentDF_Mix.head())
### Because the data is already clean and ready - I can separate it
## into TRAINING and TESTING sets
####-----------------------------------------------
#from sklearn.model_selection import train_test_split
StudentTrainDF_Num, StudentTestDF_Num = train_test_split(StudentDF_Num, test_size=0.3)
#StudentTrainDF_Mix, StudentTestDF_Mix = train_test_split(StudentDF_Mix, test_size=0.3)
######## Separate LABELS FROM DATA--------------------
## Make sure you know the name of the label
## For both datasets above - in this case - it is "Decision"
## TEST - Num
StudentTestLabels_Num=StudentTestDF_Num["Decision"] ## save labels
print(StudentTestLabels_Num)
StudentTestData_Num = StudentTestDF_Num.drop(["Decision"], axis=1) ##drop labels
print(StudentTestData_Num)
## TRAIN - Num
StudentTrainLabels_Num=StudentTrainDF_Num["Decision"] ## save labels
print(StudentTrainLabels_Num)
StudentTrainData_Num = StudentTrainDF_Num.drop(["Decision"], axis=1) ##drop labels
print(StudentTrainData_Num)
### TEST - Mixed
#StudentTestLabels_Mix=StudentTestDF_Mix["Decision"] ## save labels
#print(StudentTestLabels_Mix)
#StudentTestData_Mix = StudentTestDF_Mix.drop(["Decision"], axis=1) ##drop labels
#print(StudentTestData_Mix)
#
### TRAIN - Mixed
#StudentTrainLabels_Mix=StudentTrainDF_Mix["Decision"] ## save labels
#print(StudentTrainLabels_Mix)
#StudentTrainData_Mix = StudentTrainDF_Mix.drop(["Decision"], axis=1) ##drop labels
#print(StudentTrainData_Mix)
#print(StudentTestLabels_Num)
#print(StudentTestData_Num)
### TRAIN - Num
#print(StudentTrainLabels_Num)
#print(StudentTrainData_Num)
###############################################
## SCALE ALL DATA to between 0 and 1
#from sklearn import preprocessing
###########################################################
x = StudentTrainData_Num.values #returns a numpy array
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
StudentTrainData_Num_S = pd.DataFrame(x_scaled)
## IMPORTANT: transform (do not re-fit) the test data with the scaler
## that was fit on the training data, so both use the same scale
x2 = StudentTestData_Num.values #returns a numpy array
x_scaled2 = min_max_scaler.transform(x2)
StudentTestData_Num_S = pd.DataFrame(x_scaled2)
print(StudentTestData_Num_S)
####################################################################
########################### Naive Bayes ############################
####################################################################
#from sklearn.naive_bayes import MultinomialNB
#https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#sklearn.naive_bayes.MultinomialNB.fit
#Create the modeler
############# In Python - unlike R - you cannot run a standard NB on
## Mixed data. Therefore, we will only run it on our numeric dataset.
##################
MyModelNB_Num= MultinomialNB()
## When you look up this model, you learn that it wants the
## DF separate from the labels
MyModelNB_Num.fit(StudentTrainData_Num, StudentTrainLabels_Num)
PredictionNB = MyModelNB_Num.predict(StudentTestData_Num)
#print("\nThe prediction from NB is:")
#print(PredictionNB)
#print("\nThe actual labels are:")
#print(StudentTestLabels_Num)
## confusion matrix
#from sklearn.metrics import confusion_matrix
## The confusion matrix is square and is labels X labels
## This dataset has three labels (Admit, Decline, Wait),
## so ours will be 3X3
## The matrix shows:
## rows are the true labels
## columns are the predicted labels
## both in alphabetical order
## The numbers are counts
cnf_matrix = confusion_matrix(StudentTestLabels_Num, PredictionNB)
print("\nThe confusion matrix is:")
print(cnf_matrix)
### prediction probabilities
## columns are the labels in alphabetical order
## The decimals in the matrix are the prob of being
## that label
print(np.round(MyModelNB_Num.predict_proba(StudentTestData_Num),2))
MyModelNB_Num.get_params(deep=True)
from sklearn.decomposition import PCA
#from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
## remap labels to numbers to view
ymap=StudentTrainLabels_Num
ymap=ymap.replace("Admit", 1)
ymap=ymap.replace("Decline", 0)
ymap=ymap.replace("Wait", 2)
pca = PCA(n_components=3)
proj = pca.fit_transform(StudentTrainData_Num)
plt.scatter(proj[:, 0], proj[:, 1], c=ymap, cmap="Paired")
plt.colorbar()
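## Quick check of how much variance the 3 components capture
## (the values depend on your data):
print(np.round(pca.explained_variance_ratio_, 2))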
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
#########################################################
############# Decision Trees #######################
#########################################################
#@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
## In Python - the Decision Trees work ONLY on numeric data
### TEST
#print(StudentTestLabels_Num)
#print(StudentTestData_Num)
### TRAIN - Num
#print(StudentTrainLabels_Num)
#print(StudentTrainData_Num)
#---------------------------------------------
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import matplotlib.pyplot as plt
## conda install python-graphviz
## restart kernel (click the little red x next to the Console)
import graphviz
from sklearn.metrics import confusion_matrix
#https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
MyDT_R=DecisionTreeClassifier(criterion='entropy', ##"entropy" or "gini"
splitter='best', ## or "random" or "best"
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
min_weight_fraction_leaf=0.0,
max_features=None,
random_state=None,
max_leaf_nodes=None,
min_impurity_decrease=0.0,
class_weight=None)   ## (min_impurity_split was removed from newer scikit-learn)
## perform DT
MyDT_R.fit(StudentTrainData_Num, StudentTrainLabels_Num)
## plot the tree
tree.plot_tree(MyDT_R)
feature_namesR=StudentTrainData_Num.columns
print(feature_namesR)
##--------------------------------------------------------
## VIS 1 saved as Tree_Record.pdf
##-----------------------------------------------------------
TREE_data = tree.export_graphviz(MyDT_R, out_file=None,
feature_names=StudentTrainData_Num.columns,
filled=True,
rounded=True,
special_characters=True)
graph = graphviz.Source(TREE_data)
graph.render("Tree_Record")
##---
## Show the predictions from the DT on the test set
#print("\nActual\n")
#print(StudentTestLabels_Num)
#print("Prediction\n")
DT_pred_R=MyDT_R.predict(StudentTestData_Num)
#print(DT_pred_R)
##_--
## Show the confusion matrix
bn_matrix_R = confusion_matrix(StudentTestLabels_Num, DT_pred_R)
print("\nThe confusion matrix is:")
print(bn_matrix_R)
##---
## Feature Importance
FeatureImpR=MyDT_R.feature_importances_
indicesR = np.argsort(FeatureImpR)[::-1]
print(indicesR)
print("feature names, most important first: ", feature_namesR[indicesR])
## print out the important features.....
for f in range(StudentTrainData_Num.shape[1]):
    if FeatureImpR[indicesR[f]] > 0:
        print("%d. feature %d (%f)" % (f + 1, indicesR[f], FeatureImpR[indicesR[f]]))
        print("feature name: ", feature_namesR[indicesR[f]])
## FYI for small datasets you can zip features....
## print(dict(zip(iris_pd.columns, clf.feature_importances_)))
#######-----------------------------------------------------
## Visualize Decision Trees plotting paired surfaces
##
#####--------------------------------------------------------------
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
f1=StudentTrainData_Num.columns.get_loc("WritingScore")
f2=StudentTrainData_Num.columns.get_loc("TestScore")
n_classes =3  ## Admit, Decline, Wait
plot_colors = "ryb"
plot_step = 0.02
for pairidx, pair in enumerate([[f1, f2], [0, 2], [0, 3],
                                [1, 2], [1, 3]]):
    #print(StudentTrainData_Num.iloc[:,pair])
    X = StudentTrainData_Num.iloc[:, pair]
    ## Because we are plotting, the Admit/Decline/Wait labels will not work -
    ## we need to change them to numbers
    y = StudentTrainLabels_Num
    print(y)
    oldy=y
    y=y.replace("Admit", 1)
    y=y.replace("Decline", 0)
    y=y.replace("Wait", 2)
    print(y)
    # Train
    DTC = DecisionTreeClassifier().fit(X, y)
    # Plot the decision boundary
    plt.subplot(2, 3, pairidx + 1)
    x_min, x_max = X.iloc[:, 0].min() - 1, X.iloc[:, 0].max() + 1
    print(x_min)
    y_min, y_max = X.iloc[:, 1].min() - 1, X.iloc[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))
    #print(yy)
    plt.tight_layout(h_pad=0.5, w_pad=0.5, pad=2.5)
    Z = DTC.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    print(Z)
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.RdYlBu)
    plt.scatter(X.iloc[:, 0], X.iloc[:, 1], s=30, label=oldy, edgecolor='black',
                cmap=plt.cm.RdYlBu)
###---------------------------end for loop ----------------------------------
#plt.suptitle("Decision surface of a decision tree using paired features: Student Admissions")
#plt.legend(loc='lower right', borderpad=0, handletextpad=0)
#plt.axis("tight")
#
#plt.figure()
######################################################
## MORE DT Vis...................................
###########################################################
# from io import StringIO  ## sklearn.externals.six was removed; use io.StringIO
# dot_data2 = StringIO()
# export_graphviz(MyDT_R, out_file=dot_data2,
# filled=True, rounded=True,
# special_characters=True,
# feature_names = StudentTrainData_Num.columns)
# #class_names=['0','1'])
# graph = pydotplus.graph_from_dot_data(dot_data2.getvalue())
# graph.write_png('StudentData.png')
# Image(graph.create_png())
#########################################################
##
## Random Forest
##
#################################################################
RF1 = RandomForestClassifier()
RF1.fit(StudentTrainData_Num, StudentTrainLabels_Num)
RF1_pred=RF1.predict(StudentTestData_Num)
bn_matrix_RF = confusion_matrix(StudentTestLabels_Num, RF1_pred)
print("\nThe confusion matrix is:")
print(bn_matrix_RF)
################# VIS RF---------------------------------
Features=StudentTrainData_Num.columns
#Targets=StudentTestLabels_Num
fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4,4), dpi=800)
tree.plot_tree(RF1.estimators_[0],
feature_names = Features,
#class_names=Targets,
filled = True)
fig.savefig('RF_Tree') ## creates png
#####------------------> View estimator Trees in RF
fig2, axes2 = plt.subplots(nrows = 1,ncols = 3,figsize = (10,2), dpi=900)
for index in range(0, 3):
    tree.plot_tree(RF1.estimators_[index],
                   feature_names = Features,
                   filled = True,
                   ax = axes2[index])
    axes2[index].set_title('Estimator: ' + str(index), fontsize = 11)
fig2.savefig('THREEtrees_RF.png')
##@@@@@@@@@@@@@@@@@@@@@@@
#############################################
########### SVM ############################
#############################################
#from sklearn.svm import LinearSVC
### NOTE - We CANNOT use SVM directly on the data.
### SVMs do not run on qualitative data.
############ MUST NORMALIZE THE DATA!! ################
## This is done above. Notice the _S for scale after each DF
##-----
SVM_Model1=LinearSVC(C=1)
SVM_Model1.fit(StudentTrainData_Num_S, StudentTrainLabels_Num)
#print("SVM prediction:\n", SVM_Model1.predict(StudentTestData_Num_S))
#print("Actual:")
#print(StudentTestLabels_Num)
SVM_matrix = confusion_matrix(StudentTestLabels_Num, SVM_Model1.predict(StudentTestData_Num_S))
print("\nThe confusion matrix is:")
print(SVM_matrix)
print("\n\n")
#--------------other kernels
## RBF
SVM_Model2=sklearn.svm.SVC(C=1, kernel='rbf', gamma="auto")  ## degree is ignored by the rbf kernel
SVM_Model2.fit(StudentTrainData_Num_S, StudentTrainLabels_Num)
#print("SVM prediction:\n", SVM_Model2.predict(StudentTestData_Num_S))
#print("Actual:")
#print(StudentTestLabels_Num)
SVM_matrix2 = confusion_matrix(StudentTestLabels_Num, SVM_Model2.predict(StudentTestData_Num_S))
print("\nThe confusion matrix is:")
print(SVM_matrix2)
print("\n\n")
##------------------------------
## POLY
SVM_Model3=sklearn.svm.SVC(C=100, kernel='poly', degree=2, gamma="auto")
SVM_Model3.fit(StudentTrainData_Num_S, StudentTrainLabels_Num)
#print("SVM prediction:\n", SVM_Model3.predict(StudentTestData_Num_S))
#print("Actual:")
#print(StudentTestLabels_Num)
SVM_matrix3 = confusion_matrix(StudentTestLabels_Num, SVM_Model3.predict(StudentTestData_Num_S))
print("\nThe confusion matrix is:")
print(SVM_matrix3)
print("\n\n")