# -*- coding: utf-8 -*-
"""
@author: profa
"""
#####################################
##
## CountVectorizer and n grams
##
## Corpus text data - two folders
##
## Gates
##
## INSTRUCTIONS
##
## To use this code, you will need two corpora
## on your computer. You will need to UPDATE
## the paths below to match YOUR paths.
##
## Here are examples of the DOG and HIKE corpora
## I am using below
## https://drive.google.com/drive/folders/1Lm5HlK51q2JyrMbMwv9tS3WA34QxmXNN?usp=sharing
## https://drive.google.com/drive/folders/1k0GHdPafJqUFaJ6wFD9HNyc0lFK1AdSl?usp=sharing
########################################
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import os
import re
## YOU WILL NEED TO UPDATE THIS PATH to the folder that CONTAINS your
## corpus folders (each sub-folder is one corpus, e.g. DOG/ and HIKE/).
corpus_path = "C:/Users/profa/Desktop/UCB/Text Mining/DATA/"

## Empty List
all_file_names = []  # NOTE(review): never used later in this script
print("calling os...")

## One entry per corpus folder inside corpus_path.
CorpusNameList = os.listdir(corpus_path)

ListOfCompleteFiles = []  # full paths - these feed the vectorizers below
ListOfFileNames = []      # bare file names, kept for reference

for folder_name in CorpusNameList:
    ## The outer loop enters each corpus folder;
    ## os.path.join is safer than string concatenation for paths.
    next_corpus = os.path.join(corpus_path, folder_name)
    for fname in os.listdir(next_corpus):
        ## The inner loop records every file inside that corpus.
        ListOfCompleteFiles.append(os.path.join(next_corpus, fname))
        ListOfFileNames.append(fname)

print("full list...")
print(ListOfCompleteFiles)
print(ListOfFileNames)
##### Instantiate Vectorizers----------------
## input="filename" tells sklearn each list element is a PATH to read,
## not the document text itself.
MyVect_CV = CountVectorizer(input="filename")
MyVect_TF = TfidfVectorizer(input="filename", stop_words="english", max_features=50)

## Fit the TF-IDF vectorizer on the corpus files and build a labeled DataFrame
## (one row per document, one column per vocabulary word).
Vect_TF = MyVect_TF.fit_transform(ListOfCompleteFiles)
## get_feature_names() was removed in scikit-learn 1.2;
## get_feature_names_out() is the supported replacement.
ColumnNamesTF = MyVect_TF.get_feature_names_out()
CorpusDF_TF = pd.DataFrame(Vect_TF.toarray(), columns=ColumnNamesTF)
print(CorpusDF_TF)

## Same steps for the raw-count vectorizer.
Vect_CV = MyVect_CV.fit_transform(ListOfCompleteFiles)
ColumnNamesCV = MyVect_CV.get_feature_names_out()
CorpusDF_CV = pd.DataFrame(Vect_CV.toarray(), columns=ColumnNamesCV)
print(CorpusDF_CV)
####### Remove garbage columns from the CountVectorizer DataFrame.
## A column name is "garbage" when it contains any non-letter character,
## or is shorter than 3 characters (e.g. "it", "of"),
## or is longer than 20 characters (likely run-together junk tokens).
## NOTE: You can also use this filter to CONTROL the vocabulary -
## for example, keep only words whose length is between 5 and 9.
garbage_columns = [
    col for col in CorpusDF_CV.columns
    if re.search(r'[^A-Za-z]+', col)
    or len(str(col)) < 3
    or len(str(col)) > 20
]
## Drop them all in ONE call; dropping one column at a time rebuilds the
## whole DataFrame on every iteration (quadratic work for many columns).
CorpusDF_CV = CorpusDF_CV.drop(columns=garbage_columns)
print(CorpusDF_CV)
########################################################
##### Using n- grams with n > 1----------------
#############################################################
## We can split (tokenize)
## our text into one or more words called n-grams,
## When n = 1, we are splitting into single words.
## When n = 2, we are splitting into pairs of words.
## Define the length n by passing a tuple to the
## ngram_range argument.
## For example, 1,1 would give us unigrams or 1-grams such as "not"
## or "happy", etc.
## which we have. This is the default.
## Using 2,2 would give us bigrams or 2-grams, such as “not happy”.
## ngram_rangetuple (min_n, max_n), default=(1, 1)
## Example: (1,2) means 1-grams and 2-grams
########################################################################
## Bigram count vectorizer: every column will be a PAIR of adjacent words.
MyVect_CV2 = CountVectorizer(input="filename", ngram_range=(2, 2))
## TF-IDF over 2-grams AND 3-grams, with English stopwords removed.
MyVect_TF2 = TfidfVectorizer(input="filename",
                             stop_words="english",
                             # max_features=20,
                             ngram_range=(2, 3))
Vect_TF2 = MyVect_TF2.fit_transform(ListOfCompleteFiles)
## get_feature_names() was removed in scikit-learn 1.2; use the *_out version.
ColumnNamesTF2 = MyVect_TF2.get_feature_names_out()
CorpusDF_TF2 = pd.DataFrame(Vect_TF2.toarray(), columns=ColumnNamesTF2)
print(CorpusDF_TF2)

## Remove columns whose n-gram contains anything besides letters and spaces
## (n-gram names contain spaces between words, so \s must stay allowed).
## BUG FIX: the old class [^A-Za-z\s+] placed a literal '+' INSIDE the
## negated set, wrongly whitelisting '+' characters as well.
for item in CorpusDF_TF2.columns:
    if re.search(r'[^A-Za-z\s]', item):
        print(item)
        CorpusDF_TF2 = CorpusDF_TF2.drop([item], axis=1)
print(CorpusDF_TF2)
## Fit the bigram count vectorizer and build its document-term DataFrame.
Vect_CV2 = MyVect_CV2.fit_transform(ListOfCompleteFiles)
## get_feature_names() was removed in scikit-learn 1.2; use the *_out version.
ColumnNamesCV2 = MyVect_CV2.get_feature_names_out()
CorpusDF_CV2 = pd.DataFrame(Vect_CV2.toarray(), columns=ColumnNamesCV2)
print(CorpusDF_CV2)
print(CorpusDF_CV2.columns)
print(CorpusDF_CV2.T)  # transposed view: bigrams as rows, documents as columns

## Remove columns whose bigram contains anything besides letters and spaces.
## BUG FIX: the old class [^A-Za-z\s+] wrongly whitelisted literal '+' chars.
for item in CorpusDF_CV2.columns:
    if re.search(r'[^A-Za-z\s]', item):
        print(item)
        CorpusDF_CV2 = CorpusDF_CV2.drop([item], axis=1)
print(CorpusDF_CV2)