# Creating n-gram DataFrames in Python


# -*- coding: utf-8 -*-
"""

@author: profa
"""

#####################################
##
##   CountVectorizer and n grams
##
##   Corpus text data - two folders
##
##  Gates
##
## INSTRUCTIONS
## 
## To use this code, you will need two corpora
## on your computer. You will need to UPDATE
## the paths below to match YOUR paths.
##
## Here are examples of the DOG and HIKE corpora
## I am using below
## https://drive.google.com/drive/folders/1Lm5HlK51q2JyrMbMwv9tS3WA34QxmXNN?usp=sharing
## https://drive.google.com/drive/folders/1k0GHdPafJqUFaJ6wFD9HNyc0lFK1AdSl?usp=sharing
########################################

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import os
import re

## YOU WILL NEED TO UPDATE THIS PATH
## YOU WILL NEED TO UPDATE THIS PATH
## Root folder containing one sub-folder per corpus (e.g. DOG and
## HIKE), each sub-folder holding plain-text files.
corpus_path = "C:/Users/profa/Desktop/UCB/Text Mining/DATA/"

print("calling os...")
## One entry per corpus sub-folder under corpus_path.
CorpusNameList = os.listdir(corpus_path)
#print(CorpusNameList)

ListOfCompleteFiles = []   # full paths - fed to the vectorizers below
ListOfFileNames = []       # bare file names - kept for reference/labels

for folder_name in CorpusNameList:
    ## The outer loop enters each corpus folder; the inner loop
    ## collects every file inside it.
    ## os.path.join builds the path portably instead of
    ## hand-concatenating "/" between the pieces.
    next_corpus = os.path.join(corpus_path, folder_name)
    for fname in os.listdir(next_corpus):
        ListOfCompleteFiles.append(os.path.join(next_corpus, fname))
        ListOfFileNames.append(fname)

print("full list...")
print(ListOfCompleteFiles)
print(ListOfFileNames)


##### Instantiate Vectorizers----------------
## CountVectorizer -> raw term counts.
## TfidfVectorizer -> tf-idf weights, English stopwords removed,
## vocabulary capped at the 50 highest-frequency terms.
## input="filename" means fit_transform receives a list of file paths
## and reads each file itself.
MyVect_CV = CountVectorizer(input="filename")
MyVect_TF = TfidfVectorizer(input='filename', stop_words="english", max_features=50)

## NOTE: get_feature_names() was removed in scikit-learn 1.2;
## get_feature_names_out() is the supported replacement.
Vect_TF = MyVect_TF.fit_transform(ListOfCompleteFiles)
ColumnNamesTF = MyVect_TF.get_feature_names_out()
## .toarray() densifies the sparse matrix: one row per document,
## one column per vocabulary term.
CorpusDF_TF = pd.DataFrame(Vect_TF.toarray(), columns=ColumnNamesTF)
print(CorpusDF_TF)

Vect_CV = MyVect_CV.fit_transform(ListOfCompleteFiles)
ColumnNamesCV = MyVect_CV.get_feature_names_out()
CorpusDF_CV = pd.DataFrame(Vect_CV.toarray(), columns=ColumnNamesCV)
print(CorpusDF_CV)


####### Remove garbage columns
## Keep only columns whose names are purely alphabetic and whose
## length lies in [3, 20].
## NOTE: len < 3 drops 1- and 2-letter words ("it", "of");
## 3-letter words such as "pre" are KEPT. Adjust the bounds to
## control which word lengths survive (e.g. only lengths 5-9).
## Collecting the doomed names first and dropping once avoids
## rebuilding the DataFrame on every single drop (the original
## repeated-drop loop was O(n^2) in the number of columns).
bad_columns = [
    col for col in CorpusDF_CV.columns
    if re.search(r'[^A-Za-z]+', col)   # contains a digit/punctuation
    or len(str(col)) < 3               # too short
    or len(str(col)) > 20              # too long
]
CorpusDF_CV = CorpusDF_CV.drop(columns=bad_columns)

print(CorpusDF_CV)
########################################################
########################################################

##### Using n-grams with n > 1----------------

#############################################################
## We can split (tokenize) our text into sequences of one or
## more words called n-grams.
## When n = 1 we split into single words (unigrams, the default),
## such as "not" or "happy".
## When n = 2 we split into pairs of words (bigrams), such as
## "not happy".
## The length n is defined by passing a tuple to the ngram_range
## argument: ngram_range=(min_n, max_n), default (1, 1).
## Example: (1, 2) produces both 1-grams and 2-grams.
########################################################################

## Count bigrams only.
MyVect_CV2 = CountVectorizer(input="filename", ngram_range=(2, 2))

## Tf-idf over bigrams and trigrams, English stopwords removed.
MyVect_TF2 = TfidfVectorizer(input='filename',
                             stop_words="english",
                             #max_features=20,
                             ngram_range=(2, 3))

Vect_TF2 = MyVect_TF2.fit_transform(ListOfCompleteFiles)
## get_feature_names() was removed in scikit-learn 1.2;
## get_feature_names_out() is the supported replacement.
ColumnNamesTF2 = MyVect_TF2.get_feature_names_out()
CorpusDF_TF2 = pd.DataFrame(Vect_TF2.toarray(), columns=ColumnNamesTF2)
print(CorpusDF_TF2)

## Remove any n-gram column that contains characters other than
## letters and the spaces separating the words of the n-gram.
## BUG FIX: inside a character class "+" is a literal plus, not a
## quantifier, so the original pattern [^A-Za-z\s+] wrongly treated
## "+" as an allowed character and columns containing "+" escaped
## removal. \s already covers the spaces between n-gram words.
bad_tf2_columns = [item for item in CorpusDF_TF2.columns
                   if re.search(r'[^A-Za-z\s]', item)]
for item in bad_tf2_columns:
    print(item)   # show each column being dropped
CorpusDF_TF2 = CorpusDF_TF2.drop(columns=bad_tf2_columns)

print(CorpusDF_TF2)



Vect_CV2 = MyVect_CV2.fit_transform(ListOfCompleteFiles)
## get_feature_names() was removed in scikit-learn 1.2;
## get_feature_names_out() is the supported replacement.
ColumnNamesCV2 = MyVect_CV2.get_feature_names_out()
CorpusDF_CV2 = pd.DataFrame(Vect_CV2.toarray(), columns=ColumnNamesCV2)
print(CorpusDF_CV2)
print(CorpusDF_CV2.columns)

## Transposed view: rows = bigrams, columns = documents.
print(CorpusDF_CV2.T)

## Remove any bigram column containing characters other than letters
## and the internal space of the bigram.
## BUG FIX: inside a character class "+" is a literal plus, so the
## original pattern [^A-Za-z\s+] wrongly let "+" through; \s already
## covers the spaces between n-gram words.
bad_cv2_columns = [item for item in CorpusDF_CV2.columns
                   if re.search(r'[^A-Za-z\s]', item)]
for item in bad_cv2_columns:
    print(item)   # show each column being dropped
CorpusDF_CV2 = CorpusDF_CV2.drop(columns=bad_cv2_columns)

print(CorpusDF_CV2)