Stemming, Lemmatization ("Lemming"), and Normalization – Text Data Prep


# -*- coding: utf-8 -*-
"""
@author: profa
"""

###########################################
##
##
##  READ ME FIRST
##
##
## This code does not "run linearly."
## Instead, it is a collection of different options
## that can be used independently or combined in many ways.
##
## To use this code, review and understand it first.
## Then comment or uncomment (and adjust) what you wish
## 
## You will also need to CREATE YOUR OWN DATA to use
## this code.
############################################################


import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import os
import re

#---------------------------------------------------------
##
## Stemming and Lemming
##
##---------------------------------------------------------
from nltk.stem import WordNetLemmatizer 
LEMMER = WordNetLemmatizer() 

from nltk.stem.porter import PorterStemmer
STEMMER=PorterStemmer()
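
## NOTE: the WordNetLemmatizer needs the WordNet corpus on disk.
## A minimal one-time setup sketch (assumes internet access; the
## resource names are the standard NLTK download IDs) - uncomment
## and run once if you have not downloaded these before:
#import nltk
#nltk.download('wordnet')   ## WordNet data used by WordNetLemmatizer
#nltk.download('omw-1.4')   ## Open Multilingual WordNet (needed by newer NLTK)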


print(STEMMER.stem("singing"))
print(LEMMER.lemmatize("singers"))

print(STEMMER.stem("singers"))
print(STEMMER.stem("sings"))
print(STEMMER.stem("songs"))
print(STEMMER.stem("analysis"))


# Use NLTK's PorterStemmer in a function
def MY_STEMMER(str_input):
    words = re.sub(r"[^A-Za-z\-]", " ", str_input).lower().split()
    words = [STEMMER.stem(word) for word in words]
    return words

# Use NLTK's WordNetLemmatizer in a function
def MY_LEMMER(str_input):
    words = re.sub(r"[^A-Za-z\-]", " ", str_input).lower().split()
    words = [LEMMER.lemmatize(word) for word in words]
    return words

print(MY_STEMMER("Hiking is loved by hikers and hikes!! 1234"))
print(MY_LEMMER("Hiking is loved by hikers and hikes!! 1234"))




## Put YOUR path here and create your own corpus. Mine is
## called Dog_Hike. The Dog_Hike corpus contains 20 .txt documents,
## some about dogs and some about hiking, each a different length.
path="C:\\Users\\profa\\Documents\\Python Scripts\\TextMining\\DATA\\Dog_Hike"


#print("calling os...")
#print(os.listdir(path))
FileNameList=os.listdir(path)
print(FileNameList)

ListOfCompleteFiles=[]

for name in os.listdir(path):
    print(path+ "\\" + name)
    next1=path+ "\\" + name
    ListOfCompleteFiles.append(next1)
#print("DONE...")
print("full list...")
print(ListOfCompleteFiles)
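
## A more portable way to build the same list (a sketch using only the
## standard library and the same `path` variable as above):
#ListOfCompleteFiles = [os.path.join(path, name) for name in os.listdir(path)]
#print(ListOfCompleteFiles)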


##### For TfidfVectorizer Options----------------
MyVect_TF=TfidfVectorizer(input='filename', stop_words="english",max_features=50)
Vect = MyVect_TF.fit_transform(ListOfCompleteFiles)
## get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
ColumnNamesTF=MyVect_TF.get_feature_names_out()
CorpusDF_TF=pd.DataFrame(Vect.toarray(),columns=ColumnNamesTF)
print(CorpusDF_TF)
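
## To see which terms the TF-IDF vectorizer kept and how it weights them,
## a small inspection sketch (vocabulary_ and idf_ are standard attributes
## of a fitted TfidfVectorizer; the sorting is only for readable output):
#for term, idx in sorted(MyVect_TF.vocabulary_.items()):
#    print(term, MyVect_TF.idf_[idx])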




##### For CountVectorizer Options----------------
MyVect1=CountVectorizer(input='filename', stop_words="english")
#MyVect1=CountVectorizer(input='filename', stop_words="english", max_df=4, min_df=2)
#MyVect1=CountVectorizer(input='filename', stop_words="english", max_features=12)
##path="C:\\Users\\profa\\Documents\\Python Scripts\\TextMining\\DATA\\SmallTextDocs"
Vect_DH = MyVect1.fit_transform(ListOfCompleteFiles)
CV_Stopwords=MyVect1.get_stop_words()
ColumnNames1=MyVect1.get_feature_names_out()
print(len(ColumnNames1))
CorpusDF_DH=pd.DataFrame(Vect_DH.toarray(),columns=ColumnNames1)
print(CorpusDF_DH)

## Aggregate by hand
#CorpusDF_DH["dog"]= CorpusDF_DH["dog"]+ CorpusDF_DH["dogs"]
#CorpusDF_DH= CorpusDF_DH.drop(["dogs"], axis=1)


## CV_Stopwords is the built-in English stop word list the vectorizer used;
## confirm that "and" is in it
print(CV_Stopwords)
print(list(CV_Stopwords).index("and"))

#RemoveWords=["plan", "great"]
RemoveWords=["and"]

## Drop unwanted columns (words) from the document-term matrix.
## NOTE: You can also use this loop to CONTROL which words stay as
## columns - for example, keep only words between lengths 5 and 9.
## Here we remove columns whose names contain non-letters, are shorter
## than 3 characters (like "it" or "of"), are longer than 15 characters,
## or appear in RemoveWords.
for nextcol in CorpusDF_DH.columns:
    if(re.search(r'[^A-Za-z]+', nextcol)):
        #print(nextcol)
        CorpusDF_DH= CorpusDF_DH.drop([nextcol], axis=1)
    elif(len(str(nextcol))<3):
        print(nextcol)
        CorpusDF_DH= CorpusDF_DH.drop([nextcol], axis=1)
    elif(len(str(nextcol))>15):
        print(nextcol)
        CorpusDF_DH= CorpusDF_DH.drop([nextcol], axis=1)
    elif(nextcol in RemoveWords):
        print(nextcol)
        CorpusDF_DH= CorpusDF_DH.drop([nextcol], axis=1)
        

print(CorpusDF_DH)
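
## An equivalent one-pass alternative to the loop above (a sketch; the
## same filtering rules, collected into a single drop call):
#BadCols = [c for c in CorpusDF_DH.columns
#           if re.search(r'[^A-Za-z]+', c) or len(c) < 3
#           or len(c) > 15 or c in RemoveWords]
#CorpusDF_DH = CorpusDF_DH.drop(columns=BadCols)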
## Row sums = total (kept) word count per document
TheSums=CorpusDF_DH.sum(axis=1)
print(TheSums)


## Include a SUM column...
#CorpusDF_DH["SUM"] = CorpusDF_DH.agg("sum", axis=1)
## Alternative method...
#CorpusDF_DH["SUM"] = CorpusDF_DH.sum(axis=1)

## Divide each row by its SUM to normalize - rows become relative frequencies
NormDF = CorpusDF_DH.div(TheSums, axis=0)
print(NormDF)
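
## Sanity check (a sketch): after normalizing, each row should sum to
## (approximately) 1.0; a document whose SUM was zero would show NaN instead.
#print(NormDF.sum(axis=1))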




##---------------------
## Using Stemming and Lemming
##-------------------------------------
## NOTE: with a custom tokenizer, scikit-learn may warn that the stop
## word list is "not consistent" with your preprocessing (stop words are
## matched AFTER stemming/lemmatizing); that warning is expected here.
MyVect_STEM=CountVectorizer(input='filename',
                        analyzer = 'word',
                        stop_words='english',
                        tokenizer=MY_STEMMER,
                        lowercase = True,
                        max_features=12
                        )

Vect_Stem = MyVect_STEM.fit_transform(ListOfCompleteFiles)
ColumnNames_s=MyVect_STEM.get_feature_names_out()
CorpusDF_Stem=pd.DataFrame(Vect_Stem.toarray(),columns=ColumnNames_s)
print(CorpusDF_Stem)


MyVect_LEM=CountVectorizer(input='filename',
                        analyzer = 'word',
                        stop_words='english',
                        tokenizer=MY_LEMMER,
                        lowercase = True,
                        max_features=12
                        )


Vect_LEM = MyVect_LEM.fit_transform(ListOfCompleteFiles)
ColumnNames_lem=MyVect_LEM.get_feature_names_out()
CorpusDF_LEM=pd.DataFrame(Vect_LEM.toarray(),columns=ColumnNames_lem)
print(CorpusDF_LEM)
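
## To see how the two preprocessing choices differ, a quick comparison
## sketch of the vocabularies each vectorizer kept (plain Python sets over
## the column names; nothing here is specific to any particular corpus):
#print("Stemmed-only terms:   ", set(ColumnNames_s) - set(ColumnNames_lem))
#print("Lemmatized-only terms:", set(ColumnNames_lem) - set(ColumnNames_s))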