Twitter API in Python



As a first step, you will need to get a Twitter Developer Account and the four Twitter credentials (the API key, API secret key, access token, and access token secret) that will allow you to access Tweets. Twitter imposes LIMITS on use. If you exceed these limits, it can appear that your code has failed. To avoid confusion, review the Twitter documentation.
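
Tip: if you bump into rate limits while testing, Tweepy can wait and retry automatically. A minimal sketch, using the same auth object that is built in the code below (wait_on_rate_limit is a standard tweepy.API option):

api = tweepy.API(auth, wait_on_rate_limit=True)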



Once you have a Twitter Dev account and the 4 codes that are needed, you can use the following code.

This code is complex and advanced.

  1. As with all of my code – please use it to LEARN and not to just copy/paste. If you do use large portions, please REFERENCE my name 🙂
  2. You cannot “just use” this code. Some lines are for my computer and will not work on yours because our paths and locations are not the same. As such, to use this code you will need to UNDERSTAND each line. That’s the goal!

What Does This Code Do?

  1. It accesses Twitter in Python and collects Tweets on three hashtags.
  2. It creates, cleans, and formats the data into LABELED, tokenized, and vectorized TEXT data where each row is a Tweet and each column is a word (see the small sketch after this list).
  3. This code uses a lot of advanced Python that will be FUN TO LEARN.
  4. Review the code, then use it as a tool to create your own.
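
To make item 2 concrete, here is a minimal sketch (with made-up example tweets) of the row-per-Tweet, column-per-word structure that the vectorizing step at the end of the code produces:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

ToyTweets = ["football is great", "great game of football"]  ## hypothetical examples
MyCV = CountVectorizer()
MyDTM = MyCV.fit_transform(ToyTweets)
## (on older scikit-learn, use MyCV.get_feature_names() here)
MyDF = pd.DataFrame(MyDTM.toarray(), columns=MyCV.get_feature_names_out())
print(MyDF)   ## one row per tweet, one column per word, values are counts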



##TwitterMining_Token_WordCloud.py
## Gates 
##
## json Tweets - cleaning

######################################################
## NOTE !!!!!!!!!!!!
##
## USAGE:
    ## You can run this code over and over with
    ## different hashtag options.
    ## For each hashtag, two files are created:
    ## file_<nohashname>.txt          (tweet text)
    ## file_rawtweets_<nohashname>.txt  (raw json)
    ## For example, if we run #football, the files
    ## will be called:
        ## file_football.txt and file_rawtweets_football.txt
    ## At the end, the combined labeled data is written to
    ## TweetCSV2.csv and Final_Tweets_DF_Labeled.csv
   
    
    
    #####################################################

###Packages-----------------------
import pandas as pd
import tweepy
#conda install -c conda-forge tweepy
##
## HOW TO downgrade to Tweepy 3.7:
## pip install tweepy==3.7
## conda install should work too - just specify 3.7

from tweepy import OAuthHandler
import json
from tweepy import Stream
from tweepy.streaming import StreamListener
import sys
#
#
#
# NOTICE:
    #Tweepy v4.0.0 was released,
    # and it merged StreamListener into Stream.
    # In v4, you subclass Stream instead.
    # This may affect the code below.
    # It depends on your version of Tweepy, etc.
    #
    ## Issues with Tweepy etc?
    ## Here is the reference:
        ## https://docs.tweepy.org/en/latest/api.html
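## To check which Tweepy version you have installed, you can run:
#print("Tweepy version:", tweepy.__version__)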

from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer


from os import path
#from scipy.misc import imread
import matplotlib.pyplot as plt
##install wordcloud
## conda install -c conda-forge wordcloud
## May also have to run conda update --all on cmd
#import PIL
#import Pillow
#import wordcloud
from wordcloud import WordCloud, STOPWORDS
import os
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
###-----------------------------------------


## All 4 keys are in my TwitterCodesFile.txt and are comma sep
filename="C://users//profa//desktop//TwitterCodesFile.txt"
with open(filename, "r") as FILE:
    keys=[i for line in FILE for i in line.split(',')]
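## The codes file is expected to hold the four values on one line,
## comma-separated, in this order:
## consumer_key,consumer_secret,access_token,access_secret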
    
#API Key:
consumer_key = keys[0]
#API Secret Key:
consumer_secret =keys[1]
#Access Token:
access_token =keys[2]
#Access Token Secret:
access_secret =keys[3]


auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
 
api = tweepy.API(auth)
##-----------------------------------------------------------------
#Other Tweepy options - FYI
#for status in tweepy.Cursor(api.home_timeline).items(10):
#    print(status.text)   #process a single status
#
#def Gather(tweet):
#    print(json.dumps(tweet))
#for friend in tweepy.Cursor(api.friends).items():
#    Gather(friend._json)
#--------------------------------------------------------------

 
class Listener(StreamListener):
    print("In Listener...") 
    tweet_number=0
    #__init__ runs as soon as an instance of the class is created
    def __init__(self, max_tweets, hfilename, rawfile, TweetsList, LabelsList, nohashname):
        super().__init__()  #initialize the parent StreamListener
        self.max_tweets=max_tweets
        #store the rest of the arguments so on_data can use them
        self.hfilename=hfilename
        self.rawfile=rawfile
        self.TweetsList=TweetsList
        self.LabelsList=LabelsList
        self.nohashname=nohashname
        print(self.max_tweets)     
    #on_data() is a function of StreamListener, as are on_error and on_status    
    def on_data(self, data):
        self.tweet_number+=1 
        print("In on_data", self.tweet_number)
        try:
            print("In on_data in try")
            with open(self.hfilename, 'a') as f:
                with open(self.rawfile, 'a') as g:
                    tweet=json.loads(data)
                    ## RE: https://realpython.com/python-json/
                    tweet_text=tweet["text"]
                    #print(tweet_text,"\n")
                    self.TweetsList.append(tweet_text)
                    self.LabelsList.append(self.nohashname)
                    #print(self.TweetsList)
                    f.write(tweet_text) # the text from the tweet
                    json.dump(tweet, g)  #write the raw tweet
        except BaseException as e:
            print("Could not process this tweet:", e)
            pass
        if self.tweet_number>=self.max_tweets:
            #sys.exit('Limit of '+str(self.max_tweets)+' tweets reached.')
            print("Got ", str(self.max_tweets), "tweets.")
            return False  #returning False disconnects the stream
    #method for on_error()
    def on_error(self, status):
        print("ERROR")
        print(status)   #401 means your keys are not working
        if(status==420):
            print("Error ", status, "rate limited")
            return False
#----------------end of class Listener
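
## NOTE: On Tweepy v4+, StreamListener is gone (see the NOTICE above).
## A minimal sketch of the equivalent v4 pattern - check the Tweepy
## docs for your version before using it:
#class Listener4(tweepy.Stream):
#    def on_data(self, raw_data):
#        tweet=json.loads(raw_data)
#        print(tweet["text"])
#
#stream4 = Listener4(consumer_key, consumer_secret, access_token, access_secret)
#stream4.filter(track=["#football"], languages=["en"])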



######################################
##
##       Loop that will gather tweets
##       for the specified hashtags
##
#########################################################

Hashes = ["trump", "biden", "football"]    
numtweets= 2  ## you can change this number!
CompleteTweetList=[]
CompleteLabelsList=[]


for hashname in Hashes:   
    
    ##########################################################
    ## These commented-out lines can be integrated if you want
    ## the USER to enter the hashname and the number of tweets 
    ## to get:
    #hashname=input("Enter the hash name, such as #womensrights: ") 
    #numtweets=eval(input("How many tweets do you want to get?: "))
    
    # if(hashname[0]=="#"):
    #     nohashname=hashname[1:] #remove the hash
    # else:
    #     nohashname=hashname
    #     hashname="#"+hashname
    ##############################################################
    
    print("getting....", hashname,"\n")
    
    nohashname=hashname
    hashname="#"+hashname
    #Create a file for any hash name    
    hfilename="file_"+nohashname+".txt"
    ## For example, file_football.txt if you used #football
    rawfile="file_rawtweets_"+nohashname+".txt"
    ## For example, file_rawtweets_football.txt
    ## Notice that the raw file is in json
    ## The hfilename is just text
    TweetsList=[]
    LabelsList=[]
    
    ################ Get the tweets..................................
    twitter_stream = Stream(auth, Listener(numtweets, 
                                           hfilename, rawfile, 
                                           TweetsList, LabelsList, 
                                           nohashname))
    ## https://developer.twitter.com/en/docs/twitter-api/tweets/filtered-stream/introduction
    ## https://developer.twitter.com/en/docs/twitter-api/enterprise/powertrack-api/overview
    #twitter_stream.filter(track=['#womensrights'])
    twitter_stream.filter(track=[hashname], languages=["en"])
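    ## filter() streams tweets until on_data returns False -
    ## here, that happens after numtweets tweets are collected.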
    ##..................................................................
    
    ## Save each Tweet in a list
    ## This will create a LIST OF CONTENT
    ## that you can use with CountVectorizer, etc.
    
    print(TweetsList)
    print(LabelsList)
    
    ## Add these to the complete lists
    ## extend (rather than append) will add the elements
    ## one by one to the existing list. 
    CompleteTweetList.extend(TweetsList)
    CompleteLabelsList.extend(LabelsList)
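    ## For example (hypothetical lists):
    ##   L=[1,2]; L.append([3,4])  gives  [1, 2, [3, 4]]
    ##   L=[1,2]; L.extend([3,4])  gives  [1, 2, 3, 4]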
    
    

######################## END OF LOOP #############################

print(CompleteTweetList)
print(CompleteLabelsList)

############################################
##
## The next goal is to create a csv file
## where the first column is the label
## which is the hashtag and the second column
## is the entire tweet WITH NO newlines, etc.
##
##############################################################
TW_file="TweetCSV2.csv"
FILE=open(TW_file, "w")
WhatToWrite="LABEL,Tweet\n"
FILE.write(WhatToWrite)
FILE.close()

## This fun python trick allows you to loop
## through two lists at once, in parallel
ZIP_List = zip(CompleteTweetList, CompleteLabelsList)
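## For example: list(zip(["tweet1","tweet2"], ["lab1","lab2"]))
## gives [('tweet1', 'lab1'), ('tweet2', 'lab2')]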

with open(TW_file, "a") as f:
    for TW, Lab in ZIP_List:
        TW=TW.replace("\n", " ")   #newlines would break the csv rows
        TW=TW.replace(",", " ")    #commas would break the csv columns
        TW=re.sub(r"[^A-Za-z\-]", " ", TW)  #keep only letters and hyphens
        TW=re.sub(r"\bhttps?\b", " ", TW)   #drop leftover url fragments
        ## Why do we need to do this? Commas and newlines are csv
        ## delimiters, and url fragments like http/https are noise.
        #print("NEXT\n")
        print(TW)
        print(Lab)
        WhatToWrite=str(Lab+","+TW+"\n")
        f.write(WhatToWrite)
    
        
    

## OK!! We have now created a labeled csv
## file where each row is a tweet.
## Next, we can use CountVectorizer to 
## Convert this to a labeled DF    


###################################################


MyCV_T=CountVectorizer(input='content',
                        stop_words='english',
                        #max_features=100
                        )


### READ our csv file of tweets back in as a dataframe.
## We will then pull out two lists: the labels and the tweets.

FileContents=pd.read_csv(TW_file)
print(FileContents["LABEL"])
print(FileContents["Tweet"])

## CONVERT the tweets to a LIST so we can use
## CountVectorizer
TweetsList = FileContents["Tweet"].to_list()
print(TweetsList)

LABEL_List = FileContents["LABEL"].to_list()
print(LABEL_List)

My_DTM_T=MyCV_T.fit_transform(TweetsList)

## BUT - we are not done! Right now, we have a DTM. 
## We actually want a dataframe.
## Let's convert our DTM to a DF

## TWO Steps:
    ## First - use your CountVectorizer to get all the column names
ColNames=MyCV_T.get_feature_names()
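## NOTE: newer versions of scikit-learn renamed this method -
## there, use MyCV_T.get_feature_names_out() instead.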
print("The vocab is: ", ColNames, "\n\n")

## NEXT - Use pandas to create data frames
My_DF_T=pd.DataFrame(My_DTM_T.toarray(),columns=ColNames)

## Let's look!
print(My_DF_T)
print(LABEL_List)

## Remove all columns that contain any numbers/digits
#for i in My_DF_T.columns:
#    print(i)

#droplist = [i for i in My_DF_T.columns if re.search(r"\d", i)] 
#print(droplist)      
#My_DF_T.drop(droplist,axis=1,inplace=True)


## Some clean-up - NOTE that some of this is not 
## actually needed because of steps taken above.
## However, it is nice to see code for how you would do 
## these things.
## These droplists find words that have numbers or are 
## things we do not need, like http, and they remove them
## from the dataframe.

droplist1 = [i for i in My_DF_T.columns if len(i)<3] 
print(droplist1)      
My_DF_T.drop(droplist1,axis=1,inplace=True)
## axis = 1 means drop the column (not the row)
## inplace means make the update right in the dataframe

print(My_DF_T)

droplist2 = [i for i in My_DF_T.columns if re.search(r'[^A-Za-z\-]', i)] 
print(droplist2)      
My_DF_T.drop(droplist2,axis=1,inplace=True)

print(My_DF_T)

## Note: \bWORD\b matches the exact WORD, not substrings.
## For example, \brt\b matches the column 'rt' but not 'art'.
droplist3 = [i for i in My_DF_T.columns 
             if re.search(r'\b(http|https|rt)\b', i)] 
print(droplist3)  
   
My_DF_T.drop(droplist3,axis=1,inplace=True)


print(My_DF_T)


## Add the list to the dataframe to create labels. 
My_DF_T.insert(loc=0, column='LABEL', value=LABEL_List)
print(My_DF_T)

## Write the DF to a csv file
My_DF_T.to_csv("Final_Tweets_DF_Labeled.csv")
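
## Optional quick check: read the labeled csv back in.
## index_col=0 skips the row index that to_csv wrote out.
#MyDF = pd.read_csv("Final_Tweets_DF_Labeled.csv", index_col=0)
#print(MyDF.head())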