#################################################Gates
## API Example
## Gates
##
## JSON and newsapi.org
##
## YOU will need to get your own API key to run this code
## newsapi.org
##############################################################
import requests #to query the API
import re #regular expressions
import pandas as pd # for dataframes
from sklearn.feature_extraction.text import CountVectorizer
## This is one of the endpoint options you can read about on newsapi.org
## NOTE: you must substitute your own API key (from newsapi.org) below.
End = "https://newsapi.org/v2/everything"
URLPost = {'apiKey': '8f4134 YOUR KEY HERE 100f22b',
           'q': 'football',
           'sources': 'fox-news',
           'pageSize': 30,
           'language': 'en'}
## Query the API; `params=` makes it explicit that URLPost becomes
## the URL query string (?apiKey=...&q=football&...).
response3 = requests.get(End, params=URLPost)
print(response3)
## Fail fast with a clear HTTPError if the request was rejected
## (e.g. a bad/missing API key returns 401) instead of crashing
## later with an opaque KeyError on "articles".
response3.raise_for_status()
## The body is JSON text; parse it into Python dicts/lists
## so we can grab data from it below.
jsontxt3 = response3.json()
print(jsontxt3)
## Create a csv file and save one cleaned headline per row.
filename = "NewsHeadlines3.csv"
## One "w" open writes the header and then all data rows; the `with`
## context manager guarantees the file is closed even if cleaning fails.
## (The original opened "w" for the header, closed, then reopened "a" --
## the end result is identical.)
with open(filename, "w") as MyFILE:
    ## Column names go in the first row
    MyFILE.write("Date,Source,Title,Headline\n")
    ## Walk the articles in the json response
    for items in jsontxt3["articles"]:
        Source = items["source"]["id"]
        print(Source)
        ## publishedAt arrives as an ISO timestamp like
        ## 2020-01-01T12:00:00Z; keep only the date before the "T".
        Date = items["publishedAt"]
        Date = Date.split("T")[0]
        print(Date)
        ## title/description can be null in newsapi results; fall back
        ## to "" so re.sub does not raise TypeError on None.
        Title = items["title"] or ""
        ## Keep letters only, then collapse runs of whitespace. This
        ## also removes commas and newlines, so the text is safe to
        ## embed as a csv field without quoting.
        Title = re.sub(r'[^a-zA-Z]', " ", Title)
        Title = ' '.join(Title.split())
        ##----------------------------------------------------------
        Headline = items["description"] or ""
        Headline = re.sub(r'[^a-zA-Z]', " ", Headline)
        Headline = ' '.join(Headline.split())
        ### AS AN OPTION - drop words of length <= 3 from the headline
        Headline = ' '.join(wd for wd in Headline.split() if len(wd) > 3)
        WriteThis = str(Date) + "," + str(Source) + "," + str(Title) + "," + str(Headline) + "\n"
        MyFILE.write(WriteThis)
############### PROCESS THE FILE ######################
## Read the saved headlines back into a dataframe.
## NOTE(review): the name "BBC_DF" looks historical -- the query above
## actually pulls fox-news; kept as-is in case later code (outside this
## chunk) references it.
BBC_DF = pd.read_csv(filename)
print(BBC_DF.head())
## Show the column names
for col in BBC_DF.columns:
    print(col)
print(BBC_DF["Headline"])
## REMOVE any rows with NaN in them (e.g. articles whose cleaned
## description came out empty)
BBC_DF = BBC_DF.dropna()
print(BBC_DF["Headline"])
### Tokenize and Vectorize the Headlines
## Build the list of headlines; .tolist() replaces the manual
## append loop with the idiomatic pandas call.
HeadlineLIST = BBC_DF["Headline"].tolist()
print("The headline list is")
print(HeadlineLIST)
### Vectorize
## Read about CountVectorizer here.....
#https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
MyCountV = CountVectorizer(
    input="content",        # we pass raw strings, not filenames
    lowercase=True,
    stop_words="english",   # drop common English stopwords
    max_features=20         # keep only the 20 most frequent terms
)
## Fit/transform -> sparse document-term matrix (rows = headlines,
## columns = the 20 retained vocabulary terms)
MyDTM = MyCountV.fit_transform(HeadlineLIST)
print(type(MyDTM))
ColumnNames = MyCountV.get_feature_names_out()
## Densify into a labeled dataframe for inspection
MyDTM_DF = pd.DataFrame(MyDTM.toarray(), columns=ColumnNames)
print(MyDTM_DF)
## Write the document-term matrix to a csv file
MyDTM_DF.to_csv('CleanNews.csv')