#################################################Gates
## API Example
## Gates
##
## JSON and newsapi.org
##
## YOU will need to get your own API key to run this code
## newsapi.org
##############################################################
import requests #to query the API
import re #regular expressions
import pandas as pd # for dataframes
from sklearn.feature_extraction.text import CountVectorizer
## This is one of the endpoint options you can read about on newsapi.org
## NOTE: you must substitute your own API key (from newsapi.org) below.
End = "https://newsapi.org/v2/everything"
URLPost = {'apiKey': '8f4134 YOUR KEY HERE 100f22b',
           'q': 'football',
           'sources': 'fox-news',
           'pageSize': 30,
           'language': 'en'}
## Query the API; `params=` makes it explicit that URLPost becomes
## the URL query string (?apiKey=...&q=football&...).
response3 = requests.get(End, params=URLPost)
print(response3)
## Fail fast with a clear HTTPError if the request was rejected
## (e.g. a bad/missing API key returns 401) instead of crashing
## later with an opaque KeyError on "articles".
response3.raise_for_status()
## The body is JSON text; parse it into Python dicts/lists
## so we can grab data from it below.
jsontxt3 = response3.json()
print(jsontxt3)
## Create a csv file and save one cleaned headline per row.
filename = "NewsHeadlines3.csv"
## One "w" open writes the header and then all data rows; the `with`
## context manager guarantees the file is closed even if cleaning fails.
## (The original opened "w" for the header, closed, then reopened "a" --
## the end result is identical.)
with open(filename, "w") as MyFILE:
    ## Column names go in the first row
    MyFILE.write("Date,Source,Title,Headline\n")
    ## Walk the articles in the json response
    for items in jsontxt3["articles"]:
        Source = items["source"]["id"]
        print(Source)
        ## publishedAt arrives as an ISO timestamp like
        ## 2020-01-01T12:00:00Z; keep only the date before the "T".
        Date = items["publishedAt"]
        Date = Date.split("T")[0]
        print(Date)
        ## title/description can be null in newsapi results; fall back
        ## to "" so re.sub does not raise TypeError on None.
        Title = items["title"] or ""
        ## Keep letters only, then collapse runs of whitespace. This
        ## also removes commas and newlines, so the text is safe to
        ## embed as a csv field without quoting.
        Title = re.sub(r'[^a-zA-Z]', " ", Title)
        Title = ' '.join(Title.split())
        ##----------------------------------------------------------
        Headline = items["description"] or ""
        Headline = re.sub(r'[^a-zA-Z]', " ", Headline)
        Headline = ' '.join(Headline.split())
        ### AS AN OPTION - drop words of length <= 3 from the headline
        Headline = ' '.join(wd for wd in Headline.split() if len(wd) > 3)
        WriteThis = str(Date) + "," + str(Source) + "," + str(Title) + "," + str(Headline) + "\n"
        MyFILE.write(WriteThis)
############### PROCESS THE FILE ######################
## Read the saved headlines back into a dataframe.
## NOTE(review): the name "BBC_DF" looks historical -- the query above
## actually pulls fox-news; kept as-is in case later code (outside this
## chunk) references it.
BBC_DF = pd.read_csv(filename)
print(BBC_DF.head())
## Show the column names
for col in BBC_DF.columns:
    print(col)
print(BBC_DF["Headline"])
## REMOVE any rows with NaN in them (e.g. articles whose cleaned
## description came out empty)
BBC_DF = BBC_DF.dropna()
print(BBC_DF["Headline"])
### Tokenize and Vectorize the Headlines
## Build the list of headlines; .tolist() replaces the manual
## append loop with the idiomatic pandas call.
HeadlineLIST = BBC_DF["Headline"].tolist()
print("The headline list is")
print(HeadlineLIST)
### Vectorize
## Read about CountVectorizer here.....
#https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
MyCountV = CountVectorizer(
    input="content",        # we pass raw strings, not filenames
    lowercase=True,
    stop_words="english",   # drop common English stopwords
    max_features=20         # keep only the 20 most frequent terms
)
## Fit/transform -> sparse document-term matrix (rows = headlines,
## columns = the 20 retained vocabulary terms)
MyDTM = MyCountV.fit_transform(HeadlineLIST)
print(type(MyDTM))
ColumnNames = MyCountV.get_feature_names_out()
## Densify into a labeled dataframe for inspection
MyDTM_DF = pd.DataFrame(MyDTM.toarray(), columns=ColumnNames)
print(MyDTM_DF)
## Write the document-term matrix to a csv file
MyDTM_DF.to_csv('CleanNews.csv')