# -*- coding: utf-8 -*-
"""
Created on Tue Nov 26 17:34:48 2024
@author: profa
"""
######################
## Web Scraping and
## Preparing Text Data
## for analysis
##
## Gates
##
## This tutorial uses bs4
###############################
import pandas as pd
import numpy as np
import requests
## conda install -c conda-forge beautifulsoup4
from bs4 import BeautifulSoup
#Beautiful Soup is used to parse and prettify raw web data
from collections import Counter
## pip install wordcloud
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
# Show full DataFrames when printing (no row/column truncation).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
## Get the data
url = "https://coffeegeek.com/"
# timeout keeps the request from hanging forever on an unresponsive server;
# raise_for_status turns HTTP errors (404, 500, ...) into exceptions instead
# of silently parsing an error page as if it were the real content.
res = requests.get(url, timeout=30)
res.raise_for_status()
htmlData = res.content
print(htmlData)
## Parse the Data
parsedData = BeautifulSoup(htmlData, "html.parser")
# there are other parsers as well, like lxml or html5lib
print(parsedData.prettify())
# Collect every text node in the document.
# findAll()/text= are the deprecated bs4 spellings; the modern API is
# find_all() with the string= keyword (same result, no DeprecationWarning).
texts = parsedData.find_all(string=True)
print(texts)
# Getting the title of the HTML page.
print(parsedData.title)
print(parsedData.title.string)
# Pull the visible text out of the <body> and tokenize it on whitespace.
body = parsedData.find('body')
text = body.get_text()
print(type(text))
MyList = text.split()
print(MyList)
## Filter the token list: keep purely alphabetic tokens...
MyList = [tok for tok in MyList if tok.isalpha()]
print(MyList)
## ...that are longer than three characters...
MyList = [tok for tok in MyList if len(tok) > 3]
print(MyList)
## ...normalized to lower case...
MyList = [tok.lower() for tok in MyList]
print(MyList)
## ...and not in our custom stop-word list.
StopWords = ["website", "websites", "href", "this", "from"]
MyList = [tok for tok in MyList if tok not in StopWords]
print(MyList)
# Count token frequencies with a Counter.
word_counts = Counter(MyList)
print(word_counts)
print(type(word_counts))
# Convert to a DataFrame: one row per distinct word with its count.
# (Building straight from the (word, count) pairs gives the same two-column
# frame, with a fresh 0..n-1 index, as from_dict + reset_index would.)
MyDF = pd.DataFrame(list(word_counts.items()), columns=['word', 'count'])
print(MyDF)
print(type(MyDF))
## Create a word cloud
word_freq = dict(zip(MyDF['word'], MyDF['count']))
print(word_freq)
# Generate the word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white', stopwords=STOPWORDS).generate_from_frequencies(word_freq)
# Display the generated image
plt.figure(figsize=(10, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
##-------------------------------
## Use files......................
# Write the text of every <p> element to a file.
# - "with" guarantees the file is closed even if an exception is raised
#   mid-loop (the original open()/close() pair leaked the handle on error);
# - an explicit encoding avoids platform-dependent default encodings;
# - get_text() is called once per paragraph instead of twice.
with open("BS4_Content.txt", "w", encoding="utf-8") as MYFILE:
    for p in parsedData.select('p'):
        Content = p.get_text(strip=True, separator='\n')
        print(Content)
        print('-' * 80)
        MYFILE.write(Content)
## Finding Things
# Collect every <link> element in the document, then print each one.
SiteLinks = parsedData.find_all("link")
for link_tag in SiteLinks:
    print(link_tag)
## Get just the links
# Print the href attribute of every anchor (.get returns None when absent).
for anchor in parsedData.find_all('a'):
    print(anchor.get('href'))
# get anchors: print each <a> element in full
anchors = parsedData.find_all("a")
for anchor_tag in anchors:
    print(anchor_tag)
# get the class of every div
# div["class"] raises KeyError on any <div> that has no class attribute,
# which aborts the whole loop; .get() returns None for those instead.
divs = parsedData.find_all("div")
for div in divs:
    print(div.get("class"))
# get contents
tag = parsedData.find('p')
MyContents=tag.contents
print(type(MyContents))
print(len(MyContents))
## Write things to a file
with open("BS4_Content2.txt", "w") as f:
f.write(str(tag.contents))