# -*- coding: utf-8 -*-
"""
Created on Tue Nov 26 17:34:48 2024
@author: profa
"""
######################
## Web Scraping and
## Preparing Text Data
## for analysis
##
## Gates
##
## This tutorial uses bs4
###############################
import pandas as pd
import numpy as np
import requests
## conda install -c conda-forge beautifulsoup4
from bs4 import BeautifulSoup
#Beautiful Soup is used to parse and prettify raw web data
from collections import Counter
## pip install wordcloud
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
# Show full DataFrames when printing (no row/column truncation).
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
## Get the data
url = "https://coffeegeek.com/"
# timeout keeps the request from hanging forever on an unresponsive server;
# raise_for_status turns HTTP errors (404, 500, ...) into exceptions instead
# of silently parsing an error page as if it were the real content.
res = requests.get(url, timeout=30)
res.raise_for_status()
htmlData = res.content
print(htmlData)
## Parse the Data
parsedData = BeautifulSoup(htmlData, "html.parser")
# there are other parsers as well, like lxml or html5lib
print(parsedData.prettify())
# Collect every text node in the document.
# findAll()/text= are the deprecated bs4 spellings; the modern API is
# find_all() with the string= keyword (same result, no DeprecationWarning).
texts = parsedData.find_all(string=True)
print(texts)
# Getting the title of the HTML page.
print(parsedData.title)
print(parsedData.title.string)
# Pull the visible text out of the <body> and tokenize it on whitespace.
body = parsedData.find('body')
text = body.get_text()
print(type(text))
MyList = text.split()
print(MyList)
## Filter the token list: keep purely alphabetic tokens...
MyList = [tok for tok in MyList if tok.isalpha()]
print(MyList)
## ...that are longer than three characters...
MyList = [tok for tok in MyList if len(tok) > 3]
print(MyList)
## ...normalized to lower case...
MyList = [tok.lower() for tok in MyList]
print(MyList)
## ...and not in our custom stop-word list.
StopWords = ["website", "websites", "href", "this", "from"]
MyList = [tok for tok in MyList if tok not in StopWords]
print(MyList)
# Count token frequencies with a Counter.
word_counts = Counter(MyList)
print(word_counts)
print(type(word_counts))
# Convert to a DataFrame: one row per distinct word with its count.
# (Building straight from the (word, count) pairs gives the same two-column
# frame, with a fresh 0..n-1 index, as from_dict + reset_index would.)
MyDF = pd.DataFrame(list(word_counts.items()), columns=['word', 'count'])
print(MyDF)
print(type(MyDF))
## Create a word cloud
word_freq = dict(zip(MyDF['word'], MyDF['count']))
print(word_freq)
# Generate the word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white', stopwords=STOPWORDS).generate_from_frequencies(word_freq)
# Display the generated image
plt.figure(figsize=(10, 8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
##-------------------------------
## Use files......................
# Write the text of every <p> element to a file.
# - "with" guarantees the file is closed even if an exception is raised
#   mid-loop (the original open()/close() pair leaked the handle on error);
# - an explicit encoding avoids platform-dependent default encodings;
# - get_text() is called once per paragraph instead of twice.
with open("BS4_Content.txt", "w", encoding="utf-8") as MYFILE:
    for p in parsedData.select('p'):
        Content = p.get_text(strip=True, separator='\n')
        print(Content)
        print('-' * 80)
        MYFILE.write(Content)
## Finding Things
# Collect every <link> element in the document, then print each one.
SiteLinks = parsedData.find_all("link")
for link_tag in SiteLinks:
    print(link_tag)
## Get just the links
# Print the href attribute of every anchor (.get returns None when absent).
for anchor in parsedData.find_all('a'):
    print(anchor.get('href'))
# get anchors: print each <a> element in full
anchors = parsedData.find_all("a")
for anchor_tag in anchors:
    print(anchor_tag)
# get the class of every div
# div["class"] raises KeyError on any <div> that has no class attribute,
# which aborts the whole loop; .get() returns None for those instead.
divs = parsedData.find_all("div")
for div in divs:
    print(div.get("class"))
# get contents
tag = parsedData.find('p')
MyContents=tag.contents
print(type(MyContents))
print(len(MyContents))
## Write things to a file
with open("BS4_Content2.txt", "w") as f:
f.write(str(tag.contents))