######################
## Web Scraping and
## Preparing Text Data
## for analysis
##
## Gates
##
## This tutorial uses bs4
###############################
import pandas as pd
import numpy as np
import requests
## conda install -c conda-forge beautifulsoup4   (provides the bs4 module imported below)
from bs4 import BeautifulSoup
#Beautiful Soup is used to parse and prettify raw web data
# Lift pandas' display limits so printed DataFrames show every row and column.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)
# Download the page and parse it into a BeautifulSoup tree.
url = "https://coffeegeek.com/"
# requests has no default timeout; without one an unresponsive server would
# block this script forever.
res = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of quietly parsing an error page.
res.raise_for_status()
htmlData = res.content  # raw response body as bytes
print(htmlData)
# "html.parser" is the parser named here; there are other parsers as well,
# like lxml or html5lib.
parsedData = BeautifulSoup(htmlData, "html.parser")
print(parsedData.prettify())
# Getting the title of the HTML page.
# parsedData.title is None when the document has no <title> tag, so only read
# .string once a title element has actually been found.
pageTitle = parsedData.title
print(pageTitle)
if pageTitle is not None:
    print(pageTitle.string)
## Finding Things
# Collect every <link> element in the parsed page and print each one.
SiteLinks = parsedData.find_all("link")
for links in SiteLinks:
    # The loop body must be indented; at column 0 it is an IndentationError.
    print(links)
## Get just the links
# href=True keeps only <a> tags that carry an href attribute, so anchors
# without a target no longer print as None.
for link in parsedData.find_all("a", href=True):
    print(link.get("href"))
# get anchors
# Unlike the href-only loop, this prints each complete <a> element.
anchors = parsedData.find_all("a")
for a in anchors:
    # The loop body must be indented; at column 0 it is an IndentationError.
    print(a)
# get the class of every div
divs = parsedData.find_all("div")
for div in divs:
    # div["class"] raises KeyError for a <div> with no class attribute;
    # .get() with a default prints an empty list for those instead.
    print(div.get("class", []))
# get contents
# find() returns the first matching tag, or None when the page has no <p>
# element, so check before reading .contents.
tag = parsedData.find("p")
if tag is not None:
    print(tag.contents)
else:
    print("No <p> tag found")