Scraping a Web Page using Python
An exercise in scraping and reading web pages using Python.
Introduction
How to scrape and read web pages using BeautifulSoup, Selenium, and Python.
import pprint
import urllib.request

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
# ----------| 1) SCRAPE A WEB PAGE |----------
# Fetch the raw body of the home page. The explicit timeout (seconds) keeps the
# script from blocking forever if the server never answers; the `with` block
# closes the HTTP response as soon as the body has been read.
with urllib.request.urlopen('https://podshop.com', timeout=30) as response:
    podshop = response.read()

# response.read() returns bytes, so this pretty-prints the undecoded payload.
pprint.pprint(podshop)
# ----------| 2) READING A WEB PAGE WITH BEAUTIFUL SOUP |----------
# Download the "about" page and parse it with Python's built-in HTML parser.
# The timeout (seconds) prevents an indefinite hang on an unresponsive server,
# and raise_for_status() stops the script from parsing an HTTP error page.
page = requests.get('https://podshop.com/about/', timeout=30)
page.raise_for_status()
site = BeautifulSoup(page.content, 'html.parser')

# The <title> tag, then every anchor and every paragraph in the document.
print(site.title)
print(site.find_all('a'))
print(site.find_all('p'))

# Direct children of <head>. site.head is None when the document has no
# <head>, so fall back to an empty list instead of raising AttributeError.
headerChildren = list(site.head.children) if site.head is not None else []
print(headerChildren)

# Walk every node nested under the element with id="nav". find() returns None
# when nothing matches, so guard before touching .descendants.
navigationBar = site.find(id="nav")
if navigationBar is None:
    print('No element with id="nav" found')
else:
    # NOTE(review): the sibling walk reads the loop variable `d`, so it is
    # assumed to belong inside the descendant loop -- confirm intended nesting.
    for d in navigationBar.descendants:
        print(d)
        for s in d.previous_siblings:
            print(s)

# Build one title/link/about record for each <div id="main"> on the page.
ta_divs = site.find_all("div", id="main")
print(len(ta_divs))
allData = []
for ta in ta_divs:
    # ta.h2 / ta.a are None when the div has no such descendant; store None
    # for that field rather than crashing on a partially filled block.
    heading = ta.h2
    anchor = ta.a
    dataDict = {
        'title': heading.get_text() if heading is not None else None,
        'link': anchor.get('href') if anchor is not None else None,
        'about': [p.get_text() for p in ta.find_all('p')],
    }
    allData.append(dataDict)
print(allData)
# ----------| 3) WEB SCRAPING WITH SELENIUM |----------
# Location of the local chromedriver binary; adjust for the machine in use.
chromeDriver = '/Users/chris/Documents/Education/MSDS/PyCharm/dsc540/week8/chromedriver'

# Run Chrome headless so no browser window is opened.
options = webdriver.ChromeOptions()
options.add_argument('headless')

# NOTE: `executable_path` is deprecated in Selenium 4 (newer releases expect a
# `Service` object instead); it is kept so the existing setup keeps working.
browser = webdriver.Chrome(executable_path=chromeDriver, options=options)
try:
    browser.get('https://www.britishcycling.org.uk/membership/article/20120925-Power-Calculator-0')

    # find_element(By.ID, ...) replaces the find_element_by_id helpers, which
    # Selenium 4 deprecated and later removed.
    # Type 198 into the input with id "fthr", then press its submit button.
    searchForm = browser.find_element(By.ID, 'fthr')
    searchForm.send_keys('198')
    submitButton = browser.find_element(By.ID, 'fthr_button')
    submitButton.click()

    # Print the visible text of every element with id "heart_rate_table".
    results = browser.find_elements(By.ID, 'heart_rate_table')
    for result in results:
        print(result.text)
finally:
    # Always shut the browser down, even if a lookup or click above raised;
    # otherwise the headless Chrome and chromedriver processes are left running.
    browser.quit()