beautifulsoup4 Selectors
import requests from bs4 import BeautifulSoup import pandas as pd import re
html = """ Ultra Running Events Ultra Running Events
50 Mile Races
-
Rocky Mountain 50
Date: August 10, 2025
Location: Boulder, Colorado
-
Desert Dash 50
Date: September 14, 2025
Location: Moab, Utah
100 Mile Races
-
Mountain Madness 100
Date: July 5, 2025
Location: Lake Tahoe, California
-
Endurance Beast 100
Date: October 3, 2025
Location: Asheville, North Carolina
Important Notes
All races start at 6:00 AM sharp.
Mandatory pre-race check-in the evening before.
"""
soup_html = BeautifulSoup(html, 'html.parser')
soup_html.select_one('h2')

soup_html.select_one('h2').get_text()

soup_html.select_one('strong').get_text()

soup_html.select('p') #grabs all p tags

soup_html.select('p')[0].get_text()

soup_html.select('p')[1].get_text()

all_p = soup_html.select('p') #grabs all p tags
for p in all_p: print(p.get_text())

soup_html.select('a')

a_element = soup_html.select('a')
for link in a_element: print(link['href'])

a_element[2]['href']

soup_html.select('#races-100')


#Example 11 Direct Descendents #Let’s say you want to extract the tag directly under each (but only if it's a direct child) #It will not match nested links. #Omit > when you want any level of nesting soup_html.select('h3.race-name > a')

#Example 12 After an Elements Siblings #Paragraphs after h3 tag #Have the same parent as an , and #Appear after that in the HTML, regardless of how many elements are in between #It's useful when you want to grab siblings after a specific element, but not necessarily immediately after soup_html.select("h3 ~p")

#Example 13 Element in one of two classes (e.g., .race-50 OR .race-100) sections = soup_html.select('section.race-50, section.race-100')
for section in sections: title = section.find('h2').text.strip() print(f"Section Title: {title}")

#Example 14 Element in both classes (e.g., .race-section AND .race-50) (Order doesnt matter) race50_section = soup_html.select('section.race-section.race-50')
race50_section_v2 = soup_html.select('section.race-50.race-section')
for section in race50_section: print(f"Found section: {section['id']}")

for section in race50_section_v2: print(f"Found section: {section['id']}")

URL = "http://books.toscrape.com/"
response = requests.get(URL)
soup = BeautifulSoup(response.text, 'html.parser')
#Example 15 Select category links from sidebar category_links = soup.select("ul.nav-list ul a")
for a in category_links: print(a.text.strip(), URL + a['href'])

books = soup.select("article.product_pod h3 a")
books

books = soup.select("article.product_pod h3 a")
for book in books: print(book.get_text())

books = soup.select("article.product_pod")
for book in books: title = book.select_one("h3 a")["title"] price = book.select_one(".price_color").text rating = book.select_one("p.star-rating")["class"][-1] # e.g. 'Three' book_data.append({"title": title, "price": price, "rating": rating})
df = pd.DataFrame(book_data)
df.head(10)

df['price_clean'] = df['price'].str.replace('£', '', regex=False).astype(float)
#Convert GBP to USD (example rate: 1 GBP = 1.0737 USD) (CHECK THIS) exchange_rate = 1.0737
df['price_usd'] = df['price_clean'] * exchange_rate
df['Price_usd'] = df['price_usd'].apply(lambda x: f"${x:.2f}")
df_five_star = df.loc[df['rating'] == 'Five', ['title', 'Price_usd']]
df_five_star

df_five_star.to_csv('scrapped_book_data.csv')
df_five_star.to_excel('scrapped_book_data.xlsx')
Ryan is a Data Scientist at a fintech company, where he focuses on fraud prevention in underwriting and risk. Before that, he worked as a Data Analyst at a tax software company. He holds a degree in Electrical Engineering from UCF.