# Python Regular Expressions
# Also known as "regex" or "regexp".
import re
# re.search() scans the entire string and returns the first match, or None
# when the pattern does not occur anywhere.
Greatest_Olympians = "Mark Spitz, Paavo Nurmi, Usain Bolt, Carl Lewis, Michael Phelps, Jesse Owens"

# "Nurmi" occurs in the string, so a match object is returned.
search_nurmi = re.search("Nurmi", Greatest_Olympians)
if search_nurmi:
    print("Match found:", search_nurmi.group())
else:
    print("No match")

# "Weissmuller" is absent, so re.search() returns None.
search_weissmuller = re.search("Weissmuller", Greatest_Olympians)
if search_weissmuller:
    print("Match found:", search_weissmuller.group())
else:
    print("No match")

# re.match() only succeeds when the pattern matches at the very beginning of
# the string.
# NOTE(review): the sample string `olympics` was never defined in this script
# (NameError at the re.match call); the text below is a stand-in -- confirm
# the intended sample sentence.
olympics = "Amsterdam hosted the 1928 Summer Olympics"
match_amsterdam = re.match("Amsterdam", olympics)
if match_amsterdam:
    print("Match found:", match_amsterdam.group())
else:
    print("No match")

# re.findall() returns every non-overlapping match as a list of strings.
all_games = "1920 Antwerp Olympics, 1924 Paris Olympics, 1928 Amsterdam Olympics"
find_all_olympics = re.findall("Olympics", all_games)
# A list supports len(); as a bare expression the value is only displayed
# in an interactive session or notebook.
len(find_all_olympics)

print("Matches found:", find_all_olympics)

# re.finditer() finds all non-overlapping matches of a pattern in a string
# and returns an iterator yielding a match object for each one.
find_all_olympics_iter = re.finditer("Olympics", all_games)
# An iterator has no len(); walk it with a loop instead.
for olympics in find_all_olympics_iter:
    match_text = olympics.group()
    start_pos = olympics.start()
    end_pos = olympics.end()
    span = olympics.span()
    print(f"Match: {match_text}, Start: {start_pos}, End: {end_pos}, Span: {span}")

# re.sub() replaces matches in the string with a specified replacement.
text = "Mike Stanton"
pattern = "Mike"
replacement = "Giancarlo"
result = re.sub(pattern, replacement, text)
print("Result:", result)

# count=1 limits the substitution to the first match only.
text = "Mike Stanton, Mike Schmidt, Mike Trout"
result = re.sub(pattern, replacement, text, count=1)
print("Result:", result)

# re.split(): cut a string into a list wherever the pattern matches.
pattern = ", "
player_names = "Babe Ruth, Lou Gehrig, Hank Aaron, Mickey Mantle, Willie Mays, Ted Williams"
split_names = re.split(pattern, player_names)
print("Split names:", split_names)

# The result is an ordinary list, so single names are reached by index.
split_names[0]
split_names[2]

# Raw strings explained
# A raw string (r"...") does not process backslashes in any way, so Python
# prints it exactly as it is specified. A normal string turns \n into a
# newline and \t into a tab.
non_raw_text = "First line\nSecond line"
raw_text = r"First line\nSecond line"
text_with_tab = "Column1\tColumn2\tColumn3"
raw_text_with_tab = r"Column1\tColumn2\tColumn3"

# Newline: interpreted escape first, then the raw (literal backslash-n) form.
print(non_raw_text)
print(raw_text)

# Tab: interpreted escape first, then the raw (literal backslash-t) form.
print(text_with_tab)
print(raw_text_with_tab)

# Matching characters
# . (period) matches any single character except a newline.
pitcher = "Satchel Paige"
re.findall(r".", pitcher)

# "P", then any one character, then "i" (case-sensitive).
re.findall(r"P.i", pitcher)

# "S", then exactly three characters, then "h" (case-sensitive).
re.findall(r"S.{3}h", pitcher)

# ^: Matches the start of the string.
re.findall(r"^.a", pitcher)

# $: Matches the end of the string.
re.findall(r"ge$", pitcher)

# Quantifiers act on the character immediately before them.
text = "ab a abbb abbbb a a abb"

# * -> zero or more repetitions: "a" followed by any number of "b".
re.findall(r"ab*", text)
# + -> one or more repetitions: "a" followed by at least one "b".
re.findall(r"ab+", text)
# ? -> zero or one repetition: "a" optionally followed by a single "b".
re.findall(r"ab?", text)

satchel_mlb_years = "Satchel Paige played for the Indians in 1948 and 1949. Browns in 1951, 1952, and 1953."

# \d: Matches any digit (equivalent to [0-9]).
re.findall(r"\d", satchel_mlb_years)

# \d{4}: exactly four digits in a row.
re.findall(r"\d{4}", satchel_mlb_years)

# \D: Matches any non-digit.
re.findall(r"\D", satchel_mlb_years)

# \w: Matches any word character (alphanumeric plus underscore).
# Unlike the dot (.), which matches any single character except a newline,
# \w does not include whitespace, punctuation, or other special characters.
re.findall(r"\w", satchel_mlb_years)

# \W: Matches any non-word character.
re.findall(r"\W", satchel_mlb_years)

# \s: Matches any whitespace character.
re.findall(r"\s", satchel_mlb_years)

# \S: Matches any non-whitespace character.
re.findall(r"\S", satchel_mlb_years)

# \b: matches at the beginning or at the end of a word.
# \B: matches anywhere that is NOT the beginning or end of a word.
satchel_mlb_years

# Find all words starting with 'p'.
# \w: Matches any word character (alphanumeric plus underscore).
re.findall(r"\bp\w*", satchel_mlb_years)

# Find all words ending with 'd'.
re.findall(r"\w*d\b", satchel_mlb_years)

# Find runs of word characters containing 'r' that neither start nor end
# at a word boundary.
re.findall(r"\B\w*r\w*\B", satchel_mlb_years)

# Character sets: square brackets match any ONE character listed inside.

# Uppercase letters: the full range, then only A through C.
re.findall(r"[A-Z]", satchel_mlb_years)
re.findall(r"[A-C]", satchel_mlb_years)

# Lowercase letters: the range a through g, then just the letters a or e.
re.findall(r"[a-g]", satchel_mlb_years)
re.findall(r"[ae]", satchel_mlb_years)

# Digits: the full range, the range 0 through 3, then just 1 or 5.
re.findall(r"[0-9]", satchel_mlb_years)
re.findall(r"[0-3]", satchel_mlb_years)
re.findall(r"[15]", satchel_mlb_years)

# A set combined with a repetition count: four digits in a row.
re.findall(r"[0-9]{4}", satchel_mlb_years)

# Escaping: a backslash turns a special character into a literal one.
#   \.  an actual period          \?  an actual question mark
# General form: re.findall(r"\.", string)
examples = "What is a question? This is text."
re.findall(r"\.", examples)
re.findall(r"\?", examples)

# | means "either / or": the match may come from either alternative.
# General form: re.findall(r"left|right", string)
re.findall(r"\.|\?", examples)
re.findall(r"[0-2]|[a-c]", satchel_mlb_years)

# re.compile() and flags
# re.compile(pattern, flags=0) compiles a regular expression pattern into a
# regular expression object, which can then be used for matching, searching,
# and other operations.
#   pattern: the regular expression pattern to compile.
#   flags:   optional argument that modifies the matching behavior.
# Compilation flags:
#   ASCII, DOTALL, IGNORECASE, LOCALE, MULTILINE, VERBOSE
pattern = re.compile(r'\bstrikes\b')
match = pattern.search("Bob Feller strikes out Ted Williams.")
print(match.group())

# Flag example: IGNORECASE lets lowercase "feller" match "Feller".
pattern = re.compile(r'\bfeller\b', flags=re.IGNORECASE)
match = pattern.search("Bob Feller strikes out Ted Williams.")
print(match.group())

# Flag example: MULTILINE makes ^ match at the start of every line, not only
# at the start of the whole string.
pattern = re.compile(r'^Bob', re.MULTILINE)
# One pitcher per line; without the line breaks the flag has nothing to act
# on and only the first "Bob" would be found.
text = """Bob Gibson
Sandy Koufax
Tom Seaver
Bob Feller"""
matches = pattern.findall(text)
print(matches)  # Output: ['Bob', 'Bob']

# Flag example: DOTALL lets "." match newline characters as well, so ".*"
# can run across a line break.
text = """Bob Gibson\nSandy Koufax\nTom Seaver\nBob Feller"""
pattern = re.compile(r'Koufax.*Tom', flags=re.DOTALL)
match = pattern.search(text)
print(match.group())  # Output: "Koufax" and "Tom" on two consecutive lines

# Multiple flags example: IGNORECASE and MULTILINE combined with |.
pattern = re.compile(r'^bob', re.IGNORECASE | re.MULTILINE)
# One pitcher per line so that ^ (with MULTILINE) can match on the first and
# the last line; IGNORECASE accepts both "Bob" and "bob".
text = """Bob Gibson
Sandy Koufax
Tom Seaver
bob Feller"""
matches = pattern.findall(text)
print(matches)  # Output: ['Bob', 'bob']

# More complicated examples
# Matching email addresses, example 1
email_string = "Contact my email ryannolandata@gmail.com for freelance projects"
pattern = r"\b\w+@\w+\.\w+\b" #also works without the boundaries
# \b - word boundary
# \w - any word character (alphanumeric plus underscore)
# +  - one or more repetitions of the preceding character
# @  - the literal at sign of the address
# \. - a literal period
re.findall(pattern, email_string)

# Matching email addresses, example 2: explicit character sets in place of \w.
pattern2 = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"
re.findall(pattern2, email_string)

# Matching phone numbers (examples 3 and 4)
phone_string = "Contact my phone number 123-456-7890 for freelance projects"

# Example 3 spells the digits out as [0-9] sets; example 4 uses the \d
# shorthand and anchors the number with \b word boundaries.
pattern3 = r"[0-9]{3}-[0-9]{3}-[0-9]{4}"
pattern4 = r"\b\d{3}-\d{3}-\d{4}\b"

re.findall(pattern3, phone_string)
re.findall(pattern4, phone_string)

# Matching a date (example 5): two digits / two digits / four digits.
date = "Today is 05/23/2024"

# The same shape written twice: with [0-9] sets, then with the \d shorthand.
pattern5 = r"[0-9]{2}/[0-9]{2}/[0-9]{4}"
pattern6 = r"\d{2}/\d{2}/\d{4}"

re.findall(pattern5, date)
re.findall(pattern6, date)

# Ryan is a Data Scientist at a fintech company, where he focuses on fraud
# prevention in underwriting and risk. Before that, he worked as a Data
# Analyst at a tax software company. He holds a degree in Electrical
# Engineering from UCF.