Python Regular Expressions

# "regex" (or "regexp") is short for "regular expression"
				
					import re
				
			
# re.search() scans the entire string and returns the first match, or None
# when the pattern does not occur anywhere.
Greatest_Olympians = "Mark Spitz, Paavo Nurmi, Usain Bolt, Carl Lewis, Michael Phelps, Jesse Owens"

search_nurmi = re.search("Nurmi", Greatest_Olympians)
if search_nurmi:
    print("Match found:", search_nurmi.group())
else:
    print("No match")

# "Weissmuller" is not in the string, so re.search() returns None here.
search_weissmuller = re.search("Weissmuller", Greatest_Olympians)
if search_weissmuller:
    print("Match found:", search_weissmuller.group())
else:
    print("No match")

# re.match() only succeeds when the pattern matches at the very start of the
# string.
# NOTE(review): `olympics` was read here without ever being assigned, which
# raised NameError. The sample string below is an assumed stand-in so the
# example runs -- replace it with the intended text if it differs.
olympics = "Amsterdam hosted the 1928 Summer Olympics"
match_amsterdam = re.match("Amsterdam", olympics)
if match_amsterdam:
    print("Match found:", match_amsterdam.group())
else:
    print("No match")

# re.findall() returns a list holding every non-overlapping match.
all_games = "1920 Antwerp Olympics, 1924 Paris Olympics, 1928 Amsterdam Olympics"
find_all_olympics = re.findall("Olympics", all_games)
print("Number of matches:", len(find_all_olympics))
print("Matches found:", find_all_olympics)

# re.finditer() finds the same non-overlapping matches but returns an
# iterator that yields one match object per match.
find_all_olympics_iter = re.finditer("Olympics", all_games)

# An iterator has no len(); walk it instead. The loop variable has its own
# name so it does not overwrite the `olympics` string used above.
for olympics_match in find_all_olympics_iter:
    match_text = olympics_match.group()
    start_pos = olympics_match.start()
    end_pos = olympics_match.end()
    span = olympics_match.span()
    print(f"Match: {match_text}, Start: {start_pos}, End: {end_pos}, Span: {span}")
				
			
				
					#re.sub() replaces matches in the string with a specified replacement.
# Example 1: every occurrence of `pattern` in `text` is swapped for
# `replacement`.
text = "Mike Stanton"
pattern = "Mike"
replacement = "Giancarlo"

result = re.sub(pattern, replacement, text)
print("Result:", result)

# Example 2: count=1 limits re.sub() to the first occurrence only, so the
# later "Mike"s are left untouched.
text = "Mike Stanton, Mike Schmidt, Mike Trout"

result = re.sub(pattern, replacement, text, count=1)
print("Result:", result)
				
			
# re.split() cuts a string wherever the pattern matches and returns the
# pieces as a list.
player_names = "Babe Ruth, Lou Gehrig, Hank Aaron, Mickey Mantle, Willie Mays, Ted Williams"
pattern = ', '

split_names = re.split(pattern, player_names)
print("Split names:", split_names)

# The result is an ordinary list, so individual names can be indexed.
# (Printed explicitly: a bare expression shows nothing outside a notebook.)
print(split_names[0])
print(split_names[2])
				
			
# Raw strings
# An r"..." literal does not process backslashes in any way, so Python keeps
# (and prints) the text exactly as it is specified in the source.

# Normal literal: "\n" is an escape sequence and becomes a real line break.
non_raw_text = 'First line\nSecond line'
print(non_raw_text)

# Raw literal: "\n" stays as the two characters backslash + "n".
raw_text = r"First line\nSecond line"
print(raw_text)

# Normal literal: "\t" becomes a tab character.
text_with_tab = 'Column1\tColumn2\tColumn3'
print(text_with_tab)

# Raw literal: "\t" stays as backslash + "t".
raw_text_with_tab = r"Column1\tColumn2\tColumn3"
print(raw_text_with_tab)
				
			
# Matching characters
# . (period) matches any single character except a newline.
pitcher = "Satchel Paige"
print(re.findall(r".", pitcher))

# "P", then any one character, then "i". Matching is case sensitive.
print(re.findall(r"P.i", pitcher))

# "S", then exactly three characters, then "h". Matching is case sensitive.
print(re.findall(r"S.{3}h", pitcher))

# ^ anchors the match to the start of the string.
print(re.findall(r"^.a", pitcher))

# $ anchors the match to the end of the string.
print(re.findall(r"ge$", pitcher))
				
			
# Quantifiers apply to the character that precedes them.
text = "ab a abbb abbbb a a abb"

# *: 0 or more repetitions -- "a" followed by any number of "b"s (even none).
print(re.findall(r"ab*", text))

# +: 1 or more repetitions -- "a" must be followed by at least one "b".
print(re.findall(r"ab+", text))

# ?: 0 or 1 repetition -- "a" followed by at most one "b".
print(re.findall(r"ab?", text))

# Sample sentence used by the character-class examples later in the file.
satchel_mlb_years = "Satchel Paige played for the Indians in 1948 and 1949. Browns in 1951, 1952, and 1953."
				
			
# Shorthand character classes, all applied to `satchel_mlb_years`.
# Results are printed because a bare expression shows nothing in a script.

# \d: matches any digit (equivalent to [0-9]).
print(re.findall(r"\d", satchel_mlb_years))

# \d{4}: exactly four digits in a row.
print(re.findall(r"\d{4}", satchel_mlb_years))

# \D: matches any non-digit.
print(re.findall(r"\D", satchel_mlb_years))

# \w: matches any word character (alphanumeric plus underscore).
# Unlike the dot, which matches every character except a newline, \w does
# not include whitespace, punctuation, or other special characters.
print(re.findall(r"\w", satchel_mlb_years))

# \W: matches any non-word character.
print(re.findall(r"\W", satchel_mlb_years))

# \s: matches any whitespace character.
print(re.findall(r"\s", satchel_mlb_years))

# \S: matches any non-whitespace character.
print(re.findall(r"\S", satchel_mlb_years))
				
			
# \b matches at a word boundary: the beginning or the end of a word.
# \B matches at any position that is NOT a word boundary.
print(satchel_mlb_years)

# Words starting with "p": a boundary, "p", then any run of word characters
# (\w = alphanumeric plus underscore).
print(re.findall(r"\bp\w*", satchel_mlb_years))

# Words ending with "d": any run of word characters, "d", then a boundary.
print(re.findall(r"\w*d\b", satchel_mlb_years))

# Interior pieces of words that contain "r". The \B on each side forbids the
# match from starting or ending on a word boundary, so a match never includes
# a word's first or last letter.
print(re.findall(r"\B\w*r\w*\B", satchel_mlb_years))
				
			
# Character sets: [...] matches any single character listed inside the
# brackets; a hyphen between two characters denotes a range.

# [A-Z]: any uppercase letter.
print(re.findall(r"[A-Z]", satchel_mlb_years))

# [A-C]: only the uppercase letters A through C.
print(re.findall(r"[A-C]", satchel_mlb_years))

# [a-z] style ranges work for lowercase too; here a through g.
print(re.findall(r"[a-g]", satchel_mlb_years))

# [ae]: no range, just the two listed letters "a" or "e".
print(re.findall(r"[ae]", satchel_mlb_years))

# [0-9]: any digit.
print(re.findall(r"[0-9]", satchel_mlb_years))

# [0-3]: only the digits 0 through 3.
print(re.findall(r"[0-3]", satchel_mlb_years))

# [15]: the digit 1 or the digit 5.
print(re.findall(r"[15]", satchel_mlb_years))

# [0-9]{4}: exactly four digits in a row.
print(re.findall(r"[0-9]{4}", satchel_mlb_years))
				
			
# A backslash turns a metacharacter into a literal: \. matches an actual
# period and \? an actual question mark, e.g. re.findall(r"\.", string).
examples = "What is a question? This is text."
print(re.findall(r"\.", examples))
print(re.findall(r"\?", examples))

# | means "either/or": match the pattern on its left or the one on its right,
# e.g. re.findall(r"a|b", string).
print(re.findall(r"\.|\?", examples))

# Either a digit 0-2 or a lowercase letter a-c.
print(re.findall(r"[0-2]|[a-c]", satchel_mlb_years))
				
			
# re.compile and flags
# re.compile(pattern, flags=0) compiles a regular expression pattern into a
# regular expression object, which can then be used for matching, searching,
# and other operations.
#
# pattern: the regular expression pattern you want to compile.
# flags:   optional argument that modifies the behavior of pattern matching.
#
# Compilation flags:
#   ASCII
#   DOTALL
#   IGNORECASE
#   LOCALE
#   MULTILINE
#   VERBOSE
pattern = re.compile(r'\bstrikes\b')

# The compiled object exposes search() directly; no pattern argument needed.
match = pattern.search("Bob Feller strikes out Ted Williams.")
print(match.group())
				
			
# Flag example: re.IGNORECASE
# The pattern is written in lowercase, yet it still finds the capitalised
# "Feller" because case is ignored.
pattern = re.compile(r'\bfeller\b', re.IGNORECASE)

match = pattern.search("Bob Feller strikes out Ted Williams.")
print(match.group())
				
			
# Flag example: re.MULTILINE
# With MULTILINE, ^ matches at the start of every line rather than only at
# the start of the whole string.
pattern = re.compile(r'^Bob', re.MULTILINE)

text = """Bob Gibson
Sandy Koufax
Tom Seaver
Bob Feller"""

# Both the first and the last line begin with "Bob".
matches = pattern.findall(text)
print(matches)
				
			
# Flag example: re.DOTALL
# By default the dot does not match a newline. DOTALL lifts that restriction,
# letting ".*" span the line break between "Koufax" and "Tom".
pattern = re.compile(r'Koufax.*Tom', re.DOTALL)

text = """Bob Gibson\nSandy Koufax\nTom Seaver\nBob Feller"""

match = pattern.search(text)
print(match.group())  # Output: "Koufax", a line break, then "Tom"
				
			
# Multiple flags: combine them with | (bitwise OR).
# IGNORECASE lets "bob" match either capitalisation; MULTILINE lets ^ match
# at the start of each line.
pattern = re.compile(r'^bob', re.IGNORECASE | re.MULTILINE)

text = """Bob Gibson
Sandy Koufax
Tom Seaver
bob Feller"""

matches = pattern.findall(text)
print(matches)  # Output: ['Bob', 'bob']
				
			
# More complicated examples

# Matching email addresses, example 1
email_string = "Contact my email ryannolandata@gmail.com for freelance projects"

# Reading the pattern piece by piece:
#   \b   word boundary (the pattern also works without the boundaries)
#   \w+  one or more word characters (letters, digits, underscore);
#        "+" means 1 or more repetitions of the preceding character
#   @    a literal "@" sign
#   \.   a literal period
pattern = r"\b\w+@\w+\.\w+\b"
print(re.findall(pattern, email_string))

# Matching email addresses, example 2
# Explicit character sets instead of \w, so the local part may also contain
# ".", "+" and "-", and the domain may contain "-".
pattern2 = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"
print(re.findall(pattern2, email_string))
				
			
# Matching phone numbers, example 3
# Three digits, a hyphen, three digits, a hyphen, four digits -- written
# with explicit [0-9] character sets.
phone_string = "Contact my phone number 123-456-7890 for freelance projects"
pattern3 = r"[0-9]{3}-[0-9]{3}-[0-9]{4}"
print(re.findall(pattern3, phone_string))

# Matching phone numbers, example 4
# Same shape using the \d shorthand, with \b on both ends so the number
# cannot be part of a longer run of word characters.
pattern4 = r"\b\d{3}-\d{3}-\d{4}\b"
print(re.findall(pattern4, phone_string))
				
			
# Matching a date, example 5
# Two digits, "/", two digits, "/", four digits.
date = "Today is 05/23/2024"

# Version with explicit [0-9] character sets.
pattern5 = r"[0-9]{2}/[0-9]{2}/[0-9]{4}"
print(re.findall(pattern5, date))

# Equivalent version with the \d shorthand.
pattern6 = r"\d{2}/\d{2}/\d{4}"
print(re.findall(pattern6, date))
				
			

Ryan is a Data Scientist at a fintech company, where he focuses on fraud prevention in underwriting and risk. Before that, he worked as a Data Analyst at a tax software company. He holds a degree in Electrical Engineering from UCF.

Leave a Reply

Your email address will not be published. Required fields are marked *