Python Regular Expressions

# Regular expressions ("regex") with Python's built-in re module.
import re

# re.search() scans the entire string and returns the first match, or None.
Greatest_Olympians = (
    "Mark Spitz, Paavo Nurmi, Usain Bolt, "
    "Carl Lewis, Michael Phelps, Jesse Owens"
)
search_nurmi = re.search("Nurmi", Greatest_Olympians)
if search_nurmi:
    print("Match found:", search_nurmi.group())
else:
    print("No match")

search_weissmuller = re.search("Weissmuller", Greatest_Olympians)
if search_weissmuller:
    print("Match found:", search_weissmuller.group())
else:
    print("No match")

# re.match() only succeeds when the pattern matches at the very start of the
# string.
# NOTE(review): the original definition of `olympics` is missing from this
# file; the value below is a stand-in so the example runs -- confirm.
olympics = "Amsterdam hosted the 1928 Summer Olympics"
match_amsterdam = re.match("Amsterdam", olympics)
if match_amsterdam:
    print("Match found:", match_amsterdam.group())
else:
    print("No match")

# re.findall() returns every non-overlapping match as a list of strings.
all_games = "1920 Antwerp Olympics, 1924 Paris Olympics, 1928 Amsterdam Olympics"
find_all_olympics = re.findall("Olympics", all_games)
print("Number of matches:", len(find_all_olympics))
print("Matches found:", find_all_olympics)

# re.finditer() finds all non-overlapping matches of a pattern in a string
# and returns an iterator yielding a match object for each one.
find_all_olympics_iter = re.finditer("Olympics", all_games)
# An iterator has no len(); loop over it to inspect each match instead.
for olympics_match in find_all_olympics_iter:
    match_text = olympics_match.group()
    start_pos = olympics_match.start()
    end_pos = olympics_match.end()
    span = olympics_match.span()
    print(f"Match: {match_text}, Start: {start_pos}, End: {end_pos}, Span: {span}")

# re.sub() replaces matches in the string with a specified replacement.
text = "Mike Stanton"
pattern = "Mike"
replacement = "Giancarlo"
result = re.sub(pattern, replacement, text)
print("Result:", result)

# count=1 replaces only the first match and leaves the rest untouched.
text = "Mike Stanton, Mike Schmidt, Mike Trout"
result = re.sub(pattern, replacement, text, count=1)
print("Result:", result)
# re.split() breaks a string into a list wherever the pattern matches.
player_names = "Babe Ruth, Lou Gehrig, Hank Aaron, Mickey Mantle, Willie Mays, Ted Williams"
pattern = ", "
split_names = re.split(pattern, player_names)
print("Split names:", split_names)
# The result is an ordinary list, so individual names can be indexed.
print(split_names[0])
print(split_names[2])

# Raw strings
# An r"..." literal does not process backslashes in any way, so the text is
# kept exactly as it is specified. Without the r prefix, \n and \t are
# turned into a newline and a tab.
non_raw_text = "First line\nSecond line"
print(non_raw_text)
raw_text = r"First line\nSecond line"
print(raw_text)
text_with_tab = "Column1\tColumn2\tColumn3"
print(text_with_tab)
raw_text_with_tab = r"Column1\tColumn2\tColumn3"
print(raw_text_with_tab)
# Matching characters

# . (period) matches any single character except a newline.
pitcher = "Satchel Paige"
print(re.findall(r".", pitcher))
# "P", then any one character, then "i" (case-sensitive).
print(re.findall(r"P.i", pitcher))
# "S", then any three characters, then "h" (case-sensitive).
print(re.findall(r"S.{3}h", pitcher))

# ^: Matches the start of the string.
print(re.findall(r"^.a", pitcher))
# $: Matches the end of the string.
print(re.findall(r"ge$", pitcher))

# *: Matches 0 or more repetitions of the preceding character.
text = "ab a abbb abbbb a a abb"
print(re.findall(r"ab*", text))
# +: Matches 1 or more repetitions of the preceding character.
print(re.findall(r"ab+", text))
# ?: Matches 0 or 1 repetition of the preceding character.
print(re.findall(r"ab?", text))

satchel_mlb_years = (
    "Satchel Paige played for the Indians in 1948 and 1949. "
    "Browns in 1951, 1952, and 1953."
)
# \d: Matches any digit (equivalent to [0-9]).
print(re.findall(r"\d", satchel_mlb_years))
print(re.findall(r"\d{4}", satchel_mlb_years))
# \D: Matches any non-digit.
print(re.findall(r"\D", satchel_mlb_years))

# \w: Matches any word character (alphanumeric plus underscore).
# Unlike the dot (.), which matches any single character except a newline,
# \w does not include whitespace, punctuation, or other special characters.
print(re.findall(r"\w", satchel_mlb_years))
# \W: Matches any non-word character.
print(re.findall(r"\W", satchel_mlb_years))
# \s: Matches any whitespace character.
print(re.findall(r"\s", satchel_mlb_years))
# \S: Matches any non-whitespace character.
print(re.findall(r"\S", satchel_mlb_years))

# \b: Matches at the beginning or at the end of a word.
# \B: Matches anywhere that is NOT the beginning or end of a word.
print(satchel_mlb_years)
# Find all words starting with "p" (\w* takes the rest of the word).
print(re.findall(r"\bp\w*", satchel_mlb_years))
# Find all words ending with "d".
print(re.findall(r"\w*d\b", satchel_mlb_years))
# Find runs of word characters containing "r" that neither start nor end at
# a word boundary; this returns the inside of a word, not the whole word.
print(re.findall(r"\B\w*r\w*\B", satchel_mlb_years))

# [A-Z]: any uppercase letter in the given range.
print(re.findall(r"[A-Z]", satchel_mlb_years))
print(re.findall(r"[A-C]", satchel_mlb_years))
# [a-z]: any lowercase letter in the given range, or from an explicit set.
print(re.findall(r"[a-g]", satchel_mlb_years))
print(re.findall(r"[ae]", satchel_mlb_years))
# [0-9]: any digit in the given range, or from an explicit set.
print(re.findall(r"[0-9]", satchel_mlb_years))
print(re.findall(r"[0-3]", satchel_mlb_years))
print(re.findall(r"[15]", satchel_mlb_years))
print(re.findall(r"[0-9]{4}", satchel_mlb_years))

# \. is an actual period and \? is an actual question mark,
# e.g. re.findall(r"\.", string)
examples = "What is a question? This is text."
print(re.findall(r"\.", examples))
print(re.findall(r"\?", examples))
# | means "either/or", e.g. re.findall(r"a|b", string)
print(re.findall(r"\.|\?", examples))
print(re.findall(r"[0-2]|[a-c]", satchel_mlb_years))
# re.compile and flags
# re.compile(pattern, flags=0) compiles a regular expression pattern into a
# regular expression object, which can then be used for matching, searching,
# and other operations.
#
#   pattern: The regular expression pattern you want to compile.
#   flags:   Optional argument to modify the behavior of the pattern matching.
#
# Compilation flags:
#   ASCII, DOTALL, IGNORECASE, LOCALE, MULTILINE, VERBOSE
pattern = re.compile(r"\bstrikes\b")
match = pattern.search("Bob Feller strikes out Ted Williams.")
print(match.group())

# Flags example: IGNORECASE matches regardless of letter case.
pattern = re.compile(r"\bfeller\b", re.IGNORECASE)
match = pattern.search("Bob Feller strikes out Ted Williams.")
print(match.group())

# Flags example: MULTILINE makes ^ match at the start of every line.
# The sample text must span several lines for the flag to have any effect.
pattern = re.compile(r"^Bob", re.MULTILINE)
text = """Bob Gibson
Sandy Koufax
Tom Seaver
Bob Feller"""
matches = pattern.findall(text)
print(matches)  # Output: ['Bob', 'Bob']

# Flags example: DOTALL lets . match newline characters as well.
pattern = re.compile(r"Koufax.*Tom", re.DOTALL)
text = """Bob Gibson\nSandy Koufax\nTom Seaver\nBob Feller"""
match = pattern.search(text)
print(match.group())  # Output: "Koufax" and "Tom" on two separate lines

# Multiple flags example: IGNORECASE and MULTILINE combined with |.
pattern = re.compile(r"^bob", re.IGNORECASE | re.MULTILINE)
text = """Bob Gibson
Sandy Koufax
Tom Seaver
bob Feller"""
matches = pattern.findall(text)
print(matches)  # Output: ['Bob', 'bob']
# More complicated examples

# Matching email addresses, Ex 1
email_string = "Contact my email ryannolandata@gmail.com for freelance projects"
pattern = r"\b\w+@\w+\.\w+\b"  # also works without the boundaries
#   \w  any word character (alphanumeric plus underscore)
#   @   the literal at sign in the address
#   \.  a literal period
#   +   1 or more repetitions of the preceding character
print(re.findall(pattern, email_string))

# Matching email addresses, Ex 2: explicit character classes instead of \w.
pattern2 = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"
print(re.findall(pattern2, email_string))

# Matching phone numbers, Ex 3
phone_string = "Contact my phone number 123-456-7890 for freelance projects"
pattern3 = r"[0-9]{3}-[0-9]{3}-[0-9]{4}"
print(re.findall(pattern3, phone_string))

# Matching phone numbers, Ex 4: same idea using \d and word boundaries.
pattern4 = r"\b\d{3}-\d{3}-\d{4}\b"
print(re.findall(pattern4, phone_string))

# Matching a date, Ex 5: written with [0-9] and then with \d.
date = "Today is 05/23/2024"
pattern5 = r"[0-9]{2}/[0-9]{2}/[0-9]{4}"
print(re.findall(pattern5, date))
pattern6 = r"\d{2}/\d{2}/\d{4}"
print(re.findall(pattern6, date))

Ryan is a Data Scientist at a fintech company, where he focuses on fraud prevention in underwriting and risk. Before that, he worked as a Data Analyst at a tax software company. He holds a degree in Electrical Engineering from UCF.

Leave a Reply

Your email address will not be published. Required fields are marked *