BeautifulSoup4 extract table

  import re

  import pandas as pd
  import requests
  from bs4 import BeautifulSoup

Basic Example HTML Code -> Runners

  # Minimal example page used by the first walkthrough: a single table of
  # personal running bests with one header row and six data rows.
  # Only the example document belongs in this literal; unrelated site
  # <script> tags were removed because they are not part of the example and
  # contained backslash sequences that are invalid in a non-raw Python string.
  html = """
  <!DOCTYPE html>
  <html lang="en">
  <head>
    <meta charset="UTF-8">
    <title>Personal Running Bests</title>
  </head>
  <body>
    <h1>Personal Running Bests</h1>
    <table>
      <thead>
        <tr><th>Distance</th><th>Time</th></tr>
      </thead>
      <tbody>
        <tr><td>5k</td><td>18:30</td></tr>
        <tr><td>10k</td><td>37:50</td></tr>
        <tr><td>Half Marathon</td><td>1:25:11</td></tr>
        <tr><td>Marathon</td><td>3:17:00</td></tr>
        <tr><td>50 Miler</td><td>9:14:30</td></tr>
        <tr><td>100 Miler</td><td>32:11:11</td></tr>
      </tbody>
    </table>
  </body>
  </html>
  """

Extract headers

  # Parse the example document and grab its table; without this step `table`
  # is undefined when the header extraction below runs.
  soup = BeautifulSoup(html, "html.parser")
  table = soup.find("table")

  # Header labels are the stripped text of every <th> cell in the table.
  headers = [th.get_text(strip=True) for th in table.find_all("th")]
  headers

Step 5: Extract table rows

  # Collect the text of each data row as a list of cell strings.
  rows = []
  for tr in table.find_all("tr")[1:]:  # Skip header row
      cells = [td.get_text(strip=True) for td in tr.find_all("td")]
      # Rows without any <td> cells produce an empty list; leave them out.
      if cells:
          rows.append(cells)

  # Step 6: Create a pandas DataFrame
  df = pd.DataFrame(rows, columns=headers)
  df

Example 2 Metallica Ticket Sales

  url = "https://en.wikipedia.org/wiki/WorldWired_Tour"
  # A timeout keeps the script from waiting indefinitely on a stalled connection.
  # NOTE(review): some sites reject the default python-requests User-Agent;
  # if this call fails with 403, send a descriptive User-Agent header — confirm.
  response = requests.get(url, timeout=30)
  # Stop on 4xx/5xx responses rather than parsing an error page as the article.
  response.raise_for_status()
  html = response.text
  soup = BeautifulSoup(html, "html.parser")

Grab only the first table. We don't want that in this instance.

  # Accessing a tag name as an attribute of the soup yields the first tag
  # with that name, i.e. only the first <table> in the document.
  table = soup.table
  table

Grab all tables. We don't want that in this instance either; we need to target a specific table.

  # or find all tables
  # find_all returns every <table> element in the parsed document.
  tables = soup.find_all("table")
  tables

Target a specific table #TO DO -> 2017 Concert

  # Locate the wanted table through a header cell whose text contains
  # "Date (2017)", then climb to the <table> that encloses it.
  table = None
  for th in soup.find_all("th"):
      if "Date (2017)" in th.get_text():
          table = th.find_parent("table")
          break
  # Fail with a clear message instead of an AttributeError on None below.
  if table is None:
      raise ValueError('No table with a "Date (2017)" header cell was found')
  table

  # Column labels come from the <th> cells of the table's first row.
  headers = [th.get_text(strip=True) for th in table.find('tr').find_all('th')]

  # Start from an empty list so rows collected from the earlier example table
  # are not mixed into this DataFrame.
  rows = []
  for tr in table.find_all('tr')[1:]:  # Skip header row
      cells = tr.find_all(['th', 'td'])
      # Swap non-breaking spaces for plain spaces in each cell's text.
      row = [cell.get_text(strip=True).replace('\xa0', ' ') for cell in cells]
      rows.append(row)

  df = pd.DataFrame(rows, columns=headers[:len(rows[0])])  # Avoid header mismatch
  df

Cleaning up the data: there are tons of ways we can start cleaning up data. Some sites have super easy, nice tables; this one isn't the best — it should have multiple support columns, etc.

rename date column

  # Give the date column a short label; the rename is applied in place.
  date_column_rename = {'Date (2017)': 'date'}
  df.rename(columns=date_column_rename, inplace=True)

remove [] in date column

  # Drop every bracketed "[...]" fragment from the date text, then trim the
  # whitespace left at either end.
  date_without_brackets = df['date'].str.replace(r'\[.*?\]', '', regex=True)
  df['date'] = date_without_brackets.str.strip()
  df

forward fill to fix city, venue, and country issue

  # Carry the last seen value down into rows where City/Country/Venue are
  # missing (presumably rows covered by a row-spanning cell above — verify
  # against the source table).
  location_columns = ['City', 'Country', 'Venue']
  df[location_columns] = df[location_columns].ffill()
  df

fix column names

  # Normalise the column labels in one pass: trim surrounding whitespace,
  # turn spaces into underscores, then lower-case everything.
  df.columns = df.columns.str.strip().str.replace(' ', '_').str.lower()
  df

Fix attendance issue

  # Two counts separated by a slash, e.g. "12,345 / 12,345". Each count is
  # one to three digits with at most one ",ddd" thousands group, and the
  # spaces around the slash are optional.
  _ATTENDANCE_PATTERN = re.compile(r'^\d{1,3}(?:,\d{3})? ?/ ?\d{1,3}(?:,\d{3})?$')


  def is_attendance(val):
      """Return True if *val* is a string shaped like ``"12,345 / 12,345"``.

      Args:
          val: A single cell value. Anything that is not a ``str`` (None,
              NaN, numbers, ...) is treated as "not an attendance figure".

      Returns:
          bool: True when the whole string matches the count/count pattern,
          False otherwise.
      """
      # Missing values and non-text cells can never match; checking the type
      # first also avoids building a pandas Series for a single scalar.
      if not isinstance(val, str):
          return False
      return _ATTENDANCE_PATTERN.search(val) is not None
  # NOTE(review): is_attendance is defined above but is not applied to df
  # anywhere in this file, so the frame displayed here is unchanged —
  # confirm whether a step is missing.
  df

Fix stadium issue

  # Rows whose 'country' text contains a venue-like word (case-insensitive,
  # missing values never match) get that text copied into 'venue', after
  # which 'country' is blanked for those rows.
  venue_keywords = r'\b(?:Arena|Center|Field|Stadium|Garden|Park|Speedway)\b'
  country_text = df['country']
  mask = country_text.str.contains(venue_keywords, case=False, na=False)
  misplaced_venues = df.loc[mask, 'country']
  df.loc[mask, 'venue'] = misplaced_venues
  df.loc[mask, 'country'] = None  # Clear them from 'country'
  df

Export to CSV

  # Write the cleaned table to disk without the DataFrame index column.
  output_path = "metallica.csv"
  df.to_csv(output_path, index=False)

Ryan is a Data Scientist at a fintech company, where he focuses on fraud prevention in underwriting and risk. Before that, he worked as a Data Analyst at a tax software company. He holds a degree in Electrical Engineering from UCF.

Leave a Reply

Your email address will not be published. Required fields are marked *