Pandas datetime
import pandas as pd import numpy as np from datetime import timedelta
data = { "Date": [ "1965-05-08", "1975-08-02", "1981-10-25", "1994-11-27", "2002-10-22", "2015-06-12", "2019-08-30", "2021-11-23" ], "City": [ "Jacksonville", "Jacksonville", "Orlando", "Gainesville", "Sunrise", "Orlando", "Miami Gardens", "Hollywood" ], "Venue": [ "Jacksonville Coliseum", "Gator Bowl Stadium", "Orlando Stadium (Tangerine Bowl)", "Florida Field (Ben Hill Griffin Stadium)", "Office Depot Center", "Orlando Citrus Bowl (Camping World Stadium)", "Hard Rock Stadium", "Hard Rock Live at Seminole Hard Rock Hotel & Casino" ], "Estimated Capacity": [ 10000, # Approximate capacity for Jacksonville Coliseum 72000, # Approximate capacity for Gator Bowl Stadium 60000, 88000, 20000, 47225, 65326, 7000 ] }
df = pd.DataFrame(data)
df.head(10)

df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df2 = df.copy()
df['year'] = df['Date'].dt.year
df['month'] = df['Date'].dt.month
df['dayofweek_int'] = df['Date'].dt.dayofweek
df['dayofweek_name'] = df['Date'].dt.day_name()
df

df['Date'].min()

df['Date'].max()

df.loc[df['Date'] > '2020-01-01']

df.loc[(df['Date'] > '2020-01-01') & (df['Date'].dt.month == 11)]

df['month_before_show'] = df['Date'] - pd.DateOffset(months=1)
df.head(3)

df['days_since_event'] = (pd.Timestamp('2025-04-17') - df['Date']).dt.days
df.sort_values('Date')

df.sort_values('Date', ascending=False)

new_row = pd.DataFrame([{ 'Date': pd.NaT, 'City': 'Jacksonville', 'Venue': 'Metlife', 'Estimated Capacity': 70000 }]) df2 = pd.concat([df2, new_row], ignore_index=True)
df2['Date'].isna()

df2['Date'] = df2['Date'].fillna(pd.Timestamp('2019-07-19'))
new_row_2 = pd.DataFrame([{ 'Date': pd.NaT, 'City': 'Plant City', 'Venue': 'Strawberry Festival Stadium', 'Estimated Capacity': 3000 }]) df2 = pd.concat([df2, new_row_2], ignore_index=True)
df2.tail()

df2.dropna(subset=['Date'], inplace=True)
df2.tail()

df['previous_concert'] = df['Date'].shift(1)
df['next_concert'] = df['Date'].shift(-1)
df2['Rolling_3_Event_Mean'] = df2['Estimated Capacity'].rolling(window=3).mean()
df2['Expanding_Mean'] = df2['Estimated Capacity'].expanding().mean()
df2.set_index('Date', inplace=True)
df2

df2.sort_index(inplace=True)
df2['Estimated Capacity'].resample('10YS').mean() # monthly average

df2['Estimated Capacity'].resample('10YS').agg({'mean', 'max', 'min','sum'})

df2.loc['2019']

df2.loc['2019-07']

df2.loc['2019-07-19']

df2.loc['2010-01-01':'2025-01-01']

df2.loc[(df2.index >= '2010-01-01') & (df2.index <= '2025-01-01')]

df2['year'] = df2.index.year
df2['month'] = df2.index.month
df2['day of week'] = df2.index.dayofweek
df2['day name'] = df2.index.day_name()
data2 = { "Train Number": ["T123", "T456", "T789", "T101", "T202", "T303"], "Departure Time": ["08:00", "09:30", "11:00", "13:00", "15:30", "18:00"], "Destination": ["Zermatt", "Paris", "Zermatt", "Paris", "Zermatt", "Paris"], "Platform": [3, 5, 2, 6, 1, 4], "Status": ["On Time", "Delayed", "On Time", "On Time", "Delayed", "On Time"] }
df_train_schedule = pd.DataFrame(data2)
df_train_schedule

today = pd.Timestamp.now().date()
df_train_schedule["Departure Time"] = pd.to_datetime(df_train_schedule["Departure Time"].apply(lambda t: f"{today} {t}"))
df_train_schedule["Departure Time"] = df_train_schedule["Departure Time"].dt.tz_localize("Europe/Zurich")
df_train_schedule

df_train_schedule['Departure Time EST'] = df_train_schedule['Departure Time'].dt.tz_convert('US/Eastern')
df_train_schedule

df_train_schedule["Next Departure"] = df_train_schedule.sort_values("Departure Time").groupby("Destination")["Departure Time"].shift(-1)
df_train_schedule

df_train_schedule['next_train_time_diff'] = df_train_schedule['Next Departure'] - df_train_schedule['Departure Time']
df_train_schedule

df_train_schedule["Train Arrival Time"] = df_train_schedule["Departure Time"] + timedelta(hours=3)
df_train_schedule

filtered_train_times = df_train_schedule[ (df_train_schedule["Train Arrival Time"].dt.time >= pd.to_datetime("12:00").time()) & (df_train_schedule["Train Arrival Time"].dt.time <= pd.to_datetime("17:00").time()) ]
filtered_train_times

pd.date_range(start='2024-01-01', end='2024-01-10')

pd.date_range(start='2024-01-01', periods=6, freq='ME')

data = { 'date': [ '2025-04-01', '2025-04-02', '2025-04-02', # duplicate on 2nd '2025-04-04', '2025-04-06' # missing 3rd and 5th ], 'train': ['TGV 101', 'TGV 102', 'TGV 103', 'TGV 104', 'TGV 105'] }
df3 = pd.DataFrame(data)
df3['date'] = pd.to_datetime(df3['date'])
duplicates = df3[df3['date'].duplicated()]
duplicates

data = { 'date': [ '2025-04-01', '2025-04-02', '2025-04-04', '2025-04-06', # missing 3rd, 5th '2025-04-07', '2025-04-09', # missing 8th '2025-04-10', '2025-04-11', '2025-04-12' ], 'train': [ 'TGV 101', 'TGV 103', 'TGV 104', 'TGV 105', 'TGV 106', 'TGV 107', 'TGV 108', 'TGV 109', 'TGV 110' ] }
df4 = pd.DataFrame(data)
df4['date'] = pd.to_datetime(df4['date'])
df4 = df4.set_index(['date'])
df_full = df4.asfreq('D', fill_value='No departures')
df_full

idx = pd.date_range('2023-01-01', periods=4, freq='ME') ts = pd.Series([10, 20, 30, 40], index=idx)
ts_daily = ts.asfreq('D')
ts_daily.head()

data = { 'date': [ '2025-04-01', '2025-04-02', '2025-04-03', '2025-04-04', '2025-04-06' # missing 5th ], 'train': ['TGV 101', 'TGV 102', np.nan, 'TGV 104', 'TGV 105'] }
df7 = pd.DataFrame(data)
df7

df_filled_forward = df7.ffill()
df_filled_forward

df_filled_backward = df7.bfill()
df_filled_backward

full_range = pd.date_range(start=df3['date'].min(), end=df3['date'].max())
missing = full_range.difference(df3['date'])
missing

df = pd.DataFrame({'date_str': ['01/04/2025', '02/04/2025', '03/04/2025']})
df['date'] = pd.to_datetime(df['date_str'], format='%d/%m/%Y')
df.head()

df = pd.DataFrame({'date_str': ['04-01-2025', '04-02-2025']})
df['date'] = pd.to_datetime(df['date_str'], format='%m-%d-%Y')
df.head()

df = pd.DataFrame({'timestamp': ['01/04/2025 14:30:00', '02/04/2025 08:45:00']})
df['datetime'] = pd.to_datetime(df['timestamp'], format='%d/%m/%Y %H:%M:%S')
df.head()

df = pd.DataFrame({'date_str': ['01-Apr-2025', '02-Apr-2025']})
df['date'] = pd.to_datetime(df['date_str'], format='%d-%b-%Y')
df.head()

df = pd.DataFrame({'date_str': ['01 April 2025', '02 April 2025']})
df['date'] = pd.to_datetime(df['date_str'], format='%d %B %Y')
df.head()

Ryan is a Data Scientist at a fintech company, where he focuses on fraud prevention in underwriting and risk. Before that, he worked as a Data Analyst at a tax software company. He holds a degree in Electrical Engineering from UCF.