Pandas datetime

  import pandas as pd import numpy as np from datetime import timedelta
  data = { "Date": [ "1965-05-08", "1975-08-02", "1981-10-25", "1994-11-27", "2002-10-22", "2015-06-12", "2019-08-30", "2021-11-23" ], "City": [ "Jacksonville", "Jacksonville", "Orlando", "Gainesville", "Sunrise", "Orlando", "Miami Gardens", "Hollywood" ], "Venue": [ "Jacksonville Coliseum", "Gator Bowl Stadium", "Orlando Stadium (Tangerine Bowl)", "Florida Field (Ben Hill Griffin Stadium)", "Office Depot Center", "Orlando Citrus Bowl (Camping World Stadium)", "Hard Rock Stadium", "Hard Rock Live at Seminole Hard Rock Hotel & Casino" ], "Estimated Capacity": [ 10000, # Approximate capacity for Jacksonville Coliseum 72000, # Approximate capacity for Gator Bowl Stadium 60000, 88000, 20000, 47225, 65326, 7000 ] }
  df = pd.DataFrame(data)
  df.head(10)
  df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
  df2 = df.copy()
  df['year'] = df['Date'].dt.year
  df['month'] = df['Date'].dt.month
  df['dayofweek_int'] = df['Date'].dt.dayofweek
  df['dayofweek_name'] = df['Date'].dt.day_name()
  df
  df['Date'].min()
  df['Date'].max()
  df.loc[df['Date'] > '2020-01-01']
  df.loc[(df['Date'] > '2020-01-01') & (df['Date'].dt.month == 11)]
  df['month_before_show'] = df['Date'] - pd.DateOffset(months=1)
  df.head(3)
  df['days_since_event'] = (pd.Timestamp('2025-04-17') - df['Date']).dt.days
  df.sort_values('Date')
  df.sort_values('Date', ascending=False)
  new_row = pd.DataFrame([{ 'Date': pd.NaT, 'City': 'Jacksonville', 'Venue': 'Metlife', 'Estimated Capacity': 70000 }]) df2 = pd.concat([df2, new_row], ignore_index=True)
  df2['Date'].isna()
  df2['Date'] = df2['Date'].fillna(pd.Timestamp('2019-07-19'))
  new_row_2 = pd.DataFrame([{ 'Date': pd.NaT, 'City': 'Plant City', 'Venue': 'Strawberry Festival Stadium', 'Estimated Capacity': 3000 }]) df2 = pd.concat([df2, new_row_2], ignore_index=True)
  df2.tail()
  df2.dropna(subset=['Date'], inplace=True)
  df2.tail()
  df['previous_concert'] = df['Date'].shift(1)
  df['next_concert'] = df['Date'].shift(-1)
  df2['Rolling_3_Event_Mean'] = df2['Estimated Capacity'].rolling(window=3).mean()
  df2['Expanding_Mean'] = df2['Estimated Capacity'].expanding().mean()
  df2.set_index('Date', inplace=True)
  df2
  df2.sort_index(inplace=True)
  df2['Estimated Capacity'].resample('10YS').mean() # monthly average
  df2['Estimated Capacity'].resample('10YS').agg({'mean', 'max', 'min','sum'})
  df2.loc['2019']
  df2.loc['2019-07']
  df2.loc['2019-07-19']
  df2.loc['2010-01-01':'2025-01-01']
  df2.loc[(df2.index >= '2010-01-01') & (df2.index <= '2025-01-01')]
  df2['year'] = df2.index.year
  df2['month'] = df2.index.month
  df2['day of week'] = df2.index.dayofweek
  df2['day name'] = df2.index.day_name()
  data2 = { "Train Number": ["T123", "T456", "T789", "T101", "T202", "T303"], "Departure Time": ["08:00", "09:30", "11:00", "13:00", "15:30", "18:00"], "Destination": ["Zermatt", "Paris", "Zermatt", "Paris", "Zermatt", "Paris"], "Platform": [3, 5, 2, 6, 1, 4], "Status": ["On Time", "Delayed", "On Time", "On Time", "Delayed", "On Time"] }
  df_train_schedule = pd.DataFrame(data2)
  df_train_schedule
  today = pd.Timestamp.now().date()
  df_train_schedule["Departure Time"] = pd.to_datetime(df_train_schedule["Departure Time"].apply(lambda t: f"{today} {t}"))
  df_train_schedule["Departure Time"] = df_train_schedule["Departure Time"].dt.tz_localize("Europe/Zurich")
  df_train_schedule
  df_train_schedule['Departure Time EST'] = df_train_schedule['Departure Time'].dt.tz_convert('US/Eastern')
  df_train_schedule
  df_train_schedule["Next Departure"] = df_train_schedule.sort_values("Departure Time").groupby("Destination")["Departure Time"].shift(-1)
  df_train_schedule
  df_train_schedule['next_train_time_diff'] = df_train_schedule['Next Departure'] - df_train_schedule['Departure Time']
  df_train_schedule
  df_train_schedule["Train Arrival Time"] = df_train_schedule["Departure Time"] + timedelta(hours=3)
  df_train_schedule
  filtered_train_times = df_train_schedule[ (df_train_schedule["Train Arrival Time"].dt.time >= pd.to_datetime("12:00").time()) & (df_train_schedule["Train Arrival Time"].dt.time <= pd.to_datetime("17:00").time()) ]
  filtered_train_times
  pd.date_range(start='2024-01-01', end='2024-01-10')
  pd.date_range(start='2024-01-01', periods=6, freq='ME')
  data = { 'date': [ '2025-04-01', '2025-04-02', '2025-04-02', # duplicate on 2nd '2025-04-04', '2025-04-06' # missing 3rd and 5th ], 'train': ['TGV 101', 'TGV 102', 'TGV 103', 'TGV 104', 'TGV 105'] }
  df3 = pd.DataFrame(data)
  df3['date'] = pd.to_datetime(df3['date'])
  duplicates = df3[df3['date'].duplicated()]
  duplicates
  data = { 'date': [ '2025-04-01', '2025-04-02', '2025-04-04', '2025-04-06', # missing 3rd, 5th '2025-04-07', '2025-04-09', # missing 8th '2025-04-10', '2025-04-11', '2025-04-12' ], 'train': [ 'TGV 101', 'TGV 103', 'TGV 104', 'TGV 105', 'TGV 106', 'TGV 107', 'TGV 108', 'TGV 109', 'TGV 110' ] }
   df4 = pd.DataFrame(data)
  df4['date'] = pd.to_datetime(df4['date'])
  df4 = df4.set_index(['date'])
  df_full = df4.asfreq('D', fill_value='No departures')
  df_full
  idx = pd.date_range('2023-01-01', periods=4, freq='ME') ts = pd.Series([10, 20, 30, 40], index=idx)
  ts_daily = ts.asfreq('D')
  ts_daily.head()
  data = { 'date': [ '2025-04-01', '2025-04-02', '2025-04-03', '2025-04-04', '2025-04-06' # missing 5th ], 'train': ['TGV 101', 'TGV 102', np.nan, 'TGV 104', 'TGV 105'] }
  df7 = pd.DataFrame(data)
  df7
  df_filled_forward = df7.ffill()
  df_filled_forward
  df_filled_backward = df7.bfill()
  df_filled_backward
  full_range = pd.date_range(start=df3['date'].min(), end=df3['date'].max())
  missing = full_range.difference(df3['date'])
  missing
  df = pd.DataFrame({'date_str': ['01/04/2025', '02/04/2025', '03/04/2025']})
  df['date'] = pd.to_datetime(df['date_str'], format='%d/%m/%Y')
  df.head()
  df = pd.DataFrame({'date_str': ['04-01-2025', '04-02-2025']})
  df['date'] = pd.to_datetime(df['date_str'], format='%m-%d-%Y')
  df.head()
  df = pd.DataFrame({'timestamp': ['01/04/2025 14:30:00', '02/04/2025 08:45:00']})
  df['datetime'] = pd.to_datetime(df['timestamp'], format='%d/%m/%Y %H:%M:%S')
  df.head()
  df = pd.DataFrame({'date_str': ['01-Apr-2025', '02-Apr-2025']})
  df['date'] = pd.to_datetime(df['date_str'], format='%d-%b-%Y')
  df.head()
  df = pd.DataFrame({'date_str': ['01 April 2025', '02 April 2025']})
  df['date'] = pd.to_datetime(df['date_str'], format='%d %B %Y')
  df.head()

Ryan is a Data Scientist at a fintech company, where he focuses on fraud prevention in underwriting and risk. Before that, he worked as a Data Analyst at a tax software company. He holds a degree in Electrical Engineering from UCF.

Leave a Reply

Your email address will not be published. Required fields are marked *