Simpson's Paradox in Python
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Simpson's paradox, baseball example: hits and at-bats for two players,
# split by the handedness of the pitcher they faced.
data = {
    "Player": [
        "Paul Goldschmidt",
        "Paul Goldschmidt",
        "Carlos Santana",
        "Carlos Santana",
    ],
    "Pitcher Arm": ["Lefty", "Righty", "Lefty", "Righty"],
    "Hits": [35, 105, 100, 50],
    "At Bats": [100, 400, 300, 200],
}

# Create a DataFrame
df = pd.DataFrame(data)
df.head(5)  # notebook-style preview; has no visible effect in a plain script

# Calculate batting averages for each (player, pitcher arm) row
df["Batting Avg"] = df["Hits"] / df["At Bats"]

# Aggregate total stats for each player. The groupby sum also adds up the
# per-row "Batting Avg" values, which is not a meaningful number, so the
# column is recomputed from the summed hits and at-bats.
df_totals = df.groupby("Player").sum(numeric_only=True).reset_index()
df_totals["Batting Avg"] = df_totals["Hits"] / df_totals["At Bats"]
# Plot Batting Average by Pitcher Arm (left) next to the overall average
# per player (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Plot individual batting averages as grouped bars. Each player gets its own
# x offset within an arm category; drawing every player at the same x
# position would make later bars cover earlier ones.
arms = list(df["Pitcher Arm"].unique())
players = list(df["Player"].unique())
arm_positions = np.arange(len(arms))
bar_width = 0.8 / len(players)

for idx, player in enumerate(players):
    # Align this player's rows to the shared arm order before plotting.
    subset = df[df["Player"] == player].set_index("Pitcher Arm").reindex(arms)
    offset = (idx - (len(players) - 1) / 2) * bar_width
    axes[0].bar(
        arm_positions + offset,
        subset["Batting Avg"],
        width=bar_width,
        label=player,
    )

axes[0].set_xticks(arm_positions)
axes[0].set_xticklabels(arms)
axes[0].set_title("Batting Average by Pitcher Arm")
axes[0].set_ylabel("Batting Average")
axes[0].legend(title="Player")

# Plot aggregated batting averages
axes[1].bar(df_totals["Player"], df_totals["Batting Avg"], color=["orange", "blue"])
axes[1].set_title("Overall Batting Average by Player")
axes[1].set_ylabel("Batting Average")

plt.tight_layout()
plt.show()

Second example: running
# Fix the seed so the synthetic dataset is reproducible.
np.random.seed(42)

# Parameters for dataset
n_young = 500  # Number of young runners
n_old = 500  # Number of old runners

# Generate data for younger runners
young_miles = np.random.uniform(20, 65, n_young)  # Miles ran per week
# Fastest mile time in seconds: drops by 3 per weekly mile, plus Gaussian noise
young_fastest_mile = 500 - 3 * young_miles + np.random.normal(0, 15, n_young)

# Generate data for older runners
old_miles = np.random.uniform(20, 70, n_old)  # Miles ran per week
# Fastest mile time in seconds: higher baseline, drops by 1 per weekly mile
old_fastest_mile = 800 - 1 * old_miles + np.random.normal(0, 15, n_old)

# Shift old runners to have higher mileage and worse times to create the paradox
old_miles += 5
old_fastest_mile += 20

# Combine data into a single DataFrame (young runners first, then old)
data = pd.DataFrame(
    {
        "Miles Ran Per Week": np.concatenate([young_miles, old_miles]),
        "Fastest Mile Time (seconds)": np.concatenate(
            [young_fastest_mile, old_fastest_mile]
        ),
        "Age Group": ["Young"] * n_young + ["Old"] * n_old,
    }
)
First plot: just the fastest mile time versus miles run per week.
# Plotting: scatter of every runner, coloured by age group
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x="Miles Ran Per Week",
    y="Fastest Mile Time (seconds)",
    hue="Age Group",
    data=data,
    palette={"Young": "red", "Old": "blue"},
    alpha=0.6,
)

# Add trend lines for each group, then the overall trend line fitted to all
# runners together. scatter=False draws only the regression line.
trend_lines = [
    (data[data["Age Group"] == "Young"], "red", "Trend: Young"),
    (data[data["Age Group"] == "Old"], "blue", "Trend: Old"),
    (data, "black", "Overall Trend"),
]
for trend_data, trend_color, trend_label in trend_lines:
    sns.regplot(
        x="Miles Ran Per Week",
        y="Fastest Mile Time (seconds)",
        data=trend_data,
        scatter=False,
        color=trend_color,
        label=trend_label,
    )

# Customize the plot
plt.title("Simpson's Paradox in Running Data", fontsize=16)
plt.xlabel("Miles Ran Per Week", fontsize=12)
plt.ylabel("Fastest Mile Time (seconds)", fontsize=12)
plt.legend(title="Group")
plt.grid(True, alpha=0.3)
plt.show()

Ryan is a Data Scientist at a fintech company, where he focuses on fraud prevention in underwriting and risk. Before that, he worked as a Data Analyst at a tax software company. He holds a degree in Electrical Engineering from UCF.