Multicollinearity

Multicollinearity arises when two or more predictors in a regression model are highly correlated with one another. It usually leaves overall predictive accuracy intact, but it inflates the variance of the individual coefficient estimates, making them unstable and hard to interpret. To see it in action, we will simulate a baseball dataset in which hits (H) are derived directly from at-bats (AB), detect the problem with a correlation matrix, VIF, and condition indices, and then address it by dropping a feature, applying PCA, and using ridge regression.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score

# Fix the random seed so the synthetic data is reproducible.
np.random.seed(73)
num_players = 200

# Batting averages centered on a realistic league mean, clipped to a plausible range.
batting_average = np.random.normal(0.28, 0.05, num_players)
batting_average = np.clip(batting_average, 0.2, 0.4)

AB = np.random.randint(300, 700, num_players)  # At-bats for the season

# Hits are computed directly from at-bats, so H and AB are collinear by construction.
H = AB * batting_average

Because hits are just at-bats scaled by batting average, AB and H carry nearly the same information; that is the multicollinearity we want to study. The remaining two features, player height and an average wind factor, are generated independently of the AB/H pair.
height = np.random.normal(72, 6, num_players)  # Player height in inches

Wind = np.random.uniform(-1.5, 1.5, num_players)  # Average wind factor affecting performance

# The target: RBIs driven by at-bats, height, and wind, plus random noise.
RBIs = (AB * 0.02) + (height * 0.01) + (Wind * 7) + np.random.normal(0, 10, num_players)

# Round the counting stats to whole numbers (AB is already an integer, so rounding it is a no-op).
RBIs_rounded = np.round(RBIs)
H_rounded = np.round(H)
AB_rounded = np.round(AB)
# Create the DataFrame
data = pd.DataFrame({
    'AB': AB_rounded,        # At-Bats
    'H': H_rounded,          # Hits
    'Height': height,        # Player height (inches)
    'Wind': Wind,            # Wind factor
    'RBIs': RBIs_rounded     # Runs Batted In (target)
})

print(data.head())
X = data[['AB', 'H', 'Height', 'Wind']]
y = data['RBIs']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

mse_before = mean_squared_error(y_test, y_pred)
r2_before = r2_score(y_test, y_pred)

print("Regression Coefficients (Before):", lr.coef_)
print("Mean Squared Error (Before):", mse_before)
print("R^2 (Before):", r2_before)
Correlation Matrix

A correlation heatmap gives the quickest first look: AB and H should show a very high pairwise correlation (they differ only by the batting-average factor), while Height and Wind stay near zero.
correlation_matrix = X.corr()

# Visualize the correlation matrix
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix")
plt.show()

Variance Inflation Factor (VIF)

The VIF for a feature is 1 / (1 - R²), where R² comes from regressing that feature on all of the others; values above roughly 5-10 are a common red flag.
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns

# Note: variance_inflation_factor regresses each column on the others exactly as
# given; without an explicit intercept column (sm.add_constant) the underlying
# R^2 is uncentered, so these values differ from the textbook centered VIF.
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

print(vif_data)
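As a sanity check, the VIF for any single feature is just 1 / (1 - R²) from regressing that feature on the remaining ones. A hand-rolled version for H (an illustrative addition; sm.add_constant is statsmodels' helper for appending an intercept column):

import statsmodels.api as sm

# Regress H on the other predictors (with an intercept) and invert 1 - R^2.
others = X.drop(columns='H')
aux = LinearRegression().fit(others, X['H'])
print("Manual VIF for H:", 1 / (1 - aux.score(others, X['H'])))

# Matches statsmodels when the design matrix carries an explicit constant column.
X_const = sm.add_constant(X)
print("statsmodels VIF for H:",
      variance_inflation_factor(X_const.values, list(X_const.columns).index('H')))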
Instead of using raw height, you might standardize it or bucket it into bins, which can improve the numerical conditioning of the design matrix; a sketch of both options follows.
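For example, a minimal sketch of both options (the z-scoring and the bin edges below are arbitrary illustrative choices, not part of the pipeline above):

# Option 1: standardize height; option 2: bucket it into coarse categories.
data['Height_z'] = (data['Height'] - data['Height'].mean()) / data['Height'].std()
data['Height_bin'] = pd.cut(data['Height'],
                            bins=[0, 68, 74, np.inf],
                            labels=['short', 'average', 'tall'])
print(data[['Height', 'Height_z', 'Height_bin']].head())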
Calculate the Condition Index (CI)

Condition indices are the square roots of the largest eigenvalue of the standardized feature covariance matrix divided by each eigenvalue; indices above roughly 30 are usually read as strong multicollinearity.
features = data[['AB', 'H', 'Height', 'Wind']]

# Standardize so every feature contributes on the same scale.
normalized_features = (features - features.mean()) / features.std()

cov_matrix = np.cov(normalized_features.T)
eigenvalues, _ = np.linalg.eig(cov_matrix)

# Condition index: sqrt(largest eigenvalue / each eigenvalue).
condition_indices = np.sqrt(eigenvalues.max() / eigenvalues)

ci_data = pd.DataFrame({
    "Condition Index": condition_indices
})

print(ci_data)
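As a cross-check (an added aside, not in the original walkthrough): the largest condition index equals the 2-norm condition number of the standardized design matrix, so NumPy's np.linalg.cond should agree up to floating-point noise:

# cond() returns the ratio of the largest to smallest singular value of the
# (rectangular) standardized feature matrix, which equals the largest condition index.
print("np.linalg.cond:", np.linalg.cond(normalized_features.values))
print("Max condition index:", condition_indices.max())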
How to Address Multicollinearity
Drop a Feature (At-Bats)
The most direct remedy is to drop one member of the collinear pair; here we drop AB and keep H.
X_reduced = X[['H', 'Height', 'Wind']]

With AB removed, take another look at the correlation matrix:
correlation_matrix = X_reduced.corr()

# Visualize the correlation matrix
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix")
plt.show()
VIF
vif_data_2 = pd.DataFrame()
vif_data_2["Feature"] = X_reduced.columns
vif_data_2["VIF"] = [variance_inflation_factor(X_reduced.values, i) for i in range(X_reduced.shape[1])]

print(vif_data_2)
# Refit the regression without AB.
lr_reduced = LinearRegression()
lr_reduced.fit(X_train[['H', 'Height', 'Wind']], y_train)
y_pred_reduced = lr_reduced.predict(X_test[['H', 'Height', 'Wind']])

mse_reduced = mean_squared_error(y_test, y_pred_reduced)
r2_reduced = r2_score(y_test, y_pred_reduced)

print("Regression Coefficients (Reduced):", lr_reduced.coef_)
print("Mean Squared Error (Reduced):", mse_reduced)
print("R^2 (Reduced):", r2_reduced)
Perform PCA

PCA rotates the features into orthogonal principal components, so the transformed predictors are uncorrelated by construction. Note that PCA is scale-sensitive; it is common to standardize features first, which this walkthrough skips.
pca = PCA(n_components=3)  # Reduce to 3 components

# Fit the rotation on the training split only, then apply it to the test split.
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

lr_pca = LinearRegression()
lr_pca.fit(X_train_pca, y_train)
y_pred_pca = lr_pca.predict(X_test_pca)

mse_pca = mean_squared_error(y_test, y_pred_pca)
r2_pca = r2_score(y_test, y_pred_pca)

print("Regression Coefficients (PCA):", lr_pca.coef_)
print("Mean Squared Error (PCA):", mse_pca)
print("R^2 (PCA):", r2_pca)
Ridge Regression
Ridge regression reduces the impact of multicollinearity by adding an L2 penalty on the regression coefficients, shrinking them toward zero instead of letting the near-collinear directions inflate them; a short alpha sweep after the fit below shows the shrinkage directly.
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)
y_pred_ridge = ridge.predict(X_test)

mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

print("Regression Coefficients (Ridge):", ridge.coef_)
print("Mean Squared Error (Ridge):", mse_ridge)
print("R^2 (Ridge):", r2_ridge)
