Multicollinearity
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
from numpy.linalg import eig
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error, r2_score
np.random.seed(73)
num_players = 200
batting_average = np.random.normal(0.28, 0.05, num_players)
batting_average = np.clip(batting_average, 0.2, 0.4)
AB = np.random.randint(300, 700, num_players) # At-bats for the season
H = AB * batting_average
Hits are derived directly from at-bats and batting average, so H and AB are highly correlated by construction; height and wind are included as additional, weaker predictors.
height = np.random.normal(72, 6, num_players)
Wind = np.random.uniform(-1.5, 1.5, num_players) # Average wind factor affecting performance
RBIs = (AB * 0.02) + (height * 0.01) + (Wind * 7) + np.random.normal(0, 10, num_players)  # Synthetic RBI target
RBIs_rounded = np.round(RBIs)
H_rounded = np.round(H)
AB_rounded = np.round(AB)
# Create the DataFrame
data = pd.DataFrame({
    'AB': AB_rounded,      # At-Bats
    'H': H_rounded,        # Hits
    'Height': height,      # Player height (inches)
    'Wind': Wind,          # Wind Factor
    'RBIs': RBIs_rounded   # Runs Batted In (Target)
})
print(data.head())

X = data[['AB', 'H', 'Height', 'Wind']]
y = data['RBIs']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
mse_before = mean_squared_error(y_test, y_pred)
r2_before = r2_score(y_test, y_pred)
print("Regression Coefficients (Before):", lr.coef_)

print("Mean Squared Error (Before):", mse_before)

print("R^2 (Before):", r2_before)

Correlation Matrix
correlation_matrix = X.corr()
# Visualize the correlation matrix
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix")
plt.show()

VIF (Variance Inflation Factor)
As a common rule of thumb, a VIF above roughly 5-10 indicates that a feature is largely explained by the other predictors.
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vif_data)

Instead of using raw height, you might standardize it or bin it into categories, which could reduce its numerical interdependence with the other features, as sketched below.
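A rough sketch of that idea; the cut points and labels are hypothetical, not from the original analysis.

# Hypothetical alternative encodings for height
height_z = (data['Height'] - data['Height'].mean()) / data['Height'].std()  # standardized height

height_bins = pd.cut(
    data['Height'],
    bins=[0, 70, 75, np.inf],             # illustrative cut points
    labels=['short', 'average', 'tall']
)
print(height_bins.value_counts())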
Calculate Condition Index (CI)
Each condition index is the square root of the ratio of the largest eigenvalue to a given eigenvalue of the standardized feature matrix; indices above roughly 30 are commonly read as a sign of serious multicollinearity.
features = data[['AB', 'H', 'Height', 'Wind']]
normalized_features = (features - features.mean()) / features.std()
cov_matrix = np.cov(normalized_features.T)
eigenvalues, _ = np.linalg.eig(cov_matrix)
condition_indices = np.sqrt(eigenvalues.max() / eigenvalues)
ci_data = pd.DataFrame({ "Condition Index": condition_indices })
print(ci_data)

How to Address Multicollinearity
Drop a Feature (At-Bats)
X_reduced = X[['H', 'Height', 'Wind']]
Look at the correlation matrix for the reduced feature set.
correlation_matrix = X_reduced.corr()
# Visualize the correlation matrix
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("Correlation Matrix")
plt.show()

VIF
vif_data_2 = pd.DataFrame()
vif_data_2["Feature"] = X_reduced.columns
vif_data_2["VIF"] = [variance_inflation_factor(X_reduced.values, i) for i in range(X_reduced.shape[1])]
print(vif_data_2)

lr_reduced = LinearRegression()
lr_reduced.fit(X_train[['H', 'Height', 'Wind']], y_train)

y_pred_reduced = lr_reduced.predict(X_test[['H', 'Height', 'Wind']])
mse_reduced = mean_squared_error(y_test, y_pred_reduced)
r2_reduced = r2_score(y_test, y_pred_reduced)
print("Regression Coefficients (Reduced):", lr_reduced.coef_)

print("Mean Squared Error (Reduced):", mse_reduced)

print("R^2 (Reduced):", r2_reduced)

Perform PCA
PCA replaces the original correlated features with orthogonal principal components, so the transformed inputs are uncorrelated by construction.
pca = PCA(n_components=3) # Reduce to 3 components
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
lr_pca = LinearRegression()
lr_pca.fit(X_train_pca, y_train)

y_pred_pca = lr_pca.predict(X_test_pca)
mse_pca = mean_squared_error(y_test, y_pred_pca)
r2_pca = r2_score(y_test, y_pred_pca)
print("Regression Coefficients (PCA):", lr_pca.coef_)

print("Mean Squared Error (PCA):", mse_pca)

print("R^2 (PCA):", r2_pca)

Ridge Regression
Ridge regression reduces the impact of multicollinearity by adding an L2 penalty on the regression coefficients, shrinking them toward zero and stabilizing the estimates when features are highly correlated.
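For reference, the standard ridge objective (not written out in the original) is

minimize over β:  ‖y − Xβ‖² + α‖β‖²

where α controls the strength of the shrinkage (alpha=1.0 in the code below).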
ridge = Ridge(alpha=1.0)
ridge.fit(X_train, y_train)

y_pred_ridge = ridge.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)
print("Regression Coefficients (Ridge):", ridge.coef_)

print("Mean Squared Error (Ridge):", mse_ridge)

print("Mean Squared Error (Ridge):", r2_ridge)

Ryan is a Data Scientist at a fintech company, where he focuses on fraud prevention in underwriting and risk. Before that, he worked as a Data Analyst at a tax software company. He holds a degree in Electrical Engineering from UCF.