Machine Learning with Imbalanced Classes

				
					# Importing the necessary libraries
#https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours
from sklearn.preprocessing import StandardScaler


#no ensemble

#no pipelines or cross validation

#The imblearn package contains many different samplers for oversampling and undersampling.
#These samplers cannot be placed in a standard sklearn Pipeline; use imblearn.pipeline.Pipeline instead.




#look over this full thing
##https://www.kaggle.com/code/marcinrutecki/smote-and-tomek-links-for-imbalanced-data
				
			
#Read over
#data professor
#emma Ding
#mahesh huddar
#ritvik math
Part 1 Load a Dataset
				
					df = pd.read_csv('/content/creditcard.csv')
				
			
				
					df.head(5)
				
			
Part 2 SIMPLE EDA
				
					# Check class distribution
print(df['Class'].value_counts())
				
			
				
					# Visualizing the class distribution
plt.figure(figsize=(6, 4))
sns.countplot(x='Class', data=df)
plt.title('Class Distribution')
plt.show()
				
			
Part 3 Set Up the Data
				
					# Prepare features and target
X = df.drop(['Class', 'Time'], axis=1)  # Dropping 'Time' as it's not useful for prediction
y = df['Class']
				
			
				
					X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
				
			
Part 4 BASELINE MODEL – NO FIXING THE IMBALANCE
				
					# Create a baseline Logistic Regression model without handling class imbalance
model = LogisticRegression(max_iter=1000, random_state=42)
#model = RandomForestClassifier(random_state=42)
				
			
				
					# Train the model on the imbalanced data
model.fit(X_train, y_train)
				
			
				
					# Make predictions
y_pred = model.predict(X_test)
				
			
				
					accuracy_score(y_test, y_pred)
				
			
				
					f1_score(y_test, y_pred)
				
			
				
					# Visualize the confusion matrix
# Baseline confusion matrix for the model trained on the untouched,
# imbalanced training split (y axis: actual class, x axis: predicted class).
class_names = ['Non-Fraud', 'Fraud']
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix (Imbalanced Data)')
plt.xlabel('Predicted Labels')
plt.ylabel('Actual Labels')
plt.show()
				
			
Part 5
Oversampling Examples
Oversampling Example 1: RandomOverSampler
				
					ros = RandomOverSampler(random_state=42)
				
			
				
					X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
				
			
				
					# Assuming y_resampled is a Pandas Series
class_counts = y_resampled.value_counts()
				
			
				
					# Plot the class distribution
plt.figure(figsize=(8, 6))
plt.bar(class_counts.index, class_counts.values, color=['blue', 'orange'])
plt.xticks([0, 1], ['Class 0', 'Class 1'])
plt.xlabel('Class')
plt.ylabel('Count')
plt.title('Class Distribution After Resampling')
plt.show()
				
			
				
					# Train the model on the resampled data
model.fit(X_resampled, y_resampled)
				
			
				
					y_pred_ros = model.predict(X_test)
				
			
				
					accuracy_score(y_test, y_pred_ros)
				
			

To start we’re going to create a simple dataframe in python

Random oversampling duplicates minority-class rows, which led to overfitting here.
				
					f1_score(y_test, y_pred_ros)
				
			
				
					# Visualize the confusion matrix
# Confusion matrix for the model trained on RandomOverSampler-resampled data,
# evaluated against the test split.
cm = confusion_matrix(y_test, y_pred_ros)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.xlabel('Predicted Labels')  # x axis: predicted class
plt.ylabel('Actual Labels')    # y axis: actual class
# The title names the technique so this plot is distinguishable from the baseline one.
plt.title('Confusion Matrix (RandomOverSampler)')
plt.show()
				
			
Part 6
Oversampling Example 2: SMOTE
 
				
					smote = SMOTE(random_state=42)
				
			
				
					X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
				
			
				
					# Assuming y_resampled is a Pandas Series
class_counts = y_resampled.value_counts()
				
			
				
					# Plot the class distribution
plt.figure(figsize=(8, 6))
plt.bar(class_counts.index, class_counts.values, color=['blue', 'orange'])
plt.xticks([0, 1], ['Class 0', 'Class 1'])
plt.xlabel('Class')
plt.ylabel('Count')
plt.title('Class Distribution After Resampling')
plt.show()
				
			
				
					# Train the model on the resampled data
model.fit(X_resampled, y_resampled)
				
			
				
					# Make predictions on the test set
y_pred_smote = model.predict(X_test)
				
			
				
					accuracy_score(y_test, y_pred_smote)
				
			
				
					f1_score(y_test, y_pred_smote)
				
			
				
					df.loc[idx]  # NOTE(review): `idx` is not defined anywhere in the visible source, so this cell raises NameError as written -- confirm intent or remove
				
			
				
					# Visualize the confusion matrix
# Confusion matrix for the model trained on SMOTE-resampled data,
# evaluated against the test split.
cm = confusion_matrix(y_test, y_pred_smote)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.xlabel('Predicted Labels')  # x axis: predicted class
plt.ylabel('Actual Labels')    # y axis: actual class
# The title names the technique so this plot is distinguishable from the baseline one.
plt.title('Confusion Matrix (SMOTE)')
plt.show()
				
			
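Part 7: Oversampling Example 3: ADASYN
Scale the features with StandardScaler or MinMaxScaler from sklearn before applying ADASYN, because ADASYN relies on nearest-neighbour distances: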
				
					scaler = StandardScaler()
				
			
				
					X_train_scaled = scaler.fit_transform(X_train)
				
			
				
					adasyn = ADASYN(random_state=42)
				
			
				
					X_resampled, y_resampled = adasyn.fit_resample(X_train_scaled, y_train)
				
			
				
					# Count samples per class after ADASYN (computed once; the duplicate cell was redundant).
# value_counts() orders by count, and ADASYN only approximately balances the
# classes, so sort by label to keep bar colors tied to class 0 / class 1.
class_counts = y_resampled.value_counts().sort_index()
# Plot the class distribution
plt.figure(figsize=(8, 6))
plt.bar(class_counts.index, class_counts.values, color=['blue', 'orange'])
plt.xticks([0, 1], ['Class 0', 'Class 1'])
plt.xlabel('Class')
plt.ylabel('Count')
plt.title('Class Distribution After Resampling')
plt.show()
				
			
				
					model.fit(X_resampled, y_resampled)
				
			
				
					X_test_scaled = scaler.transform(X_test)  # Ensure test data is scaled
				
			
				
					y_pred_smote = model.predict(X_test_scaled)
				
			
				
					accuracy_score(y_test, y_pred_smote)
				
			
Overfitting
				
					# Visualize the confusion matrix
cm = confusion_matrix(y_test, y_pred_smote)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.xlabel('Predicted Labels')  # Add x-axis label for predicted values
plt.ylabel('Actual Labels')    # Add y-axis label for actual values
plt.title('Confusion Matrix (Imbalanced Data)')
plt.show()
				
			
Part 8
Undersampling
Part 9: RandomUnderSampler
				
					undersample = RandomUnderSampler(random_state=42)
				
			
				
					X_resampled_under, y_resampled_under = undersample.fit_resample(X_train, y_train)
				
			
				
					class_counts = y_resampled_under.value_counts()
# Plot the class distribution
plt.figure(figsize=(8, 6))
plt.bar(class_counts.index, class_counts.values, color=['blue', 'orange'])
plt.xticks([0, 1], ['Class 0', 'Class 1'])
plt.xlabel('Class')
plt.ylabel('Count')
plt.title('Class Distribution After Resampling')
plt.show()
				
			
				
					# Train the model on the undersampled data
model.fit(X_resampled_under, y_resampled_under)
				
			
				
					# Make predictions on the test set
y_pred_under = model.predict(X_test)
				
			
				
					accuracy_score(y_test, y_pred_under)
				
			
				
					f1_score(y_test, y_pred_under)
				
			
				
					# Visualize the confusion matrix
# Confusion matrix for the model trained on RandomUnderSampler-resampled data,
# evaluated against the test split.
cm = confusion_matrix(y_test, y_pred_under)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.xlabel('Predicted Labels')  # x axis: predicted class
plt.ylabel('Actual Labels')    # y axis: actual class
# The title names the technique so this plot is distinguishable from the baseline one.
plt.title('Confusion Matrix (RandomUnderSampler)')
plt.show()
				
			
#another example
#EditedNearestNeighbours

#Removes samples from the majority class that are misclassified by a k-nearest neighbors classifier.
				
					# Bind the sampler instance to its own name. Assigning it to
# `EditedNearestNeighbours` would shadow the imported class, so re-running
# this cell would fail with "object is not callable".
enn = EditedNearestNeighbours()
				
			
				
					# Undersample the training split only; the test split is left as-is.
X_resampled_under, y_resampled_under = enn.fit_resample(X_train, y_train)
				
			
				
					print(df['Class'].value_counts())
				
			
				
					 y_resampled_under.value_counts()
				
			
				
					class_counts = y_resampled_under.value_counts()
# Plot the class distribution
plt.figure(figsize=(8, 6))
plt.bar(class_counts.index, class_counts.values, color=['blue', 'orange'])
plt.xticks([0, 1], ['Class 0', 'Class 1'])
plt.xlabel('Class')
plt.ylabel('Count')
plt.title('Class Distribution After Resampling')
plt.show()
				
			
				
					# Train the model on the undersampled data
model.fit(X_resampled_under, y_resampled_under)
				
			
				
					# Make predictions on the test set
y_pred_under = model.predict(X_test)
				
			
				
					accuracy_score(y_test, y_pred_under)
				
			
				
					f1_score(y_test, y_pred_under)
				
			
				
					# Visualize the confusion matrix
# Confusion matrix for the model trained on EditedNearestNeighbours-resampled
# data, evaluated against the test split.
cm = confusion_matrix(y_test, y_pred_under)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.xlabel('Predicted Labels')  # x axis: predicted class
plt.ylabel('Actual Labels')    # y axis: actual class
# The title names the technique so this plot is distinguishable from the baseline one.
plt.title('Confusion Matrix (EditedNearestNeighbours)')
plt.show()
				
			
Cost-Sensitive Learning
Example: Balanced Class Weight
				
					# Logistic Regression with class_weight='balanced': no resampling is applied;
# the class weighting is handled by the estimator itself.
model = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
				
			
				
					# Train on the original (imbalanced) training split. This is a brand-new
# estimator, so it must be fitted before predict() can be called.
model.fit(X_train, y_train)
				
			
				
					# Make predictions
y_pred = model.predict(X_test)
				
			
				
					accuracy_score(y_test, y_pred)
				
			
				
					f1_score(y_test, y_pred)
				
			
				
					# Visualize the confusion matrix
# Confusion matrix for the class_weight='balanced' model,
# evaluated against the test split.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.xlabel('Predicted Labels')  # x axis: predicted class
plt.ylabel('Actual Labels')    # y axis: actual class
# The title names the technique so this plot is distinguishable from the baseline one.
plt.title('Confusion Matrix (Balanced Class Weight)')
plt.show()
				
			
Example: Custom Class Weights
Several machine learning algorithms, such as Logistic Regression, Decision Trees, SVM, and Random Forest,
allow you to specify class weights.
This is a form of cost-sensitive learning that is built directly into the algorithm.
				
					# Manually set custom class weights
# In this example, we give a higher weight to the minority class (fraud class, which is typically 1)
class_weights = {0: 1, 1: 10}  # Class 0 (non-fraud) gets weight 1, Class 1 (fraud) gets weight 10
				
			
				
					model = LogisticRegression(max_iter=1000, random_state=42, class_weight=class_weights)
				
			
				
					model.fit(X_train, y_train)
				
			
				
					# Make predictions
y_pred = model.predict(X_test)
				
			
				
					accuracy_score(y_test, y_pred)
				
			
				
					f1_score(y_test, y_pred)
				
			
				
					# Visualize the confusion matrix
# Confusion matrix for the model trained with the manually chosen class weights,
# evaluated against the test split.
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.xlabel('Predicted Labels')  # x axis: predicted class
plt.ylabel('Actual Labels')    # y axis: actual class
# The title names the technique so this plot is distinguishable from the baseline one.
plt.title('Confusion Matrix (Custom Class Weights)')
plt.show()
				
			
				
					df.loc[idx]  # NOTE(review): `idx` is not defined anywhere in the visible source; this cell (and the identical ones that follow) raises NameError as written -- confirm intent or remove
				
			
				
					df.loc[idx]
				
			
				
					df.loc[idx]
				
			
				
					df.loc[idx]
				
			

Ryan is a Data Scientist at a fintech company, where he focuses on fraud prevention in underwriting and risk. Before that, he worked as a Data Analyst at a tax software company. He holds a degree in Electrical Engineering from UCF.

Leave a Reply

Your email address will not be published. Required fields are marked *