Machine Learning: Handling Imbalanced Classes
# Importing the necessary libraries
# Dataset: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE, RandomOverSampler, ADASYN
from imblearn.under_sampling import RandomUnderSampler, EditedNearestNeighbours
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# no ensemble
# no pipelines or cross validation
# The imblearn package contains a lot of different samplers for oversampling and undersampling.
# These samplers can not be placed in a standard sklearn pipeline.
# look over this full thing
# https://www.kaggle.com/code/marcinrutecki/smote-and-tomek-links-for-imbalanced-data
#Read over
#data professor
#emma Ding
#mahesh huddar
#ritvik math
Part 1 Load a Dataset
# Read the credit-card fraud CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='/content/creditcard.csv')
# Preview the first five rows (displayed as the notebook cell output).
df.head(n=5)

Part 2 SIMPLE EDA
# Check class distribution
print(df['Class'].value_counts())

# Visualizing the class distribution
plt.figure(figsize=(6, 4))
sns.countplot(x='Class', data=df)
plt.title('Class Distribution')
plt.show()

Part 3 Set Up the Data
# Prepare features and target
X = df.drop(['Class', 'Time'], axis=1)  # Dropping 'Time' as it's not useful for prediction
y = df['Class']

# Hold out 20% for testing; stratify on y so both splits keep the same
# class ratio as the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
Part 4 BASELINE MODEL – NO FIXING THE IMBALANCE
# Create a baseline Logistic Regression model without handling class imbalance
model = LogisticRegression(max_iter=1000, random_state=42)
# model = RandomForestClassifier(random_state=42)

# Train the model on the imbalanced data
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

f1_score(y_test, y_pred)

# Visualize the confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.xlabel('Predicted Labels')  # Add x-axis label for predicted values
plt.ylabel('Actual Labels')  # Add y-axis label for actual values
plt.title('Confusion Matrix (Imbalanced Data)')
plt.show()

Part 5
Oversampling Examples
Oversampling Example 1: RandomOverSampler
# Resample only the training split; the test split is left untouched so the
# evaluation still reflects the original class distribution.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Assuming y_resampled is a Pandas Series
class_counts = y_resampled.value_counts()

# Plot the class distribution
plt.figure(figsize=(8, 6))
plt.bar(class_counts.index, class_counts.values, color=['blue', 'orange'])
plt.xticks([0, 1], ['Class 0', 'Class 1'])
plt.xlabel('Class')
plt.ylabel('Count')
plt.title('Class Distribution After Resampling')
plt.show()

# Train the model on the resampled data
model.fit(X_resampled, y_resampled)

y_pred_ros = model.predict(X_test)
accuracy_score(y_test, y_pred_ros)

To start we’re going to create a simple dataframe in python
Random oversampling led to overfitting.
f1_score(y_test, y_pred_ros)

# Visualize the confusion matrix for the model trained on randomly
# oversampled data.
cm = confusion_matrix(y_test, y_pred_ros)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.xlabel('Predicted Labels')  # Add x-axis label for predicted values
plt.ylabel('Actual Labels')  # Add y-axis label for actual values
# Title names the technique; the original reused the baseline's title.
plt.title('Confusion Matrix (RandomOverSampler)')
plt.show()

Part 6
Oversampling Method Example 2: SMOTE

# Resample only the training split with SMOTE; the test split stays as-is.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Assuming y_resampled is a Pandas Series
class_counts = y_resampled.value_counts()

# Plot the class distribution
plt.figure(figsize=(8, 6))
plt.bar(class_counts.index, class_counts.values, color=['blue', 'orange'])
plt.xticks([0, 1], ['Class 0', 'Class 1'])
plt.xlabel('Class')
plt.ylabel('Count')
plt.title('Class Distribution After Resampling')
plt.show()

# Train the model on the resampled data
model.fit(X_resampled, y_resampled)

# Make predictions on the test set
y_pred_smote = model.predict(X_test)
accuracy_score(y_test, y_pred_smote)

f1_score(y_test, y_pred_smote)

# NOTE: a stray `df.loc[idx]` expression was removed here; `idx` is never
# assigned in the visible script, so it raised NameError.

# Visualize the confusion matrix
cm = confusion_matrix(y_test, y_pred_smote)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.xlabel('Predicted Labels')  # Add x-axis label for predicted values
plt.ylabel('Actual Labels')  # Add y-axis label for actual values
# Title names the technique; the original reused the baseline's title.
plt.title('Confusion Matrix (SMOTE)')
plt.show()

Part 7 – Oversampling Method Example 3: ADASYN
Scale the features with StandardScaler or MinMaxScaler from sklearn before applying ADASYN:
# Fit the scaler on the training split only, then resample the scaled
# training data with ADASYN.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
adasyn = ADASYN(random_state=42)
X_resampled, y_resampled = adasyn.fit_resample(X_train_scaled, y_train)
class_counts = y_resampled.value_counts()

# Plot the class distribution
plt.figure(figsize=(8, 6))
plt.bar(class_counts.index, class_counts.values, color=['blue', 'orange'])
plt.xticks([0, 1], ['Class 0', 'Class 1'])
plt.xlabel('Class')
plt.ylabel('Count')
plt.title('Class Distribution After Resampling')
plt.show()

model.fit(X_resampled, y_resampled)

X_test_scaled = scaler.transform(X_test)  # Ensure test data is scaled
# NOTE: these are ADASYN predictions; the name y_pred_smote is kept because
# the confusion-matrix cell that follows reads it.
y_pred_smote = model.predict(X_test_scaled)
accuracy_score(y_test, y_pred_smote)

Overfitting
# Visualize the confusion matrix for the ADASYN-trained model
# (y_pred_smote holds the ADASYN predictions at this point).
cm = confusion_matrix(y_test, y_pred_smote)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.xlabel('Predicted Labels')  # Add x-axis label for predicted values
plt.ylabel('Actual Labels')  # Add y-axis label for actual values
# Title names the technique; the original reused the baseline's title.
plt.title('Confusion Matrix (ADASYN)')
plt.show()

Part 8
Undersampling
Part 9: RandomUnderSampler
# Undersample only the training split; the test split stays as-is.
undersample = RandomUnderSampler(random_state=42)
X_resampled_under, y_resampled_under = undersample.fit_resample(X_train, y_train)
class_counts = y_resampled_under.value_counts()

# Plot the class distribution
plt.figure(figsize=(8, 6))
plt.bar(class_counts.index, class_counts.values, color=['blue', 'orange'])
plt.xticks([0, 1], ['Class 0', 'Class 1'])
plt.xlabel('Class')
plt.ylabel('Count')
plt.title('Class Distribution After Resampling')
plt.show()

# Train the model on the undersampled data
model.fit(X_resampled_under, y_resampled_under)

# Make predictions on the test set
y_pred_under = model.predict(X_test)
accuracy_score(y_test, y_pred_under)

f1_score(y_test, y_pred_under)

# Visualize the confusion matrix
cm = confusion_matrix(y_test, y_pred_under)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.xlabel('Predicted Labels')  # Add x-axis label for predicted values
plt.ylabel('Actual Labels')  # Add y-axis label for actual values
# Title names the technique; the original reused the baseline's title.
plt.title('Confusion Matrix (RandomUnderSampler)')
plt.show()

#another example
#EditedNearestNeighbours
#Removes samples from the majority class that are misclassified by a k-nearest neighbors classifier.
# Bind the sampler to its own name. The original assigned the instance to
# `EditedNearestNeighbours`, shadowing the imported class so that re-running
# the cell tried to call an instance instead of the class.
enn = EditedNearestNeighbours()
X_resampled_under, y_resampled_under = enn.fit_resample(X_train, y_train)

# Original class counts, for comparison with the resampled counts below.
print(df['Class'].value_counts())

y_resampled_under.value_counts()

class_counts = y_resampled_under.value_counts()

# Plot the class distribution
plt.figure(figsize=(8, 6))
plt.bar(class_counts.index, class_counts.values, color=['blue', 'orange'])
plt.xticks([0, 1], ['Class 0', 'Class 1'])
plt.xlabel('Class')
plt.ylabel('Count')
plt.title('Class Distribution After Resampling')
plt.show()

# Train the model on the undersampled data
model.fit(X_resampled_under, y_resampled_under)

# Make predictions on the test set
y_pred_under = model.predict(X_test)
accuracy_score(y_test, y_pred_under)

f1_score(y_test, y_pred_under)

# Visualize the confusion matrix
cm = confusion_matrix(y_test, y_pred_under)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.xlabel('Predicted Labels')  # Add x-axis label for predicted values
plt.ylabel('Actual Labels')  # Add y-axis label for actual values
# Title names the technique; the original reused the baseline's title.
plt.title('Confusion Matrix (EditedNearestNeighbours)')
plt.show()

Cost-Sensitive Learning
Example: Balanced Class Weight
# Logistic Regression with class_weight='balanced', trained on the original
# (un-resampled) training split.
model = LogisticRegression(max_iter=1000, random_state=42, class_weight='balanced')
# The original never fitted this new estimator, so predict() below raised
# NotFittedError.
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

f1_score(y_test, y_pred)

# Visualize the confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.xlabel('Predicted Labels')  # Add x-axis label for predicted values
plt.ylabel('Actual Labels')  # Add y-axis label for actual values
plt.title('Confusion Matrix (Imbalanced Data)')
plt.show()

Example: Custom Class Weights
Several machine learning algorithms, such as Decision Trees, SVM, and Random Forest,
allow you to specify class weights.
This is a form of cost-sensitive learning that is built directly into the algorithm.
# Manually set custom class weights
# In this example, we give a higher weight to the minority class (fraud class, which is typically 1)
class_weights = {0: 1, 1: 10}  # Class 0 (non-fraud) gets weight 1, Class 1 (fraud) gets weight 10
model = LogisticRegression(max_iter=1000, random_state=42, class_weight=class_weights)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

f1_score(y_test, y_pred)

# Visualize the confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.xlabel('Predicted Labels')  # Add x-axis label for predicted values
plt.ylabel('Actual Labels')  # Add y-axis label for actual values
plt.title('Confusion Matrix (Imbalanced Data)')
plt.show()

# NOTE: four stray `df.loc[idx]` expressions were removed here; `idx` is never
# assigned in the visible script, so each one raised NameError.
Ryan is a Data Scientist at a fintech company, where he focuses on fraud prevention in underwriting and risk. Before that, he worked as a Data Analyst at a tax software company. He holds a degree in Electrical Engineering from UCF.