Sklearn Gaussian Mixture Models

import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Synthetic data set: 300 samples spread around 3 centers with a standard
# deviation of 2.0. random_state is fixed so every run yields the same points.
# true_labels holds the generating center of each sample.
data, true_labels = make_blobs(n_samples=300, centers=3, cluster_std=2.0, random_state=42)

# Scatter the first two columns of the raw data, without any cluster coloring.
plt.scatter(data[:, 0], data[:, 1], s=30)
plt.title("Generated Blob Data")
plt.show()

Example 1 -

# Fit a three-component Gaussian mixture to the blob data.
# random_state is fixed so the fit is reproducible.
gmm = GaussianMixture(n_components=3, random_state=42)
gmm.fit(data)

# Hard assignment: one component index per sample.
predicted_labels = gmm.predict(data)

# Fitted mean of each mixture component, one row per component.
cluster_centers = gmm.means_
print(cluster_centers)
Plot clustered data
# Color every sample by the component it was assigned to.
plt.scatter(data[:, 0], data[:, 1], c=predicted_labels, cmap='viridis', s=30, label='Cluster Points')

# Annotate cluster centers with their labels
for idx, (x, y) in enumerate(cluster_centers):
    plt.text(
        x, y, str(idx),
        color="black", fontsize=12, ha="center", va="center",
        bbox=dict(facecolor='white', edgecolor='black', boxstyle='round,pad=0.3'),
    )

# Add title and legend
plt.title("GMM Clustering Predictions with Cluster Labels")
# The scatter call sets label=...; without an explicit legend() call that
# label is never drawn.
plt.legend()
plt.show()
Predict a New Point and see the probabilities
# A single query sample, wrapped in an outer list so it is passed as a
# one-row, two-column input like the training data.
new_point = [[0, 3]]

# predict returns one component index per input row; take the only entry.
predicted_cluster = gmm.predict(new_point)
print(f"The point {new_point} is predicted to belong to cluster {predicted_cluster[0]}")

The point [[0, 3]] is predicted to belong to cluster 2

# Soft assignment: per-component membership probabilities for the query point
# (one row per input sample, one column per mixture component).
probabilities = gmm.predict_proba(new_point)
print(f"Probabilities for each cluster: {probabilities}")

Probabilities for each cluster: [[3.65071761e-02 6.71974709e-07 9.63492152e-01]]

Example 2 -

This example looks at Babe Ruth card values. It includes a cluster of 300 cards priced around $5k and a cluster of 100 cards priced around $20k.
# Seed NumPy's global generator so the simulated prices are reproducible.
np.random.seed(42)

# Lower-priced group: 300 normal draws centered on 5,000 (std 2,500).
# Only strictly positive draws are kept, so the group can end up with
# fewer than 300 values.
cluster_1 = np.random.normal(loc=5000, scale=2500, size=300)
cluster_1 = cluster_1[cluster_1 > 0]

# Higher-priced group: 100 normal draws centered on 20,000 (std 5,000),
# filtered the same way.
cluster_2 = np.random.normal(loc=20000, scale=5000, size=100)
cluster_2 = cluster_2[cluster_2 > 0]

# Stack both groups into a single column (n_samples, 1), the 2-D layout
# the mixture model is fitted on.
prices = np.concatenate([cluster_1, cluster_2]).reshape(-1, 1)
# histplot is given a flat 1-D array rather than the (n, 1) column.
prices_flat = prices.flatten()

# Plot the histogram with KDE
plt.figure(figsize=(8, 6))
sns.histplot(prices_flat, kde=True, bins=30, color='blue', edgecolor='black', stat='density')

# Add titles and labels
plt.title("Distribution of Babe Ruth Baseball Card Prices", fontsize=16)
plt.xlabel("Price ($)", fontsize=12)
plt.ylabel("Density", fontsize=12)
plt.show()
# Two-component mixture over the price column. fit() returns the fitted
# estimator itself, so construction and fitting are chained here.
# Note: this rebinds `gmm`, replacing the model fitted in Example 1.
gmm = GaussianMixture(n_components=2, random_state=42).fit(prices)

# One component index per price (overwrites Example 1's predicted_labels).
predicted_labels = gmm.predict(prices)
Create a dataframe with the predictions
# Pair every price with the mixture component it was assigned to.
data_df = pd.DataFrame({
    'Price': prices.flatten(),
    'Cluster': predicted_labels,
})
# Bare expression: shows the first 10 rows when run as a notebook cell
# (it has no visible effect in a plain script).
data_df.head(10)

# Same rows ordered from the most to the least expensive card.
top_expensive = data_df.sort_values(by='Price', ascending=False)
top_expensive.head(10)

Leave a Reply

Your email address will not be published. Required fields are marked *