Scikit-learn Pipelines

				
					import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


				
			
				
					# Toy data set: follower counts (with missing values) and a binary sold-out label.
d1 = {
    'Social_media_followers': [
        1000000, np.nan, 2000000, 1310000, 1700000,
        np.nan, 4100000, 1600000, 2200000, 1000000,
    ],
    'Sold_out': [1, 0, 0, 1, 0, 0, 0, 1, 0, 1],
}
df1 = pd.DataFrame(data=d1)

# Features stay 2-D: double brackets select a one-column DataFrame.
X1 = df1[['Social_media_followers']]

# Target is selected as a 1-D Series. scikit-learn classifiers expect y of
# shape (n_samples,); a one-column DataFrame is treated as a column vector
# and triggers a DataConversionWarning when the model is fitted.
y1 = df1['Sold_out']
				
			
				
					# Split with a fixed seed so the train/test partition is repeatable.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1,
    y1,
    test_size=0.3,
    random_state=19,
)

# Step 1: replace missing follower counts with the column mean.
imputer = SimpleImputer(strategy='mean')

# Step 2: logistic regression with default hyper-parameters.
lr = LogisticRegression()

# make_pipeline auto-names each step after its lowercased class name
# ('simpleimputer', 'logisticregression'); the imputer runs before the model.
pipe1 = make_pipeline(imputer, lr)

# Fitting the pipeline fits the imputer and the classifier on the training split.
pipe1.fit(X1_train, y1_train)
				
			
				
					# Score the fitted pipeline on the training split and on the held-out split.
# NOTE: these are bare expressions; outside an interactive session their
# values are computed and then discarded.
pipe1.score(X1_train, y1_train)
pipe1.score(X1_test, y1_test)

# Look inside the fitted steps by their auto-generated names:
# the value the imputer learned for filling gaps ...
pipe1.named_steps['simpleimputer'].statistics_
# ... and the coefficient(s) learned by the logistic regression.
pipe1.named_steps['logisticregression'].coef_
				
			

Numerical and categorical columns need separate preprocessing pipelines, which are then combined with a ColumnTransformer

				
					# Second toy data set: one categorical and one numerical feature, both with gaps.
d2 = {
    'Genre': [
        'Rock', 'Metal', 'Bluegrass', 'Rock', np.nan,
        'Rock', 'Rock', np.nan, 'Bluegrass', 'Rock',
    ],
    'Social_media_followers': [
        1000000, np.nan, 2000000, 1310000, 1700000,
        np.nan, 4100000, 1600000, 2200000, 1000000,
    ],
    'Sold_out': [1, 0, 0, 1, 0, 0, 0, 1, 0, 1],
}

df = pd.DataFrame(data=d2)
df.head(10)

# Features are the first two columns, the target is the third; they are
# selected by label here rather than by position.
X = df.loc[:, ['Genre', 'Social_media_followers']]
y = df['Sold_out']
				
			
				
					# Seeded split of the two-feature data set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=75,
)

# Column groups: each group is routed to its own preprocessing pipeline.
num_cols = ['Social_media_followers']
cat_cols = ['Genre']
				
			
				
					# Numerical branch: fill gaps with the mean, then standardise.
num_pipeline = Pipeline(
    [
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', StandardScaler()),
    ]
)

# Categorical branch: fill gaps with the most frequent category, then one-hot
# encode. handle_unknown='ignore' keeps transform from failing on categories
# not seen during fit; sparse_output=False yields a dense array.
cat_pipeline = Pipeline(
    [
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ]
)

# Apply each branch to its own column group. Columns not listed in either
# group are dropped (remainder='drop'); n_jobs=-1 asks for all available cores.
col_trans = ColumnTransformer(
    [
        ('num_pipeline', num_pipeline, num_cols),
        ('cat_pipeline', cat_pipeline, cat_cols),
    ],
    remainder='drop',
    n_jobs=-1,
)
				
			
				
					# Final model: column-wise preprocessing followed by a decision tree.
# The tree is seeded because DecisionTreeClassifier permutes features randomly
# at each split; without random_state the fitted tree, and therefore the score
# below, can differ from run to run even though the data split is seeded.
dtc = DecisionTreeClassifier(random_state=75)

# make_pipeline auto-names the steps after their lowercased class names.
pipefinal = make_pipeline(col_trans, dtc)

# Fit preprocessing and classifier together on the training split only.
pipefinal.fit(X_train, y_train)

# Score on the held-out split (bare expression: the value is discarded
# outside an interactive session).
pipefinal.score(X_test, y_test)
				
			

Save and reload the pipeline

				
					# Persist the fitted pipeline (preprocessing + model) to disk as one object.
model_path = "pipe.joblib"
joblib.dump(pipefinal, model_path)

# Read it back into a new variable.
# NOTE: joblib files are pickle-based, so only load files from a trusted source.
pipefinal2 = joblib.load(model_path)
				
			

Leave a Reply

Your email address will not be published. Required fields are marked *