Scikit-learn Pipelines
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
# --- Example 1: mean-imputation + logistic regression on one numeric feature ---
# Toy dataset: follower counts (two values missing) and a binary sold-out label.
d1 = {
    'Social_media_followers': [1000000, np.nan, 2000000, 1310000, 1700000,
                               np.nan, 4100000, 1600000, 2200000, 1000000],
    'Sold_out': [1, 0, 0, 1, 0, 0, 0, 1, 0, 1],
}
df1 = pd.DataFrame(data=d1)

# Features are kept 2-D (double brackets -> DataFrame). The target is selected
# as a 1-D Series: passing a single-column DataFrame as y makes scikit-learn
# emit a DataConversionWarning ("a column-vector y was passed when a 1d array
# was expected") when the classifier is fitted.
X1 = df1[['Social_media_followers']]
y1 = df1['Sold_out']

# Fixed random_state keeps the 70/30 split reproducible.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size=0.3, random_state=19
)

# The imputer fills missing follower counts with the training-set mean before
# the values reach the classifier.
imputer = SimpleImputer(strategy='mean')
lr = LogisticRegression()

# make_pipeline names each step after its lowercased class name, which is why
# the steps are reachable below as `simpleimputer` and `logisticregression`.
pipe1 = make_pipeline(imputer, lr)
pipe1.fit(X1_train, y1_train)

# Notebook-style bare expressions: their values are displayed in an interactive
# session and discarded when run as a plain script.
pipe1.score(X1_train, y1_train)  # accuracy on the training split
pipe1.score(X1_test, y1_test)    # accuracy on the held-out split
pipe1.named_steps.simpleimputer.statistics_   # fill value learned per feature
pipe1.named_steps.logisticregression.coef_    # fitted coefficient(s)
Numerical and categorical columns need different preprocessing, so build a separate pipeline for each and combine them with a ColumnTransformer
# --- Example 2: mixed numeric + categorical features via ColumnTransformer ---
# Toy dataset: a categorical genre and a numeric follower count (both with
# missing values) plus a binary sold-out label.
d2 = {
    'Genre': ['Rock', 'Metal', 'Bluegrass', 'Rock', np.nan,
              'Rock', 'Rock', np.nan, 'Bluegrass', 'Rock'],
    'Social_media_followers': [1000000, np.nan, 2000000, 1310000, 1700000,
                               np.nan, 4100000, 1600000, 2200000, 1000000],
    'Sold_out': [1, 0, 0, 1, 0, 0, 0, 1, 0, 1],
}
df = pd.DataFrame(data=d2)
df.head(10)  # notebook-style preview; the value is discarded in a plain script

# Positional selection: the first two columns are the features, the third is
# the target (selected as a 1-D Series).
X = df.iloc[:, 0:2]
y = df.iloc[:, 2]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=75
)

# Column groups routed to their respective preprocessing pipelines.
num_cols = ['Social_media_followers']
cat_cols = ['Genre']

# Numeric columns: fill gaps with the column mean, then standardise.
num_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent category, then one-hot
# encode. handle_unknown='ignore' encodes categories unseen during fit as all
# zeros instead of raising; sparse_output=False returns a dense array.
cat_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Apply each pipeline to its own columns and concatenate the results. Columns
# not listed are dropped; n_jobs=-1 runs the transformers on all available cores.
col_trans = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, num_cols),
        ('cat_pipeline', cat_pipeline, cat_cols),
    ],
    remainder='drop',
    n_jobs=-1,
)

# Seed the tree so repeated runs build the same model: with random_state unset,
# equally good candidate splits are broken at random, so the fitted tree (and
# therefore the score and the persisted pipeline) can differ between runs.
dtc = DecisionTreeClassifier(random_state=75)
pipefinal = make_pipeline(col_trans, dtc)
pipefinal.fit(X_train, y_train)
pipefinal.score(X_test, y_test)  # accuracy on the held-out split
Save the fitted pipeline to disk and load it back
# Persist the fitted pipeline, then restore it from the same file.
# NOTE: joblib files are pickle-based, so only load files from a trusted source.
pipeline_path = "pipe.joblib"
joblib.dump(value=pipefinal, filename=pipeline_path)
pipefinal2 = joblib.load(filename=pipeline_path)