Scikit-learn Pipelines

  import joblib
  import numpy as np
  import pandas as pd
  from sklearn.compose import ColumnTransformer
  from sklearn.impute import SimpleImputer
  from sklearn.linear_model import LogisticRegression
  from sklearn.model_selection import train_test_split
  from sklearn.pipeline import Pipeline, make_pipeline
  from sklearn.preprocessing import OneHotEncoder, StandardScaler
  from sklearn.tree import DecisionTreeClassifier
  # Toy dataset: one numeric feature containing missing values (NaN) and a
  # binary Sold_out target.
  d1 = {
      'Social_media_followers': [
          1000000, np.nan, 2000000, 1310000, 1700000,
          np.nan, 4100000, 1600000, 2200000, 1000000,
      ],
      'Sold_out': [1, 0, 0, 1, 0, 0, 0, 1, 0, 1],
  }
  df1 = pd.DataFrame(data=d1)

  # Features stay 2-D (a one-column DataFrame), as scikit-learn estimators expect.
  X1 = df1[['Social_media_followers']]
  # The target is selected as a 1-D Series: classifiers expect y of shape
  # (n_samples,) and emit a DataConversionWarning for a column-vector DataFrame.
  y1 = df1['Sold_out']

  # Hold out 30% of the rows; the fixed seed keeps the split reproducible.
  X1_train, X1_test, y1_train, y1_test = train_test_split(
      X1, y1, test_size=0.3, random_state=19
  )

  # Chain mean-imputation and logistic regression so the imputation statistics
  # are learned from the training rows only and reused at prediction time.
  imputer = SimpleImputer(strategy='mean')
  lr = LogisticRegression()
  pipe1 = make_pipeline(imputer, lr)
  pipe1.fit(X1_train, y1_train)

  # Mean accuracy on the training and the held-out rows (the values are shown
  # when the snippet is run interactively).
  pipe1.score(X1_train, y1_train)
  pipe1.score(X1_test, y1_test)

  # make_pipeline names each step after its lowercased class name, which is why
  # the fitted steps are reachable as `simpleimputer` and `logisticregression`.
  pipe1.named_steps.simpleimputer.statistics_   # fill value learned per feature
  pipe1.named_steps.logisticregression.coef_    # fitted coefficient(s)

Numerical and categorical columns need separate preprocessing pipelines, which are then combined with a ColumnTransformer

  # Toy dataset: one categorical and one numeric feature (both containing
  # missing values) plus a binary Sold_out target.
  d2 = {
      'Genre': [
          'Rock', 'Metal', 'Bluegrass', 'Rock', np.nan,
          'Rock', 'Rock', np.nan, 'Bluegrass', 'Rock',
      ],
      'Social_media_followers': [
          1000000, np.nan, 2000000, 1310000, 1700000,
          np.nan, 4100000, 1600000, 2200000, 1000000,
      ],
      'Sold_out': [1, 0, 0, 1, 0, 0, 0, 1, 0, 1],
  }
  df = pd.DataFrame(data=d2)
  df.head(10)

  # The first two columns (Genre, Social_media_followers) are the features;
  # the third column (Sold_out) is the target, taken as a 1-D Series.
  X = df.iloc[:, 0:2]
  y = df.iloc[:, 2]
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=0.3, random_state=75
  )

  # Column groups routed to the two preprocessing pipelines below.
  num_cols = ['Social_media_followers']
  cat_cols = ['Genre']

  # Numeric columns: fill missing values with the mean, then standardise.
  num_pipeline = Pipeline(steps=[
      ('impute', SimpleImputer(strategy='mean')),
      ('scale', StandardScaler()),
  ])

  # Categorical columns: fill missing values with the most frequent category,
  # then one-hot encode. handle_unknown='ignore' encodes categories not seen
  # during fit as all zeros instead of raising at transform time.
  cat_pipeline = Pipeline(steps=[
      ('impute', SimpleImputer(strategy='most_frequent')),
      ('one-hot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
  ])

  # Apply each pipeline to its own columns and concatenate the results; any
  # column not listed is dropped.
  col_trans = ColumnTransformer(
      transformers=[
          ('num_pipeline', num_pipeline, num_cols),
          ('cat_pipeline', cat_pipeline, cat_cols),
      ],
      remainder='drop',
      n_jobs=-1,
  )

  # NOTE(review): no random_state is set on the tree, so ties between equally
  # good splits may be broken differently from run to run -- confirm whether
  # reproducible scores are required here.
  dtc = DecisionTreeClassifier()

  # Preprocessing and model in a single estimator: fit on the training rows,
  # then report mean accuracy on the held-out rows.
  pipefinal = make_pipeline(col_trans, dtc)
  pipefinal.fit(X_train, y_train)
  pipefinal.score(X_test, y_test)

Save and reload the pipeline

  # Persist the fitted pipeline (preprocessing + model) to disk as one object.
  joblib.dump(value=pipefinal, filename="pipe.joblib")

  # Restore it into a new name. joblib files are pickle-based, so only load
  # files that come from a trusted source.
  pipefinal2 = joblib.load(filename="pipe.joblib")

Leave a Reply

Your email address will not be published. Required fields are marked *