// example.
# Keep only the handful of Titanic columns this example works with.
subset_feature = ["embarked", "sex", "pclass", "age", "fare"]
# assumes X is a pandas DataFrame (column names are strings) — TODO confirm
X = X.loc[:, subset_feature]
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Then, we introspect the information regarding each column data type.
After Change
# NOTE(review): fragment — the opening line of this statement (presumably
# ``preprocessor = ColumnTransformer(transformers=[``) is missing from this
# excerpt; the two tuples below route non-category dtypes to the numeric
# transformer and category dtypes to the categorical transformer.
("num", numeric_transformer, selector(dtype_exclude="category")),
("cat", categorical_transformer, selector(dtype_include="category"))
])
# Chain the column-wise preprocessing with a logistic-regression
# classifier, train on the training split, and report held-out accuracy.
clf = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression()),
    ]
)
clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// The resulting score is not exactly the same as the one from the previous
// pipeline because the dtype-based selector treats the ``pclass`` column as
// a numeric feature instead of a categorical feature as previously:
# Columns whose dtype is not "category" — i.e. the ones the dtype-based
# selector sends through the numeric branch of the preprocessor.
selector(dtype_exclude="category")(X_train)
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
# Columns whose dtype is "category" — the ones routed to the categorical
# branch of the preprocessor.
selector(dtype_include="category")(X_train)
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Using the prediction pipeline in a grid search
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Grid search can also be performed on the different preprocessing steps
// defined in the ``ColumnTransformer`` object, together with the classifier's
// hyperparameters as part of the ``Pipeline``.
// We will search for both the imputer strategy of the numeric preprocessing
// and the regularization parameter of the logistic regression using
// :class:`sklearn.model_selection.GridSearchCV`.
# Hyper-parameters to explore jointly: the imputation strategy used inside
# the numeric preprocessing branch, and the inverse regularization strength
# ``C`` of the logistic regression.  (Key order is kept as-is since it
# determines the candidate order in ``cv_results_``.)
param_grid = {
    "preprocessor__num__imputer__strategy": ["mean", "median"],
    "classifier__C": [0.1, 1.0, 10, 100],
}
# Exhaustive 10-fold cross-validated search over the grid above.
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=10)
grid_search
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Calling "fit" triggers the cross-validated search for the best
// hyper-parameters combination:
//
# Fit one pipeline per parameter combination and CV fold, then refit the
# best combination on the whole training set.
grid_search.fit(X_train, y_train)
# Plain string literal: the original used an f-string with no placeholders
# (flake8 F541) — output is byte-identical.
print("Best params:")
print(grid_search.best_params_)
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// The internal cross-validation scores obtained by those parameters is:
# Mean cross-validated score of the best parameter combination found above.
print(f"Internal CV score: {grid_search.best_score_:.3f}")
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// We can also introspect the top grid search results as a pandas dataframe:
import pandas as pd

# Collect the per-candidate search results and rank them so the best mean
# test score comes first.
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values(
    "mean_test_score", ascending=False
)
# Display the five best parameter combinations with their scores.
columns_of_interest = [
    "mean_test_score",
    "std_test_score",
    "param_preprocessor__num__imputer__strategy",
    "param_classifier__C",
]
cv_results[columns_of_interest].head(5)