3) Train machine learning models to predict sepsis outcome (In progress)

import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt 
import matplotlib.patches as mpatches
import seaborn as sns 
import missingno as msno 
from utils import my_histogram, make_stacked_table, my_stacked_barplot

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_validate

from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis 
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn import tree
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
import lightgbm as lgb

import optuna
from optuna.integration import LightGBMPruningCallback
from optuna.integration import XGBoostPruningCallback
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report


import imblearn
from imblearn.over_sampling import SMOTE

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

1. Prepare the data

  • Load all tables with extracted features.
train_outcome = pd.read_csv("../data/train_outcome.csv")
test_outcome = pd.read_csv("../data/test_outcome.csv")

df_train_demo = pd.read_csv("../data/df_train_demo.csv")
df_test_demo = pd.read_csv("../data/df_test_demo.csv")

df_train_measurment_time_info = pd.read_csv("../data/df_train_measurment_time_info.csv")
df_test_measurment_time_info = pd.read_csv("../data/df_test_measurment_time_info.csv")

df_train_measurement_grouped_stat = pd.read_csv("../data/df_train_measurement_grouped_stat.csv")
df_test_measurement_grouped_stat = pd.read_csv("../data/df_test_measurment_grouped_stat.csv")
  • Merge all tables.
df_train_all = df_train_demo.merge(df_train_measurment_time_info, how = "left", on = "ID") \
                            .merge(df_train_measurement_grouped_stat, how = "left", on = "ID") \
                            .merge(train_outcome, how = "left", on = "ID")    
df_test_all = df_test_demo.merge(df_test_measurment_time_info, how = "left", on = "ID") \
                          .merge(df_test_measurement_grouped_stat, how = "left", on = "ID") \
                          .merge(test_outcome, how = "left", on = "ID")    
X_train = df_train_all.drop(["Outcome", "ID"], axis = 1)
y_train = df_train_all.Outcome

X_test = df_test_all.drop(["Outcome", "ID"], axis = 1)
y_test = df_test_all.Outcome
  • Standardize the train and test data (fit the scaler on the training set only, then apply it to both).
col_names = X_train.columns
scaler = StandardScaler()
scaler.fit(X_train)
StandardScaler()
X_train_std = pd.DataFrame(scaler.transform(X_train), columns = col_names)
X_test_std = pd.DataFrame(scaler.transform(X_test), columns = col_names)

2. Baseline error

y_test.value_counts()
0    5623
1     867
Name: Outcome, dtype: int64
sns.countplot(x = y_test)
[Figure: count plot of Outcome in the test set]

  • Note that only about 13% of the test patients have Outcome = 1.
  • That is, if we simply predicted Outcome = 0 for every patient, the error rate would be about 13%; this is our baseline error (see the quick check below).
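The baseline can be read straight off the class proportions; a quick check using the y_test loaded above:
# Baseline: always predict the majority class (Outcome = 0);
# the error rate is then simply the share of Outcome = 1 patients.
baseline_error = y_test.mean()
print(f"Baseline error: {baseline_error:.3f}")   # 867 / 6490 ≈ 0.134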

  • Let’s compare each model’s error with this baseline to choose which models are worth tuning.
names = ["KNN", 
         "LDA", 
         "QDA", 
         "Naive Bayes",
         "Logistic regression",
         "Decesion tree", 
         "Random Forest",  
         "AdaBoost",
         "XGBoost",
         "Polynomial kernel SVM",
         "Radial kernel SVM",
         "GBM",
         "LightGBM"
        ]

classifiers = [
    KNeighborsClassifier(3),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    GaussianNB(), 
    LogisticRegression(),
    DecisionTreeClassifier(random_state = 42),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    xgb.XGBClassifier(),
    SVC(kernel = "poly", probability = True),
    SVC(kernel = "rbf", probability = True),
    GradientBoostingClassifier(),
    lgb.LGBMClassifier()
    ]
result_accuracy = pd.DataFrame(names, columns = ["model_name"])
y_pred_dict = {}

for name, clf in zip(names, classifiers):
    clf.fit(X_train_std, y_train)
    
    y_pred = clf.predict(X_test_std)
    y_pred_proba = clf.predict_proba(X_test_std)
    y_pred_dict[name] = y_pred
    
    error = np.mean(y_pred != y_test)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba[:, 1])
    
    result_accuracy.loc[result_accuracy.model_name == name, "Error"] = error
    result_accuracy.loc[result_accuracy.model_name == name, "BER"] = 1 - balanced_accuracy
    result_accuracy.loc[result_accuracy.model_name == name, "AUC"] = auc
    
result_accuracy
model_name Error BER AUC
0 KNN 0.092296 0.301056 0.764946
1 LDA 0.110169 0.362100 0.813528
2 QDA 0.155470 0.301905 0.799748
3 Naive Bayes 0.117565 0.295640 0.841536
4 Logistic regression 0.106317 0.354024 0.815425
5 Decision tree 0.115408 0.236349 0.763651
6 Random Forest 0.066256 0.216764 0.911303
7 AdaBoost 0.072111 0.223069 0.888091
8 XGBoost 0.065485 0.203637 0.911156
9 Polynomial kernel SVM 0.089522 0.300431 0.825197
10 Radial kernel SVM 0.077042 0.258108 0.900935
11 GBM 0.065948 0.211708 0.908800
12 LightGBM 0.063328 0.200441 0.916949
  • Almost all models have a smaller error than the 13% baseline.
  • Among the models, LightGBM has the lowest error and balanced error rate (BER), at 6.3% and 20% respectively.
  • Random Forest and XGBoost also have small BER and high AUC values.
  • So let’s tune the hyperparameters of LightGBM, XGBoost, and RandomForest.

  • Before tuning the hyperparameters, let’s check the confusion matrices.
lgbm_cm = confusion_matrix(y_test, y_pred_dict["LightGBM"])
lgbm_cm_display = ConfusionMatrixDisplay(confusion_matrix = lgbm_cm, display_labels = clf.classes_)

xgboost_cm = confusion_matrix(y_test, y_pred_dict["XGBoost"])
xgboost_cm_display = ConfusionMatrixDisplay(confusion_matrix = xgboost_cm, display_labels = clf.classes_)

rf_cm = confusion_matrix(y_test, y_pred_dict["Random Forest"])
rf_cm_display = ConfusionMatrixDisplay(confusion_matrix = rf_cm, display_labels = clf.classes_)

fig, axes = plt.subplots(1, 3, figsize = (20, 5))

lgbm_cm_display.plot(ax = axes[0])
axes[0].set_title("LightGBM", fontsize = 15)

xgboost_cm_display.plot(ax = axes[1])
axes[1].set_title("XGBoost", fontsize = 15)

rf_cm_display.plot(ax = axes[2])
axes[2].set_title("Random Forest", fontsize = 15)


[Figure: confusion matrices for LightGBM, XGBoost, and Random Forest]

  • All three models perform poorly at predicting Outcome = 1; the short check below quantifies this with the class-1 recall.
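The class-1 recall (the fraction of Outcome = 1 patients that are correctly identified) can be read from the confusion matrices computed above; a minimal check for LightGBM:
# Recall for Outcome = 1: true positives divided by all actual positives
lgbm_recall_1 = lgbm_cm[1, 1] / lgbm_cm[1].sum()
print(f"LightGBM recall for Outcome = 1: {lgbm_recall_1:.2f}")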

3. Hyperparameter tuning

  • Now, let’s tune the hyperparameters.
  • There are two evaluation metrics: BER and AUC (BER is spelled out in the sketch after this list).
  • First, tune the hyperparameters of LightGBM, XGBoost, and RandomForest to maximize the AUC.
  • Among the trained LightGBM, XGBoost, and RandomForest models with different hyperparameters, select the top 15 models.
  • Average the predicted scores of these 15 best models.
  • Use this mean score to calculate the AUC.
  • Then, find the best cutoff value for the mean score to achieve the minimum BER.
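For reference, the BER used throughout is one minus the balanced accuracy, i.e. the average of the per-class error rates; a small sketch (the helper function is only for illustration):
from sklearn.metrics import confusion_matrix, balanced_accuracy_score

def ber(y_true, y_pred):
    # BER = mean of the two per-class error rates
    #     = 0.5 * (FN / (FN + TP) + FP / (FP + TN))
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return 0.5 * (fn / (fn + tp) + fp / (fp + tn))

# Equivalent to: 1 - balanced_accuracy_score(y_true, y_pred)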

3.1. LightGBM

  • Candidate hyperparameters are as follows:
    • learning_rate: 0.01 ~ 0.3
    • num_leaves: 20 ~ 3000 with step = 20
    • max_depth: 3 ~ 12
    • min_data_in_leaf: 200 ~ 10000 with step = 100
    • max_bin: 200 ~ 300
    • lambda_l1: 0 ~ 100 with step = 5
    • lambda_l2: 0 ~ 100 with step = 5
    • min_gain_to_split: 0 ~ 15
    • bagging_fraction: 0.2 ~ 0.95 with step = 0.1
    • feature_fraction: 0.2 ~ 0.95 with step = 0.1
  • As above, there are two evaluation metrics: BER and AUC.
  • First, tune the hyperparameters to maximize the AUC.
  • Then, find the best cutoff value for the score to achieve the minimum BER.
def lgbm_objective(trial, X, y):
    param_grid = {
        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step = 20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step = 100),
        "max_bin": trial.suggest_int("max_bin", 200, 300),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step = 5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step = 5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 0.95, step = 0.1
        ),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 0.95, step = 0.1
        ),
        "silent": 1,
        "verbose": -1
    }
    
    cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
    cv_scores = np.empty(5)
    
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = lgb.LGBMClassifier(objective = "binary", **param_grid, n_jobs = -1)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric = ["auc"],
            early_stopping_rounds = 100,
            callbacks=[
                LightGBMPruningCallback(trial, "auc")
            ],  # Add a pruning callback
            verbose = -1
        )
        preds_proba = model.predict_proba(X_test)
        auc = roc_auc_score(y_test, preds_proba[:, 1])
        cv_scores[idx] = auc

    return np.mean(cv_scores)
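One detail worth noting: LightGBMPruningCallback prunes trials according to whatever pruner the study was created with, and optuna.create_study defaults to a MedianPruner, so pruning is already active below. An explicit configuration could look like this (illustrative only; the run below keeps the default):
pruned_study = optuna.create_study(
    direction = "maximize",
    study_name = "LGBM Classifier (pruned)",
    pruner = optuna.pruners.MedianPruner(n_warmup_steps = 10),  # illustrative settings, not used in this run
)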
lgbm_study = optuna.create_study(direction = "maximize", study_name = "LGBM Classifier")
func = lambda trial: lgbm_objective(trial, X_train_std, y_train)
lgbm_study.optimize(func, n_trials = 100)
lgbm_trials_history = lgbm_study.get_trials()

trial_list = []
hyperparameters_list = []
auc_list = []

for i in range(len(lgbm_trials_history)):
    trial_list.append(lgbm_trials_history[i].number) 
    hyperparameters_list.append(lgbm_trials_history[i].params)
    auc_list.append(lgbm_trials_history[i].values[0])
    
lgbm_auc_history = pd.DataFrame({"trial": trial_list, "auc": auc_list, "params": hyperparameters_list})
lgbm_auc_history.sort_values("auc", ascending = False)
trial auc params
85 85 0.943124 {'n_estimators': 10000, 'learning_rate': 0.145...
55 55 0.940711 {'n_estimators': 10000, 'learning_rate': 0.120...
67 67 0.940047 {'n_estimators': 10000, 'learning_rate': 0.132...
80 80 0.939630 {'n_estimators': 10000, 'learning_rate': 0.172...
39 39 0.939020 {'n_estimators': 10000, 'learning_rate': 0.136...
... ... ... ...
10 10 0.500000 {'n_estimators': 10000, 'learning_rate': 0.197...
8 8 0.500000 {'n_estimators': 10000, 'learning_rate': 0.259...
4 4 0.500000 {'n_estimators': 10000, 'learning_rate': 0.253...
2 2 0.500000 {'n_estimators': 10000, 'learning_rate': 0.054...
0 0 0.500000 {'n_estimators': 10000, 'learning_rate': 0.265...

100 rows × 3 columns
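As a side note, Optuna also exposes the best trial of a single study directly, which is a quicker way to inspect it than rebuilding the history by hand:
print("Best CV AUC :", lgbm_study.best_value)
print("Best params :", lgbm_study.best_params)
# lgbm_study.trials_dataframe() returns the full trial history as a DataFrame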

3.2. XGBoost

  • Candidate hyperparameters are as follows:
    • n_estimators: 100, 300, 500, 1000
    • learning_rate: 1e-8 ~ 1
    • max_depth: 3 ~ 20
    • lambda: 1e-8 ~ 1
    • alpha: 1e-8 ~ 1
    • gamma: 1e-8 ~ 1
    • subsample: 0.01 ~ 0.3 with step = 0.02
    • colsample_bytree: 0.4 ~ 1.0 with step = 0.1
    • colsample_bylevel: 0.4 ~ 1.0 with step = 0.1
def xgboost_objective(trial, X, y):
    param_grid = {
        "n_estimators": trial.suggest_categorical("n_estimators", [100, 300, 500, 1000]),
        # Each hyperparameter needs its own Optuna name; reusing one name ("alpha")
        # for eta, alpha, and gamma would make them share a single sampled value.
        "eta": trial.suggest_loguniform("eta", 1e-8, 1.0),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
        "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
        "gamma": trial.suggest_loguniform("gamma", 1e-8, 1.0),
        "subsample": trial.suggest_float("subsample", 0.01, 0.3, step = 0.02),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0, step = 0.1),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.4, 1.0, step = 0.1),
        "verbosity": 0
    }

    cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
    cv_scores = np.empty(5)

    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = xgb.XGBClassifier(objective = "binary:logistic", **param_grid, n_jobs = -1)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric = ["auc"],
            early_stopping_rounds = 100,
            callbacks=[
                XGBoostPruningCallback(trial, "validation_0-auc")
            ],  # Add a pruning callback
        )
        
        preds_proba = model.predict_proba(X_test)
        auc = roc_auc_score(y_test, preds_proba[:, 1])
        cv_scores[idx] = auc

    return np.mean(cv_scores)
xgboost_study = optuna.create_study(direction = "maximize", study_name = "XGBoost Classifier")
func = lambda trial: xgboost_objective(trial, X_train_std, y_train)
xgboost_study.optimize(func, n_trials = 100)
xgboost_trials_history = xgboost_study.get_trials()

trial_list = []
hyperparameters_list = []
auc_list = []

for i in range(len(xgboost_trials_history)):
    trial_list.append(xgboost_trials_history[i].number) 
    hyperparameters_list.append(xgboost_trials_history[i].params)
    auc_list.append(xgboost_trials_history[i].values[0])
    
xgboost_auc_history = pd.DataFrame({"trial": trial_list, "auc": auc_list, "params": hyperparameters_list})
xgboost_auc_history.sort_values("auc", ascending = False)
trial auc params
25 25 0.925149 {'n_estimators': 500, 'alpha': 9.4863506092290...
41 41 0.924905 {'n_estimators': 500, 'alpha': 3.4232784401779...
34 34 0.919624 {'n_estimators': 500, 'alpha': 2.2767849840122...
29 29 0.917429 {'n_estimators': 300, 'alpha': 2.0219184532052...
88 88 0.916726 {'n_estimators': 300, 'alpha': 2.7510681730152...
... ... ... ...
80 80 0.500000 {'n_estimators': 500, 'alpha': 2.4663181451532...
21 21 0.500000 {'n_estimators': 500, 'alpha': 1.0123144391688...
9 9 0.500000 {'n_estimators': 100, 'alpha': 4.2479074365499...
57 57 0.500000 {'n_estimators': 100, 'alpha': 3.2214812056006...
22 22 0.500000 {'n_estimators': 500, 'alpha': 1.0470925931211...

100 rows × 3 columns

3.3. RandomForest

  • Candidate hyperparameters are as follows:
    • n_estimators: 100, 300, 500, 1000
    • max_depth: 3 ~ 20 with step = 2
    • max_features: sqrt, log2
    • min_samples_leaf: 1 ~ 10
    • min_samples_split: 2 ~ 10
def rf_objective(trial, X, y):
    param_grid = {
        "n_estimators": trial.suggest_categorical("n_estimators", [100, 300, 500, 1000]),
        "max_depth": trial.suggest_int("max_depth", 3, 20, step = 2),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 10),
    }
    
    cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
    
    model = RandomForestClassifier(**param_grid, n_jobs = -1)
    cv_results = cross_validate(model, X, y, cv = cv, scoring = "roc_auc", n_jobs = -1)


    return np.mean(cv_results['test_score'])
rf_study = optuna.create_study(direction = "maximize", study_name = "RandomForest Classifier")
func = lambda trial: rf_objective(trial, X_train_std, y_train)
rf_study.optimize(func, n_trials = 20)
rf_trials_history = rf_study.get_trials()

trial_list = []
hyperparameters_list = []
auc_list = []

for i in range(len(rf_trials_history)):
    trial_list.append(rf_trials_history[i].number) 
    hyperparameters_list.append(rf_trials_history[i].params)
    auc_list.append(rf_trials_history[i].values[0])
    
rf_auc_history = pd.DataFrame({"trial": trial_list, "auc": auc_list, "params": hyperparameters_list})
rf_auc_history.sort_values("auc", ascending = False)
trial auc params
13 13 0.921056 {'n_estimators': 500, 'max_depth': 15, 'max_fe...
15 15 0.920583 {'n_estimators': 300, 'max_depth': 15, 'max_fe...
19 19 0.920497 {'n_estimators': 300, 'max_depth': 15, 'max_fe...
14 14 0.920441 {'n_estimators': 500, 'max_depth': 15, 'max_fe...
2 2 0.920377 {'n_estimators': 500, 'max_depth': 17, 'max_fe...
18 18 0.920144 {'n_estimators': 300, 'max_depth': 15, 'max_fe...
11 11 0.920087 {'n_estimators': 500, 'max_depth': 13, 'max_fe...
12 12 0.919690 {'n_estimators': 500, 'max_depth': 13, 'max_fe...
10 10 0.919352 {'n_estimators': 500, 'max_depth': 13, 'max_fe...
9 9 0.918802 {'n_estimators': 100, 'max_depth': 19, 'max_fe...
5 5 0.917771 {'n_estimators': 100, 'max_depth': 19, 'max_fe...
16 16 0.917703 {'n_estimators': 300, 'max_depth': 11, 'max_fe...
17 17 0.917431 {'n_estimators': 300, 'max_depth': 11, 'max_fe...
3 3 0.916428 {'n_estimators': 100, 'max_depth': 19, 'max_fe...
7 7 0.905486 {'n_estimators': 100, 'max_depth': 7, 'max_fea...
4 4 0.905325 {'n_estimators': 1000, 'max_depth': 7, 'max_fe...
6 6 0.895565 {'n_estimators': 300, 'max_depth': 5, 'max_fea...
1 1 0.893271 {'n_estimators': 1000, 'max_depth': 5, 'max_fe...
8 8 0.892012 {'n_estimators': 1000, 'max_depth': 5, 'max_fe...
0 0 0.871615 {'n_estimators': 500, 'max_depth': 3, 'max_fea...

3.4. Choose the best 15 models

  • Among all trained models, let’s choose the best 15 models based on the AUC value.
lgbm_auc_history["model"] = "lgbm"
xgboost_auc_history["model"] = "xgboost"
rf_auc_history["model"] = "rf"

all_auc_history = pd.concat([lgbm_auc_history, xgboost_auc_history, rf_auc_history])
best_15_models = all_auc_history.sort_values("auc", ascending = False).head(15)
best_15_models
trial auc params model
85 85 0.943124 {'n_estimators': 10000, 'learning_rate': 0.145... lgbm
55 55 0.940711 {'n_estimators': 10000, 'learning_rate': 0.120... lgbm
67 67 0.940047 {'n_estimators': 10000, 'learning_rate': 0.132... lgbm
80 80 0.939630 {'n_estimators': 10000, 'learning_rate': 0.172... lgbm
39 39 0.939020 {'n_estimators': 10000, 'learning_rate': 0.136... lgbm
60 60 0.938714 {'n_estimators': 10000, 'learning_rate': 0.127... lgbm
87 87 0.930233 {'n_estimators': 10000, 'learning_rate': 0.110... lgbm
31 31 0.929903 {'n_estimators': 10000, 'learning_rate': 0.120... lgbm
30 30 0.928990 {'n_estimators': 10000, 'learning_rate': 0.129... lgbm
61 61 0.928796 {'n_estimators': 10000, 'learning_rate': 0.114... lgbm
95 95 0.928361 {'n_estimators': 10000, 'learning_rate': 0.127... lgbm
38 38 0.927850 {'n_estimators': 10000, 'learning_rate': 0.124... lgbm
28 28 0.927820 {'n_estimators': 10000, 'learning_rate': 0.139... lgbm
96 96 0.927811 {'n_estimators': 10000, 'learning_rate': 0.127... lgbm
33 33 0.927721 {'n_estimators': 10000, 'learning_rate': 0.107... lgbm

4. Select cut-off score

  • Let’s find the best cut-off score for classifying the binary outcome.
  • Split the train set into a train and a validation part, and train the models on the train part with the parameters found in the hyperparameter tuning step. Then define a set of candidate thresholds, evaluate the predicted probabilities on the validation part under each one, and select the threshold that gives the minimum BER.

  • First, split the train set into a train and a validation set.
  • Since our data is imbalanced, use stratified sampling with a validation fraction of 0.2.
X_y_train = pd.concat([X_train_std, pd.DataFrame({"Outcome": y_train})], axis = 1)
X_y_train_train = X_y_train.groupby("Outcome", group_keys = False).apply(lambda x: x.sample(frac = 0.8))
X_y_train_train.Outcome.value_counts()
0    10463
1     1652
Name: Outcome, dtype: int64
X_y_train_valid = X_y_train[~X_y_train.index.isin(X_y_train_train.index)]
X_y_train_valid.Outcome.value_counts()
0    2616
1     413
Name: Outcome, dtype: int64
X_train_train = X_y_train_train.drop("Outcome", axis = 1)
y_train_train = X_y_train_train.Outcome

X_train_valid = X_y_train_valid.drop("Outcome", axis = 1)
y_train_valid = X_y_train_valid.Outcome
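As a side note, the same stratified 80/20 split could also be obtained with train_test_split (already imported above); a minimal sketch, with a fixed random_state assumed for reproducibility:
X_train_train, X_train_valid, y_train_train, y_train_valid = train_test_split(
    X_train_std, y_train,
    test_size = 0.2,
    stratify = y_train,     # keep the Outcome ratio in both parts
    random_state = 42,      # assumed seed, not used in the run above
)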
  • We split the train set into a train and a validation set:
    • train set: 12,115
      • Outcome = 0: 10,463
      • Outcome = 1: 1,652
    • validation set: 3,029
      • Outcome = 0: 2,616
      • Outcome = 1: 413
  • Now calculate the scores from the 15 best models on the validation set and average them; this mean score will be used to calculate the AUC and BER.
best_15_models_proba = []

for i in range(best_15_models.shape[0]):
    params = best_15_models.iloc[i].params 
    model_name = best_15_models.iloc[i].model 
    
    if model_name == "lgbm": model = lgb.LGBMClassifier(**params)
    elif model_name == "xgboost": model = xgb.XGBClassifier(**params)
    elif model_name == "rf": model = RandomForestClassifier(**params)
    
    model.fit(X_train_train, y_train_train)
    y_pred_proba = model.predict_proba(X_train_valid)
    best_15_models_proba.append(y_pred_proba)
    
best_15_avg_prob = best_15_models_proba[0]

for i in range(1, 15):
    best_15_avg_prob = best_15_avg_prob + best_15_models_proba[i]

best_15_avg_prob = best_15_avg_prob/15
best_15_avg_prob
array([[0.98045849, 0.01954151],
       [0.97563099, 0.02436901],
       [0.95791448, 0.04208552],
       ...,
       [0.04403311, 0.95596689],
       [0.769854  , 0.230146  ],
       [0.98536498, 0.01463502]])
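As a side note, the running sum above can be replaced by a single call that stacks the 15 probability arrays and averages them:
# Equivalent to the loop above: mean over the 15 (n_samples, 2) probability arrays
best_15_avg_prob = np.mean(np.stack(best_15_models_proba), axis = 0)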
thresholds = np.arange(0.0, 1.0, 0.0001)
ber_list = np.zeros(shape = (len(thresholds)))

for index, elem in enumerate(thresholds):
    # Predicted class under the current threshold
    y_pred = (best_15_avg_prob[:, 1] > elem).astype("int")
    
    # Calculate the BER
    ber_list[index] = 1 - balanced_accuracy_score(y_train_valid, y_pred)
# Find the optimal threshold
index = np.argmin(ber_list)
threshold_Opt = round(thresholds[index], ndigits = 4)
ber_Opt = round(ber_list[index], ndigits = 4)
print('Best Threshold: {} with BER: {}'.format(threshold_Opt, ber_Opt))
Best Threshold: 0.1867 with BER: 0.126
plt.grid()
sns.lineplot(x = thresholds, y = ber_list)
plt.xlabel("Threshold", fontsize = 12)
plt.ylabel("Balanced Error Rate", fontsize = 12)
plt.title("Threshold Tuning Curve", fontsize = 15)

plt.plot(threshold_Opt, ber_Opt, "ro")
plt.text(x = threshold_Opt - 0.08, y = ber_Opt + 0.05, s = f'Optimal threshold \n for Outcome = 1: {threshold_Opt}')
[Figure: threshold tuning curve with the optimal threshold 0.1867 marked]

  • When we use 0.1867 as the cut-off score for predicting Outcome = 1, we achieve the best BER of 0.126 on the validation set.

5. Calculate the BER and AUC

  • Now let’s calculate the BER and AUC on the test data.
best_15_models_proba_test = []

for i in range(best_15_models.shape[0]):
    params = best_15_models.iloc[i].params 
    model_name = best_15_models.iloc[i].model 
    
    if model_name == "lgbm": model = lgb.LGBMClassifier(**params)
    elif model_name == "xgboost": model = xgb.XGBClassifier(**params)
    elif model_name == "rf": model = RandomForestClassifier(**params)
    
    model.fit(X_train_std, y_train)
    y_pred_proba = model.predict_proba(X_test_std)
    best_15_models_proba_test.append(y_pred_proba)
    
best_15_avg_prob_test = best_15_models_proba_test[0]

for i in range(1, 15):
    best_15_avg_prob_test = best_15_avg_prob_test + best_15_models_proba_test[i]

best_15_avg_prob_test = best_15_avg_prob_test / 15
best_15_avg_prob_test
array([[0.98290209, 0.01709791],
       [0.89942117, 0.10057883],
       [0.97529847, 0.02470153],
       ...,
       [0.98090421, 0.01909579],
       [0.98463654, 0.01536346],
       [0.96578141, 0.03421859]])
y_pred = (best_15_avg_prob_test[:, 1] > threshold_Opt).astype("int")

test_ber = 1 - balanced_accuracy_score(y_test, y_pred)
test_auc = roc_auc_score(y_test, best_15_avg_prob_test[:, 1])
print("Best AUC before tuning the parameters: ", result_accuracy.AUC.max())
print("AUC after tuning the parameters: ", test_auc)
Best AUC before tuning the parameters:  0.9169488636328671
AUC after tuning the parameters:  0.9158752536593299
print("Best BER before tuning the parameters: ", result_accuracy.BER.min())
print("BER after tuning the parameters: ", test_ber)
Best BER before tuning the parameters:  0.200440664177713
BER after tuning the parameters:  0.16305466857266282
  • The AUC is almost the same after tuning, but the BER decreases substantially, from about 0.200 to 0.163.
cm = confusion_matrix(y_test, y_pred)
cm_display = ConfusionMatrixDisplay(confusion_matrix = cm, display_labels = clf.classes_)
cm_display.plot();    

[Figure: confusion matrix of the averaged model on the test set]

print(classification_report(y_test, y_pred))
              precision    recall  f1-score   support

           0       0.96      0.94      0.95      5623
           1       0.64      0.74      0.69       867

    accuracy                           0.91      6490
   macro avg       0.80      0.84      0.82      6490
weighted avg       0.92      0.91      0.91      6490