Example Gradient Boosted Classifier Using Python’s scikit-learn

Danny Morris

2019/06/25

Conda Environment

library(reticulate)

use_condaenv("r-reticulate", required = TRUE)

Python Packages

import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import pickle

Data

The data is the Breast Cancer Wisconsin (Original) dataset from the UCI Machine Learning Repository.

breast_cancer = pd.read_csv(
    "http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
    header=None)   # the raw .data file has no header row, so keep the first record as data

Supply column names.

breast_cancer.columns = ['ID', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7',
                         'X8', 'X9', 'malignant']

Drop the ID column and X6 (which contains missing values coded as "?"), then remove any remaining rows with missing values.

breast_cancer = breast_cancer.drop(columns = ["ID", "X6"])

breast_cancer = breast_cancer.dropna()
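
As a quick sanity check (not in the original post), the dimensions and remaining missing-value count can be inspected:

# Hypothetical sanity check: confirm the shape and that no missing values remain
print(breast_cancer.shape)
print(breast_cancer.isna().sum().sum())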

Features and Labels

X = breast_cancer.drop(columns = "malignant")
y = breast_cancer["malignant"]   # single brackets return a 1-D Series, which scikit-learn expects

Encode Labels

The labels are currently coded as 2 (benign) or 4 (malignant). The label encoder maps them to 0 and 1.

label_enc = preprocessing.LabelEncoder()

label_enc.fit(y)
## LabelEncoder()
y_enc = label_enc.transform(y) 
pd.value_counts(y_enc)
## 0    457
## 1    241
## dtype: int64
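
To double-check which original code maps to which encoded value, the encoder's learned classes can be inspected (a small illustrative snippet, not part of the original workflow):

print(label_enc.classes_)                    # [2 4]: 2 -> 0 (benign), 4 -> 1 (malignant)
print(label_enc.inverse_transform([0, 1]))   # recover the original codes from the encoded values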

Train/Test Splits

X_train, X_test, y_train, y_test = train_test_split(
  X, y_enc, test_size=0.33, random_state=42)
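
With roughly a 65/35 class balance, a stratified split keeps the class proportions similar in both partitions. A sketch for reference only (the results below use the unstratified split above; the `_s` variable names are illustrative):

# Optional: stratify on the labels so train and test keep similar class proportions
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
  X, y_enc, test_size=0.33, random_state=42, stratify=y_enc)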

Min/Max Scaler

scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
## C:\PROGRA~3\ANACON~1\envs\R-RETI~1\lib\site-packages\sklearn\preprocessing\data.py:334: DataConversionWarning: Data with input dtype int64 were all converted to float64 by MinMaxScaler.
##   return self.partial_fit(X, y)
X_test = scaler.transform(X_test)
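
Note that the scaler is fit on the training data only, and the test data is transformed with the same learned minima and maxima. Those learned parameters can be inspected if needed (illustrative only):

# Per-feature minimum and maximum learned from the training data
print(scaler.data_min_)
print(scaler.data_max_)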

Training and Evaluation

learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]

Fit a model for each learning rate and compare overall accuracy.

for learning_rate in learning_rates:

    gb = GradientBoostingClassifier(
      n_estimators=20, 
      learning_rate = learning_rate, 
      max_features=2, 
      max_depth = 2, 
      random_state = 0)
      
    gb.fit(X_train, y_train)
    
    print("Learning rate: ", learning_rate)
    print("Accuracy (training): {0:.3f}".format(gb.score(X_train, y_train)))
    print("Accuracy (validation): {0:.3f}".format(gb.score(X_test, y_test)))
    print()
## GradientBoostingClassifier(criterion='friedman_mse', init=None,
##               learning_rate=0.05, loss='deviance', max_depth=2,
##               max_features=2, max_leaf_nodes=None,
##               min_impurity_decrease=0.0, min_impurity_split=None,
##               min_samples_leaf=1, min_samples_split=2,
##               min_weight_fraction_leaf=0.0, n_estimators=20,
##               n_iter_no_change=None, presort='auto', random_state=0,
##               subsample=1.0, tol=0.0001, validation_fraction=0.1,
##               verbose=0, warm_start=False)
## Learning rate:  0.05
## Accuracy (training): 0.964
## Accuracy (validation): 0.965
## 
## GradientBoostingClassifier(criterion='friedman_mse', init=None,
##               learning_rate=0.1, loss='deviance', max_depth=2,
##               max_features=2, max_leaf_nodes=None,
##               min_impurity_decrease=0.0, min_impurity_split=None,
##               min_samples_leaf=1, min_samples_split=2,
##               min_weight_fraction_leaf=0.0, n_estimators=20,
##               n_iter_no_change=None, presort='auto', random_state=0,
##               subsample=1.0, tol=0.0001, validation_fraction=0.1,
##               verbose=0, warm_start=False)
## Learning rate:  0.1
## Accuracy (training): 0.970
## Accuracy (validation): 0.965
## 
## GradientBoostingClassifier(criterion='friedman_mse', init=None,
##               learning_rate=0.25, loss='deviance', max_depth=2,
##               max_features=2, max_leaf_nodes=None,
##               min_impurity_decrease=0.0, min_impurity_split=None,
##               min_samples_leaf=1, min_samples_split=2,
##               min_weight_fraction_leaf=0.0, n_estimators=20,
##               n_iter_no_change=None, presort='auto', random_state=0,
##               subsample=1.0, tol=0.0001, validation_fraction=0.1,
##               verbose=0, warm_start=False)
## Learning rate:  0.25
## Accuracy (training): 0.970
## Accuracy (validation): 0.965
## 
## GradientBoostingClassifier(criterion='friedman_mse', init=None,
##               learning_rate=0.5, loss='deviance', max_depth=2,
##               max_features=2, max_leaf_nodes=None,
##               min_impurity_decrease=0.0, min_impurity_split=None,
##               min_samples_leaf=1, min_samples_split=2,
##               min_weight_fraction_leaf=0.0, n_estimators=20,
##               n_iter_no_change=None, presort='auto', random_state=0,
##               subsample=1.0, tol=0.0001, validation_fraction=0.1,
##               verbose=0, warm_start=False)
## Learning rate:  0.5
## Accuracy (training): 0.985
## Accuracy (validation): 0.961
## 
## GradientBoostingClassifier(criterion='friedman_mse', init=None,
##               learning_rate=0.75, loss='deviance', max_depth=2,
##               max_features=2, max_leaf_nodes=None,
##               min_impurity_decrease=0.0, min_impurity_split=None,
##               min_samples_leaf=1, min_samples_split=2,
##               min_weight_fraction_leaf=0.0, n_estimators=20,
##               n_iter_no_change=None, presort='auto', random_state=0,
##               subsample=1.0, tol=0.0001, validation_fraction=0.1,
##               verbose=0, warm_start=False)
## Learning rate:  0.75
## Accuracy (training): 0.989
## Accuracy (validation): 0.952
## 
## GradientBoostingClassifier(criterion='friedman_mse', init=None,
##               learning_rate=1, loss='deviance', max_depth=2,
##               max_features=2, max_leaf_nodes=None,
##               min_impurity_decrease=0.0, min_impurity_split=None,
##               min_samples_leaf=1, min_samples_split=2,
##               min_weight_fraction_leaf=0.0, n_estimators=20,
##               n_iter_no_change=None, presort='auto', random_state=0,
##               subsample=1.0, tol=0.0001, validation_fraction=0.1,
##               verbose=0, warm_start=False)
## Learning rate:  1
## Accuracy (training): 0.983
## Accuracy (validation): 0.944
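
The manual loop above compares learning rates against a single validation split. The same comparison can also be done with cross-validation; a minimal sketch using GridSearchCV (the parameter grid and variable names are illustrative, not from the original post):

from sklearn.model_selection import GridSearchCV

# Search over the same learning rates using 5-fold cross-validation on the training data
param_grid = {"learning_rate": learning_rates}

grid = GridSearchCV(
  GradientBoostingClassifier(n_estimators=20, max_features=2, max_depth=2, random_state=0),
  param_grid, cv=5, scoring="accuracy")

grid.fit(X_train, y_train)

print(grid.best_params_)
print("Best CV accuracy: {0:.3f}".format(grid.best_score_))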

Select a learning rate, retrain the model, and collect additional evaluation statistics on the test set.

gb = GradientBoostingClassifier(
  n_estimators=20, 
  learning_rate = 0.5, 
  max_features=2, 
  max_depth = 2, 
  random_state = 0)

gb.fit(X_train, y_train)
## GradientBoostingClassifier(criterion='friedman_mse', init=None,
##               learning_rate=0.5, loss='deviance', max_depth=2,
##               max_features=2, max_leaf_nodes=None,
##               min_impurity_decrease=0.0, min_impurity_split=None,
##               min_samples_leaf=1, min_samples_split=2,
##               min_weight_fraction_leaf=0.0, n_estimators=20,
##               n_iter_no_change=None, presort='auto', random_state=0,
##               subsample=1.0, tol=0.0001, validation_fraction=0.1,
##               verbose=0, warm_start=False)
predictions = gb.predict(X_test)

print("Confusion Matrix:")
## Confusion Matrix:
print(confusion_matrix(y_test, predictions))
## [[143   6]
##  [  3  79]]
print()
print("Classification Report")
## Classification Report
print(classification_report(y_test, predictions))
##               precision    recall  f1-score   support
## 
##            0       0.98      0.96      0.97       149
##            1       0.93      0.96      0.95        82
## 
##    micro avg       0.96      0.96      0.96       231
##    macro avg       0.95      0.96      0.96       231
## weighted avg       0.96      0.96      0.96       231

Compute the ROC curve and area under the curve (AUC) from the model's decision-function scores.

# ROC curve and Area-Under-Curve (AUC)

y_scores_gb = gb.decision_function(X_test)
fpr_gb, tpr_gb, _ = roc_curve(y_test, y_scores_gb)
roc_auc_gb = auc(fpr_gb, tpr_gb)

print("Area under ROC curve = {:0.2f}".format(roc_auc_gb))
## Area under ROC curve = 0.99
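
The ROC curve itself can be plotted from the same false/true positive rates, for example with matplotlib (not shown in the original post):

import matplotlib.pyplot as plt

plt.plot(fpr_gb, tpr_gb, label="GBM (AUC = {:0.2f})".format(roc_auc_gb))
plt.plot([0, 1], [0, 1], linestyle="--", color="grey")   # chance line
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend(loc="lower right")
plt.show()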

Save Model

saved_model = pickle.dumps(gb)   # serialize the fitted model to an in-memory bytes object

loaded_model = pickle.loads(saved_model)   # restore the model from the bytes object
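
pickle.dumps/loads only serialize to and from an in-memory bytes object. To persist the model, write it to disk and read it back (the file name here is just an example):

# Persist the fitted model to disk and load it back (file name is illustrative)
with open("gb_model.pkl", "wb") as f:
    pickle.dump(gb, f)

with open("gb_model.pkl", "rb") as f:
    gb_from_disk = pickle.load(f)

# The reloaded model should produce identical predictions
print((gb_from_disk.predict(X_test) == predictions).all())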