# Machine learning with `scikit-learn`

Let's start to do some machine learning with `scikit-learn`, using the data we prepared and saved in `gro_training.pkl`

# 1. Read the data

In [None]:
import os
import pickle as pkl

# Folder that holds the data files (adapt it to your own setup)
path_data = './'

# Unpickle the object written by the data-preparation step
with open(os.path.join(path_data, 'gro_training.pkl'), mode='rb') as f:
    data = pkl.load(f)

# Pull the feature table and the target labels out of the loaded payload
features, labels = data['features'], data['labels']

In [None]:
# Names of the entries stored in the loaded `data` object
data.keys()

In [None]:
# Preview of the first 6 rows of the features
features.head(6)

In [None]:
# Names of the feature columns
features.columns

In [None]:
# Summary statistics; include='all' asks for every column, not only the numeric ones
features.describe(include='all')

In [None]:
# Data type of each column
features.dtypes

# 2. Normalize continuous features

Let's rescale them to the range [0, 1] with min-max scaling (we could also standardize them, i.e. center them and scale them to unit variance).

In [None]:
import numpy as np
import pandas as pd

# Columns handled as continuous variables.
# The date columns are part of the list: dates are now treated as continuous features.
cnt_featnames = [
    'Years_At_Residence', 'Net_Annual_Income',
    'Years_At_Business', 'Number_Of_Dependant',
    'BirthDate', 'Customer_Open_Date', 'Prod_Decision_Date',
    'Nb_Of_Products',
]

In [None]:
# Set the continuous columns apart, so that they can be normalized
continuous_features = features.loc[:, cnt_featnames]

# Every remaining column is handled as a categorical feature
categorical_features = features.drop(columns=cnt_featnames)

In [None]:
# Preview of the columns listed in cnt_featnames
continuous_features.head()

In [None]:
# Preview of the remaining columns
categorical_features.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Min-max scaling of the continuous features:
# fit() only learns the scaling statistics, the data is not modified yet
scaler = MinMaxScaler()
scaler.fit(continuous_features)

In [None]:
# Per-column minimum recorded by the scaler during fit
scaler.data_min_

In [None]:
# Per-column maximum recorded by the scaler during fit
scaler.data_max_

In [None]:
# Apply the scaling. NOTE: the name is rebound to the output of transform(),
# which is wrapped back into a DataFrame (with the column names) in a later cell
continuous_features = scaler.transform(continuous_features)

In [None]:
# Raw output of the transformation
continuous_features

In [None]:
# Wrap the scaled values back into a DataFrame with the original column names
# AND the original row index. Without `index=`, the new frame gets a fresh
# 0..n-1 RangeIndex; `pd.concat(..., axis=1)` aligns rows on index labels, so
# combining it with `categorical_features` would silently create extra rows
# filled with NaN whenever the original index is not exactly 0..n-1
# (e.g. after rows were dropped during data preparation).
continuous_features = pd.DataFrame(continuous_features, columns=cnt_featnames,
                                   index=categorical_features.index)
# Sanity check: every column should now span [0, 1]
continuous_features.agg(['min', 'max'])

In [None]:
# Bounds recorded by the scaler, one row per continuous feature
pd.DataFrame(
    data={'min': scaler.data_min_, 'max': scaler.data_max_},
    index=cnt_featnames,
)

In [None]:
# Full feature matrix, ready for learning: the categorical columns first,
# then the rescaled continuous ones, placed side by side
X = pd.concat([categorical_features, continuous_features], axis='columns')
# Target vector
y = labels

In [None]:
# (number of rows, number of columns) of the feature matrix
X.shape

In [None]:
# Sum of the labels (the number of positives if labels are 0/1 -- assumption, TODO confirm)
y.sum()

In [None]:
# Preview of the assembled feature matrix
X.head()

## 5.2 Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

logreg = LogisticRegression(penalty='l1', solver='liblinear')
logreg.fit(X, y)

In [None]:
features_names = X.columns.tolist()

In [None]:
logreg.coef_

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Stem plot of the fitted coefficients, one stem per feature.
# NOTE: `use_line_collection` is intentionally not passed to plt.stem: the
# argument was deprecated in matplotlib 3.6 and removed in 3.8 (passing it
# raises a TypeError there), and drawing the stems as a LineCollection has
# been the default since matplotlib 3.3.
coefs = logreg.coef_.ravel()

plt.figure(figsize=(16, 5))
plt.stem(coefs)
plt.title('Logistic regression coefficients', fontsize=18)
# One x tick per coefficient, labelled with the matching feature name
_ = plt.xticks(np.arange(coefs.shape[0]), features_names,
               rotation='vertical', fontsize=14)
_ = plt.yticks(fontsize=14)

In [None]:
# Fitted intercept of the model
logreg.intercept_

## 5.3 Path of solutions (for different levels of penalization)

In [None]:
# Path of solutions: refit the same L1-penalized model for a grid of values
# of C and record the coefficients obtained for each of them
logreg = LogisticRegression(penalty='l1', tol=1e-3,
                            class_weight='balanced',
                            solver='liblinear', max_iter=2000)

# 15 values of C, logarithmically spaced between 1e-3 and 1e3
c_path = np.logspace(-3, 3, 15)

coeffs = []
for c in c_path:
    # set_params is the estimator API to change a hyper-parameter in place
    logreg.set_params(C=c)
    logreg.fit(X, y)
    # Store a copy, so that the recorded row is independent of the estimator
    coeffs.append(logreg.coef_.ravel().copy())

# One row per value of C, one column per coefficient
coeffs = np.array(coeffs)

# One curve per coefficient, with C on a logarithmic x axis
plt.figure(figsize=(15, 6))
plt.semilogx(c_path, coeffs)
plt.xlabel('C', fontsize=16)
plt.xticks(fontsize=14)
plt.ylabel('Coefficients', fontsize=16)
plt.yticks(fontsize=14)
plt.title('Logistic regression path (L1 penalization)', fontsize=18)

plt.show()

## 5.4 Cross-validation of the penalization level

In [None]:
# L1-penalized logistic regression where C is picked by 5-fold
# cross-validation over a logarithmic grid of 10 candidate values
logreg_cv = LogisticRegressionCV(
    Cs=np.logspace(-3, 7, 10),
    cv=5,
    penalty='l1',
    solver='liblinear',
    class_weight='balanced',
    refit=True,
    n_jobs=-1,
)
logreg_cv.fit(X, y)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_curve

# Cross-validation scores stored under the key 1
# (presumably one row per fold and one column per value of C -- TODO confirm)
logreg_cv.scores_[1]

In [None]:
# Shape of the score matrix
logreg_cv.scores_[1].shape

In [None]:
# Scores averaged over axis 0 (presumably the folds -- TODO confirm)
logreg_cv.scores_[1].mean(axis=0)

In [None]:
# Thin lines: the transposed score matrix, i.e. one line per row of scores_[1]
_ = plt.plot(logreg_cv.scores_[1].T)

# Thick black line: the average over axis 0
plt.plot(logreg_cv.scores_[1].mean(axis=0), 'black', lw=4)

In [None]:
# Value of C retained by the cross-validation
logreg_cv.C_

## 5.6 Classification metrics and reports

Let's compare several classifiers in terms of AUC and other classification metrics.

In [None]:
from sklearn.model_selection import train_test_split

# Keep 30% of the rows aside for testing; the split is stratified on y
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y
)

# L2-penalized logistic regression, with C picked by 3-fold
# cross-validation on the training part only
logreg_cv = LogisticRegressionCV(
    Cs=np.logspace(-3, 7, 10),
    cv=3,
    penalty='l2',
    class_weight='balanced',
    n_jobs=-1,
)
logreg_cv.fit(X_train, y_train)

In [None]:
# Predicted labels for the held-out rows
logreg_cv.predict(X_test)

In [None]:
# Predicted probabilities for the held-out rows (one column per class)
logreg_cv.predict_proba(X_test)

In [None]:
from sklearn.metrics import roc_curve, auc, precision_recall_curve, f1_score

# ROC curve of the cross-validated model on the held-out rows, built from
# column 1 of the predicted probabilities
fpr, tpr, _ = roc_curve(y_test, logreg_cv.predict_proba(X_test)[:, 1])
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
# Dashed diagonal, drawn as a reference line
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, lw=2, label="LR (AUC=%.2f)" % roc_auc)
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('Receiver operating characteristic', fontsize=18)
plt.legend(fontsize=16, loc="lower right")

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
    GradientBoostingClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier

# Candidate classifiers, mostly with default parameters
clfs = [
    RandomForestClassifier(n_estimators=100),
    AdaBoostClassifier(),
    SVC(probability=True, gamma='auto'),
    GradientBoostingClassifier(),
    DecisionTreeClassifier(),
    ExtraTreeClassifier(),
    LogisticRegression(solver='lbfgs'),
    LogisticRegression(solver='lbfgs', class_weight='balanced', max_iter=2000),
]

# Results collected for each classifier, in the same order as `clfs`
aucs = []
fprs = []
tprs = []
precisions = []
recalls = []
f1_scores = []

for clf in clfs:
    print(clf.__class__.__name__)
    clf.fit(X_train, y_train)

    # Column 1 of the predicted probabilities on the held-out rows; computed
    # once and reused for both curves, since predict_proba can be costly
    y_score = clf.predict_proba(X_test)[:, 1]

    fpr, tpr, _ = roc_curve(y_test, y_score)
    fprs.append(fpr)
    tprs.append(tpr)
    aucs.append(auc(fpr, tpr))

    precision, recall, _ = precision_recall_curve(y_test, y_score)
    precisions.append(precision)
    recalls.append(recall)
    # The F1 score is computed from the predicted labels, not from the scores
    f1_scores.append(f1_score(y_test, clf.predict(X_test)))

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Legend labels: the class name, plus a marker for the class-weighted variant,
# so that the two LogisticRegression entries of `clfs` get distinct labels
names = [
    clf.__class__.__name__
    + (' (balanced)' if getattr(clf, 'class_weight', None) == 'balanced' else '')
    for clf in clfs
]

plt.figure(figsize=(9, 7))
# Dashed diagonal, drawn as a reference line
plt.plot([0, 1], [0, 1], 'k--')

# The loop variables are deliberately not named `auc`, `fpr` or `tpr`:
# rebinding `auc` would shadow the `auc` function imported from
# sklearn.metrics and break any later call to auc(fpr, tpr)
for clf_fpr, clf_tpr, clf_auc, name in zip(fprs, tprs, aucs, names):
    plt.plot(clf_fpr, clf_tpr, label=name + ' (AUC=%.2f)' % clf_auc, lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=16)
plt.ylabel('True Positive Rate', fontsize=16)
plt.title('Receiver operating characteristic', fontsize=18)
plt.legend(loc="lower right", fontsize=14)

In [None]:
# Precision/recall curve of every classifier, with its F1 score in the legend
plt.figure(figsize=(9, 7))

# The loop variables are deliberately not named `f1_score`, `precision` or
# `recall`: rebinding `f1_score` would shadow the function imported from
# sklearn.metrics and break any later call to f1_score(...)
for clf_precision, clf_recall, clf_f1, name in zip(precisions, recalls, f1_scores, names):
    plt.plot(clf_recall, clf_precision, label=name + ' (F1=%.2f)' % clf_f1, lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall', fontsize=16)
plt.ylabel('Precision', fontsize=16)
plt.title('Precision/recall curve', fontsize=18)
plt.legend(loc="upper right", fontsize=14)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve

# Report for the model that was fitted on the training split only (`logreg_cv`).
# `logreg` was last fitted on the full X, which contains the X_test rows, so
# its scores on X_test would be optimistically biased.
print(classification_report(y_test, logreg_cv.predict(X_test)))

# 6. Serialization of models

Once again, a very powerful tool is `pickle`: once your model is trained, you can simply save it using `pickle`.

In [None]:
import pickle as pkl

# Serialize the fitted model into a file on disk
with open('logreg.pkl', mode='wb') as f:
    pkl.dump(logreg, f)

In [None]:
# Now we can load it for later use. This is VERY convenient in production.
# And not that pickle is usually VERY efficient and fast
with open('logreg.pkl', 'rb') as f:
    logreg = pkl.load(f)

In [None]:
# Our logistic regression is back!
logreg

# A. Some extra things we can do

### Extra things from `scikit-learn`

- The sklearn pipeline: Standardization + Classification (or dimension reduction + classification)
- V-fold cross-validation
- Polynomials of features
- Binarization of continuous features

### Improvement of results for the `gro` datasets

- Let's not remove data
- Imputation of missing values
- Polynomials of features
- Binarization of continuous features

In [None]:
# Mean of the labels times 100: the percentage of labels equal to 1
# (assuming the labels are 0/1 -- TODO confirm)
100 * y.mean()

In the training dataset, only 7% of the labels are equal to 1, versus 93% equal to 0: this is a **class imbalance** problem.

In [None]:
# IPython help syntax: displays the documentation of LogisticRegression
?LogisticRegression