# Decision trees and boosting with `scikit-learn`

We use the Iris dataset again.

# 1. Reminder: the `iris` dataset

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import datasets
import numpy as np
import pandas as pd
import seaborn as sns

# Plain white background for every seaborn / matplotlib figure below
sns.set_style('white')

# Fetch the iris dataset that ships with scikit-learn
iris = datasets.load_iris()

# One DataFrame holding the measurements (one column per feature)
# followed by the integer class label in a `target` column
data = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)
data.head()

In [None]:
# Summary statistics of each column of the DataFrame (features and target)
data.describe()

In [None]:
# Distinct integer codes stored in the `target` column ...
print('Class labels:', np.unique(data['target']))
# ... and the species names provided with the dataset
print(iris.target_names)

In [None]:
# Pairwise scatter plots of all the features, coloured by class label
sns.pairplot(data, hue='target', vars=iris.feature_names)

# 2. Multiclass logistic regression (softmax regression) with a pipeline

In this simple example we use a pipeline which:
- First standardizes the data (centers each feature and scales it to unit variance)
- Then trains a multiclass logistic regression on the standardized data

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Build a pipeline that first standardizes the data (center and reduce)
# and then fits a multiclass logistic regression on the scaled features
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(solver='lbfgs'))
])

# We only use the third and fourth features (petal length and petal width)
# so that the decision regions can be displayed in 2D
X = data[iris.feature_names[2:4]].values
y = data['target'].values

# Hold out one third of the samples for testing.
# `stratify=y` keeps the class proportions identical in both subsets;
# `random_state` fixes the shuffle so that the split, and therefore every
# score and figure below, is the same on each "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=1/3, stratify=y, random_state=42)

In [None]:
# In a pipeline, a parameter of a step is set with the name `<step>__<parameter>`
pipeline.set_params(logreg__C=1e1)
# Read the value back from the step itself to check that it was updated
pipeline.named_steps['logreg'].C

In [None]:
# Fit the whole pipeline (scaler, then logistic regression) on the training split
pipeline.fit(X_train, y_train)

In [None]:
def plot_decision_regions(clf, X_train, y_train, X_test, y_test, plot_features=(0, 1),
                          feature_names=('petal length', 'petal width')):
    """Fit `clf` on the training data and plot its decision regions in 2D.

    The classifier is (re)fitted on ``(X_train, y_train)``, scored on
    ``(X_test, y_test)``, and evaluated on a regular grid covering the range
    of the two plotted features. All samples (train and test) are drawn on
    top of the regions, and the test accuracy is written in the lower-right
    corner of the figure.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit``, ``score`` and ``predict``. It is fitted
        in place by this function.
    X_train, X_test : array-like of shape (n_samples, n_features)
        Feature matrices. The grid passed to ``clf.predict`` has exactly two
        columns, so the classifier must be trained on two features.
    y_train, y_test : array-like of shape (n_samples,)
        Class labels.
    plot_features : tuple of two ints, default (0, 1)
        Column indices used for the horizontal and vertical axes.
    feature_names : tuple of two str, default ('petal length', 'petal width')
        Axis labels for the horizontal and vertical axes.
    """
    from matplotlib.colors import ListedColormap

    # Use the samples that were passed in rather than notebook-level globals,
    # so the function plots the data it was actually called with.
    X_all = np.vstack([X_train, X_test])
    y_all = np.concatenate([y_train, y_test])

    region_cmap = plt.cm.RdBu
    point_cmap = ListedColormap(['red', 'white', 'blue'])
    fig, ax = plt.subplots(figsize=(8, 5))

    feat1, feat2 = plot_features

    # Grid covering the data range with a margin of 1 on every side
    x1_min, x1_max = X_all[:, feat1].min() - 1, X_all[:, feat1].max() + 1
    x2_min, x2_max = X_all[:, feat2].min() - 1, X_all[:, feat2].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, 0.02),
                           np.arange(x2_min, x2_max, 0.02))

    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)

    # Predict a class for every grid node, then reshape back to the grid
    Z = clf.predict(np.column_stack([xx1.ravel(), xx2.ravel()]))
    Z = Z.reshape(xx1.shape)

    # Plot the decision surface
    ct = ax.contourf(xx1, xx2, Z, alpha=0.4, cmap=region_cmap)
    fig.colorbar(ct, ax=ax)
    ax.set_xlim(xx1.min(), xx1.max())
    ax.set_ylim(xx2.min(), xx2.max())

    # One scatter call per class so that each class gets its own legend entry
    for idx, cl in enumerate(np.unique(y_all)):
        in_class = y_all == cl
        ax.scatter(X_all[in_class, feat1],
                   X_all[in_class, feat2],
                   edgecolor='black',
                   s=100,
                   color=point_cmap(idx),
                   label=cl)

    # lstrip('0') turns e.g. '0.95' into '.95' (the prefix contains no zero)
    ax.text(xx1.max() - 0.5, xx2.min() + 0.5, ('Test accuracy=%.2f' % score).lstrip('0'),
            size=15, horizontalalignment='right')

    ax.set_xlabel(feature_names[0], fontsize=16)
    ax.set_ylabel(feature_names[1], fontsize=16)
    ax.legend(loc='upper left')
    fig.tight_layout()

In [None]:
# Decision regions of the logistic regression pipeline (it is refitted inside the function)
plot_decision_regions(pipeline, X_train, y_train, X_test, y_test)

# 3. Decision tree learning

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Display the documentation of the class
help(DecisionTreeClassifier)

In [None]:
# A deep tree (up to 30 levels) that requires no minimum impurity decrease to
# split. `random_state` makes the fitted tree identical from one run to the next.
tree = DecisionTreeClassifier(criterion='gini', max_depth=30, min_impurity_decrease=0,
                              random_state=42)
tree.fit(X_train, y_train)

plot_decision_regions(tree, X_train, y_train, X_test, y_test)

**QUESTION**

Try to change the depth of the tree and visualize the results

**QUESTION**

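Change the feature selection above (for instance, use `iris.feature_names[0:2]` instead of `iris.feature_names[2:4]`) and re-run the cells to see how the decision regions look with the other features.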

# 4. Ensemble methods (AdaBoost, gradient boosting, random forests)

In [None]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier

# Boost 100 decision stumps (trees of depth 1).
# The weak learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, whereas the first positional argument works in all versions.
adaboost = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                              n_estimators=100,
                              random_state=42)  # fixed seed for reproducible results
adaboost.fit(X_train, y_train)

plot_decision_regions(adaboost, X_train, y_train, X_test, y_test)

In [None]:
# Gradient boosting with the default loss (the logistic / multinomial deviance).
# `loss` is left at its default on purpose: the 'deviance' spelling was
# deprecated in scikit-learn 1.1 and removed in 1.3 (renamed 'log_loss'),
# while the default selects that same loss in every version.
gradient_boosting = GradientBoostingClassifier(random_state=42)  # fixed seed for reproducibility
gradient_boosting.fit(X_train, y_train)

plot_decision_regions(gradient_boosting, X_train, y_train, X_test, y_test)

In [None]:
# Display the documentation of the class (plain Python, same form as the
# `help(DecisionTreeClassifier)` call used for decision trees)
help(RandomForestClassifier)

In [None]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

# Random forest of 50 trees. `random_state` fixes the random choices made
# while growing the trees, so the figure is reproducible.
forest = RandomForestClassifier(n_estimators=50, random_state=42)
forest.fit(X_train, y_train)

plot_decision_regions(forest, X_train, y_train, X_test, y_test)

**QUESTION**

Try to understand the parameters of these ensemble methods: relate them to what was covered in the course, and experiment with them!