# k-nn learning with `scikit-learn`

We use the Iris dataset from `scikit-learn`. Here, the features are the sepal length and width, as well as the petal length and width of the flowers. The classes are already converted to integer labels where 0=Iris-Setosa, 1=Iris-Versicolor, 2=Iris-Virginica. This is a standard toy dataset used in statistics.

# 1. Loading and quick study of the `iris` dataset

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import neighbors, datasets
import numpy as np
import pandas as pd
import seaborn as sns

sns.set_style('white')


# Load the Iris dataset bundled with scikit-learn.
iris = datasets.load_iris()

# One row per flower: the measured features in named columns, followed by
# the integer class label in a 'target' column.
data = pd.DataFrame(iris.data, columns=iris.feature_names)
data['target'] = iris.target
data.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for every column of `data`.
data.describe()

In [None]:
# Distinct integer labels found in the 'target' column ...
print(f"Class labels: {np.unique(data['target'])}")
# ... and the class names shipped with the dataset.
print(iris.target_names)

In [None]:
# Scatter-plot matrix of the feature columns, coloured by class label.
sns.pairplot(
    data,
    vars=iris.feature_names,
    hue='target',
)

# 2. k-nn classification

In [None]:
def plot_decision_regions(clf, X_train, y_train, X_test, y_test, plot_features=(0, 1),
                          axis_labels=('petal length', 'petal width')):
    """Fit ``clf`` on two features and draw its decision regions.

    The classifier is fitted on the two columns of ``X_train`` selected by
    ``plot_features``, scored on the same columns of ``X_test``, and then
    evaluated on a dense grid spanning the range of the data (train and test
    together) to colour the plane by predicted class. All samples are drawn on
    top of the regions, one colour per class, and the test score is written in
    the lower-right corner.

    Only the arrays passed as arguments are used; the function does not rely
    on any notebook-level ``X`` / ``y`` variables.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)``, ``score(X, y)`` and ``predict(X)``.
        It is (re)fitted in place by this function.
    X_train, X_test : array-like, 2-D
        Feature matrices; must contain the columns named in ``plot_features``.
    y_train, y_test : array-like, 1-D
        Class labels matching the rows of ``X_train`` / ``X_test``.
    plot_features : sequence of two ints, default (0, 1)
        Column indices used for the horizontal and vertical axes (and for
        fitting the classifier).
    axis_labels : sequence of two str, default ('petal length', 'petal width')
        Labels for the horizontal and vertical axes.

    Returns
    -------
    None
        The figure is left as the current matplotlib figure so the caller can
        add a title and call ``plt.show()``.
    """
    from matplotlib.colors import ListedColormap

    region_cmap = plt.cm.RdBu
    point_cmap = ListedColormap(['red', 'white', 'blue'])
    plt.figure(figsize=(8, 5))
    ax = plt.subplot(1, 1, 1)

    feat1 = plot_features[0]
    feat2 = plot_features[1]
    cols = [feat1, feat2]

    # Restrict everything to the two plotted features so that the fitted
    # model, the grid and the scatter all live in the same 2-D space.
    X_train_2d = np.asarray(X_train)[:, cols]
    X_test_2d = np.asarray(X_test)[:, cols]
    X_all = np.vstack((X_train_2d, X_test_2d))
    y_all = np.concatenate((np.asarray(y_train), np.asarray(y_test)))

    # Grid covering the data range with a margin of 1 on every side.
    x1_min, x1_max = X_all[:, 0].min() - 1, X_all[:, 0].max() + 1
    x2_min, x2_max = X_all[:, 1].min() - 1, X_all[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, 0.02),
                           np.arange(x2_min, x2_max, 0.02))

    clf.fit(X_train_2d, y_train)
    score = clf.score(X_test_2d, y_test)

    # Predict a class for every grid node, then restore the grid shape.
    Z = clf.predict(np.column_stack((xx1.ravel(), xx2.ravel())))
    Z = Z.reshape(xx1.shape)
    ct = plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=region_cmap)
    plt.colorbar(ct)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    # Overlay the samples (train and test), one scatter call per class so that
    # each class gets its own colour and legend entry.
    for idx, cl in enumerate(np.unique(y_all)):
        in_class = y_all == cl
        plt.scatter(x=X_all[in_class, 0],
                    y=X_all[in_class, 1],
                    edgecolor='black',
                    s=100,
                    color=point_cmap(idx),
                    label=cl)

    ax.text(xx1.max() - 0.5, xx2.min() + 0.5, 'Test accuracy=%.2f' % score,
            size=15, horizontalalignment='right')

    plt.xlabel(axis_labels[0], fontsize=16)
    plt.ylabel(axis_labels[1], fontsize=16)
    plt.legend(loc='upper left')
    plt.tight_layout()

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
# Print the built-in documentation of the classifier class.
help(KNeighborsClassifier)

In [None]:
n_neighbors = 15

# Keep only the third and fourth features (petal length and petal width)
# so that the decision regions can be displayed in two dimensions.
X = data[iris.feature_names[2:4]].values
y = data['target'].values

# Hold out one third of the samples for testing, stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=1/3,
    stratify=y,
)

# Create the k-nn classifier; it is fitted inside plot_decision_regions.
clf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors)

plot_decision_regions(clf, X_train, y_train, X_test, y_test)
plt.title(f"3-Class classification (k = {n_neighbors})")
plt.show()

**QUESTION**

Change the number of neighbors and visualize the results.
Then, change the variables used.

# 3. k-nn regression

In [None]:
# Simulated 1-D regression problem: 50 inputs drawn in [0, 6), sorted in
# increasing order, with the sine of each input as its target.
np.random.seed(0)  # fixed seed so the same sample is drawn on every run
X = 6 * np.random.rand(50, 1)
X.sort(axis=0)
y = np.sin(X)
plt.scatter(X, y)

In [None]:
# Add noise to targets: every 4th sample gets its own draw from [0, 0.2).
# One draw per perturbed sample is needed; a single scalar draw would be
# broadcast and merely shift all selected targets by the same constant.
y[::4] += np.random.rand(*y[::4].shape) / 5

plt.scatter(X, y)

In [None]:
# Fit regression model
n_neighbors = 2
# 100 evenly spaced query points on [0, 6], shaped as a single-feature column.
T = np.linspace(0, 6, 100).reshape(-1, 1)

knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors)
knn.fit(X, y)
y_ = knn.predict(T)

# Training samples as points, model prediction as a curve over the query grid.
plt.scatter(X, y, label='data', color='darkorange')
plt.plot(T, y_, label='prediction', color='navy')
plt.axis('tight')
plt.legend()
plt.title(f"KNeighborsRegressor (k = {n_neighbors})")

plt.tight_layout()
plt.show()

**QUESTION**

Change the number of neighbors and visualize the results. Try with other simulated data.