import sklearn
from sklearn.datasets import load_wine

dataset = load_wine(return_X_y=True)

# array 178x13 (178 wine examples, each with 13 features)
features = dataset[0]
# array of 178 elements; each element is the class label of one example: 0, 1 or 2
class_labels = dataset[1]
print("features shape:", features.shape)
print("class_labels shape:", class_labels.shape)

features shape: (178, 13)
class_labels shape: (178,)


dataset[0][0:1]

array([[1.423e+01, 1.710e+00, 2.430e+00, 1.560e+01, 1.270e+02, 2.800e+00,
        3.060e+00, 2.800e-01, 2.290e+00, 5.640e+00, 1.040e+00, 3.920e+00,
        1.065e+03]])


dataset[1][0:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


# Import library to work with tabular data: https://pandas.pydata.org/
import pandas as pd

x, y = load_wine(return_X_y=True, as_frame=True)

x.head(3)


y.unique()

array([0, 1, 2])


import matplotlib.pyplot as plt

fig, axs = plt.subplots(figsize=(4, 3))
y.hist()
plt.suptitle("Label balance")
plt.show()


# Load dataset from torchvision.datasets
from torchvision import datasets

train_set = datasets.CIFAR10("content", train=True, download=True)
val_set = datasets.CIFAR10("content", train=False, download=True)
labels_names = train_set.classes

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to content/cifar-10-python.tar.gz

100%|██████████| 170498071/170498071 [00:03<00:00, 52644756.64it/s]

Extracting content/cifar-10-python.tar.gz to content
Files already downloaded and verified


import matplotlib.pyplot as plt

# Take the first three training images; they are reused below for the
# pairwise-distance example.
img_1, img_2, img_3 = train_set.data[:3]

# Show the three images side by side, one subplot per image.
fig, ax = plt.subplots(1, 3, figsize=(10, 3))
for axis, image, ordinal in zip(
    ax, (img_1, img_2, img_3), ("First", "Second", "Third")
):
    axis.set_title(f"{ordinal} image in train data")
    axis.imshow(image)
plt.show()


from sklearn.metrics import DistanceMetric

dist = DistanceMetric.get_metric("manhattan")
pairwise_dist = dist.pairwise([img_1.flatten(), img_2.flatten(), img_3.flatten()])


import numpy as np

fig, ax = plt.subplots(figsize=(4, 4))
im = ax.imshow(pairwise_dist)

# One tick per row/column of the distance matrix, labelled img1, img2, ...
# The labels are derived from the matrix size instead of a hard-coded range.
n_images = len(pairwise_dist)
tick_positions = np.arange(n_images)
tick_labels = [f"img{i}" for i in range(1, n_images + 1)]
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_xticklabels(tick_labels)
ax.set_yticklabels(tick_labels)

# Rotate the tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# Write every distance inside its cell. imshow draws column j along the x axis
# and row i along the y axis, hence the (j, i) order of the text coordinates.
for (i, j), distance in np.ndenumerate(pairwise_dist):
    ax.text(j, i, f"{distance:0.2f}", ha="center", va="center", color="w")

ax.set_title("Pairwise L_1 distance for first 3 images in CIFAR 10 ")
fig.tight_layout()
plt.show()


from sklearn.metrics.pairwise import manhattan_distances

# Use only the first `index_limiter` validation images to limit computational time.
index_limiter = 1000
# Convert all (32, 32, 3) images into 32 * 32 * 3 = 3072-element vectors.
flattened_images = val_set.data.reshape(val_set.data.shape[0], -1)[:index_limiter]
# Labels of the same subset, converted to an array once so that the boolean
# masks below are not rebuilt from the Python list on every iteration.
limited_targets = np.asarray(val_set.targets[:index_limiter])

num_classes = len(val_set.classes)
classwise_distance = np.zeros((num_classes, num_classes))

# One boolean membership mask per class id.
class_masks = [limited_targets == class_id for class_id in range(num_classes)]

# Iterate over all pairs of classes and slice out their members.
for class_id_i, class_i_mask in enumerate(class_masks):
    for class_id_j, class_j_mask in enumerate(class_masks):
        # manhattan_distances returns the full (n_i, n_j) matrix of pairwise
        # distances between the samples of the two classes, so its plain mean
        # is already the mean class-to-class distance; no extra division by 2
        # is needed. For i == j the zero self-distances on the diagonal are
        # part of that mean.
        classwise_distance[class_id_i, class_id_j] = np.mean(
            manhattan_distances(
                flattened_images[class_i_mask], flattened_images[class_j_mask]
            )
        )

fig, ax = plt.subplots(figsize=(8, 8))
im = ax.imshow(classwise_distance)

# One tick per class on both axes, labelled with the class names.
class_ticks = np.arange(len(val_set.classes))
ax.set_xticks(class_ticks)
ax.set_yticks(class_ticks)
ax.set_xticklabels(val_set.classes)
ax.set_yticklabels(val_set.classes)

# Rotate the tick labels and set their alignment.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

# The title is written in plain ASCII (Latin "M" in "Manhattan").
ax.set_title("Mean class-wise Manhattan distance for CIFAR 10")
# Add the colorbar before tight_layout so the layout pass takes it into account.
fig.colorbar(im)
fig.tight_layout()
plt.show()


fig, ax = plt.subplots(figsize=(3, 3))
sample_ship_img = val_set.data[18]
ax.set_title("Image in validation data")
plt.imshow(sample_ship_img)
plt.show()


from sklearn.neighbors import KNeighborsClassifier

# in order to limit computational time
index_limiter = 5000
x = train_set.data.reshape(train_set.data.shape[0], -1)[:index_limiter]
y = train_set.targets[:index_limiter]

distance_types = ["euclidean", "manhattan", "chebyshev"]
k_values = range(3, 7)
# The single validation image to classify, flattened like the training images.
query = [sample_ship_img.flatten()]

# One row per metric: column 0 holds the metric name, the remaining columns
# hold the predicted class name for each k (in the order of k_values).
res = np.empty(shape=(len(distance_types), len(k_values) + 1), dtype=object)

for row, distance_type in enumerate(distance_types):
    res[row, 0] = distance_type
    for col, k in enumerate(k_values, start=1):
        knn = KNeighborsClassifier(n_neighbors=k, metric=distance_type)
        knn.fit(x, y)
        result_class_id = knn.predict(query)[0]
        res[row, col] = train_set.classes[result_class_id]


import pandas as pd

pandas_res = pd.DataFrame(res, columns=["distance", "k=3", "k=4", "k=5", "k=6"])
pandas_res.set_index("distance", inplace=True)
pandas_res


from sklearn.metrics import accuracy_score

knn = KNeighborsClassifier(n_neighbors=6, metric="chebyshev")
knn.fit(x, y)
accuracy = accuracy_score(y_pred=knn.predict(x), y_true=y)  # accuracy

print("Accuracy:", f"{accuracy*100}%")

Accuracy: 27.98%


from sklearn.model_selection import train_test_split

# split data into train / validation / test (80% / 16% / 4% of the samples)
x_train, x_tmp, y_train, y_tmp = train_test_split(x, y, test_size=0.2)
x_val, x_test, y_val, y_test = train_test_split(x_tmp, y_tmp, test_size=0.2)

print("Train:", np.array(x_train).shape, np.array(y_train).shape)
print("Val:", np.array(x_val).shape, np.array(y_val).shape)
print("Test:", np.array(x_test).shape, np.array(y_test).shape)
print("Total:", np.array(x).shape, np.array(y).shape)

Train: (4000, 3072) (4000,)
Val: (800, 3072) (800,)
Test: (200, 3072) (200,)
Total: (5000, 3072) (5000,)


knn = KNeighborsClassifier(n_neighbors=6, metric="chebyshev")
knn.fit(x_train, y_train)

accuracy_train = accuracy_score(y_pred=knn.predict(x_train), y_true=y_train)
accuracy_val = accuracy_score(y_pred=knn.predict(x_val), y_true=y_val)
accuracy_test = accuracy_score(y_pred=knn.predict(x_test), y_true=y_test)

print("Accuracy train:", f"{accuracy_train*100}%")
print("Accuracy val :", f"{accuracy_val*100}%")
print("Accuracy test :", f"{accuracy_test*100}%")

Accuracy train: 26.775%
Accuracy val : 17.125%
Accuracy test : 12.5%


# Candidate numbers of neighbors: 1, 2, ..., 30.
num_neighbors = np.arange(1, 31)

# quality[i] will hold the train accuracy obtained with num_neighbors[i].
quality = np.zeros(num_neighbors.shape[0])

for i, k in enumerate(num_neighbors):
    # Fit a fresh k-NN for this k and score it on the data it was fitted on.
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train, y_train)
    quality[i] = accuracy_score(y_pred=knn.predict(x_train), y_true=y_train)

# Train accuracy as a function of the number of neighbors.
plt.figure(figsize=(8, 4))
plt.plot(num_neighbors, quality)
plt.title("k-NN on train", size=18)
plt.xlabel("Neighbors", size=12)
plt.ylabel("Accuracy", size=12)
plt.xticks(num_neighbors)
plt.show()


# Candidate numbers of nearest neighbors: 1, 2, ..., 30.
num_neighbors = np.arange(1, 31)
# Accuracy per candidate k, on the train part and on the test part.
train_quality = np.zeros(num_neighbors.shape[0])
test_quality = np.zeros(num_neighbors.shape[0])

for i, k in enumerate(num_neighbors):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train, y_train)

    # Score the same fitted model on both parts.
    train_quality[i] = accuracy_score(y_pred=knn.predict(x_train), y_true=y_train)
    test_quality[i] = accuracy_score(y_pred=knn.predict(x_test), y_true=y_test)

# Both accuracy curves on one figure.
plt.figure(figsize=(8, 4))
plt.title("k-NN on train vs test", size=18)
for curve, curve_label in ((train_quality, "train"), (test_quality, "test")):
    plt.plot(num_neighbors, curve, label=curve_label)
plt.legend()
plt.xticks(num_neighbors)
plt.xlabel("Neighbors", size=12)
plt.ylabel("Accuracy", size=12)
plt.show()


def count_lables(lables):
    """Count how many times each label occurs.

    Args:
        lables: iterable of hashable labels (e.g. a list or a 1-D NumPy array
            of class ids).

    Returns:
        A plain ``dict`` mapping each distinct label to its number of
        occurrences, with keys in order of first appearance.
    """
    # Imported locally so the function does not rely on a file-level import.
    from collections import Counter

    # Counter does the tallying; converting back to a plain dict keeps the
    # printed form ``{label: count}`` instead of ``Counter({...})``.
    return dict(Counter(lables))


def print_split_stat(x_train, x_test, y_train, y_test):
    """Print per-class label counts for the train and test parts of a split.

    Only the label arguments are inspected. ``x_train`` and ``x_test`` are
    accepted so the function can be called with the four values returned by
    ``train_test_split``, but they are not used.
    """
    for title, split_labels in (
        ("Train statistics: ", y_train),
        ("Test statistics:  ", y_test),
    ):
        print(title, count_lables(split_labels))


from sklearn.datasets import load_iris

data, labels = load_iris(return_X_y=True)
print("DataSet labels:\n", labels)
print("DataSet statistics: ", count_lables(labels))

DataSet labels:
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
DataSet statistics:  {0: 50, 1: 50, 2: 50}


x_train, x_test, y_train, y_test = train_test_split(
    data, labels, train_size=0.5, shuffle=False, random_state=42
)

print_split_stat(x_train, x_test, y_train, y_test)

Train statistics:  {0: 50, 1: 25}
Test statistics:   {1: 25, 2: 50}


x_train, x_test, y_train, y_test = train_test_split(
    data, labels, train_size=0.5, random_state=42
)

print_split_stat(x_train, x_test, y_train, y_test)

Train statistics:  {1: 27, 2: 27, 0: 21}
Test statistics:   {1: 23, 0: 29, 2: 23}


x_train, x_test, y_train, y_test = train_test_split(
    data, labels, train_size=0.5, random_state=42, stratify=labels
)

print_split_stat(x_train, x_test, y_train, y_test)

Train statistics:  {0: 25, 1: 25, 2: 25}
Test statistics:   {0: 25, 2: 25, 1: 25}


from sklearn.preprocessing import StandardScaler

np.random.seed(42)  # setting the initialization parameter for random values

x_train_feature = x_train[:, 0].reshape(-1, 1)

plt.figure(1, figsize=(8, 3))
plt.subplot(121)  # set location
plt.scatter(x_train_feature, range(len(x_train_feature)), c=y_train)
plt.ylabel("Num examples", fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.title("Non scaled data", fontsize=18)

# scale data  with StandardScaler
scaler = StandardScaler()
scaler.fit(x_train_feature)
x_train_feature_scaled = scaler.transform(x_train_feature)

plt.subplot(122)
plt.scatter(x_train_feature_scaled, range(len(x_train_feature)), c=y_train)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.title("StandardScaler", fontsize=18)
plt.show()


# split data to train/test
x_train, x_test, y_train, y_test = train_test_split(
    data, labels, random_state=42, test_size=0.5
)

knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(x_train, y_train)

print("Without normalization")
accuracy_train = accuracy_score(y_pred=knn.predict(x_train), y_true=y_train)
print("accuracy_train", round(accuracy_train, 3))
accuracy_test = accuracy_score(y_pred=knn.predict(x_test), y_true=y_test)
print("accuracy_test", round(accuracy_test, 3))

Without normalization
accuracy_train 0.933
accuracy_test 0.947


scaler = StandardScaler()
scaler.fit(x_train)
x_train_norm = scaler.transform(x_train)  # scaling data
x_test_norm = scaler.transform(x_test)  # scaling data

knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(x_train_norm, y_train)

print("With normalization")
accuracy_train = accuracy_score(y_pred=knn.predict(x_train_norm), y_true=y_train)
print("accuracy_train", round(accuracy_train, 3))
accuracy_test = accuracy_score(y_pred=knn.predict(x_test_norm), y_true=y_test)
print("accuracy_test", round(accuracy_test, 3))

With normalization
accuracy_train 0.96
accuracy_test 0.973


import numpy as np
import sklearn.datasets
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

dataset = sklearn.datasets.load_iris()  # load data
x = dataset.data  # features
y = dataset.target  # labels(classes)

np.random.seed(42)


def split_and_train(x, y, random_state, max_neighbors=30):
    """Fit k-NN for k = 1..max_neighbors on one stratified split and plot accuracy.

    The data is split 80/20 into train and validation parts, stratified by
    ``y``. For every k a ``KNeighborsClassifier`` is fitted on the train part
    and its accuracy on both parts is plotted against k.

    Args:
        x: feature matrix.
        y: class labels, one per row of ``x``.
        random_state: seed passed to ``train_test_split``; it is also shown in
            the plot title.
        max_neighbors: largest number of neighbors to try. Defaults to 30, the
            value that used to be hard-coded.
            NOTE(review): presumably must not exceed the number of training
            samples — confirm against the KNeighborsClassifier documentation.

    Returns:
        None. The function only shows a matplotlib figure.
    """
    x_train, x_val, y_train, y_val = train_test_split(
        x, y, train_size=0.8, stratify=y, random_state=random_state
    )

    num_neighbors = np.arange(1, max_neighbors + 1)  # candidate k values

    train_accuracy = np.zeros(max_neighbors)
    val_accuracy = np.zeros(max_neighbors)

    # idx is the position of k in num_neighbors, so the result arrays are
    # filled without relying on the "k - 1" offset.
    for idx, k in enumerate(num_neighbors):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(x_train, y_train)

        train_accuracy[idx] = accuracy_score(
            y_pred=knn.predict(x_train), y_true=y_train
        )
        val_accuracy[idx] = accuracy_score(y_pred=knn.predict(x_val), y_true=y_val)

    # accuracy plot on train and validation data
    plt.figure(figsize=(10, 4))
    plt.title(f"KNN on train vs val, seed = {random_state}", size=20)
    plt.plot(num_neighbors, train_accuracy, label="train")
    plt.plot(num_neighbors, val_accuracy, label="val")
    plt.legend()
    plt.xticks(num_neighbors, size=12)
    plt.xlabel("Neighbors", size=14)
    plt.ylabel("Accuracy", size=14)
    plt.show()


split_and_train(x, y, random_state=42)


split_and_train(x, y, random_state=4)


from sklearn.model_selection import KFold

x = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
y = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])

print("index without shuffle")
kf = KFold(n_splits=3)
for train_index, test_index in kf.split(x):
    print("TRAIN:", train_index, "TEST:", test_index)

print("index with shuffle")
kf = KFold(n_splits=3, random_state=42, shuffle=True)
for train_index, test_index in kf.split(x):
    print("TRAIN:", train_index, "TEST:", test_index)

index without shuffle
TRAIN: [3 4 5 6 7 8] TEST: [0 1 2]
TRAIN: [0 1 2 6 7 8] TEST: [3 4 5]
TRAIN: [0 1 2 3 4 5] TEST: [6 7 8]
index with shuffle
TRAIN: [0 2 3 4 6 8] TEST: [1 5 7]
TRAIN: [1 3 4 5 6 7] TEST: [0 2 8]
TRAIN: [0 1 2 5 7 8] TEST: [3 4 6]


from sklearn.model_selection import cross_val_score, StratifiedKFold

np.random.seed(42)

dataset = sklearn.datasets.load_iris()  # load data
x = dataset.data  # features
y = dataset.target  # labels(classes)

x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, stratify=y, random_state=42
)

cv = StratifiedKFold(n_splits=5)

knn = KNeighborsClassifier(n_neighbors=3)
accuracy3 = cross_val_score(knn, x_train, y_train, cv=cv, scoring="accuracy")

knn = KNeighborsClassifier(n_neighbors=5)
accuracy5 = cross_val_score(knn, x_train, y_train, cv=cv, scoring="accuracy")


knn_cv = np.vstack(
    (
        np.hstack((accuracy3, accuracy3.mean(), accuracy3.std())),
        np.hstack((accuracy5, accuracy5.mean(), accuracy5.std())),
    )
)


import pandas as pd

table = pd.DataFrame(
    knn_cv, columns=["Fold1", "Fold2", "Fold3", "Fold4", "Fold5", "Mean", "Std"]
)
table = table.set_axis(["Accuracy"] * 2)


table


from sklearn.datasets import make_moons

x, y = make_moons(n_samples=1000, noise=0.3, random_state=42)

plt.figure(figsize=(10, 5))
plt.scatter(x[:, 0], x[:, 1], c=y)
plt.show()


x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, stratify=y, random_state=42
)


from sklearn.model_selection import GridSearchCV
from warnings import simplefilter

simplefilter("ignore", category=RuntimeWarning)

"""
Parameters for GridSearchCV:
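estimator — the model whose hyperparameters are tuned
cv — cross-validation splitting strategy (a number of folds or a splitter object such as KFold)
param_grid — dict mapping parameter names to the lists of values to try
scoring — metric used to evaluate each parameter combination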
n_jobs — number of jobs to run in parallel, -1 means using all processors.
"""

model = GridSearchCV(
    estimator=KNeighborsClassifier(),
    cv=KFold(5, shuffle=True, random_state=42),
    param_grid={
        "n_neighbors": np.arange(1, 31),
        "metric": ["euclidean", "manhattan"],
        "weights": ["uniform", "distance"],
    },
    scoring="accuracy",
    n_jobs=-1,
)
model.fit(x_train, y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]),
                         'weights': ['uniform', 'distance']},
             scoring='accuracy')

GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]),
                         'weights': ['uniform', 'distance']},
             scoring='accuracy')

KNeighborsClassifier()

KNeighborsClassifier()


print("Metric:", model.best_params_["metric"])
print("Num neighbors:", model.best_params_["n_neighbors"])
print("Weigths:", model.best_params_["weights"])

Metric: euclidean
Num neighbors: 30
Weigths: distance


from sklearn.metrics import balanced_accuracy_score

y_pred = model.predict(x_test)
print(
    f"Percent correct predictions {np.round(accuracy_score(y_pred=y_pred, y_true=y_test)*100,2)} %"
)
print(
    f"Percent correct predictions(balanced classes) {np.round(balanced_accuracy_score(y_pred=y_pred, y_true=y_test)*100,2)} %"
)

Percent correct predictions 95.5 %
Percent correct predictions(balanced classes) 95.5 %


list(model.cv_results_.keys())

['mean_fit_time',
 'std_fit_time',
 'mean_score_time',
 'std_score_time',
 'param_metric',
 'param_n_neighbors',
 'param_weights',
 'params',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'mean_test_score',
 'std_test_score',
 'rank_test_score']


plt.figure(figsize=(14, 4))
plt.subplot(121)
plt.plot(model.cv_results_["mean_test_score"])
plt.title("Mean test score", size=20)
plt.xlabel("Num of experiment", size=15)
plt.ylabel("Accuracy", size=15)

plt.subplot(122)
plt.plot(model.cv_results_["param_metric"])
plt.title("Param Metric", size=20)
plt.xlabel("Num of experiment", size=15)

plt.show()


# Keep only the experiments that used the best metric and the best weighting
# scheme, so that n_neighbors is the only parameter varying along the curve.
best_metric = model.best_params_["metric"]
best_weights = model.best_params_["weights"]

selected_means = []
selected_std = []
num_neighbors = []
for ind, params in enumerate(model.cv_results_["params"]):
    if params["metric"] == best_metric and params["weights"] == best_weights:
        num_neighbors.append(params["n_neighbors"])
        selected_means.append(model.cv_results_["mean_test_score"][ind])
        selected_std.append(model.cv_results_["std_test_score"][ind])


plt.figure(figsize=(10, 4))
# Build the title from the best parameters: after the loop `params` holds the
# last grid entry, which is not necessarily the configuration plotted here.
plt.title(f"KNN CV, {best_metric}, {best_weights}", size=18)
plt.errorbar(num_neighbors, selected_means, yerr=selected_std, fmt="-o")
plt.xticks(num_neighbors, size=13)
plt.ylabel("Mean_test_score", size=15)
plt.xlabel("Neighbors", size=15)

plt.show()


from sklearn.model_selection import RandomizedSearchCV

"""
Parameters for RandomizedSearchCV:
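estimator — the model whose hyperparameters are tuned
cv — cross-validation splitting strategy (a number of folds or a splitter object such as KFold)
param_distributions — dict mapping parameter names to the lists (or distributions) of values to sample from
n_iter — number of parameter settings that are sampled. n_iter trades off runtime vs quality of the solution.
scoring — metric used to evaluate each sampled parameter combination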
n_jobs — number of jobs to run in parallel, -1 means using all processors.
"""

model = RandomizedSearchCV(
    estimator=KNeighborsClassifier(),
    n_iter=100,
    cv=KFold(5, shuffle=True, random_state=42),
    param_distributions={
        "n_neighbors": np.arange(1, 31),
        "metric": ["euclidean", "manhattan"],
        "weights": ["uniform", "distance"],
    },
    scoring="accuracy",
    n_jobs=-1,
)
model.fit(x_train, y_train)

RandomizedSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
                   estimator=KNeighborsClassifier(), n_iter=100, n_jobs=-1,
                   param_distributions={'metric': ['euclidean', 'manhattan'],
                                        'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]),
                                        'weights': ['uniform', 'distance']},
                   scoring='accuracy')

RandomizedSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
                   estimator=KNeighborsClassifier(), n_iter=100, n_jobs=-1,
                   param_distributions={'metric': ['euclidean', 'manhattan'],
                                        'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]),
                                        'weights': ['uniform', 'distance']},
                   scoring='accuracy')

KNeighborsClassifier()

KNeighborsClassifier()


print("Metric:", model.best_params_["metric"])
print("Num neighbors:", model.best_params_["n_neighbors"])
print("Weigths:", model.best_params_["weights"])

Metric: manhattan
Num neighbors: 29
Weigths: distance


y_pred = model.predict(x_test)
print(
    f"Percent correct predictions {np.round(accuracy_score(y_pred=y_pred, y_true=y_test)*100,2)} %"
)
print(
    f"Percent correct predictions(balanced classes) {np.round(balanced_accuracy_score(y_pred=y_pred, y_true=y_test)*100,2)} %"
)

Percent correct predictions 95.0 %
Percent correct predictions(balanced classes) 95.0 %


from sklearn.metrics import classification_report

y_true = [0, 1, 2, 2, 2]
y_pred = [0, 0, 2, 2, 1]
target_names = ["class 0", "class 1", "class 2"]
print(classification_report(y_true, y_pred, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.50      1.00      0.67         1
     class 1       0.00      0.00      0.00         1
     class 2       1.00      0.67      0.80         3

    accuracy                           0.60         5
   macro avg       0.50      0.56      0.49         5
weighted avg       0.70      0.60      0.61         5


import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics

fig, ax = plt.subplots(1, 2, figsize=(10, 4))
fig.tight_layout(pad=3.0)
plt.rcParams.update({"font.size": 16})
# font = {'size':'21'}
ax[0].set_title("Balanced data")
ax[1].set_title("Unbalanced data")

labels = ["Airplane", "Auto", "Bird"]

# Balanced data
air, auto, bird = 150, 150, 150
actual_b = np.array([0] * air + [1] * auto + [2] * bird)
predicted_b = np.array([0] * (air - 10) + [1] * (auto + 20) + [2] * (bird - 10))

# Unbalanced data
air, auto, bird = 430, 10, 10
actual_ub = np.array([0] * air + [1] * auto + [2] * bird)
predicted_ub = np.array([0] * (air + 20) + [1] * (auto - 10) + [2] * (bird - 10))

metrics.ConfusionMatrixDisplay(
    confusion_matrix=metrics.confusion_matrix(actual_b, predicted_b),
    display_labels=labels,
).plot(ax=ax[0])

metrics.ConfusionMatrixDisplay(
    confusion_matrix=metrics.confusion_matrix(actual_ub, predicted_ub),
    display_labels=labels,
).plot(ax=ax[1])

label_font = {"size": "15"}  # Adjust to fit
ax[0].set_xlabel("Predicted labels", fontdict=label_font)
ax[0].set_ylabel("True labels", fontdict=label_font)
ax[1].set_xlabel("Predicted labels", fontdict=label_font)
ax[1].set_ylabel("True labels", fontdict=label_font)

plt.show()

print(
    "Accuracy Balanced   Data:", round(metrics.accuracy_score(actual_b, predicted_b), 2)
)
print(
    "Accuracy Unbalanced Data:",
    round(metrics.accuracy_score(actual_ub, predicted_ub), 2),
)

Accuracy Balanced   Data: 0.96
Accuracy Unbalanced Data: 0.96


print(
    "Balanced accuracy for Balanced data  :",
    round(metrics.balanced_accuracy_score(actual_b, predicted_b), 2),
)
print(
    "Balanced accuracy for Unbalanced data :",
    round(metrics.balanced_accuracy_score(actual_ub, predicted_ub), 2),
)

Balanced accuracy for Balanced data  : 0.96
Balanced accuracy for Unbalanced data : 0.33


# fmt: off
y_true = [[0,1,1,1],
         [0,0,1,0],
         [1,1,0,0]]

y_pred = [[0,1,0,1],
          [0,1,1,1],
          [1,0,1,1]]
# fmt: on


from sklearn.metrics import multilabel_confusion_matrix

multilabel_confusion_matrix(y_true, y_pred)

array([[[2, 0],
        [0, 1]],

       [[0, 1],
        [1, 1]],

       [[0, 1],
        [1, 1]],

       [[0, 2],
        [0, 1]]])


from sklearn.metrics import classification_report

label_names = ["label A", "label B", "label C", "label D"]

print(classification_report(y_true, y_pred, target_names=label_names))

              precision    recall  f1-score   support

     label A       1.00      1.00      1.00         1
     label B       0.50      0.50      0.50         2
     label C       0.50      0.50      0.50         2
     label D       0.33      1.00      0.50         1

   micro avg       0.50      0.67      0.57         6
   macro avg       0.58      0.75      0.62         6
weighted avg       0.56      0.67      0.58         6
 samples avg       0.56      0.72      0.57         6

	alcohol	malic_acid	ash	alcalinity_of_ash	magnesium	total_phenols	flavanoids	nonflavanoid_phenols	proanthocyanins	color_intensity	hue	od280/od315_of_diluted_wines	proline
0	14.23	1.71	2.43	15.6	127.0	2.80	3.06	0.28	2.29	5.64	1.04	3.92	1065.0
1	13.20	1.78	2.14	11.2	100.0	2.65	2.76	0.26	1.28	4.38	1.05	3.40	1050.0
2	13.16	2.36	2.67	18.6	101.0	2.80	3.24	0.30	2.81	5.68	1.03	3.17	1185.0

	Fold1	Fold2	Fold3	Fold4	Fold5	Mean	Std
Accuracy	0.916667	0.958333	0.958333	0.958333	1.0	0.958333	0.026352
Accuracy	0.916667	1.000000	0.958333	1.000000	1.0	0.975000	0.033333

	$\large y=1$	$\large y=0$
$\large \widehat{y}=1$	$\large \text{True Positive} \ (TP) $	$\large \text{False Positive} \ (FP) $
$\large \widehat{y}=0$	$\large \text{False Negative} \ (FN)$	$\large \text{True Negative} \ (TN) $

	k=3	k=4	k=5	k=6
distance
euclidean	automobile	ship	ship	ship
manhattan	automobile	automobile	truck	ship
chebyshev	ship	ship	ship	ship

Два пути¶

Задача курса¶

AI, ML, DL¶

Области применения¶

Связь с наукой¶

Обзор курса¶

Задачи¶

Базовые¶

Классификация¶

Регрессия¶

Кластеризация¶

Комбинированные задачи¶

План исследования¶

Сбор и подготовка данных¶

Обучение vs применение¶

Разведочный анализ¶

Baseline¶

Метрики¶

Построение модели, эксперименты¶

Проверка гипотез¶

Анализ работы модели¶

Инструменты¶

Данные¶

Связность данных¶

Загрузка и визуализация данных¶

Работа с данными и моделью¶

Описание данных¶

Близость данных согласно метрике¶

Описание модели k-NN¶

Простейшая метрика¶

Разделение train-validation-test¶

Параметры и гиперпараметры модели¶

Стратификация¶

Нормализация¶

k-NN в прикладных задачах¶

Кросс-валидация¶

Алгоритм кросс-валидации¶

Оценка результата кросс-валидации¶

Типичные ошибки при кросс-валидации¶

GridSearch¶

RandomizedSearch¶

Метрики¶

Accuracy¶

Precision, Recall¶

F-мера¶

AUC-ROC¶

Построение¶

Смысл метрики¶

Multilabel¶