python_list = [[1, 2, 3], [4, 5, 6]]
python_list_various = ["a", 15, 123.8, [99, "I love you"], [True, True, False]]

print(python_list_various)
print(python_list)

['a', 15, 123.8, [99, 'I love you'], [True, True, False]]
[[1, 2, 3], [4, 5, 6]]


import numpy as np

numpy_arr = np.array(python_list, dtype=float)
print(numpy_arr)

# This code will cause an error
# invalid_numpy_arr = np.array([[1,2,3],[4,5]],dtype = float)

[[1. 2. 3.]
 [4. 5. 6.]]


vector = np.array([1, 0, 0])
row_diff = numpy_arr - vector
print("Substract row from array", row_diff)

scalar_product = numpy_arr.dot(vector)
print("Scalar product", scalar_product)

Substract row from array [[0. 2. 3.]
 [3. 5. 6.]]
Scalar product [1. 4.]


import torch

my_tensor = torch.tensor(numpy_arr)
print("torch.Tensor\n", my_tensor, "\nshape =", my_tensor.shape)

squared_numpy = my_tensor.pow(2).numpy()
print("Numpy\n", squared_numpy, "\nshape =", squared_numpy.shape)

torch.Tensor
 tensor([[1., 2., 3.],
        [4., 5., 6.]], dtype=torch.float64) 
shape = torch.Size([2, 3])
Numpy
 [[ 1.  4.  9.]
 [16. 25. 36.]] 
shape = (2, 3)


# Download dataset
# !wget -q https://data.cityofnewyork.us/api/views/rsgh-akpg/rows.csv?accessType=DOWNLOAD -O dogs.csv
!wget -q https://edunet.kea.su/repo/EduNet-web_dependencies/datasets/dogs.csv -O dogs.csv


import pandas as pd


# Load into pandas and display a sample
dataset = pd.read_csv("dogs.csv")

dataset.head(3)


# Report the share of rows that are exact duplicates of another row.
total_rows = len(dataset)
unique_rows = len(dataset.drop_duplicates())

if total_rows == unique_rows:
    print("Очевидных дубликатов нет")
else:
    # Compute the percentage first: `%` string formatting has the same
    # precedence as `/` and `*`, so applying it to an unparenthesised
    # expression would format the first operand only and then try to divide
    # the resulting string.
    # The share of duplicates is the part of the rows removed by
    # drop_duplicates(), not the part that remains.
    duplicate_percent = (total_rows - unique_rows) / total_rows * 100
    print("%.2f процентов данных являются дубликатами" % duplicate_percent)

Очевидных дубликатов нет


import matplotlib.pyplot as plt
import numpy as np

x = np.arange(len(dataset))
plt.xlabel("Index")
plt.ylabel("UniqueID")
plt.scatter(x, dataset["UniqueID"], s=0.1)
plt.show()


dataset["UniqueID"].max(), len(dataset["UniqueID"])

(12383, 22663)


dataset["UniqueID"].dtype

dtype('int64')


dataset["UniqueID"].min(), dataset["UniqueID"].max()

(1, 12383)


dataset_filtered = dataset.copy()
dataset_filtered["UniqueID"] = dataset["UniqueID"].astype("uint16")


def resources_gain(
    column="UniqueID", orig_dataset=dataset, filtered_dataset=dataset_filtered
):
    """Print how many times less memory `column` takes after conversion.

    Args:
        column: name of the column to compare.
        orig_dataset: DataFrame holding the column before conversion.
        filtered_dataset: DataFrame holding the column after conversion.

    The ratio "memory before / memory after" is printed rounded to two
    decimals; nothing is returned.
    """
    # deep=True asks pandas for the deep memory usage of the column
    bytes_before, bytes_after = (
        frame[column].memory_usage(deep=True)
        for frame in (orig_dataset, filtered_dataset)
    )
    print(f"Gain: {round(bytes_before / bytes_after, 2)}")


resources_gain(
    column="UniqueID", orig_dataset=dataset, filtered_dataset=dataset_filtered
)

Gain: 3.99


dataset_filtered["DateOfBite"] = pd.to_datetime(dataset["DateOfBite"])


resources_gain(
    column="DateOfBite", orig_dataset=dataset, filtered_dataset=dataset_filtered
)

Gain: 8.87


dataset_filtered["DateOfBite"].hist()
plt.show()


import sklearn
from sklearn.datasets import load_wine

# https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_wine.html#sklearn.datasets.load_wine

# Load dataset as a (features, labels) tuple;
# load_wine can also return a Bunch (dictionary-like) or a pandas DataFrame
dataset = load_wine(return_X_y=True)

# features: array 178x13 (178 bottles, each with 13 features)
# class_labels: array of 178 elements, each element is the class number: 0, 1 or 2
features, class_labels = dataset
print("features shape:", features.shape)
print("class_labels shape:", class_labels.shape)

features shape: (178, 13)
class_labels shape: (178,)


dataset

(array([[1.423e+01, 1.710e+00, 2.430e+00, ..., 1.040e+00, 3.920e+00,
         1.065e+03],
        [1.320e+01, 1.780e+00, 2.140e+00, ..., 1.050e+00, 3.400e+00,
         1.050e+03],
        [1.316e+01, 2.360e+00, 2.670e+00, ..., 1.030e+00, 3.170e+00,
         1.185e+03],
        ...,
        [1.327e+01, 4.280e+00, 2.260e+00, ..., 5.900e-01, 1.560e+00,
         8.350e+02],
        [1.317e+01, 2.590e+00, 2.370e+00, ..., 6.000e-01, 1.620e+00,
         8.400e+02],
        [1.413e+01, 4.100e+00, 2.740e+00, ..., 6.100e-01, 1.600e+00,
         5.600e+02]]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2]))


# Import library to work with tabular data: https://pandas.pydata.org/
import pandas as pd

dataset_bunch = load_wine(return_X_y=False)
print(dataset_bunch.keys())

df = pd.DataFrame(dataset_bunch.data, columns=dataset_bunch.feature_names)
df.head()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])


from IPython.display import clear_output
from torchvision import datasets

train_set = datasets.CIFAR10("content", train=True, download=True)
val_set = datasets.CIFAR10("content", train=False, download=True)

clear_output()


import pickle
from matplotlib import pyplot as plt

plt.rcParams["figure.figsize"] = (20, 10)

# load labels names for visualization
with open("content/cifar-10-batches-py/batches.meta", "rb") as infile:
    cifar_meta = pickle.load(infile)
labels_name = cifar_meta["label_names"]

for j in range(10):
    img, label = train_set[j]
    plt.subplot(1, 10, j + 1)
    plt.imshow(img)
    plt.axis("off")
    plt.title(labels_name[label])


train_set[0]

(<PIL.Image.Image image mode=RGB size=32x32 at 0x7A5CAE613B50>, 6)


from torch.utils.data import DataLoader
from torchvision import transforms

val_set.transform = transforms.Compose(
    [transforms.ToTensor()]
)  # PIL Image to Pytorch tensor
val_loader = DataLoader(val_set, batch_size=8, shuffle=False)

for batch in val_loader:
    imgs, labels = batch
    print(len(batch))
    print("Images: ", imgs.shape)
    print("Labels: ", labels.shape)
    print(labels)
    break

2
Images:  torch.Size([8, 3, 32, 32])
Labels:  torch.Size([8])
tensor([3, 8, 8, 0, 6, 6, 1, 6])


import torch


class FakeModel(torch.nn.Module):
    """Baseline "model" that stores the training data and predicts at random.

    `fit` only accumulates the batches it receives. `forward` ignores the
    content of its input and draws one random class index per sample, so the
    model is useful only as a chance-level reference.
    """

    def __init__(self):
        super().__init__()
        self.train_data = None
        self.train_labels = None

    def fit(self, x, y):
        """Append the batch `x` and its labels `y` to the stored training set."""
        # Use identity checks: `tensor != None` depends on tensor comparison
        # semantics, `is None` does not.
        if self.train_data is None:
            self.train_data = x
        else:
            self.train_data = torch.vstack((self.train_data, x))

        if self.train_labels is None:
            self.train_labels = y
        else:
            self.train_labels = torch.hstack((self.train_labels, y))

    def forward(self, x):
        """Return a 1-D tensor with one random label per sample of batch `x`.

        Raises:
            RuntimeError: if called before any data was passed to `fit`.
        """
        if self.train_labels is None:
            raise RuntimeError("FakeModel.forward() called before fit()")

        # x is a batch, not a single sample!
        # The number of classes is the number of distinct labels seen in fit();
        # NOTE(review): assumes labels are encoded as 0..class_count-1.
        class_count = torch.unique(self.train_labels).shape[0]
        # https://pytorch.org/docs/stable/generated/torch.randint.html#torch-randint
        # `high` is exclusive, so pass class_count itself; with
        # class_count - 1 the last class could never be predicted.
        # size is the shape of the output tensor.
        return torch.randint(low=0, high=class_count, size=(x.shape[0],))


train_set.transform = transforms.Compose(
    [
        transforms.ToTensor(),
    ]
)  # PIL Image to PyTorch tensor
train_loader = DataLoader(train_set, batch_size=1024, shuffle=True)

model = FakeModel()

for img_batch, labels_batch in train_loader:
    model.fit(img_batch, labels_batch)


img_batch, label_batch = next(iter(val_loader))
predicted_labels = model(img_batch)

for i, predicted_label in enumerate(predicted_labels):
    img = img_batch[i].permute(1, 2, 0).numpy() * 255
    plt.subplot(1, len(predicted_labels), i + 1)
    plt.imshow(img.astype(int))
    plt.axis("off")
    plt.title(labels_name[int(predicted_label)])


from sklearn.metrics import accuracy_score

accuracy = []
for img_batch, labels_batch in val_loader:
    predicted = model(img_batch)
    batch_accuracy = accuracy_score(labels_batch, predicted)
    accuracy.append(batch_accuracy)

print("Accuracy", torch.tensor(accuracy).mean())

Accuracy tensor(0.0980, dtype=torch.float64)


from IPython.display import clear_output
from torchvision import datasets

# Load dataset from torchvision.datasets
train_set = datasets.CIFAR10("content", train=True, download=True)
val_set = datasets.CIFAR10("content", train=False, download=True)
labels_names = train_set.classes
clear_output()


import matplotlib.pyplot as plt

img_1 = train_set.data[0]
img_2 = train_set.data[1]
img_3 = train_set.data[2]

fix, ax = plt.subplots(1, 3, figsize=(12, 4))
ax[0].set_title("First image in CIFAR10 train data")
ax[0].imshow(img_1)
ax[1].set_title("Second image in CIFAR10 train data")
ax[1].imshow(img_2)
ax[2].set_title("Third image in CIFAR10 train data")
ax[2].imshow(img_3)
plt.show()


sample_ship_img = val_set.data[18]
plt.figure(figsize=(4, 4))
plt.imshow(sample_ship_img)
plt.show()


from sklearn.neighbors import KNeighborsClassifier

# in order to limit computational time
index_limiter = 5000
x = train_set.data.reshape(train_set.data.shape[0], -1)[:index_limiter]
y = train_set.targets[:index_limiter]

for metric_type in ["euclidean", "manhattan", "chebyshev"]:
    print()
    for k in range(3, 7, 1):
        knn = KNeighborsClassifier(n_neighbors=k, metric=metric_type)
        knn.fit(x, y)
        result_class_id = knn.predict([sample_ship_img.flatten()])[0]
        result_class = train_set.classes[result_class_id]
        print(f"{k}-NN with {metric_type} metric\npredicted class is: {result_class}\n")

3-NN with euclidean metric
predicted class is: automobile

4-NN with euclidean metric
predicted class is: ship

5-NN with euclidean metric
predicted class is: ship

6-NN with euclidean metric
predicted class is: ship


3-NN with manhattan metric
predicted class is: automobile

4-NN with manhattan metric
predicted class is: automobile

5-NN with manhattan metric
predicted class is: truck

6-NN with manhattan metric
predicted class is: ship


3-NN with chebyshev metric
predicted class is: ship

4-NN with chebyshev metric
predicted class is: ship

5-NN with chebyshev metric
predicted class is: ship

6-NN with chebyshev metric
predicted class is: ship


import sklearn.datasets

cancer = sklearn.datasets.load_breast_cancer()  # load data

x = cancer.data  # features
y = cancer.target  # labels(classes)
print(f"x shape: {x.shape}, y shape: {y.shape}")
print(f"x[0]: \n {x[0]}")
print(f"y[0]: \n {y[0]}")

x shape: (569, 30), y shape: (569,)
x[0]: 
 [1.799e+01 1.038e+01 1.228e+02 1.001e+03 1.184e-01 2.776e-01 3.001e-01
 1.471e-01 2.419e-01 7.871e-02 1.095e+00 9.053e-01 8.589e+00 1.534e+02
 6.399e-03 4.904e-02 5.373e-02 1.587e-02 3.003e-02 6.193e-03 2.538e+01
 1.733e+01 1.846e+02 2.019e+03 1.622e-01 6.656e-01 7.119e-01 2.654e-01
 4.601e-01 1.189e-01]
y[0]: 
 0


plt.figure(figsize=(6, 4))  # set figure size
# Each bar shows the number of samples of one class. The legend entry for
# class k must be target_names[k], otherwise the class names are swapped.
plt.bar(1, (y == 1).sum(), label=cancer.target_names[1])  # 1 label
plt.bar(0, (y == 0).sum(), label=cancer.target_names[0])  # 0 label
plt.title("Class balance")
plt.ylabel("Num examples")
plt.xticks(ticks=[1, 0], labels=["1", "0"])
plt.legend(loc="upper left")
plt.show()


import pandas as pd

pd.DataFrame(x).describe()


import seaborn as sns

plt.figure(figsize=(10, 5))
ax = sns.boxenplot(data=pd.DataFrame(x), orient="h", palette="Set2")
ax.set(xscale="log", xlim=(1e-4, 1e4), xlabel="Values", ylabel="Features")
plt.show()


from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler


test = x[:, 0].reshape(-1, 1)

plt.figure(1, figsize=(24, 5))
plt.subplot(141)  # set location
plt.scatter(test, range(len(test)), c=y)
plt.ylabel("Num examples", fontsize=15)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.title("Non scaled data", fontsize=18)

# scale data with MinMaxScaler
test_scaled = MinMaxScaler().fit_transform(test)
plt.subplot(142)
plt.scatter(test_scaled, range(len(test)), c=y)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.title("MinMaxScaler", fontsize=18)

# scale data  with StandardScaler
test_scaled = StandardScaler().fit_transform(test)
plt.subplot(143)
plt.scatter(test_scaled, range(len(test)), c=y)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.title("StandardScaler", fontsize=18)

# scale data  with RobustScaler
test_scaled = RobustScaler().fit_transform(test)
plt.subplot(144)
plt.scatter(test_scaled, range(len(test)), c=y)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.title("RobustScaler", fontsize=18)
plt.show()


x_norm = StandardScaler().fit_transform(x)  # scaled data


pd.DataFrame(x_norm).describe()


plt.figure(figsize=(10, 5))
ax = sns.boxenplot(data=pd.DataFrame(x_norm), orient="h", palette="Set2")
ax.set(xlabel="Values", ylabel="Features")
plt.show()


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier


# split data to train/test
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=25)

knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(x_train, y_train)

print("Without normalization")
accuracy_train = accuracy_score(y_pred=knn.predict(x_train), y_true=y_train)
print("accuracy_train", round(accuracy_train, 3))
accuracy_test = accuracy_score(y_pred=knn.predict(x_test), y_true=y_test)
print("accuracy_test", round(accuracy_test, 3))

Without normalization
accuracy_train 0.946
accuracy_test 0.909


scaler = StandardScaler()
scaler.fit(x_train)
x_train_norm = scaler.transform(x_train)  # scaling data
x_test_norm = scaler.transform(x_test)  # scaling data

knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(x_train_norm, y_train)

print("With normalization")
accuracy_train = accuracy_score(y_pred=knn.predict(x_train_norm), y_true=y_train)
print("accuracy_train", round(accuracy_train, 3))
accuracy_test = accuracy_score(y_pred=knn.predict(x_test_norm), y_true=y_test)
print("accuracy_test", round(accuracy_test, 3))

With normalization
accuracy_train 0.979
accuracy_test 0.951


import numpy as np


num_neighbors = np.arange(1, 31)  # array of the numbers of neighbors from 1 to 30

# quality[i] holds the test-set accuracy of the model with num_neighbors[i] neighbors
quality = np.zeros(num_neighbors.shape[0])

for i, k in enumerate(num_neighbors):  # for every number of neighbors
    # fit a separate k-NN model on the normalized train data
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_norm, y_train)
    q = accuracy_score(y_pred=knn.predict(x_test_norm), y_true=y_test)  # accuracy
    quality[i] = q  # fill quality

plt.figure(figsize=(10, 5))
# The curve is accuracy measured on x_test_norm / y_test, so label it as test
plt.title("KNN on test", size=20)
plt.xlabel("Neighbors", size=10)
plt.ylabel("Accuracy", size=10)
plt.plot(num_neighbors, quality)
plt.xticks(num_neighbors)
plt.show()


# array of the numbers of nearest neighbors from 1 to 30
num_neighbors = np.arange(1, 31)
n_settings = num_neighbors.shape[0]
train_quality = np.zeros(n_settings)  # quality on train data
test_quality = np.zeros(n_settings)  # quality on test data

for i, k in enumerate(num_neighbors):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train_norm, y_train)

    # predictions of the same fitted model on both parts of the split
    train_pred = knn.predict(x_train_norm)
    test_pred = knn.predict(x_test_norm)

    # accuracy on train data and on test data
    train_quality[i] = accuracy_score(y_true=y_train, y_pred=train_pred)
    test_quality[i] = accuracy_score(y_true=y_test, y_pred=test_pred)

# accuracy plot  on train and test data
plt.figure(figsize=(10, 5))
plt.title("KNN on train vs test", size=20)
plt.plot(num_neighbors, train_quality, label="train")
plt.plot(num_neighbors, test_quality, label="test")
plt.legend()
plt.xticks(num_neighbors)
plt.xlabel("Neighbors", size=12)
plt.ylabel("Accuracy", size=12)
plt.show()


from sklearn.metrics import classification_report

y_true = [0, 1, 2, 2, 2]
y_pred = [0, 0, 2, 2, 1]
target_names = ["class 0", "class 1", "class 2"]
print(classification_report(y_true, y_pred, target_names=target_names))

              precision    recall  f1-score   support

     class 0       0.50      1.00      0.67         1
     class 1       0.00      0.00      0.00         1
     class 2       1.00      0.67      0.80         3

    accuracy                           0.60         5
   macro avg       0.50      0.56      0.49         5
weighted avg       0.70      0.60      0.61         5


import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics

fig, ax = plt.subplots(1, 2, figsize=(10, 4))
fig.tight_layout(pad=3.0)
plt.rcParams.update({"font.size": 16})
# font = {'size':'21'}
ax[0].set_title("Balanced data")
ax[1].set_title("Unbalanced data")

labels = ["Airplane", "Auto", "Bird"]

# Balanced data
air, auto, bird = 150, 150, 150
actual_b = np.array([0] * air + [1] * auto + [2] * bird)
predicted_b = np.array([0] * (air - 10) + [1] * (auto + 20) + [2] * (bird - 10))

# Unbalanced data
air, auto, bird = 430, 10, 10
actual_ub = np.array([0] * air + [1] * auto + [2] * bird)
predicted_ub = np.array([0] * (air + 20) + [1] * (auto - 10) + [2] * (bird - 10))

metrics.ConfusionMatrixDisplay(
    confusion_matrix=metrics.confusion_matrix(actual_b, predicted_b),
    display_labels=labels,
).plot(ax=ax[0])

metrics.ConfusionMatrixDisplay(
    confusion_matrix=metrics.confusion_matrix(actual_ub, predicted_ub),
    display_labels=labels,
).plot(ax=ax[1])

label_font = {"size": "15"}  # Adjust to fit
ax[0].set_xlabel("Predicted labels", fontdict=label_font)
ax[0].set_ylabel("True labels", fontdict=label_font)
ax[1].set_xlabel("Predicted labels", fontdict=label_font)
ax[1].set_ylabel("True labels", fontdict=label_font)

plt.show()

print(
    "Accuracy Balanced   Data:", round(metrics.accuracy_score(actual_b, predicted_b), 2)
)
print(
    "Accuracy Unbalanced Data:",
    round(metrics.accuracy_score(actual_ub, predicted_ub), 2),
)

Accuracy Balanced   Data: 0.96
Accuracy Unbalanced Data: 0.96


print(
    "Balanced accuracy for Balanced data  :",
    round(metrics.balanced_accuracy_score(actual_b, predicted_b), 2),
)
print(
    "Balanced accuracy for Unbalanced data :",
    round(metrics.balanced_accuracy_score(actual_ub, predicted_ub), 2),
)

Balanced accuracy for Balanced data  : 0.96
Balanced accuracy for Unbalanced data : 0.33


import numpy as np


def generate_wave_set(n_support=1000, n_train=25, std=0.3):
    """Generate a noisy sample of the curve sin(x) + 1 on [0, 2*pi].

    Args:
        n_support: number of evenly spaced grid points on [0, 2*pi].
        n_train: number of training points drawn from the grid
            (with replacement).
        std: standard deviation of the normal noise added to the targets.

    Returns:
        dict with the keys
        "support" - the grid,
        "values"  - sin(x) + 1 on the grid (no noise),
        "x_train" - the sampled points, sorted in ascending order,
        "y_train" - sin(x) + 1 at the sampled points plus noise.
    """
    grid = np.linspace(0, 2 * np.pi, num=n_support)
    clean_curve = np.sin(grid) + 1

    # Features: sample grid points with replacement, then sort them
    sampled_x = np.sort(np.random.choice(grid, size=n_train, replace=True))

    # Target: the curve at the sampled points with normal noise on top
    noise = np.random.normal(0, std, size=sampled_x.shape[0])
    noisy_y = np.sin(sampled_x) + 1 + noise

    return {
        "support": grid,
        "values": clean_curve,
        "x_train": sampled_x,
        "y_train": noisy_y,
    }


data = generate_wave_set(1000, 250)


# Split data
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    data["x_train"], data["y_train"], test_size=0.2
)  # 80% training and 20% test

print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)

x_train shape: (200,)
x_test shape: (50,)


from torchvision import datasets
from IPython.display import clear_output

np.random.seed(42)

dataset = datasets.CIFAR10("content", train=True, download=True)

clear_output()

data, _, labels, _ = train_test_split(
    dataset.data / 255,  # normalize
    np.array(dataset.targets),
    train_size=10,  # get only 10 imgs
    random_state=42,
    stratify=dataset.targets,
)
print("Data shape:", data.shape)

Data shape: (10, 32, 32, 3)


import matplotlib.pyplot as plt

fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(10, 5))
for i in range(10):
    axs[i // 5][i % 5].imshow(data[i])
    axs[i // 5][i % 5].set_title(labels[i])
plt.show()


x_test = data[3]


# L1 distance
def compute_L1(a, b):
    """Return the sum of absolute element-wise differences between a and b."""
    abs_diff = np.abs(a - b)
    return abs_diff.sum()


# distance calculation
distances = []
for i in range(10):
    l1 = compute_L1(x_test, data[i])
    distances.append(l1)

distances = np.array(distances)
print(distances)

[ 666.98039216  675.2        1027.75294118    0.          826.68235294
  897.95686275  940.43529412 1264.40784314  729.96470588  717.65098039]


neighbor_index = np.argmin(distances)  # index of nearest neighbor
print(neighbor_index)

3


data_test, _, labels_test, _ = train_test_split(
    dataset.data / 255,  # normalize
    np.array(dataset.targets),
    train_size=10,  # get only 10 imgs
    random_state=24,
    stratify=dataset.targets,
)


fig, axs = plt.subplots(nrows=2, ncols=5, figsize=(10, 5))
for i in range(10):
    axs[i // 5][i % 5].imshow(data_test[i])
    axs[i // 5][i % 5].set_title(labels_test[i])
plt.show()


x_test = data_test[1]

# distance calculation
distances = []
for i in range(10):
    l1 = compute_L1(x_test, data[i])
    distances.append(l1)

distances = np.array(distances)
print(distances)

neighbor_index = np.argmin(distances)
print(labels[neighbor_index])

[ 700.03137255  737.02745098  853.63529412  710.8627451   913.35686275
 1000.6627451   995.59607843 1353.4745098   864.71764706  722.45098039]
6


from sklearn.datasets import load_iris

iris = load_iris()
print(iris.target)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


def count_lables(lables):
    """Count how many times each label occurs in `lables`.

    Returns a dict mapping label -> number of occurrences; keys appear in the
    order in which the labels are first met.
    """
    counts = {}
    for label in lables:
        counts[label] = counts.get(label, 0) + 1
    return counts


def print_split_stat(x_train, x_test, y_train, y_test):
    """Print the labels of a train/test split and the per-class counts.

    Only y_train and y_test are used; x_train and x_test are accepted so the
    result of train_test_split can be passed as is.
    """
    train_stat = count_lables(y_train)
    test_stat = count_lables(y_test)

    print("Train labels: ", y_train)
    print("Test labels:  ", y_test)
    print("Train statistics: ", train_stat)
    print("Test statistics:  ", test_stat)


data, labels = load_iris(return_X_y=True)
print("DataSet labels: ", labels)
print("DataSet statistics: ", count_lables(labels))

DataSet labels:  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]
DataSet statistics:  {0: 50, 1: 50, 2: 50}


x_train, x_test, y_train, y_test = train_test_split(
    data, labels, train_size=0.5, shuffle=False, random_state=42
)

print_split_stat(x_train, x_test, y_train, y_test)

Train labels:  [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1]
Test labels:   [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2]
Train statistics:  {0: 50, 1: 25}
Test statistics:   {1: 25, 2: 50}


x_train, x_test, y_train, y_test = train_test_split(
    data, labels, train_size=0.5, random_state=42
)

print_split_stat(x_train, x_test, y_train, y_test)

Train labels:  [1 2 1 0 1 2 0 0 1 1 0 2 0 0 1 1 2 1 2 2 1 0 0 2 2 0 0 0 1 2 0 2 2 0 1 1 2
 1 2 0 2 1 2 1 1 1 0 1 1 0 1 2 2 0 1 2 2 0 2 0 1 2 2 1 2 1 1 2 2 0 1 2 0 1
 2]
Test labels:   [1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0 0 0 1 0 0 2 1
 0 0 0 2 1 1 0 0 1 2 2 1 2 1 2 1 0 2 1 0 0 0 1 2 0 0 0 1 0 1 2 0 1 2 0 2 2
 1]
Train statistics:  {1: 27, 2: 27, 0: 21}
Test statistics:   {1: 23, 0: 29, 2: 23}


x_train, x_test, y_train, y_test = train_test_split(
    data, labels, train_size=0.5, random_state=42, stratify=labels
)

print_split_stat(x_train, x_test, y_train, y_test)

Train labels:  [0 1 2 1 0 2 0 0 0 2 1 0 2 2 2 0 1 0 1 0 0 2 2 0 0 0 1 1 2 0 2 1 2 1 2 2 0
 1 1 0 1 0 2 2 0 1 2 1 0 1 1 0 2 0 1 0 2 1 2 2 0 2 2 2 1 1 2 1 2 1 0 1 1 1
 0]
Test labels:   [0 0 0 0 2 0 1 2 1 2 1 0 1 1 0 0 0 2 1 1 2 2 2 1 2 2 1 0 1 1 2 0 0 1 0 2 2
 2 1 1 1 0 0 1 2 0 1 0 1 1 1 1 2 0 1 2 2 2 1 0 1 2 2 0 0 2 0 0 2 0 1 0 2 2
 2]
Train statistics:  {0: 25, 1: 25, 2: 25}
Test statistics:   {0: 25, 2: 25, 1: 25}


import random

# random.seed is a function and has to be called. Assigning to it
# (`random.seed = 42`) replaces the function with an int and leaves the
# generator unseeded, so the draws below would differ on every run.
random.seed(42)
# 200 random binary "ground truth" labels
target = [random.randint(0, 1) for _ in range(200)]


target_public = target[:100]
target_private = target[100:]


from sklearn.metrics import accuracy_score

# Simulate 1000 random "submissions": keep the one with the best accuracy on
# the public half and track its accuracy on the private half.
public_accuracy_list = []
private_accuracy_list = []
best_public_accuracy = 0
# Initialise together with best_public_accuracy so that the append at the end
# of the loop body is defined even if no attempt has beaten the initial
# public score yet.
best_private_accuracy = 0

for _ in range(1000):
    # a random guess for all 200 labels
    ans = [random.randint(0, 1) for _ in range(200)]

    public_accuracy = accuracy_score(target_public, ans[:100])
    private_accuracy = accuracy_score(target_private, ans[100:])

    # the submission is selected by its public score only
    if public_accuracy > best_public_accuracy:
        predict = ans
        best_public_accuracy = public_accuracy
        best_private_accuracy = private_accuracy

    # running values of the currently selected submission
    public_accuracy_list.append(best_public_accuracy)
    private_accuracy_list.append(best_private_accuracy)


plt.figure(figsize=(10, 4))
plt.plot(range(1000), public_accuracy_list, label="Public accuracy")
plt.plot(range(1000), private_accuracy_list, label="Private accuracy")
plt.legend()
plt.show()


import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


dataset = sklearn.datasets.load_iris()  # load data
x = dataset.data  # features
y = dataset.target  # labels(classes)

np.random.seed(42)


def split_and_train(x, y, random_state):
    """Plot train and validation accuracy of k-NN for k = 1..30 on one split.

    Args:
        x: features.
        y: labels; also used to stratify the split.
        random_state: seed passed to train_test_split.

    The data is split 80/20 (stratified), a k-NN classifier is fitted for
    every k, and both accuracy curves are shown on one figure. Nothing is
    returned.
    """
    x_train, x_val, y_train, y_val = train_test_split(
        x, y, train_size=0.8, stratify=y, random_state=random_state
    )

    max_neighbors = 30
    num_neighbors = np.arange(1, max_neighbors + 1)  # k values to try
    train_accuracy = np.zeros(max_neighbors)
    val_accuracy = np.zeros(max_neighbors)

    # position `slot` in the result arrays corresponds to k = slot + 1
    for slot, k in enumerate(num_neighbors):
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(x_train, y_train)

        train_pred = knn.predict(x_train)
        val_pred = knn.predict(x_val)
        train_accuracy[slot] = accuracy_score(y_true=y_train, y_pred=train_pred)
        val_accuracy[slot] = accuracy_score(y_true=y_val, y_pred=val_pred)

    # accuracy plot on train and validation data
    plt.figure(figsize=(10, 4))
    plt.title("KNN on train vs val", size=20)
    for curve, curve_name in ((train_accuracy, "train"), (val_accuracy, "val")):
        plt.plot(num_neighbors, curve, label=curve_name)
    plt.legend()
    plt.xticks(num_neighbors, size=12)
    plt.xlabel("Neighbors", size=14)
    plt.ylabel("Accuracy", size=14)
    plt.show()


split_and_train(x, y, random_state=42)


split_and_train(x, y, random_state=4)


from sklearn.model_selection import KFold

x = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
y = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])

print("index without shuffle")
kf = KFold(n_splits=3)
for train_index, test_index in kf.split(x):
    print("TRAIN:", train_index, "TEST:", test_index)

print("index with shuffle")
kf = KFold(n_splits=3, random_state=42, shuffle=True)
for train_index, test_index in kf.split(x):
    print("TRAIN:", train_index, "TEST:", test_index)

index without shuffle
TRAIN: [3 4 5 6 7 8] TEST: [0 1 2]
TRAIN: [0 1 2 6 7 8] TEST: [3 4 5]
TRAIN: [0 1 2 3 4 5] TEST: [6 7 8]
index with shuffle
TRAIN: [0 2 3 4 6 8] TEST: [1 5 7]
TRAIN: [1 3 4 5 6 7] TEST: [0 2 8]
TRAIN: [0 1 2 5 7 8] TEST: [3 4 6]


from sklearn.model_selection import cross_val_score, StratifiedKFold

np.random.seed(42)

dataset = sklearn.datasets.load_iris()  # load data
x = dataset.data  # features
y = dataset.target  # labels(classes)

x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, stratify=y, random_state=42
)

cv = StratifiedKFold(n_splits=5)
knn = KNeighborsClassifier(n_neighbors=3)
accuracy = cross_val_score(knn, x_train, y_train, cv=cv, scoring="accuracy")
print("3NN accuracy: ", accuracy)
print(
    "%0.2f accuracy with a standard deviation of %0.2f"
    % (accuracy.mean(), accuracy.std())
)

knn = KNeighborsClassifier(n_neighbors=5)
accuracy = cross_val_score(knn, x_train, y_train, cv=cv, scoring="accuracy")
print("5NN accuracy: ", accuracy)
print(
    "%0.2f accuracy with a standard deviation of %0.2f"
    % (accuracy.mean(), accuracy.std())
)

3NN accuracy:  [0.91666667 0.95833333 0.95833333 0.95833333 1.        ]
0.96 accuracy with a standard deviation of 0.03
5NN accuracy:  [0.91666667 1.         0.95833333 1.         1.        ]
0.97 accuracy with a standard deviation of 0.03


from sklearn.datasets import make_moons

x, y = make_moons(n_samples=1000, noise=0.3, random_state=42)

plt.figure(figsize=(10, 5))
plt.scatter(x[:, 0], x[:, 1], c=y)
plt.show()


x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, stratify=y, random_state=42
)


from sklearn.model_selection import GridSearchCV

"""
Parameters for GridSearchCV:
estimator — model
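cv — cross-validation splitting strategy (number of folds or a splitter object)
param_grid — dictionary mapping parameter names to the lists of values to try
scoring — metric used to evaluate each combination of parameters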
n_jobs — number of jobs to run in parallel, -1 means using all processors.
"""

model = GridSearchCV(
    estimator=KNeighborsClassifier(),
    cv=KFold(5, shuffle=True, random_state=42),
    param_grid={
        "n_neighbors": np.arange(1, 31),
        "metric": ["euclidean", "manhattan"],
        "weights": ["uniform", "distance"],
    },
    scoring="accuracy",
    n_jobs=-1,
)
model.fit(x_train, y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]),
                         'weights': ['uniform', 'distance']},
             scoring='accuracy')

GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid={'metric': ['euclidean', 'manhattan'],
                         'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30]),
                         'weights': ['uniform', 'distance']},
             scoring='accuracy')

KNeighborsClassifier()

KNeighborsClassifier()


print("Metric:", model.best_params_["metric"])
print("Num neighbors:", model.best_params_["n_neighbors"])
print("Weigths:", model.best_params_["weights"])

Metric: euclidean
Num neighbors: 30
Weigths: distance


from sklearn.metrics import balanced_accuracy_score

# Predict on the held-out test split with the fitted grid-search model.
y_pred = model.predict(x_test)

# Plain accuracy, expressed as a percentage rounded to two decimals.
accuracy_percent = np.round(accuracy_score(y_true=y_test, y_pred=y_pred) * 100, 2)
print(f"Percent correct predictions {accuracy_percent} %")

# Balanced accuracy, expressed the same way.
balanced_percent = np.round(
    balanced_accuracy_score(y_true=y_test, y_pred=y_pred) * 100, 2
)
print(f"Percent correct predictions (balanced classes) {balanced_percent} %")

Percent correct predictions 95.5 %
Percent correct predictions (balanced classes) 95.5 %


# Names of everything the grid search recorded; iterating a dict yields its keys.
list(model.cv_results_)

['mean_fit_time',
 'std_fit_time',
 'mean_score_time',
 'std_score_time',
 'param_metric',
 'param_n_neighbors',
 'param_weights',
 'params',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'mean_test_score',
 'std_test_score',
 'rank_test_score']


# Mean cross-validation score of every grid point, indexed by its position
# in cv_results_.
mean_scores = model.cv_results_["mean_test_score"]

plt.figure(figsize=(10, 4))
plt.plot(mean_scores)
plt.ylabel("Accuracy", size=15)
plt.xlabel("Num of experiment", size=15)
plt.title("mean_test_score", size=20)
plt.show()


# Gather the CV mean/std of every grid point that shares the winning metric
# and weighting scheme, so that only n_neighbors varies across the selection.
cv_results = model.cv_results_
best = model.best_params_

num_neighbors = []
selected_means = []
selected_std = []
for params, mean_score, std_score in zip(
    cv_results["params"],
    cv_results["mean_test_score"],
    cv_results["std_test_score"],
):
    # Skip grid points that differ from the best combination in metric or weights.
    if params["metric"] != best["metric"] or params["weights"] != best["weights"]:
        continue
    num_neighbors.append(params["n_neighbors"])
    selected_means.append(mean_score)
    selected_std.append(std_score)


# Plot the mean CV score (with std as error bars) against the number of
# neighbors for the metric/weights combination chosen by the grid search.
#
# The title labels come from model.best_params_, not from the leftover loop
# variable `params`: after the selection loop `params` holds the LAST grid
# entry, which need not be the combination whose scores are plotted here.
best_metric = model.best_params_["metric"]
best_weights = model.best_params_["weights"]

plt.figure(figsize=(10, 4))
plt.title(f"KNN CV, {best_metric}, {best_weights}", size=18)
plt.errorbar(num_neighbors, selected_means, yerr=selected_std, fmt="-o")
plt.xticks(num_neighbors, size=13)
plt.ylabel("Mean_test_score", size=15)
plt.xlabel("Neighbors", size=15)

plt.show()

	UniqueID	DateOfBite	Species	Breed	Age	Gender	SpayNeuter	Borough	ZipCode
0	1	January 01 2018	DOG	UNKNOWN	NaN	U	False	Brooklyn	11220
1	2	January 04 2018	DOG	UNKNOWN	NaN	U	False	Brooklyn	NaN
2	3	January 06 2018	DOG	Pit Bull	NaN	U	False	Brooklyn	11224

	alcohol	malic_acid	ash	alcalinity_of_ash	magnesium	total_phenols	flavanoids	nonflavanoid_phenols	proanthocyanins	color_intensity	hue	od280/od315_of_diluted_wines	proline
0	14.23	1.71	2.43	15.6	127.0	2.80	3.06	0.28	2.29	5.64	1.04	3.92	1065.0
1	13.20	1.78	2.14	11.2	100.0	2.65	2.76	0.26	1.28	4.38	1.05	3.40	1050.0
2	13.16	2.36	2.67	18.6	101.0	2.80	3.24	0.30	2.81	5.68	1.03	3.17	1185.0
3	14.37	1.95	2.50	16.8	113.0	3.85	3.49	0.24	2.18	7.80	0.86	3.45	1480.0
4	13.24	2.59	2.87	21.0	118.0	2.80	2.69	0.39	1.82	4.32	1.04	2.93	735.0

	0	1	2	3	4	5	6	7	8	9	...	20	21	22	23	24	25	26	27	28	29
count	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	...	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000	569.000000
mean	14.127292	19.289649	91.969033	654.889104	0.096360	0.104341	0.088799	0.048919	0.181162	0.062798	...	16.269190	25.677223	107.261213	880.583128	0.132369	0.254265	0.272188	0.114606	0.290076	0.083946
std	3.524049	4.301036	24.298981	351.914129	0.014064	0.052813	0.079720	0.038803	0.027414	0.007060	...	4.833242	6.146258	33.602542	569.356993	0.022832	0.157336	0.208624	0.065732	0.061867	0.018061
min	6.981000	9.710000	43.790000	143.500000	0.052630	0.019380	0.000000	0.000000	0.106000	0.049960	...	7.930000	12.020000	50.410000	185.200000	0.071170	0.027290	0.000000	0.000000	0.156500	0.055040
25%	11.700000	16.170000	75.170000	420.300000	0.086370	0.064920	0.029560	0.020310	0.161900	0.057700	...	13.010000	21.080000	84.110000	515.300000	0.116600	0.147200	0.114500	0.064930	0.250400	0.071460
50%	13.370000	18.840000	86.240000	551.100000	0.095870	0.092630	0.061540	0.033500	0.179200	0.061540	...	14.970000	25.410000	97.660000	686.500000	0.131300	0.211900	0.226700	0.099930	0.282200	0.080040
75%	15.780000	21.800000	104.100000	782.700000	0.105300	0.130400	0.130700	0.074000	0.195700	0.066120	...	18.790000	29.720000	125.400000	1084.000000	0.146000	0.339100	0.382900	0.161400	0.317900	0.092080
max	28.110000	39.280000	188.500000	2501.000000	0.163400	0.345400	0.426800	0.201200	0.304000	0.097440	...	36.040000	49.540000	251.200000	4254.000000	0.222600	1.058000	1.252000	0.291000	0.663800	0.207500

	0	1	2	3	4	5	6	7	8	9	...	20	21	22	23	24	25	26	27	28	29
count	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02	...	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02	5.690000e+02
mean	-3.153111e-15	-6.568462e-15	-6.993039e-16	-8.553985e-16	6.081447e-15	-1.136369e-15	-2.997017e-16	1.023981e-15	-1.860648e-15	-1.504752e-15	...	-2.297713e-15	1.742016e-15	-1.198807e-15	6.118909e-16	-5.094929e-15	-2.122887e-15	6.118909e-16	-1.998011e-16	-2.422589e-15	2.497514e-15
std	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00	...	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00	1.000880e+00
min	-2.029648e+00	-2.229249e+00	-1.984504e+00	-1.454443e+00	-3.112085e+00	-1.610136e+00	-1.114873e+00	-1.261820e+00	-2.744117e+00	-1.819865e+00	...	-1.726901e+00	-2.223994e+00	-1.693361e+00	-1.222423e+00	-2.682695e+00	-1.443878e+00	-1.305831e+00	-1.745063e+00	-2.160960e+00	-1.601839e+00
25%	-6.893853e-01	-7.259631e-01	-6.919555e-01	-6.671955e-01	-7.109628e-01	-7.470860e-01	-7.437479e-01	-7.379438e-01	-7.032397e-01	-7.226392e-01	...	-6.749213e-01	-7.486293e-01	-6.895783e-01	-6.421359e-01	-6.912304e-01	-6.810833e-01	-7.565142e-01	-7.563999e-01	-6.418637e-01	-6.919118e-01
50%	-2.150816e-01	-1.046362e-01	-2.359800e-01	-2.951869e-01	-3.489108e-02	-2.219405e-01	-3.422399e-01	-3.977212e-01	-7.162650e-02	-1.782793e-01	...	-2.690395e-01	-4.351564e-02	-2.859802e-01	-3.411812e-01	-4.684277e-02	-2.695009e-01	-2.182321e-01	-2.234689e-01	-1.274095e-01	-2.164441e-01
75%	4.693926e-01	5.841756e-01	4.996769e-01	3.635073e-01	6.361990e-01	4.938569e-01	5.260619e-01	6.469351e-01	5.307792e-01	4.709834e-01	...	5.220158e-01	6.583411e-01	5.402790e-01	3.575891e-01	5.975448e-01	5.396688e-01	5.311411e-01	7.125100e-01	4.501382e-01	4.507624e-01
max	3.971288e+00	4.651889e+00	3.976130e+00	5.250529e+00	4.770911e+00	4.568425e+00	4.243589e+00	3.927930e+00	4.484751e+00	4.910919e+00	...	4.094189e+00	3.885905e+00	4.287337e+00	5.930172e+00	3.955374e+00	5.112877e+00	4.700669e+00	2.685877e+00	6.046041e+00	6.846856e+00

	$\large y=1$	$\large y=0$
$\large \widehat{y}=1$	$\large \text{True Positive (TP)}$	$\large \text{False Positive (FP)}$
$\large \widehat{y}=0$	$\large \text{False Negative (FN)}$	$\large \text{True Negative (TN)}$

Задача курса¶

AI, ML, DL¶

Области применения¶

Связь с наукой¶

Обзор курса¶

Лекция 1 Intro¶

Лекция 2 Линейный классификатор¶

Лекция 3 Классическое машинное обучение¶

Лекция 4 Генерация и отбор признаков¶

Лекция 5 Нейронные сети¶

Лекция 6 Свёрточные нейронные сети¶

Лекция 7 Улучшение сходимости нейросетей и борьба с переобучением¶

Лекция 8 Рекуррентные нейронные сети¶

Лекция 9 Архитектуры CNN¶

Лекция 10 Explainability¶

Лекция 11 Обучение на реальных данных¶

Лекция 12 Сегментация и детектирование¶

Лекция 13 Автоэнкодеры¶

Лекция 14 Генеративные сети¶

Лекция 15 Обучение с подкреплением¶

Базовые задачи¶

Классификация¶

Регрессия¶

Кластеризация¶

Комбинированные задачи¶

AlphaFold¶

Подробнее про AlphaFold¶

Инструменты¶

Контейнеры¶

List¶

NumPy¶

Torch.Tensor¶

План исследования¶

Сбор и подготовка данных¶

Извлечение закономерностей¶

Валидация результата¶

Пример ML задачи¶

Вариант №1¶

Вариант №2¶

Данные¶

Связность данных¶

Загрузка и визуализация данных¶

Табличные данные¶

Изображения¶

Алгоритм k-NN¶

Описание модели¶

Описание данных¶

Близость данных согласно метрике¶

Нормализация данных¶

Параметры и гиперпараметры модели¶

k-NN для классификации¶

Переобучение k-NN¶

Метрики¶

Accuracy¶

Precision, Recall¶

F-мера¶

AUC-ROC¶

Построение¶

Смысл метрики¶

Разделение train-validation-test¶

Примеры ошибок в данных и при разбиении¶

Утечка данных¶

Дублирование данных¶

Утечка, спрятанная в признаках¶

Разделение на train и test¶

Перемешивание данных¶

Данные из различных источников¶

Подбор гиперпараметров на тестовой выборке¶

Гиперпараметры модели¶

Проблема подбора гиперпараметров¶

Кросс-валидация¶

Алгоритм кросс-валидации¶

Оценка результата кросс-валидации¶

Типичные ошибки при кросс-валидации¶

Кросс-валидация для научных исследований: на что обратить внимание¶

GridSearch¶

RandomizedSearch¶