!wget -qN https://edunet.kea.su/repo/EduNet-web_dependencies/dev-2.0/L05/lc_mnist_weights.txt
!wget -qN https://edunet.kea.su/repo/EduNet-web_dependencies/dev-2.0/L05/lc_cifar10_weights.txt


import numpy as np
import matplotlib.pyplot as plt

# Default figure size for the plots below
plt.rcParams["figure.figsize"] = (25, 10)

W = np.loadtxt("lc_mnist_weights.txt")  # load weights, shape (785, 10)
print(f"Shape with bias: {W.shape}")

# Remove bias: drop the last row, leaving one weight per pixel (784 = 28 * 28)
W = W[:-1, :]
print(f"Shape without bias: {W.shape}")

# Normalize: min-max scale all weights jointly into the [0, 255] range
w_min, w_max = np.min(W), np.max(W)
templates = 255 * (W - w_min) / (w_max - w_min)

# Display templates: every column of W, reshaped to 28x28, is drawn as one class image
labels_names = [str(i) for i in range(10)]
for i in range(10):
    plt.subplot(1, 10, i + 1)
    img = templates[:, i].reshape(28, 28).astype(int)
    plt.imshow(img, cmap="gray")
    plt.axis("off")
    plt.title(labels_names[i], size=25)

Shape with bias: (785, 10)
Shape without bias: (784, 10)


# Default figure size for the plots below
plt.rcParams["figure.figsize"] = (25, 10)

W = np.loadtxt("lc_cifar10_weights.txt")  # load weights, shape (3073, 10)
print(f"Shape with bias: {W.shape}")

# Remove bias: drop the last row, leaving one weight per input value (3072 = 3 * 32 * 32)
W = W[:-1, :]
print(f"Shape without bias: {W.shape}")

# Normalize: min-max scale all weights jointly into the [0, 255] range
w_min, w_max = np.min(W), np.max(W)
templates = 255 * (W - w_min) / (w_max - w_min)

# Display templates: one column of W per class
labels_names = [
    "plane",
    "car",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
]
for i in range(10):
    plt.subplot(1, 10, i + 1)
    # reshape to (channels, H, W), then move channels last: imshow expects (H, W, channels)
    img = templates[:, i].reshape(3, 32, 32).transpose(1, 2, 0).astype(int)
    plt.imshow(img)
    plt.axis("off")
    plt.title(labels_names[i], size=25)

Shape with bias: (3073, 10)
Shape without bias: (3072, 10)


import numpy as np

x = np.random.rand(3072)  # random image
W1 = np.random.randn(3072, 100) * 0.0001  # without bias
W2 = np.random.randn(100, 10) * 0.0001  # without bias

scores1 = np.matmul(x, W1)  # matrix multiplication, equivalent x @ W1
scores2 = np.matmul(scores1, W2)  # matrix multiplication, of the next classifier

print(f"First classifier shape: {scores1.shape}")
print(f"Second classifier shape: {scores2.shape}")

First classifier shape: (100,)
Second classifier shape: (10,)


def sigmoid(s):
    """Return the logistic function 1 / (1 + exp(-s)) of `s`."""
    denominator = 1 + np.exp(-s)
    return 1 / denominator


# wrap `sigmoid` with np.vectorize so it is applied to an ndarray element by element
sigmoid_np = np.vectorize(sigmoid)

scores1 = np.matmul(x, W1)  # first linear classifier
activations = sigmoid_np(scores1)  # values after non-linear function
scores2 = np.matmul(activations, W2)  # second linear classifier on top of the activations

print(f"First classifier shape: {scores1.shape}")
# report the shape of the activations themselves (previously printed scores1.shape)
print(f"Activations shape: {activations.shape}")
print(f"Second classifier shape: {scores2.shape}")

First classifier shape: (100,)
Activations shape: (100,)
Second classifier shape: (10,)


class NeuralNet:
    """Two-layer fully connected network without biases: Linear -> Sigmoid -> Linear.

    Weights are drawn from a standard normal distribution and multiplied by
    `weight_scale`. The defaults reproduce the original hard-coded
    configuration (3072 inputs, 100 hidden units, 10 outputs, scale 0.0001).
    """

    def __init__(self, input_size=3072, hidden_size=100, num_classes=10, weight_scale=0.0001):
        self.W1 = np.random.randn(input_size, hidden_size) * weight_scale
        self.W2 = np.random.randn(hidden_size, num_classes) * weight_scale

    def predict(self, x):
        """Return the output scores for input `x`.

        `x` must have `input_size` values along its last axis; the result has
        `num_classes` values along its last axis.
        """
        # Use the instance's own weights. The previous version read the
        # module-level W1 / W2, so the weights created in __init__ were ignored.
        scores1 = np.matmul(x, self.W1)  # Linear
        # np.exp already works element-wise, so no np.vectorize wrapper is needed
        activations = 1 / (1 + np.exp(-scores1))  # activation Sigmoid
        scores2 = np.matmul(activations, self.W2)  # Linear

        return scores2


x = np.random.rand(3072)  # image
model = NeuralNet()
scores = model.predict(x)
print(f"Model output shape: {scores.shape}")

Model output shape: (10,)


from IPython.display import HTML
from base64 import b64encode

!wget -qN https://edunet.kea.su/repo/EduNet-content/dev-2.0/L05/out/universal_approximation.mp4

# Read the video inside a context manager so the file handle is always closed
with open("universal_approximation.mp4", "rb") as video_file:
    mp4 = video_file.read()

# Embed the video in the notebook as a base64 data URL.
# The src value is quoted because base64 padding may contain '=',
# which is not allowed inside an unquoted HTML attribute value.
data_url = f"data:video/mp4;base64,{b64encode(mp4).decode()}"
HTML(f"<video width=1000  controls><source src='{data_url}' type='video/mp4'></video>")


import torch

a = torch.Tensor()


a = torch.tensor([1.1, 2.2, 3.2])
a.dtype

torch.float32


a = torch.tensor([1.1, 2.2, 3.2], dtype=torch.float64)
a.dtype

torch.float64


a = torch.ones(size=(3, 2))
a.size()

torch.Size([3, 2])


a = torch.full(size=(3, 2), fill_value=3.74)
a

tensor([[3.7400, 3.7400],
        [3.7400, 3.7400],
        [3.7400, 3.7400]])


a = a.T
a

tensor([[3.7400, 3.7400, 3.7400],
        [3.7400, 3.7400, 3.7400]])


c = torch.exp(a)
print("Exponents tensor:\n", c)

c += 1
print("\nAdd 1 to tensor:\n", c)

Exponents tensor:
 tensor([[42.0980, 42.0980, 42.0980],
        [42.0980, 42.0980, 42.0980]])

Add 1 to tensor:
 tensor([[43.0980, 43.0980, 43.0980],
        [43.0980, 43.0980, 43.0980]])


c.sum()

tensor(258.5880)


a = torch.zeros((2, 5, 1, 8))
print("Original tensor size:\n", a.size())

a = a.permute(dims=(2, 0, 3, 1))  # reorder dimensions: new dim i is old dim dims[i]
print("After permute tensor size:\n", a.size())

a = a.squeeze()  # remove every dimension of size 1 (here the leading one)
print("After squzee tensor size:\n", a.size())

a = a.unsqueeze(dim=0)  # insert a new dimension of size 1 at position 0
print("After unsquzee tensor size:\n", a.size())

Original tensor size:
 torch.Size([2, 5, 1, 8])
After permute tensor size:
 torch.Size([1, 2, 8, 5])
After squzee tensor size:
 torch.Size([2, 8, 5])
After unsquzee tensor size:
 torch.Size([1, 2, 8, 5])


a.numpy()

array([[[[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]]]], dtype=float32)


a = torch.rand(2, 8)
print("Original tensor:\n", a)

# view() returns a tensor with a new shape that refers to the same underlying data
b = a.view(4, 4)
print("\nTensor after view:\n", b)

print("\nTensor b uses the same memory space as tensor a:")
# Compare the addresses of the first elements of the underlying data.
# id(a[0, 0]) == id(b[0, 0]) is not a valid check: each indexing creates a
# temporary Python object, and the ids of short-lived objects can coincide.
a.data_ptr() == b.data_ptr()

Original tensor:
 tensor([[0.3213, 0.5171, 0.2343, 0.0594, 0.4392, 0.0780, 0.9155, 0.5565],
        [0.2291, 0.6820, 0.3036, 0.7657, 0.6133, 0.3067, 0.3429, 0.2563]])

Tensor after view:
 tensor([[0.3213, 0.5171, 0.2343, 0.0594],
        [0.4392, 0.0780, 0.9155, 0.5565],
        [0.2291, 0.6820, 0.3036, 0.7657],
        [0.6133, 0.3067, 0.3429, 0.2563]])

Tensor b uses the same memory space as tensor a:

True


# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Cuda available: {torch.cuda.is_available()} \n")

a = a.to(device)  # move the tensor to the selected device
b = torch.full_like(a, 2).to(device)  # tensor of 2s with the same shape as a
c = a * b  # computed on the selected device (faster on a GPU thanks to parallelism)
c

Cuda available: True

tensor([[0.6425, 1.0342, 0.4686, 0.1189, 0.8785, 0.1560, 1.8310, 1.1131],
        [0.4583, 1.3640, 0.6071, 1.5313, 1.2266, 0.6134, 0.6858, 0.5127]],
       device='cuda:0')


x = torch.tensor([-1.0, -2.0])  # x from above example

x = torch.cat([x, torch.tensor([1.0])])  # concatenate x with 1. for bias trick

W = torch.tensor([2.0, -3.0, -3.0], requires_grad=True)  # w from above example

print(f"W.grad = {W.grad} (before forward and backward pass grad is 'None')")

# forward pass to compute f
s = x.matmul(W)
f = torch.sigmoid(s)
print(f"f(x, W) = {f:.2f}")

# backward pass to compute gradient df/dW
f.backward()
print(f"W.grad = {W.grad}")

W.grad = None (before forward and backward pass grad is 'None')
f(x, W) = 0.73
W.grad = tensor([-0.1966, -0.3932,  0.1966])


print(f"x.grad = {x.grad}")

x.grad = None


f_detached = f.detach()

print(f"f_detached = {f_detached:.2f}")
print(f"f_detached type: {type(f_detached)}")

f_detached = 0.73
f_detached type: <class 'torch.Tensor'>


value = f_detached.item()

print(f"value = {value:.2f}")
print(f"value type: {type(value)}")

value = 0.73
value type: <class 'float'>


x_train = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
y_train = torch.tensor([2, 4, 6, 8], dtype=torch.float32)

# This is the parameter we want to optimize -> requires_grad=True
W = torch.tensor(1.0, dtype=torch.float32, requires_grad=True)
print(f"W.grad = {W.grad} (before forward pass must be 'None')\n")

# forward pass to compute MSE
y_pred = W * x_train
E = y_pred - y_train
SE = E**2
MSE = SE.mean()
print(f"MSE = {MSE}")

# backward pass to compute gradient dMSE/dw
MSE.backward()
print(f"W.grad = {W.grad}")

W.grad = None (before forward pass must be 'None')

MSE = 7.5
W.grad = -15.0


x_train = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
y_train = torch.tensor([2, 4, 6, 8], dtype=torch.float32)

# This is the parameter we want to optimize -> requires_grad=True
W = torch.tensor(1.0, dtype=torch.float32, requires_grad=True)

# forward pass to compute MSE
y_pred = W * x_train
E = y_pred - y_train
E.retain_grad()  # keep the gradient of the intermediate tensor E after backward
SE = E**2
MSE = SE.mean()

print("========== Backprop 1 ==============")
# retain_graph=True keeps the graph so backward can be called again below
MSE.backward(retain_graph=True)
print(f"dMSE/dE = {E.grad}")
print(f"dMSE/dW = {W.grad}")

print("========== Backprop 2 ==============")
MSE.backward(retain_graph=True)
# Gradients are accumulated: both E.grad and W.grad are now doubled
print(f"dMSE/dE = {E.grad}")
print(f"dMSE/dW = {W.grad}")

print("========== Backprop 3 ==============")
W.grad.zero_()  # reset the gradient of W before the next backward pass
MSE.backward(retain_graph=True)
# W.grad was reset, so it holds a single pass again; E.grad was not reset and keeps accumulating
print(f"dMSE/dE = {E.grad}")
print(f"dMSE/dW = {W.grad}")

========== Backprop 1 ==============
dMSE/dE = tensor([-0.5000, -1.0000, -1.5000, -2.0000])
dMSE/dW = -15.0
========== Backprop 2 ==============
dMSE/dE = tensor([-1., -2., -3., -4.])
dMSE/dW = -30.0
========== Backprop 3 ==============
dMSE/dE = tensor([-1.5000, -3.0000, -4.5000, -6.0000])
dMSE/dW = -15.0


x_train = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
y_train = torch.tensor([2, 4, 6, 8], dtype=torch.float32)

W = torch.tensor(1.0, dtype=torch.float32, requires_grad=True)


# Define model output
def forward(x_train):
    """Return the model prediction: the module-level weight W times `x_train`."""
    prediction = W * x_train
    return prediction


# Compute MSE loss
def criterion(y_pred, y_train):
    """Return the mean squared error between `y_pred` and `y_train`."""
    squared_error = (y_pred - y_train) ** 2
    return squared_error.mean()


print(f"Prediction before training: f(x) = {forward(x_train)}")
print(f"True values: y = {y_train}\n")

# Training
learning_rate = 0.005
num_epochs = 102

for epoch in range(num_epochs):
    # Propagate forward
    y_pred = forward(x_train)

    # Compute MSE loss
    MSE = criterion(y_pred, y_train)

    # Propagate backward, compute gradients
    MSE.backward()

    # Update weights
    with torch.no_grad():  #  We don't want this step to be the part of the computational graph
        W -= learning_rate * W.grad

    # Nullify gradients after updating to avoid their accumulation
    W.grad.zero_()

    if epoch % 10 == 1:
        print(f"epoch {epoch}: w = {W.item():.3f}, loss = {MSE.item():.8f}")

print(f"\nPrediction after training: f(x) = {forward(x_train)}")
print(f"True values: y = {y_train}")

Prediction before training: f(x) = tensor([1., 2., 3., 4.], grad_fn=<MulBackward0>)
True values: y = tensor([2., 4., 6., 8.])

epoch 1: w = 1.144, loss = 6.41718674
epoch 11: w = 1.608, loss = 1.34952068
epoch 21: w = 1.820, loss = 0.28380114
epoch 31: w = 1.917, loss = 0.05968266
epoch 41: w = 1.962, loss = 0.01255111
epoch 51: w = 1.983, loss = 0.00263946
epoch 61: w = 1.992, loss = 0.00055505
epoch 71: w = 1.996, loss = 0.00011674
epoch 81: w = 1.998, loss = 0.00002455
epoch 91: w = 1.999, loss = 0.00000516
epoch 101: w = 2.000, loss = 0.00000109

Prediction after training: f(x) = tensor([1.9996, 3.9993, 5.9989, 7.9986], grad_fn=<MulBackward0>)
True values: y = tensor([2., 4., 6., 8.])


import torch.nn as nn

x_train = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
y_train = torch.tensor([2, 4, 6, 8], dtype=torch.float32)

W = torch.tensor(1.0, dtype=torch.float32, requires_grad=True)


# Define model output
def forward(x_train):
    """Return the model prediction: the module-level weight W times `x_train`."""
    prediction = W * x_train
    return prediction


print(f"Prediction before training: f(x) = {forward(x_train)}")
print(f"True values: y = {y_train}\n")

# Training
learning_rate = 0.005
num_epochs = 102

criterion = nn.MSELoss()
optimizer = torch.optim.SGD([W], lr=learning_rate)

for epoch in range(num_epochs):
    # Propagate forward
    y_pred = forward(x_train)

    # Compute MSE loss
    MSE = criterion(y_pred, y_train)

    # Propagate backward, compute gradients
    MSE.backward()

    # Update weights
    optimizer.step()

    # Nullify gradients after updating to avoid their accumulation
    optimizer.zero_grad()

    if epoch % 10 == 1:
        print(f"epoch {epoch}: w = {W.item():.3f}, loss = {MSE.item():.8f}")

print(f"\nPrediction after training: f(x) = {forward(x_train)}")
print(f"True values: y = {y_train}")

Prediction before training: f(x) = tensor([1., 2., 3., 4.], grad_fn=<MulBackward0>)
True values: y = tensor([2., 4., 6., 8.])

epoch 1: w = 1.144, loss = 6.41718674
epoch 11: w = 1.608, loss = 1.34951985
epoch 21: w = 1.820, loss = 0.28380090
epoch 31: w = 1.917, loss = 0.05968266
epoch 41: w = 1.962, loss = 0.01255111
epoch 51: w = 1.983, loss = 0.00263946
epoch 61: w = 1.992, loss = 0.00055505
epoch 71: w = 1.996, loss = 0.00011674
epoch 81: w = 1.998, loss = 0.00002455
epoch 91: w = 1.999, loss = 0.00000516
epoch 101: w = 2.000, loss = 0.00000109

Prediction after training: f(x) = tensor([1.9996, 3.9993, 5.9989, 7.9986], grad_fn=<MulBackward0>)
True values: y = tensor([2., 4., 6., 8.])

torch.nn.MSELoss()


criterion = nn.MSELoss()

# batch of 1 element and 3 components in output vector
# torch.tensor is the recommended factory for building a tensor from data;
# the legacy torch.Tensor constructor is kept out of the examples for consistency
model_output = torch.tensor([0.5, -0.25, 0.75])
print(f"model_output: {model_output}")

target = torch.tensor([1.0, 0.25, 0.25])
print(f"target: {target}")

# mean of squared element-wise differences
loss_mse = criterion(model_output, target)
print(f"loss_mse: {loss_mse}")

model_output: tensor([ 0.5000, -0.2500,  0.7500])
target: tensor([1.0000, 0.2500, 0.2500])
loss_mse: 0.25

torch.nn.L1Loss()


criterion = nn.L1Loss()

# batch of 1 element and 3 components in output vector
# torch.tensor is the recommended factory for building a tensor from data;
# the legacy torch.Tensor constructor is kept out of the examples for consistency
model_output = torch.tensor([0.5, -0.25, 0.75])
print(f"model_output: {model_output}")

target = torch.tensor([1.0, 0.25, 0.25])
print(f"target: {target}")

# mean of absolute element-wise differences
loss_mae = criterion(model_output, target)
print(f"loss_mae: {loss_mae}")

model_output: tensor([ 0.5000, -0.2500,  0.7500])
target: tensor([1.0000, 0.2500, 0.2500])
loss_mae: 0.5

torch.nn.CrossEntropyLoss()


criterion = nn.CrossEntropyLoss()


# fmt: off
model_output = torch.tensor([[2.4, 1.9, 7.3],
                             [9.5, 2.7, 4.0],
                             [5.7, 4.1, 0.2]])  # logits
# fmt: on

print(f"model_output:\n {model_output}")

target = torch.tensor([2, 0, 1], dtype=torch.long)  # class labels
print(f"target: {target}")

loss_ce = criterion(model_output, target)
print(f"loss_ce: {loss_ce}")

model_output:
 tensor([[2.4000, 1.9000, 7.3000],
        [9.5000, 2.7000, 4.0000],
        [5.7000, 4.1000, 0.2000]])
target: tensor([2, 0, 1])
loss_ce: 0.6014580726623535


import numpy as np

ce_1 = -np.log(np.exp(7.3) / (np.exp(2.4) + np.exp(1.9) + np.exp(7.3)))
ce_2 = -np.log(np.exp(9.5) / (np.exp(9.5) + np.exp(2.7) + np.exp(4.0)))
ce_3 = -np.log(np.exp(4.1) / (np.exp(5.7) + np.exp(4.1) + np.exp(0.2)))

ce = (1 / 3) * (ce_1 + ce_2 + ce_3)
print(f"hand-calculated loss_ce: {ce}")

hand-calculated loss_ce: 0.601458161156722


# fmt: off
# Scores for batch of two samples
model_output = torch.tensor([[30.0, 2.0],
                             [30.0, 2.0]])

target = torch.tensor([0, 1])  # Second sample belongs to class 1
# but logit for class 0 is greater: 30 > 2. So it was misclassified
# fmt: on


criterion = torch.nn.CrossEntropyLoss()
loss = criterion(model_output, target)
print(f"Loss = {loss.item():.2f}")

Loss = 14.00


weights = torch.tensor([0.2, 0.8])
criterion = torch.nn.CrossEntropyLoss(weight=weights)
loss = criterion(model_output, target)
print(f"Loss = {loss.item():.2f}")

Loss = 22.40


criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor([1.0, 4.0]))
loss = criterion(model_output, target)
print(f"Loss = {loss.item():.2f}")

Loss = 22.40


#!wget -qN https://raw.githubusercontent.com/AdeelH/pytorch-multi-class-focal-loss/master/focal_loss.py
!wget -qN https://edunet.kea.su/repo/EduNet-web_dependencies/dev-2.0/L05/focal_loss.py


from focal_loss import FocalLoss


criterion = FocalLoss(alpha=None, gamma=2.0)

# fmt: off
model_output = torch.tensor([[2.4, 1.9, 7.3],
                             [9.5, 2.7, 4.0],
                             [5.7, 4.1, 0.2]])  # model output is logits, as in CrossEntropyLoss
# fmt: on
print(f"model_output:\n {model_output}")

target = torch.tensor([2, 0, 1], dtype=torch.long)  # class labels
print(f"target: {target}")

loss_fl = criterion(model_output, target)
print(f"loss_fl: {loss_fl}")

model_output:
 tensor([[2.4000, 1.9000, 7.3000],
        [9.5000, 2.7000, 4.0000],
        [5.7000, 4.1000, 0.2000]])
target: tensor([2, 0, 1])
loss_fl: 0.4129861891269684


criterion = nn.NLLLoss()
logsoftmax = nn.LogSoftmax(dim=1)

print(f"model_output:\n {model_output}")

logprobs = logsoftmax(model_output)
print(f"logprobs:\n {logprobs}")

print(f"target: {target}")

loss_nll = criterion(logprobs, target)
print(f"loss_nll: {loss_nll}")

model_output:
 tensor([[2.4000, 1.9000, 7.3000],
        [9.5000, 2.7000, 4.0000],
        [5.7000, 4.1000, 0.2000]])
logprobs:
 tensor([[-4.9119e+00, -5.4119e+00, -1.1892e-02],
        [-5.1870e-03, -6.8052e+00, -5.5052e+00],
        [-1.8730e-01, -1.7873e+00, -5.6873e+00]])
target: tensor([2, 0, 1])
loss_nll: 0.6014580726623535

torch.nn.BCELoss()


criterion = nn.BCELoss()

model_output = torch.rand(1)
print(f"model_output: {model_output}")

target = torch.empty(1).random_(2)
print(f"target: {target}")

loss_bce = criterion(model_output, target)
print(f"loss_bce: {loss_bce}")

model_output: tensor([0.2704])
target: tensor([0.])
loss_bce: 0.31529176235198975


criterion = nn.BCELoss()

model_output = torch.ones(5)
print(f"model_output: {model_output}")

target = torch.zeros(5)
print(f"target: {target}")

loss_bce = criterion(model_output, target)
print(f"loss_bce: {loss_bce}")

model_output: tensor([1., 1., 1., 1., 1.])
target: tensor([0., 0., 0., 0., 0.])
loss_bce: 100.0

torch.nn.Sigmoid()


activation = nn.Sigmoid()
input_values = torch.randn(5) * 5
activation_sig = activation(input_values)
print(f"input_values: {input_values}\nactivation_sig: {activation_sig}")

input_values: tensor([ -6.0099,   3.1845, -10.8239,  -0.1848,   2.1212])
activation_sig: tensor([2.4484e-03, 9.6025e-01, 1.9916e-05, 4.5393e-01, 8.9295e-01])

torch.nn.Tanh()


activation = nn.Tanh()
input_values = torch.tensor([11.1529, 4.3029, 0.5081, -3.8456, -1.9058])
activation_tanh = activation(input_values)
print(f"input_values: {input_values}\nactivation_tanh: {activation_tanh}")

input_values: tensor([11.1529,  4.3029,  0.5081, -3.8456, -1.9058])
activation_tanh: tensor([ 1.0000,  0.9996,  0.4685, -0.9991, -0.9567])

torch.nn.ReLU()


activation = nn.ReLU()
input_values = torch.randn(5)
activation_relu = activation(input_values)
print(f"input_values: {input_values}\nactivation_relu: {activation_relu}")

input_values: tensor([ 0.3004,  0.6543, -0.5772,  0.4165,  1.4428])
activation_relu: tensor([0.3004, 0.6543, 0.0000, 0.4165, 1.4428])

torch.nn.LeakyReLU()


activation = nn.LeakyReLU(0.01)
input_values = torch.randn(5)
activation_lrelu = activation(input_values)
print(f"input_values: {input_values}\nactivation_lrelu: {activation_lrelu}")

input_values: tensor([-0.7783,  0.3690,  0.9008, -0.7253,  0.5118])
activation_lrelu: tensor([-0.0078,  0.3690,  0.9008, -0.0073,  0.5118])

torch.nn.GELU()


activation = nn.GELU()
input_values = torch.randn(5) * 5
activation_gelu = activation(input_values)
print(f"input_values: {input_values}\nactivation_gelu: {activation_gelu}")

input_values: tensor([-4.9624, -2.4004, -7.7321, -1.3959,  7.3756])
activation_gelu: tensor([-1.7747e-06, -1.9656e-02,  0.0000e+00, -1.1358e-01,  7.3756e+00])


from torchvision import datasets
from torchvision.transforms import ToTensor
from IPython.display import clear_output

train_data = datasets.MNIST(
    root="./MNIST", train=True, download=True, transform=ToTensor()
)

test_data = datasets.MNIST(
    root="./MNIST", train=False, download=True, transform=ToTensor()
)

clear_output()

print("Train data info:\n", train_data)
print("\nTest data info:\n", test_data)

Train data info:
 Dataset MNIST
    Number of datapoints: 60000
    Root location: ./MNIST
    Split: Train
    StandardTransform
Transform: ToTensor()

Test data info:
 Dataset MNIST
    Number of datapoints: 10000
    Root location: ./MNIST
    Split: Test
    StandardTransform
Transform: ToTensor()


import matplotlib.pyplot as plt

num_imgs_to_visualize = 10

figure = plt.figure(figsize=(20, 20))

for i in range(num_imgs_to_visualize):
    # indexing the Dataset object directly returns a tuple (img, label)
    img, label = test_data[i]

    figure.add_subplot(1, num_imgs_to_visualize, i + 1)
    # squeeze() drops size-1 dimensions so imshow receives a 2-D image
    plt.imshow(img.squeeze(), cmap="gray")
    plt.title(label)
    plt.axis("off")
plt.show()


import torch
import numpy as np
import random


def set_random_seed(seed):
    """Seed the Python, NumPy and PyTorch random generators with `seed`.

    Also sets the cuDNN flags: `deterministic` on, `benchmark` off.
    """
    # Python and NumPy generators
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # cuDNN flags
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


set_random_seed(42)


from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_data, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=8, shuffle=False)


# get one next batch
imgs, labels = next(iter(train_dataloader))

print(f"Images batch shape: {imgs.size()} : [batch_size, num_channels, H, W]")
print(f"Labels batch shape: {labels.size()}")

print("\nThe first sample in the batch:")
img = imgs[0].squeeze()
label = labels[0].item()

plt.figure(figsize=(3, 3))
plt.imshow(img, cmap="gray")
plt.title(label)
plt.axis("off")
plt.show()

Images batch shape: torch.Size([8, 1, 28, 28]) : [batch_size, num_channels, H, W]
Labels batch shape: torch.Size([8])

The first sample in the batch:


import torch

print("train_data.data:")
print("Type: ", type(train_data.data))
print("Size: ", train_data.data.size())
print("Dtype:", train_data.data.dtype)
print("Max:  ", torch.max(train_data.data).item())
print("Min:  ", torch.min(train_data.data).item())

train_data.data:
Type:  <class 'torch.Tensor'>
Size:  torch.Size([60000, 28, 28])
Dtype: torch.uint8
Max:   255
Min:   0


mean = torch.mean(train_data.data.double()).item()
std = torch.std(train_data.data.double()).item()
print(f"mean = {mean:.2f}, std = {std:.2f}")

mean = 33.32, std = 78.57


mean /= 255
std /= 255
print(f"Scaled mean = {mean:.2f}, std = {std:.2f}")

Scaled mean = 0.13, std = 0.31


from torchvision import transforms

transform_with_normalize = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize(mean, std)]
)

print(transform_with_normalize)

Compose(
    ToTensor()
    Normalize(mean=0.1306604762738429, std=0.30810780717887876)
)


print("Old train transform:", train_data.transform)
print("Old test transform:", test_data.transform)

train_data.transform = transform_with_normalize
test_data.transform = transform_with_normalize

print("\nNew train transform:", train_data.transform)
print("New test transform:", test_data.transform)

Old train transform: ToTensor()
Old test transform: ToTensor()

New train transform: Compose(
    ToTensor()
    Normalize(mean=0.1306604762738429, std=0.30810780717887876)
)
New test transform: Compose(
    ToTensor()
    Normalize(mean=0.1306604762738429, std=0.30810780717887876)
)


device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


from torch import nn


class NeuralNetwork(nn.Module):
    """Fully connected classifier: Flatten -> 784-512-128-10 with ReLU in between.

    `forward` returns raw logits (no softmax is applied).
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        # Layers are created in the order they are applied
        stack = [
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 10),
        ]
        self.layers_stack = nn.Sequential(*stack)

    def forward(self, x):
        """Flatten `x` and pass it through the layer stack."""
        flat = self.flatten(x)
        return self.layers_stack(flat)


model = NeuralNetwork().to(device)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (layers_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=10, bias=True)
  )
)


# random input of 3 images
sample_batch = torch.rand(
    3, 1, 28, 28, device=device
)  # [batch_size, num_channels, H, W]

# model output
logits = model(sample_batch)

# predicted probabilities
pred_probab = nn.Softmax(dim=1)(logits)

# predicted classes
y_pred = pred_probab.argmax(dim=1)

print(f"Input size:       {sample_batch.size()} : [batch_size, num_channels, H, W]")
print(f"Output size:      {logits.size()}        : [batch_size, num_classes]")
print(
    f"Predicted class:  {y_pred}          : [class for sample 1, class for sample 2, class for sample 3]"
)

Input size:       torch.Size([3, 1, 28, 28]) : [batch_size, num_channels, H, W]
Output size:      torch.Size([3, 10])        : [batch_size, num_classes]
Predicted class:  tensor([3, 3, 3], device='cuda:0')          : [class for sample 1, class for sample 2, class for sample 3]


sample_batch = torch.rand(3, 1, 28, 28)
print(f"Input size: {sample_batch.size()}")

Input size: torch.Size([3, 1, 28, 28])


flatten = nn.Flatten()
flat_image = flatten(sample_batch)
print(f"Size after Flatten: {flat_image.size()}")

Size after Flatten: torch.Size([3, 784])


layer1 = nn.Linear(in_features=784, out_features=512)
hidden1 = layer1(flat_image)
print(f"Size after Linear:  {hidden1.size()}")

Size after Linear:  torch.Size([3, 512])


print(f"Size of linear layer weights: {layer1.weight.size()}")
print(f"Type of linear layer weights: {type(layer1.weight)}")

print(f"\nSize of linear layer biases: {layer1.bias.size()}")
print(f"Type of linear layer biases: {type(layer1.bias)}")

Size of linear layer weights: torch.Size([512, 784])
Type of linear layer weights: <class 'torch.nn.parameter.Parameter'>

Size of linear layer biases: torch.Size([512])
Type of linear layer biases: <class 'torch.nn.parameter.Parameter'>


activations1 = nn.ReLU()(hidden1)

print(f"Before ReLU:  {hidden1}")
print(f"After ReLU:  {activations1}")
print(f"\n Size after ReLU:  {activations1.size()}")

Before ReLU:  tensor([[-0.1436, -0.0007, -0.3674,  ...,  0.1573, -0.4531,  0.4364],
        [ 0.0430,  0.1094, -0.1115,  ...,  0.2678, -0.6268, -0.2382],
        [ 0.1194, -0.0029, -0.4102,  ...,  0.3693, -0.2665, -0.1590]],
       grad_fn=<AddmmBackward0>)
After ReLU:  tensor([[0.0000, 0.0000, 0.0000,  ..., 0.1573, 0.0000, 0.4364],
        [0.0430, 0.1094, 0.0000,  ..., 0.2678, 0.0000, 0.0000],
        [0.1194, 0.0000, 0.0000,  ..., 0.3693, 0.0000, 0.0000]],
       grad_fn=<ReluBackward0>)

 Size after ReLU:  torch.Size([3, 512])


seq_modules = nn.Sequential(flatten, layer1, nn.ReLU(), nn.Linear(512, 10))

sample_batch = torch.rand(3, 1, 28, 28)
logits = seq_modules(sample_batch)

print(f"Output size: {logits.size()}")

Output size: torch.Size([3, 10])


softmax = nn.Softmax(dim=1)

pred_probab = softmax(logits)

print(f"Size after Softmax: {pred_probab.size()}")

Size after Softmax: torch.Size([3, 10])


print(f"Model structure: {model}\n")

for name, param in model.named_parameters():
    print(f"Layer: {name:25}  Size: {param.size()}")

Model structure: NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (layers_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=10, bias=True)
  )
)

Layer: layers_stack.0.weight      Size: torch.Size([512, 784])
Layer: layers_stack.0.bias        Size: torch.Size([512])
Layer: layers_stack.2.weight      Size: torch.Size([128, 512])
Layer: layers_stack.2.bias        Size: torch.Size([128])
Layer: layers_stack.4.weight      Size: torch.Size([10, 128])
Layer: layers_stack.4.bias        Size: torch.Size([10])


num_epochs = 10
batch_size = 64
learning_rate = 1e-3


train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)


# Initialize the loss function
criterion = nn.CrossEntropyLoss()


optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)


def train_loop(dataloader, model, criterion, optimizer):
    """Train `model` for one pass over `dataloader` and return the mean batch loss.

    Args:
        dataloader: sized iterable yielding `(inputs, labels)` batches.
        model: the `nn.Module` being trained.
        criterion: callable `criterion(pred, labels)` returning a scalar loss tensor.
        optimizer: optimizer that updates the parameters of `model`.

    Returns:
        The loss averaged over the batches, as a Python float.
    """
    num_batches = len(dataloader)

    # Send batches to the device the model lives on instead of relying on a
    # module-level `device` variable; leave them in place if the model has
    # no parameters to infer a device from.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    # Make sure layers such as dropout / batch norm run in training mode
    model.train()

    train_loss = 0

    for imgs, labels in dataloader:
        if model_device is not None:
            imgs, labels = imgs.to(model_device), labels.to(model_device)

        # Compute prediction and loss
        pred = model(imgs)
        loss = criterion(pred, labels)

        # Optimization: clear old gradients, backpropagate, update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    train_loss /= num_batches
    print(f"Train loss: {train_loss:>8f}")

    return train_loss


def test_loop(dataloader, model, criterion):
    """Evaluate `model` on `dataloader`; print loss and accuracy, return the mean batch loss.

    Args:
        dataloader: sized iterable yielding `(inputs, labels)` batches and
            exposing the underlying `dataset` (used for the sample count).
        model: the `nn.Module` to evaluate.
        criterion: callable `criterion(pred, labels)` returning a scalar loss tensor.

    Returns:
        The loss averaged over the batches, as a Python float.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)

    # Send batches to the device the model lives on instead of relying on a
    # module-level `device` variable; leave them in place if the model has
    # no parameters to infer a device from.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else None

    test_loss, correct = 0, 0

    # Evaluate in eval mode, then restore whatever mode the model was in
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():  # no gradients are needed for evaluation
            for imgs, labels in dataloader:
                if model_device is not None:
                    imgs, labels = imgs.to(model_device), labels.to(model_device)

                # Compute prediction and loss
                pred = model(imgs)
                loss = criterion(pred, labels)

                test_loss += loss.item()
                # count samples whose highest-scoring class matches the label
                correct += (pred.argmax(1) == labels).sum().item()
    finally:
        model.train(was_training)

    test_loss /= num_batches
    correct /= size
    print(f"Test loss: {test_loss:>8f}, test accuracy: {(100*correct):>0.1f}% \n")

    return test_loss


# for plotting
loss_history = {"train": [], "test": []}

for i in range(num_epochs):
    print(f"Epoch {i+1}")
    train_loss = train_loop(train_dataloader, model, criterion, optimizer)
    test_loss = test_loop(test_dataloader, model, criterion)

    loss_history["train"].append(train_loss)
    loss_history["test"].append(test_loss)
print("Done!")

Epoch 1
Train loss: 2.127127
Test loss: 1.873118, test accuracy: 68.9% 

Epoch 2
Train loss: 1.500568
Test loss: 1.096297, test accuracy: 80.4% 

Epoch 3
Train loss: 0.878231
Test loss: 0.684559, test accuracy: 85.2% 

Epoch 4
Train loss: 0.615779
Test loss: 0.526531, test accuracy: 87.5% 

Epoch 5
Train loss: 0.502805
Test loss: 0.449756, test accuracy: 88.5% 

Epoch 6
Train loss: 0.441968
Test loss: 0.403329, test accuracy: 89.2% 

Epoch 7
Train loss: 0.404127
Test loss: 0.373890, test accuracy: 89.7% 

Epoch 8
Train loss: 0.378117
Test loss: 0.352685, test accuracy: 90.2% 

Epoch 9
Train loss: 0.358783
Test loss: 0.335675, test accuracy: 90.4% 

Epoch 10
Train loss: 0.343455
Test loss: 0.323084, test accuracy: 90.8% 

Done!


plt.figure(figsize=(10, 5))
plt.plot(range(1, num_epochs + 1), loss_history["train"], label="train")
plt.plot(range(1, num_epochs + 1), loss_history["test"], label="test")
plt.xlabel("Epochs", fontsize=15)
plt.ylabel("Loss", fontsize=15)
plt.legend()
plt.grid()
plt.show()


# get batch
imgs, labels = next(iter(test_dataloader))
print("imgs shape: ", imgs.shape)

imgs shape:  torch.Size([64, 1, 28, 28])


# get output
pred = model(imgs.to(device))
print("pred shape: ", pred.shape)

pred shape:  torch.Size([64, 10])


# remove the singleton channel axis: [batch, 1, 28, 28] -> [batch, 28, 28]
# (squeeze does not hard-code the batch size, so a smaller batch also works)
imgs = imgs.squeeze(dim=1)
print("imgs shape(after reshape): ", imgs.shape)

imgs shape(after reshape):  torch.Size([64, 28, 28])


# take 10 first images
imgs = imgs[:10]
print("imgs shape: ", imgs.shape)

imgs shape:  torch.Size([10, 28, 28])


import numpy as np

pred = pred[:10].detach()
print("Prediction(1 sample):\n", pred[0])

digits = np.argmax(pred.cpu().numpy(), axis=1)
print("Predicted class: ", digits[0])

Prediction(1 sample):
 tensor([-0.9875, -2.5655, -0.6203,  1.7831, -1.2819, -1.1455, -7.1734,  8.7986,
        -1.2607,  2.7094], device='cuda:0')
Predicted class:  7


plt.figure(figsize=(25.0, 25.0))
for i in range(10):
    img = imgs[i]

    plt.subplot(1, 10, i + 1)
    plt.title(
        f"pred: {digits[i]} real: {labels[i].numpy()}"
    )  # predicted and real values
    plt.axis("off")
    plt.imshow(img.numpy(), cmap="gray")


torch.save(model.state_dict(), "model_weights.pth")


model = NeuralNetwork()
# The weights may have been saved from a GPU model; map them to the CPU so
# loading also works on a machine without CUDA (the fresh model is on the CPU).
state_dict = torch.load("model_weights.pth", map_location="cpu")
model.load_state_dict(state_dict)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (layers_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=10, bias=True)
  )
)

Ограничения линейного классификатора¶

Проблемы классификации более сложных объектов¶

XOR-проблема¶

Многослойные нейронные сети¶

Веса и смещения¶

Нейронная сеть как универсальный аппроксиматор¶

Обучение нейронной сети¶

Прямое и обратное распространение¶

Метод обратного распространения ошибки¶

Вычислительный граф¶

Пошаговый разбор метода обратного распространения¶

Анимация работы метода обратного распространения ошибки¶

Реализация в PyTorch¶

Основная сущность — torch.Tensor¶

Автоматическое вычисление градиента¶

Обратное распространение в PyTorch¶

Функции потерь (loss functions)¶

Mean Squared Error¶

Mean Absolute Error¶

Cross-Entropy¶

Веса классов¶

Focal Loss¶

Negative Log Likelihood¶

Binary Cross-Entropy¶

Binary Cross-Entropy With Logits¶

Итоги¶

Функции активации¶

Свойства функций активации¶

Различные функции активации¶

Логистическая функция¶

Гиперболический тангенс¶

ReLU¶

Leaky ReLU¶

GELU (Gaussian Error Linear Unit)¶

Визуализация функций активации¶

Углубление в PyTorch. Пример нейронной сети на MNIST¶

Dataset и DataLoader¶

Загрузка набора данных¶

Итерирование по Dataset и визуализация данных¶

Подготовка данных для обучения с помощью DataLoader¶

Итерирование по DataLoader¶

Взвешенное формирование батчей для работы с дисбалансом¶

Трансформации (Transforms)¶

ToTensor¶

Normalize¶

Compose¶

Создание нейронной сети¶

Выбор устройства (device) для обучения¶

Описание класса модели¶

Слои модели¶

Слой nn.Flatten¶

Слой nn.Linear¶

Слой nn.ReLU¶

Объединение модулей в nn.Sequential¶

Слой nn.Softmax¶

Параметры модели¶

Обучение нейронной сети¶

Гиперпараметры¶

Оптимизация параметров (обучение сети)¶

Функция потерь (Loss function)¶

Оптимизатор (Optimizer)¶

Реализация обучения¶

Предсказания обученной модели¶

Сохранение и загрузка весов модели¶

Итерирование по `Dataset` и визуализация данных¶

Итерирование по `DataLoader`¶

Слой `nn.Flatten`¶

Слой `nn.Linear`¶

Слой `nn.ReLU`¶

Объединение модулей в `nn.Sequential`¶

Слой `nn.Softmax`¶