!wget -q https://edunet.kea.su/repo/EduNet-web_dependencies/L05/lc_mnist_weights.txt
!wget -q https://edunet.kea.su/repo/EduNet-web_dependencies/L05/lc_cifar10_weights.txt


import torch
import numpy as np
import matplotlib.pyplot as plt

# Visualize the weights of a linear classifier trained on MNIST as per-class "templates"
plt.rcParams["figure.figsize"] = (25, 10)

# Load weights, shape 785x10: 784 (= 28 * 28) pixel weights plus one bias row, one column per class
W = torch.from_numpy(np.loadtxt("lc_mnist_weights.txt"))
print(f"Shape with bias: {W.shape}")

# Remove bias: drop the last row so that only the 784 pixel weights remain
W = W[:-1, :]
print(f"Shape without bias: {W.shape}")

# Min-max rescale all weights to [0, 255] so they can be displayed as pixel intensities
w_min = torch.min(W)
w_max = torch.max(W)
templates = 255 * (W - w_min) / (w_max - w_min)

# Display templates: one subplot per class (digits 0-9)
labels_names = [str(i) for i in range(10)]
for i in range(10):
    plt.subplot(1, 10, i + 1)
    # Column i -> (C, H, W) = (1, 28, 28) -> (H, W, C) layout, cast to uint8 for imshow
    img = templates[:, i].view(1, 28, 28).permute(1, 2, 0).type(torch.uint8)
    plt.imshow(img, cmap="gray")
    plt.axis("off")
    plt.title(labels_names[i])

Shape with bias: torch.Size([785, 10])
Shape without bias: torch.Size([784, 10])


# Visualize the weights of a linear classifier trained on CIFAR-10 as per-class "templates"
plt.rcParams["figure.figsize"] = (25, 10)

W = torch.from_numpy(
    np.loadtxt("lc_cifar10_weights.txt")
)  # load weights, shape 3073x10: 3072 (= 3 * 32 * 32) pixel weights plus one bias row
print(f"Shape with bias: {W.shape}")

# Remove bias: drop the last row so that only the 3072 pixel weights remain
W = W[:-1, :]
print(f"Shape without bias: {W.shape}")

# Min-max rescale all weights to [0, 255] so they can be displayed as pixel intensities
w_min = torch.min(W)
w_max = torch.max(W)
templates = 255 * (W - w_min) / (w_max - w_min)

# Display templates: one subplot per class
labels_names = [
    "plane",
    "car",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
]
for i in range(10):
    plt.subplot(1, 10, i + 1)
    # Column i -> (C, H, W) = (3, 32, 32) -> (H, W, C) layout, cast to uint8 for imshow
    img = templates[:, i].view(3, 32, 32).permute(1, 2, 0).type(torch.uint8)
    plt.imshow(img)
    plt.axis("off")
    plt.title(labels_names[i])

Shape with bias: torch.Size([3073, 10])
Shape without bias: torch.Size([3072, 10])


import random


def set_random_seed(seed):
    """Seed every random number generator used here with the same value.

    Seeds PyTorch (CPU and CUDA), NumPy and Python's `random` module, then
    switches cuDNN to deterministic mode.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True


set_random_seed(42)


# Two linear classifiers applied back to back, with no activation in between
x = torch.rand(3072)  # random flattened "image" with 3072 values
W1 = torch.randn(3072, 100) * 0.0001  # first classifier weights (3072 -> 100), without bias
W2 = torch.randn(100, 10) * 0.0001  # second classifier weights (100 -> 10), without bias
scores1 = x.matmul(W1)  # matrix multiplication, equivalent to x @ W1
scores2 = scores1.matmul(W2)  # scores of the first classifier are the input of the next one

print(f"First classifier shape: {scores1.shape}")
print(f"Second classifier shape: {scores2.shape}")

First classifier shape: torch.Size([100])
Second classifier shape: torch.Size([10])


# Same two classifiers, but with a sigmoid non-linearity between them
scores1 = x.matmul(W1)
print(
    f"\nFirst 8 elements of Scores1: {scores1[:8]}"
)  # take the first 8 values for visualization
# Sigmoid squashes every score into (0, 1); scores close to 0 map to values close to 0.5
activations = torch.sigmoid(scores1)
print(f"\nActivations {activations[:8]}")  # take the first 8 values for visualization
scores2 = activations.matmul(W2)
print(f"\nScores2 {scores2}")

First 8 elements of Scores1: tensor([-0.0037,  0.0029,  0.0009,  0.0038,  0.0030,  0.0022,  0.0028,  0.0020])

Activations tensor([0.4991, 0.5007, 0.5002, 0.5009, 0.5007, 0.5005, 0.5007, 0.5005])

Scores2 tensor([-6.6097e-04, -4.0941e-04, -2.0049e-04, -1.1673e-04,  9.7170e-04,
        -3.0337e-04, -5.1134e-05,  7.1515e-04,  1.7552e-03,  8.0079e-05])


class NeuralNet:
    """Minimal two-layer network without biases: Linear -> sigmoid -> Linear.

    Args:
        in_features: length of the flattened input vector (default 3072).
        hidden_features: number of hidden units (default 100).
        num_classes: number of output scores (default 10).
        init_scale: factor applied to the standard-normal initial weights
            (default 0.0001).

    Calling ``NeuralNet()`` with no arguments builds the same 3072 -> 100 -> 10
    network as before.
    """

    def __init__(
        self,
        in_features=3072,
        hidden_features=100,
        num_classes=10,
        init_scale=0.0001,
    ):
        # W1 is drawn before W2 so the default model consumes the RNG in the
        # same order as the original hard-coded version.
        self.W1 = torch.randn(in_features, hidden_features) * init_scale
        self.W2 = torch.randn(hidden_features, num_classes) * init_scale

    def predict(self, x):
        """Return class scores for `x` (last dimension must equal in_features)."""
        scores1 = x.matmul(self.W1)  # Linear
        activations1 = torch.sigmoid(scores1)  # sigmoid activation
        scores2 = activations1.matmul(self.W2)  # Linear
        return scores2


x = torch.rand(3072)  # image
model = NeuralNet()
scores = model.predict(x)
print(f"scores: \n {scores}")

scores: 
 tensor([-3.0459e-04, -4.3039e-04,  2.9862e-04, -1.0788e-03, -5.7298e-05,
         9.4889e-04,  8.5047e-06, -8.5582e-05, -2.2691e-04,  2.5647e-04])


from IPython.display import HTML
from base64 import b64encode

!wget -q https://edunet.kea.su/repo/EduNet-content/L05/out/universal_approximation.mp4

# Read the video through a context manager so the file handle is closed right away
with open("universal_approximation.mp4", "rb") as video_file:
    mp4 = video_file.read()
# Embed the video bytes directly into the page as a base64 data URL
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML(f'<video width=1000 controls><source src={data_url} type="video/mp4"></video>')


import torch

a = torch.Tensor()


a = torch.tensor([1.1, 2.2, 3.2])
a.dtype

torch.float32


a = torch.tensor([1.1, 2.2, 3.2], dtype=torch.float64)
a.dtype

torch.float64


a = torch.ones(size=(3, 2))
a.size()

torch.Size([3, 2])


a = torch.full((3, 2), 5.1)
a

tensor([[5.1000, 5.1000],
        [5.1000, 5.1000],
        [5.1000, 5.1000]])


a = a.T
a

tensor([[5.1000, 5.1000, 5.1000],
        [5.1000, 5.1000, 5.1000]])


c = torch.atan2(a[0], a[1])
c

tensor([0.7854, 0.7854, 0.7854])


c.sum()

tensor(2.3562)


# Demonstrate how permute / squeeze / unsqueeze change the tensor size
a = torch.zeros((2, 5, 1, 8))
print("Original tensor size:\n", a.size())

a = a.permute(dims=(2, 0, 3, 1))  # reorder axes: new axis k is the old axis dims[k]
print("After permute tensor size:\n", a.size())

a = a.squeeze()  # without arguments removes every dimension of size 1
print("After squzee tensor size:\n", a.size())

a = a.unsqueeze(dim=0)  # insert a new dimension of size 1 at position 0
print("After unsquzee tensor size:\n", a.size())

Original tensor size:
 torch.Size([2, 5, 1, 8])
After permute tensor size:
 torch.Size([1, 2, 8, 5])
After squzee tensor size:
 torch.Size([2, 8, 5])
After unsquzee tensor size:
 torch.Size([1, 2, 8, 5])


a.numpy()

array([[[[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]],

        [[0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.],
         [0., 0., 0., 0., 0.]]]], dtype=float32)


# Demonstrate that view() only reinterprets the shape of the same data
a = torch.rand(2, 8)
print("Original tensor:\n", a)
b = a.view(
    4, 4
)  # be careful with structured data: reshape/view can scramble an image beyond recognition
print("Tensor after view tensor:\n", b)
# b is a view of a, so this in-place addition also changes the values seen through a
b += 1
print("Add 1 to tensor:\n", b)

Original tensor:
 tensor([[0.3389, 0.0980, 0.8943, 0.6329, 0.2009, 0.1293, 0.8345, 0.8799],
        [0.1481, 0.8887, 0.9965, 0.8724, 0.1077, 0.4430, 0.3849, 0.1974]])
Tensor after view tensor:
 tensor([[0.3389, 0.0980, 0.8943, 0.6329],
        [0.2009, 0.1293, 0.8345, 0.8799],
        [0.1481, 0.8887, 0.9965, 0.8724],
        [0.1077, 0.4430, 0.3849, 0.1974]])
Add 1 to tensor:
 tensor([[1.3389, 1.0980, 1.8943, 1.6329],
        [1.2009, 1.1293, 1.8345, 1.8799],
        [1.1481, 1.8887, 1.9965, 1.8724],
        [1.1077, 1.4430, 1.3849, 1.1974]])


# Pick the first GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Cuda available: ", torch.cuda.is_available(), "\n")
a = a.to(device)  # move the tensor to the selected device
# full_like creates b on the same device as a, so the extra .to(device) does not move anything
b = torch.full_like(a, 2).to(device)
c = a * b  # computed on the selected device (on a GPU the work runs in parallel)
c

Cuda available:  True

tensor([[2.6779, 2.1960, 3.7885, 3.2659, 2.4018, 2.2586, 3.6691, 3.7598],
        [2.2962, 3.7775, 3.9930, 3.7448, 2.2155, 2.8861, 2.7698, 2.3949]],
       device='cuda:0')


x_train = torch.tensor([1.0, 2.0, 3.0, 4.0])
y_train = torch.tensor([2.0, 4.0, 6.0, 8.0])

W = torch.tensor(1.0, requires_grad=True)

print(f"W.grad = {W.grad} (before forward pass must be 'None')")

y_pred = W * x_train
criterion = torch.nn.MSELoss()
MSE = criterion(y_pred, y_train)
print(f"MSE = {MSE}")

# backward pass to compute gradient dMSE/dw
MSE.backward()
print(f"W.grad = {W.grad}")

W.grad = None (before forward pass must be 'None')
MSE = 7.5
W.grad = -15.0


W.detach()

tensor(1.)


x_train = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
y_train = torch.tensor([2, 4, 6, 8], dtype=torch.float32)

# This is the parameter we want to optimize -> requires_grad=True
W = torch.tensor(1.0, dtype=torch.float32, requires_grad=True)
print(f"W.grad = {W.grad} (before forward pass must be 'None')\n")
# forward pass to compute MSE
y_pred = W * x_train
E = y_pred - y_train
SE = E**2
MSE = SE.mean()
print(f"MSE = {MSE}")

# backward pass to compute gradient dMSE/dw
MSE.backward()
print(f"W.grad = {W.grad}")
# E is an intermediate (non-leaf) tensor, so autograd does not keep its gradient:
# E.grad is None here (PyTorch also emits a UserWarning about reading .grad of a
# non-leaf tensor). Read the attribute itself rather than the return value of
# E.retain_grad(), which is always None and would switch on grad retention for E.
print(f"E.grad = {E.grad}")

W.grad = None (before forward pass must be 'None')

MSE = 7.5
W.grad = -15.0
E.grad = None


# Demonstrate that repeated backward() calls accumulate gradients
x_train = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
y_train = torch.tensor([2, 4, 6, 8], dtype=torch.float32)

# This is the parameter we want to optimize -> requires_grad=True
W = torch.tensor(1.0, dtype=torch.float32, requires_grad=True)

# forward pass to compute MSE
y_pred = W * x_train
E = y_pred - y_train
E.retain_grad()  # Keep the gradient of the intermediate tensor E (must be called before backward)
SE = E**2
MSE = SE.sum().div(4)

print("========== Backprop 1 ==============")
# retain_graph=True keeps the graph alive so backward() can be called again
MSE.backward(retain_graph=True)
print(f"dMSE/dE = {E.grad}")
print(f"dMSE/dW = {W.grad}")

print("========== Backprop 2 ==============")
MSE.backward(retain_graph=True)
# Gradients are accumulated: both values are now twice as large
print(f"dMSE/dE = {E.grad}")
print(f"dMSE/dW = {W.grad}")

print("========== Backprop 3 ==============")
W.grad.zero_()  # Reset the gradient of W before the next backward pass
MSE.backward(retain_graph=True)
# W.grad was reset, so it holds a single pass again; E.grad was not reset and keeps accumulating
print(f"dMSE/dE = {E.grad}")
print(f"dMSE/dW = {W.grad}")

========== Backprop 1 ==============
dMSE/dE = tensor([-0.5000, -1.0000, -1.5000, -2.0000])
dMSE/dW = -15.0
========== Backprop 2 ==============
dMSE/dE = tensor([-1., -2., -3., -4.])
dMSE/dW = -30.0
========== Backprop 3 ==============
dMSE/dE = tensor([-1.5000, -3.0000, -4.5000, -6.0000])
dMSE/dW = -15.0


x_train = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
y_train = torch.tensor([2, 4, 6, 8], dtype=torch.float32)

W = torch.tensor(1.0, dtype=torch.float32, requires_grad=True)


# Model output: a linear model without bias
def forward(x_train):
    """Return the prediction for x_train using the module-level weight W."""
    prediction = W * x_train
    return prediction


# MSE loss between predictions and targets
def criterion(y_pred, y_train):
    """Return the mean of the element-wise squared differences."""
    diff = y_pred - y_train
    squared = diff**2
    return squared.mean()


print(f"Prediction before training: f(x) = {forward(x_train)}")
print(f"True values: y = {y_train}\n")

# Training with a hand-written gradient descent step
learning_rate = 0.005
num_epochs = 102  # epochs 0..101, so the last logged epoch (see the condition below) is 101

for epoch in range(num_epochs):
    # Propagate forward
    y_pred = forward(x_train)

    # Compute MSE loss
    MSE = criterion(y_pred, y_train)

    # Propagate backward, compute gradients
    MSE.backward()

    # Update weights
    with torch.no_grad():  # The update itself must not become part of the computational graph
        W -= learning_rate * W.grad

    # Nullify gradients after updating to avoid their accumulation
    W.grad.zero_()

    # Log epochs 1, 11, 21, ...; w is the value after this update, loss the value before it
    if epoch % 10 == 1:
        print(f"epoch {epoch}: w = {W.item():.3f}, loss = {MSE.item():.8f}")

print(f"\nPrediction after training: f(x) = {forward(x_train)}")
print(f"True values: y = {y_train}")

Prediction before training: f(x) = tensor([1., 2., 3., 4.], grad_fn=<MulBackward0>)
True values: y = tensor([2., 4., 6., 8.])

epoch 1: w = 1.144, loss = 6.41718674
epoch 11: w = 1.608, loss = 1.34952068
epoch 21: w = 1.820, loss = 0.28380114
epoch 31: w = 1.917, loss = 0.05968266
epoch 41: w = 1.962, loss = 0.01255111
epoch 51: w = 1.983, loss = 0.00263946
epoch 61: w = 1.992, loss = 0.00055505
epoch 71: w = 1.996, loss = 0.00011674
epoch 81: w = 1.998, loss = 0.00002455
epoch 91: w = 1.999, loss = 0.00000516
epoch 101: w = 2.000, loss = 0.00000109

Prediction after training: f(x) = tensor([1.9996, 3.9993, 5.9989, 7.9986], grad_fn=<MulBackward0>)
True values: y = tensor([2., 4., 6., 8.])


import torch.nn as nn

x_train = torch.tensor([1, 2, 3, 4], dtype=torch.float32)
y_train = torch.tensor([2, 4, 6, 8], dtype=torch.float32)

W = torch.tensor(1.0, dtype=torch.float32, requires_grad=True)


# Model output: a linear model without bias
def forward(x_train):
    """Return the prediction for x_train using the module-level weight W."""
    prediction = W * x_train
    return prediction


print(f"Prediction before training: f(x) = {forward(x_train)}")
print(f"True values: y = {y_train}\n")

# Training with a built-in loss function and optimizer
learning_rate = 0.005
num_epochs = 102  # epochs 0..101, so the last logged epoch (see the condition below) is 101

criterion = nn.MSELoss()
optimizer = torch.optim.SGD([W], lr=learning_rate)  # W is the only parameter being optimized

for epoch in range(num_epochs):
    # Propagate forward
    y_pred = forward(x_train)

    # Compute MSE loss
    MSE = criterion(y_pred, y_train)

    # Propagate backward, compute gradients
    MSE.backward()

    # Update weights using the gradients stored in W.grad
    optimizer.step()

    # Nullify gradients after updating to avoid their accumulation
    optimizer.zero_grad()

    # Log epochs 1, 11, 21, ...; w is the value after this update, loss the value before it
    if epoch % 10 == 1:
        print(f"epoch {epoch}: w = {W.item():.3f}, loss = {MSE.item():.8f}")

print(f"\nPrediction after training: f(x) = {forward(x_train)}")
print(f"True values: y = {y_train}")

Prediction before training: f(x) = tensor([1., 2., 3., 4.], grad_fn=<MulBackward0>)
True values: y = tensor([2., 4., 6., 8.])

epoch 1: w = 1.144, loss = 6.41718674
epoch 11: w = 1.608, loss = 1.34951985
epoch 21: w = 1.820, loss = 0.28380090
epoch 31: w = 1.917, loss = 0.05968266
epoch 41: w = 1.962, loss = 0.01255111
epoch 51: w = 1.983, loss = 0.00263946
epoch 61: w = 1.992, loss = 0.00055505
epoch 71: w = 1.996, loss = 0.00011674
epoch 81: w = 1.998, loss = 0.00002455
epoch 91: w = 1.999, loss = 0.00000516
epoch 101: w = 2.000, loss = 0.00000109

Prediction after training: f(x) = tensor([1.9996, 3.9993, 5.9989, 7.9986], grad_fn=<MulBackward0>)
True values: y = tensor([2., 4., 6., 8.])

torch.nn.MSELoss()


criterion = nn.MSELoss()

# batch of 1 element and 3 components in output vector
model_output = torch.Tensor([0.5, -0.25, 0.75])
print(f"model_output: {model_output}")

target = torch.Tensor([1, 0.25, 0.25])
print(f"target: {target}")

loss_mse = criterion(model_output, target)
print(f"loss_mse: {loss_mse}")

model_output: tensor([ 0.5000, -0.2500,  0.7500])
target: tensor([1.0000, 0.2500, 0.2500])
loss_mse: 0.25

torch.nn.L1Loss()


criterion = nn.L1Loss()

# batch of 1 element and 3 components in output vector
model_output = torch.Tensor([0.5, -0.25, 0.75])
print(f"model_output: {model_output}")

target = torch.Tensor([1, 0.25, 0.25])
print(f"target: {target}")

loss_mae = criterion(model_output, target)
print(f"loss_mae: {loss_mae}")

model_output: tensor([ 0.5000, -0.2500,  0.7500])
target: tensor([1.0000, 0.2500, 0.2500])
loss_mae: 0.5

torch.nn.CrossEntropyLoss()


criterion = nn.CrossEntropyLoss()

model_output = torch.rand(3, 3)
print(f"model_output:\n {model_output}")

target = torch.empty(3, dtype=torch.long).random_(3)
print(f"target: {target}")

loss_ce = criterion(model_output, target)
print(f"loss_ce: {loss_ce}")

model_output:
 tensor([[0.7950, 0.3205, 0.4384],
        [0.8802, 0.5885, 0.1743],
        [0.3752, 0.5399, 0.0035]])
target: tensor([0, 2, 1])
loss_ce: 1.0814415216445923


import numpy as np

ce_1 = -np.log(np.exp(0.7950) / (np.exp(0.7950) + np.exp(0.3205) + np.exp(0.4384)))
ce_2 = -np.log(np.exp(0.1743) / (np.exp(0.8802) + np.exp(0.5885) + np.exp(0.1743)))
ce_3 = -np.log(np.exp(0.5399) / (np.exp(0.3752) + np.exp(0.5399) + np.exp(0.0035)))

ce = (1 / 3) * (ce_1 + ce_2 + ce_3)
print(f"hand-calculated loss_ce: {ce}")

hand-calculated loss_ce: 1.0814430511898192


criterion = nn.NLLLoss()
logsoftmax = nn.LogSoftmax(dim=1)

print(f"model_output:\n {model_output}")

logprobs = logsoftmax(model_output)
print(f"logprobs:\n {logprobs}")

print(f"target: {target}")

loss_nll = criterion(logprobs, target)
print(f"loss_nll: {loss_nll}")

model_output:
 tensor([[0.7950, 0.3205, 0.4384],
        [0.8802, 0.5885, 0.1743],
        [0.3752, 0.5399, 0.0035]])
logprobs:
 tensor([[-0.8425, -1.3171, -1.1991],
        [-0.8068, -1.0985, -1.5127],
        [-1.0538, -0.8891, -1.4255]])
target: tensor([0, 2, 1])
loss_nll: 1.0814415216445923

torch.nn.BCELoss()


criterion = nn.BCELoss()

model_output = torch.rand(1)
print(f"model_output: {model_output}")

target = torch.empty(1).random_(2)
print(f"target: {target}")

loss_bce = criterion(model_output, target)
print(f"loss_bce: {loss_bce}")

model_output: tensor([0.3649])
target: tensor([1.])
loss_bce: 1.0081740617752075


# Worst case for BCE: the model predicts probability 1 for every sample, all targets are 0
criterion = nn.BCELoss()

model_output = torch.ones((5))
print(f"model_output: {model_output}")

target = torch.zeros(5)
print(f"target: {target}")

# log(1 - 1) = log(0) would be -inf; BCELoss clamps its log outputs to be >= -100,
# which is why the loss is a finite 100.0 instead of infinity
loss_bce = criterion(model_output, target)
print(f"loss_bce: {loss_bce}")

model_output: tensor([1., 1., 1., 1., 1.])
target: tensor([0., 0., 0., 0., 0.])
loss_bce: 100.0

torch.nn.Sigmoid()


activation = nn.Sigmoid()
input_values = torch.randn(5) * 5
activation_sig = activation(input_values)
print(f"input_values: {input_values}\nactivation_sig: {activation_sig}")

input_values: tensor([2.0176, 7.8325, 7.8253, 2.3941, 5.9826])
activation_sig: tensor([0.8826, 0.9996, 0.9996, 0.9164, 0.9975])

torch.nn.Tanh()


activation = nn.Tanh()
input_values = torch.tensor([11.1529, 4.3029, 0.5081, -3.8456, -1.9058])
activation_tanh = activation(input_values)
print(f"input_values: {input_values}\nactivation_tanh: {activation_tanh}")

input_values: tensor([11.1529,  4.3029,  0.5081, -3.8456, -1.9058])
activation_tanh: tensor([ 1.0000,  0.9996,  0.4685, -0.9991, -0.9567])

torch.nn.ReLU()


activation = nn.ReLU()
input_values = torch.randn(5)
activation_relu = activation(input_values)
print(f"input_values: {input_values}\nactivation_relu: {activation_relu}")

input_values: tensor([ 1.1878,  0.6047, -0.8048, -0.5702, -0.2748])
activation_relu: tensor([1.1878, 0.6047, 0.0000, 0.0000, 0.0000])

torch.nn.LeakyReLU()


activation = nn.LeakyReLU(0.01)
input_values = torch.randn(5)
activation_lrelu = activation(input_values)
print(f"input_values: {input_values}\nactivation_lrelu: {activation_lrelu}")

input_values: tensor([-1.9153,  1.7038,  1.1516, -1.6949, -1.2661])
activation_lrelu: tensor([-0.0192,  1.7038,  1.1516, -0.0169, -0.0127])


activation = nn.GELU()
input_values = torch.randn(5) * 5
activation_gelu = activation(input_values)
print(f"input_values: {input_values}\nactivation_gelu: {activation_gelu}")

input_values: tensor([-0.9675,  1.3878,  0.4633, -1.2794,  8.2854])
activation_gelu: tensor([-0.1612,  1.2731,  0.3143, -0.1284,  8.2854])

torch.nn.GELU()


from torchvision import datasets
from torchvision.transforms import ToTensor
from IPython.display import clear_output

train_data = datasets.MNIST(
    root="./MNIST", train=True, download=True, transform=ToTensor()
)

test_data = datasets.MNIST(
    root="./MNIST", train=False, download=True, transform=ToTensor()
)

clear_output()

print("Train data info:\n", train_data)
print("\nTest data info:\n", test_data)

Train data info:
 Dataset MNIST
    Number of datapoints: 60000
    Root location: ./MNIST
    Split: Train
    StandardTransform
Transform: ToTensor()

Test data info:
 Dataset MNIST
    Number of datapoints: 10000
    Root location: ./MNIST
    Split: Test
    StandardTransform
Transform: ToTensor()


import matplotlib.pyplot as plt

# Show the first test images in one row, titled with their labels
num_imgs_to_visualize = 10

figure = plt.figure(figsize=(20, 20))

for i in range(num_imgs_to_visualize):
    # Indexing the Dataset object directly returns a tuple (img, label)
    img, label = test_data[i]

    figure.add_subplot(1, num_imgs_to_visualize, i + 1)
    # squeeze() drops the size-1 channel dimension so imshow gets a 2D image
    plt.imshow(img.squeeze(), cmap="gray")
    plt.title(label)
    plt.axis("off")
plt.show()


from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_data, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=8, shuffle=False)


# get one next batch
imgs, labels = next(iter(train_dataloader))

print(f"Images batch shape: {imgs.size()} : [batch_size, num_channels, H, W]")
print(f"Labels batch shape: {labels.size()}")

print("\nThe first sample in the batch:")
img = imgs[0].squeeze()
label = labels[0].item()

plt.figure(figsize=(3, 3))
plt.imshow(img, cmap="gray")
plt.title(label)
plt.axis("off")
plt.show()

Images batch shape: torch.Size([8, 1, 28, 28]) : [batch_size, num_channels, H, W]
Labels batch shape: torch.Size([8])

The first sample in the batch:


import torch

print("train_data.data:")
print("Type: ", type(train_data.data))
print("Size: ", train_data.data.size())
print("Dtype:", train_data.data.dtype)
print("Max:  ", torch.max(train_data.data).item())
print("Min:  ", torch.min(train_data.data).item())

train_data.data:
Type:  <class 'torch.Tensor'>
Size:  torch.Size([60000, 28, 28])
Dtype: torch.uint8
Max:   255
Min:   0


mean = torch.mean(train_data.data.double()).item()
std = torch.std(train_data.data.double()).item()
print(f"mean = {mean:.2f}, std = {std:.2f}")

mean = 33.32, std = 78.57


mean /= 255
std /= 255
print(f"Scaled mean = {mean:.2f}, std = {std:.2f}")

Scaled mean = 0.13, std = 0.31


from torchvision import transforms

transform_with_normalize = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize(mean, std)]
)

print(transform_with_normalize)

Compose(
    ToTensor()
    Normalize(mean=0.1306604762738429, std=0.30810780717887876)
)


print("Old train transform:", train_data.transform)
print("Old test transform:", test_data.transform)

train_data.transform = transform_with_normalize
test_data.transform = transform_with_normalize

print("\nNew train transform:", train_data.transform)
print("New test transform:", test_data.transform)

Old train transform: ToTensor()
Old test transform: ToTensor()

New train transform: Compose(
    ToTensor()
    Normalize(mean=0.1306604762738429, std=0.30810780717887876)
)
New test transform: Compose(
    ToTensor()
    Normalize(mean=0.1306604762738429, std=0.30810780717887876)
)


device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

Using cuda device


from torch import nn


class NeuralNetwork(nn.Module):
    """Fully connected classifier for 28x28 inputs: 784 -> 512 -> 128 -> 10 with ReLU."""

    def __init__(self):
        super().__init__()
        # Turns every sample of the batch into a flat vector
        self.flatten = nn.Flatten()

        # Layers are listed in application order; positions 0, 2 and 4 hold the
        # Linear layers, which fixes the parameter names in the state dict
        stack = [
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 10),
        ]
        self.layers_stack = nn.Sequential(*stack)

    def forward(self, x):
        """Return raw class scores (logits) for the batch `x`."""
        return self.layers_stack(self.flatten(x))


model = NeuralNetwork().to(device)
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (layers_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=10, bias=True)
  )
)


# random input of 3 images
sample_batch = torch.rand(
    3, 1, 28, 28, device=device
)  # [batch_size, num_channels, H, W]

# model output
logits = model(sample_batch)

# predicted probabilities
pred_probab = nn.Softmax(dim=1)(logits)

# predicted classes
y_pred = pred_probab.argmax(dim=1)

print(f"Input size:       {sample_batch.size()} : [batch_size, num_channels, H, W]")
print(f"Output size:      {logits.size()}        : [batch_size, num_classes]")
print(
    f"Predicted class:  {y_pred}          : [class for sample 1, class for sample 2, class for sample 3]"
)

Input size:       torch.Size([3, 1, 28, 28]) : [batch_size, num_channels, H, W]
Output size:      torch.Size([3, 10])        : [batch_size, num_classes]
Predicted class:  tensor([7, 7, 7], device='cuda:0')          : [class for sample 1, class for sample 2, class for sample 3]


sample_batch = torch.rand(3, 1, 28, 28)
print(f"Input size: {sample_batch.size()}")

Input size: torch.Size([3, 1, 28, 28])


flatten = nn.Flatten()
flat_image = flatten(sample_batch)
print(f"Size after Flatten: {flat_image.size()}")

Size after Flatten: torch.Size([3, 784])


layer1 = nn.Linear(in_features=784, out_features=512)
hidden1 = layer1(flat_image)
print(f"Size after Linear:  {hidden1.size()}")

Size after Linear:  torch.Size([3, 512])


print(f"Size of linear layer weights: {layer1.weight.size()}")
print(f"Type of linear layer weights: {type(layer1.weight)}")

print(f"\nSize of linear layer biases: {layer1.bias.size()}")
print(f"Type of linear layer biases: {type(layer1.bias)}")

Size of linear layer weights: torch.Size([512, 784])
Type of linear layer weights: <class 'torch.nn.parameter.Parameter'>

Size of linear layer biases: torch.Size([512])
Type of linear layer biases: <class 'torch.nn.parameter.Parameter'>


activations1 = nn.ReLU()(hidden1)

print(f"Before ReLU:  {hidden1}")
print(f"After ReLU:  {activations1}")
print(f"\n Size after ReLU:  {activations1.size()}")

Before ReLU:  tensor([[ 0.0761, -0.0194, -0.2215,  ...,  0.1836, -0.2779,  0.0627],
        [ 0.3971,  0.0891, -0.6315,  ...,  0.3182, -0.2746,  0.2194],
        [ 0.3238, -0.0899, -0.7042,  ...,  0.1712, -0.0055,  0.0566]],
       grad_fn=<AddmmBackward0>)
After ReLU:  tensor([[0.0761, 0.0000, 0.0000,  ..., 0.1836, 0.0000, 0.0627],
        [0.3971, 0.0891, 0.0000,  ..., 0.3182, 0.0000, 0.2194],
        [0.3238, 0.0000, 0.0000,  ..., 0.1712, 0.0000, 0.0566]],
       grad_fn=<ReluBackward0>)

 Size after ReLU:  torch.Size([3, 512])


seq_modules = nn.Sequential(flatten, layer1, nn.ReLU(), nn.Linear(512, 10))

sample_batch = torch.rand(3, 1, 28, 28)
logits = seq_modules(sample_batch)

print(f"Output size: {logits.size()}")

Output size: torch.Size([3, 10])


softmax = nn.Softmax(dim=1)

pred_probab = softmax(logits)

print(f"Size after Softmax: {pred_probab.size()}")

Size after Softmax: torch.Size([3, 10])


print(f"Model structure: {model}\n")

for name, param in model.named_parameters():
    print(f"Layer: {name:25}  Size: {param.size()}")

Model structure: NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (layers_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=10, bias=True)
  )
)

Layer: layers_stack.0.weight      Size: torch.Size([512, 784])
Layer: layers_stack.0.bias        Size: torch.Size([512])
Layer: layers_stack.2.weight      Size: torch.Size([128, 512])
Layer: layers_stack.2.bias        Size: torch.Size([128])
Layer: layers_stack.4.weight      Size: torch.Size([10, 128])
Layer: layers_stack.4.bias        Size: torch.Size([10])


num_epochs = 10
batch_size = 64
learning_rate = 1e-3


train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False)


# Initialize the loss function
criterion = nn.CrossEntropyLoss()


optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)


def train_loop(dataloader, model, criterion, optimizer):
    """Train `model` for one pass over `dataloader` and return the mean batch loss.

    Args:
        dataloader: iterable of (imgs, labels) batches that supports len().
        model: network to train; updated in place and left in training mode.
        criterion: loss function called as criterion(pred, labels).
        optimizer: optimizer holding the parameters of `model`.

    Returns:
        The sum of the per-batch losses divided by the number of batches.

    Batches are moved to the module-level `device` before the forward pass.
    """
    num_batches = len(dataloader)

    # Make sure mode-dependent layers (e.g. Dropout, BatchNorm) behave as in training
    model.train()

    train_loss = 0

    for imgs, labels in dataloader:
        imgs, labels = imgs.to(device), labels.to(device)

        # Compute prediction and loss
        pred = model(imgs)
        loss = criterion(pred, labels)

        # Optimization: clear old gradients, backpropagate, update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    train_loss /= num_batches
    print(f"Train loss: {train_loss:>8f}")

    return train_loss


def test_loop(dataloader, model, criterion):
    """Evaluate `model` on `dataloader` and return the mean batch loss.

    Args:
        dataloader: iterable of (imgs, labels) batches that supports len() and
            exposes the underlying data as `dataloader.dataset`.
        model: network to evaluate; its weights are not changed.
        criterion: loss function called as criterion(pred, labels).

    Returns:
        The sum of the per-batch losses divided by the number of batches.
        The accuracy (correct predictions / dataset size) is only printed.

    Batches are moved to the module-level `device` before the forward pass.
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)

    test_loss, correct = 0, 0

    # Evaluate in eval mode (matters for layers such as Dropout/BatchNorm) and
    # restore the previous mode afterwards so that callers are not affected
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for imgs, labels in dataloader:
                # Move the batch once instead of once per use
                imgs, labels = imgs.to(device), labels.to(device)

                # Compute prediction and loss
                pred = model(imgs)
                loss = criterion(pred, labels)

                test_loss += loss.item()
                correct += (pred.argmax(1) == labels).type(torch.float).sum().item()
    finally:
        model.train(was_training)

    test_loss /= num_batches
    correct /= size
    print(f"Test loss: {test_loss:>8f}, test accuracy: {(100*correct):>0.1f}% \n")

    return test_loss


# for plotting
loss_history = {"train": [], "test": []}

for i in range(num_epochs):
    print(f"Epoch {i+1}")
    train_loss = train_loop(train_dataloader, model, criterion, optimizer)
    test_loss = test_loop(test_dataloader, model, criterion)

    loss_history["train"].append(train_loss)
    loss_history["test"].append(test_loss)
print("Done!")

Epoch 1
Train loss: 2.129134
Test loss: 1.867858, test accuracy: 67.3% 

Epoch 2
Train loss: 1.485368
Test loss: 1.080338, test accuracy: 78.6% 

Epoch 3
Train loss: 0.877476
Test loss: 0.692152, test accuracy: 84.2% 

Epoch 4
Train loss: 0.628923
Test loss: 0.538475, test accuracy: 86.9% 

Epoch 5
Train loss: 0.515250
Test loss: 0.456962, test accuracy: 88.3% 

Epoch 6
Train loss: 0.450605
Test loss: 0.408414, test accuracy: 89.2% 

Epoch 7
Train loss: 0.409812
Test loss: 0.376002, test accuracy: 89.8% 

Epoch 8
Train loss: 0.381615
Test loss: 0.353203, test accuracy: 90.3% 

Epoch 9
Train loss: 0.360897
Test loss: 0.337453, test accuracy: 90.5% 

Epoch 10
Train loss: 0.345000
Test loss: 0.323591, test accuracy: 90.8% 

Done!


plt.figure(figsize=(10, 5))
plt.plot(range(1, num_epochs + 1), loss_history["train"], label="train")
plt.plot(range(1, num_epochs + 1), loss_history["test"], label="test")
plt.xlabel("Epochs", fontsize=15)
plt.ylabel("Loss", fontsize=15)
plt.legend()
plt.grid()
plt.show()


# get batch
imgs, labels = next(iter(test_dataloader))
print("imgs shape: ", imgs.shape)

imgs shape:  torch.Size([64, 1, 28, 28])


# get output
pred = model(imgs.to(device))
print("pred shape: ", pred.shape)

pred shape:  torch.Size([64, 10])


# First sample in prediction batch
pred[0]

tensor([ 0.0246, -4.8137, -0.5842,  1.5116, -1.7782, -0.2193, -6.6226,  8.8951,
        -1.3204,  3.2529], device='cuda:0', grad_fn=<SelectBackward0>)


# Calculate probabilities
nn.Softmax(dim=0)(pred[0].detach())

tensor([1.3985e-04, 1.1076e-06, 7.6077e-05, 6.1865e-04, 2.3051e-05, 1.0958e-04,
        1.8147e-07, 9.9547e-01, 3.6435e-05, 3.5291e-03], device='cuda:0')


# Remove the channel axis (dim 1, size 1). squeeze(1) works for any batch size,
# unlike a reshape to a hard-coded (64, 28, 28).
imgs = imgs.squeeze(1)
print("imgs shape(after reshape): ", imgs.shape)

imgs shape(after reshape):  torch.Size([64, 28, 28])


# take 10 first images
imgs = imgs[:10]
print("imgs shape: ", imgs.shape)

imgs shape:  torch.Size([10, 28, 28])


import numpy as np

pred = pred[:10].detach()
print("Prediction(1 sample):\n", pred[0])
digits = np.argmax(pred.cpu().numpy(), axis=1)
print("Predicted class: ", digits[0])

Prediction(1 sample):
 tensor([ 0.0246, -4.8137, -0.5842,  1.5116, -1.7782, -0.2193, -6.6226,  8.8951,
        -1.3204,  3.2529], device='cuda:0')
Predicted class:  7


plt.figure(figsize=(25.0, 25.0))
for i in range(10):
    img = imgs[i]

    plt.subplot(1, 10, i + 1)
    plt.title(
        "pred: " + str(digits[i]) + " real: " + str(labels[i].numpy())
    )  # predicted and real values
    plt.axis("off")
    plt.imshow(img.numpy(), cmap="gray")


torch.save(model.state_dict(), "model_weights.pth")


# A freshly created model lives on the CPU
model = NeuralNetwork()
# The checkpoint may have been saved from a GPU model; map_location="cpu" lets it
# load on machines without CUDA and matches the device of the new model
model.load_state_dict(torch.load("model_weights.pth", map_location="cpu"))
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (layers_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=128, bias=True)
    (3): ReLU()
    (4): Linear(in_features=128, out_features=10, bias=True)
  )
)

Ограничения Линейного классификатора¶

XOR — проблема¶

Проблемы классификации более сложных объектов¶

Многослойные сети¶

Обучение нейронной сети¶

Прямое и обратное распространение¶

Веса сети¶

Как вычислить результат работы нейронной сети¶

Смещение (bias)¶

Нейронная сеть как универсальный аппроксиматор¶

Метод обратного распространения ошибки¶

Основная идея метода¶

Граф вычислений¶

Пошаговый разбор метода обратного распространения¶

Более сложные случаи¶

Обратное распространение для векторов¶

Множественная вершина¶

Анимация работы метода обратного распространения ошибки¶

Введение в PyTorch¶

Основная сущность — torch.Tensor¶

Автоматическое вычисление градиентов¶

Другие интересные подмодули фреймворка¶

Backprop in PyTorch¶

Преимущества и недостатки метода¶

Функции потерь (loss functions)¶

Mean Squared Error¶

Mean Absolute Error¶

Cross-Entropy¶

Negative Log Likelihood¶

Binary Cross-Entropy¶

Binary Cross-Entropy With Logits¶

Итоги¶

Функции активации¶

Свойства функций активации¶

Различные функции активации¶

Логистическая функция¶

Гиперболический тангенс¶

ReLU¶

Leaky ReLU¶

GELU (Gaussian Error Linear Unit)¶

Визуализация функций активации:¶

Углубление в PyTorch. Пример нейронной сети на MNIST¶

Dataset и DataLoader¶

Загрузка набора данных¶

Итерирование по Dataset и визуализация данных¶

Подготовка данных для обучения с помощью DataLoader¶

Итерирование по DataLoader¶

Трансформации (Transforms)¶

ToTensor¶

Normalize¶

Compose¶

Создание нейронной сети¶

Выбор устройства (device) для обучения¶

Описание класса модели¶

Слои модели¶

Слой nn.Flatten¶

Слой nn.Linear¶

Слой nn.ReLU¶

Объединение модулей в nn.Sequential¶

Слой nn.Softmax¶

Параметры модели¶

Обучение нейронной сети¶

Гиперпараметры¶

Оптимизация параметров (обучение сети)¶

Функция потерь (Loss function)¶

Оптимизатор (Optimizer)¶

Реализация обучения¶

Посмотрим на предсказания обученной модели¶

Сохранение и загрузка весов модели¶

Итерирование по `Dataset` и визуализация данных¶

Итерирование по `DataLoader`¶

Слой `nn.Flatten`¶

Слой `nn.Linear`¶

Слой `nn.ReLU`¶

Объединение модулей в `nn.Sequential`¶

Слой `nn.Softmax`¶