import torchvision
import torchvision.transforms as transforms
from IPython.display import clear_output
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST

# transforms for data
transform = torchvision.transforms.Compose(
    [torchvision.transforms.ToTensor(), torchvision.transforms.Normalize((0.13), (0.3))]
)

train_set = MNIST(root="./MNIST", train=True, download=True, transform=transform)
test_set = MNIST(root="./MNIST", train=False, download=True, transform=transform)

batch_size = 32
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=2)

clear_output()
print("Already downloaded!")

Already downloaded!


import torch.nn as nn


class SimpleMNIST_NN(nn.Module):
    def __init__(self, n_layers, activation=nn.Sigmoid):
        super().__init__()
        self.n_layers = n_layers  # Num of layers
        self.activation = activation()
        layers = [nn.Linear(28 * 28, 100), self.activation]  # input layer
        for _ in range(n_layers - 1):  # append num of layers
            layers.append(nn.Linear(100, 100))
            layers.append(self.activation)
        layers.append(nn.Linear(100, 10))  # 10 classes
        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        x = x.view(-1, 28 * 28)  # reshape to [-1, 784]
        x = self.layers(x)
        return x


def exponential_smoothing(scalars, weight):
    last = scalars[0]
    smoothed = []
    for point in scalars:
        smoothed_val = last * weight + (1 - weight) * point
        smoothed.append(smoothed_val)
        last = smoothed_val

    return smoothed


import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import numpy as np


class HistoryPlotter:
    def __init__(self):
        # dict for safe learning history
        self._history_dict = {}

    def add(self, history):
        """
        Save learning history.
        history: dict with keys: model_name, epoсhs, loss_on_train, loss_on_test
        """
        self._history_dict[history["model_name"]] = history
        self.color_list = list(mcolors.TABLEAU_COLORS.keys())

    def plot(self, models, show_smooth=True, smooth_val=0.90):
        """
        Plot informatiom from self._history_dict.
        models: list of model_name (len <= 5, or extend color_list)
        """
        fig, ax = plt.subplots(3, 1, figsize=(10, 10))
        for model_num, model_name in enumerate(models):
            history = self._history_dict[model_name]
            for idx, (key, title) in enumerate(
                zip(["loss_on_train", "loss_on_test"], ["train loss", "test loss"])
            ):
                epoch_len = len(history[key]) // history["epoсhs"]
                loss_len = len(history[key])
                ticks_positions = np.arange(history["epoсhs"] + 1)

                if show_smooth:
                    x = np.arange(len(history[key])) / epoch_len
                    # Plot train loss and test loss:
                    # 1. plot smoothing vals
                    ax[idx].plot(
                        x,
                        exponential_smoothing(history[key], smooth_val),
                        label=model_name + " smoothed",
                        color=self.color_list[2 * model_num + idx],
                    )
                    # 2. plot raw vals
                    ax[idx].plot(
                        x,
                        history[key],
                        label=model_name + " raw",
                        alpha=0.2,
                        color=self.color_list[2 * model_num + idx],
                    )
                    # 3. add descriptions if it is nesessary
                    if not ax[idx].title.get_text():
                        ax[idx].set_title(title)
                        ax[idx].set_xlabel("epochs")
                        ax[idx].set_ylabel("loss")
                        ax[idx].set_xticks(ticks_positions)
                        ax[idx].set_xticklabels(np.arange(history["epoсhs"] + 1))
                    ax[idx].legend()

                # Plot mean train and test loss combined:
                # 1. calculate mean and std
                mean_loss_on_epoch = [
                    np.mean(history[key][i : i + epoch_len])
                    for i in range(0, loss_len, epoch_len)
                ]
                std_loss_on_epoch = [
                    np.std(history[key][i : i + epoch_len])
                    for i in range(0, loss_len, epoch_len)
                ]
                # 2. plot
                ax[2].errorbar(
                    np.arange(history["epoсhs"]) + idx / 30.0,
                    mean_loss_on_epoch,
                    yerr=std_loss_on_epoch,
                    capsize=5,
                    fmt="X--",
                    label=model_name + " " + title,
                )
                # 3. add descriptions if it is necessary
                if not ax[2].title.get_text():
                    ax[2].set_title("\nAverage loss per epoch", {"fontsize": 12})
                    ax[2].set_xticks(np.arange(history["epoсhs"]))
                    ax[2].set_xticklabels(np.arange(history["epoсhs"]))
                    ax[2].set_xlabel("epochs")
                    ax[2].set_ylabel("loss")
                ax[2].legend()
        plt.subplots_adjust(hspace=0.4)
        plt.show()


history_plotter = HistoryPlotter()


import torch

# compute on cpu or gpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def train_epoch(model, optimizer, criterion, train_loader):
    loss_history = []
    for batch in train_loader:
        optimizer.zero_grad()
        x_train, y_train = batch  # parse data
        x_train, y_train = x_train.to(device), y_train.to(device)  # compute on gpu
        y_pred = model(x_train)  # get predictions
        loss = criterion(y_pred, y_train)  # compute loss
        loss_history.append(loss.cpu().detach().item())  # write loss to log
        loss.backward()
        optimizer.step()
    return loss_history


def validate(model, criterion, val_loader):
    cumloss = 0
    loss_history = []
    with torch.no_grad():
        for batch in val_loader:
            x_train, y_train = batch  # parse data
            x_train, y_train = x_train.to(device), y_train.to(device)  # compute on gpu
            y_pred = model(x_train)  # get predictions
            loss = criterion(y_pred, y_train)  # compute loss
            loss_history.append(loss.cpu().detach().item())  # write loss to log
            cumloss += loss
    return cumloss / len(val_loader), loss_history  # mean loss and history


from tqdm.notebook import tqdm


def train_model(model, optimizer, model_name=None, num_epochs=5):
    criterion = nn.CrossEntropyLoss().to(device)

    train_history = {}
    train_history["model_name"] = model_name
    train_history["epoсhs"] = num_epochs
    train_history["loss_on_train"] = []
    train_history["loss_on_test"] = []

    for epoch in tqdm(range(num_epochs)):
        loss_on_train = train_epoch(model, optimizer, criterion, train_loader)
        _, loss_on_test = validate(model, criterion, test_loader)
        train_history["loss_on_train"].extend(loss_on_train)
        train_history["loss_on_test"].extend(loss_on_test)
    return train_history


model_name = "n_layers2_sigmoid"
model = SimpleMNIST_NN(n_layers=2).to(device)
print(model)

SimpleMNIST_NN(
  (activation): Sigmoid()
  (layers): Sequential(
    (0): Linear(in_features=784, out_features=100, bias=True)
    (1): Sigmoid()
    (2): Linear(in_features=100, out_features=100, bias=True)
    (3): Sigmoid()
    (4): Linear(in_features=100, out_features=10, bias=True)
  )
)


import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=0.001)

history = train_model(model, optimizer, model_name=model_name)
history_plotter.add(history)
history_plotter.plot([model_name])

  0%|          | 0/5 [00:00<?, ?it/s]


model_name = "n_layers3_sigmoid"
model = SimpleMNIST_NN(n_layers=3).to(device)
print(model)

SimpleMNIST_NN(
  (activation): Sigmoid()
  (layers): Sequential(
    (0): Linear(in_features=784, out_features=100, bias=True)
    (1): Sigmoid()
    (2): Linear(in_features=100, out_features=100, bias=True)
    (3): Sigmoid()
    (4): Linear(in_features=100, out_features=100, bias=True)
    (5): Sigmoid()
    (6): Linear(in_features=100, out_features=10, bias=True)
  )
)


optimizer = optim.SGD(model.parameters(), lr=0.001)

history = train_model(model, optimizer, model_name=model_name)
history_plotter.add(history)
history_plotter.plot(["n_layers2_sigmoid", model_name])

  0%|          | 0/5 [00:00<?, ?it/s]


from collections import defaultdict


def get_forward_hook(history_dict, key):
    def forward_hook(self, input_, output):
        history_dict[key] = input_[0].cpu().detach().numpy().flatten()

    return forward_hook


def get_backward_hook(history_dict, key):
    def backward_hook(grad):  # for tensors
        history_dict[key] = grad.abs().cpu().detach().numpy().flatten()

    return backward_hook


def register_model_hooks(model):
    cur_ind = 0
    hooks_data_history = defaultdict(list)
    for child in model.layers.children():
        if isinstance(child, nn.Linear):
            forward_hook = get_forward_hook(
                hooks_data_history, f"sigmoid_out_{cur_ind}"
            )
            child.register_forward_hook(forward_hook)

            cur_ind += 1
            backward_hook = get_backward_hook(
                hooks_data_history, f"gradient_linear_{cur_ind}"
            )
            child.weight.register_hook(backward_hook)
    return hooks_data_history


model = SimpleMNIST_NN(n_layers=3).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.001)

hooks_data_history = register_model_hooks(model)

model_name = "n_layers3_sigmoid2"
history = train_model(model, optimizer, model_name=model_name)
history_plotter.add(history)
history_plotter.plot(["n_layers2_sigmoid", model_name])

  0%|          | 0/5 [00:00<?, ?it/s]


def plot_hooks_data(hooks_data_history):
    keys = hooks_data_history.keys()
    n_layers = len(keys) // 2

    activation_names = [f"sigmoid_out_{i}" for i in range(1, n_layers)]
    activations_on_layers = [
        hooks_data_history[activation] for activation in activation_names
    ]

    gradient_names = [f"gradient_linear_{i + 1}" for i in range(n_layers)]
    gradients_on_layers = [hooks_data_history[gradient] for gradient in gradient_names]

    for plot_name, values, labels in zip(
        ["activations", "gradients"],
        [activations_on_layers, gradients_on_layers],
        [activation_names, gradient_names],
    ):
        fig, ax = plt.subplots(1, len(labels), figsize=(14, 4), sharey="row")
        for label_idx, label in enumerate(labels):
            ax[label_idx].boxplot(values[label_idx], labels=[label])
        plt.show()


plot_hooks_data(hooks_data_history)


import torch
import matplotlib.pyplot as plt

x = torch.randn((512, 100))  # Fake normalized data
plt.hist(x.mean(dim=0), bins=20)
plt.show()
print(f"X mean: {x.mean().item():.2f} X variance: {x.var().item():.2f}")

X mean: -0.00 X variance: 1.00


from torch import nn

net = nn.Sequential(
    nn.Linear(100, 50),  # weights randomly sampled from some random distribution
    nn.Sigmoid(),
    nn.Linear(50, 1),
)


weights = net[0].weight.data.numpy()
plt.hist(weights.flatten(), bins=20)
plt.show()

print(f"Weights mean: {weights.mean():.2f},  Weights variance: {weights.var():.2f}")

Weights mean: 0.00,  Weights variance: 0.00


out = net(x)
plt.hist(out.detach().numpy(), bins=20)
plt.show()

print(f"Out mean: {out.mean().item():.2f}, Out variance: {out.var().item():.2f}")

Out mean: -0.22, Out variance: 0.01


# fake cost
targets = torch.randint(15_000, 500_000, (512, 1), dtype=torch.float32)
print(f"Target example: {targets[:10].flatten()}")

Target example: tensor([493536., 444147., 105073., 128527., 212621., 499482., 427085.,  61938.,
        188614., 152376.])


criterion = nn.MSELoss()
loss = criterion(out, targets)
loss.backward()

print(f"Loss: {loss.item():.2f}")

Loss: 84912406528.00


import pandas as pd
import seaborn as sns

layer_names = ("Linear 1", "Sigmoid", "Linear 2", "Loss")
gradient_values = {}
for layer_name, p in zip(layer_names, net.parameters()):
    gradient_values[layer_name] = pd.Series(p.grad.detach().flatten().numpy())
    # print(f"{layer_name} grad: \n {p.grad}")

gradient_values = pd.DataFrame(gradient_values)
data_to_plot = gradient_values.melt(value_name="Gradient value", var_name="Layer name")

plt.figure(figsize=(10, 4))
sns.boxplot(data=data_to_plot, x="Gradient value", y="Layer name")
plt.grid()
plt.title(f"Large gradient values in the case of learning on a nonnormalized target")
plt.show()


mean = targets.float().mean()
std = targets.float().std()

transformed_targets = (targets - mean) / std
print(transformed_targets.flatten()[:10])

tensor([ 1.6824,  1.3348, -1.0515, -0.8864, -0.2946,  1.7242,  1.2147, -1.3550,
        -0.4636, -0.7186])


net.zero_grad()
out = net(x)
loss = criterion(out, transformed_targets)
loss.backward()

print(f"Loss: {loss.item():.2f}")

Loss: 1.04


layer_names = ("Linear 1", "Sigmoid", "Linear 2", "Loss")
gradient_values = {}
for layer_name, p in zip(layer_names, net.parameters()):
    gradient_values[layer_name] = pd.Series(p.grad.detach().flatten().numpy())
    # print(f"{layer_name} grad: \n {p.grad}")

gradient_values = pd.DataFrame(gradient_values)
data_to_plot = gradient_values.melt(value_name="Gradient value", var_name="Layer name")

plt.figure(figsize=(10, 4))
sns.boxplot(data=data_to_plot, x="Gradient value", y="Layer name")
plt.grid()
plt.title(f"Large gradient values in the case of learning on a nonnormalized target")
plt.show()


# Normal distribution: mu = 0, sigma = 1

x = np.arange(-4, 4.1, 0.1)
y = np.exp(-np.square(x) / 2) / np.sqrt(2 * np.pi)

plt.style.use("seaborn-v0_8-whitegrid")
plt.title("Normal distribution: mu = 0, sigma = 1", size=15)
plt.plot(x, y)
plt.show()


x = np.arange(-10, 10.1, 0.1)
y = np.tanh(x)
dy = 1 / np.cosh(x)

plt.style.use("seaborn-v0_8-whitegrid")
fig, (im1, im2) = plt.subplots(2, 1, figsize=(7, 7))
im1.set(title="tanh(x)")

# fmt: off
im1.plot(x[0:51], y[0:51], "#F9B041",
         x[150:201], y[150:201], "#F9B041",
         x[50:96], y[50:96], "#2DA9E1",
         x[105:151], y[105:151], "#2DA9E1",
         x[95:106], y[95:106], "#4AAE4D",)
im1.grid(True)
im2.set(title="tanh'(x)")
im2.plot(x[0:51], dy[0:51], "#F9B041",
         x[150:201], dy[150:201], "#F9B041",
         x[50:96], dy[50:96], "#2DA9E1",
         x[105:151], dy[105:151], "#2DA9E1",
         x[95:106], dy[95:106], "#4AAE4D",)
# fmt: on

im2.grid(True)
plt.show()


class SimpleMNIST_NN(nn.Module):

    def __init__(self, n_layers, activation=nn.Sigmoid, init_form="normal"):
        super().__init__()
        self.n_layers = n_layers
        self.activation = activation()
        layers = [nn.Linear(28 * 28, 100), self.activation]
        for _ in range(0, n_layers - 1):
            layers.append(nn.Linear(100, 100))
            layers.append(self.activation)
        layers.append(nn.Linear(100, 10))
        self.layers = nn.Sequential(*layers)
        self.init_form = init_form
        if self.init_form is not None:
            self.init()

    def forward(self, x):
        x = x.view(-1, 28 * 28)
        x = self.layers(x)
        return x

    # xavier weight initialization
    def init(self):
        sigmoid_gain = torch.nn.init.calculate_gain("sigmoid")
        for child in self.layers.children():
            if isinstance(child, nn.Linear):
                if self.init_form == "normal":
                    torch.nn.init.xavier_normal_(child.weight, gain=sigmoid_gain)
                    if child.bias is not None:
                        torch.nn.init.zeros_(child.bias)
                elif self.init_form == "uniform":
                    torch.nn.init.xavier_uniform_(child.weight, gain=sigmoid_gain)
                    if child.bias is not None:
                        torch.nn.init.zeros_(child.bias)
                else:
                    raise NotImplementedError()


model = SimpleMNIST_NN(n_layers=3, init_form="normal").to(device)
optimizer = optim.SGD(model.parameters(), lr=0.001)

# plotting weights values of first(input layer)
plt.figure(figsize=(12, 4))
plt.hist(
    list(model.layers.children())[0].weight.cpu().detach().numpy().reshape(-1), bins=100
)
plt.title("weights histogram")
plt.xlabel("values")
plt.ylabel("counts")
plt.show()

model_name = "n3_layers_sigmoid_havier"
history = train_model(model, optimizer, model_name=model_name)

  0%|          | 0/5 [00:00<?, ?it/s]


history_plotter.add(history)
history_plotter.plot(["n_layers3_sigmoid", model_name])


import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

k = 500  # sample size
n = 2
p = 0.5

sample = np.random.negative_binomial(n, p, k)
sns.histplot(data=sample, discrete=True)
plt.show()


ema = 0
alpha = 0.01
means = np.array([])

for i in range(10000):
    sample = np.random.negative_binomial(n, p, 50)
    ema = (1 - alpha) * ema + alpha * sample.mean()
    means = np.append(means, sample.mean())


import sys

print(f"{sys.getsizeof(ema)} bytes")

32 bytes


print(f"{sys.getsizeof(means)} bytes")

80112 bytes


print(f"{ema:.8f}")

2.00626164


print(f"{means.mean():.8f}")

2.00472600


from tqdm import tqdm


def train_model_sep(model, optimizer, model_name=None, num_epochs=5):
    criterion = nn.CrossEntropyLoss().to(device)

    train_history = {}
    train_history["model_name"] = model_name
    train_history["epoсhs"] = num_epochs
    train_history["loss_on_train"] = []
    train_history["loss_on_test"] = []

    for epoch in tqdm(range(num_epochs)):
        model.train()
        loss_on_train = train_epoch(model, optimizer, criterion, train_loader)
        model.eval()
        _, loss_on_test = validate(model, criterion, test_loader)
        train_history["loss_on_train"].extend(loss_on_train)
        train_history["loss_on_test"].extend(loss_on_test)
    return train_history


class SimpleMNIST_NN_Init_Batchnorm(nn.Module):
    def __init__(self, n_layers):
        super().__init__()
        self.n_layers = n_layers
        layers = [
            nn.Linear(28 * 28, 100, bias=False),
            nn.BatchNorm1d(100),
            nn.Sigmoid(),
        ]
        for _ in range(0, n_layers - 1):
            layers.append(nn.Linear(100, 100, bias=False))
            layers.append(nn.BatchNorm1d(100))
            layers.append(nn.Sigmoid())
        layers.append(nn.Linear(100, 10))
        self.layers = nn.Sequential(*layers)
        self.init()

    def forward(self, x):
        x = x.view(-1, 28 * 28)
        x = self.layers(x)
        return x

    def init(self):
        sigmoid_gain = torch.nn.init.calculate_gain("sigmoid")
        for child in self.layers.children():
            if isinstance(child, nn.Linear):
                torch.nn.init.xavier_normal_(child.weight, gain=sigmoid_gain)
                if child.bias is not None:
                    torch.nn.init.zeros_(child.bias)


model = SimpleMNIST_NN_Init_Batchnorm(n_layers=3).to(device)
optimizer = optim.SGD(model.parameters(), lr=1e-3)

model_name = "batchnorm2"
hooks_data_history = register_model_hooks(model)
history = train_model_sep(model, optimizer, model_name=model_name)

100%|██████████| 5/5 [01:45<00:00, 21.14s/it]


history_plotter.add(history)
history_plotter.plot(["n_layers3_sigmoid", "n3_layers_sigmoid_havier", model_name])


model = SimpleMNIST_NN_Init_Batchnorm(n_layers=3).to(device)
optimizer = optim.SGD(model.parameters(), lr=1e-2)

model_name = "batchnorm_increased_lr"
hooks_data_history = register_model_hooks(model)
history = train_model_sep(model, optimizer, model_name=model_name)

100%|██████████| 5/5 [01:45<00:00, 21.02s/it]


history_plotter.add(history)
history_plotter.plot(["n3_layers_sigmoid_havier", "batchnorm2", model_name])


class BadDropout(nn.Module):
    def __init__(self, p: float = 0.5):
        super().__init__()
        if p < 0 or p > 1:
            raise ValueError(
                f"Dropout probability has to be between 0 and 1, but got {p}"
            )
        self.p = p

    def forward(self, x):
        if self.training:
            keep = torch.rand(x.size()) > self.p
            if x.is_cuda:
                keep = keep.to(device)
            return x * keep
        # in test time, expectation is calculated
        return x * (1 - self.p)


class Dropout(nn.Module):
    def __init__(self, p: float = 0.2):
        super().__init__()
        if p < 0 or p > 1:
            raise ValueError(
                f"Dropout probability has to be between 0 and 1, but got {p}"
            )
        self.p = p

    def forward(self, x):
        if self.training:
            keep = torch.rand(x.size()) > self.p
            if x.is_cuda:
                keep = keep.to(device)
            return x * keep / (1 - self.p)
        return x  # in test time, expectation is calculated intrinsically - we just not divide weights


class SimpleMNIST_NN_Dropout(nn.Module):
    def __init__(self, n_layers, activation=nn.Sigmoid, init_form="normal"):
        super().__init__()
        self.n_layers = n_layers
        self.activation = activation()
        layers = [nn.Linear(28 * 28, 100), self.activation]
        for _ in range(0, n_layers - 1):
            layers.append(nn.Linear(100, 100))
            layers.append(Dropout())  # add Dropout
            layers.append(self.activation)
        layers.append(nn.Linear(100, 10))
        self.layers = nn.Sequential(*layers)
        self.init_form = init_form
        if self.init_form is not None:
            self.init()

    def forward(self, x):
        x = x.view(-1, 28 * 28)
        x = self.layers(x)
        return x

    def init(self):
        sigmoid_gain = torch.nn.init.calculate_gain("sigmoid")
        for child in self.layers.children():
            if isinstance(child, nn.Linear):
                if self.init_form == "normal":
                    torch.nn.init.xavier_normal_(child.weight, gain=sigmoid_gain)
                    if child.bias is not None:
                        torch.nn.init.zeros_(child.bias)
                elif self.init_form == "uniform":
                    torch.nn.init.xavier_uniform_(child.weight, gain=sigmoid_gain)
                    if child.bias is not None:
                        torch.nn.init.zeros_(child.bias)
                else:
                    raise NotImplementedError()


model = SimpleMNIST_NN_Dropout(n_layers=3).to(device)
optimizer = optim.SGD(model.parameters(), lr=0.001)

model_name = "nn3_dropout"
history = train_model_sep(model, optimizer, model_name=model_name)

100%|██████████| 5/5 [01:35<00:00, 19.10s/it]


history_plotter.add(history)
history_plotter.plot(["n_layers3_sigmoid", "n3_layers_sigmoid_havier", model_name])


N = 50  # number of data points
noise = 0.3

# generate the train data
x_train = torch.unsqueeze(torch.linspace(-1, 1, N), 1)
y_train = x_train + noise * torch.normal(torch.zeros(N, 1), torch.ones(N, 1))

# generate the test data
x_test = torch.unsqueeze(torch.linspace(-1, 1, N), 1)
y_test = x_test + noise * torch.normal(torch.zeros(N, 1), torch.ones(N, 1))

print(f"x_train shape: {x_train.shape}\nx_test shape: {x_test.shape}")

x_train shape: torch.Size([50, 1])
x_test shape: torch.Size([50, 1])


plt.scatter(
    x_train.data.numpy(), y_train.data.numpy(), c="purple", alpha=0.5, label="train"
)
plt.scatter(
    x_test.data.numpy(), y_test.data.numpy(), c="yellow", alpha=0.5, label="test"
)

x_real = np.arange(-1, 1, 0.01)
y_real = x_real
plt.plot(x_real, y_real, c="green", label="true")
plt.legend()
plt.show()


N_h = 100  # num of neurons
model = torch.nn.Sequential(
    torch.nn.Linear(1, N_h),
    torch.nn.ReLU(),
    torch.nn.Linear(N_h, N_h),
    torch.nn.ReLU(),
    torch.nn.Linear(N_h, 1),
)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


N_h = 100  # num of neurons

model_dropout = nn.Sequential(
    nn.Linear(1, N_h),
    nn.Dropout(0.5),  # 50 % probability
    nn.ReLU(),
    torch.nn.Linear(N_h, N_h),
    nn.Dropout(0.2),  # 20% probability
    nn.ReLU(),
    torch.nn.Linear(N_h, 1),
)
optimizer_dropout = torch.optim.Adam(model_dropout.parameters(), lr=0.01)


num_epochs = 1500
criterion = torch.nn.MSELoss()

for epoch in range(num_epochs):
    # train without dropout
    y_pred = model(x_train)  # look at the entire data in a single shot
    loss = criterion(y_pred, y_train)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # train with dropout
    y_pred_dropout = model_dropout(x_train)
    loss_dropout = criterion(y_pred_dropout, y_train)
    optimizer_dropout.zero_grad()
    loss_dropout.backward()
    optimizer_dropout.step()

    if epoch % 300 == 0:
        model.eval()  # not train mode
        model_dropout.eval()  #  not train mode

        # get predictions
        y_test_pred = model(x_test)
        test_loss = criterion(y_test_pred, y_test)

        y_test_pred_dropout = model_dropout(x_test)
        test_loss_dropout = criterion(y_test_pred_dropout, y_test)
        # plotting data and predictions
        plt.scatter(
            x_train.data.numpy(),
            y_train.data.numpy(),
            c="purple",
            alpha=0.5,
            label="train",
        )
        plt.scatter(
            x_test.data.numpy(),
            y_test.data.numpy(),
            c="yellow",
            alpha=0.5,
            label="test",
        )
        plt.plot(
            x_test.data.numpy(), y_test_pred.data.numpy(), "r-", lw=3, label="normal"
        )
        plt.plot(
            x_test.data.numpy(),
            y_test_pred_dropout.data.numpy(),
            "b--",
            lw=3,
            label="dropout",
        )

        plt.title(
            "Epoch %d, Loss = %0.4f, Loss with dropout = %0.4f"
            % (epoch, test_loss, test_loss_dropout)
        )

        plt.legend()

        model.train()  # train mode
        model_dropout.train()  # train mode

        plt.pause(0.05)


import torch.optim as optim

parameters = torch.randn(10, requires_grad=True)
optimizer = optim.SGD(model.parameters(), lr=0.001)


parameters = torch.randn(10, requires_grad=True)
optimizer = optim.SGD(model.parameters(), momentum=0.9, lr=0.001)


parameters = torch.randn(10, requires_grad=True)
optimizer = optim.SGD(model.parameters(), momentum=0.9, nesterov=True, lr=0.001)


parameters = torch.randn(10, requires_grad=True)
optimizer = optim.Adagrad(model.parameters(), lr=0.01)


parameters = torch.randn(10, requires_grad=True)
optimizer = optim.RMSprop(model.parameters(), lr=0.01, alpha=0.99)


parameters = torch.randn(10, requires_grad=True)
optimizer = optim.Adam([parameters], betas=(0.9, 0.999))


model = SimpleMNIST_NN_Init_Batchnorm(n_layers=3).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-2)
hooks_data_history = register_model_hooks(model)

model_name = "adam"
history = train_model_sep(model, optimizer, model_name="adam")

100%|██████████| 5/5 [01:47<00:00, 21.59s/it]


history_plotter.add(history)
history_plotter.plot(["batchnorm2", "batchnorm_increased_lr", model_name])


parameters = torch.randn(10, requires_grad=True)
optimizer = optim.RMSprop(model.parameters(), alpha=0.99, weight_decay=0.001)


!pip install -q lion-pytorch


from lion_pytorch import Lion

model = SimpleMNIST_NN_Init_Batchnorm(n_layers=3).to(device)
optimizer = Lion(model.parameters(), lr=1e-3, weight_decay=1e-3)

model_name = "lion"
history = train_model_sep(model, optimizer, model_name="lion", num_epochs=5)

100%|██████████| 5/5 [01:45<00:00, 21.13s/it]


history_plotter.add(history)
history_plotter.plot(["adam", model_name])


optimizer = optim.SGD(model.parameters(), lr=0.1)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, "min", factor=0.1, patience=5
)


def train_model_sep_scheduler(
    model, optimizer, scheduler, model_name=None, num_epochs=5
):
    criterion = nn.CrossEntropyLoss().to(device)

    train_history = {}
    train_history["model_name"] = model_name
    train_history["epoсhs"] = num_epochs
    train_history["loss_on_train"] = []
    train_history["loss_on_test"] = []

    for epoch in tqdm(range(num_epochs)):
        model.train()
        loss_on_train = train_epoch(model, optimizer, criterion, train_loader)
        model.eval()
        val_loss, loss_on_test = validate(model, criterion, test_loader)
        train_history["loss_on_train"].extend(loss_on_train)
        train_history["loss_on_test"].extend(loss_on_test)
        scheduler.step(val_loss)
    return train_history


model = SimpleMNIST_NN_Init_Batchnorm(n_layers=3).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-2)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, "min", factor=0.1, patience=1
)

model_name = "reduce_lr_on_plateu"
hooks_data_history = register_model_hooks(model)
history = train_model_sep_scheduler(
    model, optimizer, scheduler, model_name=model_name, num_epochs=5
)

100%|██████████| 5/5 [01:48<00:00, 21.63s/it]


history_plotter.add(history)
history_plotter.plot([model_name])


scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)


batch_size = 32 * 16
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=2)


model = SimpleMNIST_NN_Init_Batchnorm(n_layers=3).to(device)
criterion = nn.CrossEntropyLoss().to(device)


def get_correct_count(pred, labels):
    _, predicted = torch.max(pred.data, 1)
    return (predicted.cpu() == labels.cpu()).sum().item()


@torch.inference_mode()  # this annotation disable grad computation
def get_accuracy(model, test_loader, device="cpu"):
    correct, total = 0, 0
    for imgs, labels in test_loader:
        pred = model(imgs.to(device))
        total += labels.size(0)
        correct += get_correct_count(pred, labels)
    return correct / total


from tqdm.notebook import tqdm

lr_find_epochs = 2
lrs = [0.0002 + 0.0005 * i for i in range(10)]
losses = []
test_accuracy = []

for lr in tqdm(lrs):
    model = SimpleMNIST_NN_Init_Batchnorm(n_layers=3).to(device)
    optimizer = optim.SGD(model.parameters(), lr=lr)

    for iter in range(lr_find_epochs):
        for batch in train_loader:
            optimizer.zero_grad()
            x_train, y_train = batch
            x_train, y_train = x_train.to(device), y_train.to(device)
            y_pred = model(x_train)
            loss = criterion(y_pred, y_train)

            loss.backward()
            optimizer.step()
            loss = loss.detach().cpu().numpy()

    losses.append(loss)
    acc = get_accuracy(model, test_loader, device=device)
    test_accuracy.append(acc)

  0%|          | 0/10 [00:00<?, ?it/s]


fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 6))
fig.suptitle("LR range selection", size=15)

ax1.plot(lrs, losses)
ax1.set(xlabel="Learning rate", ylabel="Loss")
ax1.axvline(x=0.001, color="r")
ax1.axvline(x=0.004, color="r")

ax2.plot(lrs, test_accuracy)
ax2.set(xlabel="Learning rate", ylabel="Accuracy")
ax2.axvline(x=0.001, color="r")
ax2.axvline(x=0.004, color="r")

plt.show()


base_lr = 1e-3
max_lr = 4e-3


def train_epoch_sh(model, optimizer, scheduler, criterion, train_loader):
    loss_history = []
    for batch in train_loader:
        optimizer.zero_grad()
        x_train, y_train = batch  # parse data
        x_train, y_train = x_train.to(device), y_train.to(device)  # compute on gpu
        y_pred = model(x_train)  # get predictions
        loss = criterion(y_pred, y_train)  # compute loss
        loss_history.append(loss.cpu().detach().numpy())  # write loss to log
        loss.backward()
        optimizer.step()
        scheduler.step()
    return loss_history


batch_size = 32
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=2)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=2)


step_size_up = 4 * len(train_loader)
print(step_size_up)

7500


model = SimpleMNIST_NN_Init_Batchnorm(n_layers=3).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.SGD(model.parameters(), lr=base_lr)

scheduler = optim.lr_scheduler.CyclicLR(
    optimizer,
    base_lr=base_lr,
    max_lr=max_lr,
    mode="triangular",
    step_size_up=step_size_up,
)  # first case


from copy import deepcopy


def train_model_cycle_sh(model, optimizer, scheduler, model_name=None, num_epochs=5):
    criterion = nn.CrossEntropyLoss().to(device)

    train_history = {}
    train_history["model_name"] = model_name
    train_history["epoсhs"] = num_epochs

    train_history["loss_on_train"] = []
    train_history["loss_on_test"] = []

    best_loss = np.inf

    for epoch in tqdm(range(num_epochs)):
        model.train()
        loss_on_train = train_epoch_sh(
            model, optimizer, scheduler, criterion, train_loader
        )
        model.eval()
        val_loss, loss_on_test = validate(model, criterion, test_loader)
        train_history["loss_on_train"].extend(loss_on_train)
        train_history["loss_on_test"].extend(loss_on_test)
        if val_loss < best_loss:
            best_loss = val_loss
            best_model = deepcopy(model)

    return best_model, train_history


model_name = "sgd_cycle_lr"
hooks_data_history = register_model_hooks(model)
best_model, history = train_model_cycle_sh(
    model, optimizer, scheduler, model_name=model_name, num_epochs=5
)

  0%|          | 0/5 [00:00<?, ?it/s]


history_plotter.add(history)
history_plotter.plot([model_name, "batchnorm2", "adam"])


import numpy as np
import matplotlib.pyplot as plt

# number of models (M in formula)
n_models = 6
# epoch per model
epoch = 50
# number of bats in dataset
len_dataset = 8

# total iteration number (T in formula)
total_iter = n_models * epoch * len_dataset

learning_rate_0 = 0.1

t = np.array(range(total_iter))
lr_t = (learning_rate_0 / 2) * (
    np.cos(np.pi * np.mod(t, total_iter / n_models) / (total_iter / n_models)) + 1
)

fig, ax = plt.subplots(figsize=(12, 4))
fig.suptitle("Cosine annealing cycles", size=15)
ax.plot(t / len_dataset, lr_t)
ax.set(xlabel="Epoch", ylabel="Learning rate")
plt.show()


scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer, T_0=total_iter // n_models
)

Сигмоида затухает и теоретически, и практически¶

Затухание градиента¶

Нормализация входов и выходов¶

Нормализация входных данных¶

Нормализация целевых значений в задаче регрессии¶

Взрыв градиента¶

Multi-Dimensional регрессия¶

Инициализация весов¶

Инициализация Ксавье (Xavier Glorot)¶

Инициализация Каймин Хе (Kaiming He)¶

Важность инициализации весов¶

Ортогональная инициализация¶

Инициализация весов в PyTorch¶

Слои нормализации¶

Covariate shift (Ковариантный сдвиг)¶

Internal covariate shift¶

Плохой вариант борьбы с этим¶

Batch Normalization¶

Скользящее среднее¶

Защита от нулей в знаменателе¶

Сверточные слои¶

Пример работы¶

Градиент¶

Batch Normalization как регуляризация¶

Сглаживающий эффект Batch Normalization¶

Советы по использованию Batch Normalization¶

Используем Batch Normalization в PyTorch¶

Ставить Batch Normalization до или после активации?¶

До¶

После¶

Другие Normalization¶

Layer Norm¶

Instance Norm¶

Group Norm¶

Регуляризация¶

L1, L2 регуляризации¶

Dropout¶

Сверточные слои¶

Мотивация Dropout¶

Борьба с коадаптацией¶

Dropout как регуляризация¶

Dropout как ансамбль¶

Dropout помогает бороться с переобучением¶

Простая реализация Dropout¶

Пример борьбы с переобучением при помощи Dropout¶

Доверительный интервал от Dropout¶

DropConnect¶

DropBlock¶

Batch Normalization до или после Dropout¶

После¶

До¶

Ставить только что-то одно¶

Строго говоря¶

Оптимизация параметров нейросетей¶

Обзор популярных оптимизаторов¶

SGD (stochastic gradient descent)¶

Momentum¶

NAG (Nesterov momentum)¶

Adaptive Learning Rate¶

Adagrad¶

RMSprop¶

Adam¶

L2 vs Weight decay¶

Другие оптимизаторы (Lion)¶

Ландшафт функции потерь¶

Сравнение оптимизаторов¶

Режимы обучения¶

Ранняя остановка¶

Понижение шага обучения на каждой эпохе¶

Neural Network WarmUp¶

Cyclical learning schedule¶

Ускорение сходимости¶

Подбираем границы learning rate¶

Запускаем¶

Создание ансамблей моделей¶

Взаимодействие learning schedule и адаптивного изменения learning rate¶