import torch

rnn_cell = torch.nn.RNNCell(input_size=3, hidden_size=2)
dummy_sequence = torch.randn((1, 3))  # batch, input_size
h = rnn_cell(dummy_sequence)
print("Inital shape:".ljust(17), f"{dummy_sequence.shape}")
print("Resulting shape:".ljust(17), f"{h.shape}")  # hidden state

Inital shape:     torch.Size([1, 3])
Resulting shape:  torch.Size([1, 2])


from torch import nn


# Simple RNNcell without a bias and batch support
class SimplifiedRNNCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        # Init weight matrix, for simplicity omit bias
        self.W_hx = (
            torch.randn(input_size, hidden_size) * 0.0001
        )  # hidden_size == number of neurons
        self.W_hh = (
            torch.randn(hidden_size, hidden_size) * 0.0001
        )  # naive initialization
        self.h0 = torch.zeros((hidden_size))  # Initial hidden state

    def forward(self, x, h=None):  # Without a batch dimension
        if h is None:
            h = self.h0
        h = torch.tanh(torch.matmul(self.W_hx.T, x) + torch.matmul(self.W_hh.T, h))
        return h


simple_rnn_cell = SimplifiedRNNCell(input_size=3, hidden_size=2)
h = simple_rnn_cell(dummy_sequence[0])  # No batch
print(f"Out = h\n{h.shape} \n{h}")

Out = h
torch.Size([2]) 
tensor([-2.2494e-04,  1.6840e-05])


rnn = torch.nn.RNN(input_size=3, hidden_size=2)  # batch_first = True
dummy_batched_seq = torch.randn((2, 1, 3))  # seq_len, batch, input_size
out, h = rnn(dummy_batched_seq)

print("Inital shape:".ljust(20), f"{dummy_batched_seq.shape}")
print("Resulting shape:".ljust(20), f"{out.shape}")
print("Hidden state shape:".ljust(20), f"{h.shape}")

Inital shape:        torch.Size([2, 1, 3])
Resulting shape:     torch.Size([2, 1, 2])
Hidden state shape:  torch.Size([1, 1, 2])


import numpy as np


# Simple RNN without batching
class SimplifiedRNNLayer(nn.Module):
    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.rnn_cell = SimplifiedRNNCell(input_size, hidden_size)

    # Without a batch dimension x have shape seq_len * input_size
    def forward(self, x, h=None):
        all_h = []
        for i in range(x.shape[0]):  # iterating over timestamps
            h = self.rnn_cell(torch.Tensor(x[i]), h)
            all_h.append(h)
        return np.stack(all_h), h


simple_rnn = SimplifiedRNNLayer(input_size=4, hidden_size=2)
sequence = np.array(
    [[0, 1, 2, 0], [3, 4, 5, 0]]
)  # batch with one sequence of two elements

out, h = simple_rnn(sequence)
print("Inital shape:".ljust(20), f"{sequence.shape}")
print("Resulting shape:".ljust(20), f"{out.shape}")
print("Hidden state shape:".ljust(20), f"{h.shape}")

Inital shape:        (2, 4)
Resulting shape:     (2, 2)
Hidden state shape:  torch.Size([2])


dummy_seq = torch.randn((2, 1, 3))  #  seq_len, batch, input_size

print("RNNCell")
rnn_cell = torch.nn.RNNCell(3, 2)
print("Parameter".ljust(10), "Shape")
for t, p in rnn_cell.named_parameters():
    print(t.ljust(10), p.shape)

cell_out = rnn_cell(dummy_seq[0, :, :])  # take first element from sequence
print()
print("Result shape =".ljust(20), cell_out.shape)
print("Hidden state shape =".ljust(20), cell_out.shape)  # one hidden state

print("----------------------------------------")

print("RNN")
rnn = torch.nn.RNN(3, 2)
print("Parameter".ljust(15), "Shape")
for t, p in rnn.named_parameters():
    print(t.ljust(15), p.shape)

out, h = rnn(dummy_seq)

print()
print("Result shape =".ljust(20), out.shape)  # h for all timestamps element
print("Hidden state shape =".ljust(20), cell_out.shape)  # h for last element

RNNCell
Parameter  Shape
weight_ih  torch.Size([2, 3])
weight_hh  torch.Size([2, 2])
bias_ih    torch.Size([2])
bias_hh    torch.Size([2])

Result shape =       torch.Size([1, 2])
Hidden state shape = torch.Size([1, 2])
----------------------------------------
RNN
Parameter       Shape
weight_ih_l0    torch.Size([2, 3])
weight_hh_l0    torch.Size([2, 2])
bias_ih_l0      torch.Size([2])
bias_hh_l0      torch.Size([2])

Result shape =       torch.Size([2, 1, 2])
Hidden state shape = torch.Size([1, 2])


dummy_input = torch.randn((2, 1, 3))  # seq_len, batch, input_size
rnn = torch.nn.RNN(3, 2, num_layers=3)

# Weights matrix sizes not changed!
for t, p in rnn.named_parameters():
    print(t, p.shape)

out, h = rnn(dummy_input)

print()
print("Out:\n", out.shape)  # Hidden states for all elements from top layer
print("h:\n", h.shape)  # Hidden states for last element for all layers

weight_ih_l0 torch.Size([2, 3])
weight_hh_l0 torch.Size([2, 2])
bias_ih_l0 torch.Size([2])
bias_hh_l0 torch.Size([2])
weight_ih_l1 torch.Size([2, 2])
weight_hh_l1 torch.Size([2, 2])
bias_ih_l1 torch.Size([2])
bias_hh_l1 torch.Size([2])
weight_ih_l2 torch.Size([2, 2])
weight_hh_l2 torch.Size([2, 2])
bias_ih_l2 torch.Size([2])
bias_hh_l2 torch.Size([2])

Out:
 torch.Size([2, 1, 2])
h:
 torch.Size([3, 1, 2])


dummy_input = torch.randn((2, 1, 3))  # seq_len, batch, input_size
rnn = torch.nn.RNN(3, 2, bidirectional=True)

for t, p in rnn.named_parameters():
    print(t, p.shape)

out, h = rnn(dummy_input)

print()
print("Out:\n", out.shape)  # Concatenated Hidden states from both layers
print(
    "h:\n", h.shape
)  # Hidden states last element from  both : 2*num_layers*hidden_state

weight_ih_l0 torch.Size([2, 3])
weight_hh_l0 torch.Size([2, 2])
bias_ih_l0 torch.Size([2])
bias_hh_l0 torch.Size([2])
weight_ih_l0_reverse torch.Size([2, 3])
weight_hh_l0_reverse torch.Size([2, 2])
bias_ih_l0_reverse torch.Size([2])
bias_hh_l0_reverse torch.Size([2])

Out:
 torch.Size([2, 1, 4])
h:
 torch.Size([2, 1, 2])


import pandas as pd

dataset = pd.read_csv(
    "https://edunet.kea.su/repo/EduNet-web_dependencies/datasets/airline-passengers.csv"
)
dataset.head()


import matplotlib.pyplot as plt

training_data = dataset.iloc[:, 1:2].values  # transform dataframe to numpy.array
# plotting
plt.figure(figsize=(12, 4))
plt.plot(training_data, label="Airline Passangers Data")
plt.title("Number of passengers per month")
plt.ylabel("#passengers")
plt.xlabel("Month")
labels_to_display = [i for i in range(training_data.shape[0]) if i % 12 == 0]
plt.xticks(labels_to_display, dataset["Month"][labels_to_display])
plt.grid()
plt.show()


# Min-Max normalization
td_min = training_data.min()
td_max = training_data.max()
print("Initial statistics:")
print("Minimum value:", repr(td_min).rjust(5))
print("Maximum value:", repr(td_max).rjust(5))

training_data = (training_data - td_min) / (td_max - td_min)
print("\nResulting statistics:")
print("Minimum value:", repr(training_data.min()).rjust(5))
print("Maximum value:", repr(training_data.max()).rjust(5))

Initial statistics:
Minimum value:   104
Maximum value:   622

Resulting statistics:
Minimum value:   0.0
Maximum value:   1.0


import numpy as np
import torch

# create data "ensemble"


def sliding_windows(data, seq_length):
    x = []
    y = []

    for i in range(len(data) - seq_length):
        _x = data[i : (i + seq_length)]  # picking several sequential observations
        _y = data[i + seq_length]  # picking the subsequent observation
        x.append(_x)
        y.append(_y)

    return torch.Tensor(np.array(x)), torch.Tensor(np.array(y))


# set length of the ensemble; accuracy of the predictions and
# speed perfomance almost always depend on it size
seq_length = 8  # compare 2 and 32
x, y = sliding_windows(training_data, seq_length)
print("Example of the obtained data:\n")
print("Data corresponding to the first x:")
print(x[0])
print("Data corresponding to the first y:")
print(y[0])

Example of the obtained data:

Data corresponding to the first x:
tensor([[0.0154],
        [0.0270],
        [0.0541],
        [0.0483],
        [0.0328],
        [0.0598],
        [0.0849],
        [0.0849]])
Data corresponding to the first y:
tensor([0.0618])


train_size = int(len(y) * 0.8)

x_train = x[:train_size]
y_train = y[:train_size]

x_test = x[train_size:]
y_test = y[train_size:]

print("Train data:")
print("x shape:", x_train.shape)
print("y shape:", y_train.shape)

print("\nTest data:")
print("x shape:", x_test.shape)
print("y shape:", y_test.shape)

Train data:
x shape: torch.Size([108, 8, 1])
y shape: torch.Size([108, 1])

Test data:
x shape: torch.Size([28, 8, 1])
y shape: torch.Size([28, 1])


import torch.nn as nn


class AirTrafficPredictor(nn.Module):
    def __init__(self, input_size, hidden_size):
        # hidden_size == number of neurons
        super().__init__()
        self.rnn = nn.RNN(
            input_size=input_size, hidden_size=hidden_size, batch_first=True
        )
        self.fc = nn.Linear(hidden_size, 1)  # Predict only one value

    def forward(self, x):
        # print("x: ",x.shape) # 108 x 8 x 1 : [batch_size, seq_len, input_size]
        out, h = self.rnn(x)
        # print("out: ", out.shape) # 108 x 8 x 4 : [batch_size, seq_len, hidden_size] Useless!
        # print("h : ", h.shape) # 1 x 108 x 4 [ num_layers, batch_size, hidden_size]
        y = self.fc(h)
        # print("y",y.shape) # 1 x 108 x 1
        return y, h


def time_series_train(model, num_epochs=2000, learning_rate=0.01):
    criterion = torch.nn.MSELoss()  # mean-squared error for regression
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Train the model
    for epoch in range(num_epochs):
        y_pred, h = model(x_train)  # we don't use h there, but we can!
        optimizer.zero_grad()

        # obtain the loss
        loss = criterion(y_pred[0], y_train)  # for shape compatibility
        loss.backward()

        optimizer.step()
        if epoch % 100 == 0:
            print(f"Epoch: {epoch},".ljust(15), "loss: %1.5f" % (loss.item()))


print("Simple RNN training process with MSE loss:")
input_size = 1
hidden_size = 4
rnn = AirTrafficPredictor(input_size, hidden_size)
time_series_train(rnn)

Simple RNN training process with MSE loss:
Epoch: 0,       loss: 0.49864
Epoch: 100,     loss: 0.00477
Epoch: 200,     loss: 0.00313
Epoch: 300,     loss: 0.00301
Epoch: 400,     loss: 0.00289
Epoch: 500,     loss: 0.00278
Epoch: 600,     loss: 0.00268
Epoch: 700,     loss: 0.00260
Epoch: 800,     loss: 0.00254
Epoch: 900,     loss: 0.00249
Epoch: 1000,    loss: 0.00246
Epoch: 1100,    loss: 0.00244
Epoch: 1200,    loss: 0.00242
Epoch: 1300,    loss: 0.00241
Epoch: 1400,    loss: 0.00241
Epoch: 1500,    loss: 0.00241
Epoch: 1600,    loss: 0.00240
Epoch: 1700,    loss: 0.00240
Epoch: 1800,    loss: 0.00240
Epoch: 1900,    loss: 0.00240


def time_series_plot(train_predict):
    data_predict = train_predict.data
    y_data_plot = y.data

    # Denormalize
    data_predict = data_predict[0] * (td_max - td_min) + td_min
    y_data_plot = y_data_plot * (td_max - td_min) + td_min

    # Plotting
    plt.figure(figsize=(12, 4))
    plt.axvline(x=train_size, c="r", linestyle="--")
    # shifting the curve as first y-value not correspond first value overall
    plt.plot(seq_length + np.arange(y_data_plot.shape[0]), y_data_plot)
    plt.plot(seq_length + np.arange(y_data_plot.shape[0]), data_predict)

    plt.title("Number of passengers per month")
    plt.ylabel("#passengers")
    plt.xlabel("Month")
    plt.xticks(labels_to_display, dataset["Month"][labels_to_display])

    plt.legend(["Train/Test separation", "Real", "Predicted"])
    plt.grid(axis="x")
    plt.show()


rnn.eval()
train_predict, h = rnn(x)
time_series_plot(train_predict)


import torch

lstm_cell = torch.nn.LSTMCell(input_size=3, hidden_size=4)
input = torch.randn(1, 3)  # batch, input_size
h_0 = torch.randn(1, 4)
c_0 = torch.randn(1, 4)
h, c = lstm_cell(input, (h_0, c_0))  # second arg is tuple
print("Shape of h:", h.shape)  # batch, hidden_size
print("Shape of c:", c.shape)  # batch, hidden_size

Shape of h: torch.Size([1, 4])
Shape of c: torch.Size([1, 4])


import torch.nn as nn

lstm = nn.LSTM(input_size=4, hidden_size=5)
input = torch.randn(3, 2, 4)  # seq_len, batch, input_size
out, (h, c) = lstm(input)  # h and c returned in tuple

print("Input shape:".ljust(15), input.shape)
print("Shape of h".ljust(15), h.shape)  # batch, hidden_size
print("Shape of c".ljust(15), c.shape)  # batch, hidden_size
print(
    "Output shape:".ljust(15), out.shape
)  # seq_len, batch, hidden_size : h for each element

Input shape:    torch.Size([3, 2, 4])
Shape of h      torch.Size([1, 2, 5])
Shape of c      torch.Size([1, 2, 5])
Output shape:   torch.Size([3, 2, 5])


import pandas as pd

dataset = pd.read_csv(
    "https://edunet.kea.su/repo/EduNet-web_dependencies/datasets/airline-passengers.csv"
)
dataset.head()


# Min-Max normalization
training_data = dataset.iloc[:, 1:2].values
td_min = training_data.min()
td_max = training_data.max()
print("Initial statistics:")
print("Minimum value:", repr(td_min).rjust(5))
print("Maximum value:", repr(td_max).rjust(5))

training_data = (training_data - td_min) / (td_max - td_min)
print("\nResulting statistics:")
print("Minimum value:", repr(training_data.min()).rjust(5))
print("Maximum value:", repr(training_data.max()).rjust(5))

Initial statistics:
Minimum value:   104
Maximum value:   622

Resulting statistics:
Minimum value:   0.0
Maximum value:   1.0


import numpy as np

# create data "ensemble"


def sliding_windows(data, seq_length):
    x = []
    y = []

    for i in range(len(data) - seq_length):
        _x = data[i : (i + seq_length)]  # picking several sequential observations
        _y = data[i + seq_length]  # picking the subsequent observation
        x.append(_x)
        y.append(_y)

    return torch.Tensor(np.array(x)), torch.Tensor(np.array(y))


# set length of the ensemble; accuracy of the predictions and
# speed perfomance almost always depend on it size
seq_length = 8  # compare 2 and 32
x, y = sliding_windows(training_data, seq_length)

train_size = int(len(y) * 0.8)

x_train = x[:train_size]
y_train = y[:train_size]

x_test = x[train_size:]
y_test = y[train_size:]

print("Train data:")
print("x shape:", x_train.shape)
print("y shape:", y_train.shape)

print("\nTest data:")
print("x shape:", x_test.shape)
print("y shape:", y_test.shape)

Train data:
x shape: torch.Size([108, 8, 1])
y shape: torch.Size([108, 1])

Test data:
x shape: torch.Size([28, 8, 1])
y shape: torch.Size([28, 1])


def time_series_plot(train_predict):
    data_predict = train_predict.data
    y_data_plot = y.data

    # Denormalize
    data_predict = data_predict[0] * (td_max - td_min) + td_min
    y_data_plot = y_data_plot * (td_max - td_min) + td_min

    # Plotting
    plt.figure(figsize=(12, 4))
    plt.axvline(x=train_size, c="r", linestyle="--")
    # shifting the curve as first y-value not correspond first value overall
    plt.plot(seq_length + np.arange(y_data_plot.shape[0]), y_data_plot)
    plt.plot(seq_length + np.arange(y_data_plot.shape[0]), data_predict)

    plt.title("Number of passengers per month")
    plt.ylabel("#passengers")
    plt.xlabel("Month")
    plt.xticks(labels_to_display, dataset["Month"][labels_to_display])

    plt.legend(["Train/Test separation", "Real", "Predicted"])
    plt.grid(axis="x")
    plt.show()


# Define new LSTM based model


class LSTMAirTrafficPredictor(nn.Module):
    def __init__(self, input_size, hidden_size):
        # hidden_size == number of neurons
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size, hidden_size=hidden_size, batch_first=True
        )
        self.fc = nn.Linear(hidden_size, 1)  # Predict only one value

    def forward(self, x):
        out, (h, c) = self.lstm(x)
        y = self.fc(h)
        return y


lstm = LSTMAirTrafficPredictor(input_size=1, hidden_size=4)
input = torch.randn((108, 8, 1))
out = lstm(input)

print(
    "LSTM model we use consider first input dimension as a batch dimension, output dimension logic has not changed:"
)
print("Input shape:".ljust(15), input.shape)
print("Output shape:".ljust(15), out.shape)

LSTM model we use consider first input dimension as a batch dimension, output dimension logic has not changed:
Input shape:    torch.Size([108, 8, 1])
Output shape:   torch.Size([1, 108, 1])


lstm.train()

print("LSTM training process with MSE loss:")

num_epochs = 2000
learning_rate = 0.01

criterion = torch.nn.MSELoss()  # mean-squared error for regression
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)

# Train the model
for epoch in range(num_epochs):
    y_pred = lstm(x_train)
    optimizer.zero_grad()
    # print(outputs.shape)
    loss = criterion(y_pred, y_train.unsqueeze(0))
    loss.backward()

    optimizer.step()
    if epoch % 100 == 0:
        print(f"Epoch: {epoch},".ljust(15), "loss: %1.5f" % (loss.item()))

LSTM training process with MSE loss:
Epoch: 0,       loss: 0.35633
Epoch: 100,     loss: 0.01312
Epoch: 200,     loss: 0.00526
Epoch: 300,     loss: 0.00424
Epoch: 400,     loss: 0.00343
Epoch: 500,     loss: 0.00278
Epoch: 600,     loss: 0.00237
Epoch: 700,     loss: 0.00228
Epoch: 800,     loss: 0.00227
Epoch: 900,     loss: 0.00225
Epoch: 1000,    loss: 0.00223
Epoch: 1100,    loss: 0.00221
Epoch: 1200,    loss: 0.00219
Epoch: 1300,    loss: 0.00216
Epoch: 1400,    loss: 0.00214
Epoch: 1500,    loss: 0.00210
Epoch: 1600,    loss: 0.00204
Epoch: 1700,    loss: 0.00181
Epoch: 1800,    loss: 0.00179
Epoch: 1900,    loss: 0.00166


import matplotlib.pyplot as plt

lstm.eval()
train_predict = lstm(x)
labels_to_display = [i for i in range(training_data.shape[0]) if i % 12 == 0]
time_series_plot(train_predict)


gru = torch.nn.GRU(input_size=4, hidden_size=3)
input = torch.randn(2, 1, 4)  # seq_len, batch, input_size
h0 = torch.randn(1, 1, 3)
output, h = gru(input, h0)

print("Input shape:".ljust(15), input.shape)
print("Shape of h:".ljust(15), h.shape)  # last h
print("Output shape:".ljust(15), output.shape)  # seq_len = 2

Input shape:    torch.Size([2, 1, 4])
Shape of h:     torch.Size([1, 1, 3])
Output shape:   torch.Size([2, 1, 3])


import pprint

text = ["hey how are you", "good i am fine", "have a nice day"]

# Join all the sentences together and extract the unique characters
# from the combined sentences
chars = set("".join(text))
# Creating a dictionary that maps integers to the characters
int2char = dict(enumerate(chars))
# Creating another dictionary that maps characters to integers
char2int = {char: ind for ind, char in int2char.items()}

print("Dictionary for mapping character to the integer:")
pprint.pprint(char2int)

Dictionary for mapping character to the integer:
{' ': 15,
 'a': 2,
 'c': 11,
 'd': 16,
 'e': 13,
 'f': 1,
 'g': 12,
 'h': 14,
 'i': 0,
 'm': 9,
 'n': 5,
 'o': 6,
 'r': 3,
 'u': 8,
 'v': 4,
 'w': 7,
 'y': 10}


lengths = [len(sent) for sent in text]
maxlen = max(lengths)
print(f"The longest string has {maxlen} characters.\n")

print(f"Initial texts:\n{text}")
# A simple loop that loops through the list of sentences and adds
# a ' ' whitespace until the length of the sentence matches
# the length of the longest sentence
for i in range(len(text)):
    while len(text[i]) < maxlen:
        text[i] += " "

print(f"Resulting texts:\n{text}")

The longest string has 15 characters.

Initial texts:
['hey how are you', 'good i am fine', 'have a nice day']
Resulting texts:
['hey how are you', 'good i am fine ', 'have a nice day']


# Creating lists that will hold our input and target sequences
input_seq = []
target_seq = []

for i in range(len(text)):
    # Remove last character for input sequence
    input_seq.append(text[i][:-1])

    # Remove first character for target sequence
    target_seq.append(text[i][1:])

    print("Input sequence:".ljust(18), f"'{input_seq[i]}'")
    print("Target sequence:".ljust(18), f"'{target_seq[i]}'")
    print()

Input sequence:    'hey how are yo'
Target sequence:   'ey how are you'

Input sequence:    'good i am fine'
Target sequence:   'ood i am fine '

Input sequence:    'have a nice da'
Target sequence:   'ave a nice day'


for i in range(len(text)):
    input_seq[i] = [char2int[character] for character in input_seq[i]]
    target_seq[i] = [char2int[character] for character in target_seq[i]]

    print("Encodded input sequence:".ljust(25), input_seq[i])
    print("Encodded target sequence:".ljust(25), target_seq[i])
    print()

Encodded input sequence:  [14, 13, 10, 15, 14, 6, 7, 15, 2, 3, 13, 15, 10, 6]
Encodded target sequence: [13, 10, 15, 14, 6, 7, 15, 2, 3, 13, 15, 10, 6, 8]

Encodded input sequence:  [12, 6, 6, 16, 15, 0, 15, 2, 9, 15, 1, 0, 5, 13]
Encodded target sequence: [6, 6, 16, 15, 0, 15, 2, 9, 15, 1, 0, 5, 13, 15]

Encodded input sequence:  [14, 2, 4, 13, 15, 2, 15, 5, 0, 11, 13, 15, 16, 2]
Encodded target sequence: [2, 4, 13, 15, 2, 15, 5, 0, 11, 13, 15, 16, 2, 10]


import numpy as np

dict_size = len(char2int)
seq_len = maxlen - 1
batch_size = len(text)


def one_hot_encode(sequence, dict_size, seq_len, batch_size):
    # Creating a multi-dimensional array of zeros with the desired output shape
    features = np.zeros((batch_size, seq_len, dict_size), dtype=np.float32)

    # Replacing the 0 at the relevant character index with a 1 to represent that character
    for i in range(batch_size):
        for u in range(seq_len):
            features[i, u, sequence[i][u]] = 1
    return features


input_seq = one_hot_encode(input_seq, dict_size, seq_len, batch_size)
print(
    "Input shape: {} --> (Batch Size, Sequence Length, One-Hot Encoding Size)".format(
        input_seq.shape
    )
)
print(input_seq[0])

Input shape: (3, 14, 17) --> (Batch Size, Sequence Length, One-Hot Encoding Size)
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


# Convert data to tensor
import torch

input_seq = torch.Tensor(input_seq)
target_seq = torch.Tensor(target_seq)


import torch.nn as nn


class NextCharacterGenerator(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers):
        super().__init__()

        # RNN Layer
        self.rnn = nn.RNN(input_size, hidden_size=hidden_dim, batch_first=True)
        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x):
        batch_size = x.size(0)
        # Initializing hidden state for first input using method defined below
        hidden_0 = torch.zeros(
            1, batch_size, self.rnn.hidden_size
        )  # 1 correspond to number of layers

        # Passing in the input and hidden state into the model and obtaining outputs
        out, hidden = self.rnn(x, hidden_0)

        # Reshaping the outputs such that it can be fit into the fully connected layer
        # Need Only if n_layers > 1
        out = out.contiguous().view(-1, self.rnn.hidden_size)
        out = self.fc(out)

        return out, hidden


# Instantiate the model with hyperparameters
model = NextCharacterGenerator(
    input_size=dict_size, output_size=dict_size, hidden_dim=12, n_layers=1
)

# Define hyperparameters
num_epochs = 100

# Define Loss, Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Training Run
for epoch in range(1, num_epochs + 1):
    optimizer.zero_grad()  # Clears existing gradients from previous epoch
    output, hidden = model(input_seq)
    loss = criterion(output, target_seq.view(-1).long())
    loss.backward()  # Does backpropagation and calculates gradients
    optimizer.step()  # Updates the weights accordingly

    if epoch % 10 == 0:
        print(f"Epoch: {epoch}/{num_epochs}".ljust(20), end=" ")
        print("Loss: {:.4f}".format(loss.item()))

Epoch: 10/100        Loss: 2.4008
Epoch: 20/100        Loss: 1.9664
Epoch: 30/100        Loss: 1.5803
Epoch: 40/100        Loss: 1.2096
Epoch: 50/100        Loss: 0.9017
Epoch: 60/100        Loss: 0.6357
Epoch: 70/100        Loss: 0.4329
Epoch: 80/100        Loss: 0.2999
Epoch: 90/100        Loss: 0.2159
Epoch: 100/100       Loss: 0.1633


def predict(model, character):
    # One-hot encoding our input to fit into the model
    character = np.array([[char2int[c] for c in character]])
    character = one_hot_encode(character, dict_size, character.shape[1], 1)
    character = torch.from_numpy(character)

    out, hidden = model(character)
    # print(out.shape)
    # print(out)
    prob = nn.functional.softmax(out[-1], dim=0).data
    # Taking the class with the highest probability score from the output
    char_ind = torch.max(prob, dim=0)[1].item()

    return int2char[char_ind], hidden


def sample(model, out_len, start="hey"):
    model.eval()  # eval mode
    start = start.lower()
    # First off, run through the starting characters
    chars = [ch for ch in start]
    size = out_len - len(chars)
    # Now pass in the previous characters and get a new one
    for _ in range(size):
        char, h = predict(model, chars)
        chars.append(char)

    return "".join(chars)


sample(model, 15, "good")

'good i am fine '


for _ in range(3):
    print(sample(model, 15, "good"))

good i am fine 
good i am fine 
good i am fine


# Let's say you have 2 sentences (lowercased, punctuations removed):
sentences = "i am new to pytorch i am having fun"

words = sentences.split(" ")

print(f"All words: {words} \n")

vocab = set(words)  # create a vocabulary
vocab_size = len(vocab)

print(f"Vocabulary (unique words): {vocab} \n")
print(f"Vocabulary size: {vocab_size} \n")

# map words to unique indices
word2idx = {word: ind for ind, word in enumerate(vocab)}

print(f"Word-to-id dictionary: {word2idx} \n")

encoded_sentences = [word2idx[word] for word in words]

print(f"Encoded sentences: {encoded_sentences}")

# let's say you want embedding dimension to be 3
emb_dim = 3

All words: ['i', 'am', 'new', 'to', 'pytorch', 'i', 'am', 'having', 'fun'] 

Vocabulary (unique words): {'fun', 'pytorch', 'new', 'to', 'am', 'having', 'i'} 

Vocabulary size: 7 

Word-to-id dictionary: {'fun': 0, 'pytorch': 1, 'new': 2, 'to': 3, 'am': 4, 'having': 5, 'i': 6} 

Encoded sentences: [6, 4, 2, 3, 1, 6, 4, 5, 0]


import torch
import torch.nn as nn


emb_layer = nn.Embedding(vocab_size, emb_dim)

word_vectors = emb_layer(torch.LongTensor(encoded_sentences))

print(f"Shape of encoded sentences: {word_vectors.shape} \n")

print(f"Shape of weigths: {emb_layer.weight.shape}")

Shape of encoded sentences: torch.Size([9, 3]) 

Shape of weigths: torch.Size([7, 3])


emb_layer.weight.requires_grad

True


emb_layer.weight.requires_grad = False


# predefined weights
weight = torch.FloatTensor([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]])
print(weight.shape)
embedding = nn.Embedding.from_pretrained(weight)
# get embeddings for ind 0 and 1
embedding(torch.LongTensor([0, 1]))

torch.Size([2, 3])

tensor([[0.1000, 0.2000, 0.3000],
        [0.4000, 0.5000, 0.6000]])


!wget -q https://www.dropbox.com/s/699kgut7hdb5tg9/GoogleNews-vectors-negative300.bin.gz?dl=1
!mv 'GoogleNews-vectors-negative300.bin.gz?dl=1' GoogleNews-vectors-negative300.bin.gz

# Use this way for loading from our host
# !wget https://edunet.kea.su/repo/EduNet-web_dependencies/weights/GoogleNews-vectors-negative300.bin.gz
# !mv 'GoogleNews-vectors-negative300.bin.gz' GoogleNews-vectors-negative300.bin.gz

!gunzip -q GoogleNews-vectors-negative300.bin.gz


from gensim.models import KeyedVectors

wordvector_path = "GoogleNews-vectors-negative300.bin"
word_vectors = KeyedVectors.load_word2vec_format(wordvector_path, binary=True)


weights = torch.FloatTensor(word_vectors.vectors)


weights.shape

torch.Size([3000000, 300])


embedding = nn.Embedding.from_pretrained(weight)

input = torch.LongTensor([0, 1])

embedding(input)

tensor([[0.1000, 0.2000, 0.3000],
        [0.4000, 0.5000, 0.6000]])


import torchtext

glove = torchtext.vocab.GloVe(
    name="6B", dim=50, max_vectors=10000
)  # use 10k most common words

.vector_cache/glove.6B.zip: 862MB [02:38, 5.43MB/s]                           
100%|█████████▉| 9999/10000 [00:00<00:00, 35372.33it/s]


glove_emb = nn.Embedding.from_pretrained(glove.vectors)


input = torch.LongTensor([0, 1])
glove_emb(input).shape

torch.Size([2, 50])


class RNN_with_Embedding_Layer(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(RNN_with_Embedding_Layer, self).__init__()
        self.emb = nn.Embedding.from_pretrained(glove.vectors)
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # Look up the embedding
        x = self.emb(x)
        # Set an initial hidden state
        h0 = torch.zeros(1, x.size(0), self.hidden_size)
        # Forward propagate the RNN
        out, _ = self.rnn(x, h0)
        # Pass the output of the last time step to the classifier
        out = self.fc(out[:, -1, :])
        return out


model = RNN_with_Embedding_Layer(input_size=50, hidden_size=128, num_classes=3)
print(model)

RNN_with_Embedding_Layer(
  (emb): Embedding(10000, 50)
  (rnn): RNN(50, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=3, bias=True)
)


input = "Я люблю Natural Language Processing (NLP). А ты?"
tokenized = input.split(" ")
print(tokenized)

['Я', 'люблю', 'Natural', 'Language', 'Processing', '(NLP).', 'А', 'ты?']


import re

# initializing string
input = "Я люблю Natural Language Processing (NLP). А ты?"

# using findall() to get all regex matches.
res = re.findall(r"\w+|[^\s\w]+", input)

# printing result
print(str(res))

['Я', 'люблю', 'Natural', 'Language', 'Processing', '(', 'NLP', ').', 'А', 'ты', '?']


!pip install -q transformers[torch]

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.4/7.4 MB 20.9 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 268.8/268.8 kB 27.6 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 7.8/7.8 MB 55.2 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.3/1.3 MB 57.8 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 244.2/244.2 kB 10.1 MB/s eta 0:00:00


from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.tokenize(
    "I have a new neighbor. Every morning he annoyingly drills the wall."
)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

['i',
 'have',
 'a',
 'new',
 'neighbor',
 '.',
 'every',
 'morning',
 'he',
 'annoying',
 '##ly',
 'drills',
 'the',
 'wall',
 '.']


import numpy as np
import matplotlib.pyplot as plt

a = np.random.normal(0, 100, size=(10000))

plt.title("Normal distribution, std = 100")
plt.ylabel("Number of samples")
plt.xlabel("Sample value")

plt.hist(a)
plt.show()


from scipy.special import softmax

plt.title("Softmax on N(0, 100)")
plt.ylabel("Softmax value")
plt.xlabel("Sample index")

plt.plot(softmax(a))
plt.show()


std = np.random.normal(0, 100, size=(10000))

unit_std = std / 100

plt.title("Normal distribution, std = 100")
plt.ylabel("Number of samples")
plt.xlabel("Sample value")
plt.hist(std)
plt.show()

plt.title("Normal distribution, std = 1")
plt.ylabel("Number of samples")
plt.xlabel("Sample value")
plt.hist(unit_std)
plt.show()


plt.title("Softmax on N(0, 1)")
plt.ylabel("Softmax value")
plt.xlabel("Sample index")

plt.plot(softmax(unit_std))
plt.show()


def statistics(dimensionality, experiments=int(10e4)):
    c = []
    for i in range(experiments):
        a = torch.normal(0, 1, size=(int(dimensionality),))
        b = torch.normal(0, 1, size=(int(dimensionality),))
        c.append(torch.dot(a, b))

    c = torch.Tensor(c)
    return float(c.mean()), float(c.std())


import torch

means, stds = {}, {}
dims = torch.linspace(0, 100, 20)

for dim in dims:
    dim = float(dim)
    t_mean, t_std = statistics(dim)
    means[dim] = t_mean
    stds[dim] = t_std


x = list(means.keys())
y = list(means.values())

plt.plot(x, y)
plt.axhline(y=0, c="r", linestyle="--")
plt.legend(["Mean value", "Mean = 0"])
plt.title("Mean value of dot products")
plt.ylabel("Mean value")
plt.xlabel("Vector dimensionality")
plt.show()


x = list(stds.keys())
y = list(stds.values())
plt.scatter(x, y)
plt.title("Std values and square distance")
plt.xlabel("Vector dimensionality")

x = np.linspace(0, 100, 10000)
plt.plot(x, x**0.5, color="r")
plt.legend(["sqrt(x)", "empirical std"])
plt.show()


import math
import torch


class PositionalEncoding(torch.nn.Module):
    "Implement the PE function."

    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[:, : x.size(1)].detach()
        return x


pe = PositionalEncoding(20)
y = pe(
    torch.zeros(1, 100, 20)
)  # sequence of shape 100, every token of sequence has shape 20


import numpy as np
import matplotlib.pyplot as plt


plt.figure(figsize=(15, 5))
plt.plot(np.arange(100), y[0, :, 0:4].data.numpy())
plt.legend(["dim %d" % p for p in [1, 2, 3, 4]])
plt.show()


plt.figure(figsize=(15, 5))
plt.plot(np.arange(100), y[0, :, 4:8].data.numpy())
plt.legend(["dim %d" % p for p in [4, 5, 6, 7]])
plt.show()


!pip install -q transformers[torch]


import torch
import transformers
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from IPython.display import clear_output

transformers.logging.set_verbosity_error()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loading and initialization of model and tokenizer
model_name = "sberbank-ai/rugpt3large_based_on_gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)

clear_output()


text = "Вопрос: 'Сколько будет 2+2?'\nОтвет:"
input_ids = tokenizer.encode(text, return_tensors="pt").to(device)
out = model.generate(input_ids, do_sample=False, max_length=20, pad_token_id=20)

generated_text = list(map(tokenizer.decode, out))[0]

print(generated_text)

Вопрос: 'Сколько будет 2+2?'
Ответ: '2+2=4'


import locale

locale.getpreferredencoding = lambda: "UTF-8"
!pip install -q transformers[torch]


import torch
import transformers
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from IPython.display import clear_output

transformers.logging.set_verbosity_error()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loading and initialization of model and tokenizer
model_name_or_path = "sberbank-ai/rugpt3large_based_on_gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)
model = GPT2LMHeadModel.from_pretrained(model_name_or_path).to(device)

clear_output()


text = "Нейронные сети - это очень просто и увлекательно"
tokens = tokenizer.encode(text, add_special_tokens=False)

decoded_tokens = [tokenizer.decode([token]) for token in tokens]

print("Original text:", text)
print("Tokens: ", tokens)
print("Decoded tokens: ", decoded_tokens)

Original text: Нейронные сети - это очень просто и увлекательно
Tokens:  [682, 355, 1368, 448, 5324, 376, 481, 939, 1139, 289, 13904, 1245]
Decoded tokens:  ['Н', 'ей', 'рон', 'ные', ' сети', ' -', ' это', ' очень', ' просто', ' и', ' увлека', 'тельно']


print(tokenizer.decode([167]))
print(tokenizer.decode([245]))
print(tokenizer.decode([256]))

print(tokenizer.decode([167, 245, 256]))

�
�
�
撝


text = 'Определение: "Нейронная сеть" - это'
input_ids = tokenizer.encode(text, return_tensors="pt").to(device)


# ArgMax is defaulf behaviour
out = model.generate(input_ids, do_sample=False, max_length=30, pad_token_id=30)

generated_text = list(map(tokenizer.decode, out))[0]
print()
print(generated_text)

Определение: "Нейронная сеть" - это компьютерная программа, которая позволяет создавать и анализировать нейронные сети. Нейронные сети


# Generation with beam-search
out = model.generate(input_ids, do_sample=False, num_beams=5, max_length=30, pad_token_id=30)

generated_text = list(map(tokenizer.decode, out))[0]
print()
print(generated_text)

Определение: "Нейронная сеть" - это компьютерная сеть, состоящая из компьютеров, соединенных друг с другом. Нейронная


out = model.generate(input_ids, do_sample=True, temperature=1.3, max_length=30, pad_token_id=30)

generated_text = list(map(tokenizer.decode, out))[0]
print()
print(generated_text)

Определение: "Нейронная сеть" - это относительно молодая наука, получившая толчок от научных экспериментов.
Нейронная сеть может


out = model.generate(
    input_ids,
    do_sample=True,
    temperature=1.3,
    top_k=20,
    top_p=0.8,
    max_length=30,
    pad_token_id=30
)

generated_text = list(map(tokenizer.decode, out))[0]
print()
print(generated_text)

Определение: "Нейронная сеть" - это сеть нейронов, которые могут существовать в качестве автономных систем. В случае, когда они


!pip install -q transformers[torch]


import torch
import transformers
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from IPython.display import clear_output

transformers.logging.set_verbosity_error()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "sberbank-ai/rugpt3small_based_on_gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name).to(device)

clear_output()


text = """Дым табачный воздух выел.
Комната —
глава в крученыховском аде.
Вспомни —
за этим окном
впервые
руки твои, исступленный, гладил.
Сегодня сидишь вот,
сердце в железе.
День еще —
выгонишь,
может быть, изругав.
В мутной передней долго не влезет
сломанная дрожью рука в рукав.
Выбегу,
тело в улицу брошу я.
Дикий,
обезумлюсь,
отчаяньем иссеча́сь.
Не надо этого,
дорогая,
хорошая,
дай простимся сейчас.
Все равно
любовь моя —
тяжкая гиря ведь —
висит на тебе,
куда ни бежала б.
Дай в последнем крике выреветь
горечь обиженных жалоб.
Если быка трудом уморят —
он уйдет,
разляжется в холодных водах.
Кроме любви твоей,
мне
нету моря,
а у любви твоей и плачем не вымолишь отдых.
Захочет покоя уставший слон —
царственный ляжет в опожаренном песке.
Кроме любви твоей,
мне
нету солнца,
а я и не знаю, где ты и с кем.
Если б так поэта измучила,
он
любимую на деньги б и славу выменял,
а мне
ни один не радостен звон,
кроме звона твоего любимого имени.
И в пролет не брошусь,
и не выпью яда,
и курок не смогу над виском нажать.
Надо мною,
кроме твоего взгляда,
не властно лезвие ни одного ножа.
Завтра забудешь,
что тебя короновал,
что душу цветущую любовью выжег,
и су́етных дней взметенный карнавал
растреплет страницы моих книжек…
Слов моих сухие листья ли
заставят остановиться,
жадно дыша?
Дай хоть
последней нежностью выстелить
твой уходящий шаг.."""


# Save text train data as .txt file
train_path = "train_dataset.txt"
with open(train_path, mode="w", encoding="utf-8") as f:
    f.write(text)


from transformers import TextDataset, DataCollatorForLanguageModeling
from warnings import simplefilter

simplefilter("ignore", category=FutureWarning)

# Creating Dataset
train_dataset = TextDataset(tokenizer=tokenizer, file_path=train_path, block_size=64)

# Сreating DataLoader (crop the text into optimal length pieces)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./finetuned",  # The output directory
    overwrite_output_dir=True,  # overwrite the content of the output directory
    num_train_epochs=200,  # number of training epochs
    per_device_train_batch_size=32,  # batch size for training
    per_device_eval_batch_size=32,  # batch size for evaluation
    warmup_steps=10,  # number of warmup steps for learning rate scheduler
    gradient_accumulation_steps=16,  # to make "virtual" batch size larger
)


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    optimizers=(
        torch.optim.AdamW(model.parameters(), lr=1e-5),
        None,
    ),  # Optimizer and learnig rate scheduler
)


trainer.train()

{'train_runtime': 32.43, 'train_samples_per_second': 43.17, 'train_steps_per_second': 6.167, 'train_loss': 0.040189566612243655, 'epoch': 200.0}

TrainOutput(global_step=200, training_loss=0.040189566612243655, metrics={'train_runtime': 32.43, 'train_samples_per_second': 43.17, 'train_steps_per_second': 6.167, 'train_loss': 0.040189566612243655, 'epoch': 200.0})


# Probability sampling with limit example
text = "Как же сложно учить матанализ!\n"
input_ids = tokenizer.encode(text, return_tensors="pt").to(device)
model.eval()
with torch.no_grad():
    out = model.generate(
        input_ids,
        do_sample=True,
        num_beams=2,
        temperature=1.5,
        top_p=0.9,
        max_length=100,
        pad_token_id=512
    )

generated_text = list(map(tokenizer.decode, out))[0]
print()
print(generated_text)

Как же сложно учить матанализ!
Если быка трудом уморят —
он уйдет,
разляжется в холодных водах.
Кроме любви твоей,
мне
нету моря,
а у любви твоей и плачем не вымолишь отдых.
Захочет покоя уставший слон —
царственный ляжет на трон.
Захочет отдыха уставший слон —
царственный ляжет на трон.
Захочет отдыха уставший слон —
царственный

	Month	Passengers
0	1949-01	112
1	1949-02	118
2	1949-03	132
3	1949-04	129
4	1949-05	121

	Month	Passengers
0	1949-01	112
1	1949-02	118
2	1949-03	132
3	1949-04	129
4	1949-05	121

Особенности рекуррентных нейронных сетей¶

Примеры задач¶

Базовый RNN блок¶

RNNCell¶

RNN блок в PyTorch¶

Слои (Stacked RNNs)¶

Bidirectional¶

Пример прогнозирования временного ряда¶

Подготовка данных¶

Загрузка данных¶

Шкалирование данных¶

Формирование ансамблей данных¶

Разобьем на train и test¶

Создание и обучение модели¶

Обучение¶

Тестирование¶

Проблемы RNN¶

LSTM¶

LSTMCell¶

LSTM in PyTorch¶

Пример использования на задаче с временным рядом¶

Обучение¶

Выводы¶

Модификации LSTM¶

Peephole connections¶

Объединение forget и input gates¶

GRU (Gated reccurent unit)¶

Пример посимвольной генерации текста¶

Подготовка данных¶

Выравнивание данных (Padding)¶

Разбиение данных¶

Кодирование¶

One-hot encoding¶

Создание и обучение модели¶

Обучение¶

Тестирование¶

Embedding¶

Токенизация¶

Наивная токенизация¶

Subword Tokenization (Токенизация подслова)¶

Byte-Pair Encoding (BPE)¶

Sequence-to-Sequence with RNNs¶

Проблемы Sequence-to-Sequence with RNNs¶

Attention¶

Sequence-to-Sequence with RNNs and Attention mechanism¶

Модели внимания в машинном переводе¶

Модели внимания в задаче генерации подписи к изображениям¶

Проблема attention¶

Разновидности функций сходства векторов¶

Key, query, value¶

Softmax normalization¶

Multihead Attention¶

Image Captioning with RNNs and Attention¶

Transformer для машинного перевода¶

Архитектура сети Transformer¶

Общий пайплайн задачи машинного перевода¶

Архитектура трансформера-кодировщика¶

Positional encoding¶

Архитектура трансформера-декодировщика¶

Masked Self-Attention Layer¶

Небольшая историческая справка¶

Hugging Face¶

Языковое моделирование¶

Как работает GPT¶

Токенизация¶

Архитектура GPT¶

Positional Encoding¶

Transformer Decoder Block¶

Методы Генерации текста¶

Greedy Search¶

Beam Search¶

Сэмплирование с Температурой¶

Сэмплирование с Ограничением Маловероятных Токенов (Nucleus sampling)¶

Сравнение поколений GPT¶

GPT-1¶

GPT-2¶

GPT-3¶

Файнтюнинг¶

Как происходит обучение¶

Обучающие данные¶