from IPython.display import HTML
from base64 import b64encode

# Source: https://wayve.ai/wp-content/uploads/2022/06/ezgif.com-gif-maker1.mp4
!wget -qN https://edunet.kea.su/repo/EduNet-web_dependencies/dev-2.0/L15/rl_control_system.mp4

mp4 = open("rl_control_system.mp4", "rb").read()
data_url = f"data:video/mp4;base64,{b64encode(mp4).decode()}"
HTML(f"<video width=1000 controls><source src={data_url} type='video/mp4'></video>")


!pip install -q gymnasium

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 953.9/953.9 kB 8.6 MB/s eta 0:00:00


import gym as openaigym
import gymnasium as gym

env = gym.make("MountainCar-v0")


from IPython.display import clear_output


# Action and observation space
action_space = env.action_space
obs_space = env.observation_space

clear_output()

print(f"The action space: {action_space}")
print(f"The observation space: {obs_space}")

The action space: Discrete(3)
The observation space: Box([-1.2  -0.07], [0.6  0.07], (2,), float32)


print("Upper Bound for Env Observation", env.observation_space.high)
print("Lower Bound for Env Observation", env.observation_space.low)

Upper Bound for Env Observation [0.6  0.07]
Lower Bound for Env Observation [-1.2  -0.07]


print(type(env.action_space))
print(type(env.observation_space))

<class 'gymnasium.spaces.discrete.Discrete'>
<class 'gymnasium.spaces.box.Box'>


random_action = env.action_space.sample()  # random number from 0 to 2
print(random_action)

1


# Reset the environment and see the initial observation
obs, info = env.reset()
print(f"The initial observation is {obs}")

# Take the action and get the new observation space
new_obs, reward, done, truncated, info = env.step(random_action)
print(f"The new observation is {new_obs}")

The initial observation is [-0.57499045  0.        ]
The new observation is [-5.7460654e-01  3.8391253e-04]


import random
import numpy as np
from gym import Env, spaces


np.random.seed(42)


class MultiArmedBanditEnv(Env):
    def __init__(self, n=4):
        """
        n - number of arms in the bandit
        """
        self.num_bandits = n
        self.action_space = spaces.Discrete(self.num_bandits)
        self.observation_space = spaces.Discrete(1)  # the reward of the last action
        self.bandit_success_prob = np.array(
            [0.5, 0.1, 0.9, 0.2]
        )  # success probabilities
        self.bandit_reward = np.array([2, 20, 1, 3])

    def step(self, action):
        done = True
        result_prob = np.random.random()
        if result_prob < self.bandit_success_prob[action]:
            reward = self.bandit_reward[action]
        else:
            reward = 0
        return reward, done


def argmax(x):
    return np.random.choice(np.flatnonzero(x == x.max()))


class GreedyAgent:
    def __init__(self):
        """
        Our agent initializes reward estimates as zeros.
        This estimates will be updated incrementally after each
        interaction with the environment.
        """
        self.reward_estimates = np.zeros(4)
        self.action_count = np.zeros(4)
        self.cache = 1000  # initial amount of coins agent posesses

    def get_action(self):
        # Pay 1 coin for the action
        self.cache -= 1

        # Select greedy action
        action = argmax(self.reward_estimates)

        # Add a 1 to action selected in the action count
        self.action_count[action] += 1

        return action

    def update_estimates(self, reward, action):
        # Update amount of cache by reward from our previuos action
        self.cache += reward

        # Compute the difference between the received rewards vs the reward estimates
        error = reward - self.reward_estimates[action]

        # Update the reward estimate incementally
        n = self.action_count[action]
        self.reward_estimates[action] += (1 / n) * error


import matplotlib.pyplot as plt


env = MultiArmedBanditEnv()
agent = GreedyAgent()

mean_reward_greedy = []
rewards = []

while (agent.cache > 0) and (len(mean_reward_greedy) != 10000):
    act = agent.get_action()
    reward, done = env.step(act)
    agent.update_estimates(reward, act)
    rewards.append(reward)
    mean_reward_greedy.append(sum(rewards) / (len(rewards)))

print(f"Action counts: {agent.action_count}")
print(f"Reward estimates: {agent.reward_estimates}")
plt.plot(mean_reward_greedy)
plt.show()

Action counts: [    0.     0. 10000.     0.]
Reward estimates: [0.     0.     0.9051 0.    ]


class EpsilonGreedyAgent(GreedyAgent):
    def __init__(self, epsilon):
        GreedyAgent.__init__(self)
        # Store the epsilon value
        assert epsilon >= 0 and epsilon <= 1
        self.epsilon = epsilon

    def get_action(self):
        # We need to redefine this function so that it takes an exploratory action with epsilon probability

        # Pay 1 coin for the action
        self.cache -= 1
        # One hot encoding: 0 if exploratory, 1 otherwise
        action_type = int(np.random.random() > self.epsilon)
        # Generate both types of actions for every experiment
        exploratory_action = np.random.randint(4)
        greedy_action = argmax(self.reward_estimates)
        # Use the one hot encoding to mask the actions for each experiment
        action = greedy_action * action_type + exploratory_action * (1 - action_type)

        self.action_count[action] += 1

        return action


agent = EpsilonGreedyAgent(epsilon=0.25)

mean_reward_e_greedy = []
rewards = []

while (agent.cache > 0) and (len(mean_reward_e_greedy) != 10000):
    act = agent.get_action()
    reward, done = env.step(act)
    agent.update_estimates(reward, act)
    rewards.append(reward)
    mean_reward_e_greedy.append(sum(rewards) / len(rewards))
    # Decrease epsilon with time until we reach 0.01 chanse to perform exploratory action
    if agent.epsilon > 0.01:
        agent.epsilon -= 0.0001

print(f"Action counts: {agent.action_count}")
print(f"Reward estimates: {agent.reward_estimates}")
plt.plot(mean_reward_greedy)
plt.plot(mean_reward_e_greedy)
plt.show()

Action counts: [  91. 9683.  137.   89.]
Reward estimates: [0.96703297 2.00970774 0.8540146  0.80898876]


from numpy.random import choice
from scipy.special import softmax


class SoftmaxAgent(GreedyAgent):
    def __init__(self):
        GreedyAgent.__init__(self)

    def get_action(self):
        # We need to redefine this function so that it takes an action with probability pi(a,s)

        # Pay 1 coin for the action
        self.cache -= 1

        # get \pi(a_i|s):
        pi = softmax(self.reward_estimates)
        # sample action
        action = choice(np.arange(0, 4, 1), 1, p=pi)[0]

        self.action_count[action] += 1

        return action


agent = SoftmaxAgent()

mean_reward_softmax_agent = []
rewards = []

while (agent.cache > 0) and (len(mean_reward_softmax_agent) != 10000):
    act = agent.get_action()
    reward, done = env.step(act)
    agent.update_estimates(reward, act)
    rewards.append(reward)
    mean_reward_softmax_agent.append(sum(rewards) / len(rewards))

clear_output()

print(f"Action counts: {agent.action_count}")
print(f"Reward estimates: {agent.reward_estimates}")
plt.plot(mean_reward_greedy)
plt.plot(mean_reward_e_greedy)
plt.plot(mean_reward_softmax_agent)
plt.show()

Action counts: [1766. 5262. 1660. 1312.]
Reward estimates: [0.99207248 2.05245154 0.89578313 0.59222561]


import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)


import gym
from IPython.display import clear_output

env = gym.make("MountainCarContinuous-v0")
env.seed(42)
clear_output()

print("observation space:", env.observation_space)
print("action space:", env.action_space)
print("  - low:", env.action_space.low)
print("  - high:", env.action_space.high)

observation space: Box([-1.2  -0.07], [0.6  0.07], (2,), float32)
action space: Box(-1.0, 1.0, (1,), float32)
  - low: [-1.]
  - high: [1.]


import math
import torch.nn as nn
import torch.nn.functional as F


class Agent(nn.Module):
    def __init__(self, env, h_size=16):
        super(Agent, self).__init__()
        self.env = env
        # state, hidden layer, action sizes
        self.s_size = env.observation_space.shape[0]
        self.h_size = h_size
        self.a_size = env.action_space.shape[0]
        # define layers (we used 2 layers)
        self.fc1 = nn.Linear(self.s_size, self.h_size)
        self.fc2 = nn.Linear(self.h_size, self.a_size)

    def set_weights(self, weights):
        s_size = self.s_size
        h_size = self.h_size
        a_size = self.a_size
        # separate the weights for each layer
        fc1_end = (s_size * h_size) + h_size
        fc1_W = torch.from_numpy(weights[: s_size * h_size].reshape(s_size, h_size))
        fc1_b = torch.from_numpy(weights[s_size * h_size : fc1_end])
        fc2_W = torch.from_numpy(
            weights[fc1_end : fc1_end + (h_size * a_size)].reshape(h_size, a_size)
        )
        fc2_b = torch.from_numpy(weights[fc1_end + (h_size * a_size) :])
        # set the weights for each layer
        self.fc1.weight.data.copy_(fc1_W.view_as(self.fc1.weight.data))
        self.fc1.bias.data.copy_(fc1_b.view_as(self.fc1.bias.data))
        self.fc2.weight.data.copy_(fc2_W.view_as(self.fc2.weight.data))
        self.fc2.bias.data.copy_(fc2_b.view_as(self.fc2.bias.data))

    def get_weights_dim(self):
        return (self.s_size + 1) * self.h_size + (self.h_size + 1) * self.a_size

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        return x.cpu().data

    def act(self, state):
        state = torch.from_numpy(state).float().to(device)
        with torch.no_grad():
            action = self.forward(state)
        return action

    def evaluate(self, weights, gamma=1.0, max_t=5000):
        self.set_weights(weights)
        episode_return = 0.0
        state = self.env.reset()
        for t in range(max_t):
            state = torch.from_numpy(state).float().to(device)
            action = self.forward(state)
            state, reward, done, _ = self.env.step(action)
            episode_return += reward * math.pow(gamma, t)
            if done:
                break
        return episode_return


clear_output()


import numpy as np
from collections import deque
from tqdm.notebook import tqdm


np.random.seed(42)


def cem(
    agent,
    n_iterations=5,
    max_t=1000,
    gamma=1.0,
    print_every=10,
    pop_size=50,
    elite_frac=0.2,
    sigma=0.5,
):
    """PyTorch implementation of the cross-entropy method.

    Params
    ======
        Agent (object): agent instance
        n_iterations (int): maximum number of training iterations
        max_t (int): maximum number of timesteps per episode
        gamma (float): discount rate
        print_every (int): how often to print average score (over last 100 episodes)
        pop_size (int): size of population at each iteration
        elite_frac (float): percentage of top performers to use in update
        sigma (float): standard deviation of additive noise
    """
    n_elite = int(pop_size * elite_frac)

    scores_deque = deque(maxlen=100)
    scores = []
    # Initialize the weight with random noise
    best_weight = sigma * np.random.randn(agent.get_weights_dim())

    for i_iteration in tqdm(range(1, n_iterations + 1)):
        # Define the cadidates and get the reward of each candidate
        weights_pop = [
            best_weight + (sigma * np.random.randn(agent.get_weights_dim()))
            for i in range(pop_size)
        ]
        rewards = np.array(
            [agent.evaluate(weights, gamma, max_t) for weights in weights_pop]
        )

        # Select best candidates from collected rewards
        elite_idxs = rewards.argsort()[-n_elite:]
        elite_weights = [weights_pop[i] for i in elite_idxs]
        best_weight = np.array(elite_weights).mean(axis=0)

        reward = agent.evaluate(best_weight, gamma=1.0)
        scores_deque.append(reward)
        scores.append(reward)

        torch.save(agent.state_dict(), "checkpoint.pth")

        if i_iteration % print_every == 0:
            print(
                "Episode {}\tAverage Score: {:.2f}".format(
                    i_iteration, np.mean(scores_deque)
                )
            )

        if np.mean(scores_deque) >= 90.0:
            print(
                "\nEnvironment solved in {:d} iterations!\tAverage Score: {:.2f}".format(
                    i_iteration - 100, np.mean(scores_deque)
                )
            )
            break
    return scores


np.bool8 = np.bool_
agent = Agent(env).to(device)
scores = cem(agent)

  0%|          | 0/5 [00:00<?, ?it/s]


import matplotlib.pyplot as plt

# Plot the scores
fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(1, len(scores) + 1), scores)
plt.ylabel("Score")
plt.xlabel("Episode #")
plt.show()


!mkdir -p video


import glob
import base64, io
from IPython import display
from IPython.display import HTML
from gym.wrappers.monitoring import video_recorder


def show_video(env_name):
    mp4list = glob.glob("video/*.mp4")
    if len(mp4list) > 0:
        mp4 = "video/{}.mp4".format(env_name)
        video = io.open(mp4, "r+b").read()
        encoded = base64.b64encode(video)
        display.display(
            HTML(
                data="""<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>""".format(
                    encoded.decode("ascii")
                )
            )
        )
    else:
        print("Could not find video")


def show_video_of_model(agent, env_name):
    env = gym.make(env_name)
    vid = video_recorder.VideoRecorder(env, path="video/{}.mp4".format(env_name))
    agent.load_state_dict(torch.load("checkpoint.pth"))
    state = env.reset()
    done = False
    while not done:
        vid.capture_frame()

        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        state = next_state
        if done:
            break
    env.close()


agent = Agent(env).to(device)
show_video_of_model(agent, "MountainCarContinuous-v0")
clear_output()


clear_output()
print("Необученная модель:")
show_video("MountainCarContinuous-v0")

Необученная модель:


!wget -qN https://edunet.kea.su/repo/EduNet-content/dev-2.0/L15/weights/checkpoint.pth


agent = Agent(env).to(device)
show_video_of_model(agent, "MountainCarContinuous-v0")
clear_output()


clear_output()
print("Обученная модель:")
show_video("MountainCarContinuous-v0")

Обученная модель:


from IPython.display import clear_output


# After this setup we can import mdp package
!wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash
!wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week02_value_based/mdp.py
!touch .setup_complete

clear_output()


from mdp import MDP

transition_probs = {
    "s0": {"a0": {"s0": 0.5, "s2": 0.5}, "a1": {"s2": 1}},
    "s1": {"a0": {"s0": 0.7, "s1": 0.1, "s2": 0.2}, "a1": {"s1": 0.95, "s2": 0.05}},
    "s2": {"a0": {"s0": 0.4, "s2": 0.6}, "a1": {"s0": 0.3, "s1": 0.3, "s2": 0.4}},
}
rewards = {"s1": {"a0": {"s0": +5}}, "s2": {"a1": {"s0": -1}}}

mdp = MDP(transition_probs, rewards, initial_state="s0")

# We can now use MDP just as any other gym environment:
print("initial state =", mdp.reset())
next_state, reward, done, info = mdp.step("a1")
print("next_state = %s, reward = %s, done = %s" % (next_state, reward, done))

initial state = s0
next_state = s2, reward = 0.0, done = False


# MDP methods

print("mdp.get_all_states =", mdp.get_all_states())
print("mdp.get_possible_actions('s1') = ", mdp.get_possible_actions("s1"))
print("mdp.get_next_states('s1', 'a0') = ", mdp.get_next_states("s1", "a0"))

# state, action, next_state
print("mdp.get_reward('s1', 'a0', 's0') = ", mdp.get_reward("s1", "a0", "s0"))

# get_transition_prob(self, state, action, next_state)
print(
    "mdp.get_transition_prob('s1', 'a0', 's0') = ",
    mdp.get_transition_prob("s1", "a0", "s0"),
)

mdp.get_all_states = ('s0', 's1', 's2')
mdp.get_possible_actions('s1') =  ('a0', 'a1')
mdp.get_next_states('s1', 'a0') =  {'s0': 0.7, 's1': 0.1, 's2': 0.2}
mdp.get_reward('s1', 'a0', 's0') =  5
mdp.get_transition_prob('s1', 'a0', 's0') =  0.7


from IPython.display import display_jpeg
import mdp as MDP
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

display_jpeg(MDP.plot_graph(mdp))


def get_action_value(mdp, state_values, state, action, gamma):
    """
    Computes Q(s,a) as in formula above

    mdp : MDP object
    state_values : dictionayry of { state_i : V_i }
    state: string id of current state
    gamma: float discount coeff

    """

    next_states = mdp.get_next_states(state, action)

    Q = 0.0

    for next_state in next_states.keys():
        # alternatively p = mdp.get_transition_prob(state, action, next_state)
        p = next_states[next_state]
        Q += p * (
            mdp.get_reward(state, action, next_state) + gamma * state_values[next_state]
        )
    return Q


def get_new_state_value(mdp, state_values, state, gamma):
    """Computes next V(s) as in formula above. Please do not change state_values in process."""
    if mdp.is_terminal(state):
        return 0  # Game over

    q_max = float("-inf")
    actions = mdp.get_possible_actions(state)
    for a in actions:
        q = get_action_value(mdp, state_values, state, a, gamma)
        q_max = max(q_max, q)
    return q_max


# parameters
gamma = 0.9  # discount for MDP
num_iter = 100  # maximum iterations, excluding initialization
# stop VI if new values are this close to old values (or closer)
min_difference = 0.001

# initialize V(s)
state_values = {s: 0 for s in mdp.get_all_states()}

display_jpeg(MDP.plot_graph_with_state_values(mdp, state_values))


for i in range(num_iter):
    # Compute new state values using the functions you defined above.
    # It must be a dict {state : float V_new(state)}

    new_state_values = {}
    for s in state_values.keys():
        new_state_values[s] = get_new_state_value(mdp, state_values, s, gamma)

    # Compute difference
    diff = max(abs(new_state_values[s] - state_values[s]) for s in mdp.get_all_states())
    print("iter %4i   |   diff: %6.5f   |   " % (i, diff), end="")
    print("   ".join("V(%s) = %.3f" % (s, v) for s, v in state_values.items()))
    state_values = new_state_values

    if diff < min_difference:
        print("Terminated")
        break

iter    0   |   diff: 3.50000   |   V(s0) = 0.000   V(s1) = 0.000   V(s2) = 0.000
iter    1   |   diff: 0.64500   |   V(s0) = 0.000   V(s1) = 3.500   V(s2) = 0.000
iter    2   |   diff: 0.58050   |   V(s0) = 0.000   V(s1) = 3.815   V(s2) = 0.645
iter    3   |   diff: 0.43582   |   V(s0) = 0.581   V(s1) = 3.959   V(s2) = 0.962
iter    4   |   diff: 0.30634   |   V(s0) = 0.866   V(s1) = 4.395   V(s2) = 1.272
iter    5   |   diff: 0.27571   |   V(s0) = 1.145   V(s1) = 4.670   V(s2) = 1.579
iter    6   |   diff: 0.24347   |   V(s0) = 1.421   V(s1) = 4.926   V(s2) = 1.838
iter    7   |   diff: 0.21419   |   V(s0) = 1.655   V(s1) = 5.169   V(s2) = 2.075
iter    8   |   diff: 0.19277   |   V(s0) = 1.868   V(s1) = 5.381   V(s2) = 2.290
iter    9   |   diff: 0.17327   |   V(s0) = 2.061   V(s1) = 5.573   V(s2) = 2.481
iter   10   |   diff: 0.15569   |   V(s0) = 2.233   V(s1) = 5.746   V(s2) = 2.654
iter   11   |   diff: 0.14012   |   V(s0) = 2.389   V(s1) = 5.902   V(s2) = 2.810
iter   12   |   diff: 0.12610   |   V(s0) = 2.529   V(s1) = 6.042   V(s2) = 2.950
iter   13   |   diff: 0.11348   |   V(s0) = 2.655   V(s1) = 6.168   V(s2) = 3.076
iter   14   |   diff: 0.10213   |   V(s0) = 2.769   V(s1) = 6.282   V(s2) = 3.190
iter   15   |   diff: 0.09192   |   V(s0) = 2.871   V(s1) = 6.384   V(s2) = 3.292
iter   16   |   diff: 0.08272   |   V(s0) = 2.963   V(s1) = 6.476   V(s2) = 3.384
iter   17   |   diff: 0.07445   |   V(s0) = 3.045   V(s1) = 6.558   V(s2) = 3.467
iter   18   |   diff: 0.06701   |   V(s0) = 3.120   V(s1) = 6.633   V(s2) = 3.541
iter   19   |   diff: 0.06031   |   V(s0) = 3.187   V(s1) = 6.700   V(s2) = 3.608
iter   20   |   diff: 0.05428   |   V(s0) = 3.247   V(s1) = 6.760   V(s2) = 3.668
iter   21   |   diff: 0.04885   |   V(s0) = 3.301   V(s1) = 6.814   V(s2) = 3.723
iter   22   |   diff: 0.04396   |   V(s0) = 3.350   V(s1) = 6.863   V(s2) = 3.771
iter   23   |   diff: 0.03957   |   V(s0) = 3.394   V(s1) = 6.907   V(s2) = 3.815
iter   24   |   diff: 0.03561   |   V(s0) = 3.434   V(s1) = 6.947   V(s2) = 3.855
iter   25   |   diff: 0.03205   |   V(s0) = 3.469   V(s1) = 6.982   V(s2) = 3.891
iter   26   |   diff: 0.02884   |   V(s0) = 3.502   V(s1) = 7.014   V(s2) = 3.923
iter   27   |   diff: 0.02596   |   V(s0) = 3.530   V(s1) = 7.043   V(s2) = 3.951
iter   28   |   diff: 0.02336   |   V(s0) = 3.556   V(s1) = 7.069   V(s2) = 3.977
iter   29   |   diff: 0.02103   |   V(s0) = 3.580   V(s1) = 7.093   V(s2) = 4.001
iter   30   |   diff: 0.01892   |   V(s0) = 3.601   V(s1) = 7.114   V(s2) = 4.022
iter   31   |   diff: 0.01703   |   V(s0) = 3.620   V(s1) = 7.133   V(s2) = 4.041
iter   32   |   diff: 0.01533   |   V(s0) = 3.637   V(s1) = 7.150   V(s2) = 4.058
iter   33   |   diff: 0.01380   |   V(s0) = 3.652   V(s1) = 7.165   V(s2) = 4.073
iter   34   |   diff: 0.01242   |   V(s0) = 3.666   V(s1) = 7.179   V(s2) = 4.087
iter   35   |   diff: 0.01117   |   V(s0) = 3.678   V(s1) = 7.191   V(s2) = 4.099
iter   36   |   diff: 0.01006   |   V(s0) = 3.689   V(s1) = 7.202   V(s2) = 4.110
iter   37   |   diff: 0.00905   |   V(s0) = 3.699   V(s1) = 7.212   V(s2) = 4.121
iter   38   |   diff: 0.00815   |   V(s0) = 3.708   V(s1) = 7.221   V(s2) = 4.130
iter   39   |   diff: 0.00733   |   V(s0) = 3.717   V(s1) = 7.230   V(s2) = 4.138
iter   40   |   diff: 0.00660   |   V(s0) = 3.724   V(s1) = 7.237   V(s2) = 4.145
iter   41   |   diff: 0.00594   |   V(s0) = 3.731   V(s1) = 7.244   V(s2) = 4.152
iter   42   |   diff: 0.00534   |   V(s0) = 3.736   V(s1) = 7.249   V(s2) = 4.158
iter   43   |   diff: 0.00481   |   V(s0) = 3.742   V(s1) = 7.255   V(s2) = 4.163
iter   44   |   diff: 0.00433   |   V(s0) = 3.747   V(s1) = 7.260   V(s2) = 4.168
iter   45   |   diff: 0.00390   |   V(s0) = 3.751   V(s1) = 7.264   V(s2) = 4.172
iter   46   |   diff: 0.00351   |   V(s0) = 3.755   V(s1) = 7.268   V(s2) = 4.176
iter   47   |   diff: 0.00316   |   V(s0) = 3.758   V(s1) = 7.271   V(s2) = 4.179
iter   48   |   diff: 0.00284   |   V(s0) = 3.762   V(s1) = 7.275   V(s2) = 4.183
iter   49   |   diff: 0.00256   |   V(s0) = 3.764   V(s1) = 7.277   V(s2) = 4.185
iter   50   |   diff: 0.00230   |   V(s0) = 3.767   V(s1) = 7.280   V(s2) = 4.188
iter   51   |   diff: 0.00207   |   V(s0) = 3.769   V(s1) = 7.282   V(s2) = 4.190
iter   52   |   diff: 0.00186   |   V(s0) = 3.771   V(s1) = 7.284   V(s2) = 4.192
iter   53   |   diff: 0.00168   |   V(s0) = 3.773   V(s1) = 7.286   V(s2) = 4.194
iter   54   |   diff: 0.00151   |   V(s0) = 3.775   V(s1) = 7.288   V(s2) = 4.196
iter   55   |   diff: 0.00136   |   V(s0) = 3.776   V(s1) = 7.289   V(s2) = 4.197
iter   56   |   diff: 0.00122   |   V(s0) = 3.778   V(s1) = 7.291   V(s2) = 4.199
iter   57   |   diff: 0.00110   |   V(s0) = 3.779   V(s1) = 7.292   V(s2) = 4.200
iter   58   |   diff: 0.00099   |   V(s0) = 3.780   V(s1) = 7.293   V(s2) = 4.201
Terminated


display_jpeg(MDP.plot_graph_with_state_values(mdp, state_values))


def get_optimal_action(mdp, state_values, state, gamma=0.9):
    """Finds optimal action using formula above."""
    if mdp.is_terminal(state):
        return None

    best_action = None
    q_max = float("-inf")
    actions = mdp.get_possible_actions(state)
    for a in actions:
        q = get_action_value(mdp, state_values, state, a, gamma)
        if q > q_max:
            best_action = a
            q_max = q

    return best_action


display_jpeg(
    MDP.plot_graph_optimal_strategy_and_state_values(
        mdp, state_values, get_action_value
    )
)


import numpy as np

# Measure agent's average reward

s = mdp.reset()
rewards = []
for _ in range(10000):
    s, r, done, _ = mdp.step(get_optimal_action(mdp, state_values, s, gamma))
    rewards.append(r)

print("average reward: ", np.mean(rewards))

assert 0.40 < np.mean(rewards) < 0.55

average reward:  0.4532


from IPython.display import clear_output
import os

################################################
# For CartPole
################################################

!wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash

!wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/atari_wrappers.py
!wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/utils.py
!wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/replay_buffer.py
!wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/framebuffer.py

!pip install swig
!pip install gym[box2d]

!touch .setup_complete


# This code creates a virtual display to draw game images on.
# It will have no effect if your machine has a monitor.
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY")) == 0:
    !bash ../xvfb start
    os.environ["DISPLAY"] = ":1"

clear_output()


import gym

ENV_NAME = "CartPole-v1"


def make_env(seed=None):
    # some envs are wrapped with a time limit wrapper by default
    env = gym.make(ENV_NAME, render_mode="rgb_array", new_step_api=True).unwrapped
    if seed is not None:
        env.seed(seed)
    return env


import matplotlib.pyplot as plt
import numpy as np
from warnings import simplefilter

simplefilter("ignore", category=DeprecationWarning)

env = make_env()
env.reset()
env_img = np.squeeze(env.render())
clear_output()
plt.imshow(env_img)
state_shape, n_actions = env.observation_space.shape, env.action_space.n


import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# those who have a GPU but feel unfair to use it can uncomment:
# device = torch.device('cpu')

print(state_shape)

(4,)


from torch import nn


class DQNAgent(nn.Module):
    def __init__(self, state_shape, n_actions, epsilon=0):
        super().__init__()
        self.epsilon = epsilon
        self.n_actions = n_actions
        self.state_shape = state_shape
        # Define your network body here. Please make sure agent is fully contained here
        assert len(state_shape) == 1
        state_dim = state_shape[0]

        # Define NN
        ##############################################
        hidden_size = 150
        self._nn = nn.Sequential(
            nn.Linear(state_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
            nn.ReLU(),
        )
        ##############################################

    def forward(self, state_t):
        """
        takes agent's observation (tensor), returns qvalues (tensor)
        :param state_t: a batch states, shape = [batch_size, *state_dim=4]
        """
        # Use your network to compute qvalues for given state

        ##############################################
        qvalues = self._nn(state_t)
        ##############################################

        assert qvalues.requires_grad, "qvalues must be a torch tensor with grad"
        assert (
            len(qvalues.shape) == 2
            and qvalues.shape[0] == state_t.shape[0]
            and qvalues.shape[1] == n_actions
        )

        return qvalues

    def get_qvalues(self, states):
        """
        like forward, but works on numpy arrays, not tensors
        """
        model_device = next(self.parameters()).device
        states = torch.tensor(states, device=model_device, dtype=torch.float32)
        qvalues = self.forward(states)
        return qvalues.data.cpu().numpy()

    def sample_actions(self, qvalues):
        """pick actions given qvalues. Uses epsilon-greedy exploration strategy."""
        epsilon = self.epsilon
        batch_size, n_actions = qvalues.shape

        random_actions = np.random.choice(n_actions, size=batch_size)
        best_actions = qvalues.argmax(axis=-1)

        should_explore = np.random.choice([0, 1], batch_size, p=[1 - epsilon, epsilon])
        return np.where(should_explore, random_actions, best_actions)


agent = DQNAgent(state_shape, n_actions, epsilon=0.5).to(device)


def evaluate(env, agent, n_games=1, greedy=False, t_max=10000):
    """Plays n_games full games. If greedy, picks actions as argmax(qvalues). Returns mean reward."""
    rewards = []
    for _ in range(n_games):
        s = env.reset()
        reward = 0
        for _ in range(t_max):
            qvalues = agent.get_qvalues([s])
            action = (
                qvalues.argmax(axis=-1)[0]
                if greedy
                else agent.sample_actions(qvalues)[0]
            )
            s, r, done, _, _ = env.step(action)
            reward += r
            if done:
                break

        rewards.append(reward)
    return np.mean(rewards)


from replay_buffer import ReplayBuffer

exp_replay = ReplayBuffer(2000)


target_network = DQNAgent(agent.state_shape, agent.n_actions, epsilon=0.5).to(device)
# This is how you can load weights from agent into target network
target_network.load_state_dict(agent.state_dict())

<All keys matched successfully>


def compute_td_loss(
    states,
    actions,
    rewards,
    next_states,
    is_done,
    agent,
    target_network,
    gamma=0.99,
    check_shapes=False,
    device=device,
):
    """Compute td loss using torch operations only. Use the formulae above."""
    states = torch.tensor(
        states, device=device, dtype=torch.float32
    )  # shape: [batch_size, *state_shape]
    actions = torch.tensor(
        actions, device=device, dtype=torch.int64
    )  # shape: [batch_size]
    rewards = torch.tensor(
        rewards, device=device, dtype=torch.float32
    )  # shape: [batch_size]
    # shape: [batch_size, *state_shape]
    next_states = torch.tensor(next_states, device=device, dtype=torch.float)
    is_done = torch.tensor(
        is_done.astype("float32"),
        device=device,
        dtype=torch.float32,
    )  # shape: [batch_size]
    is_not_done = 1 - is_done

    # get q-values for all actions in current states
    predicted_qvalues = agent(states)  # shape: [batch_size, n_actions]

    # compute q-values for all actions in next states
    # with torch.no_grad():
    predicted_next_qvalues = target_network(
        next_states
    )  # shape: [batch_size, n_actions]

    # select q-values for chosen actions
    predicted_qvalues_for_actions = predicted_qvalues[
        range(len(actions)), actions
    ]  # shape: [batch_size]

    # compute V*(next_states) using predicted next q-values
    ##############################################
    next_state_values = predicted_next_qvalues.max(axis=-1)[0]
    ##############################################

    assert (
        next_state_values.dim() == 1 and next_state_values.shape[0] == states.shape[0]
    ), "must predict one value per state"

    # compute "target q-values" for loss - it's what's inside square parentheses in the above formula.
    # at the last state use the simplified formula: Q(s,a) = r(s,a) since s' doesn't exist
    # you can multiply next state values by is_not_done to achieve this.
    ###############################################
    target_qvalues_for_actions = rewards + gamma * next_state_values * is_not_done
    ##############################################

    # mean squared error loss to minimize
    loss = torch.mean(
        (predicted_qvalues_for_actions - target_qvalues_for_actions.detach()) ** 2
    )

    if check_shapes:
        assert (
            predicted_next_qvalues.data.dim() == 2
        ), "make sure you predicted q-values for all actions in next state"
        assert (
            next_state_values.data.dim() == 1
        ), "make sure you computed V(s') as maximum over just the actions axis and not all axes"
        assert (
            target_qvalues_for_actions.data.dim() == 1
        ), "there's something wrong with target q-values, they must be a vector"

    return loss


import random

seed = 42

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

env = make_env()
env.reset(seed=seed)
state_dim = env.observation_space.shape
n_actions = env.action_space.n
state = env.reset()

agent = DQNAgent(state_dim, n_actions, epsilon=1).to(device)
target_network = DQNAgent(state_dim, n_actions, epsilon=1).to(device)
target_network.load_state_dict(agent.state_dict())

<All keys matched successfully>


def play_and_record(initial_state, agent, env, exp_replay, n_steps=1):
    """
    Play the game for exactly n_steps, record every (s,a,r,s', done) to replay buffer.
    Whenever game ends, add record with done=True and reset the game.
    It is guaranteed that env has done=False when passed to this function.

    PLEASE DO NOT RESET ENV UNLESS IT IS "DONE"

    :returns: return sum of rewards over time and the state in which the env stays

    hint: use agent.sample.actions
    """
    s = initial_state
    sum_rewards = 0

    # Play the game for n_steps as per instructions above
    for _ in range(n_steps):
        qvalues = agent.get_qvalues([s])

        action = agent.sample_actions(qvalues)[0]
        # action = action.argmax(axis=-1)[0]
        state, reward, done, _, _ = env.step(action)
        sum_rewards += reward

        exp_replay.add(s, action, reward, state, done)

        if done:
            state = env.reset()

        s = state

    return sum_rewards, s


import utils
from warnings import simplefilter

simplefilter("ignore", category=UserWarning)

REPLAY_BUFFER_SIZE = 10**4

exp_replay = ReplayBuffer(REPLAY_BUFFER_SIZE)
for i in range(100):
    if not utils.is_enough_ram(min_available_gb=0.1):
        print(
            """
            Less than 100 Mb RAM available.
            Make sure the buffer size in not too huge.
            Also check, maybe other processes consume RAM heavily.
            """
        )
        break
    play_and_record(state, agent, env, exp_replay, n_steps=10**2)
    if len(exp_replay) == REPLAY_BUFFER_SIZE:
        break
print(len(exp_replay))

10000


import time

timesteps_per_epoch = 1
batch_size = 32
total_steps = 4 * 10**4
decay_steps = 1 * 10**4

optimizer = torch.optim.Adam(agent.parameters(), lr=1e-4)

init_epsilon = 1
final_epsilon = 0.1

loss_freq = 20
refresh_target_network_freq = 100
eval_freq = 1000

max_grad_norm = 5000

mean_rw_history = []
td_loss_history = []
grad_norm_history = []
initial_state_v_history = []
step = 0


def wait_for_keyboard_interrupt():
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        pass


from tqdm import trange

state = env.reset()
with trange(step, total_steps + 1) as progress_bar:
    for step in progress_bar:
        if not utils.is_enough_ram():
            print("less that 100 Mb RAM available, freezing")
            print("make sure everything is ok and use KeyboardInterrupt to continue")
            wait_for_keyboard_interrupt()

        agent.epsilon = utils.linear_decay(
            init_epsilon, final_epsilon, step, decay_steps
        )

        # play
        _, state = play_and_record(state, agent, env, exp_replay, timesteps_per_epoch)

        # train
        # sample batch_size of data from experience replay
        s, a, r, next_s, is_done = exp_replay.sample(batch_size)
        # loss = compute TD loss
        loss = compute_td_loss(s, a, r, next_s, is_done, agent, target_network)

        loss.backward()
        grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
        optimizer.step()
        optimizer.zero_grad()

        if step % loss_freq == 0:
            td_loss_history.append(loss.data.cpu().item())
            grad_norm_history.append(grad_norm.data.cpu().item())

        if step % refresh_target_network_freq == 0:
            # Load agent weights into target_network
            target_network.load_state_dict(agent.state_dict())

        if step % eval_freq == 0:
            mean_rw_history.append(
                evaluate(make_env(seed=step), agent, n_games=3, greedy=True, t_max=1000)
            )
            initial_state_q_values = agent.get_qvalues([make_env(seed=step).reset()])
            initial_state_v_history.append(np.max(initial_state_q_values))

            clear_output(True)
            print("buffer size = %i, epsilon = %.5f" % (len(exp_replay), agent.epsilon))

            plt.figure(figsize=[16, 9])

            plt.subplot(2, 2, 1)
            plt.title("Mean reward per episode")
            plt.plot(mean_rw_history)
            plt.grid()

            assert not np.isnan(td_loss_history[-1])
            plt.subplot(2, 2, 2)
            plt.title("TD loss history (smoothened)")
            plt.plot(utils.smoothen(td_loss_history))
            plt.grid()

            plt.subplot(2, 2, 3)
            plt.title("Initial state V")
            plt.plot(initial_state_v_history)
            plt.grid()

            plt.subplot(2, 2, 4)
            plt.title("Grad norm history (smoothened)")
            plt.plot(utils.smoothen(grad_norm_history))
            plt.grid()

            plt.show()

buffer size = 10000, epsilon = 0.10000

100%|██████████| 40001/40001 [05:35<00:00, 119.07it/s]


final_score = evaluate(make_env(), agent, n_games=30, greedy=True, t_max=1000)
print("final score:", final_score)
if final_score > 300:
    print("Well done")
else:
    print("not good enough for DQN")

final score: 98.9
not good enough for DQN

Reinforcement learning (RL)¶

Обучение с подкреплением¶

Примеры задач, решаемых с использованием RL¶

Примеры работ с использованием обучения с подкреплением у выпускников курса¶

Устоявшаяся терминология¶

Stateless environment in RL¶

Задача о многоруких бандитах¶

Gymnasium framework¶

Пространства действий и наблюдений¶

Взаимодействие со средой¶

Создание своей среды¶

Поиск оптимальной стратегии решения¶

Жадная стратегия¶

$\epsilon$-жадная стратегия¶

softmax-стратегия¶

Cross-entropy method (CEM)¶

Пример оптимизации недифференцируемого функционала¶

Пример CEM в Gym¶

Марковский процесс принятия решений (Markov decision process, MDP)¶

Markov property¶

Определение Markov Property¶

Markov process¶

Определение¶

Марковский процесс принятия решений¶

Формальное описание MDP¶

Пример¶

Функция награды (reward)¶

Суммарная награда (Return)¶

$\gamma$-коэффициент дисконтирования¶

Проблема поиска оптимальной политики¶

Политика (policy)¶

State-value function $v_{\pi}(s)$ (V-функция)¶

Action-value function $q_{\pi}(s, a)$ (Q-функция)¶

Уравнение Беллмана¶

Поиск оптимальной политики Беллмана для MDP (решение "MDP")¶

Методы динамического программирования¶

Policy iteration¶

Value Iteration¶

Temporal difference (TD)-обучение (TD-learning)¶

Q-Learning¶

Deep Q-Learning¶

Loss¶

Approximate Q-learning¶

Алгоритм обучения¶

Experience replay¶

Target network¶

Пример c CartPole DQN¶

Архитектура сети¶

Experience Replay Buffer and Target Networks¶

TD-Loss¶

Реализация алгоритма¶