!pip install -q gymnasium

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 953.8/953.8 kB 7.4 MB/s eta 0:00:00


import gymnasium as gym

env = gym.make("MountainCar-v0")


from warnings import simplefilter

simplefilter("ignore", category=DeprecationWarning)


# Action and observation space
action_space = env.action_space
obs_space = env.observation_space

print(f"The action space: {action_space}")
print(f"The observation space: {obs_space}")

The action space: Discrete(3)
The observation space: Box([-1.2  -0.07], [0.6  0.07], (2,), float32)


print("Upper Bound for Env Observation", env.observation_space.high)
print("Lower Bound for Env Observation", env.observation_space.low)

Upper Bound for Env Observation [0.6  0.07]
Lower Bound for Env Observation [-1.2  -0.07]


print(type(env.action_space))
print(type(env.observation_space))

<class 'gymnasium.spaces.discrete.Discrete'>
<class 'gymnasium.spaces.box.Box'>


random_action = env.action_space.sample()  # random number from 0 to 2
print(random_action)

2


# reset the environment and see the initial observation
obs, info = env.reset()
print(f"The initial observation is {obs}")

# Take the action and get the new observation space
new_obs, reward, done, truncated, info = env.step(random_action)
print(f"The new observation is {new_obs}")

The initial observation is [-0.49436864  0.        ]
The new observation is [-0.49358758  0.00078105]


import random
from gym import Env, spaces
import numpy as np

np.random.seed(42)


class MultiArmedBanditEnv(Env):
    def __init__(self, n=4):
        """
        n - number of arms in the bandit
        """
        self.num_bandits = n
        self.action_space = spaces.Discrete(self.num_bandits)
        self.observation_space = spaces.Discrete(1)  # FIXME: just the reward of the last action
        self.bandit_success_prob = np.array(
            [0.5, 0.1, 0.9, 0.2]
        )  # success probabilities
        self.bandit_reward = np.array([2, 20, 1, 3])

    def step(self, action):
        done = True
        result_prob = np.random.random()
        if result_prob < self.bandit_success_prob[action]:
            reward = self.bandit_reward[action]
        else:
            reward = 0
        return reward, done


def argmax(x):
    return np.random.choice(np.flatnonzero(x == x.max()))


class GreedyAgent:
    def __init__(self):
        """
        Our agent initializes reward estimates as zeros.
        This estimates will be updated incrementally after each
        interaction with the environment.
        """
        self.reward_estimates = np.zeros(4)
        self.action_count = np.zeros(4)
        self.cache = 1000  # initial amount of coins agent posesses

    def get_action(self):
        # Pay 1 coin for the action
        self.cache -= 1

        # Select greedy action
        action = argmax(self.reward_estimates)

        # Add a 1 to action selected in the action count
        self.action_count[action] += 1

        return action

    def update_estimates(self, reward, action):
        # Update amount of cache by reward from our previuos action
        self.cache += reward

        # Compute the difference between the received rewards vs the reward estimates
        error = reward - self.reward_estimates[action]

        # Update the reward estimate incementally
        n = self.action_count[action]
        self.reward_estimates[action] += (1 / n) * error


import matplotlib.pyplot as plt

env = MultiArmedBanditEnv()
agent = GreedyAgent()

mean_reward_greedy = []
rewards = []

while (agent.cache > 0) and (len(mean_reward_greedy) != 10000):
    act = agent.get_action()
    reward, done = env.step(act)
    agent.update_estimates(reward, act)
    rewards.append(reward)
    mean_reward_greedy.append(sum(rewards) / (len(rewards)))

print(f"Action counts: {agent.action_count}")
print(f"Reward estimates: {agent.reward_estimates}")
plt.plot(mean_reward_greedy)
plt.show()

Action counts: [    0.     0. 10000.     0.]
Reward estimates: [0.     0.     0.9051 0.    ]


class EpsilonGreedyAgent(GreedyAgent):
    def __init__(self, epsilon):
        GreedyAgent.__init__(self)
        # Store the epsilon value
        assert epsilon >= 0 and epsilon <= 1
        self.epsilon = epsilon

    def get_action(self):
        # We need to redefine this function so that it takes an exploratory action with epsilon probability

        # Pay 1 coin for the action
        self.cache -= 1
        # One hot encoding: 0 if exploratory, 1 otherwise
        action_type = int(np.random.random() > self.epsilon)
        # Generate both types of actions for every experiment
        exploratory_action = np.random.randint(4)
        greedy_action = argmax(self.reward_estimates)
        # Use the one hot encoding to mask the actions for each experiment
        action = greedy_action * action_type + exploratory_action * (1 - action_type)

        self.action_count[action] += 1

        return action


agent = EpsilonGreedyAgent(epsilon=0.25)

mean_reward_e_greedy = []
rewards = []

while (agent.cache > 0) and (len(mean_reward_e_greedy) != 10000):
    act = agent.get_action()
    reward, done = env.step(act)
    agent.update_estimates(reward, act)
    rewards.append(reward)
    mean_reward_e_greedy.append(sum(rewards) / len(rewards))
    # Decrease epsilon with time until we reach 0.01 chanse to perform exploratory action
    if agent.epsilon > 0.01:
        agent.epsilon -= 0.0001

print(f"Action counts: {agent.action_count}")
print(f"Reward estimates: {agent.reward_estimates}")
plt.plot(mean_reward_greedy)
plt.plot(mean_reward_e_greedy)
plt.show()

Action counts: [  91. 9683.  137.   89.]
Reward estimates: [0.96703297 2.00970774 0.8540146  0.80898876]


from IPython.display import clear_output


# After this setup we can import mdp package
!wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash
!wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week02_value_based/mdp.py
!touch .setup_complete

clear_output()


from mdp import MDP

transition_probs = {
    "s0": {"a0": {"s0": 0.5, "s2": 0.5}, "a1": {"s2": 1}},
    "s1": {"a0": {"s0": 0.7, "s1": 0.1, "s2": 0.2}, "a1": {"s1": 0.95, "s2": 0.05}},
    "s2": {"a0": {"s0": 0.4, "s2": 0.6}, "a1": {"s0": 0.3, "s1": 0.3, "s2": 0.4}},
}
rewards = {"s1": {"a0": {"s0": +5}}, "s2": {"a1": {"s0": -1}}}

mdp = MDP(transition_probs, rewards, initial_state="s0")

# We can now use MDP just as any other gym environment:
print("initial state =", mdp.reset())
next_state, reward, done, info = mdp.step("a1")
print("next_state = %s, reward = %s, done = %s" % (next_state, reward, done))

initial state = s0
next_state = s2, reward = 0.0, done = False


# MDP methods

print("mdp.get_all_states =", mdp.get_all_states())
print("mdp.get_possible_actions('s1') = ", mdp.get_possible_actions("s1"))
print("mdp.get_next_states('s1', 'a0') = ", mdp.get_next_states("s1", "a0"))

# state, action, next_state
print("mdp.get_reward('s1', 'a0', 's0') = ", mdp.get_reward("s1", "a0", "s0"))

# get_transition_prob(self, state, action, next_state)
print(
    "mdp.get_transition_prob('s1', 'a0', 's0') = ",
    mdp.get_transition_prob("s1", "a0", "s0"),
)

mdp.get_all_states = ('s0', 's1', 's2')
mdp.get_possible_actions('s1') =  ('a0', 'a1')
mdp.get_next_states('s1', 'a0') =  {'s0': 0.7, 's1': 0.1, 's2': 0.2}
mdp.get_reward('s1', 'a0', 's0') =  5
mdp.get_transition_prob('s1', 'a0', 's0') =  0.7


from IPython.display import display_jpeg
import mdp as MDP
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)

display_jpeg(MDP.plot_graph(mdp))


def get_action_value(mdp, state_values, state, action, gamma):
    """
    Computes Q(s,a) as in formula above

    mdp : MDP object
    state_values : dictionayry of { state_i : V_i }
    state: string id of current state
    gamma: float discount coeff

    """

    next_states = mdp.get_next_states(state, action)

    Q = 0.0

    for next_state in next_states.keys():
        p = next_states[next_state]  # FIXME: alternatively p = mdp.get_transition_prob(state, action, next_state)
        Q += p * (
            mdp.get_reward(state, action, next_state) + gamma * state_values[next_state]
        )
    return Q


def get_new_state_value(mdp, state_values, state, gamma):
    """Computes next V(s) as in formula above. Please do not change state_values in process."""
    if mdp.is_terminal(state):
        return 0  # Game over

    q_max = float("-inf")
    actions = mdp.get_possible_actions(state)
    for a in actions:
        q = get_action_value(mdp, state_values, state, a, gamma)
        q_max = max(q_max, q)
    return q_max


# parameters
gamma = 0.9  # discount for MDP
num_iter = 100  # maximum iterations, excluding initialization
# stop VI if new values are this close to old values (or closer)
min_difference = 0.001

# initialize V(s)
state_values = {s: 0 for s in mdp.get_all_states()}

display_jpeg(MDP.plot_graph_with_state_values(mdp, state_values))


for i in range(num_iter):
    # Compute new state values using the functions you defined above.
    # It must be a dict {state : float V_new(state)}

    new_state_values = {}
    for s in state_values.keys():
        new_state_values[s] = get_new_state_value(mdp, state_values, s, gamma)

    # Compute difference
    diff = max(abs(new_state_values[s] - state_values[s]) for s in mdp.get_all_states())
    print("iter %4i   |   diff: %6.5f   |   " % (i, diff), end="")
    print("   ".join("V(%s) = %.3f" % (s, v) for s, v in state_values.items()))
    state_values = new_state_values

    if diff < min_difference:
        print("Terminated")
        break

iter    0   |   diff: 3.50000   |   V(s0) = 0.000   V(s1) = 0.000   V(s2) = 0.000
iter    1   |   diff: 0.64500   |   V(s0) = 0.000   V(s1) = 3.500   V(s2) = 0.000
iter    2   |   diff: 0.58050   |   V(s0) = 0.000   V(s1) = 3.815   V(s2) = 0.645
iter    3   |   diff: 0.43582   |   V(s0) = 0.581   V(s1) = 3.959   V(s2) = 0.962
iter    4   |   diff: 0.30634   |   V(s0) = 0.866   V(s1) = 4.395   V(s2) = 1.272
iter    5   |   diff: 0.27571   |   V(s0) = 1.145   V(s1) = 4.670   V(s2) = 1.579
iter    6   |   diff: 0.24347   |   V(s0) = 1.421   V(s1) = 4.926   V(s2) = 1.838
iter    7   |   diff: 0.21419   |   V(s0) = 1.655   V(s1) = 5.169   V(s2) = 2.075
iter    8   |   diff: 0.19277   |   V(s0) = 1.868   V(s1) = 5.381   V(s2) = 2.290
iter    9   |   diff: 0.17327   |   V(s0) = 2.061   V(s1) = 5.573   V(s2) = 2.481
iter   10   |   diff: 0.15569   |   V(s0) = 2.233   V(s1) = 5.746   V(s2) = 2.654
iter   11   |   diff: 0.14012   |   V(s0) = 2.389   V(s1) = 5.902   V(s2) = 2.810
iter   12   |   diff: 0.12610   |   V(s0) = 2.529   V(s1) = 6.042   V(s2) = 2.950
iter   13   |   diff: 0.11348   |   V(s0) = 2.655   V(s1) = 6.168   V(s2) = 3.076
iter   14   |   diff: 0.10213   |   V(s0) = 2.769   V(s1) = 6.282   V(s2) = 3.190
iter   15   |   diff: 0.09192   |   V(s0) = 2.871   V(s1) = 6.384   V(s2) = 3.292
iter   16   |   diff: 0.08272   |   V(s0) = 2.963   V(s1) = 6.476   V(s2) = 3.384
iter   17   |   diff: 0.07445   |   V(s0) = 3.045   V(s1) = 6.558   V(s2) = 3.467
iter   18   |   diff: 0.06701   |   V(s0) = 3.120   V(s1) = 6.633   V(s2) = 3.541
iter   19   |   diff: 0.06031   |   V(s0) = 3.187   V(s1) = 6.700   V(s2) = 3.608
iter   20   |   diff: 0.05428   |   V(s0) = 3.247   V(s1) = 6.760   V(s2) = 3.668
iter   21   |   diff: 0.04885   |   V(s0) = 3.301   V(s1) = 6.814   V(s2) = 3.723
iter   22   |   diff: 0.04396   |   V(s0) = 3.350   V(s1) = 6.863   V(s2) = 3.771
iter   23   |   diff: 0.03957   |   V(s0) = 3.394   V(s1) = 6.907   V(s2) = 3.815
iter   24   |   diff: 0.03561   |   V(s0) = 3.434   V(s1) = 6.947   V(s2) = 3.855
iter   25   |   diff: 0.03205   |   V(s0) = 3.469   V(s1) = 6.982   V(s2) = 3.891
iter   26   |   diff: 0.02884   |   V(s0) = 3.502   V(s1) = 7.014   V(s2) = 3.923
iter   27   |   diff: 0.02596   |   V(s0) = 3.530   V(s1) = 7.043   V(s2) = 3.951
iter   28   |   diff: 0.02336   |   V(s0) = 3.556   V(s1) = 7.069   V(s2) = 3.977
iter   29   |   diff: 0.02103   |   V(s0) = 3.580   V(s1) = 7.093   V(s2) = 4.001
iter   30   |   diff: 0.01892   |   V(s0) = 3.601   V(s1) = 7.114   V(s2) = 4.022
iter   31   |   diff: 0.01703   |   V(s0) = 3.620   V(s1) = 7.133   V(s2) = 4.041
iter   32   |   diff: 0.01533   |   V(s0) = 3.637   V(s1) = 7.150   V(s2) = 4.058
iter   33   |   diff: 0.01380   |   V(s0) = 3.652   V(s1) = 7.165   V(s2) = 4.073
iter   34   |   diff: 0.01242   |   V(s0) = 3.666   V(s1) = 7.179   V(s2) = 4.087
iter   35   |   diff: 0.01117   |   V(s0) = 3.678   V(s1) = 7.191   V(s2) = 4.099
iter   36   |   diff: 0.01006   |   V(s0) = 3.689   V(s1) = 7.202   V(s2) = 4.110
iter   37   |   diff: 0.00905   |   V(s0) = 3.699   V(s1) = 7.212   V(s2) = 4.121
iter   38   |   diff: 0.00815   |   V(s0) = 3.708   V(s1) = 7.221   V(s2) = 4.130
iter   39   |   diff: 0.00733   |   V(s0) = 3.717   V(s1) = 7.230   V(s2) = 4.138
iter   40   |   diff: 0.00660   |   V(s0) = 3.724   V(s1) = 7.237   V(s2) = 4.145
iter   41   |   diff: 0.00594   |   V(s0) = 3.731   V(s1) = 7.244   V(s2) = 4.152
iter   42   |   diff: 0.00534   |   V(s0) = 3.736   V(s1) = 7.249   V(s2) = 4.158
iter   43   |   diff: 0.00481   |   V(s0) = 3.742   V(s1) = 7.255   V(s2) = 4.163
iter   44   |   diff: 0.00433   |   V(s0) = 3.747   V(s1) = 7.260   V(s2) = 4.168
iter   45   |   diff: 0.00390   |   V(s0) = 3.751   V(s1) = 7.264   V(s2) = 4.172
iter   46   |   diff: 0.00351   |   V(s0) = 3.755   V(s1) = 7.268   V(s2) = 4.176
iter   47   |   diff: 0.00316   |   V(s0) = 3.758   V(s1) = 7.271   V(s2) = 4.179
iter   48   |   diff: 0.00284   |   V(s0) = 3.762   V(s1) = 7.275   V(s2) = 4.183
iter   49   |   diff: 0.00256   |   V(s0) = 3.764   V(s1) = 7.277   V(s2) = 4.185
iter   50   |   diff: 0.00230   |   V(s0) = 3.767   V(s1) = 7.280   V(s2) = 4.188
iter   51   |   diff: 0.00207   |   V(s0) = 3.769   V(s1) = 7.282   V(s2) = 4.190
iter   52   |   diff: 0.00186   |   V(s0) = 3.771   V(s1) = 7.284   V(s2) = 4.192
iter   53   |   diff: 0.00168   |   V(s0) = 3.773   V(s1) = 7.286   V(s2) = 4.194
iter   54   |   diff: 0.00151   |   V(s0) = 3.775   V(s1) = 7.288   V(s2) = 4.196
iter   55   |   diff: 0.00136   |   V(s0) = 3.776   V(s1) = 7.289   V(s2) = 4.197
iter   56   |   diff: 0.00122   |   V(s0) = 3.778   V(s1) = 7.291   V(s2) = 4.199
iter   57   |   diff: 0.00110   |   V(s0) = 3.779   V(s1) = 7.292   V(s2) = 4.200
iter   58   |   diff: 0.00099   |   V(s0) = 3.780   V(s1) = 7.293   V(s2) = 4.201
Terminated


display_jpeg(MDP.plot_graph_with_state_values(mdp, state_values))


def get_optimal_action(mdp, state_values, state, gamma=0.9):
    """Finds optimal action using formula above."""
    if mdp.is_terminal(state):
        return None

    best_action = None
    q_max = float("-inf")
    actions = mdp.get_possible_actions(state)
    for a in actions:
        q = get_action_value(mdp, state_values, state, a, gamma)
        if q > q_max:
            best_action = a
            q_max = q

    return best_action


display_jpeg(MDP.plot_graph_optimal_strategy_and_state_values(mdp, state_values, get_action_value))


import numpy as np

# Measure agent's average reward

s = mdp.reset()
rewards = []
for _ in range(10000):
    s, r, done, _ = mdp.step(get_optimal_action(mdp, state_values, s, gamma))
    rewards.append(r)

print("average reward: ", np.mean(rewards))

assert 0.40 < np.mean(rewards) < 0.55

average reward:  0.4663


from IPython.display import clear_output
import os

################################################
# For CartPole
################################################

!wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/setup_colab.sh -O- | bash

!wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/atari_wrappers.py
!wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/utils.py
!wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/replay_buffer.py
!wget -q https://raw.githubusercontent.com/yandexdataschool/Practical_RL/master/week04_approx_rl/framebuffer.py

!pip install swig
!pip install gym[box2d]

!touch .setup_complete


# This code creates a virtual display to draw game images on.
# It will have no effect if your machine has a monitor.
if type(os.environ.get("DISPLAY")) is not str or len(os.environ.get("DISPLAY")) == 0:
    !bash ../xvfb start
    os.environ["DISPLAY"] = ":1"

clear_output()


import gym

ENV_NAME = "CartPole-v1"


def make_env(seed=None):
    # some envs are wrapped with a time limit wrapper by default
    env = gym.make(ENV_NAME, render_mode="rgb_array", new_step_api=True).unwrapped
    if seed is not None:
        env.seed(seed)
    return env


import matplotlib.pyplot as plt
import numpy as np

env = make_env()
env.reset()
env_img = np.squeeze(env.render())
plt.imshow(env_img)
state_shape, n_actions = env.observation_space.shape, env.action_space.n


import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# those who have a GPU but feel unfair to use it can uncomment:
# device = torch.device('cpu')

print(state_shape)

(4,)


from torch import nn


class DQNAgent(nn.Module):
    def __init__(self, state_shape, n_actions, epsilon=0):
        super().__init__()
        self.epsilon = epsilon
        self.n_actions = n_actions
        self.state_shape = state_shape
        # Define your network body here. Please make sure agent is fully contained here
        assert len(state_shape) == 1
        state_dim = state_shape[0]

        # Define NN
        ##############################################
        hidden_size = 150
        self._nn = nn.Sequential(
            nn.Linear(state_dim, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, n_actions),
            nn.ReLU(),
        )
        ##############################################

    def forward(self, state_t):
        """
        takes agent's observation (tensor), returns qvalues (tensor)
        :param state_t: a batch states, shape = [batch_size, *state_dim=4]
        """
        # Use your network to compute qvalues for given state

        ##############################################
        qvalues = self._nn(state_t)
        ##############################################

        assert qvalues.requires_grad, "qvalues must be a torch tensor with grad"
        assert (
            len(qvalues.shape) == 2
            and qvalues.shape[0] == state_t.shape[0]
            and qvalues.shape[1] == n_actions
        )

        return qvalues

    def get_qvalues(self, states):
        """
        like forward, but works on numpy arrays, not tensors
        """
        model_device = next(self.parameters()).device
        states = torch.tensor(states, device=model_device, dtype=torch.float32)
        qvalues = self.forward(states)
        return qvalues.data.cpu().numpy()

    def sample_actions(self, qvalues):
        """pick actions given qvalues. Uses epsilon-greedy exploration strategy."""
        epsilon = self.epsilon
        batch_size, n_actions = qvalues.shape

        random_actions = np.random.choice(n_actions, size=batch_size)
        best_actions = qvalues.argmax(axis=-1)

        should_explore = np.random.choice([0, 1], batch_size, p=[1 - epsilon, epsilon])
        return np.where(should_explore, random_actions, best_actions)


agent = DQNAgent(state_shape, n_actions, epsilon=0.5).to(device)


def evaluate(env, agent, n_games=1, greedy=False, t_max=10000):
    """Plays n_games full games. If greedy, picks actions as argmax(qvalues). Returns mean reward."""
    rewards = []
    for _ in range(n_games):
        s = env.reset()
        reward = 0
        for _ in range(t_max):
            qvalues = agent.get_qvalues([s])
            action = (
                qvalues.argmax(axis=-1)[0]
                if greedy
                else agent.sample_actions(qvalues)[0]
            )
            s, r, done, _, _ = env.step(action)
            reward += r
            if done:
                break

        rewards.append(reward)
    return np.mean(rewards)


from replay_buffer import ReplayBuffer

exp_replay = ReplayBuffer(2000)


target_network = DQNAgent(agent.state_shape, agent.n_actions, epsilon=0.5).to(device)
# This is how you can load weights from agent into target network
target_network.load_state_dict(agent.state_dict())

<All keys matched successfully>


def compute_td_loss(
    states,
    actions,
    rewards,
    next_states,
    is_done,
    agent,
    target_network,
    gamma=0.99,
    check_shapes=False,
    device=device,
):
    """Compute td loss using torch operations only. Use the formulae above."""
    states = torch.tensor(
        states, device=device, dtype=torch.float32
    )  # shape: [batch_size, *state_shape]
    actions = torch.tensor(
        actions, device=device, dtype=torch.int64
    )  # shape: [batch_size]
    rewards = torch.tensor(
        rewards, device=device, dtype=torch.float32
    )  # shape: [batch_size]
    # shape: [batch_size, *state_shape]
    next_states = torch.tensor(next_states, device=device, dtype=torch.float)
    is_done = torch.tensor(
        is_done.astype("float32"),
        device=device,
        dtype=torch.float32,
    )  # shape: [batch_size]
    is_not_done = 1 - is_done

    # get q-values for all actions in current states
    predicted_qvalues = agent(states)  # shape: [batch_size, n_actions]

    # compute q-values for all actions in next states
    # with torch.no_grad():
    predicted_next_qvalues = target_network(
        next_states
    )  # shape: [batch_size, n_actions]

    # select q-values for chosen actions
    predicted_qvalues_for_actions = predicted_qvalues[
        range(len(actions)), actions
    ]  # shape: [batch_size]

    # compute V*(next_states) using predicted next q-values
    ##############################################
    next_state_values = predicted_next_qvalues.max(axis=-1)[0]
    ##############################################

    assert (
        next_state_values.dim() == 1 and next_state_values.shape[0] == states.shape[0]
    ), "must predict one value per state"

    # compute "target q-values" for loss - it's what's inside square parentheses in the above formula.
    # at the last state use the simplified formula: Q(s,a) = r(s,a) since s' doesn't exist
    # you can multiply next state values by is_not_done to achieve this.
    ###############################################
    target_qvalues_for_actions = rewards + gamma * next_state_values * is_not_done
    ##############################################

    # mean squared error loss to minimize
    loss = torch.mean(
        (predicted_qvalues_for_actions - target_qvalues_for_actions.detach()) ** 2
    )

    if check_shapes:
        assert (
            predicted_next_qvalues.data.dim() == 2
        ), "make sure you predicted q-values for all actions in next state"
        assert (
            next_state_values.data.dim() == 1
        ), "make sure you computed V(s') as maximum over just the actions axis and not all axes"
        assert (
            target_qvalues_for_actions.data.dim() == 1
        ), "there's something wrong with target q-values, they must be a vector"

    return loss


import random

# your favourite random seed
seed = 42

random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

env = make_env()
env.reset(seed=seed)
state_dim = env.observation_space.shape
n_actions = env.action_space.n
state = env.reset()

agent = DQNAgent(state_dim, n_actions, epsilon=1).to(device)
target_network = DQNAgent(state_dim, n_actions, epsilon=1).to(device)
target_network.load_state_dict(agent.state_dict())

<All keys matched successfully>


def play_and_record(initial_state, agent, env, exp_replay, n_steps=1):
    """
    Play the game for exactly n_steps, record every (s,a,r,s', done) to replay buffer.
    Whenever game ends, add record with done=True and reset the game.
    It is guaranteed that env has done=False when passed to this function.

    PLEASE DO NOT RESET ENV UNLESS IT IS "DONE"

    :returns: return sum of rewards over time and the state in which the env stays

    hint: use agent.sample.actions
    """
    s = initial_state
    sum_rewards = 0

    # Play the game for n_steps as per instructions above
    for _ in range(n_steps):
        qvalues = agent.get_qvalues([s])

        action = agent.sample_actions(qvalues)[0]
        # action = action.argmax(axis=-1)[0]
        state, reward, done, _, _ = env.step(action)
        sum_rewards += reward

        exp_replay.add(s, action, reward, state, done)

        if done:
            state = env.reset()

        s = state

    return sum_rewards, s


import utils
from warnings import simplefilter

simplefilter("ignore", category=UserWarning)

REPLAY_BUFFER_SIZE = 10**4

exp_replay = ReplayBuffer(REPLAY_BUFFER_SIZE)
for i in range(100):
    if not utils.is_enough_ram(min_available_gb=0.1):
        print(
            """
            Less than 100 Mb RAM available.
            Make sure the buffer size in not too huge.
            Also check, maybe other processes consume RAM heavily.
            """
        )
        break
    play_and_record(state, agent, env, exp_replay, n_steps=10**2)
    if len(exp_replay) == REPLAY_BUFFER_SIZE:
        break
print(len(exp_replay))

10000


import time

timesteps_per_epoch = 1
batch_size = 32
total_steps = 4 * 10**4
decay_steps = 1 * 10**4

optimizer = torch.optim.Adam(agent.parameters(), lr=1e-4)

init_epsilon = 1
final_epsilon = 0.1

loss_freq = 20
refresh_target_network_freq = 100
eval_freq = 1000

max_grad_norm = 5000

mean_rw_history = []
td_loss_history = []
grad_norm_history = []
initial_state_v_history = []
step = 0


def wait_for_keyboard_interrupt():
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        pass


from tqdm import trange

state = env.reset()
with trange(step, total_steps + 1) as progress_bar:
    for step in progress_bar:
        if not utils.is_enough_ram():
            print("less that 100 Mb RAM available, freezing")
            print("make sure everything is ok and use KeyboardInterrupt to continue")
            wait_for_keyboard_interrupt()

        agent.epsilon = utils.linear_decay(
            init_epsilon, final_epsilon, step, decay_steps
        )

        # play
        _, state = play_and_record(state, agent, env, exp_replay, timesteps_per_epoch)

        # train
        # sample batch_size of data from experience replay
        s, a, r, next_s, is_done = exp_replay.sample(batch_size)
        # loss = compute TD loss
        loss = compute_td_loss(s, a, r, next_s, is_done, agent, target_network)

        loss.backward()
        grad_norm = nn.utils.clip_grad_norm_(agent.parameters(), max_grad_norm)
        optimizer.step()
        optimizer.zero_grad()

        if step % loss_freq == 0:
            td_loss_history.append(loss.data.cpu().item())
            grad_norm_history.append(grad_norm.data.cpu().item())

        if step % refresh_target_network_freq == 0:
            # Load agent weights into target_network
            target_network.load_state_dict(agent.state_dict())

        if step % eval_freq == 0:
            mean_rw_history.append(
                evaluate(make_env(seed=step), agent, n_games=3, greedy=True, t_max=1000)
            )
            initial_state_q_values = agent.get_qvalues([make_env(seed=step).reset()])
            initial_state_v_history.append(np.max(initial_state_q_values))

            clear_output(True)
            print("buffer size = %i, epsilon = %.5f" % (len(exp_replay), agent.epsilon))

            plt.figure(figsize=[16, 9])

            plt.subplot(2, 2, 1)
            plt.title("Mean reward per episode")
            plt.plot(mean_rw_history)
            plt.grid()

            assert not np.isnan(td_loss_history[-1])
            plt.subplot(2, 2, 2)
            plt.title("TD loss history (smoothened)")
            plt.plot(utils.smoothen(td_loss_history))
            plt.grid()

            plt.subplot(2, 2, 3)
            plt.title("Initial state V")
            plt.plot(initial_state_v_history)
            plt.grid()

            plt.subplot(2, 2, 4)
            plt.title("Grad norm history (smoothened)")
            plt.plot(utils.smoothen(grad_norm_history))
            plt.grid()

            plt.show()

buffer size = 10000, epsilon = 0.10000

100%|██████████| 40001/40001 [07:22<00:00, 90.39it/s]


final_score = evaluate(make_env(), agent, n_games=30, greedy=True, t_max=1000)
print("final score:", final_score)
if final_score > 300:
    print("Well done")
else:
    print("not good enough for DQN")

final score: 99.06666666666666
not good enough for DQN

Как работает обучение с подкреплением¶

Место RL в машинном обучении¶

Обучение с учителем¶

Обучение без учителя¶

Обучение с подкреплением¶

Терминология: агент, функция награды, состояние среды¶

Классические примеры задач RL¶

Особенности и сложности RL¶

Низкая скорость обучения (sample efficiency)¶

Сложное проектирование функции награды¶

Неустойчивость обучения¶

Библиотеки¶

Gym¶

Пространства действий и наблюдений¶

Взаимодействие со средой¶

Создание своей среды¶

Жадная стратегия¶

$\varepsilon$-жадная стратегия¶

Markov property¶

Определение Markov Property¶

Markov process¶

Определение¶

Матрица состояний¶

Transition matrix¶

Награда (Reward)¶

Суммарная награда (Return)¶

Дисконтирование (discounting)¶

Markov reward process (MRP)¶

Марковский процесс принятия решений¶

Формальное описание MDP¶

Пример¶

Нахождение лучшей последовательности переходов¶

Определение Политики¶

Value function¶

Определение Value Function¶

Уравнение Беллмана¶

Определение Optimal Value Function¶

Bellman equation¶

Нахождение оптимальной политики Беллмана¶

Политика не обязана быть оптимальной¶

Policy iteration¶

Value Iteration¶

Траектории MDP¶

Q-Learning¶

Deep Q-Learning¶

Loss¶

Approximate Q-learning¶

Алгоритм обучения¶

Experience replay¶

Target network¶

Пример c CartPole DQN¶

Архитектура сети¶

Experience Replay Buffer and Target Networks¶

TD-Loss¶

Main loop¶

Проблемы Q-обучения¶

Дальнейшие идеи¶

Другие улучшения DQN¶

Double Q-learning (NIPS 2010)¶

Альтернативные подходы¶