Import Modules
In [0]:
%pip install pettingzoo
In [0]:
%pip install pygame
In [0]:
from pettingzoo.mpe import simple_reference_v3
from IPython.display import clear_output, display
import matplotlib.pyplot as plt
import time
import random
import numpy as np
import cv2
from collections import deque
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
In [0]:
def sto(env, agents, model, hyperparameter, filepath, data):
    # Save a training checkpoint: the environment, the trained agent object(s),
    # an algorithm label, the hyperparameters, and the logged episode rewards.
    torch.save({
        'env': env,
        'model': model,
        'agents': agents,
        'hp': hyperparameter,
        'data': data
    }, filepath)
    print('Saved successfully!')

def load(filepath):
    # Load a checkpoint written by sto(). Note: recent PyTorch releases default
    # torch.load to weights_only=True, so loading this pickled checkpoint may
    # require torch.load(filepath, weights_only=False).
    tmp = torch.load(filepath)
    return tmp
DQN
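Each agent below learns independently of the others (independent Q-learning, labelled 'IQL' in the checkpoints): every DQNAgent fits its Q-network to the one-step TD target

$$y = r + \gamma\,(1 - d)\,\max_{a'} Q_{\text{target}}(s', a'),$$

computed with its own target network, which is what train_step implements.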
In [0]:
class DQN(nn.Module):
    # Simple MLP Q-network mapping a flattened observation to per-action Q-values.
    def __init__(self, input_shape, num_actions):
        super(DQN, self).__init__()
        # Convolutional front-end kept for reference (unused for the flat MPE observations):
        # input_shape = (input_shape[2], input_shape[0], input_shape[1])
        # self.conv_layers = nn.Sequential(
        #     nn.Conv2d(input_shape[0], 16, kernel_size=8, stride=4),
        #     nn.ReLU(),
        #     nn.Conv2d(16, 32, kernel_size=4, stride=2),
        #     nn.ReLU()
        # )
        # conv_out_size = self._get_conv_out(input_shape)
        self.fc_layers = nn.Sequential(
            nn.Linear(input_shape, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, num_actions)
        )

    def _get_conv_out(self, shape):
        # Only meaningful if the convolutional front-end above is enabled.
        o = self.conv_layers(torch.zeros(1, *shape))
        return int(np.prod(o.size()))

    def forward(self, x):
        # conv_out = self.conv_layers(x).reshape(x.size()[0], -1)
        # return self.fc_layers(conv_out)
        return self.fc_layers(x)
In [0]:
class DQNAgent:
    def __init__(self, input_shape, num_actions, lr=0.0005, gamma=0.99):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.model = DQN(input_shape, num_actions).to(self.device)
        self.target_model = DQN(input_shape, num_actions).to(self.device)
        self.target_model.load_state_dict(self.model.state_dict())
        self.target_model.eval()
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr)
        self.loss_fn = nn.MSELoss()
        self.gamma = gamma

    def update_target_model(self):
        self.target_model.load_state_dict(self.model.state_dict())

    def act(self, obs, epsilon):
        if np.random.rand() < epsilon:
            return np.random.randint(0, self.model.fc_layers[-1].out_features)
        obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0).to(self.device)
        q_values = self.model(obs_tensor)
        return q_values.argmax().item()

    def train_step(self, batch):
        obs_batch = torch.tensor(np.array([transition['obs'] for transition in batch]), dtype=torch.float32).to(self.device)
        actions_batch = torch.tensor([transition['action'] for transition in batch], dtype=torch.int64).to(self.device)
        rewards_batch = torch.tensor([transition['reward'] for transition in batch], dtype=torch.float32).to(self.device)
        next_obs_batch = torch.tensor(np.array([transition['next_obs'] for transition in batch]), dtype=torch.float32).to(self.device)
        dones_batch = torch.tensor([transition['dones'] for transition in batch], dtype=torch.float32).to(self.device)

        q_values = self.model(obs_batch).gather(1, actions_batch.unsqueeze(1)).squeeze(1)
        next_q_values = self.target_model(next_obs_batch).max(1)[0]
        expected_q_values = rewards_batch + self.gamma * next_q_values * (1 - dones_batch)

        loss = self.loss_fn(q_values, expected_q_values.detach())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
In [0]:
def run_DQN(env, device, tmp = None):
    num_agents = env.num_agents
    name_agent = env.agents
    replay_buffer = None
    data = []
    # One independent DQN learner per agent, acting on its flattened observation.
    agents = {agent: DQNAgent(np.prod(env.observation_spaces[agent].shape), env.action_spaces[agent].n) for agent in env.agents}
    if tmp != None:
        # Resume from a checkpoint produced by sto() below.
        assert env.metadata['name'] == tmp['env'].metadata['name']
        assert tmp['model'] == 'IQL'
        for agent in env.agents:
            agents[agent].model.load_state_dict(tmp['agents'][agent].model.state_dict())
            agents[agent].target_model.load_state_dict(tmp['agents'][agent].target_model.state_dict())
        hyperparameter = tmp['hp']
        epsilon_start = hyperparameter['epsilon_start']
        epsilon_end = hyperparameter['epsilon_end']
        epsilon_decay = hyperparameter['epsilon_decay']
        num_episodes = hyperparameter['num_episodes']
        batch_size = hyperparameter['batch_size']
        update_target_frequency = hyperparameter['update_target_frequency']
        replay_buffer_size = hyperparameter['replay_buffer_size']
        data = tmp['data']
    else:
        epsilon_start = 1.0
        epsilon_end = 0.01
        epsilon_decay = 0.995
        num_episodes = 100000
        batch_size = 32
        update_target_frequency = 1000
        replay_buffer_size = 20000
    replay_buffer = deque(maxlen=replay_buffer_size)
    reward_buffer = deque(maxlen=100)
    current_episode = len(data)
    step = -1
    hyperparameter = {
        'epsilon_start': epsilon_start,
        'epsilon_end': epsilon_end,
        'epsilon_decay': epsilon_decay,
        'num_episodes': num_episodes,
        'batch_size': batch_size,
        'update_target_frequency': update_target_frequency,
        'replay_buffer_size': replay_buffer_size,
    }
    for episode in range(current_episode, num_episodes):
        obs, info = env.reset()
        episode_reward = np.zeros(num_agents)
        while env.agents:
            step += 1
            epsilon = max(epsilon_end, epsilon_start * epsilon_decay ** step)
            actions = {}
            for agent in name_agent:
                actions[agent] = agents[agent].act(np.array(obs[agent]).flatten(), epsilon)
            next_obs, rewards, terminations, truncations, infos = env.step(actions)
            transitions = {}
            for i in range(num_agents):
                curAgent = name_agent[i]
                transition = {
                    'obs': np.array(obs[curAgent]).flatten(),
                    'action': actions[curAgent],
                    'reward': rewards[curAgent],
                    'next_obs': np.array(next_obs[curAgent]).flatten(),
                    'dones': terminations[curAgent] or truncations[curAgent]
                }
                if curAgent not in transitions:
                    transitions[curAgent] = {}
                transitions[curAgent] = transition
                episode_reward[i] += rewards[curAgent]
            replay_buffer.append(transitions)
            if len(replay_buffer) >= batch_size:
                for agent in name_agent:
                    batch = [replay_buffer[i][agent] for i in np.random.choice(len(replay_buffer), batch_size, replace=False)]
                    agents[agent].train_step(batch)
            obs = next_obs
        reward_buffer.append(np.mean(episode_reward))
        if (episode + 1) % update_target_frequency == 0:
            for agent in name_agent:
                agents[agent].update_target_model()
        data.append([episode + 1, np.mean(episode_reward)])
        if (episode + 1) % 100 == 0:
            print(f"Episode {episode + 1} - Mean Reward: {np.mean(reward_buffer)}")
            # Checkpoint: the learner dict is stored under 'agents', the 'IQL' label
            # under 'model', matching the resume branch above.
            sto(env, agents, 'IQL', hyperparameter, 'DQN.pt', data)
    return agents, data
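run_DQN checkpoints its progress to DQN.pt every 100 episodes; a later session can resume from that file by loading it and passing it back in through the tmp argument. A minimal sketch, assuming a DQN.pt produced by an earlier run already exists in the working directory:

# Resume IQL training from a previously saved checkpoint (hypothetical file name).
tmp = load('DQN.pt')
env = simple_reference_v3.parallel_env()
env.reset()
agents, data = run_DQN(env, device='cpu', tmp=tmp)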
VDN
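VDN (value decomposition networks) trains a single recurrent network shared by all agents, with each observation tagged by a one-hot role vector, and represents the joint action-value as the sum of the per-agent values,

$$Q_{tot}(\mathbf{o}, \mathbf{a}) = \sum_{i} Q_i(o_i, a_i),$$

so the TD loss in vdn_step below is applied to this summed value rather than to each agent separately.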
In [0]:
class SharedNetwork(nn.Module):
    def __init__(self, input_dim, num_actions):
        super(SharedNetwork, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)  # input size = flattened observation + role one-hot (see VDN.combine)
        self.relu = nn.ReLU()
        self.lstm = nn.LSTM(128, 128, batch_first=True)
        self.fc_value = nn.Linear(128, 1)
        self.fc_advantage = nn.Linear(128, num_actions)

    def forward(self, x, hc):
        x = self.relu(self.fc1(x))
        x, newhc = self.lstm(x.unsqueeze(1), hc)
        # Dueling head: combine the state value with mean-centered advantages.
        value = self.fc_value(x)
        advantage = self.fc_advantage(x)
        q_value = value + (advantage - advantage.mean(dim=-1, keepdim=True))
        return q_value, newhc
In [0]:
class VDN:
    def __init__(self, input_shape, agents, num_actions, lr=0.001, gamma=0.99):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.agents = agents
        # One network shared by all agents; a one-hot "role" vector is appended to each observation.
        self.shared_network = SharedNetwork(input_shape + len(self.agents), num_actions).to(self.device)
        self.target_network = SharedNetwork(input_shape + len(self.agents), num_actions).to(self.device)
        self.target_network.load_state_dict(self.shared_network.state_dict())
        self.target_network.eval()
        self.optimizer = optim.Adam(self.shared_network.parameters(), lr=lr)
        self.loss_fn = nn.MSELoss()
        self.gamma = gamma
        self.hc = {}
        self.role = {}
        role_info = torch.eye(len(agents)).to(self.device)
        cnt = 0
        for agent in agents:
            self.hc[agent] = self.init_hidden_states(1)
            self.role[agent] = role_info[cnt]
            cnt += 1

    def combine(self, obs, agent):
        # Append the agent's one-hot role vector to its flattened observation.
        if torch.is_tensor(obs):
            obs_array = obs.cpu().numpy().flatten()
        else:
            obs_array = np.array(obs).flatten()
        role_tensor = self.role[agent].cpu()
        obs_tensor_cpu = torch.tensor(obs_array, device='cpu')
        return torch.cat((obs_tensor_cpu, role_tensor)).to(self.device, dtype=torch.float32)

    def update_target_network(self):
        self.target_network.load_state_dict(self.shared_network.state_dict())

    def act(self, obs, agent, epsilon):
        if np.random.rand() < epsilon:
            return np.random.randint(0, self.shared_network.fc_advantage.out_features)
        obs_tensor = torch.tensor(obs, dtype=torch.float32).unsqueeze(0).to(self.device)
        q_values, self.hc[agent] = self.shared_network(obs_tensor, self.hc[agent])
        return q_values.argmax().item()

    def reset(self):
        for agent in self.agents:
            self.hc[agent] = self.init_hidden_states(1)

    def vdn_step(self, batch, agents, batch_size):
        # Value decomposition: the joint Q-value is the sum of the per-agent Q-values.
        total_q_values = torch.zeros(batch_size, device=self.device)
        expected_q_values = torch.zeros(batch_size, device=self.device)
        for agent in agents:
            # Each element of `batch` is a dict keyed by agent name (see run_VDN).
            obs_batch = torch.stack([transition[agent]['obs'] for transition in batch]).to(self.device)
            actions_batch = torch.tensor([transition[agent]['action'] for transition in batch], dtype=torch.int64).to(self.device)
            rewards_batch = torch.tensor([transition[agent]['reward'] for transition in batch], dtype=torch.float32).to(self.device)
            next_obs_batch = torch.stack([transition[agent]['next_obs'] for transition in batch]).to(self.device)
            dones_batch = torch.tensor([transition[agent]['dones'] for transition in batch], dtype=torch.float32).to(self.device)

            self.hc[agent] = self.init_hidden_states(batch_size)
            cur_q_values, _ = self.shared_network(obs_batch, self.hc[agent])
            cur_q_values = cur_q_values.squeeze(1).gather(1, actions_batch.unsqueeze(1)).squeeze(1).reshape(-1)
            total_q_values += cur_q_values

            self.hc[agent] = self.init_hidden_states(batch_size)
            cur_next_q_values, _ = self.target_network(next_obs_batch, self.hc[agent])
            cur_next_q_values = cur_next_q_values.max(dim=2)[0].squeeze(0).reshape(-1)
            expected_q_values += rewards_batch + self.gamma * cur_next_q_values * (1 - dones_batch)

        loss = self.loss_fn(total_q_values, expected_q_values.detach())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def init_hidden_states(self, batch_size):
        return (torch.zeros([1, batch_size, 128]).to(self.device),
                torch.zeros([1, batch_size, 128]).to(self.device))
In [0]:
def run_VDN(env, device, tmp = None):
    num_agents = env.num_agents
    name_agent = env.agents
    replay_buffer = None
    data = []
    num_actions = max(space.n for space in env.action_spaces.values())
    # The shared network consumes per-agent observations (plus the role one-hot added
    # in VDN.combine), so size it from the agent observation space.
    input_shape = np.prod(np.array(env.observation_space(name_agent[0]).shape))
    vdnAgents = VDN(input_shape, name_agent, num_actions)
    if tmp != None:
        assert env.metadata['name'] == tmp['env'].metadata['name']
        assert tmp['agents'] == 'VDN'
        vdnAgents.shared_network.load_state_dict(tmp['model'].shared_network.state_dict())
        vdnAgents.target_network.load_state_dict(tmp['model'].target_network.state_dict())
        hyperparameter = tmp['hp']
        epsilon_start = hyperparameter['epsilon_start']
        epsilon_end = hyperparameter['epsilon_end']
        epsilon_decay = hyperparameter['epsilon_decay']
        num_episodes = hyperparameter['num_episodes']
        update_target_frequency = hyperparameter['update_target_frequency']
        replay_buffer_size = hyperparameter['replay_buffer_size']
        data = tmp['data']
    else:
        epsilon_start = 1.0
        epsilon_end = 0.01
        epsilon_decay = 0.995
        num_episodes = 100000
        update_target_frequency = 300
        replay_buffer_size = 20000
    replay_buffer = deque(maxlen=replay_buffer_size)
    current_episode = len(data)
    batch_size = 32
    reward_buffer = deque(maxlen=100)
    step = -1
    hyperparameter = {
        'epsilon_start': epsilon_start,
        'epsilon_end': epsilon_end,
        'epsilon_decay': epsilon_decay,
        'num_episodes': num_episodes,
        'update_target_frequency': update_target_frequency,
        'replay_buffer_size': replay_buffer_size,
    }
    for episode in range(current_episode, num_episodes):
        obs, info = env.reset()
        episode_reward = np.zeros(num_agents)
        vdnAgents.reset()
        while env.agents:
            actions = {}
            step += 1
            epsilon = max(epsilon_end, epsilon_start * epsilon_decay ** step)
            for agent in name_agent:
                actions[agent] = vdnAgents.act(vdnAgents.combine(obs[agent], agent), agent, epsilon)
            next_obs, rewards, terminations, truncations, infos = env.step(actions)
            transitions = {}
            for i in range(num_agents):
                curAgent = name_agent[i]
                transition = {
                    'obs': vdnAgents.combine(obs[curAgent], curAgent),
                    'action': actions[curAgent],
                    'reward': rewards[curAgent],
                    'next_obs': vdnAgents.combine(next_obs[curAgent], curAgent),
                    'dones': terminations[curAgent] or truncations[curAgent]
                }
                if curAgent not in transitions:
                    transitions[curAgent] = {}
                transitions[curAgent] = transition
                episode_reward[i] += rewards[curAgent]
            replay_buffer.append(transitions)
            obs = next_obs
            if len(replay_buffer) > batch_size:
                batch = [replay_buffer[i] for i in np.random.choice(len(replay_buffer), batch_size, replace=False)]
                vdnAgents.vdn_step(batch, name_agent, batch_size)
        reward_buffer.append(np.mean(episode_reward))
        if (episode + 1) % update_target_frequency == 0:
            vdnAgents.update_target_network()
        data.append([episode + 1, np.mean(episode_reward)])
        if (episode + 1) % 100 == 0:
            print(f"Episode {episode + 1} - Mean Reward: {np.mean(episode_reward)}")
            sto(env, 'VDN', vdnAgents, hyperparameter, 'VDN.pt', data)
    return vdnAgents, data
QMIX
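QMIX generalizes VDN: instead of summing the per-agent values, a mixing network conditioned on the global state combines them into the joint value, under the monotonicity constraint

$$\frac{\partial Q_{tot}}{\partial Q_i} \ge 0 \quad \text{for every agent } i,$$

which the Mixer below enforces by taking the absolute value of its hypernetwork-generated weights.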
In [0]:
class QNetwork(nn.Module):
    def __init__(self, input_dim, num_actions, device):
        super(QNetwork, self).__init__()
        self.device = device
        self.num_actions = num_actions
        self.fc1 = nn.Linear(input_dim, 128)
        self.lstm = nn.GRU(128, 64, batch_first=True)  # recurrent layer (a GRU, despite the attribute name)
        self.fc2 = nn.Linear(64, num_actions)

    def forward(self, x, hx, epsilon, predict):
        output_dim = x.shape[0]
        x, hx = self.lstm(self.fc1(x).unsqueeze(1), hx)
        value = self.fc2(x)
        # Epsilon-greedy action selection per sample (exploration disabled when predict=True).
        index_output = []
        for i in range(output_dim):
            if np.random.rand() < epsilon and predict == False:
                index_output.append([np.random.randint(0, self.num_actions)])
            else:
                index_output.append([value[i].argmax(dim=-1).item()])
        index_output_tensor = torch.tensor(index_output, device=self.device).unsqueeze(1)
        selected_values = value.gather(-1, index_output_tensor)
        return selected_values.view(output_dim, 1), hx, index_output
In [0]:
class Mixer(nn.Module):
    def __init__(self, input_dim, state_dim, n_agents, mixing_embed_dim, device):
        super(Mixer, self).__init__()
        self.device = device
        self.embed_dim = mixing_embed_dim
        self.state_dim = state_dim
        self.n_agents = n_agents
        self.hyper_w_1 = nn.Linear(self.state_dim, self.embed_dim * self.n_agents)
        self.hyper_w_final = nn.Linear(self.state_dim, self.embed_dim)
        self.hyper_b_1 = nn.Linear(self.state_dim, self.embed_dim)
        self.V = nn.Sequential(
            nn.Linear(self.state_dim, self.embed_dim),
            nn.ReLU(),
            nn.Linear(self.embed_dim, 1),
        )

    def forward(self, agent_qs, state):
        batch_size = agent_qs.size(0)
        agent_qs = agent_qs.view(-1, 1, self.n_agents)
        # Absolute values keep the mixing weights non-negative (QMIX monotonicity constraint).
        w1 = torch.abs(self.hyper_w_1(state))
        w1 = w1.reshape(-1, self.n_agents, self.embed_dim)
        b1 = self.hyper_b_1(state).view(-1, 1, self.embed_dim)
        x = F.elu(torch.bmm(agent_qs, w1) + b1)
        w_final = torch.abs(self.hyper_w_final(state)).view(-1, self.embed_dim, 1)
        q_tot = torch.bmm(x, w_final) + self.V(state).view(-1, 1, 1)
        q_tot = q_tot.view(batch_size, -1)
        return q_tot
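A quick shape check for the mixer, using made-up sizes (4 sampled transitions, 2 agents, a 10-dimensional global state): it should map per-agent Q-values of shape (batch, n_agents), together with a (batch, state_dim) state, to a (batch, 1) joint value.

# Hypothetical sizes, only to confirm the tensor shapes the mixer expects and returns.
mixer = Mixer(input_dim=10, state_dim=10, n_agents=2, mixing_embed_dim=32, device='cpu')
agent_qs = torch.randn(4, 2)          # one Q-value per agent for each sampled transition
state = torch.randn(4, 10)            # flattened global state
print(mixer(agent_qs, state).shape)   # torch.Size([4, 1])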
In [0]:
class QMIXNetwork(nn.Module):
    def __init__(self, input_shape, state_dim, n_agents, num_action, device):
        super(QMIXNetwork, self).__init__()
        self.device = device
        self.agentLayer = QNetwork(input_shape, num_action, device)
        self.mixer = Mixer(input_shape, state_dim, n_agents, 32, device)

    def forward(self, obs, hidden, epsilon, state, agents, predict):
        agent_qs = []
        new_hidden_states = []
        for agent in agents:
            agent_q, new_hidden_state, _ = self.agentLayer(obs[agent], hidden[agent], epsilon, predict)
            agent_qs.append(agent_q)
            new_hidden_states.append(new_hidden_state)
        agent_qs = torch.cat(agent_qs, dim=1)
        q_tot = self.mixer(agent_qs, state)
        return q_tot

    def get_action(self, obs, hidden, epsilon, predict):
        data = self.agentLayer(obs, hidden, epsilon, predict)
        return data[2][0][0], data[1]
In [0]:
class QMIX:
    def __init__(self, input_shape, state_dim, agents, num_actions, lr=0.0005, gamma=0.99):
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.agents = agents
        self.train_network = QMIXNetwork(input_shape + len(self.agents), state_dim, len(agents), num_actions, self.device).to(self.device)
        self.target_network = QMIXNetwork(input_shape + len(self.agents), state_dim, len(agents), num_actions, self.device).to(self.device)
        self.target_network.load_state_dict(self.train_network.state_dict())
        self.target_network.eval()
        self.optimizer = optim.Adam(self.train_network.parameters(), lr=lr)
        self.loss_fn = nn.MSELoss()
        self.gamma = gamma
        self.hx = {}
        self.role = {}
        role_info = torch.eye(len(agents)).to(self.device)
        cnt = 0
        for agent in agents:
            self.hx[agent] = self.init_hidden_states(1)
            self.role[agent] = role_info[cnt]
            cnt += 1

    def combine(self, obs, agent):
        # Append the agent's one-hot role vector to its flattened observation.
        if torch.is_tensor(obs):
            obs_array = obs.cpu().numpy().flatten()
        else:
            obs_array = np.array(obs).flatten()
        role_tensor = self.role[agent].cpu()
        obs_tensor_cpu = torch.tensor(obs_array, device='cpu')
        return torch.cat((obs_tensor_cpu, role_tensor)).to(self.device, dtype=torch.float32)

    def update_target_network(self):
        self.target_network.load_state_dict(self.train_network.state_dict())

    def reset(self):
        for agent in self.agents:
            self.hx[agent] = self.init_hidden_states(1)

    def qmix_step(self, batch, agents, batch_size, epsilon):
        obs_batch = {}
        next_obs_batch = {}
        for agent in agents:
            obs_batch[agent] = torch.stack([transition[agent]['obs'] for transition in batch[:]]).to(self.device)
            next_obs_batch[agent] = torch.stack([transition[agent]['next_obs'] for transition in batch[:]]).to(self.device)
        state_batch = torch.stack([transitions['state'] for transitions in batch]).to(self.device)
        next_state_batch = torch.stack([transitions['next_state'] for transitions in batch]).to(self.device)
        rewards_batch = torch.tensor([transition['rewards'] for transition in batch], dtype=torch.float32).to(self.device)
        dones_batch = torch.tensor([transition['dones'] for transition in batch], dtype=torch.float32).to(self.device)

        for agent in self.agents:
            self.hx[agent] = self.init_hidden_states(batch_size)
        batch_q_tot = self.train_network(obs_batch, self.hx, epsilon, state_batch, self.agents, True)

        for agent in self.agents:
            self.hx[agent] = self.init_hidden_states(batch_size)
        target_q_tot = self.target_network(next_obs_batch, self.hx, epsilon, next_state_batch, self.agents, True)
        target_q_tot = rewards_batch + self.gamma * target_q_tot * (1 - dones_batch)

        batch_q_tot = batch_q_tot.expand_as(target_q_tot)
        loss = self.loss_fn(batch_q_tot, target_q_tot.detach())
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def init_hidden_states(self, batch_size):
        return torch.zeros([1, batch_size, 64]).to(self.device)
In [0]:
def run_QMIX(env, device, tmp = None):
    num_agents = env.num_agents
    name_agent = env.agents
    replay_buffer = None
    data = []
    num_actions = max(space.n for space in env.action_spaces.values())
    # Per-agent input: flattened observation + previous action (the role one-hot is added in QMIX.combine).
    input_shape = np.prod(np.array(env.observation_space(name_agent[0]).shape)) + 1
    state_dim = len(np.array(env.state()).flatten())
    qmixAgents = QMIX(input_shape, state_dim, name_agent, num_actions)
    if tmp != None:
        assert env.metadata['name'] == tmp['env'].metadata['name']
        assert tmp['agents'] == 'QMIX'
        # Restore both the online and the target mixing networks from the checkpoint.
        qmixAgents.train_network.load_state_dict(tmp['model'].train_network.state_dict())
        qmixAgents.target_network.load_state_dict(tmp['model'].target_network.state_dict())
        hyperparameter = tmp['hp']
        epsilon_start = hyperparameter['epsilon_start']
        epsilon_end = hyperparameter['epsilon_end']
        epsilon_decay = hyperparameter['epsilon_decay']
        num_episodes = hyperparameter['num_episodes']
        update_target_frequency = hyperparameter['update_target_frequency']
        replay_buffer_size = hyperparameter['replay_buffer_size']
        data = tmp['data']
    else:
        epsilon_start = 1.0
        epsilon_end = 0.01
        epsilon_decay = 0.995
        num_episodes = 100000
        update_target_frequency = 300
        replay_buffer_size = 20000
    replay_buffer = deque(maxlen=replay_buffer_size)
    batch_size = 32
    current_episode = len(data)
    reward_buffer = deque(maxlen=100)
    step = -1
    hyperparameter = {
        'epsilon_start': epsilon_start,
        'epsilon_end': epsilon_end,
        'epsilon_decay': epsilon_decay,
        'num_episodes': num_episodes,
        'update_target_frequency': update_target_frequency,
        'replay_buffer_size': replay_buffer_size,
    }
    for episode in range(current_episode, num_episodes):
        obs, info = env.reset()
        episode_reward = np.zeros(num_agents)
        prev_actions = {}
        for agent in name_agent:
            prev_actions[agent] = 0
        while env.agents:
            step += 1
            epsilon = max(epsilon_end, epsilon_start * epsilon_decay ** step)
            actions = {}
            newObs = {}
            qmixAgents.reset()
            for agent in name_agent:
                newObs[agent] = np.concatenate((np.array(obs[agent]).flatten(), [prev_actions[agent]]))
                actions[agent], qmixAgents.hx[agent] = qmixAgents.train_network.get_action(
                    qmixAgents.combine(newObs[agent], agent).unsqueeze(0), qmixAgents.hx[agent], epsilon, False)
            cur_state = env.state()
            next_obs, rewards, terminations, truncations, infos = env.step(actions)
            next_state = env.state()
            transitions = {'state': torch.flatten(torch.tensor(cur_state, dtype=torch.float32)),
                           'next_state': torch.flatten(torch.tensor(next_state, dtype=torch.float32)),
                           'rewards': [],
                           'dones': []}
            for agent in name_agent:
                transitions['rewards'].append(rewards[agent])
                transitions['dones'].append(terminations[agent] or truncations[agent])
            for i in range(num_agents):
                curAgent = name_agent[i]
                new_next_obs = np.concatenate((np.array(next_obs[curAgent]).flatten(), [actions[curAgent]]))
                transition = {
                    'obs': qmixAgents.combine(newObs[curAgent], curAgent),
                    'next_obs': qmixAgents.combine(new_next_obs, curAgent),
                }
                if curAgent not in transitions:
                    transitions[curAgent] = {}
                transitions[curAgent] = transition
                episode_reward[i] += rewards[curAgent]
            replay_buffer.append(transitions)
            obs = next_obs
            prev_actions = actions
            if len(replay_buffer) > batch_size:
                batch = [replay_buffer[i] for i in np.random.choice(len(replay_buffer), batch_size, replace=False)]
                qmixAgents.qmix_step(batch, name_agent, batch_size, epsilon)
        reward_buffer.append(np.mean(episode_reward))
        if (episode + 1) % update_target_frequency == 0:
            qmixAgents.update_target_network()
        data.append([episode + 1, np.mean(episode_reward)])
        if (episode + 1) % 100 == 0:
            print(f"Episode {episode + 1} - Mean Reward: {np.mean(reward_buffer)}")
            sto(env, 'QMIX', qmixAgents, hyperparameter, 'QMIX.pt', data)
    return qmixAgents, data
Env
In [0]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")
In [0]:
env = simple_reference_v3.parallel_env(render_mode="human")  # render_mode=None skips the window and trains faster
observations, infos = env.reset()
#tmp = load('luu.pt')
agents, data = run_QMIX(env, device=device)
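Once the call above has returned (or been resumed from a checkpoint), the learned policy can be watched by rolling it out with exploration switched off. A minimal sketch, assuming agents holds the trained QMIX wrapper returned by run_QMIX:

# Greedy rollout of the trained QMIX agents (epsilon=0 with predict=True disables exploration).
eval_env = simple_reference_v3.parallel_env(render_mode="human")
obs, infos = eval_env.reset()
agents.reset()                                   # fresh recurrent hidden states
prev_actions = {agent: 0 for agent in eval_env.agents}
while eval_env.agents:
    actions = {}
    for agent in eval_env.agents:
        new_obs = np.concatenate((np.array(obs[agent]).flatten(), [prev_actions[agent]]))
        actions[agent], agents.hx[agent] = agents.train_network.get_action(
            agents.combine(new_obs, agent).unsqueeze(0), agents.hx[agent], 0.0, True)
    obs, rewards, terminations, truncations, infos = eval_env.step(actions)
    prev_actions = actions
eval_env.close()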
In [0]:
episodes = [item[0] for item in data]
mean_rewards = [item[1] for item in data]
In [0]:
plt.plot(episodes, mean_rewards, label='Mean Reward')
plt.xlabel('Episode')
plt.ylabel('Mean Reward')
plt.title('Mean Reward per Episode')
plt.legend()
plt.grid(True)
plt.show()
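The per-episode curve is typically very noisy; a short sketch of a 100-episode moving average (the window size is an arbitrary choice) that can be plotted instead:

# Smooth the per-episode rewards with a simple moving average before plotting.
window = 100
if len(mean_rewards) >= window:
    smoothed = np.convolve(mean_rewards, np.ones(window) / window, mode='valid')
    plt.plot(episodes[window - 1:], smoothed, label=f'{window}-episode moving average')
    plt.xlabel('Episode')
    plt.ylabel('Mean Reward')
    plt.legend()
    plt.grid(True)
    plt.show()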