Path: blob/master/finrl/agents/portfolio_optimization/algorithms.py
from __future__ import annotations

import copy

import numpy as np
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm

from .architectures import EIIE
from .utils import apply_portfolio_noise
from .utils import PVM
from .utils import ReplayBuffer
from .utils import RLDataset


class PolicyGradient:
    """Class implementing policy gradient algorithm to train portfolio
    optimization agents.

    Note:
        During testing, the agent is optimized through online learning.
        The parameters of the policy are updated repeatedly after a constant
        period of time. To disable it, set the learning rate to 0.

    Attributes:
        train_env: Environment used to train the agent.
        train_policy: Policy used in training.
        test_env: Environment used to test the agent.
        test_policy: Policy after test online learning.
    """

    def __init__(
        self,
        env,
        policy=EIIE,
        policy_kwargs=None,
        validation_env=None,
        batch_size=100,
        lr=1e-3,
        action_noise=0,
        optimizer=AdamW,
        device="cpu",
    ):
        """Initializes Policy Gradient for portfolio optimization.

        Args:
            env: Training environment.
            policy: Policy architecture to be used.
            policy_kwargs: Arguments to be used in the policy network.
            validation_env: Validation environment.
            batch_size: Batch size to train neural network.
            lr: Policy neural network learning rate.
            action_noise: Noise parameter (between 0 and 1) to be applied
                during training.
            optimizer: Optimizer of neural network.
            device: Device where neural network is run.
        """
        self.policy = policy
        self.policy_kwargs = {} if policy_kwargs is None else policy_kwargs
        self.validation_env = validation_env
        self.batch_size = batch_size
        self.lr = lr
        self.action_noise = action_noise
        self.optimizer = optimizer
        self.device = device
        self._setup_train(env, self.policy, self.batch_size, self.lr, self.optimizer)

    def _setup_train(self, env, policy, batch_size, lr, optimizer):
        """Initializes algorithm before training.

        Args:
            env: Environment.
            policy: Policy architecture to be used.
            batch_size: Batch size to train neural network.
            lr: Policy neural network learning rate.
            optimizer: Optimizer of neural network.
        """
        # environment
        self.train_env = env

        # neural networks
        self.train_policy = policy(**self.policy_kwargs).to(self.device)
        self.train_optimizer = optimizer(self.train_policy.parameters(), lr=lr)

        # replay buffer and portfolio vector memory
        self.train_batch_size = batch_size
        self.train_buffer = ReplayBuffer(capacity=batch_size)
        self.train_pvm = PVM(self.train_env.episode_length, env.portfolio_size)

        # dataset and dataloader
        dataset = RLDataset(self.train_buffer)
        self.train_dataloader = DataLoader(
            dataset=dataset, batch_size=batch_size, shuffle=False, pin_memory=True
        )

    def train(self, episodes=100):
        """Training sequence.

        Args:
            episodes: Number of episodes to simulate.
        """
        for i in tqdm(range(1, episodes + 1)):
            obs = self.train_env.reset()  # observation
            self.train_pvm.reset()  # reset portfolio vector memory
            done = False

            while not done:
                # define last_action and action and update portfolio vector memory
                last_action = self.train_pvm.retrieve()
                obs_batch = np.expand_dims(obs, axis=0)
                last_action_batch = np.expand_dims(last_action, axis=0)
                action = apply_portfolio_noise(
                    self.train_policy(obs_batch, last_action_batch), self.action_noise
                )
                self.train_pvm.add(action)

                # run simulation step
                next_obs, reward, done, info = self.train_env.step(action)

                # add experience to replay buffer
                exp = (obs, last_action, info["price_variation"], info["trf_mu"])
                self.train_buffer.append(exp)

                # update policy networks
                if len(self.train_buffer) == self.train_batch_size:
                    self._gradient_ascent()

                obs = next_obs

            # gradient ascent with episode remaining buffer data
            self._gradient_ascent()

            # validation step
            if self.validation_env:
                self.test(self.validation_env)

    def _setup_test(self, env, policy, batch_size, lr, optimizer):
        """Initializes algorithm before testing.

        Args:
            env: Environment.
            policy: Policy architecture to be used.
            batch_size: Batch size to train neural network.
            lr: Policy neural network learning rate.
            optimizer: Optimizer of neural network.
        """
        # environment
        self.test_env = env

        # process None arguments
        policy = self.train_policy if policy is None else policy
        lr = self.lr if lr is None else lr
        optimizer = self.optimizer if optimizer is None else optimizer

        # neural networks
        # define policy
        self.test_policy = copy.deepcopy(policy).to(self.device)
        self.test_optimizer = optimizer(self.test_policy.parameters(), lr=lr)

        # replay buffer and portfolio vector memory
        self.test_buffer = ReplayBuffer(capacity=batch_size)
        self.test_pvm = PVM(self.test_env.episode_length, env.portfolio_size)

        # dataset and dataloader
        dataset = RLDataset(self.test_buffer)
        self.test_dataloader = DataLoader(
            dataset=dataset, batch_size=batch_size, shuffle=False, pin_memory=True
        )

    def test(
        self, env, policy=None, online_training_period=10, lr=None, optimizer=None
    ):
        """Tests the policy with online learning.

        Args:
            env: Environment to be used in testing.
            policy: Policy architecture to be used. If None, it will use the training
                architecture.
            online_training_period: Number of steps between online training updates.
                To disable online learning, use a very big value.
            lr: Policy neural network learning rate. If None, it will use the training
                learning rate.
            optimizer: Optimizer of neural network. If None, it will use the training
                optimizer.

        Note:
            To disable online learning, set the learning rate to 0 or use a very big
            online training period.
        """
        self._setup_test(env, policy, online_training_period, lr, optimizer)

        obs = self.test_env.reset()  # observation
        self.test_pvm.reset()  # reset portfolio vector memory
        done = False
        steps = 0

        while not done:
            steps += 1
            # define last_action and action and update portfolio vector memory
            last_action = self.test_pvm.retrieve()
            obs_batch = np.expand_dims(obs, axis=0)
            last_action_batch = np.expand_dims(last_action, axis=0)
            action = self.test_policy(obs_batch, last_action_batch)
            self.test_pvm.add(action)

            # run simulation step
            next_obs, reward, done, info = self.test_env.step(action)

            # add experience to replay buffer
            exp = (obs, last_action, info["price_variation"], info["trf_mu"])
            self.test_buffer.append(exp)

            # update policy networks
            if steps % online_training_period == 0:
                self._gradient_ascent(test=True)

            obs = next_obs

    def _gradient_ascent(self, test=False):
        """Performs the gradient ascent step in the policy gradient algorithm.

        Args:
            test: If True, it uses the test dataloader and policy.
        """
        # get batch data from dataloader
        obs, last_actions, price_variations, trf_mu = (
            next(iter(self.test_dataloader))
            if test
            else next(iter(self.train_dataloader))
        )
        obs = obs.to(self.device)
        last_actions = last_actions.to(self.device)
        price_variations = price_variations.to(self.device)
        trf_mu = trf_mu.unsqueeze(1).to(self.device)

        # define policy loss (negative for gradient ascent)
        mu = (
            self.test_policy.mu(obs, last_actions)
            if test
            else self.train_policy.mu(obs, last_actions)
        )
        policy_loss = -torch.mean(
            torch.log(torch.sum(mu * price_variations * trf_mu, dim=1))
        )

        # update policy network
        if test:
            self.test_policy.zero_grad()
            policy_loss.backward()
            self.test_optimizer.step()
        else:
            self.train_policy.zero_grad()
            policy_loss.backward()
            self.train_optimizer.step()
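Below is a minimal usage sketch, not part of the library file. It assumes `train_env` and `test_env` are already-constructed portfolio-optimization environments compatible with this class, i.e. they expose `episode_length` and `portfolio_size` and return `info["price_variation"]` and `info["trf_mu"]` from `step()`; those two environment objects are placeholders here, and the hyperparameter values shown are just the class defaults.

from finrl.agents.portfolio_optimization.algorithms import PolicyGradient
from finrl.agents.portfolio_optimization.architectures import EIIE

# `train_env` and `test_env` are hypothetical, pre-built environments.
agent = PolicyGradient(
    train_env,
    policy=EIIE,          # default policy architecture used by this class
    policy_kwargs=None,   # architecture-specific kwargs can be passed here
    batch_size=100,
    lr=1e-3,
    action_noise=0,       # no noise applied to the portfolio weights
    device="cpu",
)

# Run the policy gradient training loop for a number of episodes.
agent.train(episodes=50)

# Evaluate with online learning: the trained policy is deep-copied and
# fine-tuned every `online_training_period` steps while stepping through
# the test environment.
agent.test(test_env, online_training_period=10)

Internally, `_gradient_ascent` minimizes the negative mean log of the transaction-cost-adjusted portfolio return (the sum of `mu * price_variations * trf_mu` over assets), so each optimizer step performs gradient ascent on the average log-return of the portfolio.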