AI4Finance-Foundation
GitHub Repository: AI4Finance-Foundation/FinRL
Path: blob/master/finrl/agents/portfolio_optimization/algorithms.py
from __future__ import annotations

import copy

import numpy as np
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm

from .architectures import EIIE
from .utils import apply_portfolio_noise
from .utils import PVM
from .utils import ReplayBuffer
from .utils import RLDataset


class PolicyGradient:
    """Class implementing policy gradient algorithm to train portfolio
    optimization agents.

    Note:
        During testing, the agent is optimized through online learning.
        The parameters of the policy are updated repeatedly after a constant
        period of time. To disable it, set the learning rate to 0.

    Attributes:
        train_env: Environment used to train the agent.
        train_policy: Policy used in training.
        test_env: Environment used to test the agent.
        test_policy: Policy after test online learning.
    """

    def __init__(
        self,
        env,
        policy=EIIE,
        policy_kwargs=None,
        validation_env=None,
        batch_size=100,
        lr=1e-3,
        action_noise=0,
        optimizer=AdamW,
        device="cpu",
    ):
        """Initializes Policy Gradient for portfolio optimization.

        Args:
            env: Training environment.
            policy: Policy architecture to be used.
            policy_kwargs: Arguments to be used in the policy network.
            validation_env: Validation environment.
            batch_size: Batch size to train the neural network.
            lr: Policy neural network learning rate.
            action_noise: Noise parameter (between 0 and 1) to be applied
                during training.
            optimizer: Optimizer of the neural network.
            device: Device where the neural network is run.
        """
        self.policy = policy
        self.policy_kwargs = {} if policy_kwargs is None else policy_kwargs
        self.validation_env = validation_env
        self.batch_size = batch_size
        self.lr = lr
        self.action_noise = action_noise
        self.optimizer = optimizer
        self.device = device
        self._setup_train(env, self.policy, self.batch_size, self.lr, self.optimizer)

    def _setup_train(self, env, policy, batch_size, lr, optimizer):
        """Initializes algorithm before training.

        Args:
            env: Environment.
            policy: Policy architecture to be used.
            batch_size: Batch size to train the neural network.
            lr: Policy neural network learning rate.
            optimizer: Optimizer of the neural network.
        """
        # environment
        self.train_env = env

        # neural networks
        self.train_policy = policy(**self.policy_kwargs).to(self.device)
        self.train_optimizer = optimizer(self.train_policy.parameters(), lr=lr)

        # replay buffer and portfolio vector memory
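        # Note: the PVM (portfolio vector memory) keeps the portfolio weight vector
        # chosen at every time step of the episode, so that the previous weights can
        # be fed back to the policy as ``last_action`` on the next step; this appears
        # to follow the portfolio-vector-memory idea of the EIIE framework
        # (Jiang et al., 2017). The replay buffer holds at most ``batch_size`` recent
        # experiences for the gradient-ascent updates.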
        self.train_batch_size = batch_size
        self.train_buffer = ReplayBuffer(capacity=batch_size)
        self.train_pvm = PVM(self.train_env.episode_length, env.portfolio_size)

        # dataset and dataloader
        dataset = RLDataset(self.train_buffer)
        self.train_dataloader = DataLoader(
            dataset=dataset, batch_size=batch_size, shuffle=False, pin_memory=True
        )

    def train(self, episodes=100):
        """Training sequence.

        Args:
            episodes: Number of episodes to simulate.
        """
        for i in tqdm(range(1, episodes + 1)):
            obs = self.train_env.reset()  # observation
            self.train_pvm.reset()  # reset portfolio vector memory
            done = False

            while not done:
                # define last_action and action and update portfolio vector memory
                last_action = self.train_pvm.retrieve()
                obs_batch = np.expand_dims(obs, axis=0)
                last_action_batch = np.expand_dims(last_action, axis=0)
                action = apply_portfolio_noise(
                    self.train_policy(obs_batch, last_action_batch), self.action_noise
                )
                self.train_pvm.add(action)

                # run simulation step
                next_obs, reward, done, info = self.train_env.step(action)

                # add experience to replay buffer
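                # Each experience stores the observation, the previous portfolio
                # weights, the per-asset price variation over the step, and
                # ``trf_mu`` (presumably the transaction remainder factor, i.e. the
                # fraction of portfolio value left after transaction costs); these
                # are exactly the terms consumed by ``_gradient_ascent``.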
                exp = (obs, last_action, info["price_variation"], info["trf_mu"])
                self.train_buffer.append(exp)

                # update policy networks
                if len(self.train_buffer) == self.train_batch_size:
                    self._gradient_ascent()

                obs = next_obs

            # gradient ascent with episode remaining buffer data
            self._gradient_ascent()

            # validation step
            if self.validation_env:
                self.test(self.validation_env)

    def _setup_test(self, env, policy, batch_size, lr, optimizer):
        """Initializes algorithm before testing.

        Args:
            env: Environment.
            policy: Policy architecture to be used.
            batch_size: Batch size to train the neural network.
            lr: Policy neural network learning rate.
            optimizer: Optimizer of the neural network.
        """
        # environment
        self.test_env = env

        # process None arguments
        policy = self.train_policy if policy is None else policy
        lr = self.lr if lr is None else lr
        optimizer = self.optimizer if optimizer is None else optimizer

        # neural networks
        # define policy
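        # The test policy is a deep copy of the (trained) policy, so online updates
        # performed during testing do not modify the weights of ``self.train_policy``.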
        self.test_policy = copy.deepcopy(policy).to(self.device)
        self.test_optimizer = optimizer(self.test_policy.parameters(), lr=lr)

        # replay buffer and portfolio vector memory
        self.test_buffer = ReplayBuffer(capacity=batch_size)
        self.test_pvm = PVM(self.test_env.episode_length, env.portfolio_size)

        # dataset and dataloader
        dataset = RLDataset(self.test_buffer)
        self.test_dataloader = DataLoader(
            dataset=dataset, batch_size=batch_size, shuffle=False, pin_memory=True
        )

    def test(
        self, env, policy=None, online_training_period=10, lr=None, optimizer=None
    ):
        """Tests the policy with online learning.

        Args:
            env: Environment to be used in testing.
            policy: Policy architecture to be used. If None, it will use the training
                architecture.
            online_training_period: Period in which an online training will occur. To
                disable online learning, use a very big value.
            lr: Policy neural network learning rate. If None, it will use the training
                learning rate.
            optimizer: Optimizer of the neural network. If None, it will use the
                training optimizer.

        Note:
            To disable online learning, set the learning rate to 0 or use a very big
            online training period.
        """
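        # Note: ``online_training_period`` is also used as the replay buffer and
        # dataloader capacity in ``_setup_test``, so each online update is performed
        # on the experiences collected since the previous update.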
        self._setup_test(env, policy, online_training_period, lr, optimizer)

        obs = self.test_env.reset()  # observation
        self.test_pvm.reset()  # reset portfolio vector memory
        done = False
        steps = 0

        while not done:
            steps += 1
            # define last_action and action and update portfolio vector memory
            last_action = self.test_pvm.retrieve()
            obs_batch = np.expand_dims(obs, axis=0)
            last_action_batch = np.expand_dims(last_action, axis=0)
            action = self.test_policy(obs_batch, last_action_batch)
            self.test_pvm.add(action)

            # run simulation step
            next_obs, reward, done, info = self.test_env.step(action)

            # add experience to replay buffer
            exp = (obs, last_action, info["price_variation"], info["trf_mu"])
            self.test_buffer.append(exp)

            # update policy networks
            if steps % online_training_period == 0:
                self._gradient_ascent(test=True)

            obs = next_obs

    def _gradient_ascent(self, test=False):
        """Performs the gradient ascent step in the policy gradient algorithm.

        Args:
            test: If true, it uses the test dataloader and policy.
        """
        # get batch data from dataloader
        obs, last_actions, price_variations, trf_mu = (
            next(iter(self.test_dataloader))
            if test
            else next(iter(self.train_dataloader))
        )
        obs = obs.to(self.device)
        last_actions = last_actions.to(self.device)
        price_variations = price_variations.to(self.device)
        trf_mu = trf_mu.unsqueeze(1).to(self.device)

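        # The loss below is the negative of the average log-return of the portfolio
        # over the sampled steps:
        #
        #     loss = -(1 / N) * sum_t log( trf_mu_t * sum_i( w_{t,i} * y_{t,i} ) )
        #
        # where w_t is the policy output (portfolio weights), y_t the price variation
        # vector and trf_mu_t the transaction factor. Minimizing this loss performs
        # gradient ascent on the average logarithmic return, which appears to follow
        # the EIIE objective of Jiang et al. (2017).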
        # define policy loss (negative for gradient ascent)
        mu = (
            self.test_policy.mu(obs, last_actions)
            if test
            else self.train_policy.mu(obs, last_actions)
        )
        policy_loss = -torch.mean(
            torch.log(torch.sum(mu * price_variations * trf_mu, dim=1))
        )

        # update policy network
        if test:
            self.test_policy.zero_grad()
            policy_loss.backward()
            self.test_optimizer.step()
        else:
            self.train_policy.zero_grad()
            policy_loss.backward()
            self.train_optimizer.step()
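

# ---------------------------------------------------------------------------
# Minimal, self-contained sketch (not part of the original FinRL module): it
# only illustrates, on synthetic tensors, the log-return objective computed in
# ``_gradient_ascent`` above. The batch size, number of assets, and the way the
# synthetic ``price_variations`` / ``trf_mu`` values are generated are arbitrary
# assumptions made for the example. Because this module uses relative imports,
# run it as part of the package (e.g. with ``python -m``) if you want to try it.
if __name__ == "__main__":
    torch.manual_seed(0)
    batch_size, num_assets = 4, 3

    # synthetic portfolio weights (rows sum to 1, as a policy output would)
    mu = torch.softmax(torch.randn(batch_size, num_assets), dim=1)
    # synthetic price variations close to 1 (e.g. 1.01 means +1% over the step)
    price_variations = 1 + 0.01 * torch.randn(batch_size, num_assets)
    # synthetic transaction remainder factors in (0, 1]
    trf_mu = torch.rand(batch_size, 1) * 0.01 + 0.99

    # same expression as in PolicyGradient._gradient_ascent
    policy_loss = -torch.mean(
        torch.log(torch.sum(mu * price_variations * trf_mu, dim=1))
    )
    print(f"illustrative policy loss on synthetic data: {policy_loss.item():.6f}")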