# Source: AI4Finance-Foundation/FinRL
# Path: blob/master/finrl/meta/paper_trading/common.py
# Disclaimer: Nothing herein is financial advice, and NOT a recommendation to trade real money. Many platforms exist for simulated trading (paper trading) which can be used for building and developing the methods discussed. Please use common sense and always first consult a professional before trading or investing.
# -----------------------------------------------------------------------------------------------------------------------------------------
# Import related modules
from __future__ import annotations

import os
import time

import gym
import numpy as np
import torch
import torch.nn as nn
from torch import Tensor
from torch.distributions.normal import Normal


# -----------------------------------------------------------------------------------------------------------------------------------------
# PPO


class ActorPPO(nn.Module):
    def __init__(self, dims: list[int], state_dim: int, action_dim: int):
        super().__init__()
        self.net = build_mlp(dims=[state_dim, *dims, action_dim])
        self.action_std_log = nn.Parameter(
            torch.zeros((1, action_dim)), requires_grad=True
        )  # trainable parameter

    def forward(self, state: Tensor) -> Tensor:
        return self.net(state).tanh()  # action.tanh()

    def get_action(self, state: Tensor) -> tuple[Tensor, Tensor]:  # for exploration
        action_avg = self.net(state)
        action_std = self.action_std_log.exp()

        dist = Normal(action_avg, action_std)
        action = dist.sample()
        logprob = dist.log_prob(action).sum(1)
        return action, logprob

    def get_logprob_entropy(self, state: Tensor, action: Tensor) -> tuple[Tensor, Tensor]:
        action_avg = self.net(state)
        action_std = self.action_std_log.exp()

        dist = Normal(action_avg, action_std)
        logprob = dist.log_prob(action).sum(1)
        entropy = dist.entropy().sum(1)
        return logprob, entropy

    @staticmethod
    def convert_action_for_env(action: Tensor) -> Tensor:
        return action.tanh()


class CriticPPO(nn.Module):
    def __init__(self, dims: list[int], state_dim: int, _action_dim: int):
        super().__init__()
        self.net = build_mlp(dims=[state_dim, *dims, 1])

    def forward(self, state: Tensor) -> Tensor:
        return self.net(state)  # advantage value


def build_mlp(dims: list[int]) -> nn.Sequential:  # MLP (MultiLayer Perceptron)
    net_list = []
    for i in range(len(dims) - 1):
        net_list.extend([nn.Linear(dims[i], dims[i + 1]), nn.ReLU()])
    del net_list[-1]  # remove the activation of the output layer
    return nn.Sequential(*net_list)


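# For reference (comment only): build_mlp(dims=[3, 64, 32, 1]) produces
#     nn.Sequential(nn.Linear(3, 64), nn.ReLU(), nn.Linear(64, 32), nn.ReLU(), nn.Linear(32, 1))
# i.e. ReLU between hidden layers and no activation on the output layer. ActorPPO and
# CriticPPO above request dims=[state_dim, *net_dims, action_dim] and dims=[state_dim, *net_dims, 1].

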
class Config:
    def __init__(self, agent_class=None, env_class=None, env_args=None):
        self.env_class = env_class  # env = env_class(**env_args)
        self.env_args = env_args  # env = env_class(**env_args)

        if env_args is None:  # dummy env_args
            env_args = {
                "env_name": None,
                "state_dim": None,
                "action_dim": None,
                "if_discrete": None,
            }
        self.env_name = env_args["env_name"]  # the name of the environment; used to set 'cwd'
        self.state_dim = env_args["state_dim"]  # vector dimension (feature number) of the state
        self.action_dim = env_args["action_dim"]  # vector dimension (feature number) of the action
        self.if_discrete = env_args["if_discrete"]  # discrete or continuous action space

        self.agent_class = agent_class  # agent = agent_class(...)

        """Arguments for reward shaping"""
        self.gamma = 0.99  # discount factor of future rewards
        self.reward_scale = 1.0  # reward scaling factor; the scaled episode return is usually kept close to 256

        """Arguments for training"""
        self.gpu_id = int(0)  # `int` means the ID of a single GPU, -1 means CPU
        self.net_dims = (64, 32)  # the hidden layer dimensions of the MLP (MultiLayer Perceptron)
        self.learning_rate = 6e-5  # 2 ** -14 ~= 6e-5
        self.soft_update_tau = 5e-3  # 2 ** -8 ~= 5e-3
        self.batch_size = int(128)  # number of transitions sampled from the replay buffer
        self.horizon_len = int(2000)  # collect horizon_len steps while exploring, then update the network
        self.buffer_size = None  # ReplayBuffer size. Empty the ReplayBuffer for on-policy algorithms.
        self.repeat_times = 8.0  # repeatedly update the network using the ReplayBuffer to keep the critic's loss small

        """Arguments for evaluation"""
        self.cwd = None  # current working directory to save the model. None means it is set automatically.
        self.break_step = +np.inf  # break training if 'total_step > break_step'
        self.eval_times = int(32)  # number of episodes used to estimate the cumulative return
        self.eval_per_step = int(2e4)  # evaluate the agent every `eval_per_step` training steps

    def init_before_training(self):
        if self.cwd is None:  # set cwd (current working directory) for saving the model
            self.cwd = f"./{self.env_name}_{self.agent_class.__name__[5:]}"
        os.makedirs(self.cwd, exist_ok=True)


def get_gym_env_args(env, if_print: bool) -> dict:
    if {"unwrapped", "observation_space", "action_space", "spec"}.issubset(
        dir(env)
    ):  # isinstance(env, gym.Env):
        env_name = env.unwrapped.spec.id
        state_shape = env.observation_space.shape
        state_dim = (
            state_shape[0] if len(state_shape) == 1 else state_shape
        )  # sometimes state_dim is a list

        if_discrete = isinstance(env.action_space, gym.spaces.Discrete)
        if if_discrete:  # make sure it is a discrete action space
            action_dim = env.action_space.n
        elif isinstance(
            env.action_space, gym.spaces.Box
        ):  # make sure it is a continuous action space
            action_dim = env.action_space.shape[0]

    env_args = {
        "env_name": env_name,
        "state_dim": state_dim,
        "action_dim": action_dim,
        "if_discrete": if_discrete,
    }
    if if_print:
        print(f"env_args = {repr(env_args)}")
    return env_args


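# A minimal usage sketch (comment only, not executed here): for the classic
# Pendulum environment the call below is expected to print the dict shown,
# assuming a standard gym installation.
#
# env_args = get_gym_env_args(gym.make("Pendulum-v1"), if_print=True)
# # env_args = {'env_name': 'Pendulum-v1', 'state_dim': 3, 'action_dim': 1, 'if_discrete': False}

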
def kwargs_filter(function, kwargs: dict) -> dict:
    import inspect

    sign = inspect.signature(function).parameters.values()
    sign = {val.name for val in sign}
    common_args = sign.intersection(kwargs.keys())
    return {key: kwargs[key] for key in common_args}  # filtered kwargs


def build_env(env_class=None, env_args=None):
    if env_class.__module__ == "gym.envs.registration":  # special rule
        env = env_class(id=env_args["env_name"])
    else:
        env = env_class(**kwargs_filter(env_class.__init__, env_args.copy()))
    for attr_str in ("env_name", "state_dim", "action_dim", "if_discrete"):
        setattr(env, attr_str, env_args[attr_str])
    return env


class AgentBase:
    def __init__(
        self,
        net_dims: list[int],
        state_dim: int,
        action_dim: int,
        gpu_id: int = 0,
        args: Config = Config(),
    ):
        self.state_dim = state_dim
        self.action_dim = action_dim

        self.gamma = args.gamma
        self.batch_size = args.batch_size
        self.repeat_times = args.repeat_times
        self.reward_scale = args.reward_scale
        self.soft_update_tau = args.soft_update_tau

        self.states = None  # assert self.states.shape == (1, state_dim)
        self.device = torch.device(
            f"cuda:{gpu_id}" if (torch.cuda.is_available() and (gpu_id >= 0)) else "cpu"
        )

        act_class = getattr(self, "act_class", None)
        cri_class = getattr(self, "cri_class", None)
        self.act = self.act_target = act_class(net_dims, state_dim, action_dim).to(
            self.device
        )
        self.cri = self.cri_target = (
            cri_class(net_dims, state_dim, action_dim).to(self.device)
            if cri_class
            else self.act
        )

        self.act_optimizer = torch.optim.Adam(self.act.parameters(), args.learning_rate)
        self.cri_optimizer = (
            torch.optim.Adam(self.cri.parameters(), args.learning_rate)
            if cri_class
            else self.act_optimizer
        )

        self.criterion = torch.nn.SmoothL1Loss()

    @staticmethod
    def optimizer_update(optimizer, objective: Tensor):
        optimizer.zero_grad()
        objective.backward()
        optimizer.step()

    @staticmethod
    def soft_update(
        target_net: torch.nn.Module, current_net: torch.nn.Module, tau: float
    ):
        for tar, cur in zip(target_net.parameters(), current_net.parameters()):
            tar.data.copy_(cur.data * tau + tar.data * (1.0 - tau))


class AgentPPO(AgentBase):
    def __init__(
        self,
        net_dims: list[int],
        state_dim: int,
        action_dim: int,
        gpu_id: int = 0,
        args: Config = Config(),
    ):
        self.if_off_policy = False
        self.act_class = getattr(self, "act_class", ActorPPO)
        self.cri_class = getattr(self, "cri_class", CriticPPO)
        AgentBase.__init__(self, net_dims, state_dim, action_dim, gpu_id, args)

        self.ratio_clip = getattr(
            args, "ratio_clip", 0.25
        )  # `ratio.clamp(1 - clip, 1 + clip)`
        self.lambda_gae_adv = getattr(
            args, "lambda_gae_adv", 0.95
        )  # could be 0.80~0.99
        self.lambda_entropy = getattr(
            args, "lambda_entropy", 0.01
        )  # could be 0.00~0.10
        self.lambda_entropy = torch.tensor(
            self.lambda_entropy, dtype=torch.float32, device=self.device
        )

    def explore_env(self, env, horizon_len: int) -> tuple[Tensor, ...]:
        states = torch.zeros((horizon_len, self.state_dim), dtype=torch.float32).to(
            self.device
        )
        actions = torch.zeros((horizon_len, self.action_dim), dtype=torch.float32).to(
            self.device
        )
        logprobs = torch.zeros(horizon_len, dtype=torch.float32).to(self.device)
        rewards = torch.zeros(horizon_len, dtype=torch.float32).to(self.device)
        dones = torch.zeros(horizon_len, dtype=torch.bool).to(self.device)

        ary_state = self.states[0]

        get_action = self.act.get_action
        convert = self.act.convert_action_for_env
        for i in range(horizon_len):
            state = torch.as_tensor(ary_state, dtype=torch.float32, device=self.device)
            action, logprob = (t.squeeze(0) for t in get_action(state.unsqueeze(0))[:2])

            ary_action = convert(action).detach().cpu().numpy()
            ary_state, reward, done, _ = env.step(ary_action)
            if done:
                obs = env.reset()
                if isinstance(obs, tuple):  # new gym API returns (obs, info)
                    obs = obs[0]
                ary_state = obs

            states[i] = state
            actions[i] = action
            logprobs[i] = logprob
            rewards[i] = reward
            dones[i] = done

        self.states[0] = ary_state
        rewards = (rewards * self.reward_scale).unsqueeze(1)
        undones = (1 - dones.type(torch.float32)).unsqueeze(1)
        return states, actions, logprobs, rewards, undones

    def update_net(self, buffer) -> tuple[float, float, float]:
        with torch.no_grad():
            states, actions, logprobs, rewards, undones = buffer
            buffer_size = states.shape[0]

            """get advantages and reward_sums"""
            bs = 2**10  # set a smaller 'batch_size' when out of GPU memory
            values = [self.cri(states[i : i + bs]) for i in range(0, buffer_size, bs)]
            values = torch.cat(values, dim=0).squeeze(1)  # values.shape == (buffer_size,)

            advantages = self.get_advantages(
                rewards, undones, values
            )  # advantages.shape == (buffer_size,)
            reward_sums = advantages + values  # reward_sums.shape == (buffer_size,)
            del rewards, undones, values

            advantages = (advantages - advantages.mean()) / (
                advantages.std(dim=0) + 1e-5
            )
        assert logprobs.shape == advantages.shape == reward_sums.shape == (buffer_size,)

        """update network"""
        obj_critics = 0.0
        obj_actors = 0.0

        update_times = int(buffer_size * self.repeat_times / self.batch_size)
        assert update_times >= 1
        for _ in range(update_times):
            indices = torch.randint(
                buffer_size, size=(self.batch_size,), requires_grad=False
            )
            state = states[indices]
            action = actions[indices]
            logprob = logprobs[indices]
            advantage = advantages[indices]
            reward_sum = reward_sums[indices]

            value = self.cri(state).squeeze(
                1
            )  # the critic network predicts the reward_sum (Q value) of the state
            obj_critic = self.criterion(value, reward_sum)
            self.optimizer_update(self.cri_optimizer, obj_critic)

            new_logprob, obj_entropy = self.act.get_logprob_entropy(state, action)
            ratio = (new_logprob - logprob.detach()).exp()
            surrogate1 = advantage * ratio
            surrogate2 = advantage * ratio.clamp(
                1 - self.ratio_clip, 1 + self.ratio_clip
            )
            obj_surrogate = torch.min(surrogate1, surrogate2).mean()

            obj_actor = obj_surrogate + obj_entropy.mean() * self.lambda_entropy
            self.optimizer_update(self.act_optimizer, -obj_actor)

            obj_critics += obj_critic.item()
            obj_actors += obj_actor.item()
        a_std_log = getattr(self.act, "action_std_log", torch.zeros(1)).mean()
        return obj_critics / update_times, obj_actors / update_times, a_std_log.item()

    def get_advantages(
        self, rewards: Tensor, undones: Tensor, values: Tensor
    ) -> Tensor:
        advantages = torch.empty_like(values)  # advantage value

        masks = undones * self.gamma
        horizon_len = rewards.shape[0]

        next_state = torch.tensor(self.states, dtype=torch.float32).to(self.device)
        next_value = self.cri(next_state).detach()[0, 0]

        advantage = 0  # last_gae_lambda
        for t in range(horizon_len - 1, -1, -1):
            delta = rewards[t] + masks[t] * next_value - values[t]
            advantages[t] = advantage = (
                delta + masks[t] * self.lambda_gae_adv * advantage
            )
            next_value = values[t]
        return advantages


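# For reference, `get_advantages` above implements Generalized Advantage
# Estimation (GAE). With value estimates V(s_t), the backward recursion is:
#
#     delta_t = r_t + gamma * (1 - done_t) * V(s_{t+1}) - V(s_t)
#     A_t     = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
#
# where gamma is `self.gamma` and lambda is `self.lambda_gae_adv`. The critic
# targets used in `update_net` are then reward_sums = A_t + V(s_t).

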
class PendulumEnv(gym.Wrapper):  # a demo of a custom gym env
    def __init__(self):
        gym.logger.set_level(40)  # block warnings
        gym_env_name = "Pendulum-v0" if gym.__version__ < "0.18.0" else "Pendulum-v1"
        super().__init__(env=gym.make(gym_env_name))

        """the necessary env information when you design a custom env"""
        self.env_name = gym_env_name  # the name of this env
        self.state_dim = self.observation_space.shape[0]  # feature number of the state
        self.action_dim = self.action_space.shape[0]  # feature number of the action
        self.if_discrete = False  # discrete or continuous action

    def reset(
        self,
        *,
        seed=None,
        options=None,
    ) -> np.ndarray:  # reset the agent in the env
        obs = self.env.reset()
        if isinstance(obs, tuple):  # new gym API returns (obs, info)
            obs = obs[0]
        return obs

    def step(
        self, action: np.ndarray
    ) -> tuple[np.ndarray, float, bool, dict]:  # the agent interacts with the env
        # We suggest adjusting the action space to (-1, +1) when designing a custom env.
        state, reward, done, info_dict = self.env.step(action * 2)
        return state.reshape(self.state_dim), float(reward), done, info_dict


def train_agent(args: Config):
    args.init_before_training()

    env = build_env(args.env_class, args.env_args)
    agent = args.agent_class(
        args.net_dims, args.state_dim, args.action_dim, gpu_id=args.gpu_id, args=args
    )
    obs = env.reset()
    if isinstance(obs, tuple):  # new gym API returns (obs, info)
        obs = obs[0]
    agent.states = obs[np.newaxis, :]

    evaluator = Evaluator(
        eval_env=build_env(args.env_class, args.env_args),
        eval_per_step=args.eval_per_step,
        eval_times=args.eval_times,
        cwd=args.cwd,
    )
    torch.set_grad_enabled(False)
    while True:  # start training
        buffer_items = agent.explore_env(env, args.horizon_len)

        torch.set_grad_enabled(True)
        logging_tuple = agent.update_net(buffer_items)
        torch.set_grad_enabled(False)

        evaluator.evaluate_and_save(agent.act, args.horizon_len, logging_tuple)
        if (evaluator.total_step > args.break_step) or os.path.exists(
            f"{args.cwd}/stop"
        ):
            torch.save(agent.act.state_dict(), args.cwd + "/actor.pth")
            break  # stop training when reaching `break_step` or after `mkdir cwd/stop`


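# A minimal end-to-end training sketch (comment only, not executed here),
# assuming the PendulumEnv demo above; the dimensions are the standard
# Pendulum values and `break_step` is an arbitrary illustrative budget.
#
# env_args = {
#     "env_name": "Pendulum-v1",
#     "state_dim": 3,
#     "action_dim": 1,
#     "if_discrete": False,
# }
# args = Config(agent_class=AgentPPO, env_class=PendulumEnv, env_args=env_args)
# args.break_step = int(2e5)
# train_agent(args)  # saves the actor to ./Pendulum-v1_PPO/actor.pth

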
def render_agent(
    env_class,
    env_args: dict,
    net_dims: list[int],
    agent_class,
    actor_path: str,
    render_times: int = 8,
):
    env = build_env(env_class, env_args)

    state_dim = env_args["state_dim"]
    action_dim = env_args["action_dim"]
    agent = agent_class(net_dims, state_dim, action_dim, gpu_id=-1)
    actor = agent.act

    print(f"| render and load actor from: {actor_path}")
    actor.load_state_dict(
        torch.load(actor_path, map_location=lambda storage, loc: storage)
    )
    for i in range(render_times):
        cumulative_reward, episode_step = get_rewards_and_steps(
            env, actor, if_render=True
        )
        print(
            f"|{i:4} cumulative_reward {cumulative_reward:9.3f} episode_step {episode_step:5.0f}"
        )


class Evaluator:
    def __init__(
        self, eval_env, eval_per_step: int = int(1e4), eval_times: int = 8, cwd: str = "."
    ):
        self.cwd = cwd
        self.env_eval = eval_env
        self.eval_step = 0
        self.total_step = 0
        self.start_time = time.time()
        self.eval_times = eval_times  # number of episodes used to estimate the cumulative return
        self.eval_per_step = eval_per_step  # evaluate the agent every `eval_per_step` training steps

        self.recorder = []
        print(
            f"\n| `step`: Number of samples, or total training steps, or running times of `env.step()`."
            f"\n| `time`: Time spent from the start of training to this moment."
            f"\n| `avgR`: Average value of cumulative rewards, which is the sum of rewards in an episode."
            f"\n| `stdR`: Standard deviation of cumulative rewards, which is the sum of rewards in an episode."
            f"\n| `avgS`: Average number of steps in an episode."
            f"\n| `objC`: Objective of the Critic network, i.e. the loss function of the critic network."
            f"\n| `objA`: Objective of the Actor network; it is the average Q value estimated by the critic network."
            f"\n| {'step':>8} {'time':>8} | {'avgR':>8} {'stdR':>6} {'avgS':>6} | {'objC':>8} {'objA':>8}"
        )

    def evaluate_and_save(self, actor, horizon_len: int, logging_tuple: tuple):
        self.total_step += horizon_len
        if self.eval_step + self.eval_per_step > self.total_step:
            return
        self.eval_step = self.total_step

        rewards_steps_ary = [
            get_rewards_and_steps(self.env_eval, actor) for _ in range(self.eval_times)
        ]
        rewards_steps_ary = np.array(rewards_steps_ary, dtype=np.float32)
        avg_r = rewards_steps_ary[:, 0].mean()  # average of cumulative rewards
        std_r = rewards_steps_ary[:, 0].std()  # std of cumulative rewards
        avg_s = rewards_steps_ary[:, 1].mean()  # average number of steps in an episode

        used_time = time.time() - self.start_time
        self.recorder.append((self.total_step, used_time, avg_r))

        print(
            f"| {self.total_step:8.2e} {used_time:8.0f} "
            f"| {avg_r:8.2f} {std_r:6.2f} {avg_s:6.0f} "
            f"| {logging_tuple[0]:8.2f} {logging_tuple[1]:8.2f}"
        )


def get_rewards_and_steps(
    env, actor, if_render: bool = False
) -> tuple[float, int]:  # cumulative_rewards and episode_steps
    device = next(actor.parameters()).device  # net.parameters() is a Python generator
    state = env.reset()
    if isinstance(state, tuple):  # new gym API returns (obs, info)
        state = state[0]
    episode_steps = 0
    cumulative_returns = 0.0  # sum of rewards in an episode
    for episode_steps in range(12345):
        tensor_state = torch.as_tensor(
            state, dtype=torch.float32, device=device
        ).unsqueeze(0)
        tensor_action = actor(tensor_state)
        action = (
            tensor_action.detach().cpu().numpy()[0]
        )  # detach() is not strictly needed because gradients are disabled outside
        state, reward, done, _ = env.step(action)
        cumulative_returns += reward

        if if_render:
            env.render()
        if done:
            break
    return cumulative_returns, episode_steps + 1


# -----------------------------------------------------------------------------------------------------------------------------------------
# DRL Agent Class

import torch

# from elegantrl.agents import AgentA2C

MODELS = {"ppo": AgentPPO}
OFF_POLICY_MODELS = ["ddpg", "td3", "sac"]
ON_POLICY_MODELS = ["ppo"]
# MODEL_KWARGS = {x: config.__dict__[f"{x.upper()}_PARAMS"] for x in MODELS.keys()}
#
# NOISE = {
#     "normal": NormalActionNoise,
#     "ornstein_uhlenbeck": OrnsteinUhlenbeckActionNoise,
# }


class DRLAgent:
    """Implementations of DRL algorithms

    Attributes
    ----------
    env: gym environment class
        user-defined class

    Methods
    -------
    get_model()
        set up a DRL algorithm
    train_model()
        train a DRL algorithm on a training dataset
        and output the trained model
    DRL_prediction()
        make a prediction on a test dataset and get results
    """

    def __init__(self, env, price_array, tech_array, turbulence_array):
        self.env = env
        self.price_array = price_array
        self.tech_array = tech_array
        self.turbulence_array = turbulence_array

    def get_model(self, model_name, model_kwargs):
        env_config = {
            "price_array": self.price_array,
            "tech_array": self.tech_array,
            "turbulence_array": self.turbulence_array,
            "if_train": True,
        }
        environment = self.env(config=env_config)
        env_args = {
            "config": env_config,
            "env_name": environment.env_name,
            "state_dim": environment.state_dim,
            "action_dim": environment.action_dim,
            "if_discrete": False,
        }
        if model_name not in MODELS:
            raise NotImplementedError(f"Model '{model_name}' is not supported.")
        agent = MODELS[model_name]
        model = Config(agent_class=agent, env_class=self.env, env_args=env_args)
        model.if_off_policy = model_name in OFF_POLICY_MODELS
        if model_kwargs is not None:
            try:
                model.learning_rate = model_kwargs["learning_rate"]
                model.batch_size = model_kwargs["batch_size"]
                model.gamma = model_kwargs["gamma"]
                model.seed = model_kwargs["seed"]
                model.net_dims = model_kwargs["net_dimension"]
                model.target_step = model_kwargs["target_step"]
                model.eval_gap = model_kwargs["eval_gap"]
                model.eval_times = model_kwargs["eval_times"]
            except BaseException:
                raise ValueError(
                    "Failed to read arguments, please check the 'model_kwargs' input."
                )
        return model

    def train_model(self, model, cwd, total_timesteps=5000):
        model.cwd = cwd
        model.break_step = total_timesteps
        train_agent(model)

    @staticmethod
    def DRL_prediction(model_name, cwd, net_dimension, environment):
        if model_name not in MODELS:
            raise NotImplementedError(f"Model '{model_name}' is not supported.")
        agent_class = MODELS[model_name]
        environment.env_num = 1
        agent = agent_class(
            net_dimension, environment.state_dim, environment.action_dim
        )
        actor = agent.act
        # load agent
        try:
            cwd = cwd + "/actor.pth"
            print(f"| load actor from: {cwd}")
            actor.load_state_dict(
                torch.load(cwd, map_location=lambda storage, loc: storage)
            )
            act = actor
            device = agent.device
        except BaseException:
            raise ValueError("Failed to load agent!")

        # test on the testing env
        _torch = torch
        state = environment.reset()
        episode_returns = []  # the cumulative_return / initial_account
        episode_total_assets = [environment.initial_total_asset]
        with _torch.no_grad():
            for i in range(environment.max_step):
                s_tensor = _torch.as_tensor((state,), device=device)
                a_tensor = act(s_tensor)  # action_tanh = act.forward()
                action = (
                    a_tensor.detach().cpu().numpy()[0]
                )  # detach() is not strictly needed because torch.no_grad() is active
                state, reward, done, _ = environment.step(action)

                total_asset = (
                    environment.amount
                    + (
                        environment.price_ary[environment.day] * environment.stocks
                    ).sum()
                )
                episode_total_assets.append(total_asset)
                episode_return = total_asset / environment.initial_total_asset
                episode_returns.append(episode_return)
                if done:
                    break
        print("Test Finished!")
        # return episode total_assets on testing data
        print("episode_return", episode_return)
        return episode_total_assets


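# A minimal usage sketch (comment only, not executed here). `StockTradingEnv`,
# the arrays, and the parameter values are placeholders for illustration; the
# erl_params keys mirror what get_model() reads above.
#
# agent = DRLAgent(
#     env=StockTradingEnv,
#     price_array=price_array,
#     tech_array=tech_array,
#     turbulence_array=turbulence_array,
# )
# erl_params = {
#     "learning_rate": 3e-6,
#     "batch_size": 2048,
#     "gamma": 0.985,
#     "seed": 312,
#     "net_dimension": (512, 64),
#     "target_step": 5000,
#     "eval_gap": 30,
#     "eval_times": 1,
# }
# model = agent.get_model("ppo", model_kwargs=erl_params)
# agent.train_model(model=model, cwd="./papertrading_erl", total_timesteps=1e5)

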
# -----------------------------------------------------------------------------------------------------------------------------------------
# Train & Test Functions

from finrl.meta.data_processor import DataProcessor

# construct environment


def train(
    start_date,
    end_date,
    ticker_list,
    data_source,
    time_interval,
    technical_indicator_list,
    drl_lib,
    env,
    model_name,
    if_vix=True,
    **kwargs,
):
    # download data
    dp = DataProcessor(data_source, **kwargs)
    data = dp.download_data(ticker_list, start_date, end_date, time_interval)
    data = dp.clean_data(data)
    data = dp.add_technical_indicator(data, technical_indicator_list)
    if if_vix:
        data = dp.add_vix(data)
    else:
        data = dp.add_turbulence(data)
    price_array, tech_array, turbulence_array = dp.df_to_array(data, if_vix)
    env_config = {
        "price_array": price_array,
        "tech_array": tech_array,
        "turbulence_array": turbulence_array,
        "if_train": True,
    }
    env_instance = env(config=env_config)

    # read parameters
    cwd = kwargs.get("cwd", "./" + str(model_name))

    if drl_lib == "elegantrl":
        DRLAgent_erl = DRLAgent
        break_step = kwargs.get("break_step", 1e6)
        erl_params = kwargs.get("erl_params")
        agent = DRLAgent_erl(
            env=env,
            price_array=price_array,
            tech_array=tech_array,
            turbulence_array=turbulence_array,
        )
        model = agent.get_model(model_name, model_kwargs=erl_params)
        trained_model = agent.train_model(
            model=model, cwd=cwd, total_timesteps=break_step
        )


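# A minimal usage sketch for train() (comment only, not executed here). The
# tickers, dates, indicator list, API credentials, and environment class are
# placeholders chosen only for illustration.
#
# train(
#     start_date="2022-08-25",
#     end_date="2022-08-31",
#     ticker_list=["AAPL", "MSFT"],
#     data_source="alpaca",
#     time_interval="1Min",
#     technical_indicator_list=["macd", "rsi_30"],
#     drl_lib="elegantrl",
#     env=StockTradingEnv,  # placeholder environment class
#     model_name="ppo",
#     if_vix=True,
#     API_KEY="...",
#     API_SECRET="...",
#     API_BASE_URL="...",
#     erl_params=erl_params,
#     cwd="./papertrading_erl",
#     break_step=1e5,
# )

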
# -----------------------------------------------------------------------------------------------------------------------------------------


def test(
    start_date,
    end_date,
    ticker_list,
    data_source,
    time_interval,
    technical_indicator_list,
    drl_lib,
    env,
    model_name,
    if_vix=True,
    **kwargs,
):
    # import data processor
    from finrl.meta.data_processor import DataProcessor

    # fetch data
    dp = DataProcessor(data_source, **kwargs)
    data = dp.download_data(ticker_list, start_date, end_date, time_interval)
    data = dp.clean_data(data)
    data = dp.add_technical_indicator(data, technical_indicator_list)

    if if_vix:
        data = dp.add_vix(data)
    else:
        data = dp.add_turbulence(data)
    price_array, tech_array, turbulence_array = dp.df_to_array(data, if_vix)

    env_config = {
        "price_array": price_array,
        "tech_array": tech_array,
        "turbulence_array": turbulence_array,
        "if_train": False,
    }
    env_instance = env(config=env_config)

    # loading an elegantrl agent needs the state dim, action dim, and net dim
    net_dimension = kwargs.get("net_dimension", 2**7)
    cwd = kwargs.get("cwd", "./" + str(model_name))
    print("price_array: ", len(price_array))

    if drl_lib == "elegantrl":
        DRLAgent_erl = DRLAgent
        episode_total_assets = DRLAgent_erl.DRL_prediction(
            model_name=model_name,
            cwd=cwd,
            net_dimension=net_dimension,
            environment=env_instance,
        )
        return episode_total_assets


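# A minimal usage sketch for test() (comment only, not executed here); the
# arguments mirror the train() example above (placeholders), plus the
# net_dimension used during training.
#
# account_value_erl = test(
#     start_date="2022-09-01",
#     end_date="2022-09-02",
#     ticker_list=["AAPL", "MSFT"],
#     data_source="alpaca",
#     time_interval="1Min",
#     technical_indicator_list=["macd", "rsi_30"],
#     drl_lib="elegantrl",
#     env=StockTradingEnv,  # placeholder environment class
#     model_name="ppo",
#     cwd="./papertrading_erl",
#     net_dimension=(512, 64),
# )

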
# -----------------------------------------------------------------------------------------------------------------------------------------

import alpaca_trade_api as tradeapi
import pandas_market_calendars as tc
import numpy as np
import pandas as pd
import yfinance as yf


def get_trading_days(start, end):
    nyse = tc.get_calendar("NYSE")
    df = nyse.date_range_htf("1D", pd.Timestamp(start), pd.Timestamp(end))
    # df = nyse.sessions_in_range(
    #     pd.Timestamp(start, tz=pytz.UTC), pd.Timestamp(end, tz=pytz.UTC)
    # )
    trading_days = []
    for day in df:
        trading_days.append(str(day)[:10])
    return trading_days


def alpaca_history(key, secret, url, start, end):
    api = tradeapi.REST(key, secret, url, "v2")
    trading_days = get_trading_days(start, end)
    df = pd.DataFrame()
    for day in trading_days:
        df = pd.concat(  # DataFrame.append was removed in pandas 2.0; use pd.concat
            [
                df,
                api.get_portfolio_history(date_start=day, timeframe="5Min").df.iloc[
                    :78
                ],  # keep the 78 five-minute bars of a 6.5-hour trading day
            ]
        )
    equities = df.equity.values
    cumu_returns = equities / equities[0]
    cumu_returns = cumu_returns[~np.isnan(cumu_returns)]

    return df, cumu_returns


def DIA_history(start):
    data_df = yf.download(["^DJI"], start=start, interval="5m")
    data_df = data_df.iloc[:]
    baseline_returns = data_df["Adj Close"].values / data_df["Adj Close"].values[0]
    return data_df, baseline_returns


# -----------------------------------------------------------------------------------------------------------------------------------------
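# A minimal usage sketch (comment only, not executed here); the key, secret,
# URL, and dates are placeholders.
#
# df_erl, cumu_erl = alpaca_history(
#     key="...",
#     secret="...",
#     url="https://paper-api.alpaca.markets",
#     start="2022-09-01",
#     end="2022-09-12",
# )
# df_djia, cumu_djia = DIA_history(start="2022-09-01")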