CoCalc -- hyperparams

GitHub Repository: AI4Finance-Foundation/FinRL
Path: blob/master/finrl/agents/stablebaselines3/hyperparams_opt.py
⁷³² views
1
from __future__ import annotations
2

3
from typing import Any
4
from typing import Dict
5

6
import numpy as np
7
import optuna
8
from stable_baselines3.common.noise import NormalActionNoise
9
from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise
10
from torch import nn as nn
11
from utils import linear_schedule
12

13

14
def sample_ppo_params(trial: optuna.Trial) -> dict[str, Any]:
15
    """
16
    Sampler for PPO hyperparams.
17

18
    :param trial:
19
    :return:
20
    """
21
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32, 64, 128, 256, 512])
22
    n_steps = trial.suggest_categorical(
23
        "n_steps", [8, 16, 32, 64, 128, 256, 512, 1024, 2048]
24
    )
25
    gamma = trial.suggest_categorical(
26
        "gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999]
27
    )
28
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
29
    lr_schedule = "constant"
30
    # Uncomment to enable learning rate schedule
31
    # lr_schedule = trial.suggest_categorical('lr_schedule', ['linear', 'constant'])
32
    ent_coef = trial.suggest_loguniform("ent_coef", 0.00000001, 0.1)
33
    clip_range = trial.suggest_categorical("clip_range", [0.1, 0.2, 0.3, 0.4])
34
    n_epochs = trial.suggest_categorical("n_epochs", [1, 5, 10, 20])
35
    gae_lambda = trial.suggest_categorical(
36
        "gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0]
37
    )
38
    max_grad_norm = trial.suggest_categorical(
39
        "max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5]
40
    )
41
    vf_coef = trial.suggest_uniform("vf_coef", 0, 1)
42
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium"])
43
    # Uncomment for gSDE (continuous actions)
44
    # log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
45
    # Uncomment for gSDE (continuous action)
46
    # sde_sample_freq = trial.suggest_categorical("sde_sample_freq", [-1, 8, 16, 32, 64, 128, 256])
47
    # Orthogonal initialization
48
    ortho_init = False
49
    # ortho_init = trial.suggest_categorical('ortho_init', [False, True])
50
    # activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
51
    activation_fn = trial.suggest_categorical("activation_fn", ["tanh", "relu"])
52

53
    # TODO: account when using multiple envs
54
    if batch_size > n_steps:
55
        batch_size = n_steps
56

57
    if lr_schedule == "linear":
58
        learning_rate = linear_schedule(learning_rate)
59

60
    # Independent networks usually work best
61
    # when not working with images
62
    net_arch = {
63
        "small": [dict(pi=[64, 64], vf=[64, 64])],
64
        "medium": [dict(pi=[256, 256], vf=[256, 256])],
65
    }[net_arch]
66

67
    activation_fn = {
68
        "tanh": nn.Tanh,
69
        "relu": nn.ReLU,
70
        "elu": nn.ELU,
71
        "leaky_relu": nn.LeakyReLU,
72
    }[activation_fn]
73

74
    return {
75
        "n_steps": n_steps,
76
        "batch_size": batch_size,
77
        "gamma": gamma,
78
        "learning_rate": learning_rate,
79
        "ent_coef": ent_coef,
80
        "clip_range": clip_range,
81
        "n_epochs": n_epochs,
82
        "gae_lambda": gae_lambda,
83
        "max_grad_norm": max_grad_norm,
84
        "vf_coef": vf_coef,
85
        # "sde_sample_freq": sde_sample_freq,
86
        "policy_kwargs": dict(
87
            # log_std_init=log_std_init,
88
            net_arch=net_arch,
89
            activation_fn=activation_fn,
90
            ortho_init=ortho_init,
91
        ),
92
    }
93

94

95
def sample_trpo_params(trial: optuna.Trial) -> dict[str, Any]:
96
    """
97
    Sampler for TRPO hyperparams.
98

99
    :param trial:
100
    :return:
101
    """
102
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32, 64, 128, 256, 512])
103
    n_steps = trial.suggest_categorical(
104
        "n_steps", [8, 16, 32, 64, 128, 256, 512, 1024, 2048]
105
    )
106
    gamma = trial.suggest_categorical(
107
        "gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999]
108
    )
109
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
110
    lr_schedule = "constant"
111
    # Uncomment to enable learning rate schedule
112
    # lr_schedule = trial.suggest_categorical('lr_schedule', ['linear', 'constant'])
113
    # line_search_shrinking_factor = trial.suggest_categorical("line_search_shrinking_factor", [0.6, 0.7, 0.8, 0.9])
114
    n_critic_updates = trial.suggest_categorical(
115
        "n_critic_updates", [5, 10, 20, 25, 30]
116
    )
117
    cg_max_steps = trial.suggest_categorical("cg_max_steps", [5, 10, 20, 25, 30])
118
    # cg_damping = trial.suggest_categorical("cg_damping", [0.5, 0.2, 0.1, 0.05, 0.01])
119
    target_kl = trial.suggest_categorical(
120
        "target_kl", [0.1, 0.05, 0.03, 0.02, 0.01, 0.005, 0.001]
121
    )
122
    gae_lambda = trial.suggest_categorical(
123
        "gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0]
124
    )
125
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium"])
126
    # Uncomment for gSDE (continuous actions)
127
    # log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
128
    # Uncomment for gSDE (continuous action)
129
    # sde_sample_freq = trial.suggest_categorical("sde_sample_freq", [-1, 8, 16, 32, 64, 128, 256])
130
    # Orthogonal initialization
131
    ortho_init = False
132
    # ortho_init = trial.suggest_categorical('ortho_init', [False, True])
133
    # activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
134
    activation_fn = trial.suggest_categorical("activation_fn", ["tanh", "relu"])
135

136
    # TODO: account when using multiple envs
137
    if batch_size > n_steps:
138
        batch_size = n_steps
139

140
    if lr_schedule == "linear":
141
        learning_rate = linear_schedule(learning_rate)
142

143
    # Independent networks usually work best
144
    # when not working with images
145
    net_arch = {
146
        "small": [dict(pi=[64, 64], vf=[64, 64])],
147
        "medium": [dict(pi=[256, 256], vf=[256, 256])],
148
    }[net_arch]
149

150
    activation_fn = {
151
        "tanh": nn.Tanh,
152
        "relu": nn.ReLU,
153
        "elu": nn.ELU,
154
        "leaky_relu": nn.LeakyReLU,
155
    }[activation_fn]
156

157
    return {
158
        "n_steps": n_steps,
159
        "batch_size": batch_size,
160
        "gamma": gamma,
161
        # "cg_damping": cg_damping,
162
        "cg_max_steps": cg_max_steps,
163
        # "line_search_shrinking_factor": line_search_shrinking_factor,
164
        "n_critic_updates": n_critic_updates,
165
        "target_kl": target_kl,
166
        "learning_rate": learning_rate,
167
        "gae_lambda": gae_lambda,
168
        # "sde_sample_freq": sde_sample_freq,
169
        "policy_kwargs": dict(
170
            # log_std_init=log_std_init,
171
            net_arch=net_arch,
172
            activation_fn=activation_fn,
173
            ortho_init=ortho_init,
174
        ),
175
    }
176

177

178
def sample_a2c_params(trial: optuna.Trial) -> dict[str, Any]:
179
    """
180
    Sampler for A2C hyperparams.
181

182
    :param trial:
183
    :return:
184
    """
185
    gamma = trial.suggest_categorical(
186
        "gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999]
187
    )
188
    normalize_advantage = trial.suggest_categorical(
189
        "normalize_advantage", [False, True]
190
    )
191
    max_grad_norm = trial.suggest_categorical(
192
        "max_grad_norm", [0.3, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5]
193
    )
194
    # Toggle PyTorch RMS Prop (different from TF one, cf doc)
195
    use_rms_prop = trial.suggest_categorical("use_rms_prop", [False, True])
196
    gae_lambda = trial.suggest_categorical(
197
        "gae_lambda", [0.8, 0.9, 0.92, 0.95, 0.98, 0.99, 1.0]
198
    )
199
    n_steps = trial.suggest_categorical(
200
        "n_steps", [8, 16, 32, 64, 128, 256, 512, 1024, 2048]
201
    )
202
    lr_schedule = trial.suggest_categorical("lr_schedule", ["linear", "constant"])
203
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
204
    ent_coef = trial.suggest_loguniform("ent_coef", 0.00000001, 0.1)
205
    vf_coef = trial.suggest_uniform("vf_coef", 0, 1)
206
    # Uncomment for gSDE (continuous actions)
207
    # log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
208
    ortho_init = trial.suggest_categorical("ortho_init", [False, True])
209
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium"])
210
    # sde_net_arch = trial.suggest_categorical("sde_net_arch", [None, "tiny", "small"])
211
    # full_std = trial.suggest_categorical("full_std", [False, True])
212
    # activation_fn = trial.suggest_categorical('activation_fn', ['tanh', 'relu', 'elu', 'leaky_relu'])
213
    activation_fn = trial.suggest_categorical("activation_fn", ["tanh", "relu"])
214

215
    if lr_schedule == "linear":
216
        learning_rate = linear_schedule(learning_rate)
217

218
    net_arch = {
219
        "small": [dict(pi=[64, 64], vf=[64, 64])],
220
        "medium": [dict(pi=[256, 256], vf=[256, 256])],
221
    }[net_arch]
222

223
    # sde_net_arch = {
224
    #     None: None,
225
    #     "tiny": [64],
226
    #     "small": [64, 64],
227
    # }[sde_net_arch]
228

229
    activation_fn = {
230
        "tanh": nn.Tanh,
231
        "relu": nn.ReLU,
232
        "elu": nn.ELU,
233
        "leaky_relu": nn.LeakyReLU,
234
    }[activation_fn]
235

236
    return {
237
        "n_steps": n_steps,
238
        "gamma": gamma,
239
        "gae_lambda": gae_lambda,
240
        "learning_rate": learning_rate,
241
        "ent_coef": ent_coef,
242
        "normalize_advantage": normalize_advantage,
243
        "max_grad_norm": max_grad_norm,
244
        "use_rms_prop": use_rms_prop,
245
        "vf_coef": vf_coef,
246
        "policy_kwargs": dict(
247
            # log_std_init=log_std_init,
248
            net_arch=net_arch,
249
            # full_std=full_std,
250
            activation_fn=activation_fn,
251
            # sde_net_arch=sde_net_arch,
252
            ortho_init=ortho_init,
253
        ),
254
    }
255

256

257
def sample_sac_params(trial: optuna.Trial) -> dict[str, Any]:
258
    """
259
    Sampler for SAC hyperparams.
260

261
    :param trial:
262
    :return:
263
    """
264
    gamma = trial.suggest_categorical(
265
        "gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999]
266
    )
267
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
268
    batch_size = trial.suggest_categorical(
269
        "batch_size", [16, 32, 64, 128, 256, 512, 1024, 2048]
270
    )
271
    buffer_size = trial.suggest_categorical(
272
        "buffer_size", [int(1e4), int(1e5), int(1e6)]
273
    )
274
    learning_starts = trial.suggest_categorical(
275
        "learning_starts", [0, 1000, 10000, 20000]
276
    )
277
    # train_freq = trial.suggest_categorical('train_freq', [1, 10, 100, 300])
278
    train_freq = trial.suggest_categorical(
279
        "train_freq", [1, 4, 8, 16, 32, 64, 128, 256, 512]
280
    )
281
    # Polyak coeff
282
    tau = trial.suggest_categorical("tau", [0.001, 0.005, 0.01, 0.02, 0.05, 0.08])
283
    # gradient_steps takes too much time
284
    # gradient_steps = trial.suggest_categorical('gradient_steps', [1, 100, 300])
285
    gradient_steps = train_freq
286
    # ent_coef = trial.suggest_categorical('ent_coef', ['auto', 0.5, 0.1, 0.05, 0.01, 0.0001])
287
    ent_coef = "auto"
288
    # You can comment that out when not using gSDE
289
    log_std_init = trial.suggest_uniform("log_std_init", -4, 1)
290
    # NOTE: Add "verybig" to net_arch when tuning HER
291
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big"])
292
    # activation_fn = trial.suggest_categorical('activation_fn', [nn.Tanh, nn.ReLU, nn.ELU, nn.LeakyReLU])
293

294
    net_arch = {
295
        "small": [64, 64],
296
        "medium": [256, 256],
297
        "big": [400, 300],
298
        # Uncomment for tuning HER
299
        # "large": [256, 256, 256],
300
        # "verybig": [512, 512, 512],
301
    }[net_arch]
302

303
    target_entropy = "auto"
304
    # if ent_coef == 'auto':
305
    #     # target_entropy = trial.suggest_categorical('target_entropy', ['auto', 5, 1, 0, -1, -5, -10, -20, -50])
306
    #     target_entropy = trial.suggest_uniform('target_entropy', -10, 10)
307

308
    hyperparams = {
309
        "gamma": gamma,
310
        "learning_rate": learning_rate,
311
        "batch_size": batch_size,
312
        "buffer_size": buffer_size,
313
        "learning_starts": learning_starts,
314
        "train_freq": train_freq,
315
        "gradient_steps": gradient_steps,
316
        "ent_coef": ent_coef,
317
        "tau": tau,
318
        "target_entropy": target_entropy,
319
        "policy_kwargs": dict(log_std_init=log_std_init, net_arch=net_arch),
320
    }
321

322
    if trial.using_her_replay_buffer:
323
        hyperparams = sample_her_params(trial, hyperparams)
324

325
    return hyperparams
326

327

328
def sample_td3_params(trial: optuna.Trial) -> dict[str, Any]:
329
    """
330
    Sampler for TD3 hyperparams.
331

332
    :param trial:
333
    :return:
334
    """
335
    gamma = trial.suggest_categorical(
336
        "gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999]
337
    )
338
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
339
    batch_size = trial.suggest_categorical(
340
        "batch_size", [16, 32, 64, 100, 128, 256, 512, 1024, 2048]
341
    )
342
    buffer_size = trial.suggest_categorical(
343
        "buffer_size", [int(1e4), int(1e5), int(1e6)]
344
    )
345
    # Polyak coeff
346
    tau = trial.suggest_categorical("tau", [0.001, 0.005, 0.01, 0.02, 0.05, 0.08])
347

348
    train_freq = trial.suggest_categorical(
349
        "train_freq", [1, 4, 8, 16, 32, 64, 128, 256, 512]
350
    )
351
    gradient_steps = train_freq
352

353
    noise_type = trial.suggest_categorical(
354
        "noise_type", ["ornstein-uhlenbeck", "normal", None]
355
    )
356
    noise_std = trial.suggest_uniform("noise_std", 0, 1)
357

358
    # NOTE: Add "verybig" to net_arch when tuning HER
359
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big"])
360
    # activation_fn = trial.suggest_categorical('activation_fn', [nn.Tanh, nn.ReLU, nn.ELU, nn.LeakyReLU])
361

362
    net_arch = {
363
        "small": [64, 64],
364
        "medium": [256, 256],
365
        "big": [400, 300],
366
        # Uncomment for tuning HER
367
        # "verybig": [256, 256, 256],
368
    }[net_arch]
369

370
    hyperparams = {
371
        "gamma": gamma,
372
        "learning_rate": learning_rate,
373
        "batch_size": batch_size,
374
        "buffer_size": buffer_size,
375
        "train_freq": train_freq,
376
        "gradient_steps": gradient_steps,
377
        "policy_kwargs": dict(net_arch=net_arch),
378
        "tau": tau,
379
    }
380

381
    if noise_type == "normal":
382
        hyperparams["action_noise"] = NormalActionNoise(
383
            mean=np.zeros(trial.n_actions), sigma=noise_std * np.ones(trial.n_actions)
384
        )
385
    elif noise_type == "ornstein-uhlenbeck":
386
        hyperparams["action_noise"] = OrnsteinUhlenbeckActionNoise(
387
            mean=np.zeros(trial.n_actions), sigma=noise_std * np.ones(trial.n_actions)
388
        )
389

390
    if trial.using_her_replay_buffer:
391
        hyperparams = sample_her_params(trial, hyperparams)
392

393
    return hyperparams
394

395

396
def sample_ddpg_params(trial: optuna.Trial) -> dict[str, Any]:
397
    """
398
    Sampler for DDPG hyperparams.
399

400
    :param trial:
401
    :return:
402
    """
403
    gamma = trial.suggest_categorical(
404
        "gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999]
405
    )
406
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
407
    batch_size = trial.suggest_categorical(
408
        "batch_size", [16, 32, 64, 100, 128, 256, 512, 1024, 2048]
409
    )
410
    buffer_size = trial.suggest_categorical(
411
        "buffer_size", [int(1e4), int(1e5), int(1e6)]
412
    )
413
    # Polyak coeff
414
    tau = trial.suggest_categorical("tau", [0.001, 0.005, 0.01, 0.02, 0.05, 0.08])
415

416
    train_freq = trial.suggest_categorical(
417
        "train_freq", [1, 4, 8, 16, 32, 64, 128, 256, 512]
418
    )
419
    gradient_steps = train_freq
420

421
    noise_type = trial.suggest_categorical(
422
        "noise_type", ["ornstein-uhlenbeck", "normal", None]
423
    )
424
    noise_std = trial.suggest_uniform("noise_std", 0, 1)
425

426
    # NOTE: Add "verybig" to net_arch when tuning HER (see TD3)
427
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big"])
428
    # activation_fn = trial.suggest_categorical('activation_fn', [nn.Tanh, nn.ReLU, nn.ELU, nn.LeakyReLU])
429

430
    net_arch = {"small": [64, 64], "medium": [256, 256], "big": [400, 300]}[net_arch]
431

432
    hyperparams = {
433
        "gamma": gamma,
434
        "tau": tau,
435
        "learning_rate": learning_rate,
436
        "batch_size": batch_size,
437
        "buffer_size": buffer_size,
438
        "train_freq": train_freq,
439
        "gradient_steps": gradient_steps,
440
        "policy_kwargs": dict(net_arch=net_arch),
441
    }
442

443
    if noise_type == "normal":
444
        hyperparams["action_noise"] = NormalActionNoise(
445
            mean=np.zeros(trial.n_actions), sigma=noise_std * np.ones(trial.n_actions)
446
        )
447
    elif noise_type == "ornstein-uhlenbeck":
448
        hyperparams["action_noise"] = OrnsteinUhlenbeckActionNoise(
449
            mean=np.zeros(trial.n_actions), sigma=noise_std * np.ones(trial.n_actions)
450
        )
451

452
    if trial.using_her_replay_buffer:
453
        hyperparams = sample_her_params(trial, hyperparams)
454

455
    return hyperparams
456

457

458
def sample_dqn_params(trial: optuna.Trial) -> dict[str, Any]:
459
    """
460
    Sampler for DQN hyperparams.
461

462
    :param trial:
463
    :return:
464
    """
465
    gamma = trial.suggest_categorical(
466
        "gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999]
467
    )
468
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
469
    batch_size = trial.suggest_categorical(
470
        "batch_size", [16, 32, 64, 100, 128, 256, 512]
471
    )
472
    buffer_size = trial.suggest_categorical(
473
        "buffer_size", [int(1e4), int(5e4), int(1e5), int(1e6)]
474
    )
475
    exploration_final_eps = trial.suggest_uniform("exploration_final_eps", 0, 0.2)
476
    exploration_fraction = trial.suggest_uniform("exploration_fraction", 0, 0.5)
477
    target_update_interval = trial.suggest_categorical(
478
        "target_update_interval", [1, 1000, 5000, 10000, 15000, 20000]
479
    )
480
    learning_starts = trial.suggest_categorical(
481
        "learning_starts", [0, 1000, 5000, 10000, 20000]
482
    )
483

484
    train_freq = trial.suggest_categorical("train_freq", [1, 4, 8, 16, 128, 256, 1000])
485
    subsample_steps = trial.suggest_categorical("subsample_steps", [1, 2, 4, 8])
486
    gradient_steps = max(train_freq // subsample_steps, 1)
487

488
    net_arch = trial.suggest_categorical("net_arch", ["tiny", "small", "medium"])
489

490
    net_arch = {"tiny": [64], "small": [64, 64], "medium": [256, 256]}[net_arch]
491

492
    hyperparams = {
493
        "gamma": gamma,
494
        "learning_rate": learning_rate,
495
        "batch_size": batch_size,
496
        "buffer_size": buffer_size,
497
        "train_freq": train_freq,
498
        "gradient_steps": gradient_steps,
499
        "exploration_fraction": exploration_fraction,
500
        "exploration_final_eps": exploration_final_eps,
501
        "target_update_interval": target_update_interval,
502
        "learning_starts": learning_starts,
503
        "policy_kwargs": dict(net_arch=net_arch),
504
    }
505

506
    if trial.using_her_replay_buffer:
507
        hyperparams = sample_her_params(trial, hyperparams)
508

509
    return hyperparams
510

511

512
def sample_her_params(
513
    trial: optuna.Trial, hyperparams: dict[str, Any]
514
) -> dict[str, Any]:
515
    """
516
    Sampler for HerReplayBuffer hyperparams.
517

518
    :param trial:
519
    :parma hyperparams:
520
    :return:
521
    """
522
    her_kwargs = trial.her_kwargs.copy()
523
    her_kwargs["n_sampled_goal"] = trial.suggest_int("n_sampled_goal", 1, 5)
524
    her_kwargs["goal_selection_strategy"] = trial.suggest_categorical(
525
        "goal_selection_strategy", ["final", "episode", "future"]
526
    )
527
    her_kwargs["online_sampling"] = trial.suggest_categorical(
528
        "online_sampling", [True, False]
529
    )
530
    hyperparams["replay_buffer_kwargs"] = her_kwargs
531
    return hyperparams
532

533

534
def sample_tqc_params(trial: optuna.Trial) -> dict[str, Any]:
535
    """
536
    Sampler for TQC hyperparams.
537

538
    :param trial:
539
    :return:
540
    """
541
    # TQC is SAC + Distributional RL
542
    hyperparams = sample_sac_params(trial)
543

544
    n_quantiles = trial.suggest_int("n_quantiles", 5, 50)
545
    top_quantiles_to_drop_per_net = trial.suggest_int(
546
        "top_quantiles_to_drop_per_net", 0, n_quantiles - 1
547
    )
548

549
    hyperparams["policy_kwargs"].update({"n_quantiles": n_quantiles})
550
    hyperparams["top_quantiles_to_drop_per_net"] = top_quantiles_to_drop_per_net
551

552
    return hyperparams
553

554

555
def sample_qrdqn_params(trial: optuna.Trial) -> dict[str, Any]:
556
    """
557
    Sampler for QR-DQN hyperparams.
558

559
    :param trial:
560
    :return:
561
    """
562
    # TQC is DQN + Distributional RL
563
    hyperparams = sample_dqn_params(trial)
564

565
    n_quantiles = trial.suggest_int("n_quantiles", 5, 200)
566
    hyperparams["policy_kwargs"].update({"n_quantiles": n_quantiles})
567

568
    return hyperparams
569

570

571
def sample_ars_params(trial: optuna.Trial) -> dict[str, Any]:
572
    """
573
    Sampler for ARS hyperparams.
574
    :param trial:
575
    :return:
576
    """
577
    # n_eval_episodes = trial.suggest_categorical("n_eval_episodes", [1, 2])
578
    n_delta = trial.suggest_categorical("n_delta", [4, 8, 6, 32, 64])
579
    # learning_rate = trial.suggest_categorical("learning_rate", [0.01, 0.02, 0.025, 0.03])
580
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
581
    delta_std = trial.suggest_categorical(
582
        "delta_std", [0.01, 0.02, 0.025, 0.03, 0.05, 0.1, 0.2, 0.3]
583
    )
584
    top_frac_size = trial.suggest_categorical(
585
        "top_frac_size", [0.1, 0.2, 0.3, 0.5, 0.8, 0.9, 1.0]
586
    )
587
    zero_policy = trial.suggest_categorical("zero_policy", [True, False])
588
    n_top = max(int(top_frac_size * n_delta), 1)
589

590
    # net_arch = trial.suggest_categorical("net_arch", ["linear", "tiny", "small"])
591

592
    # Note: remove bias to be as the original linear policy
593
    # and do not squash output
594
    # Comment out when doing hyperparams search with linear policy only
595
    # net_arch = {
596
    #     "linear": [],
597
    #     "tiny": [16],
598
    #     "small": [32],
599
    # }[net_arch]
600

601
    # TODO: optimize the alive_bonus_offset too
602

603
    return {
604
        # "n_eval_episodes": n_eval_episodes,
605
        "n_delta": n_delta,
606
        "learning_rate": learning_rate,
607
        "delta_std": delta_std,
608
        "n_top": n_top,
609
        "zero_policy": zero_policy,
610
        # "policy_kwargs": dict(net_arch=net_arch),
611
    }
612

613

614
HYPERPARAMS_SAMPLER = {
615
    "a2c": sample_a2c_params,
616
    "ars": sample_ars_params,
617
    "ddpg": sample_ddpg_params,
618
    "dqn": sample_dqn_params,
619
    "qrdqn": sample_qrdqn_params,
620
    "sac": sample_sac_params,
621
    "tqc": sample_tqc_params,
622
    "ppo": sample_ppo_params,
623
    "td3": sample_td3_params,
624
    "trpo": sample_trpo_params,
625
}
626

627
Product

Resources

Company