GitHub Repository: shivamshrirao/diffusers
Path: blob/main/examples/rl/run_diffuser_locomotion.py
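"""Plan locomotion actions with diffusers' experimental ValueGuidedRLPipeline.

Loads a pretrained value-guided diffusion planner for the D4RL hopper-medium-v2
task, plans one action per environment step, executes it, and tracks the total
reward and the D4RL normalized score.
"""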
# Importing d4rl registers its offline-RL environments with gym.
import d4rl  # noqa
import gym
import tqdm
from diffusers.experimental import ValueGuidedRLPipeline
# Sampling and guidance hyperparameters for the value-guided planner; kept here
# for reference, since the pipeline call below only passes planning_horizon.
config = dict(
    n_samples=64,
    horizon=32,
    num_inference_steps=20,
    n_guide_steps=2,  # can set to 0 for faster sampling, does not use value network
    scale_grad_by_std=True,
    scale=0.1,
    eta=0.0,
    t_grad_cutoff=2,
    device="cpu",
)
if __name__ == "__main__":
    env_name = "hopper-medium-v2"
    env = gym.make(env_name)
    # Load the pretrained value-guided diffusion planner for hopper-medium-v2.
    pipeline = ValueGuidedRLPipeline.from_pretrained(
        "bglick13/hopper-medium-v2-value-function-hor32",
        env=env,
    )
    env.seed(0)
    obs = env.reset()
    total_reward = 0
    total_score = 0
    T = 1000
    rollout = [obs.copy()]
    try:
        for t in tqdm.tqdm(range(T)):
            # call the policy
            denorm_actions = pipeline(obs, planning_horizon=32)

            # execute action in environment
            next_observation, reward, terminal, _ = env.step(denorm_actions)
            score = env.get_normalized_score(total_reward)

            # update return
            total_reward += reward
            total_score += score
            print(
                f"Step: {t}, Reward: {reward}, Total Reward: {total_reward}, Score: {score}, Total Score:"
                f" {total_score}"
            )

            # save observations for rendering
            rollout.append(next_observation.copy())

            obs = next_observation
    except KeyboardInterrupt:
        pass

    print(f"Total reward: {total_reward}")
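    # NOTE: `rollout` holds the observation trajectory collected above; it is not
    # rendered here and can be saved or visualized separately if desired.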