Path: blob/main/examples/rl/run_diffuser_locomotion.py
import d4rl  # noqa
import gym
import tqdm

from diffusers.experimental import ValueGuidedRLPipeline


# Sampling settings for the value-guided planner. Note that this example calls
# the pipeline with its defaults below and only passes planning_horizon explicitly.
config = dict(
    n_samples=64,
    horizon=32,
    num_inference_steps=20,
    n_guide_steps=2,  # can be set to 0 for faster sampling; the value network is then unused
    scale_grad_by_std=True,
    scale=0.1,
    eta=0.0,
    t_grad_cutoff=2,
    device="cpu",
)


if __name__ == "__main__":
    env_name = "hopper-medium-v2"
    env = gym.make(env_name)

    # load a diffusion planner and value function pretrained on hopper-medium-v2
    # with a planning horizon of 32
    pipeline = ValueGuidedRLPipeline.from_pretrained(
        "bglick13/hopper-medium-v2-value-function-hor32",
        env=env,
    )

    env.seed(0)
    obs = env.reset()
    total_reward = 0
    total_score = 0
    T = 1000
    rollout = [obs.copy()]
    try:
        for t in tqdm.tqdm(range(T)):
            # call the policy: plan with the diffusion model and return the
            # denormalized action to execute
            denorm_actions = pipeline(obs, planning_horizon=32)

            # execute action in environment
            next_observation, reward, terminal, _ = env.step(denorm_actions)

            # update return first, then compute the D4RL normalized score from
            # the accumulated return
            total_reward += reward
            score = env.get_normalized_score(total_reward)
            total_score += score
            print(
                f"Step: {t}, Reward: {reward}, Total Reward: {total_reward}, Score: {score}, Total Score:"
                f" {total_score}"
            )

            # save observations for rendering
            rollout.append(next_observation.copy())

            obs = next_observation

            # stop rolling out once the episode terminates
            if terminal:
                break
    except KeyboardInterrupt:
        pass

    print(f"Total reward: {total_reward}")
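    # --- Editorial sketch, not part of the upstream example -----------------
    # `rollout` is collected "for rendering" above but never written anywhere.
    # A minimal way to persist it for offline inspection, assuming numpy is
    # available (a dependency of gym/d4rl, though not imported by this script);
    # the output filename is an arbitrary choice:
    import numpy as np

    np.save(f"{env_name}-rollout.npy", np.stack(rollout))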