From 1c79a1c4d4e40477dbcafd6d987d0840f75bbd2c Mon Sep 17 00:00:00 2001
From: sumwailiu <798465811@qq.com>
Date: Wed, 8 Jan 2025 12:53:16 +0800
Subject: [PATCH 1/6] Update conda_env.yml

---
 conda_env.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/conda_env.yml b/conda_env.yml
index 01a5da2..797fd10 100644
--- a/conda_env.yml
+++ b/conda_env.yml
@@ -82,9 +82,9 @@ dependencies:
   - tensorflow-estimator==2.9.0
   - tensorflow-io-gcs-filesystem==0.26.0
   - termcolor==1.1.0
-  - torch==1.8.2+cu111
-  - torchaudio==0.8.2
-  - torchvision==0.9.2+cu111
+  - torch==2.1.0
+  - torchaudio==2.1.0
+  - torchvision==0.16.0
   - tqdm==4.64.0
   - typing-extensions==4.2.0
   - urllib3==1.26.9

From f0714288ad5f34cfcd6f51463f20eec8d7b0633c Mon Sep 17 00:00:00 2001
From: sumwailiu <798465811@qq.com>
Date: Wed, 8 Jan 2025 12:53:23 +0800
Subject: [PATCH 2/6] Update env.py

---
 env.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/env.py b/env.py
index cb8a895..1b2839c 100644
--- a/env.py
+++ b/env.py
@@ -30,6 +30,7 @@
   'humanoid-walk',
   'fish-swim',
   'acrobot-swingup',
+  'quadruped-run'
 ]
 CONTROL_SUITE_ACTION_REPEATS = {
   'cartpole': 8,
@@ -41,6 +42,7 @@
   'humanoid': 2,
   'fish': 2,
   'acrobot': 4,
+  'quadruped': 2
 }

From 37591dce59eb62df6d87ca736e534c462a24f8ff Mon Sep 17 00:00:00 2001
From: sumwailiu <798465811@qq.com>
Date: Tue, 11 Feb 2025 13:54:53 +0800
Subject: [PATCH 3/6] return computation & plan horizon fix

---
 main.py  | 4 ++--
 utils.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/main.py b/main.py
index 2e5a63b..f83e9b7 100644
--- a/main.py
+++ b/main.py
@@ -479,7 +479,7 @@ def update_belief_and_act(
         imged_reward = bottle(reward_model, (imged_beliefs, imged_prior_states))
         value_pred = bottle(value_model, (imged_beliefs, imged_prior_states))
         returns = lambda_return(
-            imged_reward, value_pred, bootstrap=value_pred[-1], discount=args.discount, lambda_=args.disclam
+            imged_reward[:-1], value_pred[:-1], bootstrap=value_pred[-1], discount=args.discount, lambda_=args.disclam
         )
         actor_loss = -torch.mean(returns)
         # Update model parameters
@@ -494,7 +494,7 @@ def update_belief_and_act(
         value_prior_states = imged_prior_states.detach()
         target_return = returns.detach()
         value_dist = Normal(
-            bottle(value_model, (value_beliefs, value_prior_states)), 1
+            bottle(value_model, (value_beliefs, value_prior_states))[:-1], 1
         )  # detach the input tensor from the transition network.
         value_loss = -value_dist.log_prob(target_return).mean(dim=(0, 1))
         # Update model parameters
diff --git a/utils.py b/utils.py
index df7b3bf..cb4ae76 100644
--- a/utils.py
+++ b/utils.py
@@ -74,7 +74,7 @@ def imagine_ahead(prev_state, prev_belief, policy, transition_model, planning_ho
     prev_state = flatten(prev_state)
     # Create lists for hidden states (cannot use single tensor as buffer because autograd won't work with inplace writes)
-    T = planning_horizon
+    T = planning_horizon + 1
     beliefs, prior_states, prior_means, prior_std_devs = (
         [torch.empty(0)] * T,
         [torch.empty(0)] * T,
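
Patch 3 makes imagine_ahead produce planning_horizon + 1 imagined latent states, then slices the imagined rewards and value predictions to the first H entries and bootstraps the return from the final value prediction. For context, here is a minimal sketch of a Dreamer-style lambda-return under that convention; the shapes mirror the patch, but lambda_return_sketch and the random inputs are illustrative stand-ins rather than the repository's lambda_return.

    import torch

    def lambda_return_sketch(rewards, values, bootstrap, discount=0.99, lambda_=0.95):
        # rewards, values: [H, batch] tensors for imagined steps 0..H-1;
        # bootstrap: [batch] value estimate for the state one step past the last reward.
        next_values = torch.cat([values[1:], bootstrap.unsqueeze(0)], dim=0)
        # One-step targets that mix the learned value with the recursive lambda-return.
        inputs = rewards + discount * (1 - lambda_) * next_values
        outputs, last = [], bootstrap
        for t in reversed(range(rewards.shape[0])):
            last = inputs[t] + discount * lambda_ * last
            outputs.append(last)
        return torch.stack(list(reversed(outputs)), dim=0)  # [H, batch]

    # With T = planning_horizon + 1 imagined states, slicing to [:-1] leaves H
    # reward/value entries and value_pred[-1] supplies the bootstrap, which is
    # the alignment the patch enforces.
    H, B = 15, 50
    imged_reward, value_pred = torch.randn(H + 1, B), torch.randn(H + 1, B)
    returns = lambda_return_sketch(imged_reward[:-1], value_pred[:-1], bootstrap=value_pred[-1])
    assert returns.shape == (H, B)
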
From 478f84c80d1bd43cc9bc3b918bbfd6867283de63 Mon Sep 17 00:00:00 2001
From: sumwailiu <798465811@qq.com>
Date: Tue, 11 Feb 2025 13:57:12 +0800
Subject: [PATCH 4/6] Modify default model learning rate

---
 main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.py b/main.py
index f83e9b7..99c3bf3 100644
--- a/main.py
+++ b/main.py
@@ -91,7 +91,7 @@
 parser.add_argument('--global-kl-beta', type=float, default=0, metavar='βg', help='Global KL weight (0 to disable)')
 parser.add_argument('--free-nats', type=float, default=3, metavar='F', help='Free nats')
 parser.add_argument('--bit-depth', type=int, default=5, metavar='B', help='Image bit depth (quantisation)')
-parser.add_argument('--model_learning-rate', type=float, default=1e-3, metavar='α', help='Learning rate')
+parser.add_argument('--model_learning-rate', type=float, default=6e-4, metavar='α', help='Learning rate')
 parser.add_argument('--actor_learning-rate', type=float, default=8e-5, metavar='α', help='Learning rate')
 parser.add_argument('--value_learning-rate', type=float, default=8e-5, metavar='α', help='Learning rate')
 parser.add_argument(

From 9d67b6376f74d6cac2f716bb3fe2367ed8233cd5 Mon Sep 17 00:00:00 2001
From: sumwailiu <798465811@qq.com>
Date: Tue, 11 Feb 2025 13:58:24 +0800
Subject: [PATCH 5/6] MemoryError fix

---
 main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.py b/main.py
index 99c3bf3..34c1520 100644
--- a/main.py
+++ b/main.py
@@ -681,7 +681,7 @@ def update_belief_and_act(
         )
     if args.checkpoint_experience:
         torch.save(
-            D, os.path.join(results_dir, 'experience.pth')
+            D, os.path.join(results_dir, 'experience.pth'), pickle_protocol=5
         )  # Warning: will fail with MemoryError with large memory sizes
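
Patch 5 passes pickle_protocol=5 to torch.save. torch.save pickles with protocol 2 by default, and protocols below 4 cannot serialize a single object larger than 4 GiB, which is what a large experience replay buffer runs into; protocol 4 (Python 3.4+) lifts that limit and protocol 5 (Python 3.8+) additionally supports out-of-band buffers. A minimal sketch, with D and results_dir as hypothetical stand-ins for the script's replay buffer and output directory:

    import os
    import torch

    results_dir = './results'                        # stand-in for the script's results_dir
    os.makedirs(results_dir, exist_ok=True)
    D = {'observations': torch.zeros(8, 3, 64, 64)}  # stand-in for the experience replay buffer

    # Protocol 5 is pickle.HIGHEST_PROTOCOL on Python 3.8+; any protocol >= 4
    # avoids the 4 GiB per-object limit of torch.save's default protocol.
    torch.save(D, os.path.join(results_dir, 'experience.pth'), pickle_protocol=5)
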
From 75bff11131125ffa28d93aed244ce23c2e068bb1 Mon Sep 17 00:00:00 2001
From: sumwailiu <798465811@qq.com>
Date: Tue, 11 Feb 2025 14:09:57 +0800
Subject: [PATCH 6/6] step counting consistency

---
 main.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/main.py b/main.py
index 34c1520..124051b 100644
--- a/main.py
+++ b/main.py
@@ -175,7 +175,7 @@
         D.append(observation, action, reward, done)
         observation = next_observation
         t += 1
-    metrics['steps'].append(t * args.action_repeat + (0 if len(metrics['steps']) == 0 else metrics['steps'][-1]))
+    metrics['steps'].append(int(t * args.action_repeat) + (0 if len(metrics['steps']) == 0 else metrics['steps'][-1]))
     metrics['episodes'].append(s)
 print("experience replay buffer is ready")
@@ -336,8 +336,8 @@ def update_belief_and_act(
     losses = []
     model_modules = transition_model.modules + encoder.modules + observation_model.modules + reward_model.modules
-    print("training loop")
-    for s in tqdm(range(args.collect_interval)):
+    print("\n training loop")
+    for s in tqdm(range(1, args.collect_interval + 1)):
         # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)} ~ D uniformly at random from the dataset (including terminal flags)
         observations, actions, rewards, nonterminals = D.sample(
             args.batch_size, args.chunk_size
         )
@@ -535,7 +535,7 @@ def update_belief_and_act(
         torch.zeros(1, args.state_size, device=args.device),
         torch.zeros(1, env.action_size, device=args.device),
     )
-    pbar = tqdm(range(args.max_episode_length // args.action_repeat))
+    pbar = tqdm(range(1, args.max_episode_length // args.action_repeat + 1))
     for t in pbar:
         # print("step",t)
         belief, posterior_state, action, next_observation, reward, done = update_belief_and_act(
@@ -560,7 +560,7 @@ def update_belief_and_act(
             break
     # Update and plot train reward metrics
-    metrics['steps'].append(t + metrics['steps'][-1])
+    metrics['steps'].append(int(t * args.action_repeat) + metrics['steps'][-1])
     metrics['episodes'].append(episode)
     metrics['train_rewards'].append(total_reward)
     lineplot(
@@ -651,7 +651,7 @@ def update_belief_and_act(
         test_envs.close()
     writer.add_scalar("train_reward", metrics['train_rewards'][-1], metrics['steps'][-1])
-    writer.add_scalar("train/episode_reward", metrics['train_rewards'][-1], metrics['steps'][-1] * args.action_repeat)
+    writer.add_scalar("train/episode_reward", metrics['train_rewards'][-1], metrics['steps'][-1])
     writer.add_scalar("observation_loss", metrics['observation_loss'][0][-1], metrics['steps'][-1])
     writer.add_scalar("reward_loss", metrics['reward_loss'][0][-1], metrics['steps'][-1])
     writer.add_scalar("kl_loss", metrics['kl_loss'][0][-1], metrics['steps'][-1])
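
Patch 6 counts metrics['steps'] in environment frames (interaction steps times action repeat) at every update site and starts the tqdm ranges at 1, so the loop variable equals the number of actions actually taken when an episode ends. A small sketch of that bookkeeping, with action_repeat, max_episode_length, and metrics as hypothetical stand-ins for the script's values:

    # Illustrative bookkeeping only; these names stand in for the script's variables.
    action_repeat = 2
    max_episode_length = 1000
    metrics = {'steps': [0]}

    for t in range(1, max_episode_length // action_repeat + 1):
        pass  # one policy action here consumes action_repeat simulator frames

    # Starting the range at 1 makes the final value of t equal the number of
    # actions taken, so frames accumulate consistently across episodes.
    metrics['steps'].append(int(t * action_repeat) + metrics['steps'][-1])
    assert metrics['steps'][-1] == max_episode_length
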