当前位置：首页 > news >正文

强化学习_06_pytorch-PPO2实践(ALE/Breakout-v5)

news 2025/9/12 20:12:18

一、环境适当调整

数据收集：RecordEpisodeStatistics
进行起始跳过n帧：baseSkipFrame
一条生命结束记录为done:EpisodicLifeEnv
得分处理成0或1:ClipRewardEnv
叠帧: FrameStack
- 图像环境的基本操作，方便CNN捕捉智能体的行动
向量空间reset处理修复
- gym.vector.SyncVectorEnv: 原始代码中的reset是随机的
- 继承重写的spSyncVectorEnv方法，支持每个向量的环境的seed一致，利于同一seed下环境的训练


class spSyncVectorEnv(gym.vector.SyncVectorEnv):"""step_await _terminateds reset"""def __init__(self,env_fns: Iterable[Callable[[], Env]],observation_space: Space = None,action_space: Space = None,copy: bool = True,random_reset: bool = False,seed: int = None):super().__init__(env_fns, observation_space, action_space, copy)self.random_reset = random_resetself.seed = seeddef step_wait(self) -> Tuple[Any, NDArray[Any], NDArray[Any], NDArray[Any], dict]:"""Steps through each of the environments returning the batched results.Returns:The batched environment step results"""observations, infos = [], {}for i, (env, action) in enumerate(zip(self.envs, self._actions)):(observation,self._rewards[i],self._terminateds[i],self._truncateds[i],info,) = env.step(action)if self._terminateds[i]:old_observation, old_info = observation, infoif self.random_reset:observation, info = env.reset(seed=np.random.randint(0, 999999))else:observation, info = env.reset() if self.seed is None else env.reset(seed=self.seed) info["final_observation"] = old_observationinfo["final_info"] = old_infoobservations.append(observation)infos = self._add_info(infos, info, i)self.observations = concatenate(self.single_observation_space, observations, self.observations)return (deepcopy(self.observations) if self.copy else self.observations,np.copy(self._rewards),np.copy(self._terminateds),np.copy(self._truncateds),infos,)

二、pytorch实践

2.1 智能体构建与训练

详细可见 Github: test_ppo_atari.Breakout_v5_ppo2_test

调整向量环境的reset 之后，

支持actor, criticor用同一个cnn层提取特征(PPOSharedCNN)
对eps进行了调小->eps=0.165，希望更新的策略范围更小一些；
关闭学习率衰减
进行不同ent_coef的尝试: 稍微大一点，增加agent的探索；
- ent_coef=0.015 & batch_size=256+128batch 陡降-回升慢
- ent_coef=0.025 & batch_size=256 陡降回升-最终reward=311
- √ ent_coef=0.05 & batch_size=256 -最终PPO2__AtariEnv instance__20241029__2217 reward=416
- ent_coef=0.05 & batch_size=256+128
- ent_coef=0.1 & batch_size=256 提升过于平缓

env_name = 'ALE/Breakout-v5' 
env_name_str = env_name.replace('/', '-')
gym_env_desc(env_name)
print("gym.__version__ = ", gym.__version__ )
path_ = os.path.dirname(__file__)
num_envs = 12
episod_life = True
clip_reward = True
resize_inner_area = True # True
env_pool_flag = False # True
seed = 202404
envs = spSyncVectorEnv([make_atari_env(env_name, skip=4, episod_life=episod_life, clip_reward=clip_reward, ppo_train=True, max_no_reward_count=120, resize_inner_area=resize_inner_area) for _ in range(num_envs)],random_reset=False,seed=202404
)
dist_type = 'norm'
cfg = Config(envs, save_path=os.path.join(path_, "test_models" ,f'PPO2_{env_name_str}-2'),  seed=202404,num_envs=num_envs,episod_life=episod_life,clip_reward=clip_reward,resize_inner_area=resize_inner_area,env_pool_flag=env_pool_flag,# 网络参数 Atria-CNN + MLPactor_hidden_layers_dim=[512, 256], critic_hidden_layers_dim=[512, 128], # agent参数actor_lr=4.5e-4,   gamma=0.99,# 训练参数num_episode=3600,  off_buffer_size=128,  max_episode_steps=128, PPO_kwargs={'cnn_flag': True,'clean_rl_cnn': True,'share_cnn_flag': True,'continue_action_flag': False,'lmbda': 0.95,'eps':  0.165,  # 0.165'k_epochs': 4,  #  update_epochs'sgd_batch_size': 512,  'minibatch_size': 256, 'act_type': 'relu','dist_type': dist_type,'critic_coef': 1.0, # 1.0'ent_coef': 0.05, 'max_grad_norm': 0.5,  'clip_vloss': True,'mini_adv_norm': True,'anneal_lr': False,'num_episode': 3600,}
)
minibatch_size = cfg.PPO_kwargs['minibatch_size']
max_grad_norm = cfg.PPO_kwargs['max_grad_norm']
cfg.trail_desc = f"actor_lr={cfg.actor_lr},minibatch_size={minibatch_size},max_grad_norm={max_grad_norm},hidden_layers={cfg.actor_hidden_layers_dim}",
agent = PPO2(state_dim=cfg.state_dim,actor_hidden_layers_dim=cfg.actor_hidden_layers_dim,critic_hidden_layers_dim=cfg.critic_hidden_layers_dim,action_dim=cfg.action_dim,actor_lr=cfg.actor_lr,critic_lr=cfg.critic_lr,gamma=cfg.gamma,PPO_kwargs=cfg.PPO_kwargs,device=cfg.device,reward_func=None
)
agent.train()
ppo2_train(envs, agent, cfg, wandb_flag=True, wandb_project_name=f"PPO2-{env_name_str}-NEW",train_without_seed=False, test_ep_freq=cfg.off_buffer_size * 10, online_collect_nums=cfg.off_buffer_size,test_episode_count=10, add_max_step_reward_flag=False,play_func='ppo2_play',ply_env=ply_env
)

2.2 训练出的智能体观测

最后将训练的最好的网络拿出来进行观察


env = make_atari_env(env_name, skip=4, episod_life=episod_life, clip_reward=clip_reward, ppo_train=True, max_no_reward_count=120, resize_inner_area=resize_inner_area, render_mode='human')()
ppo2_play(env, agent, cfg, episode_count=2, play_without_seed=False, render=True, ppo_train=True)