import os

import gym
import roboschool  # noqa: F401  (imported for its side effect of registering Roboschool envs)
from gym.wrappers.monitoring.video_recorder import VideoRecorder
from mpi4py import MPI
from stable_baselines.bench import Monitor
from stable_baselines.common import set_global_seeds
from stable_baselines.common import tf_util
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.ppo1 import PPO1


class RewScale(gym.RewardWrapper):
    """Reward wrapper that scales every reward by a constant factor."""

    def __init__(self, env, scale):
        gym.RewardWrapper.__init__(self, env)
        self.scale = scale

    def reward(self, _reward):
        return _reward * self.scale


class SagemakerStableBaselinesLauncher:
    """SageMaker launcher for Stable Baselines models."""

    def __init__(self, env, output_path, model, num_timesteps):
        self._env = env
        self._output_path = output_path
        self._model = model
        self._num_timesteps = num_timesteps

    def _train(self):
        """Train the RL model."""
        self._model.learn(total_timesteps=self._num_timesteps)

    def _predict(self, model, video_path):
        """Run predictions with the trained RL model and record a video."""
        vr = VideoRecorder(env=self._env,
                           path="{}/rl_out.mp4".format(video_path),
                           enabled=True)
        obs = self._env.reset()
        for i in range(1000):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = self._env.step(action)
            if dones:
                obs = self._env.reset()
            self._env.render(mode='rgb_array')
            vr.capture_frame()
        vr.close()
        self._env.close()

    def run(self):
        self._train()
        if MPI.COMM_WORLD.Get_rank() == 0:
            self._predict(self._model, self._output_path)


class SagemakerStableBaselinesPPO1Launcher(SagemakerStableBaselinesLauncher):
    """SageMaker launcher for the Stable Baselines PPO1 model."""

    def __init__(self, env, output_path, timesteps_per_actorbatch, clip_param,
                 entcoeff, optim_epochs, optim_stepsize, optim_batchsize,
                 gamma, lam, schedule, verbose, num_timesteps):
        print(
            "Initializing PPO with output_path: {} and Hyper Params [timesteps_per_actorbatch: {}, clip_param: {}, "
            "entcoeff: {}, optim_epochs: {}, optim_stepsize: {}, optim_batchsize: {}, gamma: {}, lam: {}, "
            "schedule: {}, verbose: {}, num_timesteps: {}]".format(
                output_path, timesteps_per_actorbatch, clip_param, entcoeff,
                optim_epochs, optim_stepsize, optim_batchsize, gamma, lam,
                schedule, verbose, num_timesteps))
        super().__init__(env,
                         output_path,
                         PPO1(policy=MlpPolicy,
                              env=env,
                              gamma=gamma,
                              timesteps_per_actorbatch=timesteps_per_actorbatch,
                              clip_param=clip_param,
                              entcoeff=entcoeff,
                              optim_epochs=optim_epochs,
                              optim_stepsize=optim_stepsize,
                              optim_batchsize=optim_batchsize,
                              lam=lam,
                              schedule=schedule,
                              verbose=verbose),
                         num_timesteps)


def create_env(env_id, output_path, seed=0):
    """Create a monitored gym environment; each MPI rank gets its own seed and Monitor log file."""
    rank = MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed + 10000 * rank)
    env = gym.make(env_id)
    env = Monitor(env, os.path.join(output_path, str(rank)), allow_early_resets=True)
    env.seed(seed)
    return env
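
# --- Hypothetical usage sketch (not part of the original launcher code) ---
# A SageMaker training job would typically drive these classes from a script
# entry point. The argument names, default values, environment id, and the
# SM_OUTPUT_DATA_DIR environment variable used below are illustrative
# assumptions about how such an entry point could be wired up, not something
# defined by the code above.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--env_id', type=str, default='RoboschoolHalfCheetah-v1')
    parser.add_argument('--num_timesteps', type=int, default=100000)
    parser.add_argument('--output_path', type=str,
                        default=os.environ.get('SM_OUTPUT_DATA_DIR', '/opt/ml/output/data'))
    args = parser.parse_args()

    # Build the MPI-aware, monitored environment defined in create_env().
    env = create_env(args.env_id, args.output_path)

    # Train PPO1 with example hyperparameters, then record a rollout video on rank 0.
    launcher = SagemakerStableBaselinesPPO1Launcher(
        env=env,
        output_path=args.output_path,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
        verbose=1,
        num_timesteps=args.num_timesteps)
    launcher.run()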