# Pre-requisite

In [None]:
# Fetch stable-baselines3, pin it to a known commit and apply the local patch.
# NOTE(review): `git clone` fails if the target directory already exists, so
# this cell is presumably meant to be run once on a fresh workspace -- confirm.
%cd /Notebooks/without-ude/
!git clone https://github.com/DLR-RM/stable-baselines3.git
%cd /Notebooks/without-ude/stable-baselines3/
# Pin the checkout to the commit the patch is applied against.
!git checkout 58a9806
# Copy the patch file into the repository so it can be applied via a relative path.
!cp /Notebooks/without-ude/stable-baselines3.patch /Notebooks/without-ude/stable-baselines3/
# Already in this directory from the earlier %cd; repeated here before applying.
%cd /Notebooks/without-ude/stable-baselines3/
# --reject applies the hunks that fit and leaves failed ones in *.rej files;
# --whitespace=fix corrects whitespace errors in the patch while applying.
!git apply --reject --whitespace=fix ./stable-baselines3.patch
# Return to the experiment root for the rest of the notebook.
%cd /Notebooks/without-ude/

In [None]:
import numpy as np
import grpc
import os
import time
import gym
from gym.spaces.space import Space
import pandas as pd
import matplotlib.pyplot as plt

from ude import UDEToGymWrapper
from ude.environment.ude_environment import UDEEnvironment
from typing import Union, Tuple, Dict, List, Any, Optional


%matplotlib inline

# Customize the following settings

1. ENV_NAME -> The UDE paper experiments use the following gym environments: Hopper-v2, LunarLanderContinuous-v2, Pendulum-v1
2. ALGO -> The UDE paper experiments use the following algorithms: PPO, DDPG, SAC

In [None]:
# Gym environment to train on. This experiment is run for
# Hopper-v2, LunarLanderContinuous-v2 and Pendulum-v1.
ENV_NAME = 'Hopper-v2'
# Algorithm used for training. Supported values are PPO, DDPG and SAC.
ALGO = 'PPO'

In [None]:
# Root directory of the experiment; the stable-baselines3 checkout and the
# output folders are addressed relative to it.
BASE_PATH = '/Notebooks/without-ude'

In [None]:
# Switch into the stable-baselines3 directory before importing; presumably this
# makes `stable_baselines3` resolve to that local checkout rather than an
# installed copy -- TODO confirm against the kernel's sys.path.
%cd {BASE_PATH}/stable-baselines3

# Algorithm classes selectable through ALGO.
from stable_baselines3 import PPO
from stable_baselines3 import SAC
from stable_baselines3 import DDPG
# Helper used to score the policy after each training round.
from stable_baselines3.common.evaluation import evaluate_policy

In [None]:
def write_metrics(path, data):
    """Append ``data`` to the file at ``path``.

    The file is opened in ``"a+"`` mode, so it is created when it does not
    exist yet and any existing content is preserved.

    Args:
        path: Location of the metrics file.
        data: Text to append; written verbatim, no newline is added.
    """
    with open(path, "a+") as metrics_file:
        metrics_file.write(data)

# Initialize environment

In [None]:
# Output locations for the selected (algorithm, environment) pair: one folder
# for saved models and one for the experiment result files.
model_path = f"{BASE_PATH}/output/models/{ALGO}-MlpPolicy-without-ude-baseline-{ENV_NAME}"
experiment_results_path = (
    f"{BASE_PATH}/output/experiment_results/{ALGO}-MlpPolicy-without-ude-baseline-{ENV_NAME}"
)

In [None]:
# Create the output directories up front. exist_ok=True keeps re-runs of this
# cell from failing when they already exist (same effect as `mkdir -p`), and
# os.makedirs handles paths containing spaces and works on any platform.
os.makedirs(model_path, exist_ok=True)
os.makedirs(experiment_results_path, exist_ok=True)

In [None]:
# Single environment instance, used for both training and evaluation.
env = gym.make(ENV_NAME)

# One full training run is performed per seed.
seed_list = [0, 1, 6, 7, 9]
# Nominal training budget per seed.
total_timesteps = 1_000_000
# Timesteps requested from each model.learn() call between two evaluations.
evals_between_training_step = 1_000

# Train with different seeds

In [None]:
# Lookup from each supported ALGO value to its stable-baselines3 class.
algo_classes = {"PPO": PPO, "SAC": SAC, "DDPG": DDPG}

for seed in seed_list:
    # Per-seed outputs: the saved model, the per-round evaluation metrics and
    # the file handed to the model through `metric_path`.
    model_seed_path = f"{model_path}/seed-{seed}"
    experiment_result_seed_path = f"{experiment_results_path}/seed-{seed}.txt"
    step_experiment_result_seed_path = f"{experiment_results_path}/step-seed-{seed}.txt"

    algo_class = algo_classes.get(ALGO)
    if algo_class is None:
        raise Exception("Supported ALGO values are PPO, SAC, DDPG")
    # NOTE(review): `metric_path` and `increment_iteration_number()` are
    # presumably provided by the local stable-baselines3 patch -- confirm.
    model = algo_class(policy="MlpPolicy", env=env, verbose=0, seed=seed,
                       metric_path=step_experiment_result_seed_path)

    # Alternate between a short training round and an evaluation.
    for i in range(total_timesteps // evals_between_training_step):
        model.increment_iteration_number()

        # Train for one round and record how long it took.
        start_training_time = time.time()
        model.learn(total_timesteps=evals_between_training_step)
        total_training_time = time.time() - start_training_time

        # Checkpoint every 10th round.
        if i % 10 == 0:
            model.save(model_seed_path)

        # Evaluate the current policy over 5 episodes and time it.
        env.reset()
        start_eval_time = time.time()
        mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5)
        total_eval_time = time.time() - start_eval_time

        # One pipe-separated row per round, appended to the seed's result file.
        metrics_row = (
            f"{seed}|{i}|"
            f"{start_training_time}|{total_training_time}|"
            f"{start_eval_time}|{total_eval_time}|"
            f"{mean_reward}|{std_reward}\n"
        )
        write_metrics(experiment_result_seed_path, metrics_row)

    # Final save for this seed, then drop the model before the next seed.
    model.save(model_seed_path)
    del model

# Plot graphs

In [None]:
# Column layout of the pipe-separated rows in each seed's result file.
# "rollouts" holds the index of the training round the row was logged for.
metric_columns = [
    "seed", "rollouts",
    "start_training_time", "total_training_time",
    "start_eval_time", "total_eval_time",
    "mean_reward", "std_reward",
]

# Collected per seed so a later cell can average the rewards across seeds.
seeds_mean_reward_list = []
seeds_timesteps_list = []

for seed in seed_list:
    experiment_result_seed_path = "{}/seed-{}.txt".format(experiment_results_path, seed)
    # Read the result file once (the file has no header row, hence `names`).
    df = pd.read_csv(experiment_result_seed_path, sep="|", names=metric_columns)

    # Derived columns: x-axis in timesteps plus cumulative wall-clock times.
    df['timesteps'] = df['rollouts'] * evals_between_training_step
    df['cumulative_training_time'] = df['total_training_time'].cumsum()
    df['cumulative_evaluation_time'] = df['total_eval_time'].cumsum()
    seeds_mean_reward_list.append(df["mean_reward"].to_numpy())
    seeds_timesteps_list.append(df["timesteps"].to_numpy())

    # Plotting graphs: one row of three panels per seed.
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
    fig.suptitle("Seed = {}".format(seed))

    ax1.set_title("mean_reward vs timesteps")
    ax1.plot(df['timesteps'], df['mean_reward'])
    ax1.set(xlabel='timesteps', ylabel='mean_reward')

    ax2.set_title("mean_reward vs cumulative_training_time")
    ax2.plot(df['cumulative_training_time'], df['mean_reward'])
    ax2.set(xlabel='cumulative_training_time (seconds)', ylabel='mean_reward')

    ax3.set_title("mean_reward vs cumulative_evaluation_time")
    ax3.plot(df['cumulative_evaluation_time'], df['mean_reward'])
    ax3.set(xlabel='cumulative_evaluation_time (seconds)', ylabel='mean_reward')

    # plt.show() renders the figure with the inline backend; Figure.show()
    # only works with GUI backends and otherwise just emits a warning.
    plt.show()

In [None]:
# Average the evaluation reward over all seeds and plot it against timesteps.
fig, ax = plt.subplots(1, 1, figsize=(20, 6))
fig.suptitle("Mean reward across all seeds")

# Seeds may have logged different numbers of rows (e.g. an interrupted run);
# average only over the prefix every seed has so the arrays stack into a
# regular 2-D array. With equal lengths this is the plain per-step mean.
common_len = min(len(rewards) for rewards in seeds_mean_reward_list)
avg_reward_all_seeds = np.array(
    [rewards[:common_len] for rewards in seeds_mean_reward_list]
).mean(axis=0)

ax.set_title("mean_reward vs timesteps")
# The first seed's timesteps serve as the shared x-axis.
ax.plot(seeds_timesteps_list[0][:common_len], avg_reward_all_seeds)
ax.set(xlabel='timesteps', ylabel='mean_reward')

# plt.show() renders the figure with the inline backend; Figure.show()
# only works with GUI backends and otherwise just emits a warning.
plt.show()