{ "cells": [ { "cell_type": "markdown", "id": "131eb6a0-9307-4b7d-a824-be58fe26b983", "metadata": {}, "source": [ "# Pre-requisite" ] }, { "cell_type": "code", "execution_count": null, "id": "cfe957f9-815a-4a8d-a272-feb9d023b779", "metadata": {}, "outputs": [], "source": [ "%cd /Notebooks/without-ude/\n", "!git clone https://github.com/DLR-RM/stable-baselines3.git\n", "%cd /Notebooks/without-ude/stable-baselines3/\n", "!git checkout 58a9806\n", "!cp /Notebooks/without-ude/stable-baselines3.patch /Notebooks/without-ude/stable-baselines3/\n", "%cd /Notebooks/without-ude/stable-baselines3/\n", "!git apply --reject --whitespace=fix ./stable-baselines3.patch\n", "%cd /Notebooks/without-ude/" ] }, { "cell_type": "code", "execution_count": null, "id": "3b4bc99b-03be-48fe-b8ea-e75a70af4a3c", "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import grpc\n", "import os\n", "import time\n", "import gym\n", "from gym.spaces.space import Space\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "\n", "from ude import UDEToGymWrapper\n", "from ude.environment.ude_environment import UDEEnvironment\n", "from typing import Union, Tuple, Dict, List, Any, Optional\n", "\n", "\n", "%matplotlib inline" ] }, { "cell_type": "markdown", "id": "7cc9247e-7257-47e9-80a2-5f47d75b76d4", "metadata": {}, "source": [ "# Customize the following settings\n", "\n", "1. ENV_NAME -> UDE paper experiments are the following gym environment Hopper-v2, LunarLanderContinuous-v2, Pendulum-v1\n", "3. ALGO -> UDE paper experiments are the following PPO, DDPG, SAC algorithm" ] }, { "cell_type": "code", "execution_count": null, "id": "3cc810f2-8e7c-4475-9c81-00340014a901", "metadata": {}, "outputs": [], "source": [ "ENV_NAME = \"Hopper-v2\" # This experiment is run for Hopper-v2, LunarLanderContinuous-v2, Pendulum-v1\n", "ALGO = \"PPO\" # Supported are PPO, DDPG, SAC" ] }, { "cell_type": "code", "execution_count": null, "id": "65ea6bd8-cd0e-4df9-9ab3-22bb1e337c28", "metadata": {}, "outputs": [], "source": [ "BASE_PATH = \"/Notebooks/without-ude\"" ] }, { "cell_type": "code", "execution_count": null, "id": "d95e1db5-1455-41a1-a0fd-d670483fcbc0", "metadata": {}, "outputs": [], "source": [ "%cd {BASE_PATH}/stable-baselines3\n", "\n", "from stable_baselines3 import PPO\n", "from stable_baselines3 import SAC\n", "from stable_baselines3 import DDPG\n", "from stable_baselines3.common.evaluation import evaluate_policy" ] }, { "cell_type": "code", "execution_count": null, "id": "0e4bbcd3-b9ea-4a3a-8ca8-b6af090abe70", "metadata": {}, "outputs": [], "source": [ "def write_metrics(path, data):\n", " with open(path, \"a+\") as fp:\n", " fp.write(data)" ] }, { "cell_type": "markdown", "id": "5bc3cfc4-672e-459f-90f7-9c918e8154bf", "metadata": {}, "source": [ "# Intialize environment" ] }, { "cell_type": "code", "execution_count": null, "id": "32765ab7-1600-4ce3-87d5-0e58fbac2aa7", "metadata": {}, "outputs": [], "source": [ "model_path = \"{}/output/models/{}-MlpPolicy-without-ude-baseline-{}\".format(BASE_PATH, ALGO, ENV_NAME)\n", "experiment_results_path = \"{}/output/experiment_results/{}-MlpPolicy-without-ude-baseline-{}\".format(BASE_PATH, ALGO, ENV_NAME)" ] }, { "cell_type": "code", "execution_count": null, "id": "c83b2fd3-8624-47d9-a419-d46b92c501d9", "metadata": {}, "outputs": [], "source": [ "%mkdir -p {model_path}\n", "%mkdir -p {experiment_results_path}" ] }, { "cell_type": "code", "execution_count": null, "id": "c634f0ec-1cfe-4fba-a3d9-d68bd46524a8", "metadata": {}, "outputs": [], "source": [ "env = 
 { "cell_type": "code", "execution_count": null, "id": "c634f0ec-1cfe-4fba-a3d9-d68bd46524a8", "metadata": {}, "outputs": [], "source": [ "env = gym.make(ENV_NAME)\n", "seed_list = [0, 1, 6, 7, 9]\n", "total_timesteps = 1000000\n", "evals_between_training_step = 1000" ] },
 { "cell_type": "markdown", "id": "3cdaedc3-1c58-48a9-9156-e7452deea32c", "metadata": {}, "source": [ "# Train with different seeds" ] },
 { "cell_type": "code", "execution_count": null, "id": "96d3cfe1-fc7a-4ba3-b39f-0ec7eab3f64c", "metadata": {}, "outputs": [], "source": [ "for seed in seed_list:\n", "    model_seed_path = \"{}/seed-{}\".format(model_path, seed)\n", "    experiment_result_seed_path = \"{}/seed-{}.txt\".format(experiment_results_path, seed)\n", "    step_experiment_result_seed_path = \"{}/step-seed-{}.txt\".format(experiment_results_path, seed)\n", "    if ALGO == \"PPO\":\n", "        model = PPO(policy=\"MlpPolicy\", env=env, verbose=0, seed=seed,\n", "                    metric_path=step_experiment_result_seed_path)\n", "    elif ALGO == \"SAC\":\n", "        model = SAC(policy=\"MlpPolicy\", env=env, verbose=0, seed=seed,\n", "                    metric_path=step_experiment_result_seed_path)\n", "    elif ALGO == \"DDPG\":\n", "        model = DDPG(policy=\"MlpPolicy\", env=env, verbose=0, seed=seed,\n", "                     metric_path=step_experiment_result_seed_path)\n", "    else:\n", "        raise Exception(\"Supported ALGO values are PPO, SAC, DDPG\")\n", "    # Alternate short training bursts with evaluation so a learning curve can be logged.\n", "    for i in range(total_timesteps // evals_between_training_step):\n", "        model.increment_iteration_number()\n", "\n", "        start_training_time = time.time()\n", "        model.learn(total_timesteps=evals_between_training_step)\n", "        total_training_time = time.time() - start_training_time\n", "        if i % 10 == 0:\n", "            model.save(model_seed_path)\n", "        env.reset()\n", "        start_eval_time = time.time()\n", "        mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5)\n", "        total_eval_time = time.time() - start_eval_time\n", "        data = \"{}|{}|{}|{}|{}|{}|{}|{}\\n\".format(seed, i,\n", "                                                  start_training_time, total_training_time,\n", "                                                  start_eval_time, total_eval_time,\n", "                                                  mean_reward, std_reward)\n", "        write_metrics(experiment_result_seed_path, data)\n", "    model.save(model_seed_path)\n", "    del model" ] },
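 { "cell_type": "markdown", "id": "f0a1b2c3-d4e5-4f60-8a71-b2c3d4e5f603", "metadata": {}, "source": [ "Optional: a minimal sketch of reloading the checkpoint saved for one seed and re-evaluating it with `evaluate_policy`. It assumes the training loop above has completed for that seed; `algo_classes`, `reload_seed`, `reload_path`, and `loaded_model` are illustrative helpers, and depending on how the `metric_path` patch interacts with `.load()`, you may need to pass `custom_objects` or rebuild the model instead." ] },
 { "cell_type": "code", "execution_count": null, "id": "f0a1b2c3-d4e5-4f60-8a71-b2c3d4e5f604", "metadata": {}, "outputs": [], "source": [ "# Minimal sketch (illustrative helper names): reload a saved checkpoint and re-evaluate it.\n", "# Assumes the training loop above completed for seed_list[0].\n", "algo_classes = {\"PPO\": PPO, \"SAC\": SAC, \"DDPG\": DDPG}\n", "reload_seed = seed_list[0]\n", "reload_path = \"{}/seed-{}\".format(model_path, reload_seed)\n", "loaded_model = algo_classes[ALGO].load(reload_path, env=env)\n", "mean_reward, std_reward = evaluate_policy(loaded_model, env, n_eval_episodes=5)\n", "print(\"seed {}: mean_reward = {:.2f} +/- {:.2f}\".format(reload_seed, mean_reward, std_reward))" ] },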
 { "cell_type": "markdown", "id": "992b1de7-49a2-4245-a0a5-ff8447e5cf63", "metadata": {}, "source": [ "# Plot graphs" ] },
 { "cell_type": "code", "execution_count": null, "id": "114b4796-4f1d-4222-b538-fa68da7cd1a8", "metadata": {}, "outputs": [], "source": [ "seeds_mean_reward_list = []\n", "seeds_timesteps_list = []\n", "for seed in seed_list:\n", "    experiment_result_seed_path = \"{}/seed-{}.txt\".format(experiment_results_path, seed)\n", "    df = pd.read_csv(experiment_result_seed_path, sep=\"|\",\n", "                     names=[\"seed\", \"rollouts\",\n", "                            \"start_training_time\", \"total_training_time\",\n", "                            \"start_eval_time\", \"total_eval_time\",\n", "                            \"mean_reward\", \"std_reward\"])\n", "    df['timesteps'] = df['rollouts'] * evals_between_training_step\n", "    df['cumulative_training_time'] = df['total_training_time'].cumsum()\n", "    df['cumulative_evaluation_time'] = df['total_eval_time'].cumsum()\n", "    seeds_mean_reward_list.append(df[\"mean_reward\"].to_numpy())\n", "    seeds_timesteps_list.append(df[\"timesteps\"].to_numpy())\n", "\n", "    # Plot per-seed learning curves\n", "    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))\n", "    fig.suptitle(\"Seed = {}\".format(seed))\n", "\n", "    ax1.set_title(\"mean_reward vs timesteps\")\n", "    ax1.plot(df['timesteps'], df['mean_reward'])\n", "    ax1.set(xlabel='timesteps', ylabel='mean_reward')\n", "\n", "    ax2.set_title(\"mean_reward vs cumulative_training_time\")\n", "    ax2.plot(df['cumulative_training_time'], df['mean_reward'])\n", "    ax2.set(xlabel='cumulative_training_time (seconds)', ylabel='mean_reward')\n", "\n", "    ax3.set_title(\"mean_reward vs cumulative_evaluation_time\")\n", "    ax3.plot(df['cumulative_evaluation_time'], df['mean_reward'])\n", "    ax3.set(xlabel='cumulative_evaluation_time (seconds)', ylabel='mean_reward')\n", "    fig.show()" ] },
 { "cell_type": "code", "execution_count": null, "id": "e0d668b8-7d74-4f09-9342-68adbd7ee094", "metadata": {}, "outputs": [], "source": [ "fig, ax = plt.subplots(1, 1, figsize=(20, 6))\n", "fig.suptitle(\"Mean reward across all seeds\")\n", "\n", "avg_reward_all_seeds = np.array(seeds_mean_reward_list).mean(axis=0)\n", "ax.set_title(\"mean_reward vs timesteps\")\n", "ax.plot(seeds_timesteps_list[0], avg_reward_all_seeds)\n", "ax.set(xlabel='timesteps', ylabel='mean_reward')\n", "fig.show()" ] },
 { "cell_type": "code", "execution_count": null, "id": "41e5bc54-c8c4-4146-9a97-17447fb4f354", "metadata": {}, "outputs": [], "source": [] }
 ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" } }, "nbformat": 4, "nbformat_minor": 5 }