import json

from markov.architecture.constants import Input
from markov.architecture.embedder_factory import create_input_embedder, create_middle_embedder
from markov.constants import (ExplorationTypes, HyperParameterKeys, LossTypes)
from markov.environments.deepracer_racetrack_env import DeepRacerRacetrackEnvParameters
from markov.environments.constants import (
    NUMBER_OF_LIDAR_SECTORS,
    SECTOR_LIDAR_CLIPPING_DIST
)
from markov.log_handler.exception_handler import log_and_exit
from markov.log_handler.constants import (SIMAPP_EVENT_ERROR_CODE_500,
                                          SIMAPP_SIMULATION_WORKER_EXCEPTION)
from markov.multi_agent_coach.multi_agent_graph_manager import MultiAgentGraphManager
from markov.multi_agent_coach.agents.sac_agent import SoftActorCriticAgentParameters
from markov.multi_agent_coach.spaces import ScalableBoxActionSpace
from markov.memories.deepracer_memory import DeepRacerMemoryParameters
from markov.boto.s3.constants import TrainingAlgorithm
from rl_coach.spaces import DiscreteActionSpace
from rl_coach.memories.non_episodic.experience_replay import ExperienceReplayParameters
from rl_coach.exploration_policies.additive_noise import AdditiveNoiseParameters
from rl_coach.agents.clipped_ppo_agent import ClippedPPOAgentParameters
from rl_coach.base_parameters import VisualizationParameters, PresetValidationParameters, \
    DistributedCoachSynchronizationType, RunType
from rl_coach.core_types import TrainingSteps, EnvironmentEpisodes, EnvironmentSteps
from markov.exploration_policies.deepracer_categorical import DeepRacerCategoricalParameters
from rl_coach.exploration_policies.e_greedy import EGreedyParameters
from rl_coach.filters.filter import InputFilter
from rl_coach.filters.observation.observation_rgb_to_y_filter import ObservationRGBToYFilter
from rl_coach.filters.observation.observation_stacking_filter import ObservationStackingFilter
from rl_coach.filters.observation.observation_to_uint8_filter import ObservationToUInt8Filter
from rl_coach.filters.observation.observation_clipping_filter import ObservationClippingFilter
from markov.filters.observation.observation_sector_discretize_filter import ObservationSectorDiscretizeFilter
from rl_coach.graph_managers.graph_manager import ScheduleParameters
from rl_coach.schedules import LinearSchedule


class DeepRacerClippedPPOAgentParams(ClippedPPOAgentParameters):
    def __init__(self):
        super().__init__()
        # Agent to pass to the environment class; the objects in this list
        # should adhere to the AgentInterface.
        self.env_agent = None


class DeepRacerSACAgentParams(SoftActorCriticAgentParameters):
    def __init__(self):
        super().__init__()
        # Agent to pass to the environment class; the objects in this list
        # should adhere to the AgentInterface.
        self.env_agent = None


def get_sac_params(agent_params, agent, params, run_type=str(RunType.ROLLOUT_WORKER)):
    """Apply the algorithm-specific settings required for the SAC algorithm.

    Args:
        agent_params (DeepRacerSACAgentParams): the agent parameters that will be used
            to create the RL agent
        agent (Agent): the agent object that was created either as part of
            create_rollout_agent or create_training_agent
        params (dict): dictionary of hyperparameters
        run_type (str): RunType value indicating whether this is the trainer
            or a rollout worker

    Returns:
        DeepRacerSACAgentParams: updated agent params object
    """
    for net_key in ["policy", "v", "q"]:
        agent_params.network_wrappers[net_key].learning_rate = \
            params[HyperParameterKeys.LEARNING_RATE.value]
        agent_params.network_wrappers[net_key].input_embedders_parameters = \
            create_input_embedder(agent.network_settings['input_embedders'],
                                  agent.network_settings['embedder_type'],
                                  agent.network_settings['activation_function'])
        # DH: use an empty middleware embedder for the q net
        if net_key != "q":
            agent_params.network_wrappers[net_key].middleware_parameters = \
                create_middle_embedder(agent.network_settings['middleware_embedders'],
                                       agent.network_settings['embedder_type'],
                                       agent.network_settings['activation_function'])
    for net_key in ["policy", "q", "v"]:
        agent_params.network_wrappers[net_key].batch_size = \
            params[HyperParameterKeys.BATCH_SIZE.value]
        agent_params.network_wrappers[net_key].optimizer_epsilon = 1e-5
        agent_params.network_wrappers[net_key].adam_optimizer_beta2 = 0.999
        if params[HyperParameterKeys.LOSS_TYPE.value] == LossTypes.HUBER.value:
            agent_params.network_wrappers[net_key].replace_mse_with_huber_loss = True
    agent_params.network_wrappers['policy'].heads_parameters[0].sac_alpha = \
        params[HyperParameterKeys.SAC_ALPHA.value]
    # Rescale action values in the policy head
    agent_params.network_wrappers['policy'].heads_parameters[0].rescale_action_values = True
    agent_params.algorithm.discount = params[HyperParameterKeys.DISCOUNT_FACTOR.value]
    # DH: num_steps_between_copying_online_weights_to_target should be set as
    # EnvironmentSteps instead of EnvironmentEpisodes;
    # see should_copy_online_weight... in agent.py
    agent_params.algorithm.num_steps_between_copying_online_weights_to_target = \
        EnvironmentSteps(params[HyperParameterKeys.NUM_EPISODES_BETWEEN_TRAINING.value])
    agent_params.algorithm.distributed_coach_synchronization_type = \
        DistributedCoachSynchronizationType.SYNC
    # tau=1
    agent_params.algorithm.rate_for_copying_weights_to_target = 1
    agent_params.algorithm.use_deterministic_for_evaluation = True
    # DH: ---- to address the training worker fetch issue ----
    if run_type == str(RunType.TRAINER):
        agent_params.memory = ExperienceReplayParameters()
    elif run_type == str(RunType.ROLLOUT_WORKER):
        agent_params.memory = DeepRacerMemoryParameters()  # EpisodicExperienceReplayParameters()
    return agent_params
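
# The sketch below is illustrative only and is never invoked by this module.
# It shows how get_sac_params is expected to be driven: resolve the
# hyperparameter dict for SAC first, then apply it to a fresh
# DeepRacerSACAgentParams. The `agent` argument is a hypothetical stand-in for
# an object exposing `network_settings`, as assumed elsewhere in this file.
def _example_sac_agent_params(agent, hp_dict):
    """Hedged usage sketch for get_sac_params (illustrative only)."""
    params = get_updated_hyper_parameters(hp_dict, TrainingAlgorithm.SAC.value)
    # Trainer processes get plain experience replay; rollout workers get the
    # DeepRacer memory (see the run_type branch inside get_sac_params).
    return get_sac_params(DeepRacerSACAgentParams(), agent, params,
                          run_type=str(RunType.TRAINER))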


def get_clipped_ppo_params(agent_params, agent, params):
    """Apply the algorithm-specific settings required for the Clipped PPO algorithm.

    Args:
        agent_params (DeepRacerClippedPPOAgentParams): the agent parameters that will be
            used to create the RL agent
        agent (Agent): the agent object that was created either as part of
            create_rollout_agent or create_training_agent
        params (dict): dictionary of hyperparameters

    Returns:
        DeepRacerClippedPPOAgentParams: updated agent params object with hyperparameters
        and other required details
    """
    agent_params.network_wrappers['main'].learning_rate = \
        params[HyperParameterKeys.LEARNING_RATE.value]
    agent_params.network_wrappers['main'].input_embedders_parameters = \
        create_input_embedder(agent.network_settings['input_embedders'],
                              agent.network_settings['embedder_type'],
                              agent.network_settings['activation_function'])
    agent_params.network_wrappers['main'].middleware_parameters = \
        create_middle_embedder(agent.network_settings['middleware_embedders'],
                               agent.network_settings['embedder_type'],
                               agent.network_settings['activation_function'])
    agent_params.network_wrappers['main'].batch_size = params[HyperParameterKeys.BATCH_SIZE.value]
    agent_params.network_wrappers['main'].optimizer_epsilon = 1e-5
    agent_params.network_wrappers['main'].adam_optimizer_beta2 = 0.999
    if params[HyperParameterKeys.LOSS_TYPE.value] == LossTypes.HUBER.value:
        agent_params.network_wrappers['main'].replace_mse_with_huber_loss = True
    agent_params.algorithm.clip_likelihood_ratio_using_epsilon = 0.2
    agent_params.algorithm.beta_entropy = params[HyperParameterKeys.BETA_ENTROPY.value]
    agent_params.algorithm.gae_lambda = 0.95
    agent_params.algorithm.discount = params[HyperParameterKeys.DISCOUNT_FACTOR.value]
    agent_params.algorithm.optimization_epochs = params[HyperParameterKeys.NUM_EPOCHS.value]
    agent_params.algorithm.estimate_state_value_using_gae = True
    agent_params.algorithm.num_steps_between_copying_online_weights_to_target = \
        EnvironmentEpisodes(params[HyperParameterKeys.NUM_EPISODES_BETWEEN_TRAINING.value])
    agent_params.algorithm.num_consecutive_playing_steps = \
        EnvironmentEpisodes(params[HyperParameterKeys.NUM_EPISODES_BETWEEN_TRAINING.value])
    agent_params.algorithm.distributed_coach_synchronization_type = \
        DistributedCoachSynchronizationType.SYNC
    exploration_type = params[HyperParameterKeys.EXPLORATION_TYPE.value].lower().strip()
    if exploration_type == ExplorationTypes.CATEGORICAL.value:
        agent_params.exploration = {
            DiscreteActionSpace: DeepRacerCategoricalParameters(use_stochastic_evaluation_policy=False),
            ScalableBoxActionSpace: AdditiveNoiseParameters()}
    elif exploration_type == ExplorationTypes.E_GREEDY.value:
        agent_params.exploration = {
            DiscreteActionSpace: EGreedyParameters(),
            ScalableBoxActionSpace: AdditiveNoiseParameters()}
        agent_params.exploration[DiscreteActionSpace].epsilon_schedule = \
            LinearSchedule(1.0,
                           params[HyperParameterKeys.E_GREEDY_VALUE.value],
                           params[HyperParameterKeys.EPSILON_STEPS.value])
    else:
        log_and_exit("Unknown exploration_type found in hyper parameters. "
                     "exploration_type: {}".format(exploration_type),
                     SIMAPP_SIMULATION_WORKER_EXCEPTION,
                     SIMAPP_EVENT_ERROR_CODE_500)
    agent_params.memory = DeepRacerMemoryParameters()
    return agent_params
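
# Illustrative sketch (never invoked by this module): the Clipped PPO path is
# driven the same way as the SAC sketch above, but takes no run_type argument,
# since get_clipped_ppo_params assigns DeepRacerMemoryParameters for every
# worker type. `agent` is again a hypothetical stand-in exposing
# `network_settings`.
def _example_clipped_ppo_agent_params(agent, hp_dict):
    """Hedged usage sketch for get_clipped_ppo_params (illustrative only)."""
    params = get_updated_hyper_parameters(hp_dict, TrainingAlgorithm.CLIPPED_PPO.value)
    return get_clipped_ppo_params(DeepRacerClippedPPOAgentParams(), agent, params)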


# TODO: refactor this module to be more modular based on the training
# algorithm and avoid if-else chains.
def get_updated_hyper_parameters(hp_dict, training_algorithm):
    """Update the default hyperparameters.

    Args:
        hp_dict (dict): hyperparameters passed when the training job is created
        training_algorithm (str): training algorithm value from the TrainingAlgorithm enum

    Returns:
        dict: updated hyperparameters
    """
    ##########################
    # All Default Parameters #
    ##########################
    params = {}
    params[HyperParameterKeys.BATCH_SIZE.value] = \
        int(hp_dict.get(HyperParameterKeys.BATCH_SIZE.value, 64))
    params[HyperParameterKeys.STACK_SIZE.value] = \
        int(hp_dict.get(HyperParameterKeys.STACK_SIZE.value, 1))
    params[HyperParameterKeys.LEARNING_RATE.value] = \
        float(hp_dict.get(HyperParameterKeys.LEARNING_RATE.value, 0.0003))
    params[HyperParameterKeys.EXPLORATION_TYPE.value] = \
        (hp_dict.get(HyperParameterKeys.EXPLORATION_TYPE.value,
                     ExplorationTypes.CATEGORICAL.value)).lower()
    params[HyperParameterKeys.E_GREEDY_VALUE.value] = \
        float(hp_dict.get(HyperParameterKeys.E_GREEDY_VALUE.value, 0.05))
    params[HyperParameterKeys.EPSILON_STEPS.value] = \
        int(hp_dict.get(HyperParameterKeys.EPSILON_STEPS.value, 10000))
    params[HyperParameterKeys.DISCOUNT_FACTOR.value] = \
        float(hp_dict.get(HyperParameterKeys.DISCOUNT_FACTOR.value, 0.999))
    params[HyperParameterKeys.LOSS_TYPE.value] = \
        hp_dict.get(HyperParameterKeys.LOSS_TYPE.value,
                    LossTypes.MEAN_SQUARED_ERROR.value).lower()
    params[HyperParameterKeys.NUM_EPISODES_BETWEEN_TRAINING.value] = \
        int(hp_dict.get(HyperParameterKeys.NUM_EPISODES_BETWEEN_TRAINING.value, 20))
    params[HyperParameterKeys.TERMINATION_CONDITION_MAX_EPISODES.value] = \
        int(hp_dict.get(HyperParameterKeys.TERMINATION_CONDITION_MAX_EPISODES.value, 100000))
    params[HyperParameterKeys.TERMINATION_CONDITION_AVG_SCORE.value] = \
        float(hp_dict.get(HyperParameterKeys.TERMINATION_CONDITION_AVG_SCORE.value, 100000))
    ####################
    # Clipped PPO algo #
    ####################
    if TrainingAlgorithm.CLIPPED_PPO.value == training_algorithm:
        params[HyperParameterKeys.BETA_ENTROPY.value] = \
            float(hp_dict.get(HyperParameterKeys.BETA_ENTROPY.value, 0.01))
        params[HyperParameterKeys.NUM_EPOCHS.value] = \
            int(hp_dict.get(HyperParameterKeys.NUM_EPOCHS.value, 10))
    ############
    # SAC algo #
    ############
    elif TrainingAlgorithm.SAC.value == training_algorithm:
        params[HyperParameterKeys.SAC_ALPHA.value] = \
            float(hp_dict.get(HyperParameterKeys.SAC_ALPHA.value, 0.2))
    return params
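
# Illustrative sketch (never invoked): get_updated_hyper_parameters falls back
# to a default for every key missing from hp_dict, so an empty dict yields the
# full default set. The literal values asserted below simply mirror the
# defaults coded above.
def _example_default_hyper_parameters():
    """Hedged usage sketch for get_updated_hyper_parameters (illustrative only)."""
    params = get_updated_hyper_parameters({}, TrainingAlgorithm.CLIPPED_PPO.value)
    assert params[HyperParameterKeys.BATCH_SIZE.value] == 64
    assert params[HyperParameterKeys.NUM_EPOCHS.value] == 10
    return params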


def get_graph_manager(hp_dict, agent_list, run_phase_subject, enable_domain_randomization=False,
                      done_condition=any, run_type=str(RunType.ROLLOUT_WORKER),
                      pause_physics=None, unpause_physics=None):
    """Build the multi-agent graph manager from hyperparameters and the agent list.

    Returns:
        tuple: (MultiAgentGraphManager, str) -- the graph manager and the JSON dump
        of the hyperparameters used.
    """
    ####################
    # Hyperparameters #
    ####################
    # Note: The following three lines are hard-coded to pick the first agent's
    # training algorithm and dump the hyperparameters for that algorithm into
    # JSON for training jobs (so that the console displays the training
    # hyperparameters correctly), since right now we only support training one
    # model at a time.
    # TODO: clean these lines up when we support multi-agent training.
    training_algorithm = agent_list[0].ctrl.model_metadata.training_algorithm if agent_list else None
    params = get_updated_hyper_parameters(hp_dict, training_algorithm)
    params_json = json.dumps(params, indent=2, sort_keys=True)
    print("Using the following hyper-parameters", params_json, sep='\n')

    ####################
    # Graph Scheduling #
    ####################
    schedule_params = ScheduleParameters()
    schedule_params.improve_steps = \
        TrainingSteps(params[HyperParameterKeys.TERMINATION_CONDITION_MAX_EPISODES.value])
    schedule_params.steps_between_evaluation_periods = EnvironmentEpisodes(40)
    schedule_params.evaluation_steps = EnvironmentEpisodes(5)
    schedule_params.heatup_steps = EnvironmentSteps(0)

    #########
    # Agent #
    #########
    trainable_agents_list = list()
    non_trainable_agents_list = list()
    for agent in agent_list:
        if agent.network_settings:
            training_algorithm = agent.ctrl.model_metadata.training_algorithm
            params = get_updated_hyper_parameters(hp_dict, training_algorithm)
            if TrainingAlgorithm.SAC.value == training_algorithm:
                agent_params = get_sac_params(DeepRacerSACAgentParams(), agent, params, run_type)
            else:
                agent_params = get_clipped_ppo_params(DeepRacerClippedPPOAgentParams(), agent, params)
            agent_params.env_agent = agent
            input_filter = InputFilter(is_a_reference_filter=True)
            for observation in agent.network_settings['input_embedders'].keys():
                if observation == Input.LEFT_CAMERA.value or observation == Input.CAMERA.value or \
                        observation == Input.OBSERVATION.value:
                    input_filter.add_observation_filter(observation, 'to_grayscale',
                                                        ObservationRGBToYFilter())
                    input_filter.add_observation_filter(observation, 'to_uint8',
                                                        ObservationToUInt8Filter(0, 255))
                    input_filter.add_observation_filter(observation, 'stacking',
                                                        ObservationStackingFilter(1))
                if observation == Input.STEREO.value:
                    input_filter.add_observation_filter(observation, 'to_uint8',
                                                        ObservationToUInt8Filter(0, 255))
                if observation == Input.LIDAR.value:
                    input_filter.add_observation_filter(observation, 'clipping',
                                                        ObservationClippingFilter(0.15, 1.0))
                if observation == Input.SECTOR_LIDAR.value:
                    sector_binary_filter = \
                        ObservationSectorDiscretizeFilter(num_sectors=NUMBER_OF_LIDAR_SECTORS,
                                                          num_values_per_sector=1,
                                                          clipping_dist=SECTOR_LIDAR_CLIPPING_DIST)
                    input_filter.add_observation_filter(observation, 'binary', sector_binary_filter)
                if observation == Input.DISCRETIZED_SECTOR_LIDAR.value:
                    num_sectors = agent.ctrl.model_metadata.lidar_num_sectors
                    num_values_per_sector = agent.ctrl.model_metadata.lidar_num_values_per_sector
                    clipping_dist = agent.ctrl.model_metadata.lidar_clipping_dist
                    sector_discretize_filter = \
                        ObservationSectorDiscretizeFilter(num_sectors=num_sectors,
                                                          num_values_per_sector=num_values_per_sector,
                                                          clipping_dist=clipping_dist)
                    input_filter.add_observation_filter(observation, 'discrete',
                                                        sector_discretize_filter)
            agent_params.input_filter = input_filter()
            trainable_agents_list.append(agent_params)
        else:
            non_trainable_agents_list.append(agent)

    ###############
    # Environment #
    ###############
    env_params = DeepRacerRacetrackEnvParameters()
    env_params.agents_params = trainable_agents_list
    env_params.non_trainable_agents = non_trainable_agents_list
    env_params.level = 'DeepRacerRacetrackEnv-v0'
    env_params.run_phase_subject = run_phase_subject
    env_params.enable_domain_randomization = enable_domain_randomization
    env_params.done_condition = done_condition
    env_params.pause_physics = pause_physics
    env_params.unpause_physics = unpause_physics

    vis_params = VisualizationParameters()
    vis_params.dump_mp4 = False

    ########
    # Test #
    ########
    preset_validation_params = PresetValidationParameters()
    preset_validation_params.test = True
    preset_validation_params.min_reward_threshold = 400
    preset_validation_params.max_episodes_to_achieve_reward = 10000

    graph_manager = MultiAgentGraphManager(agents_params=trainable_agents_list,
                                           env_params=env_params,
                                           schedule_params=schedule_params,
                                           vis_params=vis_params,
                                           preset_validation_params=preset_validation_params,
                                           done_condition=done_condition)
    return graph_manager, params_json
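
# Illustrative sketch of how callers are expected to consume get_graph_manager
# (never invoked by this module). `agents` and `phase_subject` are hypothetical
# stand-ins for objects built by the training/rollout entry points; they are
# not provided here.
def _example_build_graph_manager(hp_dict, agents, phase_subject):
    """Hedged usage sketch for get_graph_manager (illustrative only)."""
    graph_manager, params_json = get_graph_manager(
        hp_dict=hp_dict,
        agent_list=agents,
        run_phase_subject=phase_subject,
        run_type=str(RunType.TRAINER))
    # params_json is the hyperparameter dump printed to the console above.
    return graph_manager, params_json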