# Plots the CloudWatch metrics emitted by the most recent SageMaker
# hyperparameter tuning job and writes them to an interactive HTML report.

## Summary Columns
COLUMNS_TO_PLOT = ['td_mse', 'validation_td_mse', 'mean_q', 'training_iteration']
LAST_VALUE_TABLE_COLUMNS = ['Trial #', 'td_mse', 'mean_q', 'training_iteration']

## Debug Columns
# COLUMNS_TO_PLOT = ['mean_q', 'validation_mean_q', 'td_mse', 'validation_td_mse',
#                    'critic_loss', 'validation_critic_loss', 'cql_loss', 'validation_cql_loss',
#                    'actor_loss', 'validation_actor_loss', 'training_iteration',
#                    'iterations_since_restore', 'timesteps_total']
# LAST_VALUE_TABLE_COLUMNS = ['Trial #', 'td_mse', 'cql_loss', 'actor_loss', 'mean_q', 'training_iteration']

NUM_COLUMNS = 2         # scatter panels per subplot row
MAX_TRIALS_TO_PLOT = 4  # only chart the first few trials; raise this to plot more

import os
import math
from datetime import datetime, timezone

import boto3
import pandas as pd
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

pio.templates.default = "simple_white"
pd.options.plotting.backend = "plotly"

sm_client = boto3.client('sagemaker')
cw_client = boto3.client('cloudwatch')

# Grab the most recently created tuning job.
TUNING_JOB_NAME = sm_client.list_hyper_parameter_tuning_jobs(
    SortBy='CreationTime', MaxResults=1
)['HyperParameterTuningJobSummaries'][0]['HyperParameterTuningJobName']
print(f'Plotting tuning job {TUNING_JOB_NAME}')

# Number of subplot rows (the +1 reserves a spare panel slot).
num_rows = math.ceil((1 + len(COLUMNS_TO_PLOT)) / NUM_COLUMNS)

training_jobs_response = sm_client.list_training_jobs_for_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=TUNING_JOB_NAME, MaxResults=100)
print(f'{len(training_jobs_response["TrainingJobSummaries"])} training jobs')

# Collect each metric's time series, keyed by metric name, one pd.Series per trial.
metric_series = {}
for job in training_jobs_response['TrainingJobSummaries'][:MAX_TRIALS_TO_PLOT]:
    print(job['TrainingJobName'])
    available_metrics = cw_client.list_metrics(
        Namespace="/aws/sagemaker/TrainingJobs",
        Dimensions=[
            {'Name': 'TrainingJobName', 'Value': job['TrainingJobName']},
        ],
    )
    if len(available_metrics['Metrics']) == 0:
        print(f'No metrics for training job {job["TrainingJobName"]}')
        continue
    metric_data_queries = [
        {
            'Id': metric['MetricName'].lower(),
            'MetricStat': {
                'Metric': metric,
                'Stat': 'Minimum',  # 'Average' is another reasonable choice
                'Period': 300,
            },
        } for metric in available_metrics['Metrics']]
    cw_metric_data = cw_client.get_metric_data(
        MetricDataQueries=metric_data_queries,
        StartTime=job['TrainingStartTime'],
        # Jobs that are still running have no TrainingEndTime yet.
        EndTime=job.get('TrainingEndTime', datetime.now(timezone.utc)),
    )
    # get_metric_data returns newest-first by default, so reverse into chronological order.
    dict_of_series = {
        metric_data['Label']: pd.Series(metric_data['Values'][::-1], name=job['TrainingJobName'])
        for metric_data in cw_metric_data['MetricDataResults']}
    for key, value in dict_of_series.items():
        metric_series[key] = metric_series.get(key, []) + [value]

# Grid column 1 holds the summary table (spanning all rows); the rest are scatter panels.
specs = [[{"type": "table", "rowspan": num_rows}] + [{"type": "scatter"}] * NUM_COLUMNS] + \
        [[None] + [{"type": "scatter"}] * NUM_COLUMNS] * (num_rows - 1)
fig = make_subplots(
    rows=num_rows, cols=NUM_COLUMNS + 1,
    subplot_titles=['Latest Values'] + COLUMNS_TO_PLOT,
    specs=specs,
)
fig.update_layout(height=1000, width=1500, title_text=f"Tuning Job: {TUNING_JOB_NAME}")

# Scatter panels start at grid column 2; column 1 is reserved for the table.
plot_locs = [(i + 1, j + 2) for i in range(num_rows) for j in range(NUM_COLUMNS)]

# colors = px.colors.qualitative.Vivid
colors = px.colors.qualitative.Alphabet

latest_value_table = pd.DataFrame()

# Plot the trends.
for plot_loc, column_name in zip(plot_locs, COLUMNS_TO_PLOT):
    if column_name not in metric_series:
        print(f'{column_name} not in metric series')
        continue
    df = pd.concat(metric_series[column_name], axis=1)
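    # At this point df has one column per trial (named by training job) and the
    # metric's values down the rows; shorter trials are NaN-padded by the concat.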
    # Record each trial's most recent value for the summary table.
    latest_value_table[column_name] = pd.Series(
        {series.name: series.iloc[-1] for series in metric_series[column_name]})
    for trial in df.columns:
        # The trial number is the second-to-last dash-separated token of the job name.
        i = trial.split('-')[-2]
        fig.add_trace(
            go.Scatter(
                x=df.index, y=df[trial],
                mode='lines',  # 'lines' or 'markers'
                name=f'{i}. {trial}',
                legendgroup=trial,
                line={'color': colors[int(i) % len(colors)]},
                # Attach the legend to a single panel so each trial is listed once.
                showlegend=plot_loc == plot_locs[1],
            ),
            row=plot_loc[0], col=plot_loc[1])
    # Use a log scale if all values in the panel are above 0.
    if df.min().min() > 0:
        fig.update_yaxes(type="log", row=plot_loc[0], col=plot_loc[1])

latest_value_table['Trial #'] = latest_value_table.index.to_series().apply(lambda x: x.split('-')[-2])
latest_value_table = latest_value_table.round(2).reset_index(drop=True)[LAST_VALUE_TABLE_COLUMNS]
latest_value_table.rename(columns={'training_iteration': 'iters'}, inplace=True)
latest_value_table.sort_values('mean_q', ascending=False, inplace=True)

# Plot a summary table.
fig.add_trace(
    go.Table(
        header={'values': [col.replace("_", " ") for col in latest_value_table.columns]},
        cells={'values': [latest_value_table[k].tolist() for k in latest_value_table.columns]},
    ),
    row=1, col=1)

out_path = os.path.join(os.getcwd(), f'{TUNING_JOB_NAME}_hyperparameter_tuning_plot.html')
fig.write_html(out_path, full_html=False, include_plotlyjs='cdn')
print(f'Wrote {out_path}')
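
# The MaxResults=100 cap above truncates tuning jobs with more than 100 trials.
# Below is a minimal sketch of how the listing could be paginated instead, using
# the NextToken field that list_training_jobs_for_hyper_parameter_tuning_job
# returns while more results remain. The helper name is our own; it is left here
# as a reference and is not wired into the script above.
def list_all_training_jobs(tuning_job_name):
    """Return every TrainingJobSummary for the tuning job, following NextToken."""
    summaries, token = [], None
    while True:
        kwargs = {'HyperParameterTuningJobName': tuning_job_name, 'MaxResults': 100}
        if token:
            kwargs['NextToken'] = token
        response = sm_client.list_training_jobs_for_hyper_parameter_tuning_job(**kwargs)
        summaries.extend(response['TrainingJobSummaries'])
        token = response.get('NextToken')
        if not token:
            return summaries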