# -*- coding: utf-8 -*-
"""
Runs data processing scripts to turn the raw experimental design plan csv from
(../raw) into a fully formulated experimental design, with hyperparameters and
customizations, to pass to a training job.
"""
import click
import logging
import os
from pathlib import Path
from csv import writer

import numpy as np
import pandas as pd
from dotenv import find_dotenv, load_dotenv


@click.command()
@click.argument('input_filepath', type=click.Path(exists=True))
@click.argument('output_filepath', type=click.Path())
def main(input_filepath, output_filepath):
    """ Turns the raw experimental design csv from (../raw) into a fully formulated
        experimental design to pass to a training job (saved in ../interim).

        INSTRUCTIONS:
        * Get started designing your experiment by following the steps explained in the get_results Jupyter Notebook.
        * Explore modifications to the default design in the notebook to suit your needs.
        * Then modify the code below to match your desired experimental design, for faster iteration.
        * To generate the necessary files to start testing in SageMaker (rather than running many notebook cells), simply type "make experiment" in the terminal.
        * To execute the experiments after you have run this, see the get_results Jupyter Notebook.

        Parameters
        ----------
        input_filepath: string
            defined in the MAKEFILE, location of input files
        output_filepath: string
            defined in the MAKEFILE, location of output files

        Returns
        -------
        None: generates files
            * experimental_design.csv (contains all your planned experiments + parameters)
            * individual csv files corresponding to each run (read into the training jobs individually as you run them)
    """
    logger = logging.getLogger(__name__)
    logger.info('making experimental design for training jobs from raw design')

    # load the raw experimental design from user input
    exp_design = pd.read_csv(f'{input_filepath}/experimental_design.csv')

    # For the case of 1 GPU - num_steps selected via guess and check to get whole-numbered epochs
    num_steps = 84375
    batch_size = 32 * 1                                      # per-device batch size, for 1 GPU
    num_samples = exp_design['dataset_size'].values * 0.9    # 90% of each size is used for training
    num_steps_per_epoch = num_samples / batch_size
    # calculate the number of epochs to be passed as hyperparameters for each experiment
    num_epochs = num_steps / num_steps_per_epoch
    # print("\nTraining dataset sample sizes:", num_samples)
    # print("\nRequired epochs for different sample sizes on 1 GPU:", num_epochs)
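    # Worked example of the arithmetic above (a sketch only; the real sizes come from the raw
    # design csv - these particular values also appear in the custom runs further below):
    #   dataset_size = 600000  -> 540000 training samples -> 540000 / 32 = 16875 steps/epoch
    #                          -> 84375 / 16875 = 5 epochs
    #   dataset_size = 100000  -> 90000 / 32  = 2812.5 steps/epoch -> 84375 / 2812.5 = 30 epochs
    #   dataset_size = 1000000 -> 900000 / 32 = 28125  steps/epoch -> 84375 / 28125  = 3 epochs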
    # training-job configuration pulled from the .env file
    s3_bucket = os.getenv("BUCKET_NAME")
    dataset_name = os.getenv("HF_DATASET")
    model_name = os.getenv("HF_MODEL")
    tunable_params = os.getenv("TUNED_PARAMS")

    # insert additional columns containing info that the training jobs will require
    exp_design.insert(loc=0, column='dataset_name', value=dataset_name)
    exp_design.insert(loc=1, column='automodel_name', value=model_name)
    exp_design.insert(loc=2, column='num_parameters_tuned', value=tunable_params)     # constant for this model
    exp_design.insert(loc=3, column='s3_bucket', value=s3_bucket)                     # constant for this model
    exp_design.insert(loc=4, column='per_device_train_batch_size', value=batch_size)  # defined above
    exp_design.insert(loc=5, column='learning_rate', value=5e-5)
    exp_design.insert(loc=6, column='epochs', value=num_epochs)                       # calculated above

    # map the num_nodes column to specific factor levels for experimentation
    instance_mapper = {1: 'ml.p3.2xlarge', 2: 'ml.p3.16xlarge', 4: 'ml.p3.16xlarge'}
    gpu_mapper = {1: 1, 2: 16, 4: 32}
    parallel_enabled_mapper = {1: False, 2: True, 4: True}
    EBS_volume_mapper = {'ml.p3.2xlarge': 1024, 'ml.p3.8xlarge': 1024, 'ml.p3.16xlarge': 30}    # leave the default for 16xlarges, add more storage for the smaller instances
    price_mapper = {'ml.p3.2xlarge': 3.825, 'ml.p3.8xlarge': 14.688, 'ml.p3.16xlarge': 28.152}  # hourly instance pricing from the SageMaker website

    # keep adding extra desired info using the above mappers
    exp_design.insert(loc=7, column='instance_type', value=exp_design['num_nodes'].map(instance_mapper))
    exp_design.insert(loc=8, column='num_gpus', value=exp_design['num_nodes'].map(gpu_mapper))
    exp_design.insert(loc=9, column='global_batch_size', value=exp_design['num_gpus'] * exp_design['per_device_train_batch_size'])
    exp_design.insert(loc=10, column='num_steps', value=np.rint((exp_design['epochs'] * exp_design['dataset_size'] * 0.9) / exp_design['global_batch_size']))  # note - in the future num_steps could be held constant across experiments instead
    exp_design.insert(loc=11, column='hourly_price', value=exp_design['instance_type'].map(price_mapper))
    exp_design.insert(loc=12, column='volume_size', value=exp_design['instance_type'].map(EBS_volume_mapper))
    exp_design.insert(loc=13, column='parallel_enabled', value=exp_design['num_nodes'].map(parallel_enabled_mapper))

    # save the completed design to file
    exp_design.to_csv(f'{output_filepath}/experimental_design.csv', index_label='run_id')

    # OPTIONAL - add custom runs outside the initial design.
    # Comment out the section enclosed between the dashed lines if not desired.
    # ------------------------------------------------------------------------------------------
    # if any additional data points are desired, append extra rows to the csv with custom params
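    # Each appended row must follow the column order written by to_csv above (assuming the raw
    # design contains only the num_nodes and dataset_size columns):
    #   run_id, dataset_name, automodel_name, num_parameters_tuned, s3_bucket,
    #   per_device_train_batch_size, learning_rate, epochs, instance_type, num_gpus,
    #   global_batch_size, num_steps, hourly_price, volume_size, parallel_enabled,
    #   num_nodes, dataset_size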
    cp1 = [15, dataset_name, model_name, tunable_params, s3_bucket, 32, 5e-05, 5.0, 'ml.p3.8xlarge', 4, 128, 21094, 14.688, 1024, False, 1, 600000]
    cp2 = [16, dataset_name, model_name, tunable_params, s3_bucket, 16, 5e-05, 5.0, 'ml.p3.16xlarge', 8, 128, 10547, 28.152, 1024, True, 1, 600000]     # per-device batch size adjusted due to a CUDA error
    cp3 = [17, dataset_name, model_name, tunable_params, s3_bucket, 32, 5e-05, 30.0, 'ml.p3.16xlarge', 8, 256, 10547, 28.152, 1024, True, 1, 100000]
    cp4 = [18, dataset_name, model_name, tunable_params, s3_bucket, 32, 5e-05, 3.0, 'ml.p3.16xlarge', 8, 256, 10547, 28.152, 1024, True, 1, 1000000]
    cp5 = [19, dataset_name, model_name, tunable_params, s3_bucket, 32, 5e-05, 30.0, 'ml.p3.8xlarge', 4, 128, 21094, 14.688, 1024, False, 1, 100000]
    cp6 = [20, dataset_name, model_name, tunable_params, s3_bucket, 32, 5e-05, 3.0, 'ml.p3.8xlarge', 4, 128, 21094, 14.688, 1024, False, 1, 1000000]
    cp7 = [21, dataset_name, model_name, tunable_params, s3_bucket, 32, 2.83e-4, 30.0, 'ml.p3.16xlarge', 32, 1024, 2637, 28.152, 30, True, 4, 100000]   # increased learning rate, to compare with the lr-controlled version
    cp8 = [22, dataset_name, model_name, tunable_params, s3_bucket, 32, 2.83e-4, 3.0, 'ml.p3.16xlarge', 32, 1024, 2637, 28.152, 30, True, 4, 1000000]   # increased learning rate, to compare with the lr-controlled version
    cp9 = [23, dataset_name, model_name, tunable_params, s3_bucket, 32, 5e-05, 30.0, 'ml.p3.16xlarge', 8, 256, 10547, 28.152, 1024, False, 1, 100000]
    cp10 = [24, dataset_name, model_name, tunable_params, s3_bucket, 32, 5e-05, 3.0, 'ml.p3.16xlarge', 8, 256, 10547, 28.152, 1024, False, 1, 1000000]

    # open the existing csv in append mode and write the custom rows
    with open(f'{output_filepath}/experimental_design.csv', 'a') as f_object:
        writer_object = writer(f_object)
        writer_object.writerow(cp1)
        writer_object.writerow(cp2)
        writer_object.writerow(cp3)
        writer_object.writerow(cp4)
        writer_object.writerow(cp5)
        writer_object.writerow(cp6)
        writer_object.writerow(cp7)
        writer_object.writerow(cp8)
        writer_object.writerow(cp9)
        writer_object.writerow(cp10)
        # the with-block closes the file automatically

    # read the updated file back in; index_col keeps run_id as the index so it is not
    # duplicated when the per-run csvs are written below
    exp_design = pd.read_csv(f'{output_filepath}/experimental_design.csv', index_col='run_id')
    # ------------------------------------------------------------------------------------------
    # ------------------------------------------------------------------------------------------

    # export the desired experiments into individual csvs
    for ix, _ in exp_design.iterrows():
        exp_design.loc[exp_design.index == ix].to_csv(f'{output_filepath}/run{ix}_experimental_design.csv', index_label='run_id')

    # to execute the experiments - follow the instructions in the get_results notebook


if __name__ == '__main__':
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    # not used in this stub but often useful for finding various files
    project_dir = Path(__file__).resolve().parents[2]

    # find .env automagically by walking up directories until it's found, then
    # load the .env entries as environment variables
    load_dotenv(find_dotenv())

    main()
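
# ------------------------------------------------------------------------------------------
# Illustrative only (not executed here): a minimal sketch of how a training job could read
# one of the per-run csvs generated above and unpack its hyperparameters. The file path is
# an assumption - pass whichever run you want to launch (see the get_results notebook).
#
#   import pandas as pd
#   run = pd.read_csv('data/interim/run0_experimental_design.csv', index_col='run_id').iloc[0]
#   hyperparameters = {
#       'epochs': float(run['epochs']),
#       'learning_rate': float(run['learning_rate']),
#       'per_device_train_batch_size': int(run['per_device_train_batch_size']),
#   }
# ------------------------------------------------------------------------------------------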