# -*- coding: utf-8 -*-
"""
Same code that is referenced in make_s3_upload.py (see the docstring there),
only callable from a Jupyter Notebook rather than the Makefile.
"""

# import python libraries
import pandas as pd

# import aws/sagemaker resources
import boto3
import sagemaker
import sagemaker.huggingface
import os
# from sagemaker import get_execution_role  # uncomment if in SageMaker Notebooks

# import huggingface resources
import datasets
from datasets import load_dataset
from datasets.filesystems import S3FileSystem
from transformers import AutoTokenizer

# Read experimental design
exp_design = pd.read_csv('../data/interim/experimental_design.csv')

# Configure AWS Resources
# S3 bucket and connection
s3 = S3FileSystem()
bucket = exp_design['s3_bucket'].unique()[0]  # should only have one bucket configured per experiment

# dataset pulled from the Hugging Face Hub
dataset_name = exp_design['dataset_name'].unique()[0]  # should only have one dataset configured per experiment

# Define IAM role
iam_client = boto3.client('iam')
role_name = os.getenv("AWS_ROLE")
role = iam_client.get_role(RoleName=role_name)['Role']['Arn']
sess = sagemaker.Session(default_bucket=f"{bucket}")
metrics_session = sagemaker.session.Session()  # used to get metrics after training

# if in SageMaker Notebooks, comment out role_name and role above and replace with:
# role = get_execution_role()

# assign region for the session
region = boto3.Session().region_name

print("Configuration settings:")
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

# Configure File I/O for Experiment
# s3 key prefixes for the data
input_prefix = f'datasets/{dataset_name}'
output_prefix = f"output/{dataset_name}"

# output path for training job results
output_path = f's3://{sess.default_bucket()}/{output_prefix}'

# capture order of operations for the experiment
size_run_order = exp_design['dataset_size']
node_run_order = exp_design['num_nodes']
epochs_run_order = exp_design['epochs']
batch_run_order = exp_design['per_device_train_batch_size']
num_runs = len(node_run_order)

print("Settings for this experiment:")
display(exp_design)  # display() is provided by Jupyter/IPython

# set dataset sizes of interest
dataset_sizes = size_run_order.unique()  # includes training and val obs to be loaded
print("Dataset sizes:", dataset_sizes)

# initialize dataset lookups
train_dataset_lookup = dict()
val_dataset_lookup = dict()

# set tokenizer used in preprocessing
tokenizer_name = exp_design['automodel_name'].unique()[0]  # should only have one automodel_name configured per experiment
checkpoint = tokenizer_name  # ensures checkpoint and tokenizer are the same, required for an AutoModel

# download tokenizer
print("Downloading tokenizer.")
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)


def tokenize(batch):
    """Tokenize the 'content' column, padding/truncating to the model's max length."""
    return tokenizer(batch['content'], padding='max_length', truncation=True)


# split and tokenize datasets; assign their s3 paths to dictionaries for lookup later
print("Splitting and tokenizing data.")
for num_obs in dataset_sizes:
    train_split = datasets.load_dataset(
        dataset_name,
        split=datasets.ReadInstruction('train', from_=0, to=num_obs, unit='abs'))

    # split loaded training data into 90% training and 10% val sets
    train_val_dataset = train_split.train_test_split(test_size=0.1, seed=324)

    # tokenize splits
    train_dataset = train_val_dataset['train'].map(tokenize, batched=True)
    val_dataset = train_val_dataset['test'].map(tokenize, batched=True)

    # set dataset format for pytorch
    train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    val_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

    print("Uploading data to S3 for size: {}".format(num_obs))

    # save train_dataset to s3
    training_input_path = f's3://{sess.default_bucket()}/{input_prefix}/{num_obs}/train'
    train_dataset.save_to_disk(training_input_path, fs=s3)

    # save val_dataset to s3
    validation_input_path = f's3://{sess.default_bucket()}/{input_prefix}/{num_obs}/val'
    val_dataset.save_to_disk(validation_input_path, fs=s3)

    # assign s3 paths of the tokenized splits to the lookup dicts for each dataset size
    train_dataset_lookup[num_obs] = training_input_path
    val_dataset_lookup[num_obs] = validation_input_path

    print("Training/Val dataset paths for dataset size {}:".format(num_obs))
    print(f"{training_input_path}")
    print(f"{validation_input_path}")

# load and tokenize the test dataset, which is constant across all runs
test_dataset = load_dataset(dataset_name, split='test')
test_dataset = test_dataset.map(tokenize, batched=True)
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])  # set format for pytorch

# save test_dataset to s3
test_input_path = f's3://{sess.default_bucket()}/{input_prefix}/test'
print("Test dataset path:")
print(f"{test_input_path}")
test_dataset.save_to_disk(test_input_path, fs=s3)
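# A minimal sketch of how downstream training code might reload these tokenized
# splits from the recorded S3 paths (this assumes the same older `datasets`
# release whose save_to_disk/load_from_disk accept the `fs` argument used above):
#
#     from datasets import load_from_disk
#     train_data = load_from_disk(train_dataset_lookup[num_obs], fs=s3)
#     val_data = load_from_disk(val_dataset_lookup[num_obs], fs=s3)
#     test_data = load_from_disk(test_input_path, fs=s3)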