# Hugging Face and SageMaker: fine-tuning a seq2seq model


# Introduction

In this script, we use the [Hugging Face transformers](https://huggingface.co/docs/transformers/index) library to fine-tune a seq2seq model, Pegasus, for medical text summarization tasks. This script was developed and tested in SageMaker Studio.


## Model

[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus) is a sequence-to-sequence (encoder-decoder) model that was introduced by Google AI in 2020. It is specifically designed for abstractive text summarization tasks and has shown impressive results on various benchmark datasets.

## Setup 

The steps below prepare everything needed to fine-tune the transformer-based Pegasus model: install the dependencies, define the variables and hyperparameters, load the training and validation data from S3, and create the tokenizer, model, data collator, optimizer, accelerator and learning rate scheduler.

## Dependencies

Install the required dependencies

In [None]:
# Install the libraries imported by the cells below (s3fs is needed to read
# the s3:// dataset paths); --quiet reduces pip's output and -U upgrades
# sagemaker if it is already installed.
%pip install transformers --quiet
%pip install accelerate --quiet
%pip install datasets --quiet
%pip install -U sagemaker --quiet
%pip install s3fs --quiet

### Variables and hyperparameters

In [7]:
from datetime import datetime

# --- Locations -------------------------------------------------------------
# Pretrained checkpoint that is fine-tuned below.
model_checkpoint = 'google/pegasus-xsum'
# Placeholder: replace with the S3 bucket that holds the data and artifacts.
bucket_name = 'YOUR_BUCKET_NAME'
# Object keys (inside the bucket) of the training and evaluation CSV files.
train_data_path = 'data/train.csv'
test_data_path = 'data/test.csv'
# Key prefix for uploaded artifacts; includes today's date (YYYY-MM-DD).
artifact_path = f"training_artifacts/{datetime.today():%Y-%m-%d}/"

# --- Tokenization ----------------------------------------------------------
# Maximum token lengths for the source text and for the summary.
max_input_length = 512
max_target_length = 32
# Dataset columns holding the full text and its summary.
ds_col_full = "note"
ds_col_summ = "summary"

# --- Training --------------------------------------------------------------
batch_size = 1
num_train_epochs = 5
learning_rate = 1e-3
# Must name an optimizer class available in torch.optim, see
# https://pytorch.org/docs/stable/optim.html
optimizer_name = 'Adam'

In [8]:
import boto3
# S3 client used by the later cells to upload the tokenizer files and the
# model archive.
s3 = boto3.client('s3')

### Getting the data

In [None]:
from datasets import load_dataset

# Load both CSV files directly from S3. Note that the "validation" split is
# read from the file configured as test_data_path.
dataset = load_dataset(
    "csv",
    data_files={
        split: f's3://{bucket_name}/{key}'
        for split, key in (
            ("train", train_data_path),
            ("validation", test_data_path),
        )
    },
)

### Tokenizer
The tokenizer prepares data for the model by splitting text into tokens and mapping each token to a numerical ID.

In [10]:
from transformers import AutoTokenizer

# Tokenizer that matches the pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def preprocess_function(examples):
    """Tokenize source texts and summaries into model inputs with labels.

    Args:
        examples: A single example or a batch (as passed by
            ``Dataset.map(..., batched=True)``). The columns named by
            ``ds_col_full`` and ``ds_col_summ`` are read.

    Returns:
        The tokenizer output for the source texts, truncated/padded to
        ``max_input_length``, with an added ``"labels"`` entry holding the
        summary token ids truncated/padded to ``max_target_length``.
        Padding positions in the labels are set to -100.
    """
    model_inputs = tokenizer(
        examples[ds_col_full],
        max_length=max_input_length,
        truncation=True,
        padding='max_length'
    )
    labels = tokenizer(
        examples[ds_col_summ], max_length=max_target_length, truncation=True, padding='max_length'
    )

    # The summaries are padded to a fixed length here, so the padding must be
    # masked explicitly: -100 is the label value the loss ignores. Without
    # this the model is also trained to predict the pad token at every
    # padded position.
    pad_id = tokenizer.pad_token_id

    def mask_padding(token_ids):
        return [-100 if token_id == pad_id else token_id for token_id in token_ids]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], int):
        # Single (non-batched) example: one flat list of token ids.
        model_inputs["labels"] = mask_padding(label_ids)
    else:
        model_inputs["labels"] = [mask_padding(seq) for seq in label_ids]
    return model_inputs

In [None]:
# Tokenize every split in batches, then drop the original columns so that
# only the tokenizer outputs and labels remain.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
).remove_columns(dataset["train"].column_names)

# Have the datasets return PyTorch tensors.
tokenized_datasets.set_format("torch")

In [12]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained seq2seq model for the configured checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

### Data Collator
Pads data during batching

In [13]:
from transformers import DataCollatorForSeq2Seq

# Collator built from the tokenizer and model; it is handed to the trainer
# below to assemble batches.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

### Optimizer
The optimizer maintains training state and updates the model parameters based on the training loss.

In [14]:
# Hard-coded optimizer, kept for reference only; superseded by the next cell, which selects the optimizer by name.

#from torch.optim import Adam

#optimizer = Adam(model.parameters(), lr=learning_rate)

In [15]:
# Pick the optimizer class at runtime: look up the attribute of torch.optim
# whose name is stored in `optimizer_name`, then instantiate it over the
# model parameters with the configured learning rate.

from importlib import import_module

opt_fnc = getattr(import_module('torch.optim'), optimizer_name)
optimizer = opt_fnc(model.parameters(), lr=learning_rate)

### Accelerator
The accelerator enables distributed training

In [16]:
from accelerate import Accelerator

# Hand the model and optimizer to Accelerate; prepare() returns them in the
# order they were passed, and the prepared objects replace the originals.
accelerator = Accelerator()
model, optimizer = accelerator.prepare(
    model, optimizer
)

### Learning rate scheduler
Manages adjustments to the learning rate

In [19]:
import math

from transformers import get_scheduler

# One optimizer step is taken per batch, and a final partial batch still
# counts as a step, so round up. Plain division would give a fractional (and
# too small) step count whenever the dataset size is not a multiple of
# batch_size, making the linear schedule reach zero before training ends.
# NOTE(review): assumes a single device and no gradient accumulation, matching
# the training arguments used in this notebook - adjust if that changes.
num_update_steps_per_epoch = math.ceil(len(tokenized_datasets["train"]) / batch_size)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

# Linear decay from the initial learning rate over all training steps, with
# no warmup phase.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

## Training

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="output/",
    logging_dir="output/",
    # Schedule and batch sizes.
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and checkpoint once per epoch; keep a single checkpoint on
    # disk and reload the best one when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
    # Use generate() during evaluation.
    predict_with_generate=True,
    # Log every step (including the first) and disable progress bars.
    logging_first_step=True,
    logging_steps=1,
    disable_tqdm=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # Use the optimizer and scheduler created above instead of the defaults.
    optimizers=(optimizer, lr_scheduler),
)

trainer.train()

In [17]:
# Recover the underlying model from the trainer (undoing any wrapping applied
# by the accelerator) and write it to model_dir/, saving through
# accelerator.save.
unwrapped_model = accelerator.unwrap_model(trainer.model)
unwrapped_model.save_pretrained('model_dir', save_function=accelerator.save)

In [None]:
# Store the tokenizer files next to the model in model_dir/.
tokenizer.save_pretrained('model_dir')

In [21]:
# Upload the individual tokenizer files to S3 under the dated artifact prefix.
for tokenizer_file in (
    "special_tokens_map.json",
    "tokenizer_config.json",
    "tokenizer.json",
):
    with open("model_dir/" + tokenizer_file, "rb") as f:
        s3.upload_fileobj(f, bucket_name, artifact_path + tokenizer_file)

### Archive the model (tar.gz) and upload it to S3

In [None]:
# Pack everything in model_dir/ into model.tar.gz (paths inside the archive
# are relative to model_dir/), then move the archive to the working directory.
!cd model_dir/ && tar -czvf model.tar.gz *
!mv model_dir/model.tar.gz ./

In [23]:
# Upload the model archive to S3 under "<artifact_path>model/model.tar.gz".
with open("model.tar.gz", "rb") as f:
    s3.upload_fileobj(f, bucket_name, artifact_path + "model/model.tar.gz")