In [None]:
### Kernel and SageMaker Setup
Please use the ml.t3.medium instance for this notebook. The Kernel is 'Data Science - Python3'.

# Text Summarization of Consumer Health Questions
## Part 2: Fine-tuning Flan-T5 via the SageMaker SDK
In the previous notebook we fine-tuned a model on the MeQSum dataset on a local notebook instance. In this notebook we will learn how to use the SageMaker SDK to spin up training instances for fine-tuning the Flan-T5-base model on a medical question summarization task.
### MeQSum Dataset
"On the Summarization of Consumer Health Questions". Asma Ben Abacha and Dina Demner-Fushman. In Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, ACL 2019.
#### Citation Information
@Inproceedings{MeQSum,
author = {Asma {Ben Abacha} and Dina Demner-Fushman},
title = {On the Summarization of Consumer Health Questions},
booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, ACL 2019, Florence, Italy, July 28th - August 2},
year = {2019},
abstract = {Question understanding is one of the main challenges in question answering. In real world applications, users often submit natural language questions that are longer than needed and include peripheral information that increases the complexity of the question, leading to substantially more false positives in answer retrieval. In this paper, we study neural abstractive models for medical question summarization. We introduce the MeQSum corpus of 1,000 summarized consumer health questions. We explore data augmentation methods and evaluate state-of-the-art neural abstractive models on this new task. In particular, we show that semantic augmentation from question datasets improves the overall performance, and that pointer-generator networks outperform sequence-to-sequence attentional models on this task, with a ROUGE-1 score of 44.16%. We also present a detailed error analysis and discuss directions for improvement that are specific to question summarization. }}




In [None]:
!pip -q install transformers==4.28.0 datasets==2.12.0 sagemaker==2.156.0 --upgrade

## 1. Data Preparation

In [None]:
import pandas as pd

# Dataset from https://github.com/abachaa/MeQSum
# Load the spreadsheet, drop the 'File' column, rename 'CHQ' to 'Text' and discard
# incomplete rows. The index is reset after dropna() so the frame keeps a contiguous
# RangeIndex even when rows are dropped; otherwise Dataset.from_pandas would store
# the leftover index as an extra column.
df = (
    pd.read_excel('MeQSum_ACL2019_BenAbacha_Demner-Fushman.xlsx')
    .drop(columns='File')
    .rename(columns={'CHQ': 'Text'})
    .dropna()
    .reset_index(drop=True)
)

# Lower-case both text columns with the vectorised string accessor.
df['Text'] = df['Text'].str.lower()
df['Summary'] = df['Summary'].str.lower()

# Sequential row id, then fix the column order.
df['Id'] = range(len(df))
df = df[['Id', 'Text', 'Summary']]
# df = df.sample(frac=1).reset_index(drop=True) # to shuffle
df

In [None]:
# Import libraries required for modelling & create a SageMaker Session

import datasets
from datasets import Dataset
from datasets import load_metric
from datasets import concatenate_datasets
from datasets.filesystems import S3FileSystem

import transformers
from transformers import AutoTokenizer

import sagemaker
from sagemaker.huggingface import HuggingFace

# SageMaker session; a later cell calls sess.default_bucket() on it to pick the S3 bucket.
sess = sagemaker.Session()

In [None]:
# Checkpoint name used both to load the tokenizer here and, later, as the
# "model-name" hyperparameter of the training job.
model_checkpoint = 'google/flan-t5-base' # 'google/flan-t5-small' for quick training.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Similar to any ML model, we split the data into train, validation and test sets.
# The split is positional (the rows are not shuffled). Each split is copied so it is
# an independent DataFrame: adding a column to a split later (e.g. the predicted
# summaries on `test`) then does not raise pandas' SettingWithCopyWarning.

train = df.iloc[:700].copy()
val = df.iloc[700:900].copy()
test = df.iloc[900:].copy()
print(f'train: {train.shape}, val: {val.shape}, test: {test.shape}')

In [None]:
# Wrap each pandas split in a Hugging Face `Dataset` object (train, val, test order).

train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(split) for split in (train, val, test)
)

In [None]:
# Determine the max input length and max target length from the tokenized sequence
# lengths over all three splits. truncation=True is passed without max_length, so the
# tokenizer's own maximum length is the upper bound on both values.

full_dataset = concatenate_datasets([train_dataset, val_dataset, test_dataset])

# Measure the inputs with the same "summarize: " prefix that preprocess_function
# prepends; measuring the bare text would under-count by the prefix tokens and cause
# the longest examples to be truncated during preprocessing.
tokenized_inputs = full_dataset.map(
    lambda x: tokenizer(["summarize: " + text for text in x["Text"]], truncation=True),
    batched=True,
    remove_columns=["Text", "Summary"],
)
max_input_length = max(len(ids) for ids in tokenized_inputs["input_ids"])
print(f"Max input length: {max_input_length}")

tokenized_targets = full_dataset.map(
    lambda x: tokenizer(x["Summary"], truncation=True),
    batched=True,
    remove_columns=["Text", "Summary"],
)
max_target_length = max(len(ids) for ids in tokenized_targets["input_ids"])
print(f"Max target length: {max_target_length}")

#### Create a function to tokenise inputs to the model & ensure vectors are the same length

In [None]:
def preprocess_function(sample, padding="max_length"):
    """Tokenize a batch of examples into model inputs and labels.

    Args:
        sample: Batch providing "Text" and "Summary" entries, each a sequence of strings.
        padding: Padding strategy forwarded to the tokenizer for inputs and labels.

    Returns:
        The tokenizer output for the "summarize: "-prefixed texts, with an added
        "labels" entry holding the token ids of the summaries.
    """
    prefixed_texts = ["summarize: " + text for text in sample["Text"]]
    model_inputs = tokenizer(
        prefixed_texts, max_length=max_input_length, padding=padding, truncation=True
    )

    tokenized_summaries = tokenizer(
        text_target=sample["Summary"], max_length=max_target_length, padding=padding, truncation=True
    )
    label_ids = tokenized_summaries["input_ids"]

    if padding == "max_length":
        # Swap pad token ids for -100 in the labels; presumably so padded positions
        # are ignored by the training loss (confirm against train.py).
        pad_id = tokenizer.pad_token_id
        label_ids = [
            [-100 if token_id == pad_id else token_id for token_id in sequence]
            for sequence in label_ids
        ]

    model_inputs["labels"] = label_ids
    return model_inputs

In [None]:
# Apply the tokenisation function to the train and validation splits.
# batched=True hands preprocess_function batches of rows, which it iterates over.
# NOTE(review): no remove_columns is passed, so the original columns are kept next to
# the tokenized ones — confirm this is what train.py expects.

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)

# Show the feature schema of the tokenized training set.
print(f"Keys of tokenized dataset: {tokenized_train.features}")

#### We will upload the data to S3, which integrates natively with Amazon SageMaker

In [None]:
# Upload the tokenized train/validation datasets to S3 and define the job locations.
s3 = S3FileSystem()
bucket = sess.default_bucket()

# Key prefix for the datasets, plus the job name and checkpoint folder name.
s3_prefix = "huggingface/meqsum-flan-t5-summarization"
base_job_name = "huggingface-meqsum-flan-t5-summarization"
checkpoint_in_bucket = "checkpoints"

# The S3 URI to store the checkpoints
checkpoint_s3_bucket = f"s3://{bucket}/{base_job_name}/{checkpoint_in_bucket}"

# The local path where the model will save its checkpoints in the training container
checkpoint_local_path = "/opt/ml/checkpoints"

# S3 locations of the dataset root and of each split.
dataset_input_path = f"s3://{bucket}/{s3_prefix}"
train_input_path = f"{dataset_input_path}/train"
valid_input_path = f"{dataset_input_path}/validation"

# Echo every location, one per line.
print(
    dataset_input_path,
    train_input_path,
    valid_input_path,
    checkpoint_s3_bucket,
    sep="\n",
)

# Write both tokenized splits to their S3 locations.
tokenized_train.save_to_disk(train_input_path, fs=s3)
tokenized_val.save_to_disk(valid_input_path, fs=s3)

## 2. Training the model using SageMaker Training

#### Define the objects & parameters for model training


In [None]:
# Define hyperparameters; this dict is handed to the estimator's `hyperparameters` argument.
# NOTE(review): 'output_dir' uses an underscore while the other keys use hyphens —
# confirm this matches the argument names that train.py parses.

hyperparameters = {
 "epochs": 10,
 "learning-rate": 2e-5,
 "train-batch-size": 4,
 "eval-batch-size": 4,
 "model-name": model_checkpoint,  # same checkpoint the tokenizer was loaded from
 'output_dir': checkpoint_local_path  # same path given to the estimator as checkpoint_local_path
}

In [None]:
# Define the metrics that are extracted from the training logs to measure model performance.
# Each regex captures (in its first group) a number written as an integer, a decimal,
# or in scientific notation, e.g. 3, 0.75 or 1.9e-05. Raw strings are used so the
# backslashes reach the regex engine unchanged, and the decimal point is escaped so it
# no longer matches arbitrary characters.

metric_definitions = [
    {'Name': 'loss', 'Regex': r"'loss': ([0-9]+(\.[0-9]+)?(e[-+]?[0-9]+)?)"},
    {'Name': 'learning_rate', 'Regex': r"'learning_rate': ([0-9]+(\.[0-9]+)?(e[-+]?[0-9]+)?)"},
    {'Name': 'eval_loss', 'Regex': r"'eval_loss': ([0-9]+(\.[0-9]+)?(e[-+]?[0-9]+)?)"},
    {'Name': 'epoch', 'Regex': r"'epoch': ([0-9]+(\.[0-9]+)?(e[-+]?[0-9]+)?)"},
]

In [None]:
# Configure the Hugging Face estimator: the training script, hyperparameters, checkpoint
# locations, framework versions, metric regexes and the training instance type/count.
# The dataset locations themselves are passed later, in the call to fit().
# NOTE(review): the container uses transformers 4.26.0 while this notebook installs
# transformers==4.28.0 locally — confirm the two are compatible for this workflow.

huggingface_estimator = HuggingFace(
 role=sagemaker.get_execution_role(),
 entry_point="train.py",  # training script (not shown in this notebook)
 dependencies=["requirements.txt"],
 hyperparameters=hyperparameters,
 base_job_name=base_job_name,
 checkpoint_s3_uri=checkpoint_s3_bucket,  # S3 URI to store the checkpoints
 checkpoint_local_path=checkpoint_local_path,  # checkpoint path inside the training container
 transformers_version="4.26.0",
 pytorch_version="1.13.1",
 py_version="py39",
 instance_type="ml.p3.2xlarge",
 instance_count=1,
 metric_definitions=metric_definitions
 # distribution={"smdistributed": {"dataparallel": {"enabled": True}}}, # For distributed training.
)

In [None]:
# Start training, passing the S3 locations of the tokenized train and validation sets.
# The "train"/"valid" keys are presumably the input names train.py reads — confirm against train.py.
huggingface_estimator.fit({"train": train_input_path, "valid": valid_input_path})

In [None]:
# Display the estimator's `model_data` attribute (presumably the location of the trained model artifacts).
huggingface_estimator.model_data

## 3. Perform inferencing on the test dataset with a SageMaker endpoint

In [None]:
# Deploy the model endpoint on a single ml.p3.2xlarge instance.
# The returned predictor is used below for inference and for cleanup.

huggingface_predictor = huggingface_estimator.deploy(
 initial_instance_count=1, instance_type="ml.p3.2xlarge"
)

In [None]:
# Obtain the model predictions: query the endpoint once per test example, using the
# same "summarize: " prefix applied during preprocessing, and keep the generated text.

predictions = [
    huggingface_predictor.predict({"inputs": f"summarize: {text}"})[0]['generated_text']
    for text in test_dataset["Text"]
]

#### The `Predicted Summary` column is the model's output

In [None]:
# Model summarisation performance can be inspected by evaluation metrics and spot checks.
# `assign` builds a new DataFrame instead of writing a column into `test` in place, so
# it does not trigger pandas' SettingWithCopyWarning when `test` is a slice of `df`,
# and the cell can be re-run safely.

test = test.assign(**{'Predicted Summary': predictions})
pd.set_option('display.max_colwidth', 1024)
test

## 4. Clean up of resources so they aren't left running

In [None]:
# Remember to delete your endpoint after use as you will be charged for the instances it uses
# Both the model and the endpoint created by deploy() are removed here.

huggingface_predictor.delete_model()
huggingface_predictor.delete_endpoint()