# Train reward model with human feedback

The reward model is trained on a human-labeled dataset with the preferred `star_rating` for a given review.  The model flattens the human-labeled data from Ground Truth into (review, star_rating, ranking) tuples and provides a reward score for the RL-based model fine-tuning.

![Pipeline](img/generative_ai_pipeline_rlhf_plus.png)

![RLHF](img/rlhf_summarization.png)

![Convert human ranking data into reward dataset](img/convert_groundtruth_ranking_data_to_reward_model_dataset_summarization.png)

In [2]:
import psutil

notebook_memory = psutil.virtual_memory()
print(notebook_memory)

if notebook_memory.total < 32 * 1000 * 1000 * 1000:
    print('*******************************************')    
    print('YOU ARE NOT USING THE CORRECT INSTANCE TYPE')
    print('PLEASE CHANGE INSTANCE TYPE TO  m5.2xlarge ')
    print('*******************************************')
else:
    correct_instance_type=True

svmem(total=32877658112, available=8898641920, percent=72.9, used=23516119040, free=3900121088, active=24321519616, inactive=3359760384, buffers=0, cached=5461417984, shared=1404928, slab=504270848)


In [3]:
# %pip install --disable-pip-version-check -q \
#     transformers==4.26.1 \
#     datasets==2.9.0 \
#     accelerate==0.17.0 \
#     bitsandbytes==0.37.0 \
#     promptsource==0.2.3 \
#     trl==0.4.1 \
#     evaluate==0.4.0

In [4]:
import boto3
import sagemaker
import pandas as pd

sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

In [5]:
import io
import json
import uuid
import time
import boto3
import botocore

# Amazon Python SDK clients
sagemaker = boto3.client("sagemaker", region)
a2i = boto3.client("sagemaker-a2i-runtime")
s3 = boto3.client("s3", region)

In [22]:
import os
import glob
import numpy as np
import argparse
import pprint
from collections import defaultdict

import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
import torch.utils.data.distributed
from torch.utils.data import Dataset, DataLoader

from transformers import AutoConfig, AutoModelForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

In [23]:
%store -r human_feedback_dataset

In [24]:
try:
    human_feedback_dataset
except NameError:
    print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] Please run the notebooks in the previous section before you continue.")
    print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

In [25]:
print(human_feedback_dataset)

Dataset({
    features: ['prompt', 'response', 'reward'],
    num_rows: 2
})


# Train a reward model with human preference and alignment data
This is typically a language model initialized from the supervised-fine-tuned (SFT) model (trained in a previous notebook), but with an additional binary-classification layer placed on top.  This reward model is used to train the reinforcement-learning model in the next step.  The reinforcement-learning model is what is deployed into production to serve applications.

In [26]:
# %store -r supervised_fine_tuned_model_path

In [27]:
# try:
#     supervised_fine_tuned_model_path
# except NameError:
#     print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
#     print("[ERROR] Please run the notebooks in the previous section before you continue.")
#     print("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

In [28]:
# TODO:  switch back to the stored `supervised_fine_tuned_model_path` variable
supervised_fine_tuned_model_path = 'google/flan-t5-base'

In [29]:
print(supervised_fine_tuned_model_path)

google/flan-t5-base


In [30]:
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Union

import evaluate
import numpy as np
import torch.nn as nn
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    HfArgumentParser,
    PreTrainedTokenizerBase,
    Trainer,
    TrainingArguments,
)
from transformers.utils import PaddingStrategy

In [31]:
tokenizer = AutoTokenizer.from_pretrained(supervised_fine_tuned_model_path)

In [32]:
# Turn the dataset into pairs of prompt + responses, where text_j is the preferred prompt + response and text_k is the other.
def turn_into_text_classification_format(examples):
    new_examples = {"text_j": [], "text_k": []}
    for prompt, response, reward in zip(examples["prompt"], examples["response"], examples["reward"]):
        # TODO:  Add a check to make sure there is only a single 0 and a single 1
        if len(response) != 2 or len(reward) != 2 or reward[0] not in (0, 1) or reward[1] not in (0, 1):
            raise ValueError(
                f"There should be two responses with a reward that is either 0 or 1. Received {len(response)} responses and {len(reward)} rewards."
            )
            
        reward_response_index = reward.index(1)

        new_examples["text_j"].append(
            #str(response[highest_ranked_response_index]) + " " + tokenizer.bos_token + " " + prompt
            prompt + " " + str(response[reward_response_index])
        )
        new_examples["text_k"].append(
            #str(response[0 if highest_ranked_response_index == 1 else 1]) + " " + tokenizer.bos_token + " " + prompt
            prompt + " " + str(response[0 if reward_response_index == 1 else 1])
        )

    return new_examples

# Tokenize the dataset.
def preprocess_function(examples):
    tokenized_j = tokenizer(examples["text_j"], truncation=True)
    tokenized_k = tokenizer(examples["text_k"], truncation=True)
    return {
        "input_ids_j": tokenized_j["input_ids"],
        "attention_mask_j": tokenized_j["attention_mask"],
        "input_ids_k": tokenized_k["input_ids"],
        "attention_mask_k": tokenized_k["attention_mask"],
    }

In [33]:
num_proc = 8  # Can adjust to be higher if you have more processors
original_columns = human_feedback_dataset.column_names
print(original_columns)

human_feedback_binary_classification_dataset = human_feedback_dataset.map(turn_into_text_classification_format, batched=True, num_proc=num_proc, remove_columns=original_columns)

human_feedback_tokenized_dataset = human_feedback_binary_classification_dataset.map(preprocess_function, 
                                                                                    batched=True, 
                                                                                    num_proc=num_proc, 
                                                                                    remove_columns=["text_j", "text_k"])

print(human_feedback_tokenized_dataset)

num_proc must be <= 2. Reducing num_proc to 2 for dataset of size 2.


['prompt', 'response', 'reward']
    

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

num_proc must be <= 2. Reducing num_proc to 2 for dataset of size 2.


    

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['input_ids_j', 'attention_mask_j', 'input_ids_k', 'attention_mask_k'],
    num_rows: 2
})


In [34]:
# Define the metric that we'll use for validation.
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, _ = eval_pred
    # Here, predictions is rewards_j and rewards_k.
    # We want to see how much of the time rewards_j > rewards_k.
    predictions = np.argmax(predictions, axis=0)
    labels = np.zeros(predictions.shape)
    return accuracy.compute(predictions=predictions, references=labels)

In [35]:
# We need to define a special data collator that batches the data in our j vs k format.
@dataclass
class RewardDataCollatorWithPadding:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    return_tensors: str = "pt"

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        features_j = []
        features_k = []
        for feature in features:
            features_j.append({"input_ids": feature["input_ids_j"], "attention_mask": feature["attention_mask_j"]})
            features_k.append({"input_ids": feature["input_ids_k"], "attention_mask": feature["attention_mask_k"]})
        batch_j = self.tokenizer.pad(
            features_j,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=self.return_tensors,
        )
        batch_k = self.tokenizer.pad(
            features_k,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=self.return_tensors,
        )
        batch = {
            "input_ids_j": batch_j["input_ids"],
            "attention_mask_j": batch_j["attention_mask"],
            "input_ids_k": batch_k["input_ids"],
            "attention_mask_k": batch_k["attention_mask"],
            "return_loss": True,
        }
        return batch

In [42]:
# We are using RoBERTa because it's the basis of a good, lightweight reward classifier model.  
# For more info, see Practical Data Science on AWS from DeepLearning.ai
ranking_reward_custom_model_name = 'roberta-base'
ranking_reward_custom_model = AutoModelForSequenceClassification.from_pretrained(ranking_reward_custom_model_name, num_labels=1)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.

In [45]:
class RewardTrainer(Trainer):
    # Define how to compute the reward loss.
    def compute_loss(self, model, inputs, return_outputs=False):
        rewards_j = model(input_ids=inputs["input_ids_j"], attention_mask=inputs["attention_mask_j"])[0]
        rewards_k = model(input_ids=inputs["input_ids_k"], attention_mask=inputs["attention_mask_k"])[0]
        loss = -nn.functional.logsigmoid(rewards_j - rewards_k).mean()
        if return_outputs:
            return loss, {"rewards_j": rewards_j, "rewards_k": rewards_k}
        return loss

# Define and parse arguments.
local_rank = 0
resume_from_checkpoint = False
deepspeed = None
per_device_train_batch_size = 16
per_device_eval_batch_size = 16
gradient_accumulation_steps = 4
learning_rate = 2e-5
weight_decay = 0.001
bf16 = False
num_train_epochs = 1

ranking_reward_model_custom_checkpoint = './ranking_reward_model_custom/'

# Define the training args. Needs to be done before the model is loaded if you are using deepspeed.
training_args = TrainingArguments(
    output_dir=ranking_reward_model_custom_checkpoint,
    learning_rate=learning_rate,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    num_train_epochs=num_train_epochs,
    weight_decay=weight_decay,
#    evaluation_strategy="epoch",
    save_strategy="epoch",
    gradient_accumulation_steps=gradient_accumulation_steps,
#    deepspeed=deepspeed,
#    local_rank=local_rank,
    remove_unused_columns=False,
    label_names=[],
)
    
# Train the model, woohoo.
trainer = RewardTrainer(
    model=ranking_reward_custom_model,
    args=training_args,
    train_dataset=human_feedback_tokenized_dataset, #["train"],
#    eval_dataset=tokenized_ds["validation"],
    compute_metrics=compute_metrics,
    data_collator=RewardDataCollatorWithPadding(tokenizer=tokenizer),
)

trainer.train(resume_from_checkpoint)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss


TrainOutput(global_step=1, training_loss=0.18226858973503113, metrics={'train_runtime': 9.6516, 'train_samples_per_second': 0.207, 'train_steps_per_second': 0.104, 'total_flos': 0.0, 'train_loss': 0.18226858973503113, 'epoch': 1.0})

In [46]:
trainer.save_model(ranking_reward_model_custom_checkpoint)
tokenizer.save_pretrained(ranking_reward_model_custom_checkpoint)

('./ranking_reward_model_custom/tokenizer_config.json',
 './ranking_reward_model_custom/special_tokens_map.json',
 './ranking_reward_model_custom/tokenizer.json')

In [47]:
%store ranking_reward_model_custom_checkpoint

Stored 'ranking_reward_model_custom_checkpoint' (str)


In [None]:
# from transformers import TextClassificationPipeline
# from transformers import pipeline

# tokenizer = AutoTokenizer.from_pretrained(rl_ranking_reward_custom_dataset_model_checkpoint)
# ranking_reward_model_custom_model = AutoModelForSequenceClassification.from_pretrained(ranking_reward_model_custom_checkpoint, num_labels=1)

In [None]:
# conversation = '<TODO>'
# summary = '<TODO>'

# # run the conversation + summary pair though the reward classifier
# #prompt_and_response_pair_classification = rl_ranking_reward_custom_dataset_model ...


In [None]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>