In [None]:
%pip install torch==1.13.1 torchdata

In [1]:
%pip install --disable-pip-version-check -q \
    transformers==4.27.2 \
    datasets==2.9.0 \
    accelerate==0.17.0 \
    evaluate==0.4.0 \
    trl==0.4.1 \
    rouge_score==0.1.2 \
    loralib==0.1.1

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
#!pip install git+https://github.com/huggingface/peft.git

In [3]:
!pip install git+https://github.com/lvwerra/trl.git

Collecting git+https://github.com/lvwerra/trl.git
  Cloning https://github.com/lvwerra/trl.git to /tmp/pip-req-build-cpoujzci
  Running command git clone --filter=blob:none --quiet https://github.com/lvwerra/trl.git /tmp/pip-req-build-cpoujzci
  Resolved https://github.com/lvwerra/trl.git to commit ce37eadcfa22f2a3c25422411a586b8f593e3e6e
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: trl
  Building wheel for trl (setup.py) ... [?25ldone
[?25h  Created wheel for trl: filename=trl-0.4.2.dev0-py3-none-any.whl size=54216 sha256=01482a1b7edbf841545e9be34a23a25646b053744c4d2bf27bb772a41e059d2e
  Stored in directory: /tmp/pip-ephem-wheel-cache-w6prql5u/wheels/ca/6e/f4/b183ecbed483efdcd2041a8021ce7bcb9f7b09c74bff5bb00a
Successfully built trl
Installing collected packages: trl
  Attempting uninstall: trl
    Found existing installation: trl 0.4.1
    Uninstalling trl-0.4.1:
      Successfully uninstalled trl-0.4.1
Successfully installed trl-0.4.2.dev0


In [4]:
# Restore the reward-model checkpoint path that was persisted with `%store`
# (presumably by the notebook that trained the reward model — verify).
%store -r ranking_reward_model_custom_checkpoint

In [5]:
# Show the restored path; this name only exists if the `%store -r` cell above succeeded.
print(ranking_reward_model_custom_checkpoint)

./ranking_reward_model_custom/


In [6]:
# %store -r supervised_fine_tuned_model_path

In [7]:
# print(supervised_fine_tuned_model_path)

## Load the Dataset

In [8]:
from transformers import  AutoTokenizer
from datasets import load_dataset

# Tokenizer for the FLAN-T5 base checkpoint.
tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-base')
# Load the dataset from a local directory; the cell output shows a single `train`
# split with `id`, `dialogue`, `summary` and `topic` columns.
dataset = load_dataset('./data-summarization/')
# Bare last expression so the notebook displays the DatasetDict summary.
dataset

Using custom data configuration data-summarization-46d1b2508a766ab7
Found cached dataset csv (/root/.cache/huggingface/datasets/csv/data-summarization-46d1b2508a766ab7/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'topic'],
        num_rows: 14460
    })
})

## Explore an Example Prompt

In [9]:
# Pick one training example and print the exact prompt built from it next to the
# human-written reference summary.
idx = 0
first_example = dataset['train'][idx]  # fetch the row once, then read both columns from it
diag, baseline_human_summary = first_example['dialogue'], first_example['summary']

prompt = f'Summarize the following conversation.\n\n{diag}\n\nSummary:'
# Token ids of the prompt as a PyTorch tensor (not used by the prints below).
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

print(f'Prompt:\n--------------------------\n{prompt}\n--------------------------')
print(f'Baseline human summary : {baseline_human_summary}')

Prompt:
--------------------------
Summarize the following conversation.

#Person1#: If we employ you, what starting salary would you expect?
#Person2#: I'd like to start at 3000 yuan a month.
#Person1#: I think your background and experience are worth the compensation.
#Person2#: Does it include bonuses?
#Person1#: No, there are annual bonuses, one week paid vacation a year, and health insurance.
#Person2#: Very good.

Summary:
--------------------------
Baseline human summary : #Person1# agrees #Person2#'s starting monthly salary would be 3000 yuan and tells #Person2# about other benefits.


## Tokenize the Dataset

In [28]:
def tokenize_function(example):
    """Attach the summarization prompt and its token ids to one dataset row.

    Reads ``example["dialogue"]`` and ``example["summary"]`` and writes three keys
    onto the same mapping:

    * ``query``     - the dialogue wrapped in the instruction prefix and the
      ``Summary:`` suffix,
    * ``input_ids`` - token ids of ``query``,
    * ``labels``    - token ids of the reference summary.

    Both encodings are padded to ``max_length``, truncated, and requested as
    PyTorch tensors. The training loop later reads ``input_ids[0]``, so the
    stored value is presumably a single-row 2-D structure (TODO confirm).

    Returns the same ``example`` object that was passed in.
    """
    def _encode(text):
        # Uses the module-level `tokenizer`; identical settings for prompt and summary.
        return tokenizer(text, padding="max_length", truncation=True, return_tensors="pt").input_ids

    example['query'] = 'Summarize the following conversation.\n\n' + example["dialogue"] + '\n\nSummary: '
    example['input_ids'] = _encode(example['query'])
    example['labels'] = _encode(example["summary"])
    return example

# Tokenize every row (one example per call), then drop the raw input columns so only
# the keys added by tokenize_function remain.
tokenized_dataset = (
    dataset
    .map(tokenize_function)  # per-example mapping; batched=True is not used
    .remove_columns(['id', 'topic', 'dialogue', 'summary'])
)

# Optional steps left disabled by the author:
# tokenized_dataset = tokenized_dataset.filter(lambda x: len(x["input_ids"]) < 512, batched=False)
# tokenized_dataset.set_format(type="torch")

# Display the first prompt as a sanity check.
tokenized_dataset['train'][0]['query']

  0%|          | 0/14460 [00:00<?, ?ex/s]

"Summarize the following conversation.\n\n#Person1#: If we employ you, what starting salary would you expect?\n#Person2#: I'd like to start at 3000 yuan a month.\n#Person1#: I think your background and experience are worth the compensation.\n#Person2#: Does it include bonuses?\n#Person1#: No, there are annual bonuses, one week paid vacation a year, and health insurance.\n#Person2#: Very good.\n\nSummary: "

In [29]:
from dataclasses import dataclass, field
from typing import Optional

import torch
from accelerate import Accelerator
from datasets import load_dataset
#from peft import LoraConfig
from tqdm import tqdm
from transformers import Adafactor, AutoTokenizer, HfArgumentParser, pipeline

from trl import AutoModelForSeq2SeqLMWithValueHead, PPOConfig, PPOTrainer, set_seed
from trl.core import LengthSampler

# Register tqdm's pandas integration; no pandas call is visible in this part of the notebook.
tqdm.pandas()

# Directory the training loop writes the PPO-tuned policy and tokenizer to; also the
# default for ScriptArguments.output_dir.
fine_tuned_with_ranking_rewards = './fine_tuned_with_ranking_rewards'

@dataclass
class ScriptArguments:
    """Settings for the PPO fine-tuning run, parsed by ``HfArgumentParser``.

    Every field has a default, so an instance can be built without any
    command-line arguments. The ``help`` metadata is the text shown for each
    option by the argument parser.
    """

    # Policy checkpoint and its tokenizer.
    model_name: Optional[str] = field(default="google/flan-t5-base", metadata={"help": "the model name"})
    tokenizer_name: Optional[str] = field(default="google/flan-t5-base", metadata={"help": "the tokenizer name"})
    # Alternative: start from the supervised fine-tuned checkpoint instead.
    # model_name: Optional[str] = field(default=supervised_fine_tuned_model_path, metadata={"help": "the model name"})
    # tokenizer_name: Optional[str] = field(default=supervised_fine_tuned_model_path, metadata={"help": "the tokenizer name"})

    # Reward model checkpoint; the default is the path restored via `%store -r`.
    reward_model_name: Optional[str] = field(default=ranking_reward_model_custom_checkpoint, metadata={"help": "the reward model name"})
    log_with: Optional[str] = field(default=None, metadata={"help": "use 'wandb' to log with wandb"})

    # Optimisation and PPO hyper-parameters (forwarded to PPOConfig).
    learning_rate: Optional[float] = field(default=1.4e-5, metadata={"help": "the learning rate"})
    output_max_length: Optional[int] = field(default=128, metadata={"help": "maximum length for generation"})
    mini_batch_size: Optional[int] = field(default=1, metadata={"help": "the PPO minibatch size"})
    batch_size: Optional[int] = field(default=8, metadata={"help": "the batch size"})
    ppo_epochs: Optional[int] = field(default=4, metadata={"help": "the number of ppo epochs"})
    gradient_accumulation_steps: Optional[int] = field(
        default=8, metadata={"help": "the number of gradient accumulation steps"}
    )
    adafactor: Optional[bool] = field(default=False, metadata={"help": "whether to use the adafactor optimizer"})
    early_stopping: Optional[bool] = field(default=True, metadata={"help": "whether to early stop"})
    target_kl: Optional[float] = field(default=0.1, metadata={"help": "kl target for early stopping"})

    # Subtracted from the raw reward-model score in the training loop.
    reward_baseline: Optional[float] = field(
        default=0.0,
        metadata={"help": "a baseline value that is subtracted from the reward"},
    )
    batched_gen: Optional[bool] = field(default=True, metadata={"help": "whether to use the batched text gen"})

    # Checkpointing.
    save_freq: Optional[int] = field(default=100, metadata={"help": "n steps to save the model"})
    # The help text used to repeat the save_freq description; it now describes the directory.
    output_dir: Optional[str] = field(
        default=fine_tuned_with_ranking_rewards,
        metadata={"help": "directory where the fine-tuned model is saved"},
    )
    seed: Optional[int] = field(default=42, metadata={"help": "the seed"})


# Parse CLI-style arguments into a ScriptArguments instance.
# `return_remaining_strings=True` makes the parser hand back unrecognised argv entries
# instead of rejecting them — presumably needed because the Jupyter kernel is launched
# with its own arguments (TODO confirm). `[0]` keeps just the parsed dataclass.
parser = HfArgumentParser(ScriptArguments)
script_args: ScriptArguments = parser.parse_args_into_dataclasses(return_remaining_strings=True)[0]
# Convenience alias for the reward model path/name.
reward_model_name = script_args.reward_model_name


#dataset_name = "lvwerra/stack-exchange-paired"

# PPO trainer configuration. Every value comes from `script_args` except
# `optimize_cuda_cache`, which is hard-coded to True.
config = PPOConfig(
    model_name=script_args.model_name,
    learning_rate=script_args.learning_rate,
    log_with=script_args.log_with,
    batch_size=script_args.batch_size,
    mini_batch_size=script_args.mini_batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
    optimize_cuda_cache=True,
    early_stopping=script_args.early_stopping,
    target_kl=script_args.target_kl,
    ppo_epochs=script_args.ppo_epochs,
    seed=script_args.seed,
)


# Dataset is here:
#   https://huggingface.co/datasets/lvwerra/stack-exchange-paired/tree/main/data/rl

# train_dataset = load_dataset("lvwerra/stack-exchange-paired", data_dir="data/rl", split="train")
# train_dataset = train_dataset.select(range(100000))

# train_dataset = load_dataset("knkarthick/dialogsum", split="test")
# train_dataset = train_dataset.select(range(10))
# Keyword arguments passed to the reward pipeline on every call in the training loop.
# `return_all_scores=True` requests a score for every label (not for every token);
# `function_to_apply="none"` presumably returns the raw model outputs rather than
# probabilities — confirm against the transformers text-classification pipeline docs.
sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 16, "truncation": True}

# Re-create the tokenizer from the configured name; this rebinds the `tokenizer`
# variable created in the dataset-loading cell.
tokenizer = AutoTokenizer.from_pretrained(script_args.tokenizer_name)


# def build_human_feedback_dataset(
#     tokenizer, # dataset_name="lvwerra/stack-exchange-paired", input_min_text_length=2, input_max_text_length=8
# ):
#     # load with datasets
#     ds = load_dataset("knkarthick/dialogsum", split="test")
#     original_columns = ds.column_names
#     num_proc = 24

#     def preprocess_function(examples):
#         new_examples = {
#             "query": [],
#             "input_ids": [],
#         }
#         # for question in examples["question"]:
#         #     query = "Question: " + question + "\n\nAnswer: "
#         #     tokenized_question = tokenizer(query, truncation=True)
#         #     new_examples["query"].append(query)
#         #     new_examples["input_ids"].append(tokenized_question["input_ids"])
#         for example in examples:
#             prompt = 'Summarize the following conversation.\n\n'
#             end_prompt = '\n\nSummary: '
#             query = [prompt + i + end_prompt for i in example["dialogue"]]
#             example['input_ids'] = tokenizer(query, padding="max_length", truncation=True, return_tensors="pt").input_ids
#             example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
#             new_examples["query"].append(query)
#             new_examples["input_ids"].append(example["input_ids"])
            
#         return new_examples

#     ds = train_dataset.map(
#         preprocess_function,
#         batched=True,
#         num_proc=num_proc,
#         remove_columns=original_columns,
#     )
#     ds = ds.filter(lambda x: len(x["input_ids"]) < 512, batched=False)

#     ds.set_format(type="torch")
#     return ds


# # We retrieve the dataloader by calling the `build_dataset` function.
#dataset = build_human_feedback_dataset(tokenizer)

def collator(data):
    """Turn a list of per-example dicts into one dict of per-key lists.

    The keys (and their order) are taken from the first element of ``data``;
    each value is the list of that key's entries across all examples, in the
    order the examples were given. No padding or tensor stacking is done.
    """
    batch = {}
    for column in data[0]:
        batch[column] = [record[column] for record in data]
    return batch

# Keep only the rows whose index is a multiple of 100 (presumably to keep the PPO run
# short). NOTE(review): the same statement is repeated in the next cell.
sample_tokenized_dataset = tokenized_dataset.filter(lambda example, indice: indice % 100 == 0, with_indices=True)


  0%|          | 0/15 [00:00<?, ?ba/s]

In [30]:
# set seed before initializing value head for deterministic eval
set_seed(config.seed)

# Local process index of this Accelerate process; only needed by the optional
# `device_map` arguments that are left commented out below.
current_device = Accelerator().local_process_index

# Optional LoRA setup, disabled (the peft install and import are commented out
# earlier in the notebook):
# lora_config = LoraConfig(
#     r=16,
#     lora_alpha=32,
#     lora_dropout=0.05,
#     bias="none",
#     task_type="SEQ_2_SEQ_LM",
# )

# Policy model: the seq2seq LM wrapped with a value head.
model = AutoModelForSeq2SeqLMWithValueHead.from_pretrained(
    config.model_name,
    # device_map={"": current_device},
    # peft_config=lora_config,
    # layer_norm_names=[],
)

# Train on every 100th example of the tokenized dataset.
sample_tokenized_dataset = tokenized_dataset.filter(lambda example, indice: indice % 100 == 0, with_indices=True)

# Build the PPOTrainer from the config, the policy and the tokenizer. No reference
# model is supplied (`ref_model=None`); per the TRL docs the trainer is expected to
# create its own reference copy in that case.
ppo_trainer = PPOTrainer(
    config,
    model,
    ref_model=None,
    tokenizer=tokenizer,
    dataset=sample_tokenized_dataset['train'],
    data_collator=collator,
)

# Device for the reward pipeline: the accelerator's device, or a plain GPU index /
# "cpu" string when running as a single process.
device = ppo_trainer.accelerator.device
if ppo_trainer.accelerator.num_processes == 1:
    device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a `pipeline` bug

# Reward model served through a text-classification ("sentiment-analysis") pipeline.
# Fix: `device` was computed above but never handed to the pipeline, so the reward
# model stayed on the pipeline's default device instead of the GPU.
sentiment_pipe = pipeline(
    "sentiment-analysis",
    model=reward_model_name,
    # device_map={"": current_device},
    device=device,
    tokenizer=tokenizer,
)

Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/data-summarization-46d1b2508a766ab7/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-dfa1a8a7aa6c13bb.arrow


In [52]:
# Sampling settings forwarded through `ppo_trainer.generate` to the policy model's
# own `generate` method.
generation_kwargs = {
    # "min_length": -1,
    "top_k": 0,  # was the float 0.0; top_k is an integer count and 0 turns top-k filtering off
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.pad_token_id,
    # NOTE(review): 100_000 looks like an id outside the tokenizer's vocabulary, which
    # would keep generation from stopping at EOS so every response runs to the sampled
    # length — confirm this is intended for summarization.
    "eos_token_id": 100_000,
}
output_min_length = 32
output_max_length = script_args.output_max_length
# Draws the number of new tokens for each response.
output_length_sampler = LengthSampler(output_min_length, output_max_length)

# Put prompts on the device the trainer runs on instead of assuming "cuda", so the
# loop also works on a CPU-only machine.
policy_device = ppo_trainer.accelerator.device


def save_policy_checkpoint(output_dir):
    """Write the tokenizer and the unwrapped policy model to ``output_dir``."""
    ppo_trainer.tokenizer.save_pretrained(output_dir)
    ppo_trainer.accelerator.unwrap_model(ppo_trainer.model).save_pretrained(output_dir)


# `epoch` is really the batch index; wrapping the dataloader itself (rather than the
# enumerate object) lets tqdm show the total number of batches.
for epoch, batch in enumerate(tqdm(ppo_trainer.dataloader)):
    prompt_tensors = batch["input_ids"]

    batch["response"] = []
    response_tensors = []
    prompt_tensors_as_tensor = []
    for prompt_tensor in prompt_tensors:
        # Each stored prompt is a nested list of ids; its first row holds the tokens.
        prompt_ids = torch.as_tensor(prompt_tensor[0], device=policy_device)
        # The prompts were tokenized with padding="max_length". Drop the pad ids so the
        # trainer scores the same un-padded prompt that generation effectively sees.
        # NOTE(review): assumes the pad id never occurs inside a real prompt.
        prompt_ids = prompt_ids[prompt_ids != tokenizer.pad_token_id]
        prompt_tensors_as_tensor.append(prompt_ids)

        response_tensor = ppo_trainer.generate(
            prompt_ids,
            return_prompt=False,
            length_sampler=output_length_sampler,
            **generation_kwargs,
        )
        response_tensors.append(response_tensor[0])
        batch["response"].append(tokenizer.decode(response_tensor[0], skip_special_tokens=True))

    # Score prompt + response with the reward pipeline; the score of the first label,
    # minus the configured baseline, is the reward.
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
    rewards = [
        torch.tensor(output[0]["score"] - script_args.reward_baseline)
        for output in pipe_outputs
    ]

    # Run PPO step
    stats = ppo_trainer.step(prompt_tensors_as_tensor, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

    # Periodic checkpoint (skipped at step 0).
    if script_args.save_freq and epoch and epoch % script_args.save_freq == 0:
        save_policy_checkpoint(fine_tuned_with_ranking_rewards)

# Always save the final policy: with fewer batches than `save_freq` the periodic
# branch above never fires and the tuned model would otherwise be lost.
save_policy_checkpoint(fine_tuned_with_ranking_rewards)

  f"KL divergence is starting to become negative: {mean_kl.item():.2f} - this might be a precursor for failed training."
  f"KL divergence is starting to become negative: {mean_kl.item():.2f} - this might be a precursor for failed training."
  f"KL divergence is starting to become negative: {mean_kl.item():.2f} - this might be a precursor for failed training."
  f"KL divergence is starting to become negative: {mean_kl.item():.2f} - this might be a precursor for failed training."
  f"KL divergence is starting to become negative: {mean_kl.item():.2f} - this might be a precursor for failed training."
  f"KL divergence is starting to become negative: {mean_kl.item():.2f} - this might be a precursor for failed training."
  f"KL divergence is starting to become negative: {mean_kl.item():.2f} - this might be a precursor for failed training."
18it [08:54, 29.69s/it]
