![Pipeline](./img/generative_ai_pipeline_rlhf_plus.png)

In [5]:
import psutil

notebook_memory = psutil.virtual_memory()
print(notebook_memory)

if notebook_memory.total < 32 * 1000 * 1000 * 1000:
    print('*******************************************')    
    print('YOU ARE NOT USING THE CORRECT INSTANCE TYPE')
    print('PLEASE CHANGE INSTANCE TYPE TO  m5.2xlarge ')
    print('*******************************************')
else:
    correct_instance_type=True

svmem(total=66814242816, available=64650002432, percent=3.2, used=1483964416, free=52365144064, active=2943295488, inactive=9972576256, buffers=2768896, cached=12962365440, shared=913408, slab=942276608)


# Quantitative Results with ROUGE Metric

The [ROUGE metric](https://en.wikipedia.org/wiki/ROUGE_(metric)) helps quantify the validity of summarizations produced by models. It compares summarizations to a "baseline" summary which is usually created by a human. While not perfect, it does give an indication to the overall increase in summarization effectiveness that we have accomplished by fine-tuning.

In [6]:
%pip install --disable-pip-version-check -q \
    transformers==4.27.2 \
    datasets==2.9.0 \
    accelerate==0.17.0 \
    promptsource==0.2.3 \
    evaluate==0.4.0 \
    trl==0.4.1 \
    rouge_score==0.1.2 \
    loralib==0.1.1

[0mNote: you may need to restart the kernel to use updated packages.


# Create prompts for few-shot, one-shot, zero-shot inference on sample data

In [7]:
dataset_templates_name = 'amazon_us_reviews/Wireless_v1_00'
prompt_template_name = 'Generate review headline based on review body'

In [8]:
import pandas as pd
import csv
file = './data-tsv/amazon_reviews_us_Digital_Video_Games_v1_00.tsv.gz'

# Read the file
df = pd.read_csv(file, delimiter="\t", quoting=csv.QUOTE_NONE, compression="gzip")

df.isna().values.any()
df = df.dropna()
df = df.reset_index(drop=True)    

print("Shape of dataframe {}".format(df.shape))

# Convert Pandas dataframes into Datasets
import datasets
from datasets import Dataset

# Create Dataset objects (Arrow PyTables) from Pandas dataframes
dataset = Dataset.from_pandas(df)
df.head()

Shape of dataframe (145427, 15)


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,21269168,RSH1OZ87OYK92,B013PURRZW,603406193,Madden NFL 16 - Xbox One Digital Code,Digital_Video_Games,2,2,3,N,N,A slight improvement from last year.,I keep buying madden every year hoping they ge...,2015-08-31
1,US,133437,R1WFOQ3N9BO65I,B00F4CEHNK,341969535,Xbox Live Gift Card,Digital_Video_Games,5,0,0,N,Y,Five Stars,Awesome,2015-08-31
2,US,45765011,R3YOOS71KM5M9,B00DNHLFQA,951665344,Command & Conquer The Ultimate Collection [Ins...,Digital_Video_Games,5,0,0,N,Y,Hail to the great Yuri!,If you are prepping for the end of the world t...,2015-08-31
3,US,113118,R3R14UATT3OUFU,B004RMK5QG,395682204,Playstation Plus Subscription,Digital_Video_Games,5,0,0,N,Y,Five Stars,Perfect,2015-08-31
4,US,22151364,RV2W9SGDNQA2C,B00G9BNLQE,640460561,Saints Row IV - Enter The Dominatrix [Online G...,Digital_Video_Games,5,0,0,N,Y,Five Stars,Awesome!,2015-08-31


In [9]:
from promptsource.templates import DatasetTemplates
prompt_templates = DatasetTemplates(dataset_templates_name) 

print('*** Available prompts:')

for template in prompt_templates.templates.values():
    print(template.get_name())

*** Available prompts:
Generate review headline based on review body
Generate review based on rating and category
Given the review headline return a categorical rating
Generate review headline based on rating
Given the review body return a categorical rating


In [10]:
from pprint import pprint

prompt = prompt_templates[prompt_template_name]
print('** Selected prompt name: {}'.format(prompt_template_name))

** Selected prompt name: Generate review headline based on review body


In [11]:
print('** Available prompt answers: {}'.format(prompt.answer_choices))

** Available prompt answers: None


In [12]:
print('** Selected prompt template:')
pprint(prompt.__dict__)

** Selected prompt template:
{'answer_choices': None,
 'id': '5feaa0d7-e4e0-46cc-8517-e00bfa7fd00e',
 'jinja': 'Give a short sentence describing the following product review:\n'
          '{{review_body}} \n'
          '|||\n'
          '{{review_headline}}',
 'metadata': <promptsource.templates.Template.Metadata object at 0x7fa3469490d0>,
 'name': 'Generate review headline based on review body',
 'reference': 'Generate review headline based on review body'}


# Qualitative 

In [13]:
prompts_and_labels = dataset \
    .filter(lambda row: len(row['review_headline']) > 200) \
    .select(range(1000)).map(lambda row : {'prompt': prompt.apply(row)[0], 'label': prompt.apply(row)[1]})

  0%|          | 0/1000 [00:00<?, ?ex/s]

In [14]:
prompts = prompts_and_labels['prompt']
human_baseline_summaries = prompts_and_labels['review_headline']

In [15]:
# for prompt_label in dataset:
#     prompt = prompt_label['prompt']
#     inputs = tokenizer(prompt, return_tensors='pt')

#     response = tokenizer.decode(model.generate(inputs["input_ids"], 
#                        max_new_tokens=200,
#                        do_sample=True, 
#                        top_k=50, 
#                        top_p=0.9
#                       )[0],
#                      skip_special_tokens=True)

#     print('PROMPT: {}'.format(prompt))
#     print('RESPONSE: {}'.format(response))
#     print('EXPECTED RESPONSE: {}'.format(prompt_label['label']))
#     print('----')

In [16]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

pretrained_model_checkpoint='t5-base'
pretrained_model_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_checkpoint, use_fast=True)
pretrained_model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_checkpoint)

instruct_fine_tuned_model_checkpoint='google/flan-t5-base'
instruct_fine_tuned_model_tokenizer = AutoTokenizer.from_pretrained(instruct_fine_tuned_model_checkpoint, use_fast=True)
instruct_fine_tuned_model = AutoModelForSeq2SeqLM.from_pretrained(instruct_fine_tuned_model_checkpoint)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


# Quantitative Results with ROUGE Metric

The [ROUGE metric](https://en.wikipedia.org/wiki/ROUGE_(metric)) helps quantify the validity of summarizations produced by models. It compares summarizations to a "baseline" summary which is usually created by a human. While not perfect, it does give an indication to the overall increase in summarization effectiveness that we have accomplished by fine-tuning.# ROUGE evaluation of summaries

In [17]:
import evaluate

rouge = evaluate.load('rouge')

In [19]:
from transformers import GenerationConfig

pretrained_model_summaries = []
instruct_fine_tuned_model_summaries = []

for idx, prompt in enumerate(prompts):
    input_ids = pretrained_model_tokenizer(prompt, return_tensors="pt").input_ids

    pretrained_model_outputs = pretrained_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    pretrained_model_text_output = pretrained_model_tokenizer.decode(pretrained_model_outputs[0], skip_special_tokens=True)
    pretrained_model_summaries.append(pretrained_model_text_output)

    instruct_fine_tuned_model_outputs = instruct_fine_tuned_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    instruct_fine_tuned_model_text_output = instruct_fine_tuned_model_tokenizer.decode(instruct_fine_tuned_model_outputs[0], skip_special_tokens=True)
    instruct_fine_tuned_model_summaries.append(instruct_fine_tuned_model_text_output)

Token indices sequence length is longer than the specified maximum sequence length for this model (607 > 512). Running this sequence through the model will result in indexing errors


In [20]:
pretrained_model_results = rouge.compute(
    predictions=pretrained_model_summaries,
    references=human_baseline_summaries[0:len(pretrained_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)
pretrained_model_results

{'rouge1': 0.007368872409147647,
 'rouge2': 0.00381036656171144,
 'rougeL': 0.006487574402761279,
 'rougeLsum': 0.006745900840542784}

In [21]:
instruct_fine_tuned_model_results = rouge.compute(
    predictions=instruct_fine_tuned_model_summaries,
    references=human_baseline_summaries[0:len(instruct_fine_tuned_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)
instruct_fine_tuned_model_results

{'rouge1': 0.09712394312163732,
 'rouge2': 0.053035563434343486,
 'rougeL': 0.09372375672342996,
 'rougeLsum': 0.09386530138173066}

In [22]:
%store pretrained_model_checkpoint

Stored 'pretrained_model_checkpoint' (str)


In [23]:
%store instruct_fine_tuned_model_checkpoint

Stored 'instruct_fine_tuned_model_checkpoint' (str)


In [24]:
%store dataset_templates_name

Stored 'dataset_templates_name' (str)


In [25]:
%store prompt_template_name

Stored 'prompt_template_name' (str)


# Release Resources

In [None]:
%%html

<p><b>Shutting down your kernel for this notebook to release resources.</b></p>
<button class="sm-command-button" data-commandlinker-command="kernelmenu:shutdown" style="display:none;">Shutdown Kernel</button>
        
<script>
try {
    els = document.getElementsByClassName("sm-command-button");
    els[0].click();
}
catch(err) {
    // NoOp
}    
</script>