# OpenCALM SageMaker Finetuning by JAQKET dataset

[OpenCALM](https://huggingface.co/spaces/kyo-takano/OpenCALM-7B) を SageMaker の Training Job で Fine Tuning し、学習済みモデルを推論エンドポイントに Hosting する Notebook です。[JAQKET](https://www.nlp.ecei.tohoku.ac.jp/projects/jaqket/) データセットのうち、学習データを Fine Tuning に、評価 (dev) データを推論結果の確認に使用しています。

以下の環境で Training Job / Hosting の動作確認を行っています。

* `ml.g5.2xlarge(NVIDIA A10G Tensor Core GPU 搭載 VRAM 24GB, RAM 32GB, vCPU 8)` : `PyTorch 1.13 Python 3.9 GPU Optimized`
 
[各インスタンスの料金についてはこちら](https://aws.amazon.com/jp/sagemaker/pricing/)をご確認ください。

In [None]:
!pip install -U "sagemaker>=2.143.0"

In [None]:
!pip install tqdm

In [None]:
import sagemaker, boto3, json
from sagemaker import get_execution_role
from sagemaker.pytorch.model import PyTorchModel
from sagemaker.huggingface import HuggingFace

# Resolve the execution role, AWS region, SageMaker session and the session's
# default S3 bucket; these names are reused by the cells below.
role = get_execution_role()
region = boto3.Session().region_name
sess = sagemaker.Session()
bucket = sess.default_bucket()

# Last expression of the cell: displays the installed SageMaker SDK version.
sagemaker.__version__

## Upload Data

Fine Tuning 用の日本語データをフォルダに配置してアップロードする。

### Prepare JAQKET dataset

In [None]:
!wget -P data https://jaqket.s3.ap-northeast-1.amazonaws.com/data/aio_02/aio_02_train.jsonl

In [None]:
!head -n 2 data/aio_02_train.jsonl

In [None]:
# Reshape the JAQKET training JSONL into instruction / output / input records
# and write the result back out as JSON Lines.
import pandas as pd

raw_records = pd.read_json("data/aio_02_train.jsonl", orient="records", lines=True)
renamed_records = raw_records.rename(
    columns={"question": "instruction", "answers": "output"}
)
df = renamed_records[["instruction", "output"]]
# Keep only the first listed answer and terminate it with "」".
# NOTE(review): presumably pairs with a "「" opened by the prompt template
# (inference later extracts the text between "「" and "」") - confirm.
df["output"] = df["output"].map(lambda answers: f"{answers[0]}」")
# The "input" field is left empty for every record.
df["input"] = ""
print(df.shape)
df.to_json(
    "data/aio_02_train_formatted.jsonl",
    orient="records",
    force_ascii=False,
    lines=True,
)

In [None]:
df.head(2)

In [None]:
# Upload the formatted training file to S3 under the "OpenCALM" key prefix.
# The returned value is echoed as the cell output and is later passed to
# fit() as the "train" channel.
input_train = sess.upload_data(
    path="./data/aio_02_train_formatted.jsonl", key_prefix="OpenCALM"
)
input_train

## Fine-tuning

In [None]:
# Identifier of the base model to fine-tune.
model_name = "cyberagent/open-calm-7b"
# Text after the last "/" of the identifier; reused below as the training job
# base name, the model name and the endpoint name.
model_name_base = model_name.rpartition("/")[2]

In [None]:
# Hyperparameters handed to the training entry point (finetune.py) through the
# estimator defined in the next cell.
hyperparameters = {
    "base_model": model_name,
    # 'load_in_8bit': True,
    # 'load_in_4bit': True,
    "pad_token_id": 1,
    # Location of the uploaded file inside the training container: the "train"
    # channel passed to fit() is mounted under /opt/ml/input/data/train.
    "data_path": "/opt/ml/input/data/train/aio_02_train_formatted.jsonl",
    "num_epochs": 2,  # default 3
    "cutoff_len": 256,
    "group_by_length": False,
    # /opt/ml/model is the directory SageMaker packages as the model artifact.
    "output_dir": "/opt/ml/model",
    # 'resume_from_checkpoint': '/opt/ml/checkpoints',
    "lora_target_modules": "[query_key_value]",
    "lora_r": 16,
    # NOTE(review): presumably finetune.py derives gradient accumulation steps
    # from batch_size / micro_batch_size - confirm against the script.
    "batch_size": 32,
    "micro_batch_size": 4,
    # 'val_set_size': 200,
    "prompt_template_name": "simple_qa_ja",
}

In [None]:
# Configure and launch the fine-tuning job: one ml.g5.2xlarge spot instance
# running scripts/code/finetune.py with the hyperparameters defined above.
huggingface_estimator = HuggingFace(
    base_job_name=model_name_base,
    role=role,
    entry_point="finetune.py",
    source_dir="./scripts/code",
    instance_type="ml.g5.2xlarge",
    instance_count=1,
    volume_size=200,
    transformers_version="4.26",
    pytorch_version="1.13",
    py_version="py39",
    use_spot_instances=True,
    max_wait=86400,
    hyperparameters=hyperparameters,
    # Raw strings keep "\d" / "\." as regex escapes instead of (invalid) Python
    # string escapes; "\d+" also captures losses of 10 or more, which the
    # single-digit pattern silently dropped.
    metric_definitions=[
        {"Name": "eval_loss", "Regex": r"'eval_loss': (\d+\.\d+)"},
        {"Name": "train_loss", "Regex": r"'loss': (\d+\.\d+)"},
    ],
    # checkpoint_s3_uri=f"s3://{bucket}/{model_name_base}/checkpoint/",
)
# Start training; the uploaded file is exposed to the job as the "train" channel.
huggingface_estimator.fit({"train": input_train})

## Download and Extract Model

In [None]:
import boto3
import sagemaker


def get_latest_training_job_artifact(base_job_name):
    """Return the model artifact S3 URI of the newest completed training job.

    Args:
        base_job_name: Substring that the training job name must contain.

    Returns:
        The ``S3ModelArtifacts`` value reported by ``describe_training_job``
        for the most recently created matching job.

    Raises:
        IndexError: If no completed training job name contains
            ``base_job_name``.
    """
    sagemaker_client = boto3.client("sagemaker")
    response = sagemaker_client.list_training_jobs(
        NameContains=base_job_name,
        # Jobs that are still running or that failed may not report
        # ModelArtifacts, so only completed jobs are considered.
        StatusEquals="Completed",
        SortBy="CreationTime",
        SortOrder="Descending",
    )
    summaries = response["TrainingJobSummaries"]
    if not summaries:
        raise IndexError(
            f"No completed training job found whose name contains {base_job_name!r}"
        )
    # The summary already carries the job name; no need to parse it from the ARN.
    training_job_description = sagemaker_client.describe_training_job(
        TrainingJobName=summaries[0]["TrainingJobName"]
    )
    return training_job_description["ModelArtifacts"]["S3ModelArtifacts"]


# Prefer the artifact location known to the in-memory estimator.
try:
    model_data = huggingface_estimator.model_data
except Exception:
    # Retrieve artifact url when kernel is restarted (the estimator variable no
    # longer exists) or the estimator cannot report it. `Exception` rather than
    # a bare `except` so that KeyboardInterrupt / SystemExit still propagate.
    model_data = get_latest_training_job_artifact(model_name_base)

print(model_data)

In [None]:
!aws s3 cp {model_data} {model_name_base}.tar.gz

In [None]:
!rm -rf scripts/{model_name_base} && mkdir scripts/{model_name_base}
!tar -xvf {model_name_base}.tar.gz -C scripts/{model_name_base} --no-same-owner --wildcards adapter_*
!ls -l scripts/{model_name_base}

## Package and Upload Model

In [None]:
# Archive every non-hidden entry of scripts/ into {model_name_base}.tar.gz one
# level up; this overwrites the archive downloaded from S3 earlier.
%cd scripts
!tar -czvf ../{model_name_base}.tar.gz *
# Return to the previous working directory.
%cd -

In [None]:
# Upload the re-packaged archive to the default bucket under the "OpenCALM"
# prefix; the returned value is echoed and later used as the model's model_data.
model_path = sess.upload_data(
    f"{model_name_base}.tar.gz", bucket=bucket, key_prefix="OpenCALM"
)
model_path

## Deploy Model

In [None]:
from sagemaker.async_inference import AsyncInferenceConfig
from sagemaker.serializers import JSONSerializer


# Describe the hosted model: the uploaded archive served with the PyTorch 1.13 /
# Python 3.9 framework settings. Runtime settings are passed to the container
# through environment variables.
huggingface_model = PyTorchModel(
    model_data=model_path,
    framework_version="1.13",
    py_version="py39",
    role=role,
    name=model_name_base,
    env={
        # JSON-encoded settings.
        # NOTE(review): presumably read by the inference script inside the
        # model package - confirm against that script.
        "model_params": json.dumps(
            {
                "base_model": model_name,
                "lora_weights": model_name_base,  # path relative to model package
                "peft": True,
                "load_8bit": False,
                "prompt_template": "simple_qa_ja",
            }
        ),
        "SAGEMAKER_MODEL_SERVER_TIMEOUT": "3600",
    },
)

# deploy model to SageMaker Inference
# One ml.g5.2xlarge instance behind an endpoint named after the model; requests
# are serialized as JSON.
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.2xlarge",
    endpoint_name=model_name_base,
    serializer=JSONSerializer(),
    # async_inference_config=AsyncInferenceConfig()
)

## Run Inference

In [None]:
# With SageMaker SDK

from sagemaker.predictor import Predictor
from sagemaker.predictor_async import AsyncPredictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Client bound to the deployed endpoint by name, so this cell also works after a
# kernel restart; requests and responses are JSON.
predictor_client = Predictor(
    endpoint_name=model_name_base,
    sagemaker_session=sess,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
)
# predictor_client = AsyncPredictor(
#     predictor=predictor_client,
#     name=model_name_base
# )
# Sample request: one question plus generation settings.
data = {
    "instruction": "映画『ウエスト・サイド物語』に登場する2つの少年グループといえば、シャーク団と何団?",
    "max_new_tokens": 64,
    "temperature": 0.1,
    "do_sample": True,
    "pad_token_id": 1,
    "bos_token_id": 0,
    # Was misspelled "eos_token_is", so the end-of-sequence id was never sent
    # under the key name used by the other *_token_id settings.
    "eos_token_id": 0,
    # "repetition_penalty": 1.05,
    # "top_p": 0.75,
    # "top_k": 40,
    # "no_repeat_ngram_size": 2,
    "stop_ids": [1, 0],
}
response = predictor_client.predict(data=data)
print(response)

### Inference for dev data

In [None]:
!wget -P data https://jaqket.s3.ap-northeast-1.amazonaws.com/data/aio_02/aio_02_dev_v1.0.jsonl

In [None]:
import re


def inference(instruction):
    """Send one question to the endpoint and return the extracted answer.

    Args:
        instruction: Question text placed in the request's ``instruction``
            field.

    Returns:
        The text of the last ``「...」`` pair found after prefixing the response
        with ``「``, or the unmodified response when no such pair exists.
    """
    data = {
        "instruction": instruction,
        "input": "",
        "max_new_tokens": 64,
        # NOTE(review): sampling is disabled below, so temperature presumably
        # has no effect on the endpoint side - confirm.
        "temperature": 0.1,
        "do_sample": False,
        "num_beams": 5,
        "pad_token_id": 1,
        "bos_token_id": 0,
        # Was misspelled "eos_token_is", so the end-of-sequence id was never
        # sent under the key name used by the other *_token_id settings.
        "eos_token_id": 0,
        # "repetition_penalty": 1.05,
        "stop_ids": [1, 0],
    }
    response = predictor_client.predict(data=data)
    # Prefix "「" so that an answer terminated by "」" forms a complete pair.
    bracketed = re.findall("「(.*?)」", f"「{response}")
    # No closing bracket in the output: hand back the raw response instead.
    return bracketed[-1] if bracketed else response

In [None]:
import pandas as pd
from tqdm import tqdm


# Run the endpoint over every dev question and record whether the extracted
# answer appears in that row's list of accepted answers.
df = pd.read_json("data/aio_02_dev_v1.0.jsonl", orient="records", lines=True)

llm_answers = []
matches = []
# iterrows() is a generator without a length, so pass the total explicitly to
# let tqdm show percentage and remaining time.
for _, row in tqdm(df.iterrows(), total=len(df)):
    llm_answer = inference(row["question"])
    llm_answers.append(llm_answer)
    matches.append(llm_answer in row["answers"])

# Assign the lists positionally; wrapping them in pd.Series would align on
# index labels instead of row order.
df["llm_answers"] = llm_answers
df["match"] = matches

In [None]:
# Number of matched answers out of the total number of dev questions.
print(df["match"].sum(), "/", len(df.index))

In [None]:
# Save the dev questions together with the model answers and match flags,
# without the index column.
df.to_csv(f"data/{model_name_base}.csv", index=False)

## Benchmark Speed

In [None]:
%timeit response = predictor_client.predict(data=data)

## Delete Endpoint

In [None]:
# Remove the hosted resources created by deploy(): the model first, then the endpoint.
predictor.delete_model()
predictor.delete_endpoint()