# Alpaca-LoRA SageMaker Finetuning

This notebook contains sample code for fine-tuning and deploying [Alpaca-LoRA](https://github.com/tloen/alpaca-lora) on Amazon SageMaker.

In [None]:
!pip install "sagemaker>=2.143.0" -U

In [None]:
import sagemaker, boto3, json
from sagemaker import get_execution_role
from sagemaker.pytorch.model import PyTorchModel
from sagemaker.huggingface import HuggingFace

# SageMaker session and the bucket it reports as its default; both are reused
# by the upload and inference cells further down.
sess = sagemaker.Session()
bucket = sess.default_bucket()

# Execution role passed to the training job and the deployed model.
role = get_execution_role()

# Region of the default boto3 session.
region = boto3.Session().region_name

# Show the installed SDK version as the cell output.
sagemaker.__version__

## Upload Data

Fine Tuning 用の日本語データをフォルダに配置してアップロードします。

ここでは例として Alpaca データセットを日本語に翻訳したものを利用します。（LICENSE：Apache-2.0）

In [None]:
!curl https://raw.githubusercontent.com/maekawataiki/alpaca_ja/main/alpaca_cleaned_ja.json --create-dirs -o data/alpaca_data_ja.json

In [None]:
!head data/alpaca_data_ja.json

In [None]:
# Upload the local dataset under the "Alpaca" key prefix; the value returned by
# upload_data is later passed to fit() as the 'train' channel.
input_train = sess.upload_data(path="./data/alpaca_data_ja.json", key_prefix="Alpaca")
input_train

## Fine-tuning

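Fine-tuning took approximately 4 hours for 1 epoch on an `ml.p3.2xlarge` instance. Note that the estimator below is configured for `ml.g5.2xlarge`, so the duration may differ.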

In [None]:
# Arguments forwarded to the training entry point (finetune.py).
hyperparameters = dict(
    # Base model and how it is loaded.
    base_model='huggyllama/llama-7b',
    load_in_4bit=True,
    pad_token_id=0,
    # Dataset file as seen by the training script; the file name matches the
    # one uploaded for the 'train' channel.
    data_path='/opt/ml/input/data/train/alpaca_data_ja.json',
    # Training schedule.
    num_epochs=1,  # default 3
    cutoff_len=512,
    group_by_length=False,
    # Where the training script writes its outputs.
    output_dir='/opt/ml/model',
    # LoRA settings.
    lora_target_modules='[q_proj,v_proj]',
    lora_r=8,
    micro_batch_size=8,
    prompt_template_name='alpaca',
)

In [None]:
# Configure the fine-tuning job: runs scripts/code/finetune.py with the
# `hyperparameters` dict on a single ml.g5.2xlarge spot instance.
huggingface_estimator = HuggingFace(
    base_job_name="Alpaca",
    role=role,
    entry_point='finetune.py',
    source_dir='./scripts/code',
    instance_type='ml.g5.2xlarge',
    instance_count=1,
    volume_size=100,
    transformers_version='4.26',
    pytorch_version='1.13',
    py_version='py39',
    use_spot_instances=True,
    max_wait=86400,
    hyperparameters=hyperparameters,
    # Regexes applied to the training log to extract metrics.
    # Raw strings keep "\d" from being treated as an (invalid) string escape,
    # and "\d+" before the decimal point also matches losses >= 10.
    metric_definitions=[
        {'Name': 'eval_loss', 'Regex': r"'eval_loss': (\d+\.\d+)"},
        {'Name': 'train_loss', 'Regex': r"'loss': (\d+\.\d+)"},
    ],
)
# Start training; the uploaded dataset is exposed to the job as the 'train' channel.
huggingface_estimator.fit({'train': input_train})

## Download and Extract Model

In [None]:
import boto3
import sagemaker

def get_latest_training_job_artifact(base_job_name):
    """Return the S3 URI of the model artifact of the newest completed training job.

    Args:
        base_job_name: Substring that the training job name must contain.

    Returns:
        The ``ModelArtifacts.S3ModelArtifacts`` value of the most recently
        created training job whose name contains ``base_job_name`` and whose
        status is ``Completed``.

    Raises:
        IndexError: If no completed training job matches ``base_job_name``.
    """
    sagemaker_client = boto3.client('sagemaker')
    # Only consider completed jobs: a failed, stopped or still-running job may
    # be newer but has no usable model artifact.
    response = sagemaker_client.list_training_jobs(
        NameContains=base_job_name,
        StatusEquals='Completed',
        SortBy='CreationTime',
        SortOrder='Descending',
        MaxResults=1,
    )
    summaries = response['TrainingJobSummaries']
    if not summaries:
        raise IndexError(
            f"No completed training job found with a name containing {base_job_name!r}"
        )
    # The summary already carries the job name, so there is no need to parse the ARN.
    training_job_name = summaries[0]['TrainingJobName']
    training_job_description = sagemaker_client.describe_training_job(
        TrainingJobName=training_job_name
    )
    return training_job_description['ModelArtifacts']['S3ModelArtifacts']

try:
    model_data = huggingface_estimator.model_data
except:
    # Retrieve artifact url when kernel is restarted
    model_data = get_latest_training_job_artifact('Alpaca')
    
!aws s3 cp {model_data} alpaca_ja.tar.gz

In [None]:
# !rm -rf scripts/model && mkdir scripts/model
!tar -xvf alpaca_ja.tar.gz -C scripts/model --no-same-owner --wildcards adapter*
!ls -l scripts/model

## Package and Upload Model

In [None]:
%cd scripts
!tar -czvf ../package.tar.gz *
%cd -

In [None]:
# Upload the packaged archive to S3; the returned value is used as
# `model_data` when the model is deployed.
model_path = sess.upload_data(
    path='package.tar.gz',
    bucket=bucket,
    key_prefix="Alpaca",
)
model_path

## Deploy Model

In [None]:
from sagemaker.async_inference import AsyncInferenceConfig
from sagemaker.serializers import JSONSerializer

# The same name is used for both the model and the endpoint.
endpoint_name = "Alpaca-LoRA-ja"

# Model definition: the uploaded package plus the environment variables set on
# the container. "model_params" is a JSON-encoded settings dict.
huggingface_model = PyTorchModel(
    name=endpoint_name,
    role=role,
    model_data=model_path,
    framework_version="1.13",
    py_version='py39',
    env=dict(
        model_params=json.dumps(dict(
            base_model="huggyllama/llama-7b",
            lora_weights="model",  # path relative to model package
            peft=True,
            load_8bit=True,
            prompt_template="alpaca",
        )),
        SAGEMAKER_MODEL_SERVER_TIMEOUT="3600",
    ),
)

# Deploy the model to a SageMaker endpoint with one ml.g5.2xlarge instance.
predictor = huggingface_model.deploy(
    endpoint_name=endpoint_name,
    instance_type='ml.g5.2xlarge',
    initial_instance_count=1,
    serializer=JSONSerializer(),
    # async_inference_config=AsyncInferenceConfig()
)

## Run Inference

In [None]:
# With SageMaker SDK

from sagemaker.predictor import Predictor
from sagemaker.predictor_async import AsyncPredictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Client bound to the endpoint by name, sending and receiving JSON.
predictor_client = Predictor(
    endpoint_name=endpoint_name,
    serializer=JSONSerializer(),
    deserializer=JSONDeserializer(),
    sagemaker_session=sess,
)
# predictor_client = AsyncPredictor(
#     predictor=predictor_client,
#     name=endpoint_name
# )

# Request payload: the instruction, an optional input, and generation settings.
data = dict(
    instruction="AWS とは？",
    input="",
    max_new_tokens=128,
    temperature=0.3,
)
response = predictor_client.predict(data=data)
print(response)

In [None]:
# With Boto3

import boto3
import json

# Defined again here so that this cell can be run on its own.
endpoint_name = "Alpaca-LoRA-ja"
sagemaker_client = boto3.client('sagemaker-runtime')

# Same request payload as in the SDK example.
data = dict(
    instruction="AWS とは？",
    input="",
    max_new_tokens=128,
    temperature=0.3,
)

# Send the payload as JSON and ask for a JSON response.
response = sagemaker_client.invoke_endpoint(
    EndpointName=endpoint_name,
    Body=json.dumps(data),
    ContentType='application/json',
    Accept='application/json',
)

# Read the response body and decode it from JSON.
raw_body = response['Body'].read()
result = json.loads(raw_body)
print(result)

## Benchmark Speed

Reference result from an earlier run of the cell below: `18.1 s ± 26.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)`


In [None]:
# Time the SDK predict call against the endpoint, reusing `predictor_client` and `data`.
%timeit response = predictor_client.predict(data=data)

## Delete Endpoint

In [None]:
# Remove the model and the endpoint created by the deploy cell.
# NOTE(review): delete_model() is called before delete_endpoint(); presumably the
# SDK looks the model up through the live endpoint — confirm before reordering.
predictor.delete_model()
predictor.delete_endpoint()