### 1. 安装 huggingface-hub 并下载模型到本地

In [None]:
# Install or upgrade the huggingface-hub package (-U) with quiet output (-qq).
!pip install huggingface-hub -Uqq

In [None]:
from huggingface_hub import snapshot_download
from pathlib import Path

# Hugging Face repository id and the exact commit used for the download.
model_name = "csdc-atl/buffer-instruct-baichuan-001"
commit_hash = "a0a840f0fb9bc0ce43ca5cb03cd6383412883774"

# Local directory that receives the download; creating it is a no-op when
# it already exists.
local_model_path = Path("./Buffer_instruct_baichuan_001_model")
local_model_path.mkdir(exist_ok=True)

In [None]:
# Fetch the pinned revision into the local directory; a later cell locates
# the files by globbing for "snapshots/*" underneath it.
snapshot_download(
    repo_id=model_name,
    revision=commit_hash,
    cache_dir=local_model_path,
)

### 2. 把模型拷贝到S3为后续部署做准备

In [None]:
import sagemaker
from sagemaker import image_uris
import boto3
import os
import time
import json

# Execution role for the endpoint.
role = sagemaker.get_execution_role()
# SageMaker session for interacting with different AWS APIs.
sess = sagemaker.session.Session()
# Bucket to house artifacts.
bucket = sess.default_bucket()

# Read the region through the session's public attribute rather than the
# private ``_region_name`` field.
region = sess.boto_region_name
account_id = sess.account_id()

s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

In [None]:
# S3 key prefixes: where the model checkpoint goes, and where the deployment
# code tarball goes.
s3_model_prefix = "LLM-RAG/workshop/Buffer_instruct_baichuan_001_model"
s3_code_prefix = "LLM-RAG/workshop/Buffer_instruct_deploy_code"

# First entry matching "snapshots/*" under the local download directory.
# Fail with an actionable message instead of a bare IndexError when the
# download has not been run (or produced nothing).
model_snapshot_path = next(local_model_path.glob("**/snapshots/*"), None)
if model_snapshot_path is None:
    raise FileNotFoundError(
        f"No snapshot directory found under {local_model_path}; "
        "run the snapshot_download cell first."
    )

print(f"s3_code_prefix: {s3_code_prefix}")
print(f"model_snapshot_path: {model_snapshot_path}")

In [None]:
# Recursively copy the local snapshot directory to the model prefix in the bucket.
!aws s3 cp --recursive {model_snapshot_path} s3://{bucket}/{s3_model_prefix}

### 3. 模型部署准备(entrypoint脚本,容器镜像,服务配置)

In [None]:
# DJL inference (DeepSpeed) container image. China regions host the image
# under a different registry account and domain suffix, so select both from
# the region instead of requiring a manual edit of this cell.
if region.startswith("cn-"):
    ecr_account, ecr_domain = "727897471807", "amazonaws.com.cn"
else:
    ecr_account, ecr_domain = "763104351884", "amazonaws.com"

inference_image_uri = (
    f"{ecr_account}.dkr.ecr.{region}.{ecr_domain}"
    "/djl-inference:0.21.0-deepspeed0.8.3-cu117"
)

print(f"Image going to be used is ---- > {inference_image_uri}")

In [None]:
# Create the directory that holds the deployment code (no error if it already exists).
!mkdir -p Buffer_instruct_deploy_code

In [None]:
%%writefile Buffer_instruct_deploy_code/model.py
from djl_python import Input, Output
import torch
import logging
import math
import os
from transformers import AutoTokenizer, AutoModelForCausalLM


def load_model(properties):
    """Load the causal LM and its tokenizer, placing the model on the GPU.

    Args:
        properties: Serving properties mapping. ``model_dir`` is required;
            ``model_id`` overrides it as the load location when present.
            ``tensor_parallel_degree`` is no longer read, because this
            loader never used its value.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is switched to eval mode,
        cast to half precision and moved to CUDA.
    """
    model_location = properties["model_dir"]
    if "model_id" in properties:
        model_location = properties["model_id"]
    logging.info("Loading model in %s", model_location)

    # trust_remote_code lets the repository's own model/tokenizer classes load.
    tokenizer = AutoTokenizer.from_pretrained(model_location, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_location, trust_remote_code=True)
    model = model.eval().half().cuda()

    return model, tokenizer


# Module-level state, populated lazily on the first request and reused afterwards.
model = None
tokenizer = None
generator = None  # not used by handle(); kept so the module's names are unchanged


def handle(inputs: Input):
    """Serve one chat request.

    The JSON body must contain ``inputs`` (the prompt). ``parameters``
    (extra keyword arguments for ``model.chat``) and ``history`` are
    optional and default to ``{}`` and ``[]``.

    Args:
        inputs: The incoming request.

    Returns:
        ``None`` for an empty request, otherwise an ``Output`` whose JSON
        body has ``outputs`` (the reply) and ``history`` (updated history).
    """
    global model, tokenizer
    # Identity check: only the "not loaded yet" sentinel should trigger a load.
    if model is None:
        model, tokenizer = load_model(inputs.get_properties())

    if inputs.is_empty():
        return None
    data = inputs.get_as_json()

    prompt = data["inputs"]
    # Tolerate requests that omit (or null out) the optional fields instead
    # of failing with KeyError.
    params = data.get("parameters") or {}
    history = data.get("history") or []

    response, history = model.chat(tokenizer, prompt, history=history, **params)

    result = {"outputs": response, "history": history}
    return Output().add_as_json(result)

In [None]:
# Print the S3 location of the uploaded model, in the form used for option.s3url.
print(f"option.s3url ==> s3://{bucket}/{s3_model_prefix}/")

#### Note: option.s3url 需要按照自己的账号进行修改, 可以拷贝上一个cell的输出

#### 注意: 必须把transformers升级到4.28.1以上,否则会出现有些module找不到的问题(```from transformers.models.llama.configuration_llama import LlamaConfig```)

如果是中国区建议添加国内的pip镜像,如下代码所示
```
%%writefile Buffer_instruct_deploy_code/requirements.txt
-i https://pypi.tuna.tsinghua.edu.cn/simple
transformers==4.28.1
```

In [None]:
%%writefile Buffer_instruct_deploy_code/requirements.txt
transformers==4.28.1

In [None]:
%%writefile Buffer_instruct_deploy_code/serving.properties
# NOTE: option.s3url below is account-specific. Replace it with the
# "option.s3url ==>" value printed by the earlier cell before deploying.
engine=Python
option.tensor_parallel_degree=1
option.s3url = s3://sagemaker-us-west-2-106839800180/LLM-RAG/workshop/Buffer_instruct_baichuan_001_model/

In [None]:
!rm model.tar.gz
!tar czvf model.tar.gz Buffer_instruct_deploy_code

In [None]:
# Upload the code archive under the code prefix and keep the value returned
# by upload_data for use as the model data URL.
s3_code_artifact = sess.upload_data(
    path="model.tar.gz",
    bucket=bucket,
    key_prefix=s3_code_prefix,
)
print(f"S3 Code or Model tar ball uploaded to --- > {s3_code_artifact}")

### 4. 创建模型 & 创建endpoint

In [None]:
from sagemaker.utils import name_from_base
import boto3

# NOTE: this rebinds ``model_name``, which earlier in the notebook held the
# Hugging Face repository id. name_from_base appends a timestamp.
model_name = name_from_base("buffer-instruct-baichuan-001")
print(model_name)
print(f"Image going to be used is ---- > {inference_image_uri}")

# Single container: the inference image plus the uploaded code tarball.
primary_container = {
    "Image": inference_image_uri,
    "ModelDataUrl": s3_code_artifact,
}
create_model_response = sm_client.create_model(
    ModelName=model_name,
    ExecutionRoleArn=role,
    PrimaryContainer=primary_container,
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

In [None]:
# Config and endpoint names are both derived from the model name.
endpoint_config_name = f"{model_name}-config"
endpoint_name = f"{model_name}-endpoint"

# One production variant backed by a single ml.g5.4xlarge instance.
variant = {
    "VariantName": "variant1",
    "ModelName": model_name,
    "InstanceType": "ml.g5.4xlarge",
    "InitialInstanceCount": 1,
    # Optional settings, left disabled:
    # "VolumeSizeInGB" : 400,
    # "ModelDataDownloadTimeoutInSeconds": 2400,
    "ContainerStartupHealthCheckTimeoutInSeconds": 15 * 60,  # 15 minutes
}
endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[variant],
)
# Bare expression so the notebook displays the response.
endpoint_config_response

In [None]:
# Start endpoint creation from the config defined above; the call returns
# before deployment finishes, so status is polled separately.
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name,
    EndpointConfigName=endpoint_config_name,
)
endpoint_arn = create_endpoint_response["EndpointArn"]
print(f"Created Endpoint: {endpoint_arn}")

#### 持续检测模型部署进度

In [None]:
import time

resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

# Poll once a minute for as long as the endpoint reports "Creating".
while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

# Surface the reason when deployment did not end in service, so a failure
# is not mistaken for a ready endpoint.
if status != "InService":
    print("FailureReason: " + resp.get("FailureReason", "<not reported>"))

### 5. 模型测试

In [None]:
%%time
import json
import boto3

# Runtime client used to invoke the endpoint.
smr_client = boto3.client("sagemaker-runtime")

# Sent as the request's "parameters" field; empty by default.
parameters = {}

In [None]:
# Sample question plus the background knowledge the model should answer from.
question1 = "请问AWS Clean Rooms是多方都会收费吗?"
context1 = "Q: 是否发起者和数据贡献者都会被收费?\nA: 是单方收费,只有查询的接收方会收费"
prompts_template = """{system_role_prompt},以下context内的文本内容为背景知识:\n\n{context}\n\n请根据背景知识, 回答这个问题:{question}"""
prompt = prompts_template.format(
    system_role_prompt="你是云服务AWS的智能客服机器人AWSBot",
    context=context1,
    question=question1,
)

# Request body; the handler in model.py reads "inputs", "parameters" and "history".
payload = {
    "inputs": prompt,
    "parameters": parameters,
    "context": "",
    "existing_answer": "",
    "history": [],
}
response_model = smr_client.invoke_endpoint(
    EndpointName=endpoint_name,
    Body=json.dumps(payload),
    ContentType="application/json",
)

# Bare expression so the notebook displays the decoded response body.
response_model["Body"].read().decode("utf8")

In [None]:
!aws sagemaker delete-endpoint --endpoint-name buffer-instruct-baichuan-001-2023-07-20-05-11-49-585-endpoint

In [None]:
!aws sagemaker delete-endpoint-config --endpoint-config-name buffer-instruct-baichuan-001-2023-07-20-05-11-49-585-config

In [None]:
!aws sagemaker delete-model --model-name buffer-instruct-baichuan-001-2023-07-20-05-11-49-585