{
"cells": [
{
"cell_type": "markdown",
"id": "cb60aea8-54a9-4cf8-9cff-5f9b2043036b",
"metadata": {},
"source": [
"# Preprocessing \n",
"* Container: codna_pytorch_py39"
]
},
{
"cell_type": "markdown",
"id": "6dc511d5-62aa-404b-9fbd-0f1a59446926",
"metadata": {},
"source": [
"## AutoReload"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "51b06f10-985d-4077-b729-e484c814f6b0",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"%load_ext autoreload\n",
"%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "0ba446e0-448f-4e07-9d5a-225c16bde8cb",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import boto3"
]
},
{
"cell_type": "markdown",
"id": "82fcf66a-7690-4ee7-bc5c-ecab6d254e84",
"metadata": {},
"source": [
"## 1. Processing-job for preprocessing"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "31244e49-1ef6-44b0-aa9a-8c279e3e9093",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"import os\n",
"import wget\n",
"import sagemaker\n",
"from sagemaker.pytorch.estimator import PyTorch\n",
"from sagemaker.workflow.execution_variables import ExecutionVariables\n",
"from sagemaker.processing import ProcessingInput, ProcessingOutput, FrameworkProcessor"
]
},
{
"cell_type": "markdown",
"id": "8d163960-9894-465c-ae15-78e94a590630",
"metadata": {},
"source": [
"## 2. parameter store 설정"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "831caf05-28f8-4dab-b96c-9b8574e064cf",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from utils.ssm import parameter_store\n",
"strRegionName=boto3.Session().region_name\n",
"pm = parameter_store(strRegionName)\n",
"prefix = pm.get_params(key=\"PREFIX\")"
]
},
{
"cell_type": "markdown",
"id": "fef44376-e6ac-43c2-b9e4-e18adb65a2ec",
"metadata": {},
"source": [
"* params for processing job"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "4e5789a5-57b6-4a7e-81bc-188e23c28b06",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"instance-type: local\n",
"image-uri: 419974056037.dkr.ecr.us-east-1.amazonaws.com/nemo-test-training\n",
"role: arn:aws:iam::419974056037:role/service-role/AmazonSageMaker-ExecutionRole-20221206T163436\n",
"bucket: sm-nemo-ramp\n",
"dataset-path: /home/ec2-user/SageMaker/nemo-on-sagemaker/1.building-component/data\n",
"sagemaker_session: \n",
"git_config: {'repo': 'https://git-codecommit.us-east-1.amazonaws.com/v1/repos/nemo-code', 'branch': 'main', 'username': 'dongjin-at-419974056037', 'password': 'wtLv/fP4ESjBDnyW5xgqFPGR0dMTIyK5/8gK6IS1Zsg='}\n"
]
}
],
"source": [
"local_mode = True\n",
"\n",
"if local_mode: \n",
" instance_type = 'local'\n",
" \n",
" import os\n",
" from sagemaker.local import LocalSession\n",
" \n",
" sagemaker_session = LocalSession()\n",
" data_path = os.path.join(os.getcwd(), \"data\")\n",
" \n",
"else:\n",
" instance_type = \"ml.m5.xlarge\" ## \"ml.g4dn.xlarge\"\n",
" sagemaker_session = sagemaker.Session()\n",
" data_path = pm.get_params(key=prefix + '-S3-DATA-PATH')\n",
" \n",
"git_config = {\n",
" 'repo': f'https://{pm.get_params(key=\"-\".join([prefix, \"CODE_REPO\"]))}',\n",
" 'branch': 'main',\n",
" 'username': pm.get_params(key=\"-\".join([prefix, \"CODECOMMIT-USERNAME\"]), enc=True),\n",
" 'password': pm.get_params(key=\"-\".join([prefix, \"CODECOMMIT-PWD\"]), enc=True)\n",
"} \n",
" \n",
"print (f\"instance-type: {instance_type}\")\n",
"print (f\"image-uri: {pm.get_params(key=''.join([prefix, '-IMAGE-URI']))}\")\n",
"print (f\"role: {pm.get_params(key=prefix + '-SAGEMAKER-ROLE-ARN')}\")\n",
"print (f\"bucket: {pm.get_params(key=prefix + '-BUCKET')}\")\n",
"print (f\"dataset-path: {data_path}\")\n",
"print (f\"sagemaker_session: {sagemaker_session}\")\n",
"print (f\"git_config: {git_config}\")"
]
},
{
"cell_type": "markdown",
"id": "5b1d1a28-03f1-4f00-ad79-5786b15b0d46",
"metadata": {},
"source": [
"* Define processing job"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "f75ec691-0a7c-450f-9c6f-dfa75f44d22c",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"dataset_processor = FrameworkProcessor(\n",
" estimator_cls=PyTorch,\n",
" framework_version=None,\n",
" image_uri=pm.get_params(key=''.join([prefix, \"-IMAGE-URI\"])),\n",
" instance_type=instance_type,\n",
" instance_count=1,\n",
" role=pm.get_params(key=prefix + \"-SAGEMAKER-ROLE-ARN\"),\n",
" base_job_name=\"preprocessing\", # bucket에 보이는 이름 (pipeline으로 묶으면 pipeline에서 정의한 이름으로 bucket에 보임)\n",
" sagemaker_session=sagemaker_session\n",
")\n",
"\n",
"proc_prefix = \"/opt/ml/processing\"\n",
"\n",
"output_path = os.path.join(\n",
" \"s3://{}\".format(pm.get_params(key=prefix + \"-BUCKET\")),\n",
" prefix,\n",
" \"preprocessing\",\n",
" \"data\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "c1406d98-45eb-4adf-87ca-730d5b74b853",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"'s3://sm-nemo-ramp/nemo-asr/preprocessing/data'"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"output_path"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "8cfa4f86-9bb8-47cb-8ca5-e318c0470836",
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
},
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Cloning into '/tmp/tmpn_1lg81x'...\n",
"remote: Counting objects: 20, done. \n",
"Already on 'main'\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Your branch is up to date with 'origin/main'.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:sagemaker:Creating processing-job with name preprocessing-2023-03-22-10-16-10-483\n",
"INFO:sagemaker.local.local_session:Starting processing job\n",
"INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole\n",
"INFO:sagemaker.local.image:No AWS credentials found in session but credentials from EC2 Metadata Service are available.\n",
"INFO:sagemaker.local.image:docker compose file: \n",
"networks:\n",
" sagemaker-local:\n",
" name: sagemaker-local\n",
"services:\n",
" algo-1-w6h0y:\n",
" container_name: jv6cmky3ha-algo-1-w6h0y\n",
" entrypoint:\n",
" - /bin/bash\n",
" - /opt/ml/processing/input/entrypoint/runproc.sh\n",
" - --proc_prefix\n",
" - /opt/ml/processing\n",
" - --train_mount_dir\n",
" - /opt/ml/input/data/training/\n",
" - --test_mount_dir\n",
" - /opt/ml/input/data/testing/\n",
" environment: []\n",
" image: 419974056037.dkr.ecr.us-east-1.amazonaws.com/nemo-test-training\n",
" networks:\n",
" sagemaker-local:\n",
" aliases:\n",
" - algo-1-w6h0y\n",
" stdin_open: true\n",
" tty: true\n",
" volumes:\n",
" - /tmp/tmpczdb0bse/algo-1-w6h0y/config:/opt/ml/config\n",
" - /tmp/tmpczdb0bse/algo-1-w6h0y/output:/opt/ml/output\n",
" - /tmp/tmp1xqg7ou8:/opt/ml/processing/input\n",
" - /tmp/tmpd4p6wk4_:/opt/ml/processing/input/code/\n",
" - /tmp/tmpgp5p1b0n:/opt/ml/processing/input/entrypoint\n",
" - /tmp/tmp13wj1kvh/output/output-data:/opt/ml/processing/output\n",
" - /tmp/tmpczdb0bse/shared:/opt/ml/shared\n",
"version: '2.3'\n",
"\n",
"INFO:sagemaker.local.image:docker command: docker-compose -f /tmp/tmpczdb0bse/docker-compose.yaml up --build --abort-on-container-exit\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Creating jv6cmky3ha-algo-1-w6h0y ... \n",
"Creating jv6cmky3ha-algo-1-w6h0y ... done\n",
"Attaching to jv6cmky3ha-algo-1-w6h0y\n",
"\u001b[36mjv6cmky3ha-algo-1-w6h0y |\u001b[0m Received arguments Namespace(proc_prefix='/opt/ml/processing', train_mount_dir='/opt/ml/input/data/training/', test_mount_dir='/opt/ml/input/data/testing/')\n",
"\u001b[36mjv6cmky3ha-algo-1-w6h0y |\u001b[0m Converting .sph to .wav...\n",
"\u001b[36mjv6cmky3ha-algo-1-w6h0y |\u001b[0m Finished conversion.\n",
"\u001b[36mjv6cmky3ha-algo-1-w6h0y |\u001b[0m ******\n",
"\u001b[36mjv6cmky3ha-algo-1-w6h0y |\u001b[0m ******\n",
"\u001b[36mjv6cmky3ha-algo-1-w6h0y |\u001b[0m Training manifest created.\n",
"\u001b[36mjv6cmky3ha-algo-1-w6h0y |\u001b[0m Test manifest created.\n",
"\u001b[36mjv6cmky3ha-algo-1-w6h0y |\u001b[0m ***Done***\n",
"\u001b[36mjv6cmky3ha-algo-1-w6h0y |\u001b[0m data_dir ['entrypoint', 'code', 'an4']\n",
"\u001b[36mjv6cmky3ha-algo-1-w6h0y |\u001b[0m self.output_dir ['an4']\n",
"\u001b[36mjv6cmky3ha-algo-1-w6h0y exited with code 0\n",
"\u001b[0mAborting on container exit...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:sagemaker.local.utils:Failed to delete: /tmp/tmp13wj1kvh/output/output-data Please remove it manually.\n",
"WARNING:sagemaker.local.image:Failed to delete: /tmp/tmp13wj1kvh Please remove it manually.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"===== Job Complete =====\n"
]
}
],
"source": [
"dataset_processor.run(\n",
" #job_name=\"preprocessing\", ## 이걸 넣어야 캐시가 작동함, 안그러면 프로세서의 base_job_name 이름뒤에 날짜 시간이 붙어서 캐시 동작 안함\n",
" code='preprocessing.py', #소스 디렉토리 안에서 파일 path\n",
" source_dir= \"./code\", #현재 파일에서 소스 디렉토리 상대경로 # add processing.py and requirements.txt here\n",
" git_config=git_config,\n",
" inputs=[\n",
" ProcessingInput(\n",
" input_name=\"input-data\",\n",
" source=data_path,\n",
" destination=os.path.join(proc_prefix, \"input\")\n",
" ),\n",
" ],\n",
" outputs=[ \n",
" ProcessingOutput(\n",
" output_name=\"output-data\",\n",
" source=os.path.join(proc_prefix, \"output\"),\n",
" destination=output_path\n",
" ),\n",
" ],\n",
" arguments=[\"--proc_prefix\", proc_prefix, \\\n",
" \"--train_mount_dir\", \"/opt/ml/input/data/training/\", \\\n",
" \"--test_mount_dir\", \"/opt/ml/input/data/testing/\"],\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "2b39af55-7e48-4730-aa54-01350c162084",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"'s3://sm-nemo-ramp/nemo-asr/preprocessing/data'"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"!aws s3 sync $output_path ./data/preprocessing --quiet\n",
"output_path"
]
},
{
"cell_type": "markdown",
"id": "d86fd8d5-b842-47a1-aa76-fa3043557afc",
"metadata": {},
"source": [
"## 3. parameter store에 Processing output 추가"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "e3328b4c-e16d-4042-aa81-d2ac611b8efe",
"metadata": {
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"'Store suceess'"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pm.put_params(key=\"-\".join([prefix, \"PREP-DATA-PATH\"]), value=output_path, overwrite=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bd3268c2-50c1-4462-bb3c-f149a17886fc",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "conda_pytorch_p39",
"language": "python",
"name": "conda_pytorch_p39"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.15"
}
},
"nbformat": 4,
"nbformat_minor": 5
}