{ "cells": [ { "cell_type": "markdown", "id": "cb60aea8-54a9-4cf8-9cff-5f9b2043036b", "metadata": {}, "source": [ "# Preprocessing \n", "* Container: codna_pytorch_py39" ] }, { "cell_type": "markdown", "id": "6dc511d5-62aa-404b-9fbd-0f1a59446926", "metadata": {}, "source": [ "## AutoReload" ] }, { "cell_type": "code", "execution_count": 1, "id": "51b06f10-985d-4077-b729-e484c814f6b0", "metadata": { "tags": [] }, "outputs": [], "source": [ "%load_ext autoreload\n", "%autoreload 2" ] }, { "cell_type": "code", "execution_count": 2, "id": "0ba446e0-448f-4e07-9d5a-225c16bde8cb", "metadata": { "tags": [] }, "outputs": [], "source": [ "import boto3" ] }, { "cell_type": "markdown", "id": "82fcf66a-7690-4ee7-bc5c-ecab6d254e84", "metadata": {}, "source": [ "## 1. Processing-job for preprocessing" ] }, { "cell_type": "code", "execution_count": 3, "id": "31244e49-1ef6-44b0-aa9a-8c279e3e9093", "metadata": { "tags": [] }, "outputs": [], "source": [ "import os\n", "import wget\n", "import sagemaker\n", "from sagemaker.pytorch.estimator import PyTorch\n", "from sagemaker.workflow.execution_variables import ExecutionVariables\n", "from sagemaker.processing import ProcessingInput, ProcessingOutput, FrameworkProcessor" ] }, { "cell_type": "markdown", "id": "8d163960-9894-465c-ae15-78e94a590630", "metadata": {}, "source": [ "## 2. parameter store 설정" ] }, { "cell_type": "code", "execution_count": 4, "id": "831caf05-28f8-4dab-b96c-9b8574e064cf", "metadata": { "tags": [] }, "outputs": [], "source": [ "from utils.ssm import parameter_store\n", "strRegionName=boto3.Session().region_name\n", "pm = parameter_store(strRegionName)\n", "prefix = pm.get_params(key=\"PREFIX\")" ] }, { "cell_type": "markdown", "id": "fef44376-e6ac-43c2-b9e4-e18adb65a2ec", "metadata": {}, "source": [ "* params for processing job" ] }, { "cell_type": "code", "execution_count": 14, "id": "4e5789a5-57b6-4a7e-81bc-188e23c28b06", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "instance-type: local\n", "image-uri: 419974056037.dkr.ecr.us-east-1.amazonaws.com/nemo-test-training\n", "role: arn:aws:iam::419974056037:role/service-role/AmazonSageMaker-ExecutionRole-20221206T163436\n", "bucket: sm-nemo-ramp\n", "dataset-path: /home/ec2-user/SageMaker/nemo-on-sagemaker/1.building-component/data\n", "sagemaker_session: \n", "git_config: {'repo': 'https://git-codecommit.us-east-1.amazonaws.com/v1/repos/nemo-code', 'branch': 'main', 'username': 'dongjin-at-419974056037', 'password': 'wtLv/fP4ESjBDnyW5xgqFPGR0dMTIyK5/8gK6IS1Zsg='}\n" ] } ], "source": [ "local_mode = True\n", "\n", "if local_mode: \n", " instance_type = 'local'\n", " \n", " import os\n", " from sagemaker.local import LocalSession\n", " \n", " sagemaker_session = LocalSession()\n", " data_path = os.path.join(os.getcwd(), \"data\")\n", " \n", "else:\n", " instance_type = \"ml.m5.xlarge\" ## \"ml.g4dn.xlarge\"\n", " sagemaker_session = sagemaker.Session()\n", " data_path = pm.get_params(key=prefix + '-S3-DATA-PATH')\n", " \n", "git_config = {\n", " 'repo': f'https://{pm.get_params(key=\"-\".join([prefix, \"CODE_REPO\"]))}',\n", " 'branch': 'main',\n", " 'username': pm.get_params(key=\"-\".join([prefix, \"CODECOMMIT-USERNAME\"]), enc=True),\n", " 'password': pm.get_params(key=\"-\".join([prefix, \"CODECOMMIT-PWD\"]), enc=True)\n", "} \n", " \n", "print (f\"instance-type: {instance_type}\")\n", "print (f\"image-uri: {pm.get_params(key=''.join([prefix, '-IMAGE-URI']))}\")\n", "print (f\"role: {pm.get_params(key=prefix + '-SAGEMAKER-ROLE-ARN')}\")\n", "print (f\"bucket: {pm.get_params(key=prefix + '-BUCKET')}\")\n", "print (f\"dataset-path: {data_path}\")\n", "print (f\"sagemaker_session: {sagemaker_session}\")\n", "print (f\"git_config: {git_config}\")" ] }, { "cell_type": "markdown", "id": "5b1d1a28-03f1-4f00-ad79-5786b15b0d46", "metadata": {}, "source": [ "* Define processing job" ] }, { "cell_type": "code", "execution_count": 15, "id": "f75ec691-0a7c-450f-9c6f-dfa75f44d22c", "metadata": { "tags": [] }, "outputs": [], "source": [ "dataset_processor = FrameworkProcessor(\n", " estimator_cls=PyTorch,\n", " framework_version=None,\n", " image_uri=pm.get_params(key=''.join([prefix, \"-IMAGE-URI\"])),\n", " instance_type=instance_type,\n", " instance_count=1,\n", " role=pm.get_params(key=prefix + \"-SAGEMAKER-ROLE-ARN\"),\n", " base_job_name=\"preprocessing\", # bucket에 보이는 이름 (pipeline으로 묶으면 pipeline에서 정의한 이름으로 bucket에 보임)\n", " sagemaker_session=sagemaker_session\n", ")\n", "\n", "proc_prefix = \"/opt/ml/processing\"\n", "\n", "output_path = os.path.join(\n", " \"s3://{}\".format(pm.get_params(key=prefix + \"-BUCKET\")),\n", " prefix,\n", " \"preprocessing\",\n", " \"data\"\n", ")" ] }, { "cell_type": "code", "execution_count": 16, "id": "c1406d98-45eb-4adf-87ca-730d5b74b853", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "'s3://sm-nemo-ramp/nemo-asr/preprocessing/data'" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "output_path" ] }, { "cell_type": "code", "execution_count": 19, "id": "8cfa4f86-9bb8-47cb-8ca5-e318c0470836", "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Cloning into '/tmp/tmpn_1lg81x'...\n", "remote: Counting objects: 20, done. \n", "Already on 'main'\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Your branch is up to date with 'origin/main'.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "INFO:sagemaker:Creating processing-job with name preprocessing-2023-03-22-10-16-10-483\n", "INFO:sagemaker.local.local_session:Starting processing job\n", "INFO:botocore.credentials:Found credentials from IAM Role: BaseNotebookInstanceEc2InstanceRole\n", "INFO:sagemaker.local.image:No AWS credentials found in session but credentials from EC2 Metadata Service are available.\n", "INFO:sagemaker.local.image:docker compose file: \n", "networks:\n", " sagemaker-local:\n", " name: sagemaker-local\n", "services:\n", " algo-1-w6h0y:\n", " container_name: jv6cmky3ha-algo-1-w6h0y\n", " entrypoint:\n", " - /bin/bash\n", " - /opt/ml/processing/input/entrypoint/runproc.sh\n", " - --proc_prefix\n", " - /opt/ml/processing\n", " - --train_mount_dir\n", " - /opt/ml/input/data/training/\n", " - --test_mount_dir\n", " - /opt/ml/input/data/testing/\n", " environment: []\n", " image: 419974056037.dkr.ecr.us-east-1.amazonaws.com/nemo-test-training\n", " networks:\n", " sagemaker-local:\n", " aliases:\n", " - algo-1-w6h0y\n", " stdin_open: true\n", " tty: true\n", " volumes:\n", " - /tmp/tmpczdb0bse/algo-1-w6h0y/config:/opt/ml/config\n", " - /tmp/tmpczdb0bse/algo-1-w6h0y/output:/opt/ml/output\n", " - /tmp/tmp1xqg7ou8:/opt/ml/processing/input\n", " - /tmp/tmpd4p6wk4_:/opt/ml/processing/input/code/\n", " - /tmp/tmpgp5p1b0n:/opt/ml/processing/input/entrypoint\n", " - /tmp/tmp13wj1kvh/output/output-data:/opt/ml/processing/output\n", " - /tmp/tmpczdb0bse/shared:/opt/ml/shared\n", "version: '2.3'\n", "\n", "INFO:sagemaker.local.image:docker command: docker-compose -f /tmp/tmpczdb0bse/docker-compose.yaml up --build --abort-on-container-exit\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Creating jv6cmky3ha-algo-1-w6h0y ... \n", "Creating jv6cmky3ha-algo-1-w6h0y ... done\n", "Attaching to jv6cmky3ha-algo-1-w6h0y\n", "\u001b[36mjv6cmky3ha-algo-1-w6h0y |\u001b[0m Received arguments Namespace(proc_prefix='/opt/ml/processing', train_mount_dir='/opt/ml/input/data/training/', test_mount_dir='/opt/ml/input/data/testing/')\n", "\u001b[36mjv6cmky3ha-algo-1-w6h0y |\u001b[0m Converting .sph to .wav...\n", "\u001b[36mjv6cmky3ha-algo-1-w6h0y |\u001b[0m Finished conversion.\n", "\u001b[36mjv6cmky3ha-algo-1-w6h0y |\u001b[0m ******\n", "\u001b[36mjv6cmky3ha-algo-1-w6h0y |\u001b[0m ******\n", "\u001b[36mjv6cmky3ha-algo-1-w6h0y |\u001b[0m Training manifest created.\n", "\u001b[36mjv6cmky3ha-algo-1-w6h0y |\u001b[0m Test manifest created.\n", "\u001b[36mjv6cmky3ha-algo-1-w6h0y |\u001b[0m ***Done***\n", "\u001b[36mjv6cmky3ha-algo-1-w6h0y |\u001b[0m data_dir ['entrypoint', 'code', 'an4']\n", "\u001b[36mjv6cmky3ha-algo-1-w6h0y |\u001b[0m self.output_dir ['an4']\n", "\u001b[36mjv6cmky3ha-algo-1-w6h0y exited with code 0\n", "\u001b[0mAborting on container exit...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "WARNING:sagemaker.local.utils:Failed to delete: /tmp/tmp13wj1kvh/output/output-data Please remove it manually.\n", "WARNING:sagemaker.local.image:Failed to delete: /tmp/tmp13wj1kvh Please remove it manually.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "===== Job Complete =====\n" ] } ], "source": [ "dataset_processor.run(\n", " #job_name=\"preprocessing\", ## 이걸 넣어야 캐시가 작동함, 안그러면 프로세서의 base_job_name 이름뒤에 날짜 시간이 붙어서 캐시 동작 안함\n", " code='preprocessing.py', #소스 디렉토리 안에서 파일 path\n", " source_dir= \"./code\", #현재 파일에서 소스 디렉토리 상대경로 # add processing.py and requirements.txt here\n", " git_config=git_config,\n", " inputs=[\n", " ProcessingInput(\n", " input_name=\"input-data\",\n", " source=data_path,\n", " destination=os.path.join(proc_prefix, \"input\")\n", " ),\n", " ],\n", " outputs=[ \n", " ProcessingOutput(\n", " output_name=\"output-data\",\n", " source=os.path.join(proc_prefix, \"output\"),\n", " destination=output_path\n", " ),\n", " ],\n", " arguments=[\"--proc_prefix\", proc_prefix, \\\n", " \"--train_mount_dir\", \"/opt/ml/input/data/training/\", \\\n", " \"--test_mount_dir\", \"/opt/ml/input/data/testing/\"],\n", ")" ] }, { "cell_type": "code", "execution_count": 20, "id": "2b39af55-7e48-4730-aa54-01350c162084", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "'s3://sm-nemo-ramp/nemo-asr/preprocessing/data'" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "!aws s3 sync $output_path ./data/preprocessing --quiet\n", "output_path" ] }, { "cell_type": "markdown", "id": "d86fd8d5-b842-47a1-aa76-fa3043557afc", "metadata": {}, "source": [ "## 3. parameter store에 Processing output 추가" ] }, { "cell_type": "code", "execution_count": 21, "id": "e3328b4c-e16d-4042-aa81-d2ac611b8efe", "metadata": { "tags": [] }, "outputs": [ { "data": { "text/plain": [ "'Store suceess'" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pm.put_params(key=\"-\".join([prefix, \"PREP-DATA-PATH\"]), value=output_path, overwrite=True)" ] }, { "cell_type": "code", "execution_count": null, "id": "bd3268c2-50c1-4462-bb3c-f149a17886fc", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "conda_pytorch_p39", "language": "python", "name": "conda_pytorch_p39" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.15" } }, "nbformat": 4, "nbformat_minor": 5 }