{ "cells": [
 { "cell_type": "code", "execution_count": 3, "id": "8314fc9b-c468-497b-abcc-259ec792154c", "metadata": { "tags": [] }, "outputs": [],
 "source": [
 "import boto3\n",
 "import sagemaker\n",
 "from sagemaker.pytorch import PyTorch\n",
 "\n",
 "# SageMaker session, the execution role, and the session's default S3 bucket.\n",
 "sagemaker_session = sagemaker.Session()\n",
 "role = sagemaker.get_execution_role()\n",
 "bucket = sagemaker_session.default_bucket()\n",
 "\n",
 "# AWS account ID (looked up through STS) and the region of the current boto3 session.\n",
 "account_id = boto3.client('sts').get_caller_identity().get('Account')\n",
 "region_name = boto3.session.Session().region_name\n",
 "\n",
 "# S3 prefix under the default bucket that holds the LoRA training images.\n",
 "images_s3uri = 's3://{0}/lora/images/'.format(bucket)" ] },
 { "cell_type": "code", "execution_count": 19, "id": "516418fb-5755-4e40-b0df-ca80c085067e", "metadata": { "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
 "upload: images/.ipynb_checkpoints/dataset-checkpoint.toml to s3://sagemaker-ap-southeast-1-687912291502/dreambooth/images/.ipynb_checkpoints/dataset-checkpoint.toml\n",
 "upload: images/00000-0-IMG20230302163943.txt to s3://sagemaker-ap-southeast-1-687912291502/dreambooth/images/00000-0-IMG20230302163943.txt\n",
 "upload: images/00002-0-IMG20230302163946.txt to s3://sagemaker-ap-southeast-1-687912291502/dreambooth/images/00002-0-IMG20230302163946.txt\n",
 "upload: images/00003-0-IMG20230302163947.txt to s3://sagemaker-ap-southeast-1-687912291502/dreambooth/images/00003-0-IMG20230302163947.txt\n",
 "upload: images/dataset.toml to s3://sagemaker-ap-southeast-1-687912291502/dreambooth/images/dataset.toml\n",
 "upload: images/00004-0-IMG20230302163948.txt to s3://sagemaker-ap-southeast-1-687912291502/dreambooth/images/00004-0-IMG20230302163948.txt\n",
 "upload: images/00000-0-IMG20230302163943.png to s3://sagemaker-ap-southeast-1-687912291502/dreambooth/images/00000-0-IMG20230302163943.png\n",
 "upload: images/00001-0-IMG20230302163945.txt to s3://sagemaker-ap-southeast-1-687912291502/dreambooth/images/00001-0-IMG20230302163945.txt\n",
 "upload: images/00005-0-IMG20230302163954.txt to 
s3://sagemaker-ap-southeast-1-687912291502/dreambooth/images/00005-0-IMG20230302163954.txt\n",
 "upload: images/00006-0-IMG20230302163956.txt to s3://sagemaker-ap-southeast-1-687912291502/dreambooth/images/00006-0-IMG20230302163956.txt\n",
 "upload: images/00001-0-IMG20230302163945.png to s3://sagemaker-ap-southeast-1-687912291502/dreambooth/images/00001-0-IMG20230302163945.png\n",
 "upload: images/00004-0-IMG20230302163948.png to s3://sagemaker-ap-southeast-1-687912291502/dreambooth/images/00004-0-IMG20230302163948.png\n",
 "upload: images/00003-0-IMG20230302163947.png to s3://sagemaker-ap-southeast-1-687912291502/dreambooth/images/00003-0-IMG20230302163947.png\n",
 "upload: images/00002-0-IMG20230302163946.png to s3://sagemaker-ap-southeast-1-687912291502/dreambooth/images/00002-0-IMG20230302163946.png\n",
 "upload: images/00006-0-IMG20230302163956.png to s3://sagemaker-ap-southeast-1-687912291502/dreambooth/images/00006-0-IMG20230302163956.png\n",
 "upload: images/00005-0-IMG20230302163954.png to s3://sagemaker-ap-southeast-1-687912291502/dreambooth/images/00005-0-IMG20230302163954.png\n" ] } ],
 "source": [
 "# NOTE: the LoRA training image directory must contain a metadata.jsonl file,\n",
 "# and the file_name field in it is required.\n",
 "# Optional (disabled): download the sample images into ./images/ first.\n",
 "#imgs=\"https://d1xkebsgyt7kzd.cloudfront.net/R_1.jpg,https://d1xkebsgyt7kzd.cloudfront.net/R_2.jpg,https://d1xkebsgyt7kzd.cloudfront.net/R_3.jpg,https://d1xkebsgyt7kzd.cloudfront.net/R_4.jpg,https://d1xkebsgyt7kzd.cloudfront.net/R_5.jpg\"\n",
 "#for image in imgs.split(\",\"):\n",
 "# !wget $image\n",
 "#!mv ./*.jpg ./images/\n",
 "# Upload the local images directory to the training prefix. Jupyter's auto-save\n",
 "# folders are excluded; without the filter .ipynb_checkpoints/* is uploaded too\n",
 "# and ends up next to the training data.\n",
 "!aws s3 cp images $images_s3uri --recursive --exclude '*.ipynb_checkpoints/*'" ] },
 { "cell_type": "code", "execution_count": 20, "id": "3656f142-057c-4bc9-bd38-ea318b8c4865", "metadata": {}, "outputs": [],
 "source": [
 "# Alternative training image (disabled):\n",
 "#image_uri = '{0}.dkr.ecr.{1}.amazonaws.com/all-in-one-ai-stable-diffusion-webui-training'.format(account_id, region_name)\n",
 "# ECR URI of the lora-finetuning image, built from the caller's account ID and\n",
 "# region (both set in the first cell) instead of a hard-coded account/region,\n",
 "# so the notebook also works outside the original account.\n",
 "image_uri = '{0}.dkr.ecr.{1}.amazonaws.com/lora-finetuning:latest'.format(account_id, region_name)\n",
 "# S3 URI of the LoRA model archive under the default bucket.\n",
 "models_s3uri = 's3://{}/stable-diffusion/models/{}/{}'.format(bucket,'lora','model.tar.gz')" ] },
 { "cell_type": "markdown", "id": "757db6d4-5b2d-486c-8329-0995b21f25cf", "metadata": {}, "source": [ "## Training with the pure open-source approach (纯开源方式 train)" ] },
 { "cell_type": "code", "execution_count": 24, "id": "38dda204-a307-4776-b907-e8e3548df905", "metadata": { "collapsed": true, "jupyter": { "outputs_hidden": true }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [
 "mixed_precision fp16\n",
 "pretrained_model_name_or_path runwayml/stable-diffusion-v1-5\n",
 "dataset_name lambdalabs/pokemon-blip-captions\n",
 "dataloader_num_workers 8\n",
 "max_grad_norm 1\n",
 "output_dir /opt/ml/model/\n",
 "checkpointing_steps 1000\n",
 "validation_prompt Totoro\n",
 "seed 1338\n",
 "manul_upload_model_path s3://sagemaker-ap-southeast-1-687912291502/lora/trained_models\n",
 "resolution 512\n",
 "train_batch_size 1\n",
 "gradient_accumulation_steps 4\n",
 "learning_rate 2e-06\n",
 "lr_scheduler cosine\n",
 "lr_warmup_steps 0\n",
 "max_train_steps 1000\n",
 "2023-03-06 14:44:56 Starting - Starting the training job...\n",
 "2023-03-06 14:45:11 Starting - Preparing the instances for trainingProfilerReport-1678113896: InProgress\n",
 "......\n",
 "2023-03-06 14:46:18 Downloading - Downloading input data...\n",
 "2023-03-06 14:46:38 Training - Downloading the training image.....................\n",
 "2023-03-06 14:50:19 Training - Training image download completed. 
Training in progress..\u001b[34m2023-03-06 14:50:20,468 sagemaker-training-toolkit INFO Installing dependencies from requirements.txt:\u001b[0m\n", "\u001b[34m/opt/conda/bin/python -m pip install -r requirements.txt\u001b[0m\n", "\u001b[34mRequirement already satisfied: datasets in /opt/conda/lib/python3.9/site-packages (from -r requirements.txt (line 1)) (2.10.1)\u001b[0m\n", "\u001b[34mRequirement already satisfied: diffusers in /opt/conda/lib/python3.9/site-packages (from -r requirements.txt (line 2)) (0.14.0)\u001b[0m\n", "\u001b[34mRequirement already satisfied: accelerate in /opt/conda/lib/python3.9/site-packages (from -r requirements.txt (line 3)) (0.16.0)\u001b[0m\n", "\u001b[34mRequirement already satisfied: numpy==1.23.3 in /opt/conda/lib/python3.9/site-packages (from -r requirements.txt (line 4)) (1.23.3)\u001b[0m\n", "\u001b[34mRequirement already satisfied: torch in /opt/conda/lib/python3.9/site-packages (from -r requirements.txt (line 5)) (1.12.1)\u001b[0m\n", "\u001b[34mRequirement already satisfied: deepspeed==0.7.4 in /opt/conda/lib/python3.9/site-packages (from -r requirements.txt (line 6)) (0.7.4)\u001b[0m\n", "\u001b[34mRequirement already satisfied: ninja in /opt/conda/lib/python3.9/site-packages (from -r requirements.txt (line 7)) (1.11.1)\u001b[0m\n", "\u001b[34mRequirement already satisfied: torchvision in /opt/conda/lib/python3.9/site-packages (from -r requirements.txt (line 8)) (0.13.1)\u001b[0m\n", "\u001b[34mRequirement already satisfied: transformers>=4.25.1 in /opt/conda/lib/python3.9/site-packages (from -r requirements.txt (line 9)) (4.26.1)\u001b[0m\n", "\u001b[34mRequirement already satisfied: ftfy in /opt/conda/lib/python3.9/site-packages (from -r requirements.txt (line 10)) (6.1.1)\u001b[0m\n", "\u001b[34mRequirement already satisfied: tensorboard in /opt/conda/lib/python3.9/site-packages (from -r requirements.txt (line 11)) (2.12.0)\u001b[0m\n", "\u001b[34mRequirement already satisfied: Jinja2 in 
/opt/conda/lib/python3.9/site-packages (from -r requirements.txt (line 12)) (3.1.2)\u001b[0m\n", "\u001b[34mRequirement already satisfied: pydantic in /opt/conda/lib/python3.9/site-packages (from deepspeed==0.7.4->-r requirements.txt (line 6)) (1.10.5)\u001b[0m\n", "\u001b[34mRequirement already satisfied: tqdm in /opt/conda/lib/python3.9/site-packages (from deepspeed==0.7.4->-r requirements.txt (line 6)) (4.63.0)\u001b[0m\n", "\u001b[34mRequirement already satisfied: py-cpuinfo in /opt/conda/lib/python3.9/site-packages (from deepspeed==0.7.4->-r requirements.txt (line 6)) (9.0.0)\u001b[0m\n", "\u001b[34mRequirement already satisfied: packaging in /opt/conda/lib/python3.9/site-packages (from deepspeed==0.7.4->-r requirements.txt (line 6)) (23.0)\u001b[0m\n", "\u001b[34mRequirement already satisfied: psutil in /opt/conda/lib/python3.9/site-packages (from deepspeed==0.7.4->-r requirements.txt (line 6)) (5.9.4)\u001b[0m\n", "\u001b[34mRequirement already satisfied: hjson in /opt/conda/lib/python3.9/site-packages (from deepspeed==0.7.4->-r requirements.txt (line 6)) (3.1.0)\u001b[0m\n", "\u001b[34mRequirement already satisfied: aiohttp in /opt/conda/lib/python3.9/site-packages (from datasets->-r requirements.txt (line 1)) (3.8.4)\u001b[0m\n", "\u001b[34mRequirement already satisfied: multiprocess in /opt/conda/lib/python3.9/site-packages (from datasets->-r requirements.txt (line 1)) (0.70.14)\u001b[0m\n", "\u001b[34mRequirement already satisfied: dill<0.3.7,>=0.3.0 in /opt/conda/lib/python3.9/site-packages (from datasets->-r requirements.txt (line 1)) (0.3.6)\u001b[0m\n", "\u001b[34mRequirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.9/site-packages (from datasets->-r requirements.txt (line 1)) (6.0)\u001b[0m\n", "\u001b[34mRequirement already satisfied: requests>=2.19.0 in /opt/conda/lib/python3.9/site-packages (from datasets->-r requirements.txt (line 1)) (2.27.1)\u001b[0m\n", "\u001b[34mRequirement already satisfied: responses<0.19 in 
/opt/conda/lib/python3.9/site-packages (from datasets->-r requirements.txt (line 1)) (0.18.0)\u001b[0m\n", "\u001b[34mRequirement already satisfied: fsspec[http]>=2021.11.1 in /opt/conda/lib/python3.9/site-packages (from datasets->-r requirements.txt (line 1)) (2023.3.0)\u001b[0m\n", "\u001b[34mRequirement already satisfied: huggingface-hub<1.0.0,>=0.2.0 in /opt/conda/lib/python3.9/site-packages (from datasets->-r requirements.txt (line 1)) (0.12.1)\u001b[0m\n", "\u001b[34mRequirement already satisfied: xxhash in /opt/conda/lib/python3.9/site-packages (from datasets->-r requirements.txt (line 1)) (3.2.0)\u001b[0m\n", "\u001b[34mRequirement already satisfied: pyarrow>=6.0.0 in /opt/conda/lib/python3.9/site-packages (from datasets->-r requirements.txt (line 1)) (11.0.0)\u001b[0m\n", "\u001b[34mRequirement already satisfied: pandas in /opt/conda/lib/python3.9/site-packages (from datasets->-r requirements.txt (line 1)) (1.5.3)\u001b[0m\n", "\u001b[34mRequirement already satisfied: importlib-metadata in /opt/conda/lib/python3.9/site-packages (from diffusers->-r requirements.txt (line 2)) (6.0.0)\u001b[0m\n", "\u001b[34mRequirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.9/site-packages (from diffusers->-r requirements.txt (line 2)) (2022.10.31)\u001b[0m\n", "\u001b[34mRequirement already satisfied: filelock in /opt/conda/lib/python3.9/site-packages (from diffusers->-r requirements.txt (line 2)) (3.9.0)\u001b[0m\n", "\u001b[34mRequirement already satisfied: Pillow in /opt/conda/lib/python3.9/site-packages (from diffusers->-r requirements.txt (line 2)) (9.0.1)\u001b[0m\n", "\u001b[34mRequirement already satisfied: typing_extensions in /opt/conda/lib/python3.9/site-packages (from torch->-r requirements.txt (line 5)) (4.3.0)\u001b[0m\n", "\u001b[34mRequirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /opt/conda/lib/python3.9/site-packages (from transformers>=4.25.1->-r requirements.txt (line 9)) (0.13.2)\u001b[0m\n", 
"\u001b[34mRequirement already satisfied: wcwidth>=0.2.5 in /opt/conda/lib/python3.9/site-packages (from ftfy->-r requirements.txt (line 10)) (0.2.5)\u001b[0m\n", "\u001b[34mRequirement already satisfied: wheel>=0.26 in /opt/conda/lib/python3.9/site-packages (from tensorboard->-r requirements.txt (line 11)) (0.37.1)\u001b[0m\n", "\u001b[34mRequirement already satisfied: tensorboard-plugin-wit>=1.6.0 in /opt/conda/lib/python3.9/site-packages (from tensorboard->-r requirements.txt (line 11)) (1.8.1)\u001b[0m\n", "\u001b[34mRequirement already satisfied: protobuf>=3.19.6 in /opt/conda/lib/python3.9/site-packages (from tensorboard->-r requirements.txt (line 11)) (3.20.2)\u001b[0m\n", "\u001b[34mRequirement already satisfied: absl-py>=0.4 in /opt/conda/lib/python3.9/site-packages (from tensorboard->-r requirements.txt (line 11)) (1.4.0)\u001b[0m\n", "\u001b[34mRequirement already satisfied: google-auth-oauthlib<0.5,>=0.4.1 in /opt/conda/lib/python3.9/site-packages (from tensorboard->-r requirements.txt (line 11)) (0.4.6)\u001b[0m\n", "\u001b[34mRequirement already satisfied: setuptools>=41.0.0 in /opt/conda/lib/python3.9/site-packages (from tensorboard->-r requirements.txt (line 11)) (61.2.0)\u001b[0m\n", "\u001b[34mRequirement already satisfied: google-auth<3,>=1.6.3 in /opt/conda/lib/python3.9/site-packages (from tensorboard->-r requirements.txt (line 11)) (2.16.2)\u001b[0m\n", "\u001b[34mRequirement already satisfied: grpcio>=1.48.2 in /opt/conda/lib/python3.9/site-packages (from tensorboard->-r requirements.txt (line 11)) (1.51.3)\u001b[0m\n", "\u001b[34mRequirement already satisfied: werkzeug>=1.0.1 in /opt/conda/lib/python3.9/site-packages (from tensorboard->-r requirements.txt (line 11)) (2.2.2)\u001b[0m\n", "\u001b[34mRequirement already satisfied: tensorboard-data-server<0.8.0,>=0.7.0 in /opt/conda/lib/python3.9/site-packages (from tensorboard->-r requirements.txt (line 11)) (0.7.0)\u001b[0m\n", "\u001b[34mRequirement already satisfied: markdown>=2.6.8 in 
/opt/conda/lib/python3.9/site-packages (from tensorboard->-r requirements.txt (line 11)) (3.4.1)\u001b[0m\n", "\u001b[34mRequirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.9/site-packages (from Jinja2->-r requirements.txt (line 12)) (2.1.2)\u001b[0m\n", "\u001b[34mRequirement already satisfied: charset-normalizer<4.0,>=2.0 in /opt/conda/lib/python3.9/site-packages (from aiohttp->datasets->-r requirements.txt (line 1)) (2.0.4)\u001b[0m\n", "\u001b[34mRequirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /opt/conda/lib/python3.9/site-packages (from aiohttp->datasets->-r requirements.txt (line 1)) (4.0.2)\u001b[0m\n", "\u001b[34mRequirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.9/site-packages (from aiohttp->datasets->-r requirements.txt (line 1)) (22.2.0)\u001b[0m\n", "\u001b[34mRequirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.9/site-packages (from aiohttp->datasets->-r requirements.txt (line 1)) (1.3.1)\u001b[0m\n", "\u001b[34mRequirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.9/site-packages (from aiohttp->datasets->-r requirements.txt (line 1)) (6.0.4)\u001b[0m\n", "\u001b[34mRequirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.9/site-packages (from aiohttp->datasets->-r requirements.txt (line 1)) (1.8.2)\u001b[0m\n", "\u001b[34mRequirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.9/site-packages (from aiohttp->datasets->-r requirements.txt (line 1)) (1.3.3)\u001b[0m\n", "\u001b[34mRequirement already satisfied: cachetools<6.0,>=2.0.0 in /opt/conda/lib/python3.9/site-packages (from google-auth<3,>=1.6.3->tensorboard->-r requirements.txt (line 11)) (5.3.0)\u001b[0m\n", "\u001b[34mRequirement already satisfied: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.9/site-packages (from google-auth<3,>=1.6.3->tensorboard->-r requirements.txt (line 11)) (0.2.8)\u001b[0m\n", "\u001b[34mRequirement already satisfied: six>=1.9.0 in 
/opt/conda/lib/python3.9/site-packages (from google-auth<3,>=1.6.3->tensorboard->-r requirements.txt (line 11)) (1.16.0)\u001b[0m\n", "\u001b[34mRequirement already satisfied: rsa<5,>=3.1.4 in /opt/conda/lib/python3.9/site-packages (from google-auth<3,>=1.6.3->tensorboard->-r requirements.txt (line 11)) (4.9)\u001b[0m\n", "\u001b[34mRequirement already satisfied: requests-oauthlib>=0.7.0 in /opt/conda/lib/python3.9/site-packages (from google-auth-oauthlib<0.5,>=0.4.1->tensorboard->-r requirements.txt (line 11)) (1.3.1)\u001b[0m\n", "\u001b[34mRequirement already satisfied: zipp>=0.5 in /opt/conda/lib/python3.9/site-packages (from importlib-metadata->diffusers->-r requirements.txt (line 2)) (3.15.0)\u001b[0m\n", "\u001b[34mRequirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /opt/conda/lib/python3.9/site-packages (from pyasn1-modules>=0.2.1->google-auth<3,>=1.6.3->tensorboard->-r requirements.txt (line 11)) (0.4.8)\u001b[0m\n", "\u001b[34mRequirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.9/site-packages (from requests>=2.19.0->datasets->-r requirements.txt (line 1)) (2022.6.15)\u001b[0m\n", "\u001b[34mRequirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/conda/lib/python3.9/site-packages (from requests>=2.19.0->datasets->-r requirements.txt (line 1)) (1.26.8)\u001b[0m\n", "\u001b[34mRequirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.9/site-packages (from requests>=2.19.0->datasets->-r requirements.txt (line 1)) (3.3)\u001b[0m\n", "\u001b[34mRequirement already satisfied: oauthlib>=3.0.0 in /opt/conda/lib/python3.9/site-packages (from requests-oauthlib>=0.7.0->google-auth-oauthlib<0.5,>=0.4.1->tensorboard->-r requirements.txt (line 11)) (3.2.2)\u001b[0m\n", "\u001b[34mRequirement already satisfied: python-dateutil>=2.8.1 in /opt/conda/lib/python3.9/site-packages (from pandas->datasets->-r requirements.txt (line 1)) (2.8.2)\u001b[0m\n", "\u001b[34mRequirement already satisfied: pytz>=2020.1 in 
/opt/conda/lib/python3.9/site-packages (from pandas->datasets->-r requirements.txt (line 1)) (2022.7.1)\u001b[0m\n", "\u001b[34mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\n", "\u001b[34m2023-03-06 14:50:22,250 sagemaker-training-toolkit INFO No Neurons detected (normal if no neurons installed)\u001b[0m\n", "\u001b[34m2023-03-06 14:50:22,285 sagemaker-training-toolkit INFO No Neurons detected (normal if no neurons installed)\u001b[0m\n", "\u001b[34m2023-03-06 14:50:22,317 sagemaker-training-toolkit INFO No Neurons detected (normal if no neurons installed)\u001b[0m\n", "\u001b[34m2023-03-06 14:50:22,328 sagemaker-training-toolkit INFO Invoking user script\u001b[0m\n", "\u001b[34mTraining Env:\u001b[0m\n", "\u001b[34m{\n", " \"additional_framework_parameters\": {},\n", " \"channel_input_dirs\": {},\n", " \"current_host\": \"algo-1\",\n", " \"current_instance_group\": \"homogeneousCluster\",\n", " \"current_instance_group_hosts\": [\n", " \"algo-1\"\n", " ],\n", " \"current_instance_type\": \"ml.g4dn.xlarge\",\n", " \"distribution_hosts\": [],\n", " \"distribution_instance_groups\": [],\n", " \"framework_module\": null,\n", " \"hosts\": [\n", " \"algo-1\"\n", " ],\n", " \"hyperparameters\": {\n", " \"checkpointing_steps\": 1000,\n", " \"dataloader_num_workers\": 8,\n", " \"dataset_name\": \"lambdalabs/pokemon-blip-captions\",\n", " \"gradient_accumulation_steps\": 4,\n", " \"learning_rate\": 2e-06,\n", " \"lr_scheduler\": \"cosine\",\n", " \"lr_warmup_steps\": 0,\n", " \"manul_upload_model_path\": \"s3://sagemaker-ap-southeast-1-687912291502/lora/trained_models\",\n", " \"max_grad_norm\": 1,\n", " \"max_train_steps\": 1000,\n", " \"mixed_precision\": \"fp16\",\n", " \"output_dir\": \"/opt/ml/model/\",\n", " \"pretrained_model_name_or_path\": \"runwayml/stable-diffusion-v1-5\",\n", " 
\"resolution\": 512,\n", " \"seed\": 1338,\n", " \"train_batch_size\": 1,\n", " \"validation_prompt\": \"Totoro\"\n", " },\n", " \"input_config_dir\": \"/opt/ml/input/config\",\n", " \"input_data_config\": {},\n", " \"input_dir\": \"/opt/ml/input\",\n", " \"instance_groups\": [\n", " \"homogeneousCluster\"\n", " ],\n", " \"instance_groups_dict\": {\n", " \"homogeneousCluster\": {\n", " \"instance_group_name\": \"homogeneousCluster\",\n", " \"instance_type\": \"ml.g4dn.xlarge\",\n", " \"hosts\": [\n", " \"algo-1\"\n", " ]\n", " }\n", " },\n", " \"is_hetero\": false,\n", " \"is_master\": true,\n", " \"is_modelparallel_enabled\": null,\n", " \"is_smddpmprun_installed\": false,\n", " \"job_name\": \"lora-finetuning-2023-03-06-14-44-55-604\",\n", " \"log_level\": 20,\n", " \"master_hostname\": \"algo-1\",\n", " \"model_dir\": \"/opt/ml/model\",\n", " \"module_dir\": \"/opt/ml/code\",\n", " \"module_name\": \"train\",\n", " \"network_interface_name\": \"eth0\",\n", " \"num_cpus\": 4,\n", " \"num_gpus\": 1,\n", " \"num_neurons\": 0,\n", " \"output_data_dir\": \"/opt/ml/output/data\",\n", " \"output_dir\": \"/opt/ml/output\",\n", " \"output_intermediate_dir\": \"/opt/ml/output/intermediate\",\n", " \"resource_config\": {\n", " \"current_host\": \"algo-1\",\n", " \"current_instance_type\": \"ml.g4dn.xlarge\",\n", " \"current_group_name\": \"homogeneousCluster\",\n", " \"hosts\": [\n", " \"algo-1\"\n", " ],\n", " \"instance_groups\": [\n", " {\n", " \"instance_group_name\": \"homogeneousCluster\",\n", " \"instance_type\": \"ml.g4dn.xlarge\",\n", " \"hosts\": [\n", " \"algo-1\"\n", " ]\n", " }\n", " ],\n", " \"network_interface_name\": \"eth0\"\n", " },\n", " \"user_entry_point\": \"train.py\"\u001b[0m\n", "\u001b[34m}\u001b[0m\n", "\u001b[34mEnvironment variables:\u001b[0m\n", "\u001b[34mSM_HOSTS=[\"algo-1\"]\u001b[0m\n", "\u001b[34mSM_NETWORK_INTERFACE_NAME=eth0\u001b[0m\n", 
"\u001b[34mSM_HPS={\"checkpointing_steps\":1000,\"dataloader_num_workers\":8,\"dataset_name\":\"lambdalabs/pokemon-blip-captions\",\"gradient_accumulation_steps\":4,\"learning_rate\":2e-06,\"lr_scheduler\":\"cosine\",\"lr_warmup_steps\":0,\"manul_upload_model_path\":\"s3://sagemaker-ap-southeast-1-687912291502/lora/trained_models\",\"max_grad_norm\":1,\"max_train_steps\":1000,\"mixed_precision\":\"fp16\",\"output_dir\":\"/opt/ml/model/\",\"pretrained_model_name_or_path\":\"runwayml/stable-diffusion-v1-5\",\"resolution\":512,\"seed\":1338,\"train_batch_size\":1,\"validation_prompt\":\"Totoro\"}\u001b[0m\n", "\u001b[34mSM_USER_ENTRY_POINT=train.py\u001b[0m\n", "\u001b[34mSM_FRAMEWORK_PARAMS={}\u001b[0m\n", "\u001b[34mSM_RESOURCE_CONFIG={\"current_group_name\":\"homogeneousCluster\",\"current_host\":\"algo-1\",\"current_instance_type\":\"ml.g4dn.xlarge\",\"hosts\":[\"algo-1\"],\"instance_groups\":[{\"hosts\":[\"algo-1\"],\"instance_group_name\":\"homogeneousCluster\",\"instance_type\":\"ml.g4dn.xlarge\"}],\"network_interface_name\":\"eth0\"}\u001b[0m\n", "\u001b[34mSM_INPUT_DATA_CONFIG={}\u001b[0m\n", "\u001b[34mSM_OUTPUT_DATA_DIR=/opt/ml/output/data\u001b[0m\n", "\u001b[34mSM_CHANNELS=[]\u001b[0m\n", "\u001b[34mSM_CURRENT_HOST=algo-1\u001b[0m\n", "\u001b[34mSM_CURRENT_INSTANCE_TYPE=ml.g4dn.xlarge\u001b[0m\n", "\u001b[34mSM_CURRENT_INSTANCE_GROUP=homogeneousCluster\u001b[0m\n", "\u001b[34mSM_CURRENT_INSTANCE_GROUP_HOSTS=[\"algo-1\"]\u001b[0m\n", "\u001b[34mSM_INSTANCE_GROUPS=[\"homogeneousCluster\"]\u001b[0m\n", "\u001b[34mSM_INSTANCE_GROUPS_DICT={\"homogeneousCluster\":{\"hosts\":[\"algo-1\"],\"instance_group_name\":\"homogeneousCluster\",\"instance_type\":\"ml.g4dn.xlarge\"}}\u001b[0m\n", "\u001b[34mSM_DISTRIBUTION_INSTANCE_GROUPS=[]\u001b[0m\n", "\u001b[34mSM_IS_HETERO=false\u001b[0m\n", "\u001b[34mSM_MODULE_NAME=train\u001b[0m\n", "\u001b[34mSM_LOG_LEVEL=20\u001b[0m\n", "\u001b[34mSM_FRAMEWORK_MODULE=\u001b[0m\n", "\u001b[34mSM_INPUT_DIR=/opt/ml/input\u001b[0m\n", 
"\u001b[34mSM_INPUT_CONFIG_DIR=/opt/ml/input/config\u001b[0m\n", "\u001b[34mSM_OUTPUT_DIR=/opt/ml/output\u001b[0m\n", "\u001b[34mSM_NUM_CPUS=4\u001b[0m\n", "\u001b[34mSM_NUM_GPUS=1\u001b[0m\n", "\u001b[34mSM_NUM_NEURONS=0\u001b[0m\n", "\u001b[34mSM_MODEL_DIR=/opt/ml/model\u001b[0m\n", "\u001b[34mSM_MODULE_DIR=/opt/ml/code\u001b[0m\n", "\u001b[34mSM_TRAINING_ENV={\"additional_framework_parameters\":{},\"channel_input_dirs\":{},\"current_host\":\"algo-1\",\"current_instance_group\":\"homogeneousCluster\",\"current_instance_group_hosts\":[\"algo-1\"],\"current_instance_type\":\"ml.g4dn.xlarge\",\"distribution_hosts\":[],\"distribution_instance_groups\":[],\"framework_module\":null,\"hosts\":[\"algo-1\"],\"hyperparameters\":{\"checkpointing_steps\":1000,\"dataloader_num_workers\":8,\"dataset_name\":\"lambdalabs/pokemon-blip-captions\",\"gradient_accumulation_steps\":4,\"learning_rate\":2e-06,\"lr_scheduler\":\"cosine\",\"lr_warmup_steps\":0,\"manul_upload_model_path\":\"s3://sagemaker-ap-southeast-1-687912291502/lora/trained_models\",\"max_grad_norm\":1,\"max_train_steps\":1000,\"mixed_precision\":\"fp16\",\"output_dir\":\"/opt/ml/model/\",\"pretrained_model_name_or_path\":\"runwayml/stable-diffusion-v1-5\",\"resolution\":512,\"seed\":1338,\"train_batch_size\":1,\"validation_prompt\":\"Totoro\"},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{},\"input_dir\":\"/opt/ml/input\",\"instance_groups\":[\"homogeneousCluster\"],\"instance_groups_dict\":{\"homogeneousCluster\":{\"hosts\":[\"algo-1\"],\"instance_group_name\":\"homogeneousCluster\",\"instance_type\":\"ml.g4dn.xlarge\"}},\"is_hetero\":false,\"is_master\":true,\"is_modelparallel_enabled\":null,\"is_smddpmprun_installed\":false,\"job_name\":\"lora-finetuning-2023-03-06-14-44-55-604\",\"log_level\":20,\"master_hostname\":\"algo-1\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"/opt/ml/code\",\"module_name\":\"train\",\"network_interface_name\":\"eth0\",\"num_cpus\":4,\"num_gpus\":1,\"num_neuron
s\":0,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_group_name\":\"homogeneousCluster\",\"current_host\":\"algo-1\",\"current_instance_type\":\"ml.g4dn.xlarge\",\"hosts\":[\"algo-1\"],\"instance_groups\":[{\"hosts\":[\"algo-1\"],\"instance_group_name\":\"homogeneousCluster\",\"instance_type\":\"ml.g4dn.xlarge\"}],\"network_interface_name\":\"eth0\"},\"user_entry_point\":\"train.py\"}\u001b[0m\n", "\u001b[34mSM_USER_ARGS=[\"--checkpointing_steps\",\"1000\",\"--dataloader_num_workers\",\"8\",\"--dataset_name\",\"lambdalabs/pokemon-blip-captions\",\"--gradient_accumulation_steps\",\"4\",\"--learning_rate\",\"2e-06\",\"--lr_scheduler\",\"cosine\",\"--lr_warmup_steps\",\"0\",\"--manul_upload_model_path\",\"s3://sagemaker-ap-southeast-1-687912291502/lora/trained_models\",\"--max_grad_norm\",\"1\",\"--max_train_steps\",\"1000\",\"--mixed_precision\",\"fp16\",\"--output_dir\",\"/opt/ml/model/\",\"--pretrained_model_name_or_path\",\"runwayml/stable-diffusion-v1-5\",\"--resolution\",\"512\",\"--seed\",\"1338\",\"--train_batch_size\",\"1\",\"--validation_prompt\",\"Totoro\"]\u001b[0m\n", "\u001b[34mSM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate\u001b[0m\n", "\u001b[34mSM_HP_CHECKPOINTING_STEPS=1000\u001b[0m\n", "\u001b[34mSM_HP_DATALOADER_NUM_WORKERS=8\u001b[0m\n", "\u001b[34mSM_HP_DATASET_NAME=lambdalabs/pokemon-blip-captions\u001b[0m\n", "\u001b[34mSM_HP_GRADIENT_ACCUMULATION_STEPS=4\u001b[0m\n", "\u001b[34mSM_HP_LEARNING_RATE=2e-06\u001b[0m\n", "\u001b[34mSM_HP_LR_SCHEDULER=cosine\u001b[0m\n", "\u001b[34mSM_HP_LR_WARMUP_STEPS=0\u001b[0m\n", "\u001b[34mSM_HP_MANUL_UPLOAD_MODEL_PATH=s3://sagemaker-ap-southeast-1-687912291502/lora/trained_models\u001b[0m\n", "\u001b[34mSM_HP_MAX_GRAD_NORM=1\u001b[0m\n", "\u001b[34mSM_HP_MAX_TRAIN_STEPS=1000\u001b[0m\n", "\u001b[34mSM_HP_MIXED_PRECISION=fp16\u001b[0m\n", 
"\u001b[34mSM_HP_OUTPUT_DIR=/opt/ml/model/\u001b[0m\n", "\u001b[34mSM_HP_PRETRAINED_MODEL_NAME_OR_PATH=runwayml/stable-diffusion-v1-5\u001b[0m\n", "\u001b[34mSM_HP_RESOLUTION=512\u001b[0m\n", "\u001b[34mSM_HP_SEED=1338\u001b[0m\n", "\u001b[34mSM_HP_TRAIN_BATCH_SIZE=1\u001b[0m\n", "\u001b[34mSM_HP_VALIDATION_PROMPT=Totoro\u001b[0m\n", "\u001b[34mPYTHONPATH=/opt/ml/code:/opt/conda/bin:/opt/conda/lib/python39.zip:/opt/conda/lib/python3.9:/opt/conda/lib/python3.9/lib-dynload:/opt/conda/lib/python3.9/site-packages:/opt/ml/code/repositories/xformers\u001b[0m\n", "\u001b[34mInvoking script with the following command:\u001b[0m\n", "\u001b[34m/opt/conda/bin/python train.py --checkpointing_steps 1000 --dataloader_num_workers 8 --dataset_name lambdalabs/pokemon-blip-captions --gradient_accumulation_steps 4 --learning_rate 2e-06 --lr_scheduler cosine --lr_warmup_steps 0 --manul_upload_model_path s3://sagemaker-ap-southeast-1-687912291502/lora/trained_models --max_grad_norm 1 --max_train_steps 1000 --mixed_precision fp16 --output_dir /opt/ml/model/ --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 --resolution 512 --seed 1338 --train_batch_size 1 --validation_prompt Totoro\u001b[0m\n", "\u001b[34m2023-03-06 14:50:22,329 sagemaker-training-toolkit INFO Exceptions not imported for SageMaker Debugger as it is not installed.\u001b[0m\n", "\u001b[34m2023-03-06 14:50:22,329 sagemaker-training-toolkit INFO Exceptions not imported for SageMaker TF as Tensorflow is not installed.\u001b[0m\n", "\u001b[34m/opt/conda/lib/python3.9/site-packages/accelerate/accelerator.py:231: FutureWarning: `logging_dir` is deprecated and will be removed in version 0.18.0 of 🤗 Accelerate. 
Use `project_dir` instead.\n", " warnings.warn(\u001b[0m\n", "\u001b[34m03/06/2023 14:50:26 - INFO - __main__ - Distributed environment: NO\u001b[0m\n", "\u001b[34mNum processes: 1\u001b[0m\n", "\u001b[34mProcess index: 0\u001b[0m\n", "\u001b[34mLocal process index: 0\u001b[0m\n", "\u001b[34mDevice: cuda\u001b[0m\n", "\u001b[34mMixed precision type: fp16\u001b[0m\n", "\u001b[34m#015Downloading (…)cheduler_config.json: 0%| | 0.00/308 [00:00