{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "notebook-overview",
   "metadata": {},
   "source": [
    "# Norwegian GPT-2: Tokenizer Training and SageMaker Launch\n",
    "\n",
    "This notebook:\n",
    "\n",
    "1. trains a byte-level BPE tokenizer on the OSCAR `unshuffled_deduplicated_no` corpus,\n",
    "2. saves train/validation splits of that corpus to disk,\n",
    "3. writes a GPT-2 model config next to the tokenizer,\n",
    "4. uploads both folders to S3 and starts a SageMaker training job that runs `src/sm_run_clm_flax.py`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1922a02-288c-4bf0-a909-2a040cde5fd2",
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install transformers -Uq\n",
    "%pip install datasets -Uq"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4589de41-dcd6-4b34-a862-44a29e25cdb0",
   "metadata": {},
   "outputs": [],
   "source": [
    "import shutil\n",
    "from pathlib import Path\n",
    "\n",
    "import sagemaker\n",
    "from datasets import DatasetDict, load_dataset\n",
    "from sagemaker.huggingface import HuggingFace\n",
    "from tokenizers import trainers, Tokenizer, normalizers, ByteLevelBPETokenizer\n",
    "from transformers import GPT2Config"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "72de8c17-963d-40b1-94ba-8527062b4937",
   "metadata": {},
   "source": [
    "## Download Data and Train Tokenizer\n",
    "\n",
    "Training is skipped when `norwegian-gpt2/tokenizer.json` already exists, so the notebook can be re-run without repeating it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "570a3885-9b32-431e-9ad6-6196d3fc08f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_name = \"oscar\"\n",
    "dataset_conf = \"unshuffled_deduplicated_no\"\n",
    "\n",
    "# Defined once so the tokenizer trainer and the GPT-2 config use the same value.\n",
    "vocab_size = 50256"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9128a03a-1867-4c6a-ad1e-56af36727a40",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load dataset (full train split; used only as the tokenizer training corpus)\n",
    "tokenizer_corpus = load_dataset(dataset_name, dataset_conf, split=\"train\")\n",
    "\n",
    "# Instantiate tokenizer\n",
    "tokenizer = ByteLevelBPETokenizer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bfa762db-0c67-4e5a-bbc1-cb44f4758f71",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Local folder that holds both tokenizer.json and the model config.\n",
    "config_path = Path(\"norwegian-gpt2\")\n",
    "tokenizer_file = config_path / \"tokenizer.json\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1fdbd012-743e-413e-896f-59e70a3401d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "def batch_iterator(batch_size=1000):\n",
    "    \"\"\"Yield consecutive lists of up to `batch_size` texts from `tokenizer_corpus`.\"\"\"\n",
    "    for start in range(0, len(tokenizer_corpus), batch_size):\n",
    "        yield tokenizer_corpus[start : start + batch_size][\"text\"]\n",
    "\n",
    "\n",
    "if tokenizer_file.exists():\n",
    "    print(\"Existing tokenizer config detected. Skipping Tokenizer training\")\n",
    "else:\n",
    "    # Customized training\n",
    "    # NOTE(review): the five special tokens follow the Hugging Face Flax\n",
    "    # language-modeling example; confirm they match what the training script expects.\n",
    "    tokenizer.train_from_iterator(\n",
    "        batch_iterator(),\n",
    "        vocab_size=vocab_size,\n",
    "        min_frequency=2,\n",
    "        special_tokens=[\n",
    "            \"<s>\",\n",
    "            \"<pad>\",\n",
    "            \"</s>\",\n",
    "            \"<unk>\",\n",
    "            \"<mask>\",\n",
    "        ],\n",
    "    )\n",
    "\n",
    "    # Save files to disk\n",
    "    # exist_ok: the folder may already exist (e.g. it holds a saved model config)\n",
    "    # even though tokenizer.json is missing.\n",
    "    config_path.mkdir(parents=True, exist_ok=True)\n",
    "    tokenizer.save(str(tokenizer_file))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "train-validation-splits",
   "metadata": {},
   "source": [
    "## Prepare Train / Validation Splits\n",
    "\n",
    "The first `validation_split_percentage` percent of the train split becomes the validation set; the remainder is used for training."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fe50c60d-8f9e-4856-b9ea-36ab19fbc037",
   "metadata": {},
   "outputs": [],
   "source": [
    "# split data into train and validation\n",
    "validation_split_percentage = 5\n",
    "\n",
    "dataset_splits = DatasetDict(\n",
    "    {\n",
    "        \"train\": load_dataset(\n",
    "            dataset_name,\n",
    "            dataset_conf,\n",
    "            split=f\"train[{validation_split_percentage}%:]\",\n",
    "        ),\n",
    "        \"validation\": load_dataset(\n",
    "            dataset_name,\n",
    "            dataset_conf,\n",
    "            split=f\"train[:{validation_split_percentage}%]\",\n",
    "        ),\n",
    "    }\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5dba291-8e6f-4a93-a091-7de6c4640ebe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Local folder with the saved splits; uploaded to S3 further down.\n",
    "train_data_dir = \"train-data\"\n",
    "dataset_splits.save_to_disk(train_data_dir)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "aad217de-fa1d-4581-bfd5-4cece7b02169",
   "metadata": {},
   "source": [
    "## Configure Model\n",
    "\n",
    "Start from the stock `gpt2` config, set all dropout probabilities to zero, and override the vocabulary size with the one used for the tokenizer."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "814890bf-36f0-4266-8963-5d541346dcc9",
   "metadata": {},
   "outputs": [],
   "source": [
    "config = GPT2Config.from_pretrained(\n",
    "    \"gpt2\",\n",
    "    resid_pdrop=0.0,\n",
    "    embd_pdrop=0.0,\n",
    "    attn_pdrop=0.0,\n",
    "    vocab_size=vocab_size,\n",
    ")\n",
    "config.save_pretrained(str(config_path))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "68cf9ff5-ca54-48e5-915a-008f5739a73a",
   "metadata": {},
   "source": [
    "## Launch SageMaker Job"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e956eaef-01a0-425e-aa60-02da2efbae1c",
   "metadata": {},
   "outputs": [],
   "source": [
    "sess = sagemaker.session.Session()\n",
    "role = sagemaker.get_execution_role()\n",
    "bucket = sess.default_bucket()\n",
    "key_prefix = \"alpa_ray_lm\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d41cfe72-0244-4c6f-896c-da4dbf2a0174",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional VPC settings forwarded to the estimator below; None leaves them unset.\n",
    "subnets = None\n",
    "security_group_ids = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef3aa05a-7a45-4d3e-8a66-08712fb94fcb",
   "metadata": {},
   "outputs": [],
   "source": [
    "config_data_path = sess.upload_data(\n",
    "    str(config_path), bucket, key_prefix=f\"{key_prefix}/config/norwegian-gpt2\"\n",
    ")\n",
    "s3_data_path = sess.upload_data(\n",
    "    train_data_dir, bucket, key_prefix=f\"{key_prefix}/data/oscar\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ee6b046-2288-4fb2-b05c-65901e6b563f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input channel names used in fit(); SageMaker exposes each channel inside the\n",
    "# training container under /opt/ml/input/data/<channel>, so the paths below are\n",
    "# built from the same names to keep the two in sync.\n",
    "data_channel = \"input_data\"\n",
    "config_channel = \"gpt2\"\n",
    "input_root = \"/opt/ml/input/data\"\n",
    "\n",
    "hyperparams = dict(\n",
    "    output_dir=\"/opt/ml/model\",\n",
    "    model_type=\"gpt2\",\n",
    "    config_name=f\"{input_root}/{config_channel}\",\n",
    "    tokenizer_name=f\"{input_root}/{config_channel}\",\n",
    "    dataset_name=f\"{input_root}/{data_channel}\",\n",
    "    load_data_from_disk=True,\n",
    "    do_train=True,\n",
    "    do_eval=True,\n",
    "    block_size=512,\n",
    "    per_device_train_batch_size=96,\n",
    "    per_device_eval_batch_size=96,\n",
    "    num_micro_batches=4,\n",
    "    dtype=\"float16\",\n",
    "    learning_rate=1e-3,\n",
    "    warmup_steps=1000,\n",
    "    adam_beta1=0.9,\n",
    "    adam_beta2=0.98,\n",
    "    weight_decay=0.01,\n",
    "    overwrite_output_dir=True,\n",
    "    num_train_epochs=2,\n",
    "    logging_steps=100,\n",
    "    save_steps=2500,\n",
    "    eval_steps=2500,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "08485403-0945-4bfa-bd2f-1c316457c0a3",
   "metadata": {},
   "outputs": [],
   "source": [
    "estimator_gpu_alpa_ray = HuggingFace(\n",
    "    source_dir=\"src\",\n",
    "    entry_point=\"sm_run_clm_flax.py\",\n",
    "    pytorch_version=\"1.10\",\n",
    "    transformers_version=\"4.17\",\n",
    "    subnets=subnets,\n",
    "    security_group_ids=security_group_ids,\n",
    "    role=role,\n",
    "    instance_count=4,\n",
    "    instance_type=\"ml.g5.12xlarge\",\n",
    "    py_version=\"py38\",\n",
    "    hyperparameters=hyperparams,\n",
    "    disable_profiler=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d13e47d1-81c5-43f4-8101-0caa2788312c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# wait=False: submit the job and return without blocking on its completion.\n",
    "estimator_gpu_alpa_ray.fit(\n",
    "    {data_channel: s3_data_path, config_channel: config_data_path},\n",
    "    wait=False,\n",
    ")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_pytorch_p38",
   "language": "python",
   "name": "conda_pytorch_p38"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}