{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "e1922a02-288c-4bf0-a909-2a040cde5fd2",
"metadata": {},
"outputs": [],
"source": [
"# Install/upgrade the Hugging Face libraries used in this notebook.\n",
"# NOTE(review): versions are unpinned; consider pinning for reproducibility.\n",
"%pip install transformers -Uq\n",
"%pip install datasets -Uq"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4589de41-dcd6-4b34-a862-44a29e25cdb0",
"metadata": {},
"outputs": [],
"source": [
"# `datasets` for corpus loading, `tokenizers` for BPE tokenizer training.\n",
"# NOTE(review): only ByteLevelBPETokenizer is used below; `trainers`,\n",
"# `Tokenizer` and `normalizers` appear unused in this notebook — confirm\n",
"# before removing.\n",
"from datasets import load_dataset\n",
"from tokenizers import trainers, Tokenizer, normalizers, ByteLevelBPETokenizer\n",
"from pathlib import Path"
]
},
{
"cell_type": "markdown",
"id": "72de8c17-963d-40b1-94ba-8527062b4937",
"metadata": {},
"source": [
"## Download Data and Train Tokenizer"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "570a3885-9b32-431e-9ad6-6196d3fc08f0",
"metadata": {},
"outputs": [],
"source": [
"# OSCAR web corpus, Norwegian configuration (deduplicated, unshuffled).\n",
"dataset_name = \"oscar\"\n",
"dataset_conf = \"unshuffled_deduplicated_no\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9128a03a-1867-4c6a-ad1e-56af36727a40",
"metadata": {},
"outputs": [],
"source": [
"# load dataset (train split only; used here for tokenizer training —\n",
"# the train/validation split for model training is created further down)\n",
"dataset = load_dataset(dataset_name, dataset_conf, split=\"train\")\n",
"\n",
"# Instantiate tokenizer (byte-level BPE, the tokenizer family GPT-2 uses)\n",
"tokenizer = ByteLevelBPETokenizer()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bfa762db-0c67-4e5a-bbc1-cb44f4758f71",
"metadata": {},
"outputs": [],
"source": [
"# Local directory holding the tokenizer + model config; its contents are\n",
"# uploaded to S3 later and mounted into the SageMaker training container.\n",
"config_path = Path(\"norwegian-gpt2\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1fdbd012-743e-413e-896f-59e70a3401d7",
"metadata": {},
"outputs": [],
"source": [
"# Train the byte-level BPE tokenizer on the corpus, unless a trained\n",
"# tokenizer already exists on disk from a previous run.\n",
"if (config_path / \"tokenizer.json\").exists():\n",
"    print(\"Existing tokenizer config detected. Skipping Tokenizer training\")\n",
"else:\n",
"    def batch_iterator(batch_size=1000):\n",
"        # Yield the corpus text in chunks so the full dataset is never\n",
"        # materialized in memory at once.\n",
"        for i in range(0, len(dataset), batch_size):\n",
"            yield dataset[i: i + batch_size][\"text\"]\n",
"\n",
"    # Customized training. The special tokens were empty strings (the\n",
"    # angle-bracket tokens were evidently stripped in a copy/paste);\n",
"    # restored to the standard set used by the HF Flax LM examples.\n",
"    tokenizer.train_from_iterator(batch_iterator(), vocab_size=50256, min_frequency=2, special_tokens=[\n",
"        \"<s>\",\n",
"        \"<pad>\",\n",
"        \"</s>\",\n",
"        \"<unk>\",\n",
"        \"<mask>\",\n",
"    ])\n",
"\n",
"    # Save files to disk. exist_ok avoids a crash if the directory was\n",
"    # already created (e.g. by a partial previous run), and the save path\n",
"    # is derived from config_path instead of a duplicated literal.\n",
"    config_path.mkdir(parents=True, exist_ok=True)\n",
"    tokenizer.save(str(config_path / \"tokenizer.json\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fe50c60d-8f9e-4856-b9ea-36ab19fbc037",
"metadata": {},
"outputs": [],
"source": [
"# Split data into train and validation. Build the DatasetDict directly\n",
"# from the two percent-slices instead of first loading the full dataset\n",
"# and then overwriting both of its entries (the original full load was\n",
"# discarded immediately). Resulting keys and contents are identical.\n",
"from datasets import DatasetDict\n",
"\n",
"validation_split_percentage = 5\n",
"\n",
"dataset = DatasetDict({\n",
"    \"train\": load_dataset(\n",
"        dataset_name,\n",
"        dataset_conf,\n",
"        split=f\"train[{validation_split_percentage}%:]\"\n",
"    ),\n",
"    \"validation\": load_dataset(\n",
"        dataset_name,\n",
"        dataset_conf,\n",
"        split=f\"train[:{validation_split_percentage}%]\"\n",
"    ),\n",
"})"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b5dba291-8e6f-4a93-a091-7de6c4640ebe",
"metadata": {},
"outputs": [],
"source": [
"# Persist the split dataset locally; uploaded to S3 in a later cell.\n",
"dataset.save_to_disk(\"train-data\")"
]
},
{
"cell_type": "markdown",
"id": "aad217de-fa1d-4581-bfd5-4cece7b02169",
"metadata": {},
"source": [
"## Configure Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "814890bf-36f0-4266-8963-5d541346dcc9",
"metadata": {},
"outputs": [],
"source": [
"from transformers import GPT2Config\n",
"\n",
"# Stock GPT-2 architecture with all dropout disabled.\n",
"# vocab_size=50256 matches the vocab size the tokenizer was trained with\n",
"# above (NOTE(review): the original GPT-2 vocab is 50257 — confirm the\n",
"# smaller size is intentional; it is at least internally consistent here).\n",
"config = GPT2Config.from_pretrained(\"gpt2\", resid_pdrop=0.0, embd_pdrop=0.0, attn_pdrop=0.0, vocab_size=50256)\n",
"config.save_pretrained(\"./norwegian-gpt2\")"
]
},
{
"cell_type": "markdown",
"id": "68cf9ff5-ca54-48e5-915a-008f5739a73a",
"metadata": {},
"source": [
"## Launch SageMaker Job"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a6e2f5af-cc03-4ca1-8014-b5abbb26d37b",
"metadata": {},
"outputs": [],
"source": [
"import sagemaker\n",
"from sagemaker.huggingface import HuggingFace\n",
"import shutil  # NOTE(review): appears unused in this notebook\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e956eaef-01a0-425e-aa60-02da2efbae1c",
"metadata": {},
"outputs": [],
"source": [
"# SageMaker session, execution role and default S3 bucket for this account.\n",
"sess = sagemaker.session.Session()\n",
"role = sagemaker.get_execution_role()\n",
"bucket = sess.default_bucket()\n",
"# S3 key prefix under which config and data are uploaded below.\n",
"key_prefix = \"alpa_ray_lm\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d41cfe72-0244-4c6f-896c-da4dbf2a0174",
"metadata": {},
"outputs": [],
"source": [
"# Optional VPC settings for the training job; None runs outside a VPC.\n",
"# Fill these in to launch the job inside a specific VPC/subnet.\n",
"subnets=None\n",
"security_group_ids=None"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ef3aa05a-7a45-4d3e-8a66-08712fb94fcb",
"metadata": {},
"outputs": [],
"source": [
"# Upload tokenizer/model config and the prepared dataset to S3; the\n",
"# returned S3 URIs become input channels in estimator.fit() below.\n",
"config_data_path = sess.upload_data(\"norwegian-gpt2\", bucket, key_prefix=f\"{key_prefix}/config/norwegian-gpt2\")\n",
"s3_data_path = sess.upload_data(\"train-data\", bucket, key_prefix=f\"{key_prefix}/data/oscar\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1ee6b046-2288-4fb2-b05c-65901e6b563f",
"metadata": {},
"outputs": [],
"source": [
"# Hyperparameters passed as CLI args to the training entry point\n",
"# (sm_run_clm_flax.py). Paths are SageMaker container paths: fit()'s\n",
"# \"gpt2\" and \"input_data\" channels are mounted at /opt/ml/input/data/<name>.\n",
"hyperparams = dict(\n",
"    output_dir=\"/opt/ml/model\",  # SageMaker archives this dir as model.tar.gz\n",
"    model_type=\"gpt2\",\n",
"    config_name=\"/opt/ml/input/data/gpt2\",  # config.json from the \"gpt2\" channel\n",
"    tokenizer_name=\"/opt/ml/input/data/gpt2\",  # tokenizer.json from the same channel\n",
"    dataset_name=\"/opt/ml/input/data/input_data\",\n",
"    load_data_from_disk=True,  # read the save_to_disk() output, don't re-download\n",
"    do_train=True, \n",
"    do_eval=True,\n",
"    block_size=512,  # sequence length for LM training\n",
"    per_device_train_batch_size=96,\n",
"    per_device_eval_batch_size=96,\n",
"    num_micro_batches=4,  # presumably pipeline micro-batches for Alpa — confirm in entry script\n",
"    dtype=\"float16\",\n",
"    learning_rate=1e-3, \n",
"    warmup_steps=1000,\n",
"    adam_beta1=0.9, \n",
"    adam_beta2=0.98, \n",
"    weight_decay=0.01,\n",
"    overwrite_output_dir=True,\n",
"    num_train_epochs=2,\n",
"    logging_steps=100,\n",
"    save_steps=2500,\n",
"    eval_steps=2500\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "08485403-0945-4bfa-bd2f-1c316457c0a3",
"metadata": {},
"outputs": [],
"source": [
"# HuggingFace estimator: training code is in ./src, entry point below.\n",
"# 4 x ml.g5.12xlarge instances (multi-GPU) form the training cluster.\n",
"estimator_gpu_alpa_ray = HuggingFace(\n",
"    source_dir = \"src\",\n",
"    entry_point=\"sm_run_clm_flax.py\",\n",
"    pytorch_version=\"1.10\",\n",
"    transformers_version=\"4.17\",\n",
"    subnets=subnets,  # optional VPC placement (None = no VPC)\n",
"    security_group_ids=security_group_ids,\n",
"    role=role,\n",
"    instance_count=4, \n",
"    instance_type=\"ml.g5.12xlarge\", \n",
"    py_version=\"py38\",\n",
"    hyperparameters=hyperparams,\n",
"    disable_profiler=True  # skip SageMaker Debugger profiling overhead\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d13e47d1-81c5-43f4-8101-0caa2788312c",
"metadata": {},
"outputs": [],
"source": [
"# Launch the training job asynchronously (wait=False). Channel names\n",
"# (\"input_data\", \"gpt2\") must match the /opt/ml/input/data/<name> paths\n",
"# referenced in `hyperparams`.\n",
"estimator_gpu_alpa_ray.fit({\"input_data\":s3_data_path, \"gpt2\": config_data_path}, wait=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "005b96a7-aa72-4a11-90f7-0ee153222244",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "conda_pytorch_p38",
"language": "python",
"name": "conda_pytorch_p38"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}