In [None]:
# Install/upgrade the Hugging Face libraries used by this notebook
# (-U upgrades an already-installed package, -q keeps pip's output quiet).
# NOTE(review): versions are not pinned, so a fresh run may pick up newer releases.
%pip install transformers -Uq
%pip install datasets -Uq

In [None]:
from datasets import load_dataset
from tokenizers import trainers, Tokenizer, normalizers, ByteLevelBPETokenizer
from pathlib import Path

## Download Data and Train Tokenizer

In [None]:
# Hugging Face Hub dataset identifier and configuration passed to `load_dataset`.
# NOTE(review): "unshuffled_deduplicated_no" is presumably the Norwegian OSCAR
# subset (the output folder is named "norwegian-gpt2") — confirm.
dataset_name: str = "oscar"
dataset_conf: str = "unshuffled_deduplicated_no"

In [None]:
# Untrained byte-level BPE tokenizer; it is fitted on the corpus further down.
tokenizer = ByteLevelBPETokenizer()

# Full "train" split of the configured corpus, used as tokenizer training text.
dataset = load_dataset(dataset_name, dataset_conf, split="train")

In [None]:
# Local folder (relative to the working directory) that holds tokenizer.json
# and the model config.
config_path: Path = Path("norwegian-gpt2")

In [None]:
# Train the byte-level BPE tokenizer only once: if tokenizer.json is already
# present in the config folder, the expensive pass over the corpus is skipped.
tokenizer_file = config_path / "tokenizer.json"

if tokenizer_file.exists():
    print("Existing tokenizer config detected. Skipping Tokenizer training")
else:
    def batch_iterator(batch_size=1000):
        """Yield the "text" column of `dataset` in chunks of `batch_size` rows."""
        for i in range(0, len(dataset), batch_size):
            yield dataset[i: i + batch_size]["text"]

    # Customized training.
    # vocab_size is the same value that is passed to GPT2Config later in this notebook.
    tokenizer.train_from_iterator(
        batch_iterator(),
        vocab_size=50256,
        min_frequency=2,
        # NOTE(review): the list previously held five identical empty strings,
        # which cannot act as distinct special tokens (most likely the
        # angle-bracket tokens were stripped as markup). Restored to the
        # standard set used by the Hugging Face ByteLevelBPE examples — confirm.
        special_tokens=[
            "<s>",
            "<pad>",
            "</s>",
            "<unk>",
            "<mask>",
        ],
    )

    # Save files to disk.
    # exist_ok=True: the folder may already exist without a tokenizer.json
    # (e.g. the model config was saved first, or a previous run was interrupted);
    # a bare mkdir() would raise FileExistsError in that case.
    config_path.mkdir(parents=True, exist_ok=True)
    tokenizer.save(str(tokenizer_file))

In [None]:
# Carve a validation set out of the single "train" split that the corpus ships with.
validation_split_percentage = 5

# Load the dataset dictionary first, then replace/add its entries with
# percent-sliced views of "train".
# NOTE: this rebinds `dataset` from the plain train split used for tokenizer
# training to a dictionary of splits.
dataset = load_dataset(dataset_name, dataset_conf)

split_slices = {
    # first N percent -> validation
    "validation": f"train[:{validation_split_percentage}%]",
    # remaining (100 - N) percent -> train
    "train": f"train[{validation_split_percentage}%:]",
}
for split_key, split_slice in split_slices.items():
    dataset[split_key] = load_dataset(dataset_name, dataset_conf, split=split_slice)

In [None]:
# Persist the train/validation splits to the local "train-data" folder; this
# folder is what gets uploaded to S3 further down.
dataset.save_to_disk("train-data")

## Configure Model

In [None]:
from transformers import GPT2Config

# Start from the stock "gpt2" configuration, switch every dropout off and set
# the vocabulary size to the value the tokenizer was trained with.
config = GPT2Config.from_pretrained(
    "gpt2",
    vocab_size=50256,
    resid_pdrop=0.0,
    embd_pdrop=0.0,
    attn_pdrop=0.0,
)

# Store the config in the same folder as tokenizer.json.
config.save_pretrained("./norwegian-gpt2")

## Launch SageMaker Job

In [None]:
import sagemaker
from sagemaker.huggingface import HuggingFace
import shutil

In [None]:
# S3 key prefix under which this experiment's config and data are uploaded.
key_prefix = "alpa_ray_lm"

# Execution role of the current SageMaker environment; handed to the estimator.
role = sagemaker.get_execution_role()

# SageMaker session and the default bucket it resolves for this account/region.
sess = sagemaker.Session()
bucket = sess.default_bucket()

In [None]:
# Optional VPC settings for the training job; None leaves both unset.
# Replace with lists of subnet / security-group IDs to run inside a VPC.
subnets = security_group_ids = None

In [None]:
# Upload the tokenizer + model config folder; the returned S3 location is
# passed to the training job as an input channel.
config_data_path = sess.upload_data(
    path="norwegian-gpt2",
    bucket=bucket,
    key_prefix=f"{key_prefix}/config/norwegian-gpt2",
)

# Upload the dataset that was saved to disk earlier.
s3_data_path = sess.upload_data(
    path="train-data",
    bucket=bucket,
    key_prefix=f"{key_prefix}/data/oscar",
)

In [None]:
# Hyperparameters handed to the training entry point.
hyperparams = {
    # --- paths inside the training container ---
    "output_dir": "/opt/ml/model",
    "model_type": "gpt2",
    "config_name": "/opt/ml/input/data/gpt2",
    "tokenizer_name": "/opt/ml/input/data/gpt2",
    "dataset_name": "/opt/ml/input/data/input_data",
    "load_data_from_disk": True,
    # --- what to run ---
    "do_train": True,
    "do_eval": True,
    # --- batching / precision ---
    "block_size": 512,
    "per_device_train_batch_size": 96,
    "per_device_eval_batch_size": 96,
    "num_micro_batches": 4,
    "dtype": "float16",
    # --- optimizer ---
    "learning_rate": 1e-3,
    "warmup_steps": 1000,
    "adam_beta1": 0.9,
    "adam_beta2": 0.98,
    "weight_decay": 0.01,
    # --- schedule, logging and checkpointing ---
    "overwrite_output_dir": True,
    "num_train_epochs": 2,
    "logging_steps": 100,
    "save_steps": 2500,
    "eval_steps": 2500,
}

In [None]:
# Estimator describing the multi-node GPU training job.
estimator_gpu_alpa_ray = HuggingFace(
    # Training code: script inside the local "src" folder, plus its arguments.
    entry_point="sm_run_clm_flax.py",
    source_dir="src",
    hyperparameters=hyperparams,
    # Framework / Python versions requested for the training container.
    transformers_version="4.17",
    pytorch_version="1.10",
    py_version="py38",
    # Cluster shape.
    instance_type="ml.g5.12xlarge",
    instance_count=4,
    # Permissions and (optional) VPC networking.
    role=role,
    subnets=subnets,
    security_group_ids=security_group_ids,
    # Profiler is turned off for this job.
    disable_profiler=True,
)

In [None]:
# Submit the training job and return immediately (wait=False) instead of
# streaming logs into the notebook. The channel names ("input_data", "gpt2")
# correspond to the /opt/ml/input/data/<name> paths used in `hyperparams`.
estimator_gpu_alpa_ray.fit(
    inputs={
        "input_data": s3_data_path,
        "gpt2": config_data_path,
    },
    wait=False,
)