import subprocess
import sys
import argparse

from datasets import load_dataset
from transformers import ElectraTokenizer


def install(package):
    """Install a package at runtime (useful inside managed containers)."""
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])


def str2bool(v):
    """Parse boolean CLI flags; argparse's type=bool treats any non-empty string as True."""
    if isinstance(v, bool):
        return v
    if v.lower() in ("yes", "true", "t", "1"):
        return True
    if v.lower() in ("no", "false", "f", "0"):
        return False
    raise argparse.ArgumentTypeError("Boolean value expected.")


def parser_args(train_notebook=False):
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_id", type=str, default="daekeun-ml/koelectra-small-v3-nsmc")
    parser.add_argument("--tokenizer_id", type=str, default="daekeun-ml/koelectra-small-v3-nsmc")
    parser.add_argument("--dataset_name", type=str, default="nsmc")
    parser.add_argument("--small_subset_for_debug", type=str2bool, default=True)
    parser.add_argument("--train_dir", type=str, default="/opt/ml/processing/train")
    parser.add_argument("--validation_dir", type=str, default="/opt/ml/processing/validation")
    parser.add_argument("--test_dir", type=str, default="/opt/ml/processing/test")

    # In a notebook there is no command line, so parse an empty argument
    # list and fall back to the defaults above.
    if train_notebook:
        args = parser.parse_args([])
    else:
        args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = parser_args()

    # Download the tokenizer that matches the pretrained model.
    tokenizer = ElectraTokenizer.from_pretrained(args.tokenizer_id)

    # Tokenizer helper: NSMC stores the review text in the 'document' column.
    def tokenize(batch):
        return tokenizer(batch["document"], padding="max_length", max_length=128, truncation=True)

    # Load the train/test splits of the dataset.
    train_dataset, test_dataset = load_dataset(args.dataset_name, split=["train", "test"])

    # Optionally work on small random subsets to speed up debugging.
    if args.small_subset_for_debug:
        train_dataset = train_dataset.shuffle().select(range(1000))
        test_dataset = test_dataset.shuffle().select(range(1000))

    # Tokenize both splits in batches.
    train_dataset = train_dataset.map(tokenize, batched=True)
    test_dataset = test_dataset.map(tokenize, batched=True)

    # Expose the tokenized columns as PyTorch tensors.
    train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

    # NOTE: these assignments override the CLI/SageMaker defaults above and
    # save to the local working directory instead.
    args.train_dir = "./train"
    args.test_dir = "./test"
    train_dataset.save_to_disk(args.train_dir)
    test_dataset.save_to_disk(args.test_dir)
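
# ----------------------------------------------------------------------
# Usage sketch (not part of the original script; the "./train"/"./test"
# paths assume the local-override defaults above): a downstream training
# step can restore the tokenized, torch-formatted splits saved here with
# datasets.load_from_disk, which also restores the set_format state.
#
#   from datasets import load_from_disk
#
#   train_dataset = load_from_disk("./train")
#   test_dataset = load_from_disk("./test")
#   print(train_dataset[0]["input_ids"].shape)  # torch.Size([128])
# ----------------------------------------------------------------------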