import datasets
from datasets import load_dataset

assert float(datasets.__version__[:3]) >= 1.8, "`datasets 1.8.0` or higher needs to be installed to generate the dataset"

"""
This script creates a sample dataset for the workshop using the `datasets` library and the
"amazon_us_reviews" dataset. You can configure which dataset split should be used; by default
it is the Apparel split. You can also configure the size of the generated dataset.
The script creates two JSON files, one for training and one for testing, which need to be
uploaded to S3 for the workshop.
"""

# configuration
dataset_name = "amazon_us_reviews"
dataset_split = "Apparel_v1_00"
train_dataset_length = 35_000
test_split_size = 0.15

# load dataset using the datasets library, using the Apparel split.
# full information can be found here: https://huggingface.co/datasets/amazon_us_reviews
dataset = load_dataset(dataset_name, dataset_split)

# since there is only a "train" split, assign it as the dataset
dataset = dataset["train"]
print(f"total dataset contains: {len(dataset)} rows")

# remove unnecessary columns from the dataset
remove_columns = [
    "marketplace",
    "customer_id",
    "review_id",
    "product_id",
    "product_title",
    "product_category",
    "helpful_votes",
    "total_votes",
    "product_parent",
    "vine",
    "verified_purchase",
    "review_headline",
    "review_date",
]
dataset = dataset.remove_columns(remove_columns)

# rename columns to match the schema
dataset = dataset.rename_column("review_body", "review")
dataset = dataset.rename_column("star_rating", "label")
print(f"The dataset features are now {list(dataset.features.keys())}")

# shuffle dataset and select train_dataset_length samples
sampled_dataset = dataset.shuffle().select(range(train_dataset_length))
print(f"sampled dataset contains: {len(sampled_dataset)} rows")


# change label index from 1..5 to 0..4 to work with AutoModelForSequenceClassification,
# which expects labels to start at 0
def index_label(example):
    example["label"] = example["label"] - 1
    return example


sampled_dataset = sampled_dataset.map(index_label)

# split the sampled dataset into train and test splits
processed_dataset_dict = sampled_dataset.train_test_split(test_size=test_split_size)
print(f"train dataset contains: {len(processed_dataset_dict['train'])} rows")
print(f"test dataset contains: {len(processed_dataset_dict['test'])} rows")

# save datasets as JSON for uploading to S3
processed_dataset_dict["train"].to_json(f"../data/{dataset_name}_{dataset_split.lower()}_train.json")
processed_dataset_dict["test"].to_json(f"../data/{dataset_name}_{dataset_split.lower()}_test.json")
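
# Optional: a minimal sketch of uploading the generated JSON files to S3 with boto3.
# This step is not part of the script itself and is left commented out; the bucket name
# "my-workshop-bucket" and the "data/" key prefix are placeholders, and it assumes AWS
# credentials are already configured in the environment.
#
# import boto3
#
# s3 = boto3.client("s3")
# for split in ["train", "test"]:
#     local_path = f"../data/{dataset_name}_{dataset_split.lower()}_{split}.json"
#     s3.upload_file(local_path, "my-workshop-bucket", f"data/{dataset_name}_{split}.json")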