In [None]:
# Environment setup: intended for an ml.m5.2xlarge instance with the Data Science kernel.
# Restart the kernel after the installation finishes.
!conda install pytorch torchvision torchaudio -c pytorch -y
!conda install -c conda-forge tensorboard -y
!conda install -c conda-forge tqdm -y

In [None]:
# Install Git LFS (required by the Hugging Face model).
!apt update
!apt install gnupg -y
# Run the packagecloud setup script for the github/git-lfs package repository.
!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash
!apt install -y git-lfs
!git lfs install

In [None]:
# Check that PyTorch was installed successfully and, on a GPU instance, that CUDA is enabled.
import torch

print(f"torch version: {torch.__version__}")
# Guard the device query: get_device_name(0) raises when no CUDA device is present
# (for example on a CPU-only instance).
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA is not available; running on CPU.")


In [None]:
# ffmpeg 4 is required for PyTorch to process mp3 files.
# NOTE(review): the PPA below is a third-party package source — confirm it is still
# available for the image in use.
!add-apt-repository -y ppa:jonathonf/ffmpeg-4
!apt install -y ffmpeg


In [None]:
# Install transformers and the other Python dependencies.
# Version specifiers are quoted on purpose: unquoted, the shell treats ">" as an
# output redirection, so `pip install datasets>=2.6.1` would install an
# unconstrained "datasets" and write pip's output to a file named "=2.6.1".
!pip install --upgrade pip
!pip install "datasets>=2.6.1"
!pip install git+https://github.com/huggingface/transformers
!pip install librosa
!pip install "evaluate>=0.3.0"
!pip install jiwer
!pip install gradio


In [None]:
# Download the zh-TW configuration of the Common Voice 11.0 dataset.
from datasets import load_dataset, DatasetDict

# Empty container; the individual splits are attached to it in the next cell.
common_voice = DatasetDict()

# A list of split names is requested, and the result is unpacked in that same order.
train_dataset, validation_dataset, test_dataset = load_dataset(
    "mozilla-foundation/common_voice_11_0",
    "zh-TW",
    split=["train", "validation", "test"],
    use_auth_token=False,
)

print(train_dataset)


In [None]:
# Attach each downloaded split to the DatasetDict under its split name.
common_voice.update(
    train=train_dataset,
    validation=validation_dataset,
    test=test_dataset,
)
print(common_voice)


In [None]:
# Save the unprocessed dataset to disk so that it can be restored later with
# load_from_disk (see the next cell).
common_voice.save_to_disk("zhtw-common-voice-original")

In [None]:
# Reload the saved dataset and remove the fields that are not needed for training.
from datasets import load_from_disk, DatasetDict

unused_columns = [
    "accent",
    "age",
    "client_id",
    "down_votes",
    "gender",
    "locale",
    "path",
    "segment",
    "up_votes",
]
common_voice = load_from_disk("zhtw-common-voice-original").remove_columns(unused_columns)
common_voice

In [None]:
# Start from the pretrained "openai/whisper-small" checkpoint.
from transformers import WhisperProcessor

processor = WhisperProcessor.from_pretrained(
    "openai/whisper-small",
    language="Chinese",
    task="transcribe",
)
# Keep a direct handle on the processor's tokenizer; later cells use it by name.
tokenizer = processor.tokenizer

In [None]:
# Sanity check: encode the first training sentence and decode it again, then
# compare the decoded text (without special tokens) to the original.
input_str = common_voice["train"][0]["sentence"]
labels = tokenizer(input_str).input_ids

# Decode the same ids twice: once dropping the special tokens, once keeping them.
decoded_str = tokenizer.decode(labels, skip_special_tokens=True)
decoded_with_special = tokenizer.decode(labels, skip_special_tokens=False)

report_lines = (
    f"Input:                 {input_str}",
    f"Decoded w/ special:    {decoded_with_special}",
    f"Decoded w/out special: {decoded_str}",
    f"Are equal:             {input_str == decoded_str}",
)
print("\n".join(report_lines))


In [None]:
# Print the first training example to inspect its fields.
print(common_voice["train"][0])

In [None]:
# Cast the "audio" column to a 16 kHz sampling rate, then print the first
# training example again to inspect the result.
from datasets import Audio

audio_16khz = Audio(sampling_rate=16000)
common_voice = common_voice.cast_column("audio", audio_16khz)
print(common_voice["train"][0])


In [None]:
# Define the feature-extraction function that is mapped over the dataset.
def prepare_dataset(batch):
    """Turn one dataset example into model inputs and labels.

    Reads ``batch["audio"]`` and ``batch["sentence"]`` and stores two new keys
    on the same mapping:

    * ``input_features`` -- first entry of the features that
      ``processor.feature_extractor`` computes from the audio array.
    * ``labels`` -- token ids produced by ``tokenizer`` for the sentence.

    Relies on the notebook-level ``processor`` and ``tokenizer`` objects.

    Returns:
        The same ``batch`` object, with the two keys added.
    """
    # audio sample (the original notes it is loaded and resampled from 48 to 16kHz)
    sample = batch["audio"]

    # compute log-Mel input features from the audio array; the extractor output
    # is indexed with [0] to take the entry for this single example
    extracted = processor.feature_extractor(
        sample["array"],
        sampling_rate=sample["sampling_rate"],
    )
    batch["input_features"] = extracted.input_features[0]

    # encode the target text into label ids
    encoded = tokenizer(batch["sentence"])
    batch["labels"] = encoded.input_ids
    return batch


In [None]:
# Apply prepare_dataset to the whole dataset. Give it some time, even after all
# progress lines have turned green.
# If the process fails, try deleting the cache-xxxx arrow files under
# zhtw-common-voice-original/train (or under ~/.cache) and restart the kernel.
common_voice = common_voice.map(
    prepare_dataset,
    # remove every column that the train split had before mapping
    remove_columns=common_voice.column_names["train"],
    num_proc=8,
)


In [None]:
# Save the processed dataset (the result of the map step) to disk.
common_voice.save_to_disk("zhtw-common-voice-processed")

In [None]:
# Display a summary of the processed dataset.
common_voice

In [None]:
# Build and display the S3 URI, under the session's default bucket, for the
# processed dataset.
import boto3
import sagemaker
import os
from sagemaker import get_execution_role

sess = sagemaker.Session()
ROLE = get_execution_role()

BUCKET = sess.default_bucket()
PREFIX = "whisper/data/zhtw-common-voice-processed"
# S3 URIs always use "/" as the separator, so the string is formatted directly
# instead of going through os.path.join, whose separator depends on the host OS.
s3uri = f"s3://{BUCKET}/{PREFIX}"
s3uri

In [None]:
# Use the AWS S3 CLI to upload the processed dataset. You could also use the
# boto3 Python SDK for the upload.
# The destination is the `s3uri` variable computed in the previous cell; IPython
# substitutes it into the shell command. Assign a different value to `s3uri`
# first if you want to upload somewhere else.
!aws s3 cp --recursive zhtw-common-voice-processed {s3uri}