import os
import json
import glob
import tarfile
import argparse
import subprocess
from distutils.dir_util import copy_tree
class preprocess():
    """Prepare the AN4 speech dataset inside a processing job.

    The pipeline (see ``execution``) works on ``<proc_prefix>/input``:
    it extracts ``an4_sphere.tar.gz``, converts every ``.sph`` file to
    ``.wav`` with the ``sox`` command-line tool, writes JSON-lines train and
    test manifests, deletes the ``.sph`` files and the archive, and finally
    copies the ``an4`` tree to ``<proc_prefix>/output``.
    """

    def __init__(self, args):
        """Store the arguments and make sure the input/output dirs exist.

        Args:
            args: Namespace-like object exposing ``proc_prefix``.
                ``execution`` additionally reads ``train_mount_dir`` and
                ``test_mount_dir`` from it.
        """
        self.args = args
        self.proc_prefix = self.args.proc_prefix  # e.g. '/opt/ml/processing'
        self.input_dir = os.path.join(self.proc_prefix, "input")
        self.output_dir = os.path.join(self.proc_prefix, "output")
        os.makedirs(self.input_dir, exist_ok=True)
        os.makedirs(self.output_dir, exist_ok=True)

    def _sph_to_wav(self, data_dir):
        """Extract the AN4 archive (if needed) and convert ``.sph`` to ``.wav``.

        Args:
            data_dir: Directory holding ``an4_sphere.tar.gz``; the archive is
                extracted here unless ``<data_dir>/an4/`` already exists.
        """
        an4_path = os.path.join(data_dir, "an4_sphere.tar.gz")
        if not os.path.exists(data_dir + "/an4/"):
            # Context manager so the archive handle is always closed.
            with tarfile.open(an4_path) as tar:
                tar.extractall(path=data_dir)

        print("Converting .sph to .wav...")
        sph_list = glob.glob(data_dir + '/an4/**/*.sph', recursive=True)
        for sph_path in sph_list:
            wav_path = sph_path[:-4] + '.wav'
            cmd = ["sox", sph_path, wav_path]
            result = subprocess.run(cmd)
            if result.returncode != 0:
                # Surface failed conversions instead of ignoring them; keep
                # going so one bad file does not abort the whole run.
                print("Warning: sox exited with code {} for {}".format(
                    result.returncode, sph_path))
        print("Finished conversion.\n******")

    @staticmethod
    def _parse_transcript_line(line):
        """Split one transcription line into ``(transcript, file_id)``.

        Lines look like ``<s> transcript </s> (fileID)``; the sentence
        markers are optional.

        Args:
            line: Raw line from a transcription file, with or without a
                trailing newline.

        Returns:
            ``(transcript, file_id)`` with the transcript lower-cased and
            stripped of sentence markers, or ``None`` for a blank line.

        Raises:
            ValueError: If the line has no parenthesised file id.
        """
        line = line.strip()
        if not line:
            return None
        open_idx = line.find('(')
        close_idx = line.rfind(')')
        if open_idx == -1 or close_idx < open_idx:
            raise ValueError("Malformed transcription line: {!r}".format(line))
        transcript = line[:open_idx].lower()
        # Drop the sentence start/end markers around the utterance.
        transcript = transcript.replace('<s>', '').replace('</s>', '')
        transcript = transcript.strip()
        # Locate the closing parenthesis explicitly rather than assuming the
        # line ends in ")\n", so a final line without a newline still parses.
        file_id = line[open_idx + 1:close_idx]  # e.g. "cen4-fash-b"
        return transcript, file_id

    # Function to build a manifest
    def _build_manifest(self, transcripts_path, manifest_path, data_dir, mount_dir, wav_path):
        """Write a JSON-lines manifest for every utterance in a transcript file.

        Each output line is a JSON object with the keys ``audio_filepath``,
        ``duration`` and ``text``.

        Args:
            transcripts_path: Path of the transcription file to read.
            manifest_path: Path of the manifest file to write (overwritten).
            data_dir: Directory under which the audio files actually live;
                used to measure durations.
            mount_dir: Directory prefix recorded in ``audio_filepath``; this
                is useful when mounting the dataset elsewhere. Falls back to
                ``data_dir`` when falsy.
            wav_path: Path of the wav directory relative to ``data_dir``.

        Raises:
            ValueError: If a non-blank line has no parenthesised file id.
        """
        # create manifest with reference to this directory. This is useful when mounting the dataset.
        mount_dir = mount_dir if mount_dir else data_dir

        # import sox here to not require sox to be available for importing all utils.
        # Done before opening the manifest so a missing package does not
        # leave an empty manifest behind.
        import sox

        with open(transcripts_path, 'r') as fin, open(manifest_path, 'w') as fout:
            for line in fin:
                parsed = self._parse_transcript_line(line)
                if parsed is None:
                    continue  # blank line
                transcript, file_id = parsed

                # The middle part of the id names the sub-directory,
                # e.g. "cen4-fash-b" -> "fash".
                speaker = file_id[file_id.find('-') + 1:file_id.rfind('-')]
                audio_path = os.path.join(data_dir, wav_path, speaker, file_id + '.wav')
                mounted_audio_path = os.path.join(mount_dir, wav_path, speaker, file_id + '.wav')

                duration = sox.file_info.duration(audio_path)

                # Write the metadata to the manifest
                metadata = {"audio_filepath": mounted_audio_path, "duration": duration, "text": transcript}
                json.dump(metadata, fout)
                fout.write('\n')

    def _make_manifest(self, data_dir, train_mount_dir, test_mount_dir):
        """Create the train and test manifests unless they already exist.

        Args:
            data_dir: Directory containing the extracted ``an4`` tree.
            train_mount_dir: Path prefix recorded in the training manifest.
            test_mount_dir: Path prefix recorded in the test manifest.
        """
        # Building Manifests
        print("******")

        train_transcripts = data_dir + '/an4/etc/an4_train.transcription'
        train_manifest = data_dir + '/an4/train_manifest.json'
        if not os.path.isfile(train_manifest):
            self._build_manifest(train_transcripts, train_manifest, data_dir, train_mount_dir, 'an4/wav/an4_clstk')
            print("Training manifest created.")

        test_transcripts = data_dir + '/an4/etc/an4_test.transcription'
        test_manifest = data_dir + '/an4/test_manifest.json'
        if not os.path.isfile(test_manifest):
            self._build_manifest(test_transcripts, test_manifest, data_dir, test_mount_dir, 'an4/wav/an4test_clstk')
            print("Test manifest created.")

        print("***Done***")

    def _delete_sph(self, data_dir):
        """Remove every ``.sph`` file under ``data_dir`` and the source archive.

        Args:
            data_dir: Directory tree to clean.
        """
        for root, _dirs, files in os.walk(data_dir):
            for file_name in files:
                if file_name.endswith('.sph'):
                    os.remove(os.path.join(root, file_name))

        # The archive may be absent (e.g. the data was already extracted and
        # the tarball was never staged); that is not an error for cleanup.
        archive = os.path.join(data_dir, "an4_sphere.tar.gz")
        if os.path.isfile(archive):
            os.remove(archive)

    def execution(self, ):
        """Run the pipeline: convert, build manifests, clean up, copy output."""
        self._sph_to_wav(
            data_dir=self.input_dir
        )
        self._make_manifest(
            data_dir=self.input_dir,
            train_mount_dir=self.args.train_mount_dir,
            test_mount_dir=self.args.test_mount_dir
        )
        self._delete_sph(
            data_dir=self.input_dir
        )

        # NOTE(review): copy_tree comes from distutils, which was removed in
        # Python 3.12; shutil.copytree(..., dirs_exist_ok=True) is the
        # replacement once Python >= 3.8 can be assumed.
        copy_tree(os.path.join(self.input_dir, "an4"), os.path.join(self.output_dir, "an4"))

        print ("data_dir", os.listdir(self.input_dir))
        print ("self.output_dir", os.listdir(self.output_dir))
if __name__ == "__main__":
    # Script entry point: read the directory options from the command line,
    # then run the full preprocessing pipeline.
    parser = argparse.ArgumentParser()
    # Every option is a string flag; register them from a (flag, default) table.
    for flag, fallback in (
        ("--proc_prefix", "/opt/ml/processing"),
        ("--train_mount_dir", "train_mount_dir"),
        ("--test_mount_dir", "test_mount_dir"),
    ):
        parser.add_argument(flag, type=str, default=fallback)

    # parse_known_args returns (namespace, leftovers); unrecognised
    # arguments are discarded rather than rejected.
    args, _ = parser.parse_known_args()
    print("Received arguments {}".format(args))

    prep = preprocess(args)
    prep.execution()