# ==================================================================================
# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
# software and associated documentation files (the "Software"), to deal in the Software
# without restriction, including without limitation the rights to use, copy, modify,
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ==================================================================================
#
# stvblogCreatePollyAudio.py
# by: Rob Dachowski
# For questions or feedback, please contact robdac@amazon.com
#
# Purpose: This code drives the process to create a language specific MP3 audio track based
# on speech synthesized by Polly. The length of each phrase is calculated by looking
# at the length of the synthesized speech and trying to subtract any extraneous silence
# if the synthesized clip is longer than the duration in the SRT.
# Change Log:
# 3/1/2020: Initial version
#
# ==================================================================================
import os
import json
import uuid
from datetime import datetime
import boto3
from botocore.exceptions import ClientError
import ssmparms as sp
import stmparms as stm
import stverrors
from pydub.audio_segment import AudioSegment as AS
from contextlib import closing
import tempfile
# ==================================================================================
# Function: writeAudio
# Purpose: writes the bytes associated with the stream to a binary file
# Parameters:
# output_file - path of the file to write (it is passed to open() in "wb" mode)
# stream - the stream of bytes to write to the output_file
# ==================================================================================
def writeAudio( output_file, stream ):
    """Write every byte read from ``stream`` to the file at ``output_file``.

    Parameters:
        output_file - path of the file to create or overwrite (opened in "wb" mode)
        stream      - object whose ``read()`` returns the bytes to write

    Raises:
        stverrors.stvError - when the file cannot be opened or written
    """
    # Read the whole payload up front so its size can be logged before writing
    audio_bytes = stream.read()
    print( "\t---> Writing ", len(audio_bytes), "bytes to audio file: ", output_file)
    try:
        # Open a file for writing the output as a binary stream
        with open(output_file, "wb") as file:
            file.write(audio_bytes)
    except IOError as e:
        # IOError/OSError carries no botocore-style ``response`` dict, so report its
        # own text instead of e.response[...], which failed inside this handler.
        # NOTE(review): assumes the stverrors module defines stvError - the file
        # imports the module by name only, so the bare name was unresolved. Confirm.
        raise stverrors.stvError("*** Error Code: ", str(e) + " ***") from e
# ==================================================================================
# Function: lambda_handler
# Purpose: This is the "main" code for this lambda function
# Parameters:
# event - the JSON input structure containing the parameters from the step function process
# context - the Lambda context object (only written to the debug log by this function)
# ==================================================================================
def _error_message(error):
    """Return the botocore-style error message of ``error``, else ``str(error)``."""
    details = getattr(error, "response", None) or {}
    try:
        return details["Error"]["Message"]
    except (KeyError, TypeError):
        # Not a ClientError-shaped exception: fall back to its own text
        return str(error)


def _ssml_time_to_ms(value):
    """Convert an SSML time value such as "1.5s" or "1500ms" to milliseconds."""
    text = value.strip()
    # Test "ms" first because it also ends with "s"
    if text.endswith("ms"):
        return float(text[:-2])
    if text.endswith("s"):
        return float(text[:-1]) * 1000
    raise ValueError("Unsupported SSML time value: %r" % (value,))


def _attribute_value(fragment, attribute):
    """Return the double-quoted value that follows ``attribute="`` in ``fragment``."""
    _, marker, remainder = fragment.partition(attribute + '="')
    value, closing_quote, _ = remainder.partition('"')
    if not marker or not closing_quote:
        raise ValueError("Attribute %r not found in SSML fragment: %r" % (attribute, fragment))
    return value


def _parse_ssml_line(line):
    """Split one SSML phrase line into (break ms, max duration ms, prosody SSML).

    NOTE(review): the expected layout is
    <break time="..."/><prosody amazon:max-duration="...">text</prosody>
    which was reconstructed from the surrounding split logic - confirm it against
    the code that generates the SSML file.
    """
    break_part, separator, prosody_part = line.partition("/>")
    if not separator:
        raise ValueError("SSML line has no self-closing break element: %r" % (line,))
    break_ms = _ssml_time_to_ms(_attribute_value(break_part, "time"))
    max_duration_ms = _ssml_time_to_ms(_attribute_value(prosody_part, "amazon:max-duration"))
    return break_ms, max_duration_ms, prosody_part


def _new_temp_mp3():
    """Create an empty temporary .mp3 file and return its path; the caller deletes it."""
    handle, path = tempfile.mkstemp(suffix=".mp3")
    os.close(handle)
    return path


def lambda_handler(event, context):
    """Build the target-language MP3 track from an SSML file and upload it to S3.

    Each non-blank SSML line other than the <speak> wrappers is synthesized with
    Polly. The clip is preceded by silence equal to its break time plus whatever
    part of the max duration the synthesized speech did not use. The concatenated
    track is uploaded to the base bucket and the resulting key / URI are stored on
    the event and written back through stm.update_stm_target.

    Parameters:
        event   - step function input; this function reads
                  event['input']['Outputs']['process']['ProcessName'],
                  event['item']['translate']['targetLanguageShort'],
                  event['item']['polly'] (ssmlKey, createAudio, voiceId) and event['index']
        context - Lambda context object; only written to the debug log

    Returns:
        The same event dict, with pollyOutputURI and pollyOutputKey set under
        event['item']['polly'] ('N/A' for both when createAudio is not 'y').

    Raises:
        stverrors.stvDynamoDBError - the parameters could not be loaded
        stverrors.stvError         - reading the SSML, synthesizing, writing audio,
                                     uploading, or updating the table failed
        ValueError                 - an SSML line does not have the expected layout

    NOTE(review): assumes the stverrors module defines stvError and stvDynamoDBError;
    the file imports the module by name only, so the bare names were unresolved.
    """
    # debugging message
    print("===> CreatePollyAudio: " + "\nEvent:" + str(event) + "\nContext: " + str( context ) )
    print( "\t---> Boto Version: ", boto3.__version__ )

    # Log the ffmpeg version and point pydub at the ffmpeg binary
    os.system('/opt/bin/ffmpeg -version')
    AS.converter = "/opt/bin/ffmpeg"

    # Load the parms from DynamoDB
    process_name = event['input']['Outputs']['process']['ProcessName']
    parms = stm.get_stm_parms( process_name )
    if not parms:
        # We have an issue, so get out
        raise stverrors.stvDynamoDBError( "*** Unable to load parms from DynamoDB ***")

    # set up shortcuts
    pc = parms['Item']['Config']
    pi = parms['Item']['Inputs']
    po = parms['Item']['Outputs']
    ptgts = parms['Item']['Targets']
    tls = event['item']['translate']['targetLanguageShort']
    polly_item = event['item']['polly']

    # set up AWS resources for S3 and Polly
    s3 = boto3.resource('s3')
    polly = boto3.client( 'polly' )

    # set up the S3 bucket
    bucket = s3.Bucket(pc['baseBucketName'])

    # get the SSML key so that we can read in the SSML
    ssmlKey = polly_item['ssmlKey']

    if polly_item['createAudio'] == 'y':
        try:
            # Read in the SSML file
            print("\t---> Reading " + ssmlKey + "\n")
            ssmlBytes = bucket.Object( key=ssmlKey ).get()
        except Exception as e:
            # Something went wrong reading the file
            print( "***Exception***: Issue reading SSML file: ", ssmlKey)
            raise stverrors.stvError("*** Error Code: ", _error_message(e) + " ***") from e

        # Ok, we read in the file, now get the contents as a string
        ssmlContents = str(ssmlBytes['Body'].read(), 'utf-8')

        # Set up where we want the Polly Output to go
        pollyOutputKey = po['process']['uuid'] + '/' + pc['pollyOutput'] + '/' + pi['mediaFile'] + "." + tls

        # Create an empty audio segment to hold the concatenated audio clips
        concatenatedAudio = AS.empty()

        print( "\t---> Process ssml...")

        # Walk through each line, split it into its parts and put the audio together
        for i, line in enumerate(ssmlContents.split( '\n')):
            phrase = line.strip()
            if not phrase:
                # Blank lines (for example after a trailing newline) carry no audio
                continue

            # Ignore the line if it is the open or close
            if phrase in ("<speak>", "</speak>"):
                print( "\t\t---> Open / Close SSML - ", line )
                continue

            # Break time and max duration, both in milliseconds, plus the prosody section
            breakTime, maxDuration, prosody = _parse_ssml_line(phrase)
            print( "\t\t---> Clip ", i, " max duration: ", maxDuration )

            # Put together the ssml we will send to Polly
            ssml = "<speak>" + prosody + "</speak>"
            print( "\t\t---> ssml: ", ssml )

            try:
                # Let's call Polly to get the streamed phrase
                response = polly.synthesize_speech(
                    VoiceId = polly_item['voiceId'],
                    OutputFormat = "mp3",
                    Text = ssml,
                    TextType = "ssml"
                )
            except Exception as e:
                # Something went wrong with Polly
                print( "***Exception***: Issue synthesizing speech" )
                raise stverrors.stvError("*** Error Code: ", _error_message(e) + " ***") from e

            if "AudioStream" not in response:
                # Nothing to append for this clip
                print( "\t\t---> No AudioStream returned for clip ", i )
                continue

            audioFileName = None
            try:
                # Write the Polly audio stream to a temporary file so pydub can load it
                with closing(response["AudioStream"]) as stream:
                    audioFileName = _new_temp_mp3()
                    writeAudio( audioFileName, stream )

                speech = AS.from_file(audioFileName, format="mp3")

                # If the clip is shorter than the max duration, the unused time is added
                # as silence so the track length keeps matching the original video
                speechDuration = speech.duration_seconds * 1000
                clipDurationDifference = maxDuration - speechDuration
                print( "\t\t---> clipDurationDifference (%f) = maxDuration (%f) - speech.duration_seconds (%f)" % (clipDurationDifference, maxDuration, speechDuration) )

                # Silence = break time + unused clip time; clamp at zero so a clip that
                # overruns its slot never requests a negative duration
                silence = AS.silent(duration=max(0, breakTime + clipDurationDifference))
                concatenatedAudio += silence
                print( "\t\t---> silence: ", silence.duration_seconds )
                concatenatedAudio += speech
            except Exception as e:
                # Something went wrong writing the temporary file
                print( "***Exception***: Issue writing temp file" )
                raise stverrors.stvError("*** Error Code: ", _error_message(e) + " ***") from e
            finally:
                # Always clean up the temporary clip, even when the step failed
                if audioFileName and os.path.exists(audioFileName):
                    os.remove(audioFileName)
                    print( "\t\t---> %s successfully deleted" % (audioFileName))

        print( "\t---> SSML Processing complete. concatenatedAudio duration: ", concatenatedAudio.duration_seconds)

        # Compute the key before the try so the error message below can always use it
        audioKey = pollyOutputKey + ".mp3"
        concatendateAudioFileName = None
        try:
            # first export the concatenated audio to a temporary file
            concatendateAudioFileName = _new_temp_mp3()
            concatenatedAudio.export(concatendateAudioFileName, format="mp3")

            print( "\t---> Uploading %s to %s " % (concatendateAudioFileName, audioKey))
            bucket.upload_file(concatendateAudioFileName, audioKey )
            print( "\t---> Upload successful")
        except Exception as e:
            # Something went wrong writing the final file
            print( "***Exception***: Issue writing final audio file to: ", audioKey )
            raise stverrors.stvError("*** Error Code: ", _error_message(e) + " ***") from e
        finally:
            # delete the temporary file whether or not the upload succeeded
            if concatendateAudioFileName and os.path.exists(concatendateAudioFileName):
                print( "\t---> Deleting %s" % (concatendateAudioFileName))
                os.remove(concatendateAudioFileName)

        # Record where the finished audio track was stored
        pollyOutputURI = 's3://' + pc['baseBucketName'] + '/' + audioKey
        pollyOutputKey = audioKey
    else:
        pollyOutputURI = 'N/A'
        pollyOutputKey = 'N/A'

    # Generate the response; it is the incoming event, updated in place
    response = event
    response['item']['polly']['pollyOutputURI'] = pollyOutputURI
    response['item']['polly']['pollyOutputKey'] = pollyOutputKey

    target = ptgts[event['index']]
    target['polly']['pollyOutputURI'] = pollyOutputURI
    target['polly']['pollyOutputKey'] = pollyOutputKey

    # Put the output back into DynamoDB
    if not stm.update_stm_target( process_name, target, event['index'] ):
        raise stverrors.stvError( "*** Error writing to the stvblog table ***" )

    print('===> stvblogCreatePollyAudio Complete')
    return response