# ==================================================================================
# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
# software and associated documentation files (the "Software"), to deal in the Software
# without restriction, including without limitation the rights to use, copy, modify,
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# ==================================================================================
#
# stvblogCreatePollyAudio.py
# by: Rob Dachowski
# For questions or feedback, please contact robdac@amazon.com
#
# Purpose: This code drives the process to create a language specific MP3 audio track based
#          on speech synthesized by Polly. The length of each phrase is calculated by looking
#          at the length of the synthesized speech and trying to subtract any extraneous silence
#          if the synthesized clip is longer than the duration in the SRT.
# Change Log:
#          3/1/2020: Initial version
#
# ==================================================================================

import json
import os
import re
import tempfile
import uuid
from contextlib import closing
from datetime import datetime

import boto3
from botocore.exceptions import ClientError
from pydub.audio_segment import AudioSegment as AS

import ssmparms as sp
import stmparms as stm
import stverrors

# Wrapper lines that open and close the SSML document. They carry no speech and
# are re-added around every phrase sent to Polly.
# NOTE(review): these literals were reconstructed -- the markup had been stripped
# from the source as received. Confirm against the lambda that writes the SSML.
_SPEAK_OPEN = "<speak>"
_SPEAK_CLOSE = "</speak>"

# First number in a fragment, with an optional "s" / "ms" unit suffix.
_DURATION_RE = re.compile(r'(\d*\.?\d+)\s*(ms|s)?')


def _error_message(exc):
    """Return the best available human-readable message for *exc*.

    Exceptions that carry a botocore-style ``response`` dict yield
    ``response['Error']['Message']``; anything else (IOError, ValueError, ...)
    has no such attribute, so fall back to ``str(exc)``.
    """
    response = getattr(exc, "response", None)
    try:
        return response["Error"]["Message"]
    except (TypeError, KeyError):
        return str(exc)


def _stv_error(exc):
    """Build the stvError used throughout this module to report *exc*."""
    # NOTE(review): assumes the stverrors module exports stvError (and
    # stvDynamoDBError, used in lambda_handler) -- confirm.
    return stverrors.stvError("*** Error Code: ", _error_message(exc) + " ***")


def _new_temp_mp3_path():
    """Create an empty ``.mp3`` temp file and return its path.

    mkstemp reserves the name on disk, unlike closing a NamedTemporaryFile
    (which deletes the file and leaves the name free for reuse).
    """
    fd, path = tempfile.mkstemp(suffix='.mp3')
    os.close(fd)
    return path


def _remove_temp_file(path):
    """Delete *path* if it was created and still exists."""
    if path and os.path.exists(path):
        os.remove(path)
        print( "\t\t---> %s successfully deleted" % (path))


def _duration_ms(fragment):
    """Return the first duration found in *fragment*, in milliseconds.

    A bare number or one suffixed with ``s`` is read as seconds; ``ms`` is
    taken as-is. Raises ValueError when the fragment holds no number.
    """
    match = _DURATION_RE.search(fragment)
    if match is None:
        raise ValueError("No duration found in SSML fragment: " + repr(fragment))
    value = float(match.group(1))
    return value if match.group(2) == "ms" else value * 1000


def _parse_ssml_line(line):
    """Split one SSML phrase line into (breakTime, maxDuration, prosody).

    The line is cut at the first "/>": the left part holds the break time,
    the right part is the prosody element whose opening tag holds the max
    duration. Both durations are returned in milliseconds.
    Raises ValueError when the line does not have that shape.
    """
    # NOTE(review): the original literal split strings were lost from the
    # source as received; durations are located by pattern instead. Presumably
    # lines look like '<break .../><prosody ...max-duration="Ns">text</prosody>'.
    breakPart, separator, prosodyPart = line.partition("/>")
    if not separator or not prosodyPart:
        raise ValueError("SSML line has no '/>' between break and prosody: " + repr(line))

    breakTime = _duration_ms(breakPart)
    # Only look inside the opening tag so digits in the spoken text are ignored.
    maxDuration = _duration_ms(prosodyPart.partition(">")[0])
    return breakTime, maxDuration, prosodyPart


# ==================================================================================
# Function: writeAudio
# Purpose: writes the bytes associated with the stream to a binary file
# Parameters:
#                 output_file - path of the file to write
#                 stream - the stream of bytes to write to the output_file
# Raises:
#                 stvError when the file cannot be written
# ==================================================================================
def writeAudio( output_file, stream ):
    audioBytes = stream.read()

    print( "\t---> Writing ", len(audioBytes), "bytes to audio file: ", output_file)
    try:
        # Open a file for writing the output as a binary stream
        with open(output_file, "wb") as file:
            file.write(audioBytes)
    except IOError as e:
        # Could not write to file, exit gracefully
        raise _stv_error(e) from e


def _read_ssml(bucket, ssmlKey):
    """Fetch the object *ssmlKey* from *bucket* and return it decoded as UTF-8."""
    try:
        print("\t---> Reading " + ssmlKey + "\n")
        ssmlBytes = bucket.Object( key=ssmlKey ).get()
        return str(ssmlBytes['Body'].read(), 'utf-8')
    except Exception as e:
        # Something went wrong reading the file
        print( "***Exception***: Issue reading SSML file: ", ssmlKey)
        raise _stv_error(e) from e


def _synthesize_clip(polly, voiceId, ssml):
    """Synthesize *ssml* with Polly and return it as an AudioSegment.

    Returns None when the Polly response carries no "AudioStream". The MP3 is
    staged through a temp file, which is removed whether or not loading works.
    """
    try:
        response = polly.synthesize_speech(
            VoiceId = voiceId,
            OutputFormat = "mp3",
            Text = ssml,
            TextType = "ssml" )
    except Exception as e:
        # Something went wrong with Polly
        print( "***Exception***: Issue synthesizing speech" )
        raise _stv_error(e) from e

    if "AudioStream" not in response:
        print( "\t\t---> No AudioStream in Polly response; clip skipped" )
        return None

    audioFileName = None
    try:
        audioFileName = _new_temp_mp3_path()
        with closing(response["AudioStream"]) as stream:
            writeAudio( audioFileName, stream )
        return AS.from_file(audioFileName, format="mp3")
    except Exception as e:
        # Something went wrong writing the temporary file
        print( "***Exception***: Issue writing temp file" )
        raise _stv_error(e) from e
    finally:
        _remove_temp_file(audioFileName)


def _build_audio_track(polly, voiceId, ssmlContents):
    """Turn the SSML text into one AudioSegment of silence + speech per phrase."""
    concatenatedAudio = AS.empty()

    print( "\t---> Process ssml...")
    for i, rawLine in enumerate(ssmlContents.split( '\n')):
        # Tolerate CRLF line endings and a trailing newline at end of file
        line = rawLine.strip()
        if not line:
            continue

        # Ignore the line if it is the open or close
        if line in (_SPEAK_OPEN, _SPEAK_CLOSE):
            print( "\t\t---> Open / Close SSML - ", line )
            continue

        try:
            breakTime, maxDuration, prosody = _parse_ssml_line(line)
        except ValueError as e:
            print( "***Exception***: Issue parsing SSML line ", i )
            raise _stv_error(e) from e

        print( "\t\t---> Clip ", i, " break time: ", breakTime )
        print( "\t\t---> Clip ", i, " max duration: ", maxDuration )

        # Put together the ssml we will send to Polly
        ssml = _SPEAK_OPEN + prosody + _SPEAK_CLOSE
        print( "\t\t---> ssml: ", ssml )

        speech = _synthesize_clip(polly, voiceId, ssml)
        if speech is None:
            continue

        # Compare the clip with its allotted time. A shorter clip gets extra
        # leading silence; a longer one eats into the break time.
        speechMs = speech.duration_seconds * 1000
        clipDurationDifference = maxDuration - speechMs
        print( "\t\t---> clipDurationDifference (%f) = maxDuration (%f) - speech.duration_seconds (%f)" % (clipDurationDifference, maxDuration, speechMs) )

        # Silence can never be shorter than nothing, so clamp at zero
        silence = AS.silent(duration=max(0, breakTime + clipDurationDifference) )
        concatenatedAudio += silence
        print( "\t\t---> silence: ", silence.duration_seconds )
        concatenatedAudio += speech

    return concatenatedAudio


def _upload_audio(bucket, audio, audioKey):
    """Export *audio* as MP3 to a temp file and upload it to *bucket* as *audioKey*."""
    concatenatedAudioFileName = None
    try:
        concatenatedAudioFileName = _new_temp_mp3_path()
        audio.export(concatenatedAudioFileName, format="mp3")

        print( "\t---> Uploading %s to %s " % (concatenatedAudioFileName, audioKey))
        bucket.upload_file(concatenatedAudioFileName, audioKey )
        print( "\t---> Upload successful")
    except Exception as e:
        # Something went wrong writing the final file
        print( "***Exception***: Issue writing final audio file to: ", audioKey )
        raise _stv_error(e) from e
    finally:
        # delete the temporary file whether or not the upload worked
        print( "\t---> Deleting %s" % (concatenatedAudioFileName))
        _remove_temp_file(concatenatedAudioFileName)


# ==================================================================================
# Function: lambda_handler
# Purpose: This is the "main" code for this lambda function
# Parameters:
#               event - the JSON input structure containing the parameters from the step function process
#               context - the Lambda context object (only logged)
# Returns:
#               the event, with item.polly.pollyOutputURI / pollyOutputKey filled in
#               ('N/A' for both when item.polly.createAudio is not 'y')
# Raises:
#               stvDynamoDBError when the parms cannot be loaded
#               stvError on any S3, Polly, temp-file or table-update failure
# ==================================================================================
def lambda_handler(event, context):

    #debugging message
    print("===> CreatePollyAudio: " + "\nEvent:" + str(event) + "\nContext: " + str( context ) )
    print( "\t---> Boto Version: ", boto3.__version__ )

    # Run ffmpeg once so its version banner lands in the log, then point pydub at it
    os.system('/opt/bin/ffmpeg -version')
    AS.converter = "/opt/bin/ffmpeg"

    # Load the parms from DynamoDB
    processName = event['input']['Outputs']['process']['ProcessName']
    parms = stm.get_stm_parms( processName )
    if not parms:
        # We have an issue, so get out
        raise stverrors.stvDynamoDBError( "*** Unable to load parms from DynamoDB ***")

    # set up shortcuts
    pc = parms['Item']['Config']
    pi = parms['Item']['Inputs']
    po = parms['Item']['Outputs']
    ptgts = parms['Item']['Targets']
    tls = event['item']['translate']['targetLanguageShort']

    # set up AWS resources for S3 and Polly
    s3 = boto3.resource('s3')
    polly = boto3.client( 'polly' )
    bucket = s3.Bucket(pc['baseBucketName'])

    # get the SSML key so that we can read in the SSML
    ssmlKey = event['item']['polly']['ssmlKey']

    if event['item']['polly']['createAudio'] == 'y':
        ssmlContents = _read_ssml(bucket, ssmlKey)

        # Set up where we want the Polly Output to go
        pollyOutputKey = po['process']['uuid'] + '/' + pc['pollyOutput'] + '/' + pi['mediaFile'] + "." + tls

        concatenatedAudio = _build_audio_track(polly, event['item']['polly']['voiceId'], ssmlContents)
        print( "\t---> SSML Processing complete. concatenatedAudio duration: ", concatenatedAudio.duration_seconds)

        # Named before the upload so the error path can always report it
        audioKey = pollyOutputKey + ".mp3"
        _upload_audio(bucket, concatenatedAudio, audioKey)

        pollyOutputURI = 's3://' + pc['baseBucketName'] + '/' + audioKey
        pollyOutputKey = audioKey
    else:
        pollyOutputURI = 'N/A'
        pollyOutputKey = 'N/A'

    # Generate and return the response
    response = event
    response['item']['polly']['pollyOutputURI'] = pollyOutputURI
    response['item']['polly']['pollyOutputKey'] = pollyOutputKey

    target = ptgts[event['index']]
    target['polly']['pollyOutputURI'] = pollyOutputURI
    target['polly']['pollyOutputKey'] = pollyOutputKey

    # Put the output back into DynamoDB
    if stm.update_stm_target( processName, target, event['index'] ):
        print('===> stvblogCreatePollyAudio Complete')
        return response

    raise stverrors.stvError( "*** Error writing to the stvblog table ***" )