###
# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this
# software and associated documentation files (the "Software"), to deal in the Software
# without restriction, including without limitation the rights to use, copy, modify,
# merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
# SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# Copyright Amazon.com, Inc. and its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT
######

import json
import os
import re
import time
import urllib
from urllib.parse import unquote_plus

import boto3

from awsUtils import readTextFileFromS3, split_s3_path, insertParallelData, writeTextFileToS3

translate = boto3.client(service_name='translate', region_name='us-east-1', use_ssl=True)
a2i = boto3.client('sagemaker-a2i-runtime')

# First <t>...</t> span in a string; group(1) is the text between the tags.
# Compiled once at import time instead of on every reviewed row.
_TAG_PATTERN = re.compile(r"<t>(.*?)</t>")


def _strip_tags(text):
    """Return *text* with every literal ``<t>`` and ``</t>`` marker removed."""
    return text.replace('<t>', '').replace('</t>', '')


def _build_parallel_data_item(baseItem, inputContent, answerContent, index):
    """Build the parallel-data record for reviewed row *index* (1-based).

    When BOTH the original text and the edited translation contain a
    ``<t>...</t>`` span, only the tagged phrases are stored. Otherwise the
    whole original text (from ``inputContent['translationPairs']``) and the
    whole edited translation are stored.

    A new dict is returned on every call so that records handed to
    ``insertParallelData`` never share mutable state.
    """
    rowKey = str(index)
    translation = answerContent['translation' + rowKey]
    targetMatch = _TAG_PATTERN.search(translation)
    sourceMatch = _TAG_PATTERN.search(answerContent['originalText' + rowKey])

    # Both matches must exist before calling .group(); checking only the
    # target match crashes with AttributeError when the source is untagged.
    if sourceMatch is not None and targetMatch is not None:
        sourceTxt = sourceMatch.group(1)
        targetTxt = targetMatch.group(1)
    else:
        sourceTxt = inputContent['translationPairs'][index - 1]['originalText']
        targetTxt = translation

    return dict(baseItem, source={'S': sourceTxt}, target={'S': targetTxt})


def lambda_handler(event, context):
    """Post-process a finished human-review loop.

    Reads the human loop output JSON from the S3 URI carried in the event,
    stores every row the reviewer flagged with ``addToCustom<N>`` through
    ``insertParallelData`` (table ``translate_parallel_data``), and writes
    the edited translation, with ``<t>`` markers removed, to
    ``edited/<keyName>`` in the same bucket.

    Args:
        event: Event dict; ``event['detail']['humanLoopStatus']`` and, when
            the status is ``'Completed'``,
            ``event['detail']['humanLoopOutput']['outputS3Uri']`` are read.
        context: Lambda context object (unused).

    Returns:
        ``-1`` when the output S3 URI is ``None``, otherwise ``0``.
    """
    # NOTE(review): the original file's indentation was lost, so it is not
    # certain whether non-completed events returned 0 or fell through with
    # no return value; they are treated here as a no-op returning 0 - confirm.
    if event['detail']['humanLoopStatus'] != 'Completed':
        return 0

    s3location = event['detail']['humanLoopOutput']['outputS3Uri']
    if s3location is None:
        return -1
    print("outputS3Uri :", s3location)

    # Bucket and key of the human loop output document.
    bucketName, prefix = split_s3_path(s3location)

    # Recreate the output text document, including post edits.
    tmsFile = json.loads(readTextFileFromS3(bucketName, prefix))
    inputContent = tmsFile['inputContent']
    rowcount = inputContent['rowCount']
    answerContent = tmsFile['humanAnswers'][0]['answerContent']
    translatedFileName = inputContent['keyName']

    # Fields shared by every parallel-data record of this document.
    baseItem = {
        'domain': {'S': 'Finance'},
        'sourceLanguageCode': {'S': inputContent['sourceLanguageCode']},
        'targetLanguageCode': {'S': inputContent['targetLanguageCode']},
        'source': None,
        'target': None,
    }

    editedParts = []
    for index in range(1, rowcount + 1):
        rowKey = str(index)
        if answerContent['addToCustom' + rowKey]['on']:
            insertParallelData(
                'translate_parallel_data',
                _build_parallel_data_item(baseItem, inputContent, answerContent, index),
            )
        # Every row contributes to the edited document, followed by one space.
        editedParts.append(_strip_tags(answerContent['translation' + rowKey]) + " ")

    editedContent = ''.join(editedParts)
    writeTextFileToS3(bucketName, 'edited/{0}'.format(translatedFileName), editedContent)
    print('Success')
    return 0