import os
import io
import boto3
from boto3.dynamodb.conditions import Key, Attr
from botocore.exceptions import ClientError
import json
import csv
import logging
import re

# Name of the DynamoDB table holding call events. The environment variable is
# required: a missing LCA_CALL_EVENTS_TABLE raises KeyError when this module
# is imported.
LCA_CALL_EVENTS_TABLE = os.environ['LCA_CALL_EVENTS_TABLE']

# SageMaker runtime client; not referenced by any other code visible in this file.
runtime= boto3.client('runtime.sagemaker')
# Module-level logger, used to report DynamoDB errors.
logger = logging.getLogger(__name__)
# DynamoDB resource from which the call events table handle is created.
ddb = boto3.resource('dynamodb')

# Matches anything that looks like a markup tag: '<', any run of non-'>'
# characters, then '>'.
html_remover = re.compile(r'<[^>]*>')
# Matches a filler word (um, uh, like, mhm; first letter in either case) at
# the start of the string or after a single space, together with that leading
# space and an optional trailing comma. The \b keeps the match from eating the
# beginning of a longer word ("likely", "umbrella", "uhh").
# NOTE: "like" is removed even when it is used as a real verb.
filler_remover = re.compile(r'(^| )([Uu]m|[Uu]h|[Ll]ike|[Mm]hm)\b[,]?')

# Handle for the call events table named by LCA_CALL_EVENTS_TABLE.
lca_call_events = ddb.Table(LCA_CALL_EVENTS_TABLE)

def get_transcripts(callid):
    """Fetch the final AGENT and CALLER transcript items for a call.

    Queries the call events table for partition key ``'trs#' + callid`` and
    keeps only items whose ``Channel`` is ``AGENT`` or ``CALLER`` and whose
    ``IsPartial`` is ``False``.

    A single DynamoDB query returns at most one page of results, and the
    filter is applied after the page is read, so every page is followed via
    ``LastEvaluatedKey`` until the result set is exhausted.

    Args:
        callid: Call identifier (string) appended to the ``trs#`` key prefix.

    Returns:
        A list of the matching items across all result pages.

    Raises:
        ClientError: If any query fails; the error code and message are
            logged before the exception is re-raised.
    """
    pk = 'trs#'+callid
    print(pk)

    query_kwargs = {
        'KeyConditionExpression': Key('PK').eq(pk),
        'FilterExpression': (
            (Attr('Channel').eq('AGENT') | Attr('Channel').eq('CALLER'))
            & Attr('IsPartial').eq(False)
        ),
    }

    items = []
    try:
        while True:
            response = lca_call_events.query(**query_kwargs)
            items.extend(response.get('Items', []))

            # A LastEvaluatedKey means more data remains; resume from it.
            last_key = response.get('LastEvaluatedKey')
            if not last_key:
                break
            query_kwargs['ExclusiveStartKey'] = last_key
    except ClientError as err:
        logger.error("Error getting transcripts from LCA Call Events table %s: %s", 
                err.response['Error']['Code'], err.response['Error']['Message'])
        raise

    return items

def preprocess_transcripts(transcripts, condense):
    """Turn transcript rows into an ordered list of text fragments.

    Rows are processed in ascending ``EndTime`` order. The input list is not
    modified.

    When ``condense`` equals ``True``:
      * rows whose ``Channel`` is ``AGENT_ASSISTANT`` are skipped;
      * markup tags and filler words are removed and the text is stripped;
      * a row from the same channel as the previous labelled row is appended
        with a leading space;
      * a row from a different channel starts a new ``"\\n<Channel>: "`` line,
        but only if its cleaned text is longer than one character.
        NOTE(review): a shorter cleaned text from a different channel is still
        appended, without a label or separator, and does not change the
        current channel; confirm whether it should be dropped instead.

    Otherwise every row becomes ``"\\n<Channel>: <Transcript>"`` unchanged.

    Args:
        transcripts: Iterable of mappings with ``EndTime``, ``Channel`` and
            ``Transcript`` keys.
        condense: Enables the clean-up/merge behaviour when equal to ``True``.

    Returns:
        A list of strings intended to be concatenated with ``''.join``.
    """
    # sorted() rather than list.sort() so the caller's list is left untouched.
    ordered_rows = sorted(transcripts, key=lambda row: row['EndTime'])

    # Kept as an equality test (not truthiness) so that only True/1 enable
    # condensing, exactly as before.
    condense_enabled = condense == True  # noqa: E712

    data = []
    last_channel = 'start'
    for row in ordered_rows:
        channel = row['Channel']
        transcript = row['Transcript']

        if condense_enabled:
            if channel == 'AGENT_ASSISTANT':
                continue
            transcript = remove_html(transcript)
            transcript = remove_filler_words(transcript).strip()

            if channel == last_channel:
                transcript = ' ' + transcript
            elif len(transcript) > 1:
                transcript = '\n' + channel + ": " + transcript
                last_channel = channel
        else:
            transcript = '\n' + channel + ": " + transcript

        data.append(transcript)

    return data

def remove_html(transcript_string):
    """Return ``transcript_string`` with every ``html_remover`` match deleted."""
    return html_remover.sub('', transcript_string)

def remove_filler_words(transcript_string):
    """Return ``transcript_string`` with every ``filler_remover`` match deleted."""
    return filler_remover.sub('', transcript_string)

def truncate_number_of_words(transcript_string, truncateLength):
    """Limit a transcript to its first ``truncateLength`` tokens.

    The text is split into tokens that, joined back together, reproduce the
    input exactly: each run of non-whitespace characters is one token and
    every whitespace character (including newlines) is a token of its own.
    The ``,`` alternative in the pattern never matches on its own because a
    comma is already consumed by ``\\S+``.

    Args:
        transcript_string: Text to truncate.
        truncateLength: Maximum number of tokens to keep; values that are not
            greater than zero disable truncation.

    Returns:
        The kept tokens concatenated back into a string. The number of kept
        tokens is printed as a side effect.
    """
    # findall keeps the whitespace tokens, so line breaks survive the split.
    tokens = re.findall(r'\S+|\n|.|,', transcript_string)
    kept = tokens[:truncateLength] if truncateLength > 0 else tokens
    print(f'Token Count: {len(kept)}')
    return ''.join(kept)


def lambda_handler(event, context):
    """Lambda entry point: build the transcript text for one call.

    Expected event keys:
      * ``CallId`` (required): call whose transcript is fetched.
      * ``TokenCount`` (optional, default 0): token limit passed to
        ``truncate_number_of_words``; 0 means no truncation.
      * ``ProcessTranscript`` (optional, default False): passed to
        ``preprocess_transcripts`` as its ``condense`` flag.

    Args:
        event: JSON-serialisable mapping with the keys above.
        context: Lambda context object; unused.

    Returns:
        ``{'transcript': <text>}``.
    """
    print("Received event: " + json.dumps(event, indent=2))

    # Round-trip through JSON to work on a plain copy of the event.
    payload = json.loads(json.dumps(event))
    call_id = payload['CallId']
    token_limit = payload.get('TokenCount', 0)
    condense = payload.get('ProcessTranscript', False)

    fragments = preprocess_transcripts(get_transcripts(call_id), condense)
    transcript_string = truncate_number_of_words(''.join(fragments), token_limit)
    return {'transcript': transcript_string}

# Manual smoke test: invoke the handler once with a sample event when this
# file is run as a script. The handler's return value is discarded.
if __name__ == '__main__':
    sample_event = {
        "CallId": "2359fb61-f612-4fe9-bce2-839061c328f9",
        "TokenCount": 0,
        "ProcessTranscript": False,
    }
    lambda_handler(sample_event, {})