import requests
from datetime import datetime
import getopt, sys
import urllib3
import boto3
import json
import os
import GPUtil
import psutil
from io import StringIO


class Capturing(list):
    """Capture stdout produced inside the 'with' block as a list of lines."""
    def __enter__(self):
        self._stdout = sys.stdout
        sys.stdout = self._stringio = StringIO()
        return self

    def __exit__(self, *args):
        self.extend(self._stringio.getvalue().splitlines())
        del self._stringio  # free up some memory
        sys.stdout = self._stdout


# OVERVIEW
# This script is adapted from
# https://github.com/aws-samples/amazon-sagemaker-notebook-instance-lifecycle-config-samples/blob/master/scripts/auto-stop-idle/autostop.py.
# Modifications are made to calculate four quantities (CPU utilization, CPU memory
# utilization, GPU utilization, GPU memory utilization) at regular intervals defined by
# the cron expression of the on-start script. These aggregate values are also added as
# tags to the notebook instance so users can get an idea of what the utilization looks
# like without accessing the actual Jupyter notebook. Additionally, a CloudWatch agent
# logs more detailed metrics for users to monitor notebook instance usage. Finally, an
# example query (commented out) is provided to use within CloudWatch Logs Insights to
# visualize aggregate metrics.

idle = True
port = '8443'
# Ignore if any browsers or clients are open
ignore_connections = False
# Threshold (in seconds) for deciding whether the notebook is idle
time_threshold = 4 * 60 * 60  # 4 hours
# Force shutdown if conditions are true, or just log to output
force_shutdown = False

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


def get_notebook_name():
    log_path = '/opt/ml/metadata/resource-metadata.json'
    with open(log_path, 'r') as logs:
        _logs = json.load(logs)
    return _logs['ResourceName']


def get_notebook_resource_arn():
    log_path = '/opt/ml/metadata/resource-metadata.json'
    with open(log_path, 'r') as logs:
        _logs = json.load(logs)
    return _logs['ResourceArn']


# When is a notebook considered idle by the Notebooks API? - https://github.com/jupyter/notebook/issues/4634
# The way it works at present is that the kernel sends a 'busy' message when it starts
# executing a request and an 'idle' message when it finishes, so it is idle if no code is
# running. A 'while True' loop would leave it busy.
# Code execution isn't the only kind of request, though. Among other things, when you open
# a notebook in a tab, it will make a kernel info request, which resets the timer.
def is_idle(last_activity):
    last_activity = datetime.strptime(last_activity, "%Y-%m-%dT%H:%M:%S.%fz")
    if (datetime.now() - last_activity).total_seconds() > time_threshold:
        print('Notebook is idle. Last activity time = ', last_activity)
        return True
    else:
        print('Notebook is not idle. Last activity time = ', last_activity)
        return False
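# Illustrative note (an assumption added for clarity, not part of the original sample):
# the Jupyter sessions API queried below returns a list of session objects, and the only
# fields this script reads look roughly like
#   [{"kernel": {"execution_state": "idle",
#                "connections": 0,
#                "last_activity": "2023-01-01T00:00:00.000000Z"}}]
# i.e. 'execution_state', 'connections', and 'last_activity' under each session's 'kernel'.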
response = requests.get('https://localhost:' + port + '/api/sessions', verify=False)
data = response.json()
print(data)

if len(data) > 0:
    print("Using Jupyter Notebook API since request was successful")
    for notebook in data:
        if notebook['kernel']['execution_state'] == 'idle':
            if not ignore_connections:
                if notebook['kernel']['connections'] == 0:
                    if not is_idle(notebook['kernel']['last_activity']):
                        idle = False
                else:
                    idle = False  # If any connection exists, notebook is not idling
            else:
                if not is_idle(notebook['kernel']['last_activity']):
                    idle = False  # If last activity is recent, notebook is not idling
        else:
            print('Notebook is not idle:', notebook['kernel']['execution_state'])
            idle = False
else:
    print("Using SageMaker instance last modified time")
    client = boto3.client('sagemaker')
    uptime = client.describe_notebook_instance(
        NotebookInstanceName=get_notebook_name()
    )['LastModifiedTime']
    if not is_idle(uptime.strftime("%Y-%m-%dT%H:%M:%S.%fz")):
        idle = False

# CPU, memory, and GPU utilization
print(f"Utilization metrics at {datetime.now()}")
total_cpu_util = psutil.cpu_percent()
total_mem_util = psutil.virtual_memory().percent
print(f"CPU utilization = {total_cpu_util}%")
print(f"Memory utilization = {total_mem_util}%")
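# Illustrative sketch of the text parsed in the next block (format taken from the GPUtil
# README; exact spacing may vary by version). GPUtil.showUtilization() prints a small table:
#   | ID | GPU | MEM |
#   ------------------
#   |  0 |  0% |  0% |
# The parser skips the two header lines and splits each remaining row on '|'.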
# Use the GPUtil library to collect GPU utilization and GPU memory utilization
num_gpu = 0
total_gpu_util = 0
total_gpumem_util = 0
try:
    print("GPU utilization = ")
    with Capturing() as output:
        GPUtil.showUtilization()
    if len(output) == 1:
        print("Found no GPUs")
    else:
        # Output is formatted; -2 accounts for the header line and the '---' separator
        print(f"Found {len(output) - 2} GPUs:")
        num_gpu = len(output) - 2
        for i in range(2, len(output)):
            # After split('|'), index 1 is the GPU id, index -3 the GPU load,
            # and index -2 the GPU memory utilization.
            tmp = output[i].split('|')
            # print(tmp)
            print(f"GPU{tmp[1]} mem = {tmp[-2]}")
            print(f"GPU{tmp[1]} util = {tmp[-3]}")
            total_gpu_util += int(tmp[-3].split('%')[0])
            total_gpumem_util += int(tmp[-2].split('%')[0])
        print(f"Total GPU Mem Utilization = {total_gpumem_util}/{(len(output) - 2) * 100} %")
        print(f"Total GPU Utilization = {total_gpu_util}/{(len(output) - 2) * 100} %")
except Exception as e:
    print("Did not capture GPU utilization")
    print(e)
    total_gpu_util = 0
    total_gpumem_util = 0

# Updating tags
client = boto3.client('sagemaker')
response = client.add_tags(
    ResourceArn=get_notebook_resource_arn(),
    Tags=[
        {'Key': 'total_cpu_util', 'Value': str(total_cpu_util)},
        {'Key': 'total_mem_util', 'Value': str(total_mem_util)},
        {'Key': 'total_gpu_util', 'Value': str(total_gpu_util)},
        {'Key': 'total_gpumem_util', 'Value': str(total_gpumem_util)}
    ])

# Add conditions here:
shutdown = False
if not idle and num_gpu > 0 and 0 < total_gpu_util < 20:
    print("Recommend using a smaller GPU instance")

if idle and total_cpu_util < 10 and total_mem_util < 10 and force_shutdown:
    print(f'Closing idle notebook since Jupyter Kernels idling is {idle}, '
          f'total CPU utilization is {total_cpu_util} and total Memory utilization is {total_mem_util}')
    client = boto3.client('sagemaker')
    client.stop_notebook_instance(
        NotebookInstanceName=get_notebook_name()
    )
else:
    print(f"Notebook is active at {datetime.now()}. Updated util metrics")
    print(f'NOT closing idle notebook since Jupyter Kernels idling is {idle}, '
          f'total CPU utilization is {total_cpu_util} and total Memory utilization is {total_mem_util}')
    print(json.dumps({"CPU_util": total_cpu_util, "Mem_util": total_mem_util,
                      "GPU_util": total_gpu_util, "GPU_mem_util": total_gpumem_util}))
    client = boto3.client('sagemaker')
    response = client.list_tags(
        ResourceArn=get_notebook_resource_arn()
    )
    tags = response['Tags']
    tagdict = {}
    for tag in tags:
        tagdict[tag['Key']] = tag['Value']
    print("---")
    print(tagdict)
    try:
        print("If available, log running average utilization ...")
        print(float(tagdict['total_cpu_util']) + float(total_cpu_util))
        print(json.dumps({
            'avg_CPU_util': int((float(tagdict['total_cpu_util']) + float(total_cpu_util)) / 2.),
            'avg_Mem_util': int((float(tagdict['total_mem_util']) + float(total_mem_util)) / 2.),
            'avg_GPU_util': int((float(tagdict['total_gpu_util']) + float(total_gpu_util)) / 2.),
            'avg_GPUmem_util': int((float(tagdict['total_gpumem_util']) + float(total_gpumem_util)) / 2.),
        }))
    except Exception as e:
        print('Historical values not available')
        print(e)

# In CloudWatch Logs Insights, use a query similar to the following:
'''
fields @timestamp, avg_CPU_util
| filter @logStream="notebook-name/jupyter.log"
| stats avg(avg_CPU_util), avg(avg_Mem_util), avg(avg_GPU_util), avg(avg_GPUmem_util), count() by bin(60s)
| sort @timestamp asc
'''
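# The tags written above can also be inspected without opening the notebook, for example
# with the AWS CLI (illustrative sketch; substitute your own region, account ID, and
# notebook name):
'''
aws sagemaker list-tags \
    --resource-arn arn:aws:sagemaker:<region>:<account-id>:notebook-instance/<notebook-name>
'''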