# Testing with API Gateway

In [None]:
# %%capture
# %pip install aws-requests-auth

In [None]:
import json
import time
import boto3

import matplotlib.pyplot as plt
import numpy as np
import requests
from aws_requests_auth.boto_utils import BotoAWSRequestsAuth
from tqdm.contrib.concurrent import thread_map

In [None]:
# bucket = sm_session.default_bucket()
# Region of the default boto3 session; passed below as aws_region when
# building the BotoAWSRequestsAuth object.
# NOTE(review): presumably None when no default region is configured — confirm.
region = boto3.Session().region_name

### Parameters

In [None]:
# URL that every request in this notebook is POSTed to; fill it in before
# running the cells below.
# Example: "https://vcqnacq0k1.execute-api.us-east-1.amazonaws.com/LATEST/HF"
url = ""

In [None]:
# Sample sentence sent as the "inputs" field of the request payload.
test_phrase = "This is an interesting workshop, very helpful!"

In [None]:
from urllib.parse import urlparse

# Request signer for the "execute-api" service. aws_host must be the bare
# hostname, so only the network location of the URL is used and the
# "/stage/resource" path is dropped. A URL written without a scheme is
# prefixed with "//" so urlparse still recognises its host part.
auth = BotoAWSRequestsAuth(
    aws_host=urlparse(url if "//" in url else "//" + url).netloc,
    aws_region=region,
    aws_service="execute-api",
)

In [None]:
# Smoke test: send a single request and pretty-print the JSON reply.
payload = {"inputs": test_phrase}
# Add auth=auth to this call to sign the request
# (presumably required when the API enforces IAM authorization — confirm).
response = requests.post(url, json=payload)
print(json.dumps(response.json(), indent=2))

## Benchmark

In [None]:
def time_prediction(payload, return_pred=False):
    """POST ``payload`` to ``url`` as JSON and time the round trip.

    Args:
        payload: JSON-serialisable request body.
        return_pred: When true, return the ``requests`` response object
            instead of the elapsed time.

    Returns:
        The response object if ``return_pred`` is true; otherwise the
        elapsed time in seconds, or ``None`` when the request failed
        (transport error or a status code other than 200).

    Raises:
        requests.RequestException: Only when ``return_pred`` is true, since
            there is then no response object to hand back.
    """
    # perf_counter is monotonic, so the measurement cannot be skewed by
    # wall-clock adjustments during the request.
    start = time.perf_counter()
    try:
        pred = requests.post(url, json=payload)
    except requests.RequestException:
        if return_pred:
            raise
        # Count a transport failure as a failed invocation rather than
        # letting one bad request abort an entire benchmark run.
        return None
    elapsed = time.perf_counter() - start

    if return_pred:
        return pred
    if pred.status_code != 200:
        return None
    return elapsed


def run_benchmark(
    payload,
    num_preds=100,
    print_report=False,
    plot_report=False,
    n_threads=None,
):
    """Send ``num_preds`` requests through a thread pool and summarise them.

    Args:
        payload: JSON-serialisable request body, sent unchanged each time.
        num_preds: Total number of requests to issue.
        print_report: When true, print a short text summary.
        plot_report: When true, show a histogram of the latencies of the
            successful requests.
        n_threads: Forwarded to ``thread_map`` as ``max_workers``.

    Returns:
        Tuple ``(TPS, p95_ms, n_failed)``: requests issued per second of
        wall-clock time (failed ones included), the 95th-percentile latency
        of the successful requests in milliseconds (NaN if none succeeded),
        and the number of failed requests.
    """
    tic = time.time()
    t_vec = thread_map(
        time_prediction,
        [payload] * num_preds,
        max_workers=n_threads,
    )
    duration = time.time() - tic

    # time_prediction reports a failure as None. Map it to NaN so the array
    # stays numeric: np.isnan raises TypeError on an object array with None.
    latencies = np.array(
        [np.nan if t is None else t for t in t_vec],
        dtype=float,
    )
    failed = np.isnan(latencies)
    n_failed = int(np.count_nonzero(failed))
    ok_latencies = latencies[~failed]

    # Guard against a zero-length run (e.g. num_preds == 0 on a coarse clock).
    TPS = num_preds / duration if duration > 0 else 0.0

    # Statistics are computed on successful requests only; a single NaN would
    # otherwise turn every percentile into NaN.
    if ok_latencies.size:
        latency_percentiles = np.percentile(ok_latencies, q=[50, 90, 95, 99])
    else:
        latency_percentiles = np.full(4, np.nan)

    if plot_report:
        # Failures are left out: histogram binning rejects NaN values.
        plt.hist(ok_latencies, bins=100)
        plt.title("Request latency histogram for ml.c5.xlarge")
        plt.show()

    if print_report:
        print(
            "==== HuggingFace model deployed on CPU instance endpoint benchmark ====\n",
            f"95 % of requests take less than {latency_percentiles[2]*1000} ms\n",
            f"Rough request throughput/second is {TPS}\n",
            f"{n_failed} failed invocations",
        )
    return TPS, latency_percentiles[2] * 1000, n_failed

In [None]:
# Full run: 5000 requests, with both the printed summary and the latency histogram.
run_benchmark(
    payload,
    num_preds=5000,
    print_report=True,
    plot_report=True,
)