{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "8d0fe91c-5c00-4f62-984b-bc158b80973d",
   "metadata": {},
   "source": [
    "# Testing with API Gateway"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "875d39fc-9fe5-4817-8614-ee344721913b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# %%capture\n",
    "# %pip install aws-requests-auth"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17e9d857-7f22-4720-8f72-82df56acee38",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import time\n",
    "import boto3\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import requests\n",
    "from aws_requests_auth.boto_utils import BotoAWSRequestsAuth\n",
    "from tqdm.contrib.concurrent import thread_map"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "153c4bbe-4946-4e4a-beaf-b147aaf28f34",
   "metadata": {},
   "outputs": [],
   "source": [
    "# bucket = sm_session.default_bucket()\n",
    "region = boto3.Session().region_name"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f5049d34-e8a6-40ae-8402-b00fa44b8e8b",
   "metadata": {},
   "source": [
    "### Parameters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2140cd06-508a-4ace-b36c-5d9e4194fb28",
   "metadata": {},
   "outputs": [],
   "source": [
    "url = \"<TO FILL>\" # example url: \"https://vcqnacq0k1.execute-api.us-east-1.amazonaws.com/LATEST/HF\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56a75ef9-ffaf-4714-b572-283463d6527c",
   "metadata": {},
   "outputs": [],
   "source": [
    "test_phrase = \"This is an interesting workshop, very helpful!\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10328c56-48d5-4660-9a4c-262f66701179",
   "metadata": {},
   "outputs": [],
   "source": [
    "auth = BotoAWSRequestsAuth(\n",
    "    aws_host=url.split(\"//\")[-1],\n",
    "    aws_region=region,\n",
    "    aws_service=\"execute-api\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "33fe2cff-2be3-467e-915d-ceb5c5bb2c8e",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "payload = {\"inputs\": test_phrase}\n",
    "response = requests.post(\n",
    "    url,\n",
    "    # auth=auth,\n",
    "    json=payload,\n",
    ")\n",
    "print(json.dumps(response.json(), indent=2))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b242af23-7452-4e32-8c2b-c9647f597f7c",
   "metadata": {},
   "source": [
    "## Benchmark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c39a42fb-4385-4558-9482-1c98eae36102",
   "metadata": {},
   "outputs": [],
   "source": [
    "def time_prediction(payload, return_pred=False):\n",
    "    t1 = time.time()\n",
    "    pred = requests.post(url, json=payload)\n",
    "    if return_pred:\n",
    "        return pred\n",
    "    if pred.status_code != 200:\n",
    "        return None\n",
    "    return time.time() - t1\n",
    "\n",
    "\n",
    "def run_benchmark(\n",
    "    payload,\n",
    "    num_preds=100,\n",
    "    print_report=False,\n",
    "    plot_report=False,\n",
    "    n_threads=None,\n",
    "):\n",
    "    tic = time.time()\n",
    "    t_vec = thread_map(\n",
    "        time_prediction,\n",
    "        [payload] * num_preds,\n",
    "        max_workers=n_threads,\n",
    "    )\n",
    "    duration = time.time() - tic\n",
    "    n_failed = np.count_nonzero(np.isnan(t_vec))\n",
    "    TPS = num_preds / duration\n",
    "\n",
    "    latency_percentiles = np.percentile(t_vec, q=[50, 90, 95, 99])\n",
    "\n",
    "    if plot_report:\n",
    "\n",
    "        plt.hist(t_vec, bins=100)\n",
    "        plt.title(\"Request latency histogram for ml.c5.xlarge\")\n",
    "\n",
    "        plt.show()\n",
    "\n",
    "    if print_report:\n",
    "        print(\n",
    "            \"==== HuggingFace model deployed on CPU instance endpoint benchmark ====\\n\",\n",
    "            f\"95 % of requests take less than {latency_percentiles[2]*1000} ms\\n\",\n",
    "            f\"Rough request throughput/second is {TPS}\\n\",\n",
    "            f\"{n_failed} failed invocations\",\n",
    "        )\n",
    "    return TPS, latency_percentiles[2] * 1000, n_failed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "642c7e55-991e-4edf-9abb-85ba6fc77f6b",
   "metadata": {},
   "outputs": [],
   "source": [
    "run_benchmark(payload, 5000, True, True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cbb46514-1dda-41d4-9869-4369e99b0ca4",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "instance_type": "ml.t3.medium",
  "kernelspec": {
   "display_name": "Python 3 (Data Science)",
   "language": "python",
   "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/datascience-1.0"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}