{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "241006b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import os\n",
    "from pathlib import Path\n",
    "import sqlite3\n",
    "\n",
    "from traffic_comparator.sqlite import COLUMN_DATATYPES, COLUMN_JSONS, json_load_function, get_took_value, get_latest_table_name"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "340ac0eb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Resolve the comparisons database path (overridable via COMPARISONS_DB_LOCATION).\n",
    "db_file = Path(os.getenv(\"COMPARISONS_DB_LOCATION\", \"comparisons.db\"))\n",
    "print(f\"Loading from DB file: {db_file}\")\n",
    "# sqlite3.connect() silently creates an empty database when the file is missing,\n",
    "# which leaves a stray file behind and produces a confusing error later; fail early instead.\n",
    "if not db_file.is_file():\n",
    "    raise FileNotFoundError(f\"Comparisons DB file not found: {db_file}\")\n",
    "\n",
    "con = sqlite3.connect(db_file)\n",
    "try:\n",
    "    cur = con.cursor()\n",
    "    # By default, this reads from the latest table, but this can be modified to a specific table name instead.\n",
    "    table_name = get_latest_table_name(cur)\n",
    "    df = pd.read_sql_query(f\"SELECT * from {table_name}\", con,\n",
    "                           dtype=COLUMN_DATATYPES)\n",
    "finally:\n",
    "    # Release the database handle even if the table lookup or the query raises.\n",
    "    con.close()\n",
    "\n",
    "# This parses the text stored in each of the `COLUMN_JSONS` columns with `json_load_function`.\n",
    "for column in COLUMN_JSONS:\n",
    "    df[column] = df[column].apply(json_load_function)\n",
    "\n",
    "# This creates the source and target `took` fields by extracting the took value from the response bodies.\n",
    "df['source_took'] = df['source_response_body'].apply(get_took_value)\n",
    "df['target_took'] = df['target_response_body'].apply(get_took_value)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fc208dae",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summary statistics (pandas `describe` defaults) for the loaded comparison DataFrame.\n",
    "df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e329d114",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Target-cluster summary statistics: the measured response latency first, then the `took` value.\n",
    "print(\n",
    "    \"Measured Latency\",\n",
    "    df['target_response_latency'].describe(),\n",
    "    \"\",\n",
    "    \"Took field\",\n",
    "    df['target_took'].describe(),\n",
    "    sep=\"\\n\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "179b1149",
   "metadata": {},
   "outputs": [],
   "source": [
    "def plot_latency_panel(ax, source, target, title, **hist_kwargs):\n",
    "    \"\"\"Overlay source/target histograms and their medians on `ax`; return the bin edges.\n",
    "\n",
    "    `source` and `target` are pandas Series of latencies; `hist_kwargs` is forwarded to `ax.hist`.\n",
    "    \"\"\"\n",
    "    _, bin_edges, _ = ax.hist([source, target], histtype='stepfilled',\n",
    "                              label=['Source', 'Target'], **hist_kwargs)\n",
    "    # Both median markers are black, so distinct line styles plus legend entries tell them apart.\n",
    "    ax.axvline(source.median(), color='k', linestyle='dashed', linewidth=1, label='Source median')\n",
    "    ax.axvline(target.median(), color='k', linestyle='dotted', linewidth=1, label='Target median')\n",
    "    ax.set_title(title)\n",
    "    ax.set_xlabel(\"Latency (ms)\")\n",
    "    ax.set_ylabel(\"Count\")\n",
    "    # Without a legend the two overlaid histograms cannot be attributed to a cluster.\n",
    "    ax.legend()\n",
    "    return bin_edges\n",
    "\n",
    "\n",
    "fig, (latency_ax, took_ax) = plt.subplots(2, 1)\n",
    "\n",
    "# Latency plot\n",
    "bins = plot_latency_panel(latency_ax, df['source_response_latency'], df['target_response_latency'],\n",
    "                          \"Measured latency of source vs target cluster\",\n",
    "                          bins=60, range=(0, 300), alpha=0.8)\n",
    "\n",
    "# Took plot (reuses the bin edges of the latency plot so both panels are binned identically)\n",
    "plot_latency_panel(took_ax, df['source_took'], df['target_took'],\n",
    "                   \"Reported latency (\\\"took\\\") of source vs target cluster\",\n",
    "                   bins=bins, alpha=0.7)\n",
    "fig.tight_layout()\n",
    "\n",
    "\n",
    "print(f\"Source median latency: {df['source_response_latency'].median():.2f} ms\")\n",
    "print(f\"Target median latency: {df['target_response_latency'].median():.2f} ms\")\n",
    "plt.show()\n",
    "print(f\"Source median 'took': {df['source_took'].median():.2f} ms\")\n",
    "print(f\"Target median 'took': {df['target_took'].median():.2f} ms\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5751cd60",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Estimate the network latency: mean gap between the measured response latency and the `took` value.\n",
    "print(\"Source cluster network latency\")\n",
    "print(df['source_response_latency'].sub(df['source_took']).mean(), end=\"\\n\\n\")\n",
    "print(\"Target cluster network latency\")\n",
    "print(df['target_response_latency'].sub(df['target_took']).mean())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "989dc230",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Mean of `responses_are_identical` per (request_uri, request_method) group, rendered as a percentage.\n",
    "uri_groups = df.groupby(by=['request_uri', 'request_method'])\n",
    "uri_groups.agg({'responses_are_identical': lambda matches: format(matches.mean(), '.2%')})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b659080",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of rows in each (request_uri, request_method) group.\n",
    "uri_groups.size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "038afdf3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the rows whose request URI is exactly `/_bulk`, then preview them.\n",
    "bulk_df = df.loc[df[\"request_uri\"].eq(\"/_bulk\")]\n",
    "bulk_df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "27f9e58c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.0rc2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}