{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Predicting world temperature with DeepAR\n",
    "- [Source](https://julsimon.medium.com/predicting-world-temperature-with-time-series-and-deepar-on-amazon-sagemaker-e371cf94ddb5)  \n",
    "- [Dataset - Daily Land](http://berkeleyearth.lbl.gov/auto/Global/Complete_TAVG_daily.txt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import data science and visualization libraries\n",
    "%matplotlib inline\n",
    "from sklearn.model_selection import train_test_split\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import sagemaker\n",
    "import csv\n",
    "import boto3\n",
    "import json\n",
    "from sagemaker import image_uris\n",
    "\n",
    "print(sagemaker.__version__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!wget -P ./data/ http://berkeleyearth.lbl.gov/auto/Global/Complete_TAVG_daily.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove header lines (starting with a %), empty lines and lines with only spaces\n",
    "!grep -v -e '^%\\|^$\\|^\\ *$' ./data/Complete_TAVG_daily.txt > ./data/temps.txt\n",
    "!head -10 ./data/temps.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "minYear  = 1880\n",
    "maxYear  = 2021\n",
    "avg_temp = 8.68\n",
    "\n",
    "# Our model will predict temperature for the next 'prediction_length' days\n",
    "prediction_length = 30"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "f = open('./data/temps.txt', 'r')\n",
    "data = csv.reader(f,delimiter=' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset={}\n",
    "x=[]\n",
    "y=[]\n",
    "count=1\n",
    "prevYear=0\n",
    "\n",
    "for row in data:\n",
    "        # Remove empty strings caused by multiple spaces between columns\n",
    "        row = list(filter(None, row))\n",
    "        \n",
    "        year=row[1]\n",
    "        temp=float(row[5])+avg_temp\n",
    "         \n",
    "        # Data for plotting\n",
    "        # x list=counter, y list=temperature\n",
    "        x.append(count)\n",
    "        y.append(float(temp))\n",
    "        count += 1\n",
    "        \n",
    "        # Data for training\n",
    "        # dictionary: key=year, value=list of ordered daily temperatures\n",
    "        if (year != prevYear):\n",
    "            dataset[year]=[]\n",
    "            prevYear=year\n",
    "        dataset[year].append(float(temp))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sometimes 'pythonic' rhymes with 'moronic' :D\n",
    "nb_samples_per_year = list(map(lambda x: len(x), (dataset[str(year)] for year in range(minYear, maxYear+1))))\n",
    "nb_samples_per_year = np.unique(nb_samples_per_year).tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nb_samples_per_year"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "assert nb_samples_per_year == [128, 365, 366]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nbSamples=len(x)\n",
    "print('Number of samples: %d' % nbSamples)\n",
    "\n",
    "fig=plt.figure(figsize=(64, 16))\n",
    "plt.plot(x,y)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainingSet = dataset.copy()\n",
    "trainingSet[year] = { year: dataset[year][:-prediction_length] for year in dataset.keys() }\n",
    "testSet = dataset.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_key      = 'deepar_training.json'\n",
    "test_key       = 'deepar_test.json'\n",
    "\n",
    "def writeDataset(filename, data): \n",
    "    file=open(filename,'w')\n",
    "    for year in data.keys():\n",
    "        # One JSON sample per line\n",
    "        line = \"\\\"start\\\":\\\"{}-01-01 00:00:00\\\",\\\"target\\\":{}\".format(year,data[year])\n",
    "        file.write('{'+line+'}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "writeDataset(train_key, trainingSet)        \n",
    "writeDataset(test_key, testSet)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!head -2 deepar_training.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bucket = sagemaker.Session().default_bucket()\n",
    "prefix = \"deepar-daily-temperature\"\n",
    "\n",
    "train_prefix   = f'{prefix}/train'\n",
    "test_prefix    = f'{prefix}/test'\n",
    "output_prefix  = f'{prefix}/output'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sagemaker_session = sagemaker.Session()\n",
    "role              = sagemaker.get_execution_role()\n",
    "region            = boto3.Session().region_name\n",
    "\n",
    "train_path  = sagemaker_session.upload_data(train_key, bucket=bucket, key_prefix=train_prefix)\n",
    "test_path   = sagemaker_session.upload_data(test_key,  bucket=bucket, key_prefix=test_prefix)\n",
    "output_path = f's3://bucket/output_prefix'\n",
    "\n",
    "print(train_path)\n",
    "print(test_path)\n",
    "print(output_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!aws s3 ls s3://{bucket}/{prefix} --recursive"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "container = image_uris.retrieve(framework='forecasting-deepar',region=region)\n",
    "print(container)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "estimator = sagemaker.estimator.Estimator(\n",
    "    sagemaker_session=sagemaker_session,\n",
    "    image_uri=container,\n",
    "    role=role,\n",
    "    instance_count=1,\n",
    "    instance_type='ml.c4.8xlarge',\n",
    "    base_job_name='daily-temperature',\n",
    "    output_path=output_path\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://docs.aws.amazon.com/sagemaker/latest/dg/deepar_hyperparameters.html\n",
    "\n",
    "hyperparameters = {\n",
    "    \"time_freq\": 'D', # daily series\n",
    "    \"context_length\": prediction_length,\n",
    "    \"prediction_length\": prediction_length, # number of data points to predict\n",
    "    \"num_cells\": \"40\",\n",
    "    \"num_layers\": \"2\",\n",
    "    \"likelihood\": \"gaussian\",\n",
    "    \"epochs\": \"250\",\n",
    "    \"mini_batch_size\": \"32\",\n",
    "    \"learning_rate\": \"0.00001\",\n",
    "    \"dropout_rate\": \"0.05\",\n",
    "    \"early_stopping_patience\": \"10\" # stop if loss hasn't improved in 10 epochs\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "estimator.set_hyperparameters(**hyperparameters)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9.9 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  },
  "vscode": {
   "interpreter": {
    "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}