{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Predicting world temperature with DeepAR\n",
    "- [Source](https://julsimon.medium.com/predicting-world-temperature-with-time-series-and-deepar-on-amazon-sagemaker-e371cf94ddb5) \n",
    "- [Dataset - Daily Land](http://berkeleyearth.lbl.gov/auto/Global/Complete_TAVG_daily.txt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import data science and visualization libraries\n",
    "%matplotlib inline\n",
    "from sklearn.model_selection import train_test_split\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import sagemaker\n",
    "import csv\n",
    "import boto3\n",
    "import json\n",
    "from sagemaker import image_uris\n",
    "\n",
    "print(sagemaker.__version__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!wget -P ./data/ http://berkeleyearth.lbl.gov/auto/Global/Complete_TAVG_daily.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove header lines (starting with a %), empty lines and lines with only spaces\n",
    "!grep -v -e '^%\\|^$\\|^\\ *$' ./data/Complete_TAVG_daily.txt > ./data/temps.txt\n",
    "!head -10 ./data/temps.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# First and last year expected in the dataset\n",
    "minYear = 1880\n",
    "maxYear = 2021\n",
    "# Offset added to column 5 of every row; presumably the dataset's baseline\n",
    "# average temperature - TODO confirm against the header of the downloaded file\n",
    "avg_temp = 8.68\n",
    "\n",
    "# Our model will predict temperature for the next 'prediction_length' days\n",
    "prediction_length = 30"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read every row up front: the file handle is closed right away and the\n",
    "# parsing cell below can be re-run without re-opening the file\n",
    "with open('./data/temps.txt', 'r') as f:\n",
    "    data = list(csv.reader(f, delimiter=' '))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset={}\n",
    "x=[]\n",
    "y=[]\n",
    "count=1\n",
    "prevYear=0\n",
    "\n",
    "for row in data:\n",
    "    # Remove empty strings caused by multiple spaces between columns\n",
    "    row = list(filter(None, row))\n",
    "\n",
    "    year=row[1]\n",
    "    temp=float(row[5])+avg_temp\n",
    "\n",
    "    # Data for plotting\n",
    "    # x list=counter, y list=temperature\n",
    "    x.append(count)\n",
    "    y.append(temp)\n",
    "    count += 1\n",
    "\n",
    "    # Data for training\n",
    "    # dictionary: key=year, value=list of ordered daily temperatures\n",
    "    if (year != prevYear):\n",
    "        dataset[year]=[]\n",
    "        prevYear=year\n",
    "    dataset[year].append(temp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Distinct series lengths found between minYear and maxYear, in ascending order\n",
    "nb_samples_per_year = sorted({len(dataset[str(year)]) for year in range(minYear, maxYear+1)})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nb_samples_per_year"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 365/366 are regular and leap years; 128 is presumably the incomplete final\n",
    "# year of the downloaded snapshot - TODO confirm if the source file is refreshed\n",
    "assert nb_samples_per_year == [128, 365, 366], nb_samples_per_year"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "nbSamples=len(x)\n",
    "print('Number of samples: %d' % nbSamples)\n",
    "\n",
    "fig=plt.figure(figsize=(64, 16))\n",
    "plt.plot(x,y)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Training set: every yearly series with its last 'prediction_length' points held out\n",
    "trainingSet = { year: series[:-prediction_length] for year, series in dataset.items() }\n",
    "# Test set: the complete series\n",
    "testSet = dataset.copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_key = 'deepar_training.json'\n",
    "test_key = 'deepar_test.json'\n",
    "\n",
    "def writeDataset(filename, data):\n",
    "    \"\"\"Write the series in `data` to `filename`, one JSON object per line.\n",
    "\n",
    "    filename: path of the output file (overwritten if it already exists)\n",
    "    data: dict mapping a year to its list of temperatures\n",
    "    \"\"\"\n",
    "    # 'with' guarantees the file is flushed and closed before it is read back or uploaded\n",
    "    with open(filename, 'w') as file:\n",
    "        for year, target in data.items():\n",
    "            # One JSON sample per line\n",
    "            sample = {\"start\": \"{}-01-01 00:00:00\".format(year), \"target\": target}\n",
    "            file.write(json.dumps(sample) + '\\n')"
   ]
},
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "writeDataset(train_key, trainingSet)\n",
    "writeDataset(test_key, testSet)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!head -2 deepar_training.json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bucket = sagemaker.Session().default_bucket()\n",
    "prefix = \"deepar-daily-temperature\"\n",
    "\n",
    "train_prefix = f'{prefix}/train'\n",
    "test_prefix = f'{prefix}/test'\n",
    "output_prefix = f'{prefix}/output'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sagemaker_session = sagemaker.Session()\n",
    "role = sagemaker.get_execution_role()\n",
    "region = boto3.Session().region_name\n",
    "\n",
    "train_path = sagemaker_session.upload_data(train_key, bucket=bucket, key_prefix=train_prefix)\n",
    "test_path = sagemaker_session.upload_data(test_key, bucket=bucket, key_prefix=test_prefix)\n",
    "# Interpolate the bucket and prefix defined above; without the braces the\n",
    "# estimator would be pointed at the literal text 's3://bucket/output_prefix'\n",
    "output_path = f's3://{bucket}/{output_prefix}'\n",
    "\n",
    "print(train_path)\n",
    "print(test_path)\n",
    "print(output_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!aws s3 ls s3://{bucket}/{prefix} --recursive"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "container = image_uris.retrieve(framework='forecasting-deepar',region=region)\n",
    "print(container)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "estimator = sagemaker.estimator.Estimator(\n",
    "    sagemaker_session=sagemaker_session,\n",
    "    image_uri=container,\n",
    "    role=role,\n",
    "    instance_count=1,\n",
    "    instance_type='ml.c4.8xlarge',\n",
    "    base_job_name='daily-temperature',\n",
    "    output_path=output_path\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# https://docs.aws.amazon.com/sagemaker/latest/dg/deepar_hyperparameters.html\n",
    "\n",
    "hyperparameters = {\n",
    "    \"time_freq\": 'D', # daily series\n",
    "    \"context_length\": prediction_length, # same window size as the forecast horizon\n",
    "    \"prediction_length\": prediction_length, # number of data points to predict\n",
    "    \"num_cells\": \"40\",\n",
    "    \"num_layers\": \"2\",\n",
    "    \"likelihood\": \"gaussian\",\n",
    "    \"epochs\": \"250\",\n",
    "    \"mini_batch_size\": \"32\",\n",
    "    \"learning_rate\": \"0.00001\",\n",
    "    \"dropout_rate\": \"0.05\",\n",
    "    \"early_stopping_patience\": \"10\" # stop if loss hasn't improved in 10 epochs\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "estimator.set_hyperparameters(**hyperparameters)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9.9 64-bit",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  },
  "vscode": {
   "interpreter": {
    "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}