# Predicting world temperature with DeepAR
- [Source](https://julsimon.medium.com/predicting-world-temperature-with-time-series-and-deepar-on-amazon-sagemaker-e371cf94ddb5) 
- [Dataset - Daily Land](http://berkeleyearth.lbl.gov/auto/Global/Complete_TAVG_daily.txt)

In [None]:
# import data science and visualization libraries
%matplotlib inline
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import sagemaker
import csv
import boto3
import json
from sagemaker import image_uris

# Show which SageMaker SDK version is installed in this kernel
print(sagemaker.__version__)

In [None]:
!wget -P ./data/ http://berkeleyearth.lbl.gov/auto/Global/Complete_TAVG_daily.txt

In [None]:
# Remove header lines (starting with a %), empty lines and lines with only spaces
!grep -v -e '^%\|^$\|^\ *$' ./data/Complete_TAVG_daily.txt > ./data/temps.txt
!head -10 ./data/temps.txt

In [None]:
# First and last calendar years expected in the parsed data; the per-year
# sample-count cell looks up every year in this inclusive range
minYear = 1880
maxYear = 2021
# Offset added to every value read from the temperature column while parsing
# NOTE(review): presumably the baseline average that the file's anomaly values
# are relative to - confirm against the header of the downloaded file
avg_temp = 8.68

# Our model will predict temperature for the next 'prediction_length' days
prediction_length = 30

In [None]:
# Read every row of the cleaned file up front. Materialising the rows lets the
# file handle be closed immediately, and lets the parsing cell be re-run
# without re-opening the file (a bare csv.reader is exhausted after one pass).
# newline='' is what the csv module recommends for files handed to a reader.
with open('./data/temps.txt', 'r', newline='') as f:
    data = list(csv.reader(f, delimiter=' '))

In [None]:
# dataset: year (as read from the file) -> ordered list of daily temperatures
dataset = {}
# x / y: running sample index and temperature, used only for plotting
x = []
y = []
count = 1
prevYear = 0

for row in data:
    # Consecutive spaces between columns yield empty fields; drop them
    row = [field for field in row if field]

    year = row[1]
    temp = float(row[5]) + avg_temp

    # Plot data: one (index, temperature) point per row
    x.append(count)
    y.append(temp)
    count += 1

    # Training data: start a fresh list whenever the year changes,
    # then append the reading to the current year's list
    if year != prevYear:
        dataset[year] = []
        prevYear = year
    dataset[year].append(temp)

In [None]:
# Distinct series lengths (sorted) across every year from minYear to maxYear
# inclusive; a missing year raises KeyError
nb_samples_per_year = np.unique(
    [len(dataset[str(yr)]) for yr in range(minYear, maxYear + 1)]
).tolist()

In [None]:
nb_samples_per_year

In [None]:
# Sanity check: the only series lengths present are 128, 365 and 366 days
# NOTE(review): 128 looks like the partially covered final year of the
# downloaded snapshot - this will fail if a newer file is fetched; confirm
assert nb_samples_per_year == [128, 365, 366]

In [None]:
# Report how many daily readings were parsed
nbSamples = len(x)
print('Number of samples: %d' % nbSamples)

# Very wide canvas so the full daily series can be drawn as one line
fig = plt.figure(figsize=(64, 16))
fig.gca().plot(x, y)
plt.show()

In [None]:
# Training set: every year's series with its last `prediction_length` days
# held out. The result is built as a new dict; assigning the comprehension to
# a single key would leave all series untruncated and nest a dict inside one
# record. Slicing by an explicit end index keeps the full series when
# prediction_length is 0 (a plain [:-0] slice would be empty).
trainingSet = {
    year: temps[:len(temps) - prediction_length]
    for year, temps in dataset.items()
}
# Test set: the complete series, including the held-out days
testSet = dataset.copy()

In [None]:
train_key = 'deepar_training.json'
test_key = 'deepar_test.json'

def writeDataset(filename, data):
    """Write one JSON record per line to `filename`, one record per key of `data`.

    Each line has the form
    {"start":"<year>-01-01 00:00:00","target":<values>}.

    filename: path of the file to create or overwrite.
    data: mapping of year -> JSON-serialisable list of values for that year.
    """
    # The context manager guarantees the file is flushed and closed before the
    # caller uploads it, even if serialisation fails part-way through
    with open(filename, 'w') as file:
        for year, values in data.items():
            # json.dumps keeps the target valid JSON whatever the values are
            line = "\"start\":\"{}-01-01 00:00:00\",\"target\":{}".format(year, json.dumps(values))
            file.write('{' + line + '}\n')

In [None]:
writeDataset(train_key, trainingSet) 
writeDataset(test_key, testSet)

In [None]:
!head -2 deepar_training.json

In [None]:
# Bucket reported by the SageMaker session as its default
bucket = sagemaker.Session().default_bucket()
# Common key prefix for everything this notebook stores
prefix = "deepar-daily-temperature"

# Sub-prefixes for the training file, the test file and the job output
train_prefix = f'{prefix}/train'
test_prefix = f'{prefix}/test'
output_prefix = f'{prefix}/output'

In [None]:
# Session, execution role and region used by the upload and training cells
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# Upload both dataset files under their own prefixes and keep the values
# returned by upload_data for display
train_path = sagemaker_session.upload_data(train_key, bucket=bucket, key_prefix=train_prefix)
test_path = sagemaker_session.upload_data(test_key, bucket=bucket, key_prefix=test_prefix)
# Interpolate the actual bucket and prefix; without the braces the f-string
# is the literal text 's3://bucket/output_prefix'
output_path = f's3://{bucket}/{output_prefix}'

print(train_path)
print(test_path)
print(output_path)

In [None]:
!aws s3 ls s3://{bucket}/{prefix} --recursive

In [None]:
# Look up the image URI of the 'forecasting-deepar' framework for the
# region resolved from the boto3 session, and show it
container = image_uris.retrieve(framework='forecasting-deepar',region=region)
print(container)

In [None]:
# Configure the training job: the DeepAR image looked up earlier, run under
# the notebook's execution role on a single ml.c4.8xlarge instance.
estimator = sagemaker.estimator.Estimator(
 sagemaker_session=sagemaker_session,
 image_uri=container,
 role=role,
 instance_count=1,
 instance_type='ml.c4.8xlarge',
 # Prefix used when naming the training jobs
 base_job_name='daily-temperature',
 # Location handed to the estimator for the job's output
 output_path=output_path
)

In [None]:
# https://docs.aws.amazon.com/sagemaker/latest/dg/deepar_hyperparameters.html
# Settings passed to estimator.set_hyperparameters(); see the reference above
# for the meaning and valid range of each key.
# NOTE(review): values are a mix of ints (the two lengths) and strings.

hyperparameters = {
 "time_freq": 'D', # daily series
 "context_length": prediction_length, # set equal to the forecast horizon
 "prediction_length": prediction_length, # number of data points to predict
 "num_cells": "40",
 "num_layers": "2",
 "likelihood": "gaussian",
 "epochs": "250",
 "mini_batch_size": "32",
 "learning_rate": "0.00001",
 "dropout_rate": "0.05",
 "early_stopping_patience": "10" # stop if loss hasn't improved in 10 epochs
}

In [None]:
estimator.set_hyperparameters(**hyperparameters)