{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "217d65b8",
"metadata": {},
"outputs": [],
"source": [
"#importing all the libraries needed for this implementation. The warnings module was introduced as a way to warn \n",
"#programmers about changes in language or library features in anticipation of backwards incompatible changes coming \n",
"#with Python 3.0. Since warnings are not fatal, a program may encounter the same warn-able situation many times in \n",
"#the course of running a program\n",
"#The filter is set to ignore so that the warnings are not displayed\n",
"import warnings\n",
"warnings.filterwarnings('ignore')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "056db896",
"metadata": {},
"outputs": [],
"source": [
"#The main purpose of the OS module is to interact with the operating system\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "83977081",
"metadata": {},
"outputs": [],
"source": [
"#Boto3 is the name of the Python SDK for AWS. The module makes it easy to integrate the Python application, library, or script with AWS services including Amazon S3 and more\n",
"#It allows for direct creation, updating and deletion of AWS resources from your Python scripts\n",
"import boto3"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "9a6e2ff5",
"metadata": {},
"outputs": [],
"source": [
"#Python io module allows us to manage the file-related input and output operations\n",
"import io"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "d201b566",
"metadata": {},
"outputs": [],
"source": [
"#SageMaker Python SDK is an open-source library for training and deploying machine learning models on Amazon SageMaker\n",
"import sagemaker"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "431d51e4",
"metadata": {},
"outputs": [],
"source": [
"#For matrix operations and numerical processing\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "ff1d068f",
"metadata": {},
"outputs": [],
"source": [
"#for munging tabular data\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "fa20648e",
"metadata": {},
"outputs": [],
"source": [
"#For charts and visualizations \n",
"from matplotlib import pyplot as plt\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "0ecc9072",
"metadata": {},
"outputs": [],
"source": [
"#Scikit-learn is a machine learning library for Python. It features several regression, classification and clustering \n",
"#algorithms including SVMs, gradient boosting, k-means, random forests and DBSCAN. . Using train_test_split() from the\n",
"#data science library scikit-learn, you can split your dataset into subsets that minimize the potential for bias in \n",
"#your evaluation and validation process.\n",
"import sklearn\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import accuracy_score"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "c9718e1d",
"metadata": {},
"outputs": [],
"source": [
"#With Scikit-Learn it is extremely straight forward to implement linear regression models, as all you really need to \n",
"#do is import the LinearRegression class, instantiate it, and call the fit() method along with our training data.\n",
"from sklearn.linear_model import LinearRegression"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "1ca0e38d",
"metadata": {},
"outputs": [],
"source": [
"#importing other modules that makr it easier to work with the data\n",
"import pickle,gzip,urllib,json,csv"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "7d460506",
"metadata": {},
"outputs": [],
"source": [
"#The preprocessing module provides the StandardScaler utility class, which is a quick and easy way to perform the \n",
"#many operations on an array-like dataset\n",
"from sklearn import preprocessing"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "cfc4d436",
"metadata": {},
"outputs": [],
"source": [
"#Get the execution role for the notebook instance. This is the IAM role that you created for your notebook instance.\n",
"#You pass the role to the tuning job.\n",
"from sagemaker import get_execution_role\n",
"role = get_execution_role()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "291a762b",
"metadata": {},
"outputs": [],
"source": [
"#Resources represent an object-oriented interface to Amazon Web Services (AWS). They provide a higher-level \n",
"#abstraction than the raw, low-level calls made by service clients. Defining the S3 resource, bucket name and csv\n",
"#file object\n",
"s3 = boto3.resource('s3')\n",
"bucket_name = 'aws-machinelearning-chez'\n",
"object_key = 'weatherhistory2.csv'"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "e3ca18e0",
"metadata": {},
"outputs": [],
"source": [
"#Use the boto3 client to retrieve the file from bucket aws-machinelearning-chez named weatherhistory.csv. \n",
"#Define Weather dataset as the csv file read in the response from the get_object operation\n",
"s3_client = boto3.client('s3')\n",
"response = s3_client.get_object(Bucket=bucket_name, Key=object_key)\n",
"response_body = response[\"Body\"].read()\n",
"weather_df = pd.read_csv(io.BytesIO(response_body), header=0, delimiter=\",\", low_memory=False)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "b43f2087",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Formatted Date \n",
" Summary \n",
" Precip Type \n",
" Temperature (C) \n",
" Apparent Temperature (C) \n",
" Humidity \n",
" Wind Speed (km/h) \n",
" Wind Bearing (degrees) \n",
" Visibility (km) \n",
" Loud Cover \n",
" Pressure (millibars) \n",
" Daily Summary \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 2006-04-01 00:00:00.000 +0200 \n",
" Partly Cloudy \n",
" rain \n",
" 9.472222 \n",
" 7.388889 \n",
" 0.89 \n",
" 14.1197 \n",
" 251 \n",
" 15.8263 \n",
" 0 \n",
" 1015.13 \n",
" Partly cloudy throughout the day. \n",
" \n",
" \n",
" 1 \n",
" 2006-04-01 01:00:00.000 +0200 \n",
" Partly Cloudy \n",
" rain \n",
" 9.355556 \n",
" 7.227778 \n",
" 0.86 \n",
" 14.2646 \n",
" 259 \n",
" 15.8263 \n",
" 0 \n",
" 1015.63 \n",
" Partly cloudy throughout the day. \n",
" \n",
" \n",
" 2 \n",
" 2006-04-01 02:00:00.000 +0200 \n",
" Mostly Cloudy \n",
" rain \n",
" 9.377778 \n",
" 9.377778 \n",
" 0.89 \n",
" 3.9284 \n",
" 204 \n",
" 14.9569 \n",
" 0 \n",
" 1015.94 \n",
" Partly cloudy throughout the day. \n",
" \n",
" \n",
" 3 \n",
" 2006-04-01 03:00:00.000 +0200 \n",
" Partly Cloudy \n",
" rain \n",
" 8.288889 \n",
" 5.944444 \n",
" 0.83 \n",
" 14.1036 \n",
" 269 \n",
" 15.8263 \n",
" 0 \n",
" 1016.41 \n",
" Partly cloudy throughout the day. \n",
" \n",
" \n",
" 4 \n",
" 2006-04-01 04:00:00.000 +0200 \n",
" Mostly Cloudy \n",
" rain \n",
" 8.755556 \n",
" 6.977778 \n",
" 0.83 \n",
" 11.0446 \n",
" 259 \n",
" 15.8263 \n",
" 0 \n",
" 1016.51 \n",
" Partly cloudy throughout the day. \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Formatted Date Summary Precip Type Temperature (C) \\\n",
"0 2006-04-01 00:00:00.000 +0200 Partly Cloudy rain 9.472222 \n",
"1 2006-04-01 01:00:00.000 +0200 Partly Cloudy rain 9.355556 \n",
"2 2006-04-01 02:00:00.000 +0200 Mostly Cloudy rain 9.377778 \n",
"3 2006-04-01 03:00:00.000 +0200 Partly Cloudy rain 8.288889 \n",
"4 2006-04-01 04:00:00.000 +0200 Mostly Cloudy rain 8.755556 \n",
"\n",
" Apparent Temperature (C) Humidity Wind Speed (km/h) \\\n",
"0 7.388889 0.89 14.1197 \n",
"1 7.227778 0.86 14.2646 \n",
"2 9.377778 0.89 3.9284 \n",
"3 5.944444 0.83 14.1036 \n",
"4 6.977778 0.83 11.0446 \n",
"\n",
" Wind Bearing (degrees) Visibility (km) Loud Cover Pressure (millibars) \\\n",
"0 251 15.8263 0 1015.13 \n",
"1 259 15.8263 0 1015.63 \n",
"2 204 14.9569 0 1015.94 \n",
"3 269 15.8263 0 1016.41 \n",
"4 259 15.8263 0 1016.51 \n",
"\n",
" Daily Summary \n",
"0 Partly cloudy throughout the day. \n",
"1 Partly cloudy throughout the day. \n",
"2 Partly cloudy throughout the day. \n",
"3 Partly cloudy throughout the day. \n",
"4 Partly cloudy throughout the day. "
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Look at the data\n",
"weather_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "ad2e2238",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['Formatted Date', 'Summary', 'Precip Type', 'Temperature (C)',\n",
" 'Apparent Temperature (C)', 'Humidity', 'Wind Speed (km/h)',\n",
" 'Wind Bearing (degrees)', 'Visibility (km)', 'Loud Cover',\n",
" 'Pressure (millibars)', 'Daily Summary'],\n",
" dtype='object')"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Gives an array of colums names \n",
"weather_df.columns"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "748178d6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Temperature (C) \n",
" Apparent Temperature (C) \n",
" Humidity \n",
" Wind Speed (km/h) \n",
" Wind Bearing (degrees) \n",
" Visibility (km) \n",
" Loud Cover \n",
" Pressure (millibars) \n",
" \n",
" \n",
" \n",
" \n",
" count \n",
" 96453.000000 \n",
" 96453.000000 \n",
" 96453.000000 \n",
" 96453.000000 \n",
" 96453.000000 \n",
" 96453.000000 \n",
" 96453.0 \n",
" 96453.000000 \n",
" \n",
" \n",
" mean \n",
" 11.932678 \n",
" 10.855029 \n",
" 0.734899 \n",
" 10.810640 \n",
" 187.509232 \n",
" 10.347325 \n",
" 0.0 \n",
" 1003.235956 \n",
" \n",
" \n",
" std \n",
" 9.551546 \n",
" 10.696847 \n",
" 0.195473 \n",
" 6.913571 \n",
" 107.383428 \n",
" 4.192123 \n",
" 0.0 \n",
" 116.969906 \n",
" \n",
" \n",
" min \n",
" -21.822222 \n",
" -27.716667 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.0 \n",
" 0.000000 \n",
" \n",
" \n",
" 25% \n",
" 4.688889 \n",
" 2.311111 \n",
" 0.600000 \n",
" 5.828200 \n",
" 116.000000 \n",
" 8.339800 \n",
" 0.0 \n",
" 1011.900000 \n",
" \n",
" \n",
" 50% \n",
" 12.000000 \n",
" 12.000000 \n",
" 0.780000 \n",
" 9.965900 \n",
" 180.000000 \n",
" 10.046400 \n",
" 0.0 \n",
" 1016.450000 \n",
" \n",
" \n",
" 75% \n",
" 18.838889 \n",
" 18.838889 \n",
" 0.890000 \n",
" 14.135800 \n",
" 290.000000 \n",
" 14.812000 \n",
" 0.0 \n",
" 1021.090000 \n",
" \n",
" \n",
" max \n",
" 39.905556 \n",
" 39.344444 \n",
" 1.000000 \n",
" 63.852600 \n",
" 359.000000 \n",
" 16.100000 \n",
" 0.0 \n",
" 1046.380000 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Temperature (C) Apparent Temperature (C) Humidity \\\n",
"count 96453.000000 96453.000000 96453.000000 \n",
"mean 11.932678 10.855029 0.734899 \n",
"std 9.551546 10.696847 0.195473 \n",
"min -21.822222 -27.716667 0.000000 \n",
"25% 4.688889 2.311111 0.600000 \n",
"50% 12.000000 12.000000 0.780000 \n",
"75% 18.838889 18.838889 0.890000 \n",
"max 39.905556 39.344444 1.000000 \n",
"\n",
" Wind Speed (km/h) Wind Bearing (degrees) Visibility (km) Loud Cover \\\n",
"count 96453.000000 96453.000000 96453.000000 96453.0 \n",
"mean 10.810640 187.509232 10.347325 0.0 \n",
"std 6.913571 107.383428 4.192123 0.0 \n",
"min 0.000000 0.000000 0.000000 0.0 \n",
"25% 5.828200 116.000000 8.339800 0.0 \n",
"50% 9.965900 180.000000 10.046400 0.0 \n",
"75% 14.135800 290.000000 14.812000 0.0 \n",
"max 63.852600 359.000000 16.100000 0.0 \n",
"\n",
" Pressure (millibars) \n",
"count 96453.000000 \n",
"mean 1003.235956 \n",
"std 116.969906 \n",
"min 0.000000 \n",
"25% 1011.900000 \n",
"50% 1016.450000 \n",
"75% 1021.090000 \n",
"max 1046.380000 "
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Generate descriptive statistics that summarize the central tendency, dispersion and shape of the dataset’s \n",
"#distribution excluding NaN values\n",
"weather_df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "59c838b9",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" count \n",
" mean \n",
" std \n",
" min \n",
" 25% \n",
" 50% \n",
" 75% \n",
" max \n",
" \n",
" \n",
" \n",
" \n",
" Temperature (C) \n",
" 96453.0 \n",
" 11.932678 \n",
" 9.551546 \n",
" -21.822222 \n",
" 4.688889 \n",
" 12.0000 \n",
" 18.838889 \n",
" 39.905556 \n",
" \n",
" \n",
" Apparent Temperature (C) \n",
" 96453.0 \n",
" 10.855029 \n",
" 10.696847 \n",
" -27.716667 \n",
" 2.311111 \n",
" 12.0000 \n",
" 18.838889 \n",
" 39.344444 \n",
" \n",
" \n",
" Humidity \n",
" 96453.0 \n",
" 0.734899 \n",
" 0.195473 \n",
" 0.000000 \n",
" 0.600000 \n",
" 0.7800 \n",
" 0.890000 \n",
" 1.000000 \n",
" \n",
" \n",
" Wind Speed (km/h) \n",
" 96453.0 \n",
" 10.810640 \n",
" 6.913571 \n",
" 0.000000 \n",
" 5.828200 \n",
" 9.9659 \n",
" 14.135800 \n",
" 63.852600 \n",
" \n",
" \n",
" Wind Bearing (degrees) \n",
" 96453.0 \n",
" 187.509232 \n",
" 107.383428 \n",
" 0.000000 \n",
" 116.000000 \n",
" 180.0000 \n",
" 290.000000 \n",
" 359.000000 \n",
" \n",
" \n",
" Visibility (km) \n",
" 96453.0 \n",
" 10.347325 \n",
" 4.192123 \n",
" 0.000000 \n",
" 8.339800 \n",
" 10.0464 \n",
" 14.812000 \n",
" 16.100000 \n",
" \n",
" \n",
" Loud Cover \n",
" 96453.0 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.0000 \n",
" 0.000000 \n",
" 0.000000 \n",
" \n",
" \n",
" Pressure (millibars) \n",
" 96453.0 \n",
" 1003.235956 \n",
" 116.969906 \n",
" 0.000000 \n",
" 1011.900000 \n",
" 1016.4500 \n",
" 1021.090000 \n",
" 1046.380000 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" count mean std min \\\n",
"Temperature (C) 96453.0 11.932678 9.551546 -21.822222 \n",
"Apparent Temperature (C) 96453.0 10.855029 10.696847 -27.716667 \n",
"Humidity 96453.0 0.734899 0.195473 0.000000 \n",
"Wind Speed (km/h) 96453.0 10.810640 6.913571 0.000000 \n",
"Wind Bearing (degrees) 96453.0 187.509232 107.383428 0.000000 \n",
"Visibility (km) 96453.0 10.347325 4.192123 0.000000 \n",
"Loud Cover 96453.0 0.000000 0.000000 0.000000 \n",
"Pressure (millibars) 96453.0 1003.235956 116.969906 0.000000 \n",
"\n",
" 25% 50% 75% max \n",
"Temperature (C) 4.688889 12.0000 18.838889 39.905556 \n",
"Apparent Temperature (C) 2.311111 12.0000 18.838889 39.344444 \n",
"Humidity 0.600000 0.7800 0.890000 1.000000 \n",
"Wind Speed (km/h) 5.828200 9.9659 14.135800 63.852600 \n",
"Wind Bearing (degrees) 116.000000 180.0000 290.000000 359.000000 \n",
"Visibility (km) 8.339800 10.0464 14.812000 16.100000 \n",
"Loud Cover 0.000000 0.0000 0.000000 0.000000 \n",
"Pressure (millibars) 1011.900000 1016.4500 1021.090000 1046.380000 "
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"weather_df.describe().transpose()"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "8e8c1115",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"#A seaborn plot enables us to visualize a statistical plot of the raw data\n",
"import seaborn\n",
"plt.figure(figsize=(15,10))\n",
"plt.tight_layout()\n",
"seaborn.distplot(weather_df['Apparent Temperature (C)'])\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "7d82a835",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 96453 entries, 0 to 96452\n",
"Data columns (total 12 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Formatted Date 96453 non-null object \n",
" 1 Summary 96453 non-null object \n",
" 2 Precip Type 95936 non-null object \n",
" 3 Temperature (C) 96453 non-null float64\n",
" 4 Apparent Temperature (C) 96453 non-null float64\n",
" 5 Humidity 96453 non-null float64\n",
" 6 Wind Speed (km/h) 96453 non-null float64\n",
" 7 Wind Bearing (degrees) 96453 non-null int64 \n",
" 8 Visibility (km) 96453 non-null float64\n",
" 9 Loud Cover 96453 non-null int64 \n",
" 10 Pressure (millibars) 96453 non-null float64\n",
" 11 Daily Summary 96453 non-null object \n",
"dtypes: float64(6), int64(2), object(4)\n",
"memory usage: 8.8+ MB\n"
]
}
],
"source": [
"#Prints a concise summary of the dataframe\n",
"weather_df.info()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "c246ca16",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Formatted Date False\n",
"Summary False\n",
"Precip Type True\n",
"Temperature (C) False\n",
"Apparent Temperature (C) False\n",
"Humidity False\n",
"Wind Speed (km/h) False\n",
"Wind Bearing (degrees) False\n",
"Visibility (km) False\n",
"Loud Cover False\n",
"Pressure (millibars) False\n",
"Daily Summary False\n",
"dtype: bool"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#check if there is any null value in a particular column\n",
"weather_df.isnull().any()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "6c9c0621",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Formatted Date False\n",
"Summary False\n",
"Precip Type False\n",
"Temperature (C) False\n",
"Apparent Temperature (C) False\n",
"Humidity False\n",
"Wind Speed (km/h) False\n",
"Wind Bearing (degrees) False\n",
"Visibility (km) False\n",
"Loud Cover False\n",
"Pressure (millibars) False\n",
"Daily Summary False\n",
"dtype: bool"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#check if there are all null values in a particular column\n",
"weather_df.isnull().all()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "9a09da4e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Formatted Date 0.00\n",
"Summary 0.00\n",
"Precip Type 0.54\n",
"Temperature (C) 0.00\n",
"Apparent Temperature (C) 0.00\n",
"Humidity 0.00\n",
"Wind Speed (km/h) 0.00\n",
"Wind Bearing (degrees) 0.00\n",
"Visibility (km) 0.00\n",
"Loud Cover 0.00\n",
"Pressure (millibars) 0.00\n",
"Daily Summary 0.00\n",
"dtype: float64"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#calculate the percentage of null values in each column\n",
"round(100*(weather_df.isnull().sum()/len(weather_df.index)),2)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "b0903331",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"rain 85224\n",
"snow 10712\n",
"Name: Precip Type, dtype: int64"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#determine what values are in this column and what the count is of each value\n",
"weather_df['Precip Type'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "cbc2584b",
"metadata": {},
"outputs": [],
"source": [
"#Handling missing values by imputation of the mode i.e. rain\n",
"weather_df.loc[weather_df['Precip Type'].isnull(),'Precip Type']='rain'"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "9ff40c89",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Formatted Date 0.0\n",
"Summary 0.0\n",
"Precip Type 0.0\n",
"Temperature (C) 0.0\n",
"Apparent Temperature (C) 0.0\n",
"Humidity 0.0\n",
"Wind Speed (km/h) 0.0\n",
"Wind Bearing (degrees) 0.0\n",
"Visibility (km) 0.0\n",
"Loud Cover 0.0\n",
"Pressure (millibars) 0.0\n",
"Daily Summary 0.0\n",
"dtype: float64"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#calculate the percentage of null values in each column for the second time\n",
"round(100*(weather_df.isnull().sum()/len(weather_df.index)),2)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "0b15207d",
"metadata": {},
"outputs": [],
"source": [
"#Input binary values in Precip type column for rain and snow\n",
"weather_df.loc[weather_df['Precip Type']=='rain','Precip Type']=1\n",
"weather_df.loc[weather_df['Precip Type']=='snow','Precip Type']=0"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "1fd8dc2e",
"metadata": {},
"outputs": [],
"source": [
"weather_df_num=weather_df[list(weather_df.dtypes[weather_df.dtypes!='odject'].index)]"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "2f744270",
"metadata": {},
"outputs": [],
"source": [
"#Drop columns that are irrelevant to the model training\n",
"to_drop = ['Formatted Date','Summary','Daily Summary']\n",
"weather_df.drop(to_drop, inplace=True, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "dc3b792f",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Precip Type \n",
" Temperature (C) \n",
" Apparent Temperature (C) \n",
" Humidity \n",
" Wind Speed (km/h) \n",
" Wind Bearing (degrees) \n",
" Visibility (km) \n",
" Loud Cover \n",
" Pressure (millibars) \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" 1 \n",
" 9.472222 \n",
" 7.388889 \n",
" 0.89 \n",
" 14.1197 \n",
" 251 \n",
" 15.8263 \n",
" 0 \n",
" 1015.13 \n",
" \n",
" \n",
" 1 \n",
" 1 \n",
" 9.355556 \n",
" 7.227778 \n",
" 0.86 \n",
" 14.2646 \n",
" 259 \n",
" 15.8263 \n",
" 0 \n",
" 1015.63 \n",
" \n",
" \n",
" 2 \n",
" 1 \n",
" 9.377778 \n",
" 9.377778 \n",
" 0.89 \n",
" 3.9284 \n",
" 204 \n",
" 14.9569 \n",
" 0 \n",
" 1015.94 \n",
" \n",
" \n",
" 3 \n",
" 1 \n",
" 8.288889 \n",
" 5.944444 \n",
" 0.83 \n",
" 14.1036 \n",
" 269 \n",
" 15.8263 \n",
" 0 \n",
" 1016.41 \n",
" \n",
" \n",
" 4 \n",
" 1 \n",
" 8.755556 \n",
" 6.977778 \n",
" 0.83 \n",
" 11.0446 \n",
" 259 \n",
" 15.8263 \n",
" 0 \n",
" 1016.51 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Precip Type Temperature (C) Apparent Temperature (C) Humidity \\\n",
"0 1 9.472222 7.388889 0.89 \n",
"1 1 9.355556 7.227778 0.86 \n",
"2 1 9.377778 9.377778 0.89 \n",
"3 1 8.288889 5.944444 0.83 \n",
"4 1 8.755556 6.977778 0.83 \n",
"\n",
" Wind Speed (km/h) Wind Bearing (degrees) Visibility (km) Loud Cover \\\n",
"0 14.1197 251 15.8263 0 \n",
"1 14.2646 259 15.8263 0 \n",
"2 3.9284 204 14.9569 0 \n",
"3 14.1036 269 15.8263 0 \n",
"4 11.0446 259 15.8263 0 \n",
"\n",
" Pressure (millibars) \n",
"0 1015.13 \n",
"1 1015.63 \n",
"2 1015.94 \n",
"3 1016.41 \n",
"4 1016.51 "
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"weather_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "ff648831",
"metadata": {},
"outputs": [],
"source": [
"weather_df_num=weather_df[list(weather_df.dtypes[weather_df.dtypes!='odject'].index)]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "3bc98770",
"metadata": {},
"outputs": [],
"source": [
"#Separating the training and the test datasets. The feature that we are predicting is temperature, so we separate \n",
"#it out into a column on its own (weather_y) leaving the rest of the data in weather_x\n",
"weather_df_y = weather_df_num.pop('Temperature (C)')\n",
"weather_df_x = weather_df_num"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "62fe9537",
"metadata": {},
"outputs": [],
"source": [
"#In supervised machine learning, we add data to the model and tell it what the output will be based on the various \n",
"#conditions of that data. The model then learns from these features and is then able to make a prediction for new \n",
"#values \n",
"#So we now split the data into train and test\n",
"\n",
"train_x,test_x,train_y,test_y = train_test_split(weather_df_x,weather_df_y,test_size = 0.2,random_state=4)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "56acf2bc",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Precip Type \n",
" Apparent Temperature (C) \n",
" Humidity \n",
" Wind Speed (km/h) \n",
" Wind Bearing (degrees) \n",
" Visibility (km) \n",
" Loud Cover \n",
" Pressure (millibars) \n",
" \n",
" \n",
" \n",
" \n",
" 70626 \n",
" 1 \n",
" 21.061111 \n",
" 0.31 \n",
" 12.5580 \n",
" 110 \n",
" 16.1000 \n",
" 0 \n",
" 1005.87 \n",
" \n",
" \n",
" 52457 \n",
" 1 \n",
" 25.016667 \n",
" 0.36 \n",
" 18.4989 \n",
" 352 \n",
" 10.3523 \n",
" 0 \n",
" 1025.36 \n",
" \n",
" \n",
" 90690 \n",
" 1 \n",
" 0.738889 \n",
" 0.89 \n",
" 17.1304 \n",
" 270 \n",
" 15.8263 \n",
" 0 \n",
" 1014.75 \n",
" \n",
" \n",
" 69528 \n",
" 1 \n",
" 13.772222 \n",
" 0.78 \n",
" 14.4900 \n",
" 300 \n",
" 15.8263 \n",
" 0 \n",
" 1014.56 \n",
" \n",
" \n",
" 92419 \n",
" 1 \n",
" 23.288889 \n",
" 0.82 \n",
" 6.3917 \n",
" 357 \n",
" 16.1000 \n",
" 0 \n",
" 1022.05 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Precip Type Apparent Temperature (C) Humidity Wind Speed (km/h) \\\n",
"70626 1 21.061111 0.31 12.5580 \n",
"52457 1 25.016667 0.36 18.4989 \n",
"90690 1 0.738889 0.89 17.1304 \n",
"69528 1 13.772222 0.78 14.4900 \n",
"92419 1 23.288889 0.82 6.3917 \n",
"\n",
" Wind Bearing (degrees) Visibility (km) Loud Cover \\\n",
"70626 110 16.1000 0 \n",
"52457 352 10.3523 0 \n",
"90690 270 15.8263 0 \n",
"69528 300 15.8263 0 \n",
"92419 357 16.1000 0 \n",
"\n",
" Pressure (millibars) \n",
"70626 1005.87 \n",
"52457 1025.36 \n",
"90690 1014.75 \n",
"69528 1014.56 \n",
"92419 1022.05 "
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Look at the training dataset. It has all columns except the temperature column\n",
"train_x.head()"
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "2f30439e",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"70626 21.061111\n",
"52457 25.016667\n",
"90690 4.422222\n",
"69528 13.772222\n",
"92419 23.288889\n",
"Name: Temperature (C), dtype: float64"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_y.head()"
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "a936d649",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"37443 -2.288889\n",
"86534 8.861111\n",
"2082 9.805556\n",
"53130 27.222222\n",
"45196 17.705556\n",
"Name: Temperature (C), dtype: float64"
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_y.head()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "16b30e84",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(19291, 8)"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_x.shape"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "6e7af2d1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(77162, 8)"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_x.shape"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "f97f3a15",
"metadata": {},
"outputs": [],
"source": [
"#Using polynomial regression to train the model. Polynomial Regression is a one of the types of linear regression in \n",
"#which the relationship between the independent variable x and dependent variable y is modeled as an nth degree \n",
"#polynomial. In this case that degree is 4\n",
"\n",
"from sklearn.preprocessing import PolynomialFeatures\n",
"poly = PolynomialFeatures(degree = 4)"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "794b8782",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"PolynomialFeatures(degree=4)"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Now we train the model\n",
"X_poly = poly.fit_transform(train_x)\n",
"poly.fit(X_poly, train_y)"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "033f87aa",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DecisionTreeRegressor(random_state=0)"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Now we use Decision Tree Regressor to train the model\n",
"#Decision tree builds regression or classification models in the form of a tree structure. It breaks down a dataset\n",
"#into smaller and smaller subsets while at the same time an associated decision tree is incrementally developed. \n",
"#The final result is a tree with decision nodes and leaf nodes. \n",
"\n",
"from sklearn.tree import DecisionTreeRegressor\n",
"regressor = DecisionTreeRegressor(random_state=0)\n",
"regressor.fit(train_x,train_y)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "a3cc05b3",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.00803046229678356"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#We now look at how well out model has been trained by feeding it test data and getting it to make a prediction and\n",
"#then comparing its output to test data output\n",
"\n",
"prediction3 = regressor.predict(test_x)\n",
"np.mean((prediction3-test_y)**2)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "278387c1",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" actual \n",
" prediction \n",
" diff \n",
" \n",
" \n",
" \n",
" \n",
" 37443 \n",
" -2.288889 \n",
" -2.266667 \n",
" -2.222222e-02 \n",
" \n",
" \n",
" 86534 \n",
" 8.861111 \n",
" 8.855556 \n",
" 5.555555e-03 \n",
" \n",
" \n",
" 2082 \n",
" 9.805556 \n",
" 9.811111 \n",
" -5.555555e-03 \n",
" \n",
" \n",
" 53130 \n",
" 27.222222 \n",
" 27.222222 \n",
" 0.000000e+00 \n",
" \n",
" \n",
" 45196 \n",
" 17.705556 \n",
" 17.705556 \n",
" 3.552714e-15 \n",
" \n",
" \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" \n",
" \n",
" 55387 \n",
" -10.066667 \n",
" -10.000000 \n",
" -6.666667e-02 \n",
" \n",
" \n",
" 25976 \n",
" 9.972222 \n",
" 9.972222 \n",
" 0.000000e+00 \n",
" \n",
" \n",
" 32423 \n",
" 12.777778 \n",
" 12.777778 \n",
" -6.217249e-14 \n",
" \n",
" \n",
" 26570 \n",
" 15.277778 \n",
" 15.277778 \n",
" 0.000000e+00 \n",
" \n",
" \n",
" 45834 \n",
" 2.200000 \n",
" 2.200000 \n",
" 4.440892e-16 \n",
" \n",
" \n",
"
\n",
"
19291 rows × 3 columns
\n",
"
"
],
"text/plain": [
" actual prediction diff\n",
"37443 -2.288889 -2.266667 -2.222222e-02\n",
"86534 8.861111 8.855556 5.555555e-03\n",
"2082 9.805556 9.811111 -5.555555e-03\n",
"53130 27.222222 27.222222 0.000000e+00\n",
"45196 17.705556 17.705556 3.552714e-15\n",
"... ... ... ...\n",
"55387 -10.066667 -10.000000 -6.666667e-02\n",
"25976 9.972222 9.972222 0.000000e+00\n",
"32423 12.777778 12.777778 -6.217249e-14\n",
"26570 15.277778 15.277778 0.000000e+00\n",
"45834 2.200000 2.200000 4.440892e-16\n",
"\n",
"[19291 rows x 3 columns]"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Now look at the difference between predicted and actual\n",
"pd.DataFrame({'actual':test_y,\n",
" 'prediction':prediction3,\n",
" 'diff':(test_y-prediction3)})"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "3914b5fa",
"metadata": {},
"outputs": [],
"source": [
"#Now look at the difference between predicted and actual\n",
"df=pd.DataFrame({'actual':test_y,\n",
" 'prediction':prediction3,\n",
" 'diff':(test_y-prediction3)})"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "b82a4136",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"df1= df.head(25)\n",
"df1.plot(kind='bar', figsize=(16,10))\n",
"plt.grid(which='major', linestyle='-',linewidth='0.5', color='green')\n",
"plt.grid(which='minor', linestyle=':',linewidth='0.5', color='black')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "23224eb6",
"metadata": {},
"outputs": [],
"source": [
"#Import metrics library\n",
"from sklearn import metrics"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "2780f7e7",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.021708741547046377\n"
]
}
],
"source": [
"#Print results of MAE\n",
"print(metrics.mean_absolute_error(test_y, prediction3))"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "860e7137",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.00803046229678357\n"
]
}
],
"source": [
"#Print results of MSE\n",
"print(metrics.mean_squared_error(test_y, prediction3))"
]
},
{
"cell_type": "code",
"execution_count": 56,
"id": "edbe23f3",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.08961284671732937\n"
]
}
],
"source": [
"#Root mean squared error (RMSE): square root of the MSE between the test targets and prediction3\n",
"print(np.sqrt(metrics.mean_squared_error(y_true=test_y, y_pred=prediction3)))"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "fb7ce592",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"RandomForestRegressor(max_depth=10, random_state=0)"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"## Fit a random forest regressor (100 trees, depth capped at 10, fixed seed) on the training data\n",
"from sklearn.ensemble import RandomForestRegressor\n",
"regr = RandomForestRegressor(\n",
"    n_estimators=100,\n",
"    max_depth=10,\n",
"    random_state=0,\n",
")\n",
"regr.fit(X=train_x, y=train_y)"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "e2afeb22",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.008141637115617098"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Check how well our model has been trained: predict on the test inputs and\n",
"#report the mean squared difference between those predictions and the test targets\n",
"\n",
"prediction4 = regr.predict(X=test_x)\n",
"np.mean(np.square(prediction4 - test_y))"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "57855a0b",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" actual \n",
" prediction \n",
" diff \n",
" \n",
" \n",
" \n",
" \n",
" 37443 \n",
" -2.288889 \n",
" -2.482177 \n",
" 0.193288 \n",
" \n",
" \n",
" 86534 \n",
" 8.861111 \n",
" 8.892260 \n",
" -0.031149 \n",
" \n",
" \n",
" 2082 \n",
" 9.805556 \n",
" 9.847246 \n",
" -0.041690 \n",
" \n",
" \n",
" 53130 \n",
" 27.222222 \n",
" 27.203396 \n",
" 0.018826 \n",
" \n",
" \n",
" 45196 \n",
" 17.705556 \n",
" 17.713716 \n",
" -0.008160 \n",
" \n",
" \n",
" ... \n",
" ... \n",
" ... \n",
" ... \n",
" \n",
" \n",
" 55387 \n",
" -10.066667 \n",
" -10.071566 \n",
" 0.004899 \n",
" \n",
" \n",
" 25976 \n",
" 9.972222 \n",
" 9.913953 \n",
" 0.058269 \n",
" \n",
" \n",
" 32423 \n",
" 12.777778 \n",
" 12.781718 \n",
" -0.003940 \n",
" \n",
" \n",
" 26570 \n",
" 15.277778 \n",
" 15.275648 \n",
" 0.002130 \n",
" \n",
" \n",
" 45834 \n",
" 2.200000 \n",
" 2.187450 \n",
" 0.012550 \n",
" \n",
" \n",
"
\n",
"
19291 rows × 3 columns
\n",
"
"
],
"text/plain": [
" actual prediction diff\n",
"37443 -2.288889 -2.482177 0.193288\n",
"86534 8.861111 8.892260 -0.031149\n",
"2082 9.805556 9.847246 -0.041690\n",
"53130 27.222222 27.203396 0.018826\n",
"45196 17.705556 17.713716 -0.008160\n",
"... ... ... ...\n",
"55387 -10.066667 -10.071566 0.004899\n",
"25976 9.972222 9.913953 0.058269\n",
"32423 12.777778 12.781718 -0.003940\n",
"26570 15.277778 15.275648 0.002130\n",
"45834 2.200000 2.187450 0.012550\n",
"\n",
"[19291 rows x 3 columns]"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Tabulate the actual target, the random forest prediction and the residual (actual - prediction)\n",
"pd.DataFrame(dict(actual=test_y,\n",
"                  prediction=prediction4,\n",
"                  diff=test_y - prediction4))"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "af4663b4",
"metadata": {},
"outputs": [],
"source": [
"#Store the random forest actual / prediction / diff table in df2 so it can be plotted\n",
"df2 = pd.DataFrame(dict(actual=test_y,\n",
"                        prediction=prediction4,\n",
"                        diff=test_y - prediction4))"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "5413c801",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
""
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"#Bar chart of actual, prediction and diff for the first 25 rows of df2 (random forest)\n",
"df1 = df2.head(25)\n",
"fig, ax = plt.subplots(figsize=(16, 10))\n",
"df1.plot(kind='bar', ax=ax)\n",
"#Title and axis labels so the figure can be read on its own\n",
"ax.set_title('Random forest: actual vs. predicted, first 25 test rows')\n",
"ax.set_xlabel('Test row index')\n",
"ax.set_ylabel('Value')\n",
"#Minor ticks are off by default; without them the minor grid below draws nothing\n",
"ax.minorticks_on()\n",
"ax.grid(which='major', linestyle='-', linewidth=0.5, color='green')\n",
"ax.grid(which='minor', linestyle=':', linewidth=0.5, color='black')\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "a3e3ce21",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.0432385808682435\n"
]
}
],
"source": [
"#Mean absolute error (MAE) of the random forest predictions on the test targets\n",
"print(metrics.mean_absolute_error(y_true=test_y, y_pred=prediction4))"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "f38df479",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.008141637115617032\n"
]
}
],
"source": [
"#Mean squared error (MSE) of the random forest predictions on the test targets\n",
"print(metrics.mean_squared_error(y_true=test_y, y_pred=prediction4))"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "39a5e097",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0.09023102080558011\n"
]
}
],
"source": [
"#Root mean squared error (RMSE) of the random forest predictions: square root of the MSE\n",
"print(np.sqrt(metrics.mean_squared_error(y_true=test_y, y_pred=prediction4)))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "52c3359c",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "conda_python3",
"language": "python",
"name": "conda_python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}