{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "conscious-titanium",
   "metadata": {},
   "source": [
    "# Data preparation\n",
    "---\n",
    "In this notebook, we're going to:\n",
    "1. Fetch open source data from https://opendata-renewables.engie.com/ and \n",
    "2. Preprocess it for for further analysis in the subsequent notebooks.\n",
    "\n",
    "Open source data is made available by Engie group and is distributed under Open License version 2.0 published by Etalab.\n",
    "\n",
    "\n",
    "\n",
    "## Step 1: Fetch Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "swiss-belly",
   "metadata": {},
   "outputs": [],
   "source": [
    "# declare variables for web scraping the data\n",
    "URL = 'https://opendata-renewables.engie.com'\n",
    "LOCAL_DATA = 'data'\n",
    "URL_DIR = f'{LOCAL_DATA}/web_scrape'\n",
    "\n",
    "# Create Amazon S3 bucket and provide bucket name below\n",
    "BUCKET = '<Your Amazon S3 Bucket>'\n",
    "PREFIX = 'wind-turbine/training_data'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "limiting-shape",
   "metadata": {},
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "import zipfile\n",
    "import pandas as pd\n",
    "\n",
    "from utils import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "burning-sugar",
   "metadata": {},
   "outputs": [],
   "source": [
    "# create a wroking directory for web scraping operations\n",
    "!mkdir $LOCAL_DATA\n",
    "!mkdir $URL_DIR\n",
    "\n",
    "# download index.html\n",
    "!wget --output-document=$URL_DIR/index.html $URL\n",
    "\n",
    "# scrape with BS\n",
    "with open(f'{URL_DIR}/index.html') as fp:\n",
    "    soup = BeautifulSoup(fp, \"html.parser\")\n",
    "\n",
    "# get url paths to the raw zipped files\n",
    "data_urls = [URL + x['href'].\\\n",
    "        replace('explore','media'). \\\n",
    "        replace('dataset','datasets'). \\\n",
    "        replace('/information','.zip') \\\n",
    "        for x in list(set([x for x in soup.find_all('a', href=True) if 'dataset' in x['href']]))]\n",
    "\n",
    "# store url paths as a text\n",
    "with open(f'{URL_DIR}/url_list.txt', 'w') as f:\n",
    "    for item in data_urls:\n",
    "        f.write(\"%s\\n\" % item)\n",
    "\n",
    "# download raw zipped data files\n",
    "!wget --input $URL_DIR/url_list.txt --directory-prefix=$URL_DIR/raw_zip"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "caring-printer",
   "metadata": {},
   "outputs": [],
   "source": [
    "zip_files = !ls $URL_DIR/raw_zip\n",
    "zip_files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "spanish-nylon",
   "metadata": {},
   "outputs": [],
   "source": [
    "# unzip files\n",
    "for zip_file_ in zip_files:\n",
    "    with zipfile.ZipFile(f'{URL_DIR}/raw_zip/{zip_file_}', 'r') as zip_ref:\n",
    "        zip_ref.extractall(f'{URL_DIR}/raw_unzip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "inside-invalid",
   "metadata": {},
   "outputs": [],
   "source": [
    "# print unzipped files names\n",
    "unzipped_files = !ls $URL_DIR/raw_unzip\n",
    "unzipped_files"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "forward-locator",
   "metadata": {},
   "source": [
    "## Step 2: Preprocess data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "incorporate-worry",
   "metadata": {},
   "outputs": [],
   "source": [
    "data_files = [x for x in unzipped_files if '-data-' in x]\n",
    "data_files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "caring-welsh",
   "metadata": {},
   "outputs": [],
   "source": [
    "# read\n",
    "df_1 = pd.read_csv(f'{URL_DIR}/raw_unzip/{data_files[0]}', delimiter=';')\n",
    "df_2 = pd.read_csv(f'{URL_DIR}/raw_unzip/{data_files[1]}', delimiter=';')\n",
    "df_c = pd.concat([df_1 ,df_2], axis=0)\n",
    "\n",
    "turbines = df_c['Wind_turbine_name'].unique()\n",
    "print(f'List of turbines in the data: {turbines}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "pleased-psychiatry",
   "metadata": {},
   "outputs": [],
   "source": [
    "for turbine in turbines:\n",
    "    df_raw = df_c.loc[df_c['Wind_turbine_name'] == turbine,:]\n",
    "    df_clean = clean_up_data(df_raw)\n",
    "    df_description = pd.read_csv(f'{URL_DIR}/raw_unzip/data_description.csv', delimiter=';')\n",
    "    df_ord = order_columns(df_clean, df_description)\n",
    "    df_ord = df_ord.loc[:,~df_ord.columns.duplicated()]\n",
    "    df_ord.to_csv(f's3://{BUCKET}/{PREFIX}/{turbine}/telemetry.csv')\n",
    "    break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "proprietary-services",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "conda_python3",
   "language": "python",
   "name": "conda_python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}