# Data preparation
---
In this notebook, we're going to:
1. Fetch open source data from https://opendata-renewables.engie.com/ and 
2. Preprocess it for further analysis in the subsequent notebooks.

The open source data is made available by the Engie group and is distributed under the Open License version 2.0 published by Etalab.



## Step 1: Fetch Data

In [None]:
# Settings for the web-scraping steps: the source site and the local
# folders (relative to the notebook) where downloads are stored
URL = 'https://opendata-renewables.engie.com'
LOCAL_DATA = 'data'
URL_DIR = '/'.join([LOCAL_DATA, 'web_scrape'])

# Upload destination: create an Amazon S3 bucket first, then put its name in BUCKET
BUCKET = ''
PREFIX = 'wind-turbine/training_data'

In [None]:
from bs4 import BeautifulSoup
import zipfile
import pandas as pd

from utils import *

In [None]:
# create a wroking directory for web scraping operations
!mkdir $LOCAL_DATA
!mkdir $URL_DIR

# download index.html
!wget --output-document=$URL_DIR/index.html $URL

# scrape with BS
with open(f'{URL_DIR}/index.html') as fp:
 soup = BeautifulSoup(fp, "html.parser")

# get url paths to the raw zipped files
data_urls = [URL + x['href'].\
 replace('explore','media'). \
 replace('dataset','datasets'). \
 replace('/information','.zip') \
 for x in list(set([x for x in soup.find_all('a', href=True) if 'dataset' in x['href']]))]

# store url paths as a text
with open(f'{URL_DIR}/url_list.txt', 'w') as f:
 for item in data_urls:
 f.write("%s\n" % item)

# download raw zipped data files
!wget --input $URL_DIR/url_list.txt --directory-prefix=$URL_DIR/raw_zip

In [None]:
zip_files = !ls $URL_DIR/raw_zip
zip_files

In [None]:
# extract every downloaded archive into a single shared folder
unzip_dir = f'{URL_DIR}/raw_unzip'
for archive_name in zip_files:
    archive_path = f'{URL_DIR}/raw_zip/{archive_name}'
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(unzip_dir)

In [None]:
# print unzipped files names
unzipped_files = !ls $URL_DIR/raw_unzip
unzipped_files

## Step 2: Preprocess data

In [None]:
# keep only the extracted files whose name contains '-data-'
data_files = list(filter(lambda name: '-data-' in name, unzipped_files))
data_files

In [None]:
# read every file listed in data_files (';'-separated CSV) instead of only the
# first two, so the cell works for any number of data files
frames = [
    pd.read_csv(f'{URL_DIR}/raw_unzip/{name}', delimiter=';')
    for name in data_files
]

# stack the rows of all files; each file keeps its own row index,
# so index values are not unique in the combined frame
df_c = pd.concat(frames, axis=0)

# distinct turbine names, in order of first appearance
turbines = df_c['Wind_turbine_name'].unique()
print(f'List of turbines in the data: {turbines}')

In [None]:
# fail early with a clear message: an empty BUCKET would produce the
# malformed destination 's3:///...' only after all the processing is done
if not BUCKET:
    raise ValueError('BUCKET is empty: set it to the name of an existing Amazon S3 bucket')

# the description file is the same for every turbine, so read it once
df_description = pd.read_csv(f'{URL_DIR}/raw_unzip/data_description.csv', delimiter=';')

for turbine in turbines:
    # rows that belong to the current turbine
    df_raw = df_c.loc[df_c['Wind_turbine_name'] == turbine, :]

    # clean_up_data / order_columns are not defined in this notebook
    # (presumably provided by `from utils import *` -- confirm)
    df_clean = clean_up_data(df_raw)
    df_ord = order_columns(df_clean, df_description)

    # drop columns whose name repeats, keeping the first occurrence
    df_ord = df_ord.loc[:, ~df_ord.columns.duplicated()]

    # write one CSV per turbine under s3://BUCKET/PREFIX/<turbine>/
    df_ord.to_csv(f's3://{BUCKET}/{PREFIX}/{turbine}/telemetry.csv')

    # NOTE(review): this unconditional break stops after the first turbine,
    # so only one telemetry.csv is uploaded -- remove it to export all
    # turbines, or confirm that a single-turbine export is intended
    break