Copyright 2021 Amazon.com and its affiliates; all rights reserved. This file is AWS Content and may not be duplicated or distributed without permission

# Hello World, Amazon SageMaker Feature Store
This notebook demonstrates how easy it is to use SageMaker Feature Store. It relies on a small set of utility functions that wrap the Feature Store API, keeping things simple for a data scientist working in Python.

### A few imports

# %pip install sagemaker --upgrade
%pip install sagemaker --upgrade
%pip install pandas --upgrade
%pip uninstall -y boto3
%pip uninstall -y botocore
%pip uninstall -y aiobotocore
%pip install boto3 --upgrade
%pip install botocore --upgrade
%pip install s3fs

In [None]:
# Helper class that wraps the SageMaker Feature Store API for this demo.
from utilities.feature_store_helper import FeatureStore

# Import the display helpers from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import display, HTML, Markdown
import pandas as pd
import time
import json
import os
from sklearn.ensemble import RandomForestClassifier

# Name of the demo feature group used by the cells that follow.
FG_NAME = 'fs-demo-2022-03-24'

# Single helper instance shared by every cell in this notebook.
fs = FeatureStore()

In [None]:
# Sample claims features, one row per customer Id.
claims_features = [
    [0, 8019, 0, 0, 1, 4000],
    [1, 0, 0, 0, 0, 8000],
    [2, 540, 1, 2, 10, 2000],
    [3, 955, 0, 1, 3, 3500],
    [4, 1200, 0, 0, 5, 10544],
    [5, 600, 0, 1, 2, 7843],
]
c_df = pd.DataFrame(
    claims_features,
    columns=['Id', 'avg_claim_amount', 'num_claims_last_7d', 'num_claims_last_1y',
             'num_claims_lifetime', 'total_premiums'],
)
# Every row carries the same event timestamp.
c_df['update_time'] = '2020-02-01T00:00:00Z'

# A bare expression is only rendered when it is the last statement of a cell,
# so show the preview explicitly.
display(c_df.head())

# Create the 'claims' feature group from the dataframe, with 'Id' as the
# record identifier and 'update_time' as the event time.
fs.create_fg_from_df('claims', c_df, id_name='Id', event_time_name='update_time')



In [None]:
# Ingest the rows of c_df into the 'claims' feature group.
fs.ingest_from_df('claims', c_df)

In [None]:
# Sample payment features, one row per customer Id.
payment_features = [
    [0, 1000, 0],
    [1, 1100, 5],
    [2, 1200, 2],
    [3, 1300, 3],
    [4, 1400, 4],
    [5, 1500, 5],
    [6, 1600, 6],
]
p_df = pd.DataFrame(payment_features, columns=['Id', 'avg_amount', 'avg_days_late'])
# Every row carries the same event timestamp.
p_df['update_time'] = '2020-02-01T00:00:00Z'

# A bare expression is only rendered when it is the last statement of a cell,
# so show the preview explicitly.
display(p_df.head())

# Create the 'payments' feature group from the dataframe, with 'Id' as the
# record identifier and 'update_time' as the event time.
fs.create_fg_from_df('payments', p_df, id_name='Id', event_time_name='update_time')


In [None]:
# Ingest the rows of p_df into the 'payments' feature group.
fs.ingest_from_df('payments', p_df)

### Load sample customer data

In [None]:
# Read the sample customer records shipped in the utilities folder.
df = pd.read_csv('utilities/customers.csv')

# Remember how many rows the file contained.
ORIGINAL_RECORD_COUNT = len(df)

# Preview the first few rows.
df.head()

### Create a new feature group, with schema inferred directly from my dataframe

In [None]:
# Metadata tags to attach to the new feature group.
tags = dict(
    Environment='DEV',
    CostCenter='C20',
    Maintainer='John Smith',
    DocURL='https://www.google.com',
)

# Create the feature group from the customer dataframe, with 'Id' as the
# record identifier.
fs.create_fg_from_df(FG_NAME, df, tags=tags, id_name='Id')

### Ingest features from my dataframe into my new feature group

In [None]:
# Ingest the customer dataframe into the feature group named by FG_NAME.
fs.ingest_from_df(FG_NAME, df)

### Show that we can look up the latest feature values
Notice that the feature values come back in the proper datatype, as defined in the feature definitions.

In [None]:
# Look up the latest value of the ZipCode feature for record id 4.
fs.get_latest_feature_values(FG_NAME, [4], features=['ZipCode'])

#### We can request selected features or, as in this case, all features

In [None]:
# No feature list is given, so all features are returned for record ids 4, 2 and 6.
fs.get_latest_feature_values(FG_NAME, [4,2,6])

## Show that we can get the history of feature values
The offline store is append-only. New records are added.

#### Now, ingest some new data with later event timestamps
We'll put in two new sets of records each with the event timestamp advanced one day, and the zipcode changed. We should end up with three total sets of records:

1. Original, event timestamp Feb 1, zip code 11111
2. New set, with event timestamp Feb 2, zip code 22222
3. Final set, with event timestamp Feb 3, zip code 33333

In [None]:
# Second version of every record: event time Feb 2, zip code 22222.
df = df.assign(UpdateTime='2020-02-02T00:00:00Z', ZipCode=22222)
fs.ingest_from_df(FG_NAME, df)

# Third version of every record: event time Feb 3, zip code 33333.
df = df.assign(UpdateTime='2020-02-03T00:00:00Z', ZipCode=33333)
fs.ingest_from_df(FG_NAME, df)

#### Look up the full history for a few IDs
It takes a few minutes (up to 15) for the data to be available in the offline store. We'll wait until we see 3 copies of each of the records since we've ingested each record 3 times so far.

In [None]:
ids = [5, 6]
features = ['*']

# Each record has been ingested three times, so wait until the offline store
# returns three rows per requested id.
expected_rows = 3 * len(ids)

mins = 0
while True:
    hist_df = fs.get_historical_offline_feature_values(
        FG_NAME, record_ids=ids, feature_names=features, verbose=False
    )
    rec_count = hist_df.shape[0]
    if rec_count >= expected_rows:
        break

    # Announce the wait once, then poll again after a minute.
    if mins == 0:
        print('Waiting for offline store data...')
    time.sleep(60)
    mins += 1

print(f'\nData is available, {rec_count} records. Waited {mins} minutes\n')
hist_df.sort_values(by=['id', 'zipcode']).head(30)

#### Now show the latest offline features

In [None]:
# Fetch the latest offline feature values for record ids 5 and 6.
fs.get_latest_offline_feature_values(FG_NAME, record_ids=[5,6])

#### Browse the set of offline store files in the S3 console

In [None]:
# Ask the helper for the S3 console URL of this feature group's offline store.
s3_console_url = fs.get_offline_store_url(FG_NAME)

# Render the URL as a clickable Markdown link.
offline_store_md = f'Review offline store partitioned data files here: [{s3_console_url}]({s3_console_url})'
display(Markdown(offline_store_md))

#### See the Glue table that can be used for Athena queries

In [None]:
# Ask the helper for the console URL of this feature group's Glue table.
glue_console_url = fs.get_glue_table_url(FG_NAME)

# Render the URL as a clickable Markdown link.
glue_table_md = f'To see the Glue table that was created for you, go here: [{glue_console_url}]({glue_console_url})'
display(Markdown(glue_table_md))

#### Now let's see what the online store thinks are the latest values

In [None]:
# Read the latest values held in the online store for record ids 4, 2 and 6.
fs.get_latest_feature_values(FG_NAME, [4,2,6])

## Train a simple model with features extracted from the feature store
For our example, the dataset we want to train on will have the latest values for specific features for each record id.

In [None]:
# Pull the latest offline values of ZipCode and Churn to use as the training dataset.
# NOTE(review): no record_ids are passed — presumably this returns every record; confirm in the helper.
full_df = fs.get_latest_offline_feature_values(FG_NAME, feature_names=['ZipCode','Churn'])
# Show the resulting dataset.
full_df

In [None]:
# Train on the first 70% of the rows and hold out the rest for testing.
train_rec_count = int(full_df.shape[0] * 0.70)

train_df = full_df.iloc[:train_rec_count]
# Slice to the end of the frame rather than to a fixed row index, so the
# hold-out set contains every remaining row whatever the record count is.
test_df = full_df.iloc[train_rec_count:]

### Train the model and make predictions on the test set

In [None]:
# Shallow random forest with a fixed seed so results are repeatable.
clf = RandomForestClassifier(random_state=0, max_depth=2)

# ZipCode is the only input feature; Churn is the label, passed as a 1-D array.
train_X = train_df[['ZipCode']]
train_y = train_df['Churn'].to_numpy()
clf.fit(train_X, train_y)

# Predict churn for the held-out rows.
clf.predict(test_df[['ZipCode']])

### Make predictions using features from the online store

In [None]:
# Record id of the customer to score in the next cell.
customer_id = 1

In [None]:
# Fetch this customer's latest ZipCode from the online store.
latest_records = fs.get_latest_feature_values(FG_NAME, [customer_id], features=['ZipCode'])
customer_features = latest_records[0]['ZipCode']

# Score a one-row dataframe that uses the same column name the model was
# fitted on, so scikit-learn does not warn about missing feature names.
preds = clf.predict(pd.DataFrame({'ZipCode': [customer_features]}))

# A predicted label of 1 means the customer is expected to churn.
churn_pred = 'will' if preds[0] == 1 else 'will NOT'
print(f'Customer {customer_id} {churn_pred} churn.')

## Explore feature groups and metadata

### Find feature groups

In [None]:
# List feature groups via the helper.
# NOTE(review): 'demo' is presumably a name filter — confirm in the helper.
fs.list_feature_groups('demo')

### Describe a feature group

In [None]:
# Fetch the feature group description from the helper.
descr = fs.describe_feature_group(FG_NAME)

# Pretty-print it as JSON; values JSON cannot encode natively are rendered with str().
descr_json = json.dumps(descr, default=str, sort_keys=True, indent=4)
print(descr_json)

### Get feature group tags

In [None]:
# Read the feature group's tags and pick out the documentation link.
fg_tags = fs.get_tags(FG_NAME)
doc_url = fg_tags['DocURL']

# Render the link as Markdown.
display(Markdown(f'Docs for feature group "**{FG_NAME}**" is [here]({doc_url})'))