Copyright 2021 Amazon.com and its affiliates; all rights reserved. This file is AWS Content and may not be duplicated or distributed without permission

# Try the new Feature Store helper class

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from sm.feature_store_helper import FeatureStore
fs = FeatureStore()

In [None]:
help(fs)

In [None]:
# Call get_historical_offline_feature_values on 'customers-summit' for record
# ids 5 and 6, requesting only the 'ZipCode' feature.
ids = [5,6]
features = ['ZipCode']  # alternative: ['*'] (presumably "all features" -- confirm against the helper)
hist_df = fs.get_historical_offline_feature_values('customers-summit', record_ids=ids, feature_names=features,
                                                         verbose=False)
hist_df  # last expression of the cell, so the notebook renders it

In [None]:
ids = [5,6]
features = ['*'] 
latest_df = fs.get_latest_offline_feature_values('customers-summit', record_ids=ids, feature_names=features,
                                                         verbose=False)
latest_df

In [None]:
fs.get_latest_offline_feature_values_as_of('customers-summit', '2020-02-02T00:00:00Z')

In [None]:
fs.list_feature_groups()

In [None]:
fs.list_feature_groups(name_contains='recsys')

In [None]:
fs.describe_feature_group('customers-10k-demo')

In [None]:
# NOTE(review): this calls an underscore-prefixed (non-public by convention)
# method directly, and no delete of 'customers-10k-demo' is visible earlier in
# this notebook -- confirm the cell is still needed.
fs._wait_for_feature_group_deletion_complete('customers-10k-demo')

In [None]:
fs.delete_feature_group('customers-summit-sql-sql-v2')

In [None]:
import sagemaker
# Build the S3 location under the default bucket of a fresh SageMaker session.
default_bucket = sagemaker.Session().default_bucket()
data_source = f's3://{default_bucket}/sagemaker-feature-store/hello-data/'

# Schedule a feature pipeline for 'customers-summit', passing that location as
# the first argument (presumably the pipeline's input data -- confirm in the helper).
fs.schedule_feature_pipeline(data_source, 'customers-summit')

In [None]:
fs.update_feature_pipeline(data_source, 'customers-summit', instance_type='ml.m5.large')

In [None]:
fs.remove_feature_pipeline('customers-summit')

In [None]:
# pandas is imported here so the cell also works on a fresh top-to-bottom run;
# previously `pd` was only imported by cells further down the notebook.
import pandas as pd

from sm.feature_store_helper import FeatureStore
fs = FeatureStore()

# Load the local CSV and remember how many rows it had.
df = pd.read_csv('./customers.csv')
ORIGINAL_RECORD_COUNT = df.shape[0]
df.head()  # not rendered: only the last expression of a cell is displayed

# Tags passed to the new feature group.
tags = {'Environment': 'DEV',
        'CostCenter': 'C20',
        'Maintainer': 'John Smith',
        'DocURL': 'https://www.google.com'}
# Create feature group 'tmp-fg' from the DataFrame, using column 'Id' as id_name.
fs.create_fg_from_df('tmp-fg', df, 'this is my new fg', tags=tags, id_name='Id')

In [None]:
from sm.feature_store_helper import FeatureStore
fs = FeatureStore()

fs.ingest_from_df('tmp-fg', df)

In [None]:
fs.get_latest_feature_values('tmp-fg', [4], features=['ZipCode'])

In [None]:
fs.get_latest_feature_values('tmp-fg', [4])

In [None]:
fs.get_latest_feature_values('tmp-fg', [4,2,6])

In [None]:
fs.delete_record('tmp-fg', 6, '2020-02-01T00:00:00Z')

In [None]:
from sm.feature_store_helper import FeatureStore
fs = FeatureStore()
# Request a feature set for the identifier {'Id': 2}; the feature is named as
# 'tmp-fg:ZipCode' (looks like '<feature group>:<feature>' -- confirm in the helper).
fs_dict = fs.get_latest_featureset_values({'Id': 2},
                                             ['tmp-fg:ZipCode'])

# The result is used as a mapping below (it is indexed via .values()).
print(f'Feature set as dictionary: {fs_dict}')

# Same result flattened to a plain list of its values.
print(f'Feature set as vector: {list(fs_dict.values())}')

In [None]:
fs.describe_feature_group('tmp-fg')

In [None]:
fs.delete_feature_group('tmp-fg')

In [None]:
fs.create_fg_from_df('tmp-fg-light', df, id_name='Id', event_time_name='UpdateTime')

In [None]:
fs.describe_feature_group('tmp-fg-light')

In [None]:
fs.delete_feature_group('tmp-fg-light')

In [None]:
from sm.feature_store_helper import FeatureStore
fs = FeatureStore()

In [None]:
fs.describe_feature_group('tmp-fg')

In [None]:
fs.get_tags('tmp-fg')

In [None]:
fg_name = 'housing'

In [None]:
from sm.feature_store_helper import FeatureStore
fs = FeatureStore()

In [None]:
# Imported here so the cell does not depend on a later cell having run first.
from datetime import datetime, timezone

# Timezone-aware current time in UTC. A naive local reading would be silently
# mislabeled if it were later tagged with a UTC tzinfo.
right_now = datetime.now(timezone.utc)

In [None]:
import pytz

# Creation timestamp from the feature group description. The subtraction below
# requires it to be timezone-aware -- TODO confirm for the helper's return value.
created_at = fs.describe_feature_group(fg_name)['CreationTime']
# astimezone() converts the instant to UTC (a naive value is interpreted as
# local time), whereas replace(tzinfo=...) would only relabel the clock reading
# and skew the result by the local UTC offset.
rn_2 = right_now.astimezone(pytz.UTC)
created_n_days = (rn_2 - created_at).days
print(f'"{fg_name}" was created {created_n_days} days ago ("{created_at}")')

In [None]:
fg_name = 'housing'

In [None]:
fs.sample(fg_name, sample_pct=5)

In [None]:
count = fs.get_historical_record_count(fg_name)
print(f'Found {count:,d} total records in offline store for "{fg_name}"')

In [None]:
import sys
import os

# Make the 'ml-lineage-helper' directory under the current working directory
# importable by appending it to sys.path.
notebook_dir = os.getcwd()
package_dir = notebook_dir + '/ml-lineage-helper'
sys.path.append(package_dir)

# NOTE(review): wildcard imports hide where names come from. QueryLineage and
# MLLineageHelper, used in other cells, presumably come from these two modules
# -- confirm and consider importing them explicitly.
from ml_lineage_helper import *
from ml_lineage_helper.query_lineage import *

In [None]:
# Feature group whose linked models are listed below. The earlier candidate
# 'tmp-fg' was overwritten immediately and never used, so it is dropped.
fg_name = 'fscw-orders-08-10-17-21-52'

def get_models_list(fg_name, max_models=4):
    """Return the names of models linked to a feature group (best effort).

    Uses the notebook-level ``fs`` helper to resolve the feature group ARN and
    ``QueryLineage`` to fetch the associated models, then reads the
    'SageMaker Model Name' column of the result.

    Args:
        fg_name: Name of the feature group to look up.
        max_models: Maximum number of model names to return (default 4, the
            previous hard-coded limit). ``None`` returns all of them.

    Returns:
        A plain ``list`` of model names; empty when the lineage query returns
        ``None`` or when the lookup fails for any reason.
    """
    import logging

    try:
        fg_arn = fs.describe_feature_group(fg_name)['FeatureGroupArn']
        models_df = QueryLineage().get_models_from_feature_group(fg_arn)
        if models_df is None:
            return []
        # tolist() keeps the return type a list on every path (it used to be a
        # NumPy array here and a list elsewhere).
        return models_df['SageMaker Model Name'].values[:max_models].tolist()
    except Exception as err:
        # The lookup is deliberately best-effort: report the problem instead of
        # hiding it, but still fall back to "no models". Unlike a bare
        # ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
        logging.getLogger(__name__).warning(
            'Could not look up models for feature group "%s": %s', fg_name, err)
        return []

models_string = ', '.join(get_models_list(fg_name))
print(f'Models: {models_string}')

In [None]:
fs.get_minmax_timestamps('housing')


In [None]:
# Feature group used by the cells below. The earlier candidates ('tmp-fg',
# 'fscw-orders-08-10-17-21-52') were overwritten immediately and never used.
fg_name = 'customers-summit'

In [None]:
import pytz
from datetime import datetime
from IPython.core.display import display, HTML, Markdown
import pandas as pd

def _fg_store_mode(online, offline):
    """Return the display label for a combination of online/offline store flags."""
    if online and offline:
        return 'Online and offline'
    if online:
        return 'Online-only'
    if offline:
        return 'Offline-only'
    return 'No active store'


def _fg_as_utc(value):
    """Return ``value`` (ISO-8601 string or datetime) as a timezone-aware UTC datetime."""
    from datetime import timezone

    moment = datetime.fromisoformat(value) if isinstance(value, str) else value
    if moment.tzinfo is None:
        # NOTE(review): naive timestamps are assumed to already be in UTC -- confirm
        # against what get_minmax_timestamps returns.
        return moment.replace(tzinfo=timezone.utc)
    return moment.astimezone(timezone.utc)


def fg_profile_view(fs, fg_name):
    """Display a summary profile of a feature group in the notebook.

    The summary shows name, description, store mode, tags and the models
    returned by the notebook-level ``get_models_list``. When the offline store
    status is 'Active' it additionally shows the record count, creation and
    write-time ages, the event time range, console links, and a sample of
    records.

    Args:
        fs: Feature store helper providing describe_feature_group, get_tags,
            get_offline_store_url, get_glue_table_url,
            get_historical_record_count, get_minmax_timestamps and sample.
        fg_name: Name of the feature group to profile.

    Returns:
        None. Output is rendered through ``display``.
    """
    from datetime import timezone

    fg_desc = fs.describe_feature_group(fg_name)
    description = fg_desc.get('Description', '')
    # Either store section can be missing from the description; use .get so a
    # missing section reads as "store not enabled" instead of raising KeyError.
    online = bool(fg_desc.get('OnlineStoreConfig', {}).get('EnableOnlineStore', False))
    offline = fg_desc.get('OfflineStoreStatus', {}).get('Status') == 'Active'
    # Always bound, including the case where neither store is active.
    mode_string = _fg_store_mode(online, offline)

    tags_dict = fs.get_tags(fg_name)

    models_list = get_models_list(fg_name)
    models_string = ', '.join(models_list)
    models_count = len(models_list)

    summary_parts = [
        f'<u>Name:</u> {fg_name}<br />',
        f'<u>Description:</u> {description}<br />',
        f'<u>Mode:</u> {mode_string}<br />',
    ]

    if not tags_dict:
        summary_parts.append('<u>Tags:</u> None<br />')
    else:
        summary_parts.append(f'<u>Tags:</u> {tags_dict}<br />')

    if models_count == 0:
        summary_parts.append('<u>Models using this feature group:</u> None<br />')
    else:
        summary_parts.append(
            f'<u>Models using this feature group:</u> {models_count}: {models_string}<br />')

    if offline:
        offline_store_url = fs.get_offline_store_url(fg_name)
        glue_console_url = fs.get_glue_table_url(fg_name)
        athena_url = 'https://console.aws.amazon.com/athena/query-editor'
        count = fs.get_historical_record_count(fg_name)

        # One aware UTC "now" for every age calculation; relabeling a naive
        # local time as UTC would skew the ages by the local UTC offset.
        right_now = datetime.now(timezone.utc)

        times_row = fs.get_minmax_timestamps(fg_name).iloc[0]

        most_recent_write = times_row['max_write_time']
        last_n_days = (right_now - _fg_as_utc(most_recent_write)).days

        oldest_write_time = times_row['min_write_time']
        oldest_n_days = (right_now - _fg_as_utc(oldest_write_time)).days

        max_event_time = times_row['max_event_time']
        min_event_time = times_row['min_event_time']

        # Reuse the description fetched above instead of a second service call.
        created_at = fg_desc['CreationTime']
        created_n_days = (right_now - _fg_as_utc(created_at)).days

        summary_parts += [
            f'<u>Total records:</u> {count:,d}<br />',
            f'<u>Created:</u> {created_n_days} days ago ({created_at})<br />',
            f'<u>Oldest record:</u> {oldest_n_days} days ago ({oldest_write_time})<br />',
            f'<u>Most recent record:</u> {last_n_days} days ago ({most_recent_write})<br />',
            f'<u>Event time range:</u> {min_event_time}  -> to ->  {max_event_time}<br />',
            f'<u>Offline store in s3 console:</u> [here]({offline_store_url})<br />',
            f'<u>Glue table in console:</u> [here]({glue_console_url})<br />',
            f'<u>Athena query editor:</u> [here]({athena_url})<br />',
        ]

    display(Markdown(''.join(summary_parts)))

    if offline:
        # Sampling is shown only when the offline store is active, matching the
        # "offline store records" heading.
        display(Markdown('<u>Sample offline store records:</u><br />'))
        sample_df = fs.sample(fg_name, 5)
        display(sample_df.head())

In [None]:
fg_profile_view(fs, 'customers-summit')

In [None]:
fg_profile_view(fs, 'housing')

In [None]:
tmp_filename = fs.download_sample_offline_file('customers-summit')

In [None]:
p_df = pd.read_parquet(tmp_filename)
p_df

In [None]:
fs.get_tags('customers-summit')

In [None]:
from sm.feature_store_helper import FeatureStore
fs = FeatureStore()

tmp_df = fs.get_minmax_timestamps(fg_name)
tmp_df

In [None]:
tmp_df.iloc[0]['max_event_time']

In [None]:
fg_name = 'fscw-orders-08-10-17-21-52'

In [None]:
ml_lineage = MLLineageHelper()
lineage = ml_lineage.create_ml_lineage('pytorch-hosted-model-2021-10-09-13-32-06-083', 
                                       model_name='house-price-estimate',
                                       feature_group_names=[fg_name])
lineage

In [None]:
print('Here are a few sample records:')
fs.sample('housing', 5)

In [None]:
import pandas as pd
from sm.feature_store_helper import FeatureStore
fs = FeatureStore()

In [None]:
# One row per lookup event, in the column order given to the DataFrame below:
# my_event_time, Id, HOUSE_ID.
multi_id_events = [
    ['2020-02-01T08:30:00Z', 6, 450],
    ['2020-02-02T10:15:30Z', 5, 5000],
    ['2020-02-03T13:20:59Z', 1, 1999],
    ['2021-01-01T00:00:00Z', 1, 2001],
]
multi_id_df = pd.DataFrame(
    data=multi_id_events,
    columns=['my_event_time', 'Id', 'HOUSE_ID'],
)
multi_id_df.head()

In [None]:
%%time
fs.get_features(multi_id_df, 'my_event_time', 
                   features=['customers:ZipCode', 
                             'payments:avg_amount', 
                             'payments:avg_days_late',
                             'housing:SQUARE_FEET',
                             'housing:PRICE'],
               parallel=True)

In [None]:
%%time
fs.get_features(multi_id_df, 'my_event_time', 
                   features=['customers:ZipCode', 
                             'payments:avg_amount', 
                             'payments:avg_days_late',
                             'housing:SQUARE_FEET',
                             'housing:PRICE'],
               parallel=False)

In [None]:
%%time
fs.get_features(multi_id_df, 'my_event_time', 
                   features=['customers:ZipCode', 
                             'payments:avg_amount', 
                             'payments:avg_days_late',
                             'housing:SQUARE_FEET',
                             'housing:PRICE'],
               parallel=False,
               verbose=True)

In [None]:
%%time
fs.get_features(multi_id_df, 'my_event_time', 
                   features=['customers:ZipCode', 
                             'payments:avg_amount', 
                             'payments:avg_days_late',
                             'housing:SQUARE_FEET',
                             'housing:PRICE'],
               parallel=False) #, verbose=True)

In [None]:
%%time
fs.get_features(multi_id_df, 'my_event_time', 
                   features=['customers:ZipCode', 
                             'payments:avg_amount', 
                             'payments:avg_days_late',
                             'housing:SQUARE_FEET',
                             'housing:PRICE'],
               parallel=False) #, verbose=True)

In [None]:
%%time
fs.get_features(multi_id_df, 'my_event_time', 
                   features=['customers:ZipCode', 
                             'payments:avg_amount', 
                             'payments:avg_days_late',
                             'housing:SQUARE_FEET',
                             'housing:PRICE'],
               parallel=True) #, verbose=True)

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
from sm.feature_store_helper import FeatureStore
fs = FeatureStore()

# Four events one minute apart (10:01 .. 10:04), each with the matching
# customer / order / product identifiers C<i>, O<i>, P<i>.
extended_order_events = [
    [f'2021-07-07T10:0{minute}:00Z', f'C{minute}', f'O{minute}', f'P{minute}']
    for minute in range(1, 5)
]
order_columns = ['my_event_time', 'customer_id', 'order_id', 'product_id']
extended_orders_df = pd.DataFrame(data=extended_order_events, columns=order_columns)
fs.get_features(extended_orders_df, 'my_event_time', 
                   features=['fscw-orders-08-10-17-21-52:*'],
                  verbose=True,
                  parallel=False)

In [None]:
full_df = fs.get_latest_offline_feature_values('tmp-fg', feature_names=['ZipCode','Churn'])
full_df

In [None]:
full_df = fs.get_latest_offline_feature_values_as_of('customers-summit', '2020-02-03T08:30:00Z', feature_names=['ZipCode','Churn'])
full_df