In [259]:
## This script is used to read and preprocess clinical data (in tabular format) from S3 and store features in SageMaker FeatureStore

In [260]:
import boto3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import io, os
from time import gmtime, strftime, sleep
import time
import sagemaker
from sagemaker.session import Session
from sagemaker import get_execution_role
from sagemaker.feature_store.feature_group import FeatureGroup

## Set up SageMaker FeatureStore

In [261]:
region = boto3.Session().region_name

boto_session = boto3.Session(region_name=region)
sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)
featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)

feature_store_session = Session(
 boto_session=boto_session,
 sagemaker_client=sagemaker_client,
 sagemaker_featurestore_runtime_client=featurestore_runtime
)

role = get_execution_role()
s3_client = boto3.client('s3', region_name=region)

default_s3_bucket_name = feature_store_session.default_bucket()
prefix = 'sagemaker-featurestore-demo'

## Get data from S3

In [262]:
# Get data from S3 
bucket_clin = 'nsclc-clinical-genomic-data'
#bucket_clin = 

# Clinical data 
#data_key_clin = 'Clinical-data-119patients.csv'
data_key_clin = 'NSCLCR01Radiogenomic_DATA_LABELS_2018-05-22_1500-shifted.csv'
#data_key_clin = 

data_location_clin = 's3://{}/{}'.format(bucket_clin, data_key_clin)
data_clinical = pd.read_csv(data_location_clin)

## Preprocess Data

In [264]:
# Keep samples starting with "R01-*" as these IDs have corresponding medical imaging data. Delete samples with Case IDs "AMC-*". 
data_clinical = data_clinical[~data_clinical["Case ID"].str.contains("AMC")]

# Delete columns with ID and dates
list_delete_cols = ['Quit Smoking Year', 'Date of Recurrence', 'Date of Last Known Alive', 'Date of Death', 'CT Date', 'PET Date']
data_clinical.drop(list_delete_cols, axis=1, inplace=True)

# List of features with catergorical value
list_encode_cols = ["Patient affiliation", "Gender", "Ethnicity", "Smoking status", "%GG", "Tumor Location (choice=RUL)", "Tumor Location (choice=RML)", "Tumor Location (choice=RLL)", "Tumor Location (choice=LUL)", "Tumor Location (choice=LLL)", "Tumor Location (choice=L Lingula)", "Tumor Location (choice=Unknown)", "Histology ", "Pathological T stage", "Pathological N stage", "Pathological M stage", "Histopathological Grade", "Lymphovascular invasion", "Pleural invasion (elastic, visceral, or parietal)", "EGFR mutation status", "KRAS mutation status", "ALK translocation status", "Adjuvant Treatment", "Chemotherapy", "Radiation", "Recurrence", "Recurrence Location"]

# List of features with numeric value
list_nonenc_cols = ["Case ID", "Age at Histological Diagnosis", "Weight (lbs)", "Pack Years", "Time to Death (days)", "Days between CT and surgery", "Survival Status"]

# One-hot encoding of features with categorical value
data_clinical_enc = pd.get_dummies(data_clinical[list_encode_cols])

data_clinical_nonenc = data_clinical[list_nonenc_cols]

# Combine all features
data_clin = pd.concat([data_clinical_enc, data_clinical_nonenc], axis=1)

# Feature names inside FeatureStore should not have special chars and should be < 64 chars long
# Update feature names accordingly

l_char = ['-',' ','%','/','<','>','(',')','=',',',':']

for col in (data_clin.columns):

 if (col == "Case ID"):
 data_clin.rename(columns={col: col.replace(' ','_')}, inplace = True)
 continue

 for char in l_char:
 if char in col:
 data_clin.rename(columns={col: col.replace(char,'')}, inplace = True)
 col = col.replace(char,'')
 
 if (len(col)>=64):
 data_clin.rename(columns={col: col[:60]}, inplace = True)
 
# Change label (survival status) "Dead"=1 and "Alive"=0 
data_clin["SurvivalStatus"].replace({"Dead": "1", "Alive": "0"}, inplace=True)


# Drop samples with missing values. 
# Fill NaN with 0. For eg. PackYears for non-smokers is "NA". Change it to 0.
data_clin = data_clin[data_clin['Weightlbs'] != "Not Collected"]
data_clin = data_clin[data_clin['PackYears'] != "Not Collected"]
data_clin.fillna(0)

(147, 89)

## Ingest data into FeatureStore

In [266]:
clinical_feature_group_name = 'clinical-feature-group-' + strftime('%d-%H-%M-%S', gmtime())
clinical_feature_group = FeatureGroup(name=clinical_feature_group_name, sagemaker_session=feature_store_session)

current_time_sec = int(round(time.time()))

def cast_object_to_string(data_frame):
 for label in data_frame.columns:
 print (label)
 if data_frame.dtypes[label] == 'object':
 data_frame[label] = data_frame[label].astype("str").astype("string")

# Cast object dtype to string. SageMaker FeatureStore Python SDK will then map the string dtype to String feature type.
cast_object_to_string(data_clin)

# Record identifier and event time feature names
record_identifier_feature_name = "Case_ID"
event_time_feature_name = "EventTime"

# Append EventTime feature
data_clin[event_time_feature_name] = pd.Series([current_time_sec]*len(data_clin), dtype="float64")

## If event time generates NaN
data_clin[event_time_feature_name] = data_clin[event_time_feature_name].fillna(0)

# Load feature definitions to the feature group. SageMaker FeatureStore Python SDK will auto-detect the data schema based on input data.
clinical_feature_group.load_feature_definitions(data_frame=data_clin); # output is suppressed


def wait_for_feature_group_creation_complete(feature_group):
 status = feature_group.describe().get("FeatureGroupStatus")
 while status == "Creating":
 print("Waiting for Feature Group Creation")
 time.sleep(5)
 status = feature_group.describe().get("FeatureGroupStatus")
 if status != "Created":
 raise RuntimeError(f"Failed to create feature group {feature_group.name}")
 print(f"FeatureGroup {feature_group.name} successfully created.")

clinical_feature_group.create(
 s3_uri=f"s3://{default_s3_bucket_name}/{prefix}",
 record_identifier_name=record_identifier_feature_name,
 event_time_feature_name=event_time_feature_name,
 role_arn=role,
 enable_online_store=True
)

wait_for_feature_group_creation_complete(feature_group=clinical_feature_group)

clinical_feature_group.ingest(
 data_frame=data_clin, max_workers=3, wait=True
)

Patientaffiliation_Stanford
Patientaffiliation_VA
Gender_Female
Gender_Male
Ethnicity_AfricanAmerican
Ethnicity_Asian
Ethnicity_Caucasian
Ethnicity_HispanicLatino
Ethnicity_NativeHawaiianPacificIslander
Smokingstatus_Current
Smokingstatus_Former
Smokingstatus_Nonsmoker
GG_0
GG_100
GG_2550
GG_5075
GG_75100
GG_025
GG_NotAssessed
TumorLocationchoiceRUL_Checked
TumorLocationchoiceRUL_Unchecked
TumorLocationchoiceRML_Checked
TumorLocationchoiceRML_Unchecked
TumorLocationchoiceRLL_Checked
TumorLocationchoiceRLL_Unchecked
TumorLocationchoiceLUL_Checked
TumorLocationchoiceLUL_Unchecked
TumorLocationchoiceLLL_Checked
TumorLocationchoiceLLL_Unchecked
TumorLocationchoiceLLingula_Checked
TumorLocationchoiceLLingula_Unchecked
TumorLocationchoiceUnknown_Unchecked
Histology_Adenocarcinoma
Histology_NSCLCNOSnototherwisespecified
Histology_Squamouscellcarcinoma
PathologicalTstage_T1a
PathologicalTstage_T1b
PathologicalTstage_T2a
PathologicalTstage_T2b
PathologicalTstage_T3
PathologicalTstage_T4
Patholo

IngestionManagerPandas(feature_group_name='clinical-feature-group-29-00-11-19', sagemaker_fs_runtime_client_config=, max_workers=3, max_processes=1, _async_result=, _processing_pool=, _failed_indices=[])