from IPython.display import display
from IPython.display import JSON
from IPython.core.display import HTML
import pandas as pd


# --- no changes; just run this code block ---
def summary_stats(df: pd.DataFrame) -> tuple:
    """Generate summary statistics for a pandas data frame.

    Profiles every column (counts, nulls, cardinality, dtype), assigns each
    column a feature type and a data-quality warning, builds the training
    data schema, and renders the event variables, event labels and schema
    in the notebook via ``IPython.display``.

    Args:
        df (DataFrame): pandas dataframe to create summary statistics for.
            Must contain an ``EVENT_LABEL`` column; the input is copied and
            not modified.

    Returns:
        A 4-tuple ``(df_stats, trainingDataSchema, event_variables,
        event_labels)``:

        * ``df_stats`` -- DataFrame with one row per column of ``df``.
        * ``trainingDataSchema`` -- dict with ``modelVariables`` and a
          ``labelSchema`` whose ``labelMapper`` maps ``FRAUD`` to the least
          frequent label and ``LEGIT`` to the most frequent label.
        * ``event_variables`` -- column names excluding ``EVENT_LABEL`` and
          ``EVENT_TIMESTAMP``.
        * ``event_labels`` -- distinct values of ``EVENT_LABEL`` (as str).

    Raises:
        KeyError: if ``df`` has no ``EVENT_LABEL`` column.
    """
    df = df.copy()
    rowcnt = len(df)
    df['EVENT_LABEL'] = df['EVENT_LABEL'].astype('str', errors='ignore')

    # -- per-column counts and ratios --
    df_s1 = (
        df.agg(['count', 'nunique'])
        .transpose()
        .reset_index()
        .rename(columns={"index": "feature_name"})
    )
    df_s1["null"] = (rowcnt - df_s1["count"]).astype('int64')
    df_s1["not_null"] = rowcnt - df_s1["null"]
    df_s1["null_pct"] = df_s1["null"] / rowcnt
    df_s1["nunique_pct"] = df_s1['nunique'] / rowcnt

    dt = (
        pd.DataFrame(df.dtypes)
        .reset_index()
        .rename(columns={"index": "feature_name", 0: "dtype"})
    )
    df_stats = pd.merge(dt, df_s1, on='feature_name', how='inner').round(4)
    df_stats['nunique'] = df_stats['nunique'].astype('int64')
    df_stats['count'] = df_stats['count'].astype('int64')

    is_numeric = (df_stats["dtype"] == "int64") | (df_stats["dtype"] == "float64")
    feature_name = df_stats["feature_name"]

    # -- variable type mapper --
    # Later assignments win, so name-based types override dtype-based ones.
    # NOTE: the "UNKOWN" spelling is kept as-is; callers may match on it.
    df_stats['feature_type'] = "UNKOWN"
    df_stats.loc[df_stats["dtype"] == object, 'feature_type'] = "CATEGORY"
    df_stats.loc[is_numeric, 'feature_type'] = "NUMERIC"
    # na=False: non-string column names yield NaN from str.contains, which
    # cannot be used as a boolean mask.
    df_stats.loc[
        feature_name.str.contains("ipaddress|ip_address|ipaddr", na=False),
        'feature_type',
    ] = "IP_ADDRESS"
    df_stats.loc[
        feature_name.str.contains("email|email_address|emailaddr", na=False),
        'feature_type',
    ] = "EMAIL_ADDRESS"
    df_stats.loc[feature_name == "EVENT_LABEL", 'feature_type'] = "TARGET"
    df_stats.loc[feature_name == "EVENT_TIMESTAMP", 'feature_type'] = "EVENT_TIMESTAMP"

    # -- variable warnings --
    df_stats['feature_warning'] = "NO WARNING"
    df_stats.loc[
        (df_stats["nunique"] != 2) & (feature_name == "EVENT_LABEL"),
        'feature_warning',
    ] = "LABEL WARNING, NON-BINARY EVENT LABEL"
    df_stats.loc[
        (df_stats["nunique_pct"] > 0.9) & (df_stats['feature_type'] == "CATEGORY"),
        'feature_warning',
    ] = "EXCLUDE, GT 90% UNIQUE"
    df_stats.loc[
        (df_stats["null_pct"] > 0.2) & (df_stats["null_pct"] <= 0.5),
        'feature_warning',
    ] = "NULL WARNING, GT 20% MISSING"
    df_stats.loc[df_stats["null_pct"] > 0.5, 'feature_warning'] = "EXCLUDE, GT 50% MISSING"
    # The 0.2 threshold is a fraction, so it is compared against the unique
    # *ratio* (comparing the integer unique count only ever matched 0).
    # This is the weakest hint, so it never replaces an earlier warning.
    df_stats.loc[
        is_numeric
        & (df_stats['nunique_pct'] < 0.2)
        & (df_stats['feature_warning'] == "NO WARNING"),
        'feature_warning',
    ] = "LIKELY CATEGORICAL, NUMERIC w. LOW CARDINALITY"

    # -- target check --
    event_variables = df_stats.loc[
        ~feature_name.isin(['EVENT_LABEL', 'EVENT_TIMESTAMP']), 'feature_name'
    ].to_list()
    event_labels = df["EVENT_LABEL"].unique().tolist()

    # -- label schema --
    # The rarer label is treated as fraud, the more common one as legitimate.
    label_counts = df["EVENT_LABEL"].value_counts()
    trainingDataSchema = {
        'modelVariables': df_stats.loc[
            df_stats['feature_type'].isin(
                ['IP_ADDRESS', 'EMAIL_ADDRESS', 'CATEGORY', 'NUMERIC']
            ),
            'feature_name',
        ].to_list(),
        'labelSchema': {
            'labelMapper': {
                'FRAUD': [label_counts.idxmin()],
                'LEGIT': [label_counts.idxmax()],
            }
        },
    }

    # -- notebook output --
    display(HTML(
        "\nThese are the available features in the data set "
        "for the AFD model training\n"
    ))
    display(JSON(event_variables))
    display(HTML(
        "We have two types of events - Fraud events and legitimate events\n"
    ))
    display(JSON(event_labels))
    display(HTML(
        "Training data schema is required for creating and training "
        "the model. Refer to documentation\n"
    ))
    display(JSON(trainingDataSchema))

    return df_stats, trainingDataSchema, event_variables, event_labels