# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: LicenseRef-.amazon.com.-AmznSL-1.0
# Licensed under the Amazon Software License http://aws.amazon.com/asl/
from typing import NamedTuple, List, Any, Union, Dict

import pandas as pd
import pytest

from smclarify.bias.report import (
    ProblemType,
    problem_type,
    bias_basic_stats,
    model_performance_report,
    bias_report,
    FacetColumn,
    LabelColumn,
    fetch_metrics_to_run,
    StageType,
    label_value_or_threshold,
)
from smclarify.bias.metrics import PRETRAINING_METRICS, POSTTRAINING_METRICS, CI, DPL, KL, KS, DPPL, DI, DCA, DCR, RD
from smclarify.bias.metrics import common


class LabelValueInput(NamedTuple):
    df: pd.DataFrame
    positive_label_values: List[Union[str, float, int, bool]]


class LabelValueOutput(NamedTuple):
    value_or_threshold: str
    metrics: Dict[str, float]


def test_invalid_input():
    df_cat = pd.DataFrame(
        [["a", 0, 0, "n"], ["b", 0, 1, "y"], ["c", 1, 0, "n"]],
        columns=["x", "y", "label", "predicted_label"],
    )
    for staging_type in StageType:
        # facet not in dataset
        with pytest.raises(ValueError, match="Facet column z is not present in the dataset"):
            bias_report(
                df_cat,
                FacetColumn("z"),
                LabelColumn("Label", df_cat["label"]),
                staging_type,
            )
        # no positive label value
        with pytest.raises(ValueError, match="Positive label values or thresholds are empty for Label column"):
            bias_report(
                df_cat,
                FacetColumn("x"),
                LabelColumn("Label", df_cat["label"]),
                staging_type,
            )
    # incorrect stage type
    with pytest.raises(ValueError, match="stage_type should be a Enum value of StageType"):
        # noinspection PyTypeChecker
        bias_report(
            df_cat,
            FacetColumn("x"),
            LabelColumn("Label", df_cat["label"], [1]),
            "pre_training",
        )
    # post-training but no predicted label column
    with pytest.raises(ValueError, match="predicted_label_column has to be provided for Post training metrics"):
        bias_report(
            df_cat,
            FacetColumn("x"),
            LabelColumn("Label", df_cat["label"], [1]),
            StageType.POST_TRAINING,
        )
    # positive label value of label and predicted label not the same
    match_message = "Positive predicted label values or threshold should be empty or same as label values or thresholds"
    with pytest.raises(ValueError, match=match_message):
        bias_report(
            df_cat,
            FacetColumn("x"),
            LabelColumn("Label", df_cat["label"], [1]),
            StageType.POST_TRAINING,
            LabelColumn("Prediction", df_cat["predicted_label"], [0]),
        )
    # label and predicted label columns have different data types
match_message = "Predicted Label Column series datatype is not the same as Label Column series" with pytest.raises(ValueError, match=match_message): bias_report( df_cat, FacetColumn("x"), LabelColumn("Label", df_cat["label"], [1]), StageType.POST_TRAINING, LabelColumn("Prediction", df_cat["predicted_label"], [1]), ) # threshold not provided for continuous facet df = pd.DataFrame( [ [1.0, 2.0, 3.0, 4.0], [2.0, 3.0, 4.0, 5.0], [3.0, 4.0, 5.0, 6.0], [4.0, 5.0, 6.0, 7.0], ], columns=["Label", "Facet", "Feature", "PredictedLabel"], ) with pytest.raises(ValueError, match="Threshold values must be provided for continuous features"): bias_report( df=df, facet_column=FacetColumn("Facet"), label_column=LabelColumn("Label", df["Label"], [2.0]), stage_type=StageType.POST_TRAINING, predicted_label_column=LabelColumn("PredictedLabel", df["PredictedLabel"], [2.0]), ) with pytest.raises( ValueError, match="Facet/label value provided must be a single numeric threshold for continuous data" ): bias_report( df=df, facet_column=FacetColumn("Facet", [3.0]), label_column=LabelColumn("Label", df["Label"], ["string_threshold"]), stage_type=StageType.PRE_TRAINING, ) def test_report_category_data(): # test the bias_report function on the category data # # pre training bias metrics df_cat = pd.DataFrame( [["a", 1, 1, 1, "1"], ["b", 1, 1, 0, "0"], ["b", 0, 1, 0, "0"], ["b", 0, 0, 1, "1"]], columns=["x", "y", "z", "yhat", "yhat_cat"], ) pretraining_report = bias_report( df_cat, FacetColumn("x"), LabelColumn("y", df_cat["y"], [0]), StageType.PRE_TRAINING, LabelColumn("yhat", df_cat["yhat"]), group_variable=df_cat["z"], ) pretraining_report_cat = bias_report( df_cat, FacetColumn("x"), LabelColumn("y", df_cat["y"], [0]), StageType.PRE_TRAINING, LabelColumn("yhat", df_cat["yhat_cat"]), group_variable=df_cat["z"], ) assert isinstance(pretraining_report, list) assert len(pretraining_report) > 0 assert pretraining_report == pretraining_report_cat result = [ { "metrics": [ { "description": "Conditional Demographic Disparity in Labels " "(CDDL)", "name": "CDDL", "value": pytest.approx(-0.375), }, {"description": "Class Imbalance (CI)", "name": "CI", "value": pytest.approx(0.5)}, { "description": "Difference in Positive Proportions in Labels (DPL)", "name": "DPL", "value": pytest.approx(-0.6666666666666667), }, { "description": "Jensen-Shannon Divergence (JS)", "name": "JS", "value": pytest.approx(0.08720802396075798), }, { "description": "Kullback-Liebler Divergence (KL)", "name": "KL", "value": pytest.approx(-0.3662040962227032), }, { "description": "Kolmogorov-Smirnov Distance (KS)", "name": "KS", "value": pytest.approx(0.6666666666666667), }, {"description": "L-p Norm (LP)", "name": "LP", "value": pytest.approx(0.6666666666666667)}, { "description": "Total Variation Distance (TVD)", "name": "TVD", "value": pytest.approx(0.33333333333333337), }, ], "value_or_threshold": "a", }, { "metrics": [ { "description": "Conditional Demographic Disparity in Labels " "(CDDL)", "name": "CDDL", "value": pytest.approx(0.625), }, {"description": "Class Imbalance (CI)", "name": "CI", "value": pytest.approx(-0.5)}, { "description": "Difference in Positive Proportions in Labels (DPL)", "name": "DPL", "value": pytest.approx(0.6666666666666667), }, { "description": "Jensen-Shannon Divergence (JS)", "name": "JS", "value": pytest.approx(0.08720802396075798), }, { "description": "Kullback-Liebler Divergence (KL)", "name": "KL", "value": pytest.approx(1.0986122886681098), }, { "description": "Kolmogorov-Smirnov Distance (KS)", "name": "KS", "value": 
    result = [
        {
            "metrics": [
                {
                    "description": "Conditional Demographic Disparity in Labels (CDDL)",
                    "name": "CDDL",
                    "value": pytest.approx(-0.375),
                },
                {"description": "Class Imbalance (CI)", "name": "CI", "value": pytest.approx(0.5)},
                {
                    "description": "Difference in Positive Proportions in Labels (DPL)",
                    "name": "DPL",
                    "value": pytest.approx(-0.6666666666666667),
                },
                {
                    "description": "Jensen-Shannon Divergence (JS)",
                    "name": "JS",
                    "value": pytest.approx(0.08720802396075798),
                },
                {
                    "description": "Kullback-Liebler Divergence (KL)",
                    "name": "KL",
                    "value": pytest.approx(-0.3662040962227032),
                },
                {
                    "description": "Kolmogorov-Smirnov Distance (KS)",
                    "name": "KS",
                    "value": pytest.approx(0.6666666666666667),
                },
                {"description": "L-p Norm (LP)", "name": "LP", "value": pytest.approx(0.6666666666666667)},
                {
                    "description": "Total Variation Distance (TVD)",
                    "name": "TVD",
                    "value": pytest.approx(0.33333333333333337),
                },
            ],
            "value_or_threshold": "a",
        },
        {
            "metrics": [
                {
                    "description": "Conditional Demographic Disparity in Labels (CDDL)",
                    "name": "CDDL",
                    "value": pytest.approx(0.625),
                },
                {"description": "Class Imbalance (CI)", "name": "CI", "value": pytest.approx(-0.5)},
                {
                    "description": "Difference in Positive Proportions in Labels (DPL)",
                    "name": "DPL",
                    "value": pytest.approx(0.6666666666666667),
                },
                {
                    "description": "Jensen-Shannon Divergence (JS)",
                    "name": "JS",
                    "value": pytest.approx(0.08720802396075798),
                },
                {
                    "description": "Kullback-Liebler Divergence (KL)",
                    "name": "KL",
                    "value": pytest.approx(1.0986122886681098),
                },
                {
                    "description": "Kolmogorov-Smirnov Distance (KS)",
                    "name": "KS",
                    "value": pytest.approx(0.6666666666666667),
                },
                {"description": "L-p Norm (LP)", "name": "LP", "value": pytest.approx(0.6666666666666667)},
                {
                    "description": "Total Variation Distance (TVD)",
                    "name": "TVD",
                    "value": pytest.approx(0.33333333333333337),
                },
            ],
            "value_or_threshold": "b",
        },
    ]
    assert pretraining_report == result

    # post training bias metrics
    posttraining_report = bias_report(
        df_cat,
        FacetColumn("x"),
        LabelColumn("y", df_cat["y"], [0]),
        StageType.POST_TRAINING,
        LabelColumn("yhat", df_cat["yhat"]),
        metrics=["AD", "DI", "DPPL", "RD"],
        group_variable=df_cat["z"],
    )
    posttraining_report_cat = bias_report(
        df_cat,
        FacetColumn("x"),
        LabelColumn("y", df_cat["y"], [0]),
        StageType.POST_TRAINING,
        LabelColumn("yhat", df_cat["yhat_cat"]),
        metrics=["AD", "DI", "DPPL", "RD"],
        group_variable=df_cat["z"],
    )
    assert isinstance(posttraining_report, list)
    assert len(posttraining_report) > 0
    assert posttraining_report == posttraining_report_cat
    expected_result_1 = [
        {
            "metrics": [
                {"description": "Accuracy Difference (AD)", "name": "AD", "value": pytest.approx(-0.6666666666666667)},
                {"description": "Disparate Impact (DI)", "name": "DI", "value": pytest.approx(3.0)},
                {
                    "description": "Difference in Positive Proportions in Predicted Labels (DPPL)",
                    "name": "DPPL",
                    "value": pytest.approx(-0.6666666666666667),
                },
                {"description": "Recall Difference (RD)", "name": "RD", "value": pytest.approx(-1.0)},
            ],
            "value_or_threshold": "a",
        },
        {
            "metrics": [
                {"description": "Accuracy Difference (AD)", "name": "AD", "value": pytest.approx(0.6666666666666667)},
                {"description": "Disparate Impact (DI)", "name": "DI", "value": pytest.approx(0.3333333333333333)},
                {
                    "description": "Difference in Positive Proportions in Predicted Labels (DPPL)",
                    "name": "DPPL",
                    "value": pytest.approx(0.6666666666666667),
                },
                {"description": "Recall Difference (RD)", "name": "RD", "value": pytest.approx(1.0)},
            ],
            "value_or_threshold": "b",
        },
    ]
    assert posttraining_report == expected_result_1


def test_report_continuous_data():
    # test the bias_report function on continuous data
    #
    # pre training bias metrics
    df_cont = pd.DataFrame(
        [
            [0, 0, 0, 0, True, 1, 1, 1],
            [3, 0, 0, 0, True, 0, 1, 1],
            [3, 0, 1, 0, True, 0, 1, 1],
            [0, 0, 0, 0, False, 1, 1, 0],
            [4, 0, 0, 1, True, 0, 1, 1],
            [0, 0, 1, 0, True, 1, 1, 1],
            [3, 0, 0, 0, True, 1, 1, 1],
            [3, 1, 0, 0, True, 1, 1, 1],
            [0, 0, 1, 0, True, 1, 1, 1],
            [3, 0, 1, 1, True, 1, 0, 1],
            [4, 0, 0, 0, True, 1, 0, 1],
            [3, 0, 1, 0, True, 1, 1, 1],
            [3, 0, 0, 0, False, 1, 1, 0],
            [0, 0, 0, 0, True, 1, 1, 1],
            [0, 0, 1, 0, True, 0, 1, 1],
            [0, 0, 1, 0, True, 1, 1, 1],
            [0, 1, 0, 1, False, 0, 1, 0],
            [3, 0, 0, 0, False, 1, 1, 0],
            [0, 0, 1, 0, False, 1, 1, 1],
            [3, 0, 0, 0, True, 1, 0, 1],
            [3, 0, 1, 0, False, 1, 1, 0],
            [0, 1, 0, 0, False, 1, 1, 0],
            [3, 0, 1, 0, True, 0, 1, 1],
            [0, 0, 0, 1, True, 1, 0, 1],
        ],
        columns=["x", "y", "z", "a", "b", "c", "d", "yhat"],
    )
    pretraining_report = bias_report(
        df_cont,
        FacetColumn("x", [2]),
        LabelColumn("y", df_cont["y"], [0]),
        StageType.PRE_TRAINING,
        LabelColumn("yhat", df_cont["yhat"]),
        group_variable=df_cont["z"],
    )
    assert isinstance(pretraining_report, list)
    assert len(pretraining_report) > 0
    result = [
        {
            "metrics": [
                {
                    "description": "Conditional Demographic Disparity in Labels (CDDL)",
                    "name": "CDDL",
                    "value": pytest.approx(0.3851010101010101),
                },
                {"description": "Class Imbalance (CI)", "name": "CI", "value": pytest.approx(-0.08333333333333333)},
                {
                    "description": "Difference in Positive Proportions in Labels (DPL)",
                    "name": "DPL",
                    "value": pytest.approx(0.1048951048951049),
                },
"description": "Jensen-Shannon Divergence (JS)", "name": "JS", "value": pytest.approx(0.01252420207928287), }, { "description": "Kullback-Liebler Divergence (KL)", "name": "KL", "value": pytest.approx(0.057704603668062765), }, { "description": "Kolmogorov-Smirnov Distance (KS)", "name": "KS", "value": pytest.approx(0.1048951048951049), }, {"description": "L-p Norm (LP)", "name": "LP", "value": pytest.approx(0.14834407996920576)}, { "description": "Total Variation Distance (TVD)", "name": "TVD", "value": pytest.approx(0.1048951048951049), }, ], "value_or_threshold": "(2, 4]", } ] assert pretraining_report == result posttraining_report = bias_report( df_cont, FacetColumn("x", [2]), LabelColumn("y", df_cont["y"], [0]), StageType.POST_TRAINING, LabelColumn("yhat", df_cont["yhat"]), group_variable=df_cont["z"], ) assert isinstance(posttraining_report, list) assert len(posttraining_report) > 0 expected_result_1 = [ { "metrics": [ {"description": "Accuracy Difference (AD)", "name": "AD", "value": pytest.approx(-0.2167832167832168)}, { "description": "Conditional Demographic Disparity in Predicted " "Labels (CDDPL)", "name": "CDDPL", "value": pytest.approx(0.07592592592592595), }, {"description": "Difference in Acceptance Rates (DAR)", "name": "DAR", "value": pytest.approx(-0.1)}, { "description": "Difference in Conditional Acceptance (DCA)", "name": "DCA", "value": pytest.approx(0.15), }, { "description": "Difference in Conditional Rejection (DCR)", "name": "DCR", "value": pytest.approx(1.0), }, {"description": "Disparate Impact (DI)", "name": "DI", "value": pytest.approx(1.0576923076923077)}, { "description": "Difference in Positive Proportions in Predicted " "Labels (DPPL)", "name": "DPPL", "value": pytest.approx(-0.04195804195804198), }, { "description": "Difference in Rejection Rates (DRR)", "name": "DRR", "value": pytest.approx(0.6666666666666667), }, {"description": "Flip Test (FT)", "name": "FT", "value": pytest.approx(-0.23076923076923078)}, { "description": "Generalized Entropy (GE)", "name": "GE", "value": 0.07593688362919139, }, {"description": "Recall Difference (RD)", "name": "RD", "value": -1.0}, {"description": "Specificity Difference (SD)", "name": "SD", "value": 0.1388888888888889}, {"description": "Treatment Equality (TE)", "name": "TE", "value": pytest.approx(-0.25)}, ], "value_or_threshold": "(2, 4]", } ] print(posttraining_report) assert posttraining_report == expected_result_1 def test_report_continuous_data_regression(): # test that we correctly apply thresholds for regression tasks. 


def test_report_continuous_data_regression():
    # test that we correctly apply thresholds for regression tasks.
    #
    df_cont_old = pd.DataFrame(
        [
            [0, 0, 0, 0, True, 1, 1, 1],
            [3, 0, 0, 0, True, 0, 1, 1],
            [3, 0, 1, 0, True, 0, 1, 1],
            [0, 0, 0, 0, False, 1, 1, 0],
            [4, 0, 0, 1, True, 0, 1, 1],
            [0, 0, 1, 0, True, 1, 1, 1],
            [3, 0, 0, 0, True, 1, 1, 1],
            [3, 1, 0, 0, True, 1, 1, 1],
            [0, 0, 1, 0, True, 1, 1, 1],
            [3, 0, 1, 1, True, 1, 0, 1],
            [4, 0, 0, 0, True, 1, 0, 1],
            [3, 0, 1, 0, True, 1, 1, 1],
            [3, 0, 0, 0, False, 1, 1, 0],
            [0, 0, 0, 0, True, 1, 1, 1],
            [0, 0, 1, 0, True, 0, 1, 1],
            [0, 0, 1, 0, True, 1, 1, 1],
            [0, 1, 0, 1, False, 0, 1, 0],
            [3, 0, 0, 0, False, 1, 1, 0],
            [0, 0, 1, 0, False, 1, 1, 1],
            [3, 0, 0, 0, True, 1, 0, 1],
            [3, 0, 1, 0, False, 1, 1, 0],
            [0, 1, 0, 0, False, 1, 1, 0],
            [3, 0, 1, 0, True, 0, 1, 1],
            [0, 0, 0, 1, True, 1, 0, 1],
        ],
        columns=["x", "y", "z", "a", "b", "c", "d", "yhat"],
    )
    df_cont = pd.DataFrame(
        [
            [0, 0.0, 0, 0, True, 1, 1, 11],  # 11 is the highest among y and yhat
            [3, 0.5, 0, 0, True, 0, 1, 6],
            [3, 2, 1, 0, True, 0, 1, 6.6],
            [0, 3, 0, 0, False, 1, 1, 0.3],
            [4, 2.2, 0, 1, True, 0, 1, 6],
            [0, 0.1, 1, 0, True, 1, 1, 6],
            [3, 0, 0, 0, True, 1, 1, 6],
            [3, 6, 0, 0, True, 1, 1, 6],
            [0, 0, 1, 0, True, 1, 1, 6],
            [3, 0, 1, 1, True, 1, 0, 6],
            [4, 0, 0, 0, True, 1, 0, 6],
            [3, 0, 1, 0, True, 1, 1, 6],
            [3, 0, 0, 0, False, 1, 1, 0],
            [0, 0, 0, 0, True, 1, 1, 6.2],
            [0, 0, 1, 0, True, 0, 1, 6.6],
            [0, 0, 1, 0, True, 1, 1, 6.6],
            [0, 7, 0, 1, False, 0, 1, 0.1],
            [3, 0, 0, 0, False, 1, 1, 2],
            [0, 0, 1, 0, False, 1, 1, 8],
            [3, 0, 0, 0, True, 1, 0, 9],
            [3, 0, 1, 0, False, 1, 1, 0.1],
            [0, 8, 0, 0, False, 1, 1, 2.2],
            [3, 0, 1, 0, True, 0, 1, 10],
            [0, 0, 0, 1, True, 1, 0, 9],
        ],
        columns=["x", "y", "z", "a", "b", "c", "d", "yhat"],
    )
    # Old and new df should yield the same results if we use threshold 5 for the latter.
    threshold_old = 0.5
    threshold_new = 5
    assert ((df_cont_old["y"] > threshold_old) == (df_cont["y"] > threshold_new)).all()
    assert ((df_cont_old["yhat"] > threshold_old) == (df_cont["yhat"] > threshold_new)).all()
    posttraining_report = bias_report(
        df_cont,
        FacetColumn("x", [2]),
        LabelColumn("y", df_cont["y"], positive_label_values=[threshold_new]),
        StageType.POST_TRAINING,
        LabelColumn("yhat", df_cont["yhat"], positive_label_values=[threshold_new]),
        group_variable=df_cont["z"],
    )
    posttraining_report_old = bias_report(
        df_cont_old,
        FacetColumn("x", [2]),
        LabelColumn("y", df_cont_old["y"], positive_label_values=[threshold_old]),
        StageType.POST_TRAINING,
        LabelColumn("yhat", df_cont_old["yhat"], positive_label_values=[threshold_old]),
        group_variable=df_cont["z"],
    )
    assert posttraining_report == posttraining_report_old


def test_report_string_data_determined_as_continuous():
    # Although the data columns look categorical, they are determined to be continuous
    # because the values can be cast to numbers and their uniqueness is high.
    # This test checks that the report method handles that case correctly.
    df = pd.DataFrame(
        data=[
            ["1", "1", "1", "1"],
            ["2", "2", "2", "2"],
            ["3", "3", "3", "3"],
            ["4", "4", "4", "4"],
        ],
        columns=["Label", "Facet", "Feature", "PredictedLabel"],
    )
    pretraining_report = bias_report(
        df=df,
        facet_column=FacetColumn("Facet", [2]),
        label_column=LabelColumn("Label", df["Label"], [2]),
        stage_type=StageType.POST_TRAINING,
        predicted_label_column=LabelColumn("PredictedLabel", df["PredictedLabel"], [2]),
        metrics=["DPPL"],
    )
    # Strictly speaking, the validation below is not required: if there were a problem, the report
    # method would already have failed with an error like "TypeError: bad operand type for abs(): 'str'"
    # when it tried to manipulate the strings as numbers.
    assert pretraining_report == [
        {
            "value_or_threshold": "(2, 4]",  # <== range, so the facet is indeed determined as continuous
            "metrics": [
                {
                    "name": "DPPL",
                    "description": "Difference in Positive Proportions in Predicted Labels (DPPL)",
                    "value": pytest.approx(-1.0),
                }
            ],
        }
    ]


def test_report_integer_data_determined_as_categorical():
    # Although the data columns look continuous, they are determined to be categorical because
    # the facet values and label values are given as lists of categories rather than a single
    # threshold. Note that the label column and the predicted label column have different
    # categories (1,3,4 and 2,3,4 respectively); they cannot be cast to each other's type,
    # but the positive label index can still be determined without problems.
    df = pd.DataFrame(
        data=[
            [1, 1, 1, 2],
            [1, 2, 2, 2],
            [3, 3, 3, 3],
            [4, 4, 4, 4],
        ],
        columns=["Label", "Facet", "Feature", "PredictedLabel"],
    )
    pretraining_report = bias_report(
        df=df,
        facet_column=FacetColumn("Facet", [1, 2]),
        label_column=LabelColumn("Label", df["Label"], [1, 2]),
        stage_type=StageType.POST_TRAINING,
        predicted_label_column=LabelColumn("PredictedLabel", df["PredictedLabel"], [1, 2]),
        metrics=["DPPL"],
    )
    assert pretraining_report == [
        {
            "value_or_threshold": "1,2",  # <== list of values, so the facet is indeed determined as categorical
            "metrics": [
                {
                    "name": "DPPL",
                    "description": "Difference in Positive Proportions in Predicted Labels (DPPL)",
                    "value": pytest.approx(-1.0),
                }
            ],
        }
    ]


def test_label_values():
    """
    Test bias metrics for multiple label values.
    """
    df = pd.DataFrame(
        [["a", "p", 1, "p"], ["b", "q", 1, "p"], ["b", "r", 1, "q"], ["c", "p", 0, "p"], ["c", "q", 0, "p"]],
        columns=["x", "y", "z", "yhat"],
    )
    # when explicit label values are given for categorical data
    # Pre training bias metrics
    pretraining_report = bias_report(
        df,
        FacetColumn("x"),
        LabelColumn("y", df["y"], ["p", "q"]),
        StageType.PRE_TRAINING,
        LabelColumn("yhat", df["yhat"]),
        metrics=["DPL", "CDDL"],
        group_variable=df["z"],
    )
    assert isinstance(pretraining_report[0], dict)
    expected_result_1 = [
        {
            "metrics": [
                {
                    "description": "Conditional Demographic Disparity in Labels (CDDL)",
                    "name": "CDDL",
                    "value": pytest.approx(-0.3),
                },
                {
                    "description": "Difference in Positive Proportions in Labels (DPL)",
                    "name": "DPL",
                    "value": pytest.approx(-0.25),
                },
            ],
            "value_or_threshold": "a",
        },
        {
            "metrics": [
                {
                    "description": "Conditional Demographic Disparity in Labels (CDDL)",
                    "name": "CDDL",
                    "value": pytest.approx(0.3),
                },
                {
                    "description": "Difference in Positive Proportions in Labels (DPL)",
                    "name": "DPL",
                    "value": pytest.approx(0.5),
                },
            ],
            "value_or_threshold": "b",
        },
        {
            "metrics": [
                {
                    "description": "Conditional Demographic Disparity in Labels (CDDL)",
                    "name": "CDDL",
                    "value": pytest.approx(-0.4),
                },
                {
                    "description": "Difference in Positive Proportions in Labels (DPL)",
                    "name": "DPL",
                    "value": pytest.approx(-0.33333333333333337),
                },
            ],
            "value_or_threshold": "c",
        },
    ]
    assert pretraining_report == expected_result_1

    # post training bias metrics
    posttraining_report = bias_report(
        df,
        FacetColumn("x"),
        LabelColumn("y", df["y"], ["p", "q"]),
        StageType.POST_TRAINING,
        LabelColumn("yhat", df["yhat"]),
        metrics=["AD", "DI", "DPPL", "RD", "DAR", "DRR"],
        group_variable=df["z"],
    )
    assert isinstance(posttraining_report[0], dict)
    expected_result_2 = [
        {
            "metrics": [
                {"description": "Accuracy Difference (AD)", "name": "AD", "value": pytest.approx(-0.25)},
                {"description": "Difference in Acceptance Rates (DAR)", "name": "DAR", "value": pytest.approx(-0.25)},
                {"description": "Disparate Impact (DI)", "name": "DI", "value": pytest.approx(1.0)},
{ "description": "Difference in Positive Proportions in Predicted " "Labels (DPPL)", "name": "DPPL", "value": pytest.approx(0.0), }, {"description": "Difference in Rejection Rates (DRR)", "name": "DRR", "value": pytest.approx(0)}, {"description": "Recall Difference (RD)", "name": "RD", "value": pytest.approx(0.0)}, ], "value_or_threshold": "a", }, { "metrics": [ {"description": "Accuracy Difference (AD)", "name": "AD", "value": pytest.approx(0.5)}, {"description": "Difference in Acceptance Rates (DAR)", "name": "DAR", "value": pytest.approx(0.5)}, {"description": "Disparate Impact (DI)", "name": "DI", "value": pytest.approx(1.0)}, { "description": "Difference in Positive Proportions in Predicted " "Labels (DPPL)", "name": "DPPL", "value": pytest.approx(0.0), }, {"description": "Difference in Rejection Rates (DRR)", "name": "DRR", "value": pytest.approx(0)}, {"description": "Recall Difference (RD)", "name": "RD", "value": pytest.approx(0.0)}, ], "value_or_threshold": "b", }, { "metrics": [ {"description": "Accuracy Difference (AD)", "name": "AD", "value": pytest.approx(-0.33333333333333337)}, { "description": "Difference in Acceptance Rates (DAR)", "name": "DAR", "value": pytest.approx(-0.33333333333333337), }, {"description": "Disparate Impact (DI)", "name": "DI", "value": pytest.approx(1.0)}, { "description": "Difference in Positive Proportions in Predicted " "Labels (DPPL)", "name": "DPPL", "value": pytest.approx(0.0), }, {"description": "Difference in Rejection Rates (DRR)", "name": "DRR", "value": pytest.approx(0)}, {"description": "Recall Difference (RD)", "name": "RD", "value": pytest.approx(0.0)}, ], "value_or_threshold": "c", }, ] assert posttraining_report == expected_result_2 def label_values_test_cases() -> List[List[Union[LabelValueInput, List[LabelValueOutput]]]]: """ Setting the `y` and `yhat` series .astype('category'), as this conversion feature is supposed to work only on categorical data. 
""" test_cases = [] output = [ LabelValueOutput("a", {"CDDL": -0.3, "DPL": -0.25}), LabelValueOutput("b", {"CDDL": 0.3, "DPL": 0.5}), LabelValueOutput("c", {"CDDL": -0.4, "DPL": -0.33333333333333337}), ] df = pd.DataFrame( [["a", None, 1, None], ["b", None, 1, None], ["b", None, 1, None], ["c", None, 0, None], ["c", None, 0, None]], columns=["x", "y", "z", "yhat"], ) # series - int, label values - int df["y"] = pd.Series([1, 2, 0, 1, 2]).astype("category") df["yhat"] = pd.Series([1, 1, 0, 1, 1]).astype("category") function_input = LabelValueInput(df=df.copy(), positive_label_values=[1, 2]) test_cases.append([function_input, output[:]]) # series - str, label values - int df["y"] = pd.Series(["1", "2", "0", "1", "2"]).astype("category") df["yhat"] = pd.Series(["1", "1", "0", "1", "1"]).astype("category") function_input = LabelValueInput(df=df.copy(), positive_label_values=[1, 2]) test_cases.append([function_input, output[:]]) # series - int, label values - str df["y"] = pd.Series([1, 2, 0, 1, 2]).astype("category") df["yhat"] = pd.Series([1, 1, 0, 1, 1]).astype("category") function_input = LabelValueInput(df=df.copy(), positive_label_values=["1", "2"]) test_cases.append([function_input, output[:]]) # series - str, label values - str df["y"] = pd.Series(["1", "2", "0", "1", "2"]).astype("category") df["yhat"] = pd.Series(["1", "1", "0", "1", "1"]).astype("category") function_input = LabelValueInput(df=df.copy(), positive_label_values=["1", "2"]) test_cases.append([function_input, output[:]]) return test_cases @pytest.mark.parametrize("function_input,function_output", label_values_test_cases()) def test_label_values_with_different_types_for_pre_training( function_input: LabelValueInput, function_output: List[LabelValueOutput] ): df = function_input.df pretraining_report = bias_report( df, FacetColumn("x"), LabelColumn("y", df["y"], function_input.positive_label_values), StageType.PRE_TRAINING, LabelColumn("yhat", df["yhat"]), metrics=["DPL", "CDDL"], group_variable=df["z"], ) expected_result_1 = [ { "value_or_threshold": output.value_or_threshold, "metrics": [ { "description": "Conditional Demographic Disparity in Labels " "(CDDL)", "name": "CDDL", "value": pytest.approx(output.metrics["CDDL"]), }, { "description": "Difference in Positive Proportions in Labels " "(DPL)", "name": "DPL", "value": pytest.approx(output.metrics["DPL"]), }, ], } for output in function_output ] assert pretraining_report == expected_result_1 @pytest.mark.parametrize("function_input,function_output", label_values_test_cases()) def test_label_values_with_different_types_for_post_training( function_input: LabelValueInput, function_output: List[LabelValueOutput] ): df = function_input.df pretraining_report = bias_report( df, FacetColumn("x"), LabelColumn("y", df["y"], function_input.positive_label_values), StageType.POST_TRAINING, LabelColumn("yhat", df["yhat"]), metrics=["DPPL", "CDDPL"], group_variable=df["z"], ) expected_result_1 = [ { "value_or_threshold": output.value_or_threshold, "metrics": [ { "description": "Conditional Demographic Disparity in Predicted " "Labels (CDDPL)", "name": "CDDPL", "value": pytest.approx(output.metrics["CDDL"]), }, { "description": "Difference in Positive Proportions in Predicted " "Labels (DPPL)", "name": "DPPL", "value": pytest.approx(output.metrics["DPL"]), }, ], } for output in function_output ] assert pretraining_report == expected_result_1 def test_fetch_metrics_to_run(): """ test the list of callable metric functions to be run """ input_metrics_1 = ["CI", "DPL", "KL", "KS"] 


def test_fetch_metrics_to_run():
    """
    Test the list of callable metric functions to be run.
    """
    input_metrics_1 = ["CI", "DPL", "KL", "KS"]
    metrics_to_run = fetch_metrics_to_run(PRETRAINING_METRICS, input_metrics_1)
    assert metrics_to_run == [CI, DPL, KL, KS]

    input_metrics_2 = ["DPPL", "DI", "DCA", "DCR", "RD"]
    metrics_to_run = fetch_metrics_to_run(POSTTRAINING_METRICS, input_metrics_2)
    assert metrics_to_run == [DPPL, DI, DCA, DCR, RD]


def test_partial_bias_report():
    """
    Test that a partial bias report is still generated when errors occur while computing some metrics.
    """
    df = pd.DataFrame(
        [[1, 1, 1, 1], [2, 1, 1, 0], [3, 0, 0, 0], [2, 0, 1, 1], [0, 0, 1, 1]], columns=["x", "y", "z", "yhat"]
    )
    # pre training bias metrics
    pretraining_report = bias_report(
        df,
        FacetColumn("x", [2]),
        LabelColumn("y", df["y"], [0]),
        StageType.PRE_TRAINING,
        LabelColumn("yhat", df["yhat"]),
        metrics=["CI", "CDDL", "DPL", "KL"],
    )
    assert isinstance(pretraining_report, list)
    expected_result_1 = [
        {
            "metrics": [
                {
                    "description": "Conditional Demographic Disparity in Labels (CDDL)",
                    "error": "Group variable is empty or not provided",
                    "name": "CDDL",
                    "value": None,
                },
                {"description": "Class Imbalance (CI)", "name": "CI", "value": pytest.approx(0.6)},
                {
                    "description": "Difference in Positive Proportions in Labels (DPL)",
                    "name": "DPL",
                    "value": pytest.approx(0.5),
                },
                {
                    "description": "Kullback-Liebler Divergence (KL)",
                    "name": "KL",
                    "value": pytest.approx(-0.34657359027997264),
                },
            ],
            "value_or_threshold": "(2, 3]",
        }
    ]
    assert pretraining_report == expected_result_1

    # post training bias metrics
    posttraining_report = bias_report(
        df,
        FacetColumn("x", [2]),
        LabelColumn("y", df["y"], [0]),
        StageType.POST_TRAINING,
        LabelColumn("yhat", df["yhat"]),
        metrics=["AD", "CDDPL", "DCA", "DI", "DPPL", "FT", "GE", "SD"],
    )
    assert isinstance(posttraining_report, list)
    expected_result_2 = [
        {
            "metrics": [
                {"description": "Accuracy Difference (AD)", "name": "AD", "value": pytest.approx(-0.75)},
                {
                    "description": "Conditional Demographic Disparity in Predicted Labels (CDDPL)",
                    "error": "Group variable is empty or not provided",
                    "name": "CDDPL",
                    "value": None,
                },
                {
                    "description": "Difference in Conditional Acceptance (DCA)",
                    "name": "DCA",
                    "value": pytest.approx(0.6666666666666666),
                },
                {"description": "Disparate Impact (DI)", "name": "DI", "value": pytest.approx(0.0)},
                {
                    "description": "Difference in Positive Proportions in Predicted Labels (DPPL)",
                    "name": "DPPL",
                    "value": pytest.approx(0.75),
                },
                {"description": "Flip Test (FT)", "name": "FT", "value": pytest.approx(-1.0)},
                {"description": "Generalized Entropy (GE)", "name": "GE", "value": 0.19444444444444456},
                {"description": "Specificity Difference (SD)", "name": "SD", "value": 1.0},
            ],
            "value_or_threshold": "(2, 3]",
        }
    ]
    assert posttraining_report == expected_result_2


def test_metric_descriptions():
    """
    Test that the list of callable metrics all have descriptions present.
    """
    pretraining_metrics = PRETRAINING_METRICS
    posttraining_metrics = POSTTRAINING_METRICS
    pretraining_metric_descriptions = {}
    for metric in pretraining_metrics:
        description = common.metric_description(metric)
        pretraining_metric_descriptions.update({metric.__name__: description})
    expected_result_1 = {
        "CDDL": "Conditional Demographic Disparity in Labels (CDDL)",
        "CI": "Class Imbalance (CI)",
        "DPL": "Difference in Positive Proportions in Labels (DPL)",
        "JS": "Jensen-Shannon Divergence (JS)",
        "KL": "Kullback-Liebler Divergence (KL)",
        "KS": "Kolmogorov-Smirnov Distance (KS)",
        "LP": "L-p Norm (LP)",
        "TVD": "Total Variation Distance (TVD)",
    }
    assert pretraining_metric_descriptions == expected_result_1

    # post training metrics
    posttraining_metric_descriptions = {}
    for metric in posttraining_metrics:
        description = common.metric_description(metric)
        posttraining_metric_descriptions.update({metric.__name__: description})
    expected_result_2 = {
        "AD": "Accuracy Difference (AD)",
        "CDDPL": "Conditional Demographic Disparity in Predicted Labels (CDDPL)",
        "DAR": "Difference in Acceptance Rates (DAR)",
        "DCA": "Difference in Conditional Acceptance (DCA)",
        "DCR": "Difference in Conditional Rejection (DCR)",
        "DI": "Disparate Impact (DI)",
        "DPPL": "Difference in Positive Proportions in Predicted Labels (DPPL)",
        "DRR": "Difference in Rejection Rates (DRR)",
        "FT": "Flip Test (FT)",
        "GE": "Generalized Entropy (GE)",
        "RD": "Recall Difference (RD)",
        "SD": "Specificity Difference (SD)",
        "TE": "Treatment Equality (TE)",
    }
    assert posttraining_metric_descriptions == expected_result_2


def test_predicted_label_values():
    """
    Test that an exception is raised when the positive predicted label values differ from the positive label values.
    """
    df = pd.DataFrame(
        [["a", "p", 1, "p"], ["b", "q", 1, "p"], ["b", "r", 1, "q"], ["c", "p", 0, "p"], ["c", "q", 0, "p"]],
        columns=["x", "y", "z", "yhat"],
    )
    # when explicit label values are given for categorical data
    # Pre training bias metrics
    with pytest.raises(
        ValueError,
        match="Positive predicted label values or threshold should be empty or same as label values or thresholds",
    ):
        bias_report(
            df,
            FacetColumn("x"),
            LabelColumn("y", df["y"], ["p", "q"]),
            StageType.PRE_TRAINING,
            LabelColumn("yhat", df["yhat"], ["q"]),
            metrics=["DPL", "CDDL"],
            group_variable=df["z"],
        )


def test_problem_type():
    series = pd.Series([1, 2, 1, 2])
    assert problem_type(series) == ProblemType.BINARY


def test_bias_basic_stats():
    df_cat = pd.DataFrame(
        [["a", 1, 1, 1, "1"], ["b", 1, 1, 0, "0"], ["b", 0, 1, 0, "0"], ["b", 0, 0, 1, "1"]],
        columns=["x", "y", "z", "yhat", "yhat_cat"],
    )
    # Proportion
    results = bias_basic_stats(
        df_cat,
        FacetColumn("x"),
        LabelColumn("y", df_cat["y"], [0]),
        StageType.PRE_TRAINING,
        LabelColumn("yhat", df_cat["yhat"]),
    )
    expected_results = [
        {
            "value_or_threshold": "a",
            "metrics": [
                {
                    "name": "proportion",
                    "description": "Proportion of examples in sensitive facet.",
                    "value": pytest.approx(0.25),
                },
                {
                    "name": "observed_label_distribution",
                    "description": "Distribution of observed label outcomes for sensitive facet",
                    "value": [pytest.approx(1.0), pytest.approx(0.0)],
                },
            ],
        },
        {
            "value_or_threshold": "b",
            "metrics": [
                {
                    "name": "proportion",
                    "description": "Proportion of examples in sensitive facet.",
                    "value": pytest.approx(0.75),
                },
                {
                    "name": "observed_label_distribution",
                    "description": "Distribution of observed label outcomes for sensitive facet",
                    "value": [pytest.approx(1 / 3.0), pytest.approx(2 / 3.0)],
                },
            ],
        },
    ]
    assert expected_results == results

    # Confusion matrix
    results = bias_basic_stats(
        df_cat,
        FacetColumn("x"),
        LabelColumn("y", df_cat["y"], [0]),
        StageType.POST_TRAINING,
        LabelColumn("yhat", df_cat["yhat"]),
    )
    expected_results = [
        {
            "value_or_threshold": "a",
            "metrics": [
                {
                    "name": "proportion",
                    "description": "Proportion of examples in sensitive facet.",
                    "value": pytest.approx(0.25),
                },
                {
                    "name": "observed_label_distribution",
                    "description": "Distribution of observed label outcomes for sensitive facet",
                    "value": [pytest.approx(1.0), pytest.approx(0.0)],
                },
                {
                    "name": "confusion_matrix",
                    "description": "Fractions of TP, FP, FN, TN.",
                    "value": [pytest.approx(1.0), pytest.approx(0.0), pytest.approx(0.0), pytest.approx(0.0)],
                },
            ],
        },
        {
            "value_or_threshold": "b",
            "metrics": [
                {
                    "name": "proportion",
                    "description": "Proportion of examples in sensitive facet.",
                    "value": pytest.approx(0.75),
                },
                {
                    "name": "observed_label_distribution",
                    "description": "Distribution of observed label outcomes for sensitive facet",
                    "value": [pytest.approx(1 / 3.0), pytest.approx(2 / 3.0)],
                },
                {
                    "name": "confusion_matrix",
                    "description": "Fractions of TP, FP, FN, TN.",
                    "value": [
                        pytest.approx(0.0),
                        pytest.approx(1 / 3.0),
                        pytest.approx(1 / 3.0),
                        pytest.approx(1 / 3.0),
                    ],
                },
            ],
        },
    ]
    assert expected_results == results


def test_model_performance_categorical():
    df_cat = pd.DataFrame(
        [["a", "p", 1, 1, "q"], ["b", "p", 1, 0, "r"], ["b", "r", 1, 0, "q"], ["b", "q", 0, 1, "p"]],
        columns=["x", "y_cat", "z", "yhat", "yhat_cat"],
    )
    result = model_performance_report(
        df=df_cat,
        label_column=LabelColumn("y_cat", df_cat["y_cat"], ["p"]),
        predicted_label_column=LabelColumn("yhat_cat", df_cat["yhat_cat"], ["p"]),
    )
    expected_result = {
        "label": "y_cat",
        "model_performance_metrics": [
            {
                "name": "Accuracy",
                "description": "Proportion of inputs assigned the correct predicted label by the model.",
                "value": pytest.approx(1 / 4.0),
            },
            {
                "name": "Proportion of Positive Predictions in Labels",
                "description": "Proportion of input assigned in positive predicted label.",
                "value": pytest.approx(1 / 4.0),
            },
            {
                "name": "Proportion of Negative Predictions in Labels",
                "description": "Proportion of input assigned the negative predicted label.",
                "value": pytest.approx(3 / 4.0),
            },
            {
                "name": "True Positive Rate / Recall",
                "description": "Proportion of inputs with positive observed label correctly assigned the positive predicted label.",
                "value": pytest.approx(0.0),
            },
            {
                "name": "True Negative Rate / Specificity",
                "description": "Proportion of inputs with negative observed label correctly assigned the negative predicted label.",
                "value": pytest.approx(1 / 2.0),
            },
            {
                "name": "Acceptance Rate / Precision",
                "description": "Proportion of inputs with positive predicted label that actually have a positive observed label.",
                "value": pytest.approx(0.0),
            },
            {
                "name": "Rejection Rate",
                "description": "Proportion of inputs with negative predicted label that actually have a negative observed label.",
                "value": pytest.approx(1 / 3.0),
            },
            {
                "name": "Conditional Acceptance",
                "description": "Ratio between the positive observed labels and positive predicted labels.",
                "value": pytest.approx(2.0),
            },
            {
                "name": "Conditional Rejection",
                "description": "Ratio between the negative observed labels and negative predicted labels.",
                "value": pytest.approx(2 / 3.0),
            },
            {"name": "F1 Score", "description": "Harmonic mean of precision and recall.", "value": pytest.approx(0.0)},
        ],
        "binary_confusion_matrix": [pytest.approx(0.0), pytest.approx(0.25), pytest.approx(0.5), pytest.approx(0.25)],
        "confusion_matrix": {
            "p": {"p": pytest.approx(0.0), "q": pytest.approx(1.0), "r": pytest.approx(1.0)},
            "q": {"p": pytest.approx(1.0), "q": pytest.approx(0.0), "r": pytest.approx(0.0)},
            "r": {"p": pytest.approx(0.0), "q": pytest.approx(1.0), "r": pytest.approx(0.0)},
        },
    }
    assert expected_result == result


def test_model_performance_continuous():
    df_cont = pd.DataFrame(
        [
            [0, 0.0, 0, 0, True, 1, 1, 11],  # 11 is the highest among y and yhat
            [3, 0.5, 0, 0, True, 0, 1, 6],
            [3, 2, 1, 0, True, 0, 1, 6.6],
            [0, 3, 0, 0, False, 1, 1, 0.3],
            [4, 2.2, 0, 1, True, 0, 1, 6],
            [0, 0.1, 1, 0, True, 1, 1, 6],
            [3, 0, 0, 0, True, 1, 1, 6],
            [3, 6, 0, 0, True, 1, 1, 6],
            [0, 0, 1, 0, True, 1, 1, 6],
            [3, 0, 1, 1, True, 1, 0, 6],
            [4, 0, 0, 0, True, 1, 0, 6],
            [3, 0, 1, 0, True, 1, 1, 6],
            [3, 0, 0, 0, False, 1, 1, 0],
            [0, 0, 0, 0, True, 1, 1, 6.2],
            [0, 0, 1, 0, True, 0, 1, 6.6],
            [0, 0, 1, 0, True, 1, 1, 6.6],
            [0, 7, 0, 1, False, 0, 1, 0.1],
            [3, 0, 0, 0, False, 1, 1, 2],
            [0, 0, 1, 0, False, 1, 1, 8],
            [3, 0, 0, 0, True, 1, 0, 9],
            [3, 0, 1, 0, False, 1, 1, 0.1],
            [0, 8, 0, 0, False, 1, 1, 2.2],
            [3, 0, 1, 0, True, 0, 1, 10],
            [0, 0, 0, 1, True, 1, 0, 9],
        ],
        columns=["x", "y", "z", "a", "b", "c", "d", "yhat"],
    )
    result = model_performance_report(
        df=df_cont,
        label_column=LabelColumn("y", df_cont["y"], [5]),
        predicted_label_column=LabelColumn("yhat", df_cont["yhat"], [5]),
    )
    # No multicategory confusion matrix
    expected_result = {
        "label": "y",
        "model_performance_metrics": [
            {
                "name": "Accuracy",
                "description": "Proportion of inputs assigned the correct predicted label by the model.",
                "value": pytest.approx(5 / 24),
            },
            {
                "name": "Proportion of Positive Predictions in Labels",
                "description": "Proportion of input assigned in positive predicted label.",
                "value": pytest.approx(0.75),
            },
            {
                "name": "Proportion of Negative Predictions in Labels",
                "description": "Proportion of input assigned the negative predicted label.",
                "value": pytest.approx(0.25),
            },
            {
                "name": "True Positive Rate / Recall",
                "description": "Proportion of inputs with positive observed label correctly assigned the positive predicted label.",
                "value": pytest.approx(1 / 3),
            },
            {
                "name": "True Negative Rate / Specificity",
                "description": "Proportion of inputs with negative observed label correctly assigned the negative predicted label.",
                "value": pytest.approx(4 / 21),
            },
            {
                "name": "Acceptance Rate / Precision",
                "description": "Proportion of inputs with positive predicted label that actually have a positive observed label.",
                "value": pytest.approx(1 / 18),
            },
            {
                "name": "Rejection Rate",
                "description": "Proportion of inputs with negative predicted label that actually have a negative observed label.",
                "value": pytest.approx(2 / 3.0),
            },
            {
                "name": "Conditional Acceptance",
                "description": "Ratio between the positive observed labels and positive predicted labels.",
                "value": pytest.approx(1 / 6),
            },
            {
                "name": "Conditional Rejection",
                "description": "Ratio between the negative observed labels and negative predicted labels.",
                "value": pytest.approx(3.5),
            },
            {
                "name": "F1 Score",
                "description": "Harmonic mean of precision and recall.",
                "value": pytest.approx(2 / 21),
            },
        ],
        "binary_confusion_matrix": [
            pytest.approx(1 / 24),
            pytest.approx(17 / 24),
            pytest.approx(1 / 12),
            pytest.approx(1 / 6),
        ],
    }
    assert expected_result == result


class LabelValueOrThresholdFunctionInput(NamedTuple):
    data: pd.Series
    values: List[Any]


class LabelValueOrThresholdFunctionOutput(NamedTuple):
    result: str


def label_value_or_threshold_test_cases():
    test_cases = []

    # categorical data series
    function_input = LabelValueOrThresholdFunctionInput(data=pd.Series([1, 2, 3]).astype("category"), values=[2])
    function_output = LabelValueOrThresholdFunctionOutput(result="2")  # instead of "(2, 3]"
    test_cases.append([function_input, function_output])

    # categorical values
    function_input = LabelValueOrThresholdFunctionInput(data=pd.Series([1, 2, 3]), values=[1, 2])
    function_output = LabelValueOrThresholdFunctionOutput(result="1,2")
    test_cases.append([function_input, function_output])

    # continuous data series
    function_input = LabelValueOrThresholdFunctionInput(data=pd.Series([1.0, 2.0, 3.0]), values=[2.0])
    function_output = LabelValueOrThresholdFunctionOutput(result="(2.0, 3.0]")
    test_cases.append([function_input, function_output])

    # continuous data series, positive value less than all data
    function_input = LabelValueOrThresholdFunctionInput(data=pd.Series([1.0, 2.0, 3.0]), values=[0.0])
    function_output = LabelValueOrThresholdFunctionOutput(result="(0.0, 3.0]")
    test_cases.append([function_input, function_output])

    # continuous data series, positive value greater than all data
    function_input = LabelValueOrThresholdFunctionInput(data=pd.Series([1.0, 2.0, 3.0]), values=[5.0])
    function_output = LabelValueOrThresholdFunctionOutput(result="(3.0, 5.0]")
    test_cases.append([function_input, function_output])

    # object data series, can NOT be converted to numeric
    function_input = LabelValueOrThresholdFunctionInput(data=pd.Series(["yes", "no", "yes"]), values=["yes"])
    function_output = LabelValueOrThresholdFunctionOutput(result="yes")
    test_cases.append([function_input, function_output])

    # object data series, can be converted to numeric, and uniqueness is high
    function_input = LabelValueOrThresholdFunctionInput(data=pd.Series(["1", "2", "3"]), values=[2])
    function_output = LabelValueOrThresholdFunctionOutput(result="(2, 3]")
    test_cases.append([function_input, function_output])

    # boolean data series
    function_input = LabelValueOrThresholdFunctionInput(data=pd.Series([True, False, True]), values=[True])
    function_output = LabelValueOrThresholdFunctionOutput(result="True")
    test_cases.append([function_input, function_output])

    # trivial dataset where the labels don't have as many elements as label_values
    data = pd.Series([0])
    positive_values = [1]
    function_input = LabelValueOrThresholdFunctionInput(data=data, values=positive_values)
    function_output = LabelValueOrThresholdFunctionOutput(result="(0, 1]")
    test_cases.append([function_input, function_output])

    return test_cases


@pytest.mark.parametrize("function_input,function_output", label_value_or_threshold_test_cases())
def test_label_value_or_threshold(function_input, function_output):
    result = label_value_or_threshold(*function_input)
    assert result == function_output.result
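

# Pattern exercised by the cases above (a summary inferred from the expected values, not from the
# implementation): for data treated as continuous, label_value_or_threshold describes the positive
# facet/label as the half-open interval "(threshold, upper]", where upper is the larger of the data
# maximum and the threshold itself; for categorical, object, and boolean data it simply joins the
# positive values with commas.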