from typing import Any, List, NamedTuple, Optional import pandas as pd import pytest from smclarify.bias.metrics.common import ( DataType, series_datatype, ensure_series_data_type, convert_positive_label_values, calc_confusion_matrix_quadrants, ) from .test_metrics import dfBinary class EnsureSeriesDataTypeInput(NamedTuple): data: pd.Series values: Optional[List[Any]] = None class EnsureSeriesDataTypeOutput(NamedTuple): data_type: DataType new_data: pd.Series def ensure_series_data_type_test_cases(): test_cases = [] # categorical data series data = pd.Series([1, 2, 3]).astype("category") function_input = EnsureSeriesDataTypeInput(data=data) function_output = EnsureSeriesDataTypeOutput(data_type=DataType.CATEGORICAL, new_data=data) test_cases.append([function_input, function_output]) # categorical values data = pd.Series([1, 2, 3]) function_input = EnsureSeriesDataTypeInput(data=data, values=[1, 2, 3]) function_output = EnsureSeriesDataTypeOutput(data_type=DataType.CATEGORICAL, new_data=data.astype("category")) test_cases.append([function_input, function_output]) # floating data series data = pd.Series([1.0, 2.0, 3.0]) function_input = EnsureSeriesDataTypeInput(data=data) function_output = EnsureSeriesDataTypeOutput(data_type=DataType.CONTINUOUS, new_data=data) test_cases.append([function_input, function_output]) # object data series, can NOT be converted to numeric data = pd.Series(["a", "b", "c"]) function_input = EnsureSeriesDataTypeInput(data=data) function_output = EnsureSeriesDataTypeOutput(data_type=DataType.CATEGORICAL, new_data=data.astype("category")) test_cases.append([function_input, function_output]) # object data series, can be converted to numeric, and uniqueness is high data = pd.Series(["1", "2", "3"]) function_input = EnsureSeriesDataTypeInput(data=data) function_output = EnsureSeriesDataTypeOutput(data_type=DataType.CONTINUOUS, new_data=pd.to_numeric(data)) test_cases.append([function_input, function_output]) # object data series, can be converted to numeric, but uniqueness is low data = ["1"] * 40 data.append("2") data = pd.Series(data) function_input = EnsureSeriesDataTypeInput(data=data) function_output = EnsureSeriesDataTypeOutput(data_type=DataType.CATEGORICAL, new_data=data.astype("category")) test_cases.append([function_input, function_output]) # integer data series, uniqueness is high data = pd.Series([1, 2, 3]) function_input = EnsureSeriesDataTypeInput(data=data) function_output = EnsureSeriesDataTypeOutput(data_type=DataType.CONTINUOUS, new_data=data) test_cases.append([function_input, function_output]) # integer data series, uniqueness is low data = [1] * 40 data.append(2) data = pd.Series(data) function_input = EnsureSeriesDataTypeInput(data=data) function_output = EnsureSeriesDataTypeOutput(data_type=DataType.CATEGORICAL, new_data=data.astype("category")) test_cases.append([function_input, function_output]) # boolean data series data = pd.Series([True, False, True]) function_input = EnsureSeriesDataTypeInput(data=data) function_output = EnsureSeriesDataTypeOutput(data_type=DataType.CATEGORICAL, new_data=data.astype("category")) test_cases.append([function_input, function_output]) # continous with empty values data = pd.Series([0.2, 0.3, 0.4]) function_input = EnsureSeriesDataTypeInput(data=data, values=[]) function_output = EnsureSeriesDataTypeOutput(data_type=DataType.CONTINUOUS, new_data=data) test_cases.append([function_input, function_output]) return test_cases def convert_positive_label_values_test_case(): test_cases = [] # series - int, label values - int data = pd.Series([1, 2, 3]).astype("category") function_input = EnsureSeriesDataTypeInput(data=data, values=[1, 2]) function_output = [1, 2] test_cases.append([function_input, function_output]) # series - int, label values - str data = pd.Series([1, 2, 3]).astype("category") function_input = EnsureSeriesDataTypeInput(data=data, values=["1", "2"]) function_output = [1, 2] test_cases.append([function_input, function_output]) # series - int, label values - string float data = pd.Series([1, 2, 3]).astype("category") function_input = EnsureSeriesDataTypeInput(data=data, values=["1.0", "2.0"]) function_output = [1, 2] test_cases.append([function_input, function_output]) # series - string, label values - string data = pd.Series(["1", "2", "3"]).astype("category") function_input = EnsureSeriesDataTypeInput(data=data, values=["1", "2"]) function_output = ["1", "2"] test_cases.append([function_input, function_output]) # series - string, label values - int data = pd.Series(["1", "2", "3"]).astype("category") function_input = EnsureSeriesDataTypeInput(data=data, values=[1, 2]) function_output = ["1", "2"] test_cases.append([function_input, function_output]) # series - string float, label values - float data = pd.Series(["1.0", "2.0", "3.0"]).astype("category") function_input = EnsureSeriesDataTypeInput(data=data, values=[1.0, 2.0]) function_output = ["1.0", "2.0"] test_cases.append([function_input, function_output]) # series - float, label values - float data = pd.Series([1.0, 2.0, 3.0]).astype("category") function_input = EnsureSeriesDataTypeInput(data=data, values=[1.0, 2.0]) function_output = [1.0, 2.0] test_cases.append([function_input, function_output]) # series - float, label values - string float data = pd.Series([1.0, 2.0, 3.0]).astype("category") function_input = EnsureSeriesDataTypeInput(data=data, values=["1.0", "2.0"]) function_output = [1.0, 2.0] test_cases.append([function_input, function_output]) # series - float, label values - int data = pd.Series([1.0, 2.0, 3.0]).astype("category") function_input = EnsureSeriesDataTypeInput(data=data, values=[1, 2]) function_output = [1.0, 2.0] test_cases.append([function_input, function_output]) # series - bool, label values - bool data = pd.Series([True, True, False]).astype("category") function_input = EnsureSeriesDataTypeInput(data=data, values=[True]) function_output = [True] test_cases.append([function_input, function_output]) # series - string, label values - bool data = pd.Series(["True", "True", "False"]).astype("category") function_input = EnsureSeriesDataTypeInput(data=data, values=[True]) function_output = ["True"] test_cases.append([function_input, function_output]) # series - bool, label values - int data = pd.Series([True, True, False]).astype("category") function_input = EnsureSeriesDataTypeInput(data=data, values=[1, 2, 0]) function_output = [True, True, False] test_cases.append([function_input, function_output]) # series - int, label values - bool data = pd.Series([1, 1, 0]).astype("category") function_input = EnsureSeriesDataTypeInput(data=data, values=[True, False]) function_output = [1, 0] test_cases.append([function_input, function_output]) return test_cases @pytest.mark.parametrize("function_input,function_output", ensure_series_data_type_test_cases()) def test_ensure_series_data_type(function_input, function_output): # Test the series_datatype function by the way data_type = series_datatype(*function_input) assert data_type == function_output.data_type # Test the ensure_series_data_type data_type, new_data = ensure_series_data_type(*function_input) assert data_type == function_output.data_type assert new_data.equals(function_output.new_data) @pytest.mark.parametrize("function_input,function_output", convert_positive_label_values_test_case()) def test_convert_positive_label_values(function_input, function_output): positive_label_values = convert_positive_label_values(*function_input) assert positive_label_values == function_output def test_calc_confusion_matrix_quadrants(): # binary (dfB, dfB_label, dfB_pos_label_idx, dfB_pred_label, dfB_pos_pred_label_idx) = dfBinary() dfB_features = dfB[0] assert (2, 3, 4, 3) == calc_confusion_matrix_quadrants(dfB_features, dfB_pos_label_idx, dfB_pos_pred_label_idx) dfB_sensitive_features = dfB_features == "F" assert (2, 2, 2, 1) == calc_confusion_matrix_quadrants( dfB_features[dfB_sensitive_features], dfB_pos_label_idx[dfB_sensitive_features], dfB_pos_pred_label_idx[dfB_sensitive_features], ) dfB_sensitive_features = dfB_features == "M" assert (0, 1, 2, 2) == calc_confusion_matrix_quadrants( dfB_features[dfB_sensitive_features], dfB_pos_label_idx[dfB_sensitive_features], dfB_pos_pred_label_idx[dfB_sensitive_features], ) # multi category dfM = pd.DataFrame( [ ("a", "white", 1, "red"), ("b", "white", 1, "blue"), ("b", "blue", 1, "blue"), ("b", "blue", 0, "red"), ("a", "green", 1, "white"), ("b", "white", 1, "white"), ("b", "white", 1, "green"), ("b", "white", 0, "white"), ] ) dfM.columns = ["x", "y", "z", "yhat"] dfM_features = dfM["x"] dfM_label = dfM["y"] dfM_predicted_label = dfM["yhat"] dfM_pos_label_idx = dfM_label == "blue" dfM_pos_pred_label_idx = dfM_predicted_label == "blue" assert (1, 5, 1, 1) == calc_confusion_matrix_quadrants(dfM_features, dfM_pos_label_idx, dfM_pos_pred_label_idx) dfM_sensitive_features = dfM_features == "a" assert (0, 2, 0, 0) == calc_confusion_matrix_quadrants( dfM_features[dfM_sensitive_features], dfM_pos_label_idx[dfM_sensitive_features], dfM_pos_pred_label_idx[dfM_sensitive_features], ) dfM_sensitive_features = dfM_features == "b" assert (1, 3, 1, 1) == calc_confusion_matrix_quadrants( dfM_features, dfM_pos_label_idx, dfM_pos_pred_label_idx[dfM_sensitive_features] )