# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"). You # may not use this file except in compliance with the License. A copy of # the License is located at # # http://aws.amazon.com/apache2.0/ # # or in the "license" file accompanying this file. This file is # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF # ANY KIND, either express or implied. See the License for the specific # language governing permissions and limitations under the License. import numpy as np import pytest from sklearn.utils.testing import assert_array_equal from sagemaker_sklearn_extension.impute import RobustImputer, RobustMissingIndicator, is_finite_numeric X_impute = np.array([[np.nan, 2, np.inf], [4, np.inf, 6], [10, np.nan, 10]]) X_impute_boolean_mask = np.array([[True, False, True], [False, True, False], [False, True, False]]) X_impute_string = X_impute.astype("O") X_impute_mixed = np.array([["2", "a"], ["inf", "nan"], ["-1e2", "10.0"], ["0.0", "foobar"], ["-inf", "8"]]) X_impute_mixed_boolean_mask = np.array([[False, True], [True, True], [False, False], [False, True], [True, False]]) X_impute_categorical = np.array([["hot dog"], ["hot dog"], ["hot dog"], ["banana"]]) X_imputed_median = np.array([[7.0, 2.0, 8.0], [4.0, 2.0, 6.0], [10.0, 2.0, 10.0]]) X_imputed_constant = np.array([[1.0, 2.0, 13.0], [4.0, 7.0, 6.0], [10.0, 7.0, 10.0]]) X_imputed_mixed = np.array([[2.0, 9.0], [0.0, 9.0], [-1e2, 10.0], [0.0, 9.0], [0.0, 8.0]]) X_imputed_categorical = np.array([["hot dog"], ["hot dog"], ["hot dog"], ["not hot dog"]]) transform_error_msg = "'transform' input X has 4 features per sample, expected 3 from 'fit' input" fill_values_error_msg = "'fill_values' should have length equal to number of features in X 3, got 5" @pytest.mark.parametrize( "val, expected", [(np.array([1738, "10", np.inf, np.nan, "foobar"]), np.array([True, True, False, False, False]))] ) def test_is_finite_numeric(val, expected): observed = is_finite_numeric(val) assert_array_equal(observed, expected) @pytest.mark.parametrize( "X, X_expected, strategy, fill_values", [ (X_impute_mixed, X_imputed_mixed, "median", None), (X_impute, X_imputed_median, "median", None), (X_impute_string, X_imputed_median, "median", None), (X_impute, X_imputed_constant, "constant", [1.0, 7.0, 13.0]), (X_impute_string, X_imputed_constant, "constant", [1.0, 7.0, 13.0]), ], ) def test_robust_imputer(X, X_expected, strategy, fill_values): robust_imputer = RobustImputer(strategy=strategy, fill_values=fill_values) robust_imputer.fit(X) X_observed = robust_imputer.transform(X) assert_array_equal(X_observed, X_expected) def test_robust_imputer_categorical_custom_function(): robust_imputer = RobustImputer( dtype=np.dtype("O"), strategy="constant", fill_values="not hot dog", mask_function=lambda x: x == "hot dog" ) robust_imputer.fit(X_impute_categorical) X_observed = robust_imputer.transform(X_impute_categorical) assert_array_equal(X_observed, X_imputed_categorical) def test_robust_imputer_transform_dim_error(): with pytest.raises(ValueError, match=transform_error_msg): robust_imputer = RobustImputer() robust_imputer.fit(X_impute) robust_imputer.transform(np.zeros((3, 4))) def test_robust_imputer_fill_values_dim_error(): with pytest.raises(ValueError, match=fill_values_error_msg): robust_imputer = RobustImputer(strategy="constant", fill_values=np.zeros(5)) robust_imputer.fit(X_impute) @pytest.mark.parametrize( "X, boolean_mask_X", [(X_impute_mixed, X_impute_mixed_boolean_mask), (X_impute, X_impute_boolean_mask)] ) def test_robust_missing_indicator(X, boolean_mask_X): robust_indicator = RobustMissingIndicator() robust_indicator.fit(X) boolean_mask_X_observed = robust_indicator.transform(X) assert_array_equal(boolean_mask_X_observed, boolean_mask_X)