{
  "metadata": {
    "version": 1,
    "disable_limits": false,
    "instance_type": "ml.m5.4xlarge"
  },
  "nodes": [
    {
      "node_id": "0b3a3943-97b0-49e3-9894-830443f522ca",
      "type": "SOURCE",
      "operator": "sagemaker.s3_source_0.1",
      "parameters": {
        "dataset_definition": {
          "__typename": "S3CreateDatasetDefinitionOutput",
          "datasetSourceType": "S3",
          "name": "diabetic_data.csv",
          "description": null,
          "s3ExecutionContext": {
            "__typename": "S3ExecutionContext",
            "s3Uri": "s3://sagemaker-diabetes-AWS_ACCOUNT/diabetic_data.csv",
            "s3ContentType": "csv",
            "s3HasHeader": true,
            "s3FieldDelimiter": ",",
            "s3DirIncludesNested": false,
            "s3AddsFilenameColumn": false
          }
        }
      },
      "inputs": [],
      "outputs": [
        {
          "name": "default"
        }
      ]
    },
    {
      "node_id": "b3d30123-1423-4472-a251-b9ff24d9d381",
      "type": "TRANSFORM",
      "operator": "sagemaker.spark.infer_and_cast_type_0.1",
      "parameters": {},
      "trained_parameters": {
        "schema": {
          "encounter_id": "long",
          "patient_nbr": "long",
          "race": "string",
          "gender": "string",
          "age": "string",
          "weight": "string",
          "admission_type_id": "long",
          "discharge_disposition_id": "long",
          "admission_source_id": "long",
          "time_in_hospital": "long",
          "payer_code": "string",
          "medical_specialty": "string",
          "num_lab_procedures": "long",
          "num_procedures": "long",
          "num_medications": "long",
          "number_outpatient": "long",
          "number_emergency": "long",
          "number_inpatient": "long",
          "diag_1": "long",
          "diag_2": "long",
          "diag_3": "long",
          "number_diagnoses": "long",
          "max_glu_serum": "string",
          "A1Cresult": "string",
          "metformin": "string",
          "repaglinide": "string",
          "nateglinide": "string",
          "chlorpropamide": "string",
          "glimepiride": "string",
          "acetohexamide": "string",
          "glipizide": "string",
          "glyburide": "string",
          "tolbutamide": "string",
          "pioglitazone": "string",
          "rosiglitazone": "string",
          "acarbose": "string",
          "miglitol": "string",
          "troglitazone": "string",
          "tolazamide": "string",
          "examide": "string",
          "citoglipton": "string",
          "insulin": "string",
          "glyburide-metformin": "string",
          "glipizide-metformin": "string",
          "glimepiride-pioglitazone": "string",
          "metformin-rosiglitazone": "string",
          "metformin-pioglitazone": "string",
          "change": "string",
          "diabetesMed": "string",
          "readmitted": "string"
        }
      },
      "inputs": [
        {
          "name": "default",
          "node_id": "0b3a3943-97b0-49e3-9894-830443f522ca",
          "output_name": "default"
        }
      ],
      "outputs": [
        {
          "name": "default"
        }
      ]
    },
    {
      "node_id": "a0fcc4ad-932f-4c61-b04f-85165ec49f54",
      "type": "TRANSFORM",
      "operator": "sagemaker.spark.manage_columns_0.1",
      "parameters": {
        "operator": "Move column",
        "move_column_parameters": {
          "move_type": "Move to start",
          "move_to_start_parameters": {
            "column_to_move": "readmitted"
          }
        },
        "drop_column_parameters": {}
      },
      "inputs": [
        {
          "name": "df",
          "node_id": "b3d30123-1423-4472-a251-b9ff24d9d381",
          "output_name": "default"
        }
      ],
      "outputs": [
        {
          "name": "default"
        }
      ]
    },
    {
      "node_id": "9c491942-6270-410a-8734-dafaa3bee672",
      "type": "TRANSFORM",
      "operator": "sagemaker.spark.custom_code_0.1",
      "parameters": {
        "operator": "Python (User-Defined Function)",
        "udf_parameters": {
          "return_type": "float",
          "udf_mode": "Pandas",
          "input_col": "readmitted",
          "output_col": "readmitted",
          "pandas_code": "import pandas as pd\n\n\ndef custom_func(series: pd.Series) -> pd.Series:\n    '''Encode the readmission label as a float flag.'''\n    # Only the literal 'NO' means not readmitted; every other value maps to 1.0.\n    return series.apply(lambda x: 0.0 if str(x) == 'NO' else 1.0)\n"
        },
        "pyspark_parameters": {},
        "name": "readmitted"
      },
      "inputs": [
        {
          "name": "df",
          "node_id": "a0fcc4ad-932f-4c61-b04f-85165ec49f54",
          "output_name": "default"
        }
      ],
      "outputs": [
        {
          "name": "default"
        }
      ]
    },
    {
      "node_id": "e589c34a-c6ec-4d8c-9549-87550098951a",
      "type": "TRANSFORM",
      "operator": "sagemaker.spark.manage_columns_0.1",
      "parameters": {
        "operator": "Drop column",
        "drop_column_parameters": {
          "column_to_drop": [
            "payer_code",
            "encounter_id",
            "patient_nbr",
            "weight",
            "medical_specialty",
            "acarbose",
            "metformin-pioglitazone",
            "acetohexamide",
            "metformin-rosiglitazone",
            "glimepiride",
            "glimepiride-pioglitazone",
            "glipizide",
            "glyburide-metformin",
            "examide",
            "troglitazone",
            "miglitol",
            "citoglipton",
            "glipizide-metformin",
            "chlorpropamide",
            "tolbutamide",
            "glyburide",
            "tolazamide",
            "nateglinide"
          ]
        }
      },
      "inputs": [
        {
          "name": "df",
          "node_id": "9c491942-6270-410a-8734-dafaa3bee672",
          "output_name": "default"
        }
      ],
      "outputs": [
        {
          "name": "default"
        }
      ]
    },
    {
      "node_id": "4ea28cf4-b062-494c-a49f-6bb840d9128b",
      "type": "TRANSFORM",
      "operator": "sagemaker.spark.handle_missing_0.1",
      "parameters": {
        "operator": "Fill missing",
        "fill_missing_parameters": {
          "input_column": [
            "diag_1",
            "diag_2",
            "diag_3"
          ],
          "fill_value": "0"
        },
        "impute_parameters": {
          "column_type": "Numeric",
          "numeric_parameters": {
            "strategy": "Approximate Median"
          }
        }
      },
      "inputs": [
        {
          "name": "df",
          "node_id": "e589c34a-c6ec-4d8c-9549-87550098951a",
          "output_name": "default"
        }
      ],
      "outputs": [
        {
          "name": "default"
        }
      ]
    },
    {
      "node_id": "13fa0709-b2a5-4e92-9f72-eb247015018d",
      "type": "TRANSFORM",
      "operator": "sagemaker.spark.search_and_edit_0.1",
      "parameters": {
        "operator": "Find and replace substring",
        "find_and_replace_substring_parameters": {
          "input_column": [
            "race"
          ],
          "pattern": "\\?",
          "replacement": "Unknown"
        }
      },
      "inputs": [
        {
          "name": "df",
          "node_id": "4ea28cf4-b062-494c-a49f-6bb840d9128b",
          "output_name": "default"
        }
      ],
      "outputs": [
        {
          "name": "default"
        }
      ]
    },
    {
      "node_id": "e440e602-6db8-478d-a99f-82cba34c3cf3",
      "type": "TRANSFORM",
      "operator": "sagemaker.spark.custom_code_0.1",
      "parameters": {
        "operator": "Python (User-Defined Function)",
        "udf_parameters": {
          "return_type": "string",
          "udf_mode": "Pandas",
          "input_col": "diag_1",
          "output_col": "diag_1",
          "pandas_code": "import pandas as pd\n\n\ndef custom_func(series: pd.Series) -> pd.Series:\n    '''Replace each diagnosis code with a coarse disease group label.'''\n\n    def to_group(code):\n        text = str(code)\n        # V and E codes are not numeric, so classify them before parsing.\n        if 'V' in text or 'E' in text:\n            return 'other'\n        try:\n            # int(float(...)) accepts 250, 250.0 and '250.83' alike.\n            number = int(float(text))\n        except (ValueError, OverflowError):\n            # Missing or unparseable codes (NaN, None, '?') get no specific group.\n            return 'other'\n        if 390 <= number < 460 or number == 785:\n            return 'circulatory'\n        if 460 <= number < 520 or number == 786:\n            return 'respiratory'\n        if 520 <= number < 580 or number == 787:\n            return 'digestive'\n        if number == 250:\n            return 'diabetes'\n        if 800 <= number < 1000:\n            return 'injury'\n        if 710 <= number < 740:\n            return 'musculoskeletal'\n        if 580 <= number < 630 or number == 788:\n            return 'genitourinary'\n        if 140 <= number < 240:\n            return 'neoplasms'\n        if 630 <= number < 680:\n            # Misspelled label kept on purpose: it ends up in the one-hot\n            # column names, so renaming it would change the output schema.\n            return 'pregnecy'\n        # Everything else, including the 0 used to fill missing diagnoses.\n        return 'other'\n\n    return series.apply(to_group)\n"
        },
        "pyspark_parameters": {},
        "name": "diag-1"
      },
      "inputs": [
        {
          "name": "df",
          "node_id": "13fa0709-b2a5-4e92-9f72-eb247015018d",
          "output_name": "default"
        }
      ],
      "outputs": [
        {
          "name": "default"
        }
      ]
    },
    {
      "node_id": "6b5b607a-03b9-4133-8ac8-12b2540e28ab",
      "type": "TRANSFORM",
      "operator": "sagemaker.spark.custom_code_0.1",
      "parameters": {
        "operator": "Python (User-Defined Function)",
        "udf_parameters": {
          "return_type": "string",
          "udf_mode": "Pandas",
          "input_col": "diag_2",
          "output_col": "diag_2",
          "pandas_code": "import pandas as pd\n\n\ndef custom_func(series: pd.Series) -> pd.Series:\n    '''Replace each diagnosis code with a coarse disease group label.'''\n\n    def to_group(code):\n        text = str(code)\n        # V and E codes are not numeric, so classify them before parsing.\n        if 'V' in text or 'E' in text:\n            return 'other'\n        try:\n            # int(float(...)) accepts 250, 250.0 and '250.83' alike.\n            number = int(float(text))\n        except (ValueError, OverflowError):\n            # Missing or unparseable codes (NaN, None, '?') get no specific group.\n            return 'other'\n        if 390 <= number < 460 or number == 785:\n            return 'circulatory'\n        if 460 <= number < 520 or number == 786:\n            return 'respiratory'\n        if 520 <= number < 580 or number == 787:\n            return 'digestive'\n        if number == 250:\n            return 'diabetes'\n        if 800 <= number < 1000:\n            return 'injury'\n        if 710 <= number < 740:\n            return 'musculoskeletal'\n        if 580 <= number < 630 or number == 788:\n            return 'genitourinary'\n        if 140 <= number < 240:\n            return 'neoplasms'\n        if 630 <= number < 680:\n            # Misspelled label kept on purpose: it ends up in the one-hot\n            # column names, so renaming it would change the output schema.\n            return 'pregnecy'\n        # Everything else, including the 0 used to fill missing diagnoses.\n        return 'other'\n\n    return series.apply(to_group)\n"
        },
        "pyspark_parameters": {},
        "name": "diag-2"
      },
      "inputs": [
        {
          "name": "df",
          "node_id": "e440e602-6db8-478d-a99f-82cba34c3cf3",
          "output_name": "default"
        }
      ],
      "outputs": [
        {
          "name": "default"
        }
      ]
    },
    {
      "node_id": "5e825fe5-8545-4746-b27a-ea05971880cd",
      "type": "TRANSFORM",
      "operator": "sagemaker.spark.custom_code_0.1",
      "parameters": {
        "operator": "Python (User-Defined Function)",
        "udf_parameters": {
          "return_type": "string",
          "udf_mode": "Pandas",
          "input_col": "diag_3",
          "output_col": "diag_3",
          "pandas_code": "import pandas as pd\n\n\ndef custom_func(series: pd.Series) -> pd.Series:\n    '''Replace each diagnosis code with a coarse disease group label.'''\n\n    def to_group(code):\n        text = str(code)\n        # V and E codes are not numeric, so classify them before parsing.\n        if 'V' in text or 'E' in text:\n            return 'other'\n        try:\n            # int(float(...)) accepts 250, 250.0 and '250.83' alike.\n            number = int(float(text))\n        except (ValueError, OverflowError):\n            # Missing or unparseable codes (NaN, None, '?') get no specific group.\n            return 'other'\n        if 390 <= number < 460 or number == 785:\n            return 'circulatory'\n        if 460 <= number < 520 or number == 786:\n            return 'respiratory'\n        if 520 <= number < 580 or number == 787:\n            return 'digestive'\n        if number == 250:\n            return 'diabetes'\n        if 800 <= number < 1000:\n            return 'injury'\n        if 710 <= number < 740:\n            return 'musculoskeletal'\n        if 580 <= number < 630 or number == 788:\n            return 'genitourinary'\n        if 140 <= number < 240:\n            return 'neoplasms'\n        if 630 <= number < 680:\n            # Misspelled label kept on purpose: it ends up in the one-hot\n            # column names, so renaming it would change the output schema.\n            return 'pregnecy'\n        # Everything else, including the 0 used to fill missing diagnoses.\n        return 'other'\n\n    return series.apply(to_group)\n"
        },
        "pyspark_parameters": {},
        "name": "diag-3"
      },
      "inputs": [
        {
          "name": "df",
          "node_id": "6b5b607a-03b9-4133-8ac8-12b2540e28ab",
          "output_name": "default"
        }
      ],
      "outputs": [
        {
          "name": "default"
        }
      ]
    },
    {
      "node_id": "c413d501-02e3-4817-b10a-0ac6faf7d41a",
      "type": "TRANSFORM",
      "operator": "sagemaker.spark.custom_code_0.1",
      "parameters": {
        "operator": "Python (User-Defined Function)",
        "udf_parameters": {
          "return_type": "string",
          "udf_mode": "Pandas",
          "input_col": "admission_type_id",
          "output_col": "admission_type_id",
          "pandas_code": "import pandas as pd\n\n\ndef custom_func(series: pd.Series) -> pd.Series:\n    '''Translate numeric admission type ids into readable labels.'''\n    labels = {\n        1: 'Emergency',\n        2: 'Urgent',\n        3: 'Elective',\n        4: 'Newborn',\n        5: 'Unknown',\n        6: 'Unknown',\n        8: 'Unknown',\n    }\n    # Any id not listed above is labelled 'TraumaCenter'.\n    return series.apply(lambda x: labels.get(int(x), 'TraumaCenter'))\n"
        },
        "pyspark_parameters": {},
        "name": "admission-type-id"
      },
      "inputs": [
        {
          "name": "df",
          "node_id": "5e825fe5-8545-4746-b27a-ea05971880cd",
          "output_name": "default"
        }
      ],
      "outputs": [
        {
          "name": "default"
        }
      ]
    },
    {
      "node_id": "5c7bf83d-6a99-4b0e-b3e2-a6a0d3d30f05",
      "type": "TRANSFORM",
      "operator": "sagemaker.spark.custom_code_0.1",
      "parameters": {
        "operator": "Python (User-Defined Function)",
        "udf_parameters": {
          "return_type": "string",
          "udf_mode": "Pandas",
          "input_col": "discharge_disposition_id",
          "output_col": "discharge_disposition_id",
          "pandas_code": "import pandas as pd\n\n\ndef custom_func(series: pd.Series) -> pd.Series:\n    '''Translate numeric discharge disposition ids into readable labels.'''\n    labels = {7: 'LeftAMA', 9: 'InPatient', 12: 'OutPatient'}\n    labels.update(dict.fromkeys(\n        [1, 2, 3, 4, 5, 6, 8, 10, 15, 16, 17, 22, 23, 24, 27, 28, 29, 30],\n        'Discharged'))\n    labels.update(dict.fromkeys([11, 19, 20, 21], 'Expired'))\n    labels.update(dict.fromkeys([13, 14], 'Hospice'))\n    # Any id not listed above is labelled 'Unknown'.\n    return series.apply(lambda x: labels.get(int(x), 'Unknown'))\n"
        },
        "pyspark_parameters": {},
        "name": "discharge-disposition-id"
      },
      "inputs": [
        {
          "name": "df",
          "node_id": "c413d501-02e3-4817-b10a-0ac6faf7d41a",
          "output_name": "default"
        }
      ],
      "outputs": [
        {
          "name": "default"
        }
      ]
    },
    {
      "node_id": "b6d7bd2b-611c-4969-80e5-a2c9dfea4d78",
      "type": "TRANSFORM",
      "operator": "sagemaker.spark.custom_code_0.1",
      "parameters": {
        "operator": "Python (User-Defined Function)",
        "udf_parameters": {
          "return_type": "string",
          "udf_mode": "Pandas",
          "input_col": "admission_source_id",
          "output_col": "admission_source_id",
          "pandas_code": "import pandas as pd\n\n\ndef custom_func(series: pd.Series) -> pd.Series:\n    '''Translate numeric admission source ids into readable labels.'''\n    labels = {\n        7: 'Emergency',\n        8: 'Court',\n        11: 'NormalDelivery',\n        23: 'BornInside',\n    }\n    labels.update(dict.fromkeys([1, 2, 3], 'Referral'))\n    labels.update(dict.fromkeys([4, 5, 6, 10, 18, 19, 22, 25, 26], 'Transfer'))\n    labels.update(dict.fromkeys([9, 15, 17, 20, 21], 'Unknown'))\n    labels.update(dict.fromkeys([12, 13, 14], 'AbnormalDelivery'))\n    # Any id not listed above is labelled 'BornOutside'.\n    return series.apply(lambda x: labels.get(int(x), 'BornOutside'))\n"
        },
        "pyspark_parameters": {},
        "name": "admission-source-id"
      },
      "inputs": [
        {
          "name": "df",
          "node_id": "5c7bf83d-6a99-4b0e-b3e2-a6a0d3d30f05",
          "output_name": "default"
        }
      ],
      "outputs": [
        {
          "name": "default"
        }
      ]
    },
    {
      "node_id": "48aab63d-84d2-4eb4-aed7-23210ebc3ab9",
      "type": "TRANSFORM",
      "operator": "sagemaker.spark.manage_rows_0.1",
      "parameters": {
        "operator": "Drop duplicates",
        "drop_duplicates_parameters": {},
        "sort_parameters": {
          "order": "Ascending"
        }
      },
      "inputs": [
        {
          "name": "df",
          "node_id": "b6d7bd2b-611c-4969-80e5-a2c9dfea4d78",
          "output_name": "default"
        }
      ],
      "outputs": [
        {
          "name": "default"
        }
      ]
    },
    {
      "node_id": "b31d277f-ecf4-48bc-bddd-fc19e8b30254",
      "type": "TRANSFORM",
      "operator": "sagemaker.spark.balance_data_0.1",
      "parameters": {
        "operator": "SMOTE",
        "ratio": 1,
        "smote_params": {
          "num_neighbors": 10
        },
        "target_column": "readmitted"
      },
      "inputs": [
        {
          "name": "df",
          "node_id": "48aab63d-84d2-4eb4-aed7-23210ebc3ab9",
          "output_name": "default"
        }
      ],
      "outputs": [
        {
          "name": "default"
        }
      ]
    },
    {
      "node_id": "d593101e-278b-4330-9779-b6e02fbeb99e",
      "type": "TRANSFORM",
      "operator": "sagemaker.spark.encode_categorical_0.1",
      "parameters": {
        "operator": "One-hot encode",
        "one_hot_encode_parameters": {
          "invalid_handling_strategy": "Keep",
          "drop_last": false,
          "output_style": "Columns",
          "input_column": [
            "race",
            "gender",
            "age",
            "diag_1",
            "diag_2",
            "diag_3",
            "max_glu_serum",
            "A1Cresult",
            "metformin",
            "repaglinide",
            "pioglitazone",
            "rosiglitazone",
            "insulin",
            "change",
            "diabetesMed",
            "admission_type_id",
            "discharge_disposition_id",
            "admission_source_id"
          ]
        },
        "ordinal_encode_parameters": {
          "invalid_handling_strategy": "Replace with NaN"
        }
      },
      "inputs": [
        {
          "name": "df",
          "node_id": "b31d277f-ecf4-48bc-bddd-fc19e8b30254",
          "output_name": "default"
        }
      ],
      "outputs": [
        {
          "name": "default"
        }
      ]
    }
  ]
}