# In this notebook, we use unsupervised machine learning with anomaly detection to identify fraudulent Medicare providers using data from CMS that has been preprocessed using Data Wrangler.

## Setup

Import required libraries (install imblearn using pip if not present)

In [None]:
# Install imblearn, which provides the SMOTE / RandomUnderSampler / Pipeline imports used in the next cell
!pip install imblearn

In [None]:
import numpy as np 
import pandas as pd
import boto3
import os
import sagemaker
import seaborn as sns
import matplotlib.pyplot as plt
import io
import sklearn
from math import sqrt
from sagemaker import get_execution_role
from sagemaker import RandomCutForest
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import CSVSerializer
from sagemaker.amazon.amazon_estimator import get_image_uri
from sklearn.datasets import dump_svmlight_file 
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, cohen_kappa_score
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.datasets import dump_svmlight_file 
from collections import Counter
from sagemaker.s3 import S3Downloader

Enable the ability to see all columns and rows of data if the data size is big

In [None]:
# Show every column and every row when a DataFrame is displayed.
# The fully qualified "display.*" keys are used on purpose: in recent pandas
# versions the bare 'max_columns' / 'max_rows' patterns also match the
# "styler.render.*" options, which makes set_option raise an OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# SageMaker session; its default bucket is where this notebook keeps its S3 artifacts
session = sagemaker.Session()
bucket = session.default_bucket()
# Key prefix inside the bucket for everything this notebook writes
prefix = 'fraud-detect-demo/randomforest'
# Execution role, reused later when the Clarify processor is created
role = get_execution_role()
s3_client = boto3.client("s3")

Let's start by reading in the entire preprocessed Medicare dataset prepared for anomaly detection. This dataset has many more data elements than the dataset prepared for classification.

In [None]:
# Decompress the two gzipped parts of the dataset so the .csv files can be read in the next cell
!gzip -d processed_data_anomaly_detection1.csv.gz
!gzip -d processed_data_anomaly_detection2.csv.gz

In [None]:
# Load each decompressed part into its own DataFrame (comma-separated)
data1 = pd.read_csv('processed_data_anomaly_detection1.csv', delimiter=',')
data2 = pd.read_csv('processed_data_anomaly_detection2.csv', delimiter=',')

In [None]:
# Stack the two parts into one DataFrame. pd.concat replaces DataFrame.append,
# which was deprecated in pandas 1.4 and removed in pandas 2.0; as with append,
# the original row index of each part is kept.
data = pd.concat([data1, data2])

In [None]:
# Preview the first rows of the combined dataset
data.head()

## Investigate and process the data

Check data for any nulls

In [None]:
# True if at least one cell anywhere in the dataset is null
data.isnull().values.any()

In [None]:
# Count how many rows carry each value of the fraud label
data['fraudulent_provider'].value_counts()

In [None]:
# Hold out 20% of the rows for testing and keep the remaining 80% for training
RANDOM_SEED = 314  # fixes the shuffle so the split is reproducible across runs
TEST_PCT = 0.2  # fraction of the data reserved for the test set

# Stratify on the label so both splits keep the same fraud / non-fraud ratio.
# RANDOM_SEED is passed explicitly; without it the split changes on every run.
train, test = train_test_split(
    data,
    test_size=TEST_PCT,
    stratify=data['fraudulent_provider'],
    random_state=RANDOM_SEED,
)

## Training and Prediction - Unsupervised Learning (Anomaly Detection)

We will use anomaly detection, an unsupervised learning technique, to identify fraud.

In anomaly detection, you ideally train on the "normal" data (in this case, non-fraud) and then test on a mix of fraud and non-fraud data.

In [None]:
# Keep only the training rows labelled 0 (non-fraud), then remove the label column
train_x = train.loc[train['fraudulent_provider'] == 0].drop(columns=['fraudulent_provider'])

In [None]:
# Convert the training features to a float32 NumPy array for the RCF record set
X_train = train_x.values.astype('float32')

In [None]:
# Configure the Random Cut Forest training job
rcf = RandomCutForest(
    role=get_execution_role(),
    # Training cluster
    instance_type='ml.c5.4xlarge',
    instance_count=2,
    # Forest hyperparameters
    num_trees=1000,
    num_samples_per_tree=2048,
    # S3 locations for the staged training records and for the job output
    data_location='s3://{}/{}/'.format(bucket, prefix),
    output_path='s3://{}/{}/output'.format(bucket, prefix),
)

In [None]:
# Wrap the training matrix as a record set for the 'train' channel, then start training
train_records = rcf.record_set(X_train, channel='train')
rcf.fit(train_records)

### Host Random Cut Forest

Once we have a trained model we can deploy it and get some predictions for our test set. 

In [None]:
# Deploy the trained model to an endpoint; requests are serialized as CSV
# and responses are parsed from JSON
rcf_predictor = rcf.deploy(
    instance_type='ml.c4.xlarge',
    initial_instance_count=1,
    serializer=CSVSerializer(),
    deserializer=JSONDeserializer(),
)

### Evaluate Results

With the model deployed, let's see how it performs in terms of separating fraudulent from legitimate providers.

In [None]:
def predict_rcf(current_predictor, d, rows=500):
    """Score every row of ``d`` with a deployed Random Cut Forest predictor.

    The data is sent in batches of at most ``rows`` rows, so a single request
    never carries the whole array.

    Args:
        current_predictor: Object whose ``predict(batch)`` returns a mapping
            shaped like ``{'scores': [{'score': <value>}, ...]}``.
        d: Array of feature rows; must expose ``.shape`` and be accepted by
            ``np.array_split``.
        rows: Maximum number of rows per request; must be at least 1.

    Returns:
        1-D NumPy array with one score per input row, in input order.

    Raises:
        ValueError: If ``rows`` is smaller than 1.
    """
    batch_rows = int(rows)
    if batch_rows < 1:
        raise ValueError("rows must be a positive integer")

    n_rows = d.shape[0]
    if n_rows == 0:
        # Nothing to score: skip the endpoint instead of sending an empty payload.
        return np.array([])

    # Integer ceiling division: the fewest batches that keep each one <= rows.
    n_batches = (n_rows + batch_rows - 1) // batch_rows

    scores = []
    for batch in np.array_split(d, n_batches):
        response = current_predictor.predict(batch)
        scores.extend(item['score'] for item in response['scores'])

    return np.array(scores)

In [None]:
# Partition the held-out rows by label value: 1 -> frauds, 0 -> nonfrauds
frauds = test[test['fraudulent_provider'] == 1]
nonfrauds = test[test['fraudulent_provider'] == 0]

In [None]:
# Remove the label column so only the feature columns remain
frauds = frauds.drop(columns=['fraudulent_provider'])
nonfrauds = nonfrauds.drop(columns=['fraudulent_provider'])

In [None]:
# Convert both feature sets to float32 NumPy arrays (same dtype as the training matrix)
positives= frauds.values.astype('float32')
negatives= nonfrauds.values.astype('float32')

In [None]:
# Score the fraud rows first, then the non-fraud rows, through the deployed endpoint
positives_scores, negatives_scores = (
    predict_rcf(rcf_predictor, features) for features in (positives, negatives)
)

In [None]:
# Overlay the score distributions of the fraud and non-fraud test rows on one figure
sns.set(color_codes=True)
sns.set(rc={'figure.figsize':(11.7,8.27)})
# NOTE(review): distplot is flagged as deprecated in newer seaborn releases - confirm
# against the installed version before upgrading.
sns.distplot(positives_scores, label='fraud', bins=20)
sns.distplot(negatives_scores, label='not-fraud', bins=20)
plt.legend()

## Check the data for Bias

In [None]:
# Convert the facet column that we will check for bias ('female') from float to int
# for setting up the SageMaker Clarify bias processing
test['referring_provider_gender_f'] = test['referring_provider_gender_f'].astype(int)

In [None]:
from sagemaker.s3 import S3Uploader

# NOTE: despite the file name, train.csv holds the held-out *test* split,
# written without the index and without a header row.
test.to_csv("train.csv", index=False, header=False)

# Upload the file under the notebook's S3 prefix; the returned URI is the input of the Clarify data configs
train_uri = S3Uploader.upload("train.csv", "s3://{}/{}".format(bucket, prefix))

IMPORTANT: Since we are running RCF on a large number of columns, it is recommended to use an instance with high memory and increase the number of instances to the extent possible

In [None]:
from sagemaker import clarify

# Processor shared by the bias job and the explainability job below
clarify_processor = clarify.SageMakerClarifyProcessor(
 role=role, instance_count=3, instance_type="ml.r5.24xlarge", sagemaker_session=session
)

In [None]:
# S3 location where the bias job writes its output
bias_report_output_path = "s3://{}/{}/clarify-bias".format(bucket, prefix)
# The uploaded CSV has no header row, so column names are supplied here.
# They are taken from `train`, which comes from the same split of `data` as the uploaded `test` rows.
bias_data_config = clarify.DataConfig(
 s3_data_input_path=train_uri,
 s3_output_path=bias_report_output_path,
 label="fraudulent_provider",
 headers=train.columns.to_list(),
 dataset_type="text/csv"
)

Update the `model_name` below with the `ModelName` of the model behind the SageMaker endpoint you deployed. It is available in the parameters returned by the `describe_endpoint_config` command of the SageMaker client `smclient`.
To get the endpoint config name, first run the `describe_endpoint` command on the endpoint you deployed.

In [None]:
# Low-level SageMaker client, used below to look up the endpoint configuration
smclient = boto3.client(service_name='sagemaker')

In [None]:
# The EndpointConfigName value is a placeholder: replace it with the real endpoint config name before running
smclient.describe_endpoint_config(EndpointConfigName= 'get name from Console')

IMPORTANT: Use the `ModelName` from the output above as the `model_name` below. Also, since we are running RCF on a large number of columns, it is recommended to use an instance with high memory and to increase the number of instances as far as possible.

In [None]:
# Model settings shared by the bias and explainability jobs.
# model_name is a placeholder and must be replaced with the deployed model's name.
model_config = clarify.ModelConfig(
 model_name="replace with model name",
 instance_type="ml.r5.24xlarge",
 instance_count=3,
 accept_type="text/csv",
 content_type="text/csv",
)

In [None]:
# Predicted-label configuration created with no arguments (library defaults)
predictions_config = clarify.ModelPredictedLabelConfig()

In [None]:
# Bias is measured on the 'referring_provider_gender_f' facet, with facet value 1 as the group of interest.
# NOTE(review): 1.25 is presumably a score cutoff for the positive outcome - confirm it suits the 0/1 label column.
bias_config = clarify.BiasConfig(
 label_values_or_threshold=[1.25], facet_name="referring_provider_gender_f", facet_values_or_threshold=[1]
)

In [None]:
# Run the post-training bias job for the two requested metrics
# (DI / RD - presumably Disparate Impact and Recall Difference; confirm against the Clarify docs)
clarify_processor.run_post_training_bias(
 data_config=bias_data_config,
 data_bias_config=bias_config,
 model_config=model_config,
 model_predicted_label_config=predictions_config,
 methods=["DI","RD"],
 logs=False
)

In [None]:
# Display the S3 path that holds the bias job output
bias_report_output_path

In [None]:
# Download report.pdf from the bias output path into a local folder
S3Downloader.download("{}/report.pdf".format(bias_report_output_path), "../Fraud Detection/Bias/AnomalyDetection")

To view the bias metrics, open the downloaded report.pdf. Alternatively, you can view the results in Studio under the Experiments tab.

## Evaluate which features contribute to the model predictions (Explainability)

The number of samples below is critical for explainability. Ideally, you need at least 5 times as many samples as there are columns in the dataset to allow enough permutations and combinations. However, it is important to keep in mind that this dramatically increases the memory required. In this case, RCF already requires a significant amount of memory, so we keep the number of samples at 200.

In [None]:
# SHAP settings: 200 samples, mean-absolute aggregation, and local SHAP values saved
shap_config = clarify.SHAPConfig(
 num_samples=200,
 agg_method="mean_abs",
 save_local_shap_values=True,
)

# S3 location where the explainability job writes its output
explainability_output_path = "s3://{}/{}/clarify-explainability".format(bucket, prefix)
# Same uploaded CSV and column names as the bias job, with a separate output path
explainability_data_config = clarify.DataConfig(
 s3_data_input_path=train_uri,
 s3_output_path=explainability_output_path,
 label="fraudulent_provider",
 headers=train.columns.to_list(),
 dataset_type="text/csv",
)

In [None]:
# Run the explainability job with the SHAP configuration against the configured model
clarify_processor.run_explainability(
 data_config=explainability_data_config,
 model_config=model_config,
 explainability_config=shap_config,
 logs=False
)

In [None]:
# Display the S3 path that holds the explainability job output
explainability_output_path

In [None]:
# Download report.pdf from the explainability output path into a local folder
S3Downloader.download("{}/report.pdf".format(explainability_output_path), "../Fraud Detection/Exp/AnomalyDetection")

## Clean Up

In [None]:
# Uncomment to clean up endpoints
# rcf_predictor.delete_endpoint()


## Data Acknowledgements

The dataset used to demonstrate the fraud detection solution has been collected and analysed from CMS:

https://data.cms.gov/provider-summary-by-type-of-service/medicare-physician-other-practitioners/medicare-physician-other-practitioners-by-provider-and-service

