{ "cells": [ { "cell_type": "markdown", "id": "47342590", "metadata": {}, "source": [ "# ISO20022 Datasets for ML Prototype\n", "This notebook creates Legal Entity Identifier (LEI) and fake Business Identification Code (BIC) datasets for use in a machine learning prototype for predicting if a pacs.008 message will be processed without exceptions. \n", "\n", "LEI dataset uses [GLEIF Golden Copy](https://www.gleif.org/en/lei-data/gleif-golden-copy/download-the-golden-copy#/) downloaded from Global Legal Entity Indentifier Foundation (GLEIF). The LEI dataset is a subset used for ML prototype.\n", "\n", "BIC dataset is a fake dataset generated using [Faker](https://faker.readthedocs.io/en/master/).\n" ] }, { "cell_type": "markdown", "id": "a2209d24", "metadata": {}, "source": [ "## Basic setup" ] }, { "cell_type": "code", "execution_count": null, "id": "6e4deba1", "metadata": {}, "outputs": [], "source": [ "import numpy as np # linear algebra\n", "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import nltk\n", "from nltk.corpus import stopwords\n", "import string\n", "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n", "from sklearn.decomposition import TruncatedSVD\n", "from sklearn import ensemble, metrics, model_selection, naive_bayes\n", "from sklearn.utils import shuffle\n", "color = sns.color_palette()\n", "\n", "%matplotlib inline" ] }, { "cell_type": "markdown", "id": "97fc47d9", "metadata": {}, "source": [ "## LEI Database" ] }, { "cell_type": "code", "execution_count": null, "id": "a4f33cf6", "metadata": {}, "outputs": [], "source": [ "lei_full_df = pd.read_csv(\"iso20022-data/20210408-0000-gleif-goldencopy-lei2-golden-copy.csv\")\n", "\n", "print(\"Number of rows in full LEI dataset : \",lei_full_df.shape[0])\n", "\n", "print(f\"Shape: {lei_full_df.shape}\")\n", "print(f\"Columns: {lei_full_df.columns}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "8c62be94", "metadata": {}, "outputs": [], "source": [ "lei_full_df.head()" ] }, { "cell_type": "code", "execution_count": null, "id": "061d4694", "metadata": {}, "outputs": [], "source": [ "lei_selected_df = lei_full_df.loc[lei_full_df['Entity.LegalAddress.Country'].isin(['US', 'CA', 'IN', 'GB', 'TH', 'MX', 'IE'])]\n", "print(f\"Shape of selected countries: {lei_selected_df.shape}\")\n", "\n", "us_df = (lei_full_df.loc[lei_full_df['Entity.LegalAddress.Country'].isin(['US'])]).head(500)\n", "print(f\"US shape: {us_df.shape}\")\n", "ca_df = (lei_full_df.loc[lei_full_df['Entity.LegalAddress.Country'].isin(['CA'])]).head(200)\n", "print(f\"Canada shape: {ca_df.shape}\")\n", "in_df = (lei_full_df.loc[lei_full_df['Entity.LegalAddress.Country'].isin(['IN'])]).head(650)\n", "print(f\"India shape: {in_df.shape}\")\n", "gb_df = lei_full_df.loc[lei_full_df['Entity.LegalAddress.Country'].isin(['GB'])].head(300)\n", "print(f\"Great Britain shape: {gb_df.shape}\")\n", "th_df = (lei_full_df.loc[lei_full_df['Entity.LegalAddress.Country'].isin(['TH'])]).head(100)\n", "print(f\"Thailand shape: {th_df.shape}\")\n", "mx_df = lei_full_df.loc[lei_full_df['Entity.LegalAddress.Country'].isin(['MX'])].head(100)\n", "print(f\"Mexico shape: {mx_df.shape}\")\n", "ie_df = lei_full_df.loc[lei_full_df['Entity.LegalAddress.Country'].isin(['IE'])].head(150)\n", "print(f\"Ireland shape: {ie_df.shape}\")\n", "\n", "frames = [us_df, ca_df, in_df, gb_df, th_df, mx_df, ie_df]\n", "\n", "ml_prototype_df = pd.concat(frames)\n", "print(f\"ML Prototype shape: {ml_prototype_df.shape}\")\n", "\n", "ml_prototype_df = shuffle(ml_prototype_df)\n", "ml_prototype_df.to_csv('iso20022-data/lei_ml_prototype_records.csv', index=False)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "db89b8a4", "metadata": {}, "outputs": [], "source": [ "lei_small_df = pd.read_csv(\"iso20022-data/lei-records.csv\")\n", "print(\"Number of rows in full LEI dataset : \",lei_small_df.shape[0])\n", "lei_small_df.head()" ] }, { "cell_type": "markdown", "id": "7a135503", "metadata": {}, "source": [ "## BIC Database" ] }, { "cell_type": "code", "execution_count": null, "id": "7eb3ceab", "metadata": {}, "outputs": [], "source": [ "!pip install Faker" ] }, { "cell_type": "markdown", "id": "6c5bcba5", "metadata": {}, "source": [ "### Build Synthetic BIC Database" ] }, { "cell_type": "code", "execution_count": null, "id": "0ba6148c", "metadata": {}, "outputs": [], "source": [ "from faker import Faker \n", "import numpy as np\n", "import pandas as pd\n", "\n", "mod_flags = ['A','M','D']\n", "countries = ['US', 'CA', 'IN', 'GB', 'TH', 'MX', 'IE']\n", "locale_map = {\n", " 'US': 'en_US',\n", " 'CA': 'en_CA',\n", " 'IN': 'en_IN',\n", " 'GB': 'en_GB',\n", " 'TH': 'en_TH',\n", " 'MX': 'es_MX',\n", " 'IE': 'en_IE'\n", "}\n", "\n", "countries_dict = {\n", " 'US': 'UNITED STATES OF AMERICA',\n", " 'CA': 'CANADA',\n", " 'IN': 'INDIA',\n", " 'GB': 'GREAT BRITAIN',\n", " 'TH': 'Thailand',\n", " 'MX': 'MEXICO',\n", " 'IE': 'IRELAND'\n", "}\n", "branch_info = ['BRANCH', 'BANKING DEPARTMENT', 'REGIONAL OFFICE', 'OFFICE', 'HQ', 'SHOPPING MALL', 'INDUSTRIAL ESTATE']\n", "address2 = ['BUILDING', 'SHOP', 'UNIT', '']\n", "subtypes = ['SUPE', 'NSWB', 'BEID', 'CORP', 'PSPA', 'SSPA']\n", "value_added_services = ['AIMFIN', 'AIMFINTG+', 'FIN', 'FINSCO', 'FINTG+', 'ERPFIN', 'ERPFINTG+', 'FINTGT', 'FINTAS']\n", "extra_info = ['ALL', 'ADM', 'ADM C4', 'BRA', 'PAY', 'ALLBRA', 'DOC', 'CAL']\n", "\n", "# Instantiate Faker with multiple locales\n", "fake = Faker(['en_US', 'en_CA', 'en_IN', 'en_GB', 'en_TH', 'es_MX', 'en_IE']) \n", "print(f'Locales: {fake.locales}')\n", "fake1 = fake[locale_map['US']]\n", "print(f'Generator locale: {fake1}')\n", "\n", "def create_data(x): \n", " # dictionary \n", " bank = {} \n", " \n", " for i in range(0, x): \n", " country = fake.random_element(countries)\n", " generator = fake[locale_map[country]]\n", " \n", " bank[i] = {} \n", " bank[i]['TAG'] = 'FI'\n", " bank[i]['MODIFICATION FLAG'] = fake.random_element(mod_flags)\n", " bic = generator.swift(length=8)\n", " random_bic = bic[0:4] + country + bic[6:8]\n", " bank[i]['BIC CODE'] = random_bic\n", " bank[i]['BRANCH CODE'] = 'XXX'\n", " bank[i]['INSTITUTION NAME'] = generator.company()\n", " bank[i]['BRANCH INFORMATION'] = fake.random_element(branch_info)\n", " city = generator.city()\n", " bank[i]['CITY HEADING'] = city\n", " bank[i]['SUBTYPE INDICATION'] = fake.random_element(subtypes)\n", " bank[i]['VALUE ADDED SERVICES'] = fake.random_element(value_added_services)\n", " bank[i]['EXTRA INFO'] = fake.random_element(extra_info)\n", " bank[i]['PHYSICAL ADDRESS 1'] = generator.street_address()\n", " addr2 = generator.random_element(address2)\n", " bank[i]['PHYSICAL ADDRESS 2'] = '' if addr2 == '' else addr2 + ' ' + generator.building_number()\n", " bank[i]['PHYSICAL ADDRESS 3'] = ''\n", " bank[i]['PHYSICAL ADDRESS 4'] = ''\n", " bank[i]['LOCATION'] = city\n", " bank[i]['COUNTRY NAME'] = country\n", " bank[i]['POB NUMBER'] = 'POB ' + generator.building_number()\n", " bank[i]['POB LOCATION'] = city\n", " bank[i]['POB COUNTRY NAME'] = countries_dict[country] \n", " \n", " return bank\n", " \n", "df = pd.DataFrame(create_data(200)).transpose()\n", "df.to_csv('iso20022-data/bic_ml_prototype_records.csv', index=False)\n", "\n", "df.head(10)" ] }, { "cell_type": "markdown", "id": "a4982f73", "metadata": {}, "source": [ "## ISO20022 PACS.008 Synthetic Dataset" ] }, { "cell_type": "code", "execution_count": null, "id": "466d19e5", "metadata": {}, "outputs": [], "source": [ "print(f\"Unique BICs: {len(pd.unique(df['BIC CODE']))}\")\n", "print(f\"Unique BICs: {len(df['BIC CODE'].unique())}\")\n", "#'US', 'CA', 'IN', 'GB', 'TH', 'MX', 'IE'\n", "\n", "us_shape = df[df['BIC CODE'].astype(str).str.contains(\"US\")].shape\n", "print(f\"US BICs: {in_shape[0]}\")\n", "\n", "ca_shape = df[df['BIC CODE'].astype(str).str.contains(\"CA\")].shape\n", "print(f\"Canada BICs: {in_shape[0]}\")\n", "\n", "gb_shape = df[df['BIC CODE'].astype(str).str.contains(\"GB\")].shape\n", "print(f\"Great Britain BICs: {in_shape[0]}\")\n", "\n", "in_shape = df[df['BIC CODE'].astype(str).str.contains(\"IN\")].shape\n", "print(f\"India BICs: {in_shape[0]}\")\n", "\n", "th_shape = df[df['BIC CODE'].astype(str).str.contains(\"TH\")].shape\n", "print(f\"Thailand BICs: {in_shape[0]}\")\n", "\n", "mx_shape = df[df['BIC CODE'].astype(str).str.contains(\"MX\")].shape\n", "print(f\"Mexico BICs: {in_shape[0]}\")\n", "\n", "ie_shape = df[df['BIC CODE'].astype(str).str.contains(\"IE\")].shape\n", "print(f\"Ireland BICs: {in_shape[0]}\")\n", "\n", "print(\"All BICS \")\n", "print(f\"{df['BIC CODE'].unique()}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "11f5def9", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "conda_python3", "language": "python", "name": "conda_python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.13" } }, "nbformat": 4, "nbformat_minor": 5 }