# ISO20022 Datasets for ML Prototype
This notebook creates Legal Entity Identifier (LEI) and fake Business Identification Code (BIC) datasets for use in a machine learning prototype for predicting if a pacs.008 message will be processed without exceptions. 

The LEI dataset uses the [GLEIF Golden Copy](https://www.gleif.org/en/lei-data/gleif-golden-copy/download-the-golden-copy#/) downloaded from the Global Legal Entity Identifier Foundation (GLEIF). Only a subset of the Golden Copy is kept for the ML prototype.

The BIC dataset is a synthetic (fake) dataset generated using [Faker](https://faker.readthedocs.io/en/master/).


## Basic setup

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import ensemble, metrics, model_selection, naive_bayes
from sklearn.utils import shuffle
# Store seaborn's current color palette in `color`.
color = sns.color_palette()

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## LEI Database

In [None]:
# Read the complete GLEIF Golden Copy export into memory.
lei_full_df = pd.read_csv(
    "iso20022-data/20210408-0000-gleif-goldencopy-lei2-golden-copy.csv"
)

# Summarise what was loaded: row count, (rows, columns) shape and column index.
print("Number of rows in full LEI dataset : ", len(lei_full_df.index))
print(f"Shape: {lei_full_df.shape}")
print(f"Columns: {lei_full_df.columns}")

In [None]:
# Preview the first rows of the full LEI frame.
lei_full_df.head()

In [None]:
# Build the LEI subset used by the ML prototype.
#
# For each country: ISO country code -> (label used in the printed summary,
# maximum number of records to keep). Records are taken from the top of the
# Golden Copy in file order via `.head(n)`.
LEI_COUNTRY_SAMPLES = {
    'US': ('US', 500),
    'CA': ('Canada', 200),
    'IN': ('India', 650),
    'GB': ('Great Britain', 300),
    'TH': ('Thailand', 100),
    'MX': ('Mexico', 100),
    'IE': ('Ireland', 150),
}

# Fixed seed so the shuffled CSV is the same on every Restart & Run All.
SHUFFLE_SEED = 42

legal_country = lei_full_df['Entity.LegalAddress.Country']

# All records for the selected countries (reported for reference only).
lei_selected_df = lei_full_df.loc[legal_country.isin(list(LEI_COUNTRY_SAMPLES))]
print(f"Shape of selected countries: {lei_selected_df.shape}")

# One capped frame per country, in the order listed in LEI_COUNTRY_SAMPLES.
frames = []
for country_code, (country_label, max_rows) in LEI_COUNTRY_SAMPLES.items():
    country_df = lei_full_df.loc[legal_country == country_code].head(max_rows)
    print(f"{country_label} shape: {country_df.shape}")
    frames.append(country_df)

# Keep the original per-country names defined for any cell that refers to them.
us_df, ca_df, in_df, gb_df, th_df, mx_df, ie_df = frames

ml_prototype_df = pd.concat(frames)
print(f"ML Prototype shape: {ml_prototype_df.shape}")

# Shuffle so records are not grouped by country, then export.
ml_prototype_df = shuffle(ml_prototype_df, random_state=SHUFFLE_SEED)
ml_prototype_df.to_csv('iso20022-data/lei_ml_prototype_records.csv', index=False)


In [None]:
# Load the small LEI extract and preview it.
# NOTE(review): this reads "lei-records.csv", not the
# "lei_ml_prototype_records.csv" file written by the subset cell — confirm
# that a separate file is intended.
lei_small_df = pd.read_csv("iso20022-data/lei-records.csv")
# The label previously said "full LEI dataset", which describes a different frame.
print("Number of rows in small LEI dataset : ", lei_small_df.shape[0])
lei_small_df.head()

## BIC Database

In [None]:
!pip install Faker

### Build Synthetic BIC Database

In [None]:
from faker import Faker 
import numpy as np
import pandas as pd

# Lookup tables used to generate synthetic BIC directory records.

# Allowed values for the 'MODIFICATION FLAG' column.
mod_flags = ['A', 'M', 'D']

# ISO country codes covered by the prototype.
countries = ['US', 'CA', 'IN', 'GB', 'TH', 'MX', 'IE']

# Country code -> Faker locale used for names, cities and addresses.
locale_map = {
    'US': 'en_US',
    'CA': 'en_CA',
    'IN': 'en_IN',
    'GB': 'en_GB',
    'TH': 'en_TH',
    'MX': 'es_MX',
    'IE': 'en_IE'
}

# Country code -> country name written to 'POB COUNTRY NAME'.
# All names are upper-case; 'TH' was previously 'Thailand', which made the
# generated column inconsistent with the other six countries.
countries_dict = {
    'US': 'UNITED STATES OF AMERICA',
    'CA': 'CANADA',
    'IN': 'INDIA',
    'GB': 'GREAT BRITAIN',
    'TH': 'THAILAND',
    'MX': 'MEXICO',
    'IE': 'IRELAND'
}

# Candidate values for the categorical columns of a record.
branch_info = ['BRANCH', 'BANKING DEPARTMENT', 'REGIONAL OFFICE', 'OFFICE', 'HQ', 'SHOPPING MALL', 'INDUSTRIAL ESTATE']
# The empty string means "no second address line".
address2 = ['BUILDING', 'SHOP', 'UNIT', '']
subtypes = ['SUPE', 'NSWB', 'BEID', 'CORP', 'PSPA', 'SSPA']
value_added_services = ['AIMFIN', 'AIMFINTG+', 'FIN', 'FINSCO', 'FINTG+', 'ERPFIN', 'ERPFINTG+', 'FINTGT', 'FINTAS']
extra_info = ['ALL', 'ADM', 'ADM C4', 'BRA', 'PAY', 'ALLBRA', 'DOC', 'CAL']

# Multi-locale Faker proxy: one locale per supported country, taken from
# locale_map in its declared order.
fake = Faker(list(locale_map.values()))
print(f'Locales: {fake.locales}')

# Indexing the proxy by locale returns that locale's own generator.
fake1 = fake[locale_map['US']]
print(f'Generator locale: {fake1}')

def create_data(x, seed=None):
    """Generate ``x`` synthetic BIC directory records.

    Every record is assigned a random country from ``countries``. The Faker
    generator for that country's locale supplies the institution name, city and
    street address, and the country code replaces characters 5-6 of a random
    8-character SWIFT code.

    Parameters
    ----------
    x : int
        Number of records to generate. Zero or a negative value gives ``{}``.
    seed : int, optional
        If given, passed to ``Faker.seed`` before any record is generated so
        that the same seed is intended to reproduce the same records. The
        default ``None`` leaves the random state untouched.

    Returns
    -------
    dict
        ``{row_number: {column_name: value}}`` with row numbers ``0 .. x-1``.
    """
    if seed is not None:
        Faker.seed(seed)

    bank = {}

    for i in range(x):
        country = fake.random_element(countries)
        # Locale-specific generator for this record's country.
        generator = fake[locale_map[country]]

        # Draw the random values one by one, in the same order as the columns.
        mod_flag = fake.random_element(mod_flags)
        bic = generator.swift(length=8)
        institution = generator.company()
        branch = fake.random_element(branch_info)
        city = generator.city()
        subtype = fake.random_element(subtypes)
        services = fake.random_element(value_added_services)
        extra = fake.random_element(extra_info)
        street = generator.street_address()
        addr2 = generator.random_element(address2)
        # An empty pick means "no second address line"; no number is drawn then.
        address_line2 = addr2 + ' ' + generator.building_number() if addr2 else ''
        pob_number = 'POB ' + generator.building_number()

        bank[i] = {
            'TAG': 'FI',
            'MODIFICATION FLAG': mod_flag,
            # Keep the random bank and location parts, force the country part.
            'BIC CODE': bic[0:4] + country + bic[6:8],
            'BRANCH CODE': 'XXX',
            'INSTITUTION NAME': institution,
            'BRANCH INFORMATION': branch,
            'CITY HEADING': city,
            'SUBTYPE INDICATION': subtype,
            'VALUE ADDED SERVICES': services,
            'EXTRA INFO': extra,
            'PHYSICAL ADDRESS 1': street,
            'PHYSICAL ADDRESS 2': address_line2,
            'PHYSICAL ADDRESS 3': '',
            'PHYSICAL ADDRESS 4': '',
            'LOCATION': city,
            # NOTE(review): this stores the two-letter code while
            # 'POB COUNTRY NAME' stores the full name — confirm which is intended.
            'COUNTRY NAME': country,
            'POB NUMBER': pob_number,
            'POB LOCATION': city,
            'POB COUNTRY NAME': countries_dict[country],
        }

    return bank
 
# create_data returns {row_number: record}; pandas reads the outer keys as
# columns, so flip the frame to get one row per BIC record.
df = pd.DataFrame(create_data(200)).T

# Persist the synthetic BIC directory without the row-number index.
df.to_csv('iso20022-data/bic_ml_prototype_records.csv', index=False)

df.head(10)

## ISO20022 PACS.008 Synthetic Dataset

In [None]:
# Summarise the synthetic BIC directory: distinct codes and records per country.
print(f"Unique BICs: {len(df['BIC CODE'].unique())}")

# The country code occupies characters 5-6 of each generated BIC. Comparing
# that slice avoids false matches where the same two letters happen to occur
# in the random bank or location part of the code.
bic_country = df['BIC CODE'].astype(str).str[4:6]

# Country code -> label used in the printed summary.
BIC_COUNTRY_LABELS = {
    'US': 'US',
    'CA': 'Canada',
    'GB': 'Great Britain',
    'IN': 'India',
    'TH': 'Thailand',
    'MX': 'Mexico',
    'IE': 'Ireland',
}

bic_shapes = {}
for country_code, country_label in BIC_COUNTRY_LABELS.items():
    bic_shapes[country_code] = df[bic_country == country_code].shape
    # Report this country's own count (every line used to print India's).
    print(f"{country_label} BICs: {bic_shapes[country_code][0]}")

# Keep the original per-country names defined for any cell that refers to them.
us_shape = bic_shapes['US']
ca_shape = bic_shapes['CA']
gb_shape = bic_shapes['GB']
in_shape = bic_shapes['IN']
th_shape = bic_shapes['TH']
mx_shape = bic_shapes['MX']
ie_shape = bic_shapes['IE']

print("All BICS ")
print(f"{df['BIC CODE'].unique()}")