""" A simple script for generating sample data for learning to give personalised offers. """

import gzip
import json
import logging
import random

import numpy as np
import pandas as pd

GENERATE_INBALANCED_DATA = False
NUM_INTERACTIONS_PER_USER = 3
FIRST_TIMESTAMP = 1591803782  # 2020-06-10, 18:43:02 (15:43:02 UTC)
LAST_TIMESTAMP = 1599579782  # 2020-09-08, 18:43:02 (15:43:02 UTC), exactly 90 days later
RANDOM_SEED = 1

IN_PRODUCTS_FILENAME = "src/products/src/products-service/data/products.yaml"
IN_USERS_FILENAME = "src/users/src/users-service/data/users.json.gz"
IN_OFFERS_FILENAME = "src/offers/src/offers-service/data/offers.json"

# Where to put the generated data so that it is picked up by stage.sh
GENERATED_DATA_ROOT = "src/aws-lambda/personalize-pre-create-resources/data"


def generate_data(interactions_filename, users_df, offers_df):
    """Write simulated user-offer interactions to a CSV file.

    Every user in ``users_df`` receives ``NUM_INTERACTIONS_PER_USER``
    interactions. Offers are sampled at random (uniformly, or weighted by
    offer ID when ``GENERATE_INBALANCED_DATA`` is set), timestamps are spread
    evenly over ``[FIRST_TIMESTAMP, LAST_TIMESTAMP)``, and the rows are written
    in ascending timestamp order. Both ``random`` and ``numpy.random`` are
    seeded with ``RANDOM_SEED`` so repeated runs produce the same file.

    Args:
        interactions_filename: Path of the CSV file to create or overwrite.
            Columns are ITEM_ID, USER_ID, EVENT_TYPE, TIMESTAMP.
        users_df: DataFrame with an ``id`` column whose values can be cast
            to ``int``. An empty frame yields a header-only CSV.
        offers_df: DataFrame with an ``id`` column (must be numeric when
            ``GENERATE_INBALANCED_DATA`` is set, as IDs are used as weights).

    Returns:
        None. As a side effect, the function's local variables are copied
        into the module globals for interactive inspection.
    """
    random.seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)

    num_users = users_df.shape[0]
    num_interactions = NUM_INTERACTIONS_PER_USER * num_users

    if GENERATE_INBALANCED_DATA:
        # We may wish to assume probability is proportional to ID to show off how we can add
        # business logic around Personalize
        offer_probs = offers_df.id.values.astype(float)
    else:
        # Or we can work around imbalance at the data munging stage
        offer_probs = np.ones(len(offers_df.id.values), dtype=float)

    # Normalise so that we have probabilities
    offer_probs = offer_probs / offer_probs.sum()

    # Generate evenly spaced timestamps. linspace is used rather than arange
    # with a float step: arange's length is ceil((stop - start) / step), which
    # floating-point rounding can push to num_interactions + 1 and so break the
    # DataFrame construction below. linspace always returns exactly
    # num_interactions values and needs no division by the (possibly zero) count.
    timestamps = np.linspace(
        FIRST_TIMESTAMP, LAST_TIMESTAMP, num_interactions, endpoint=False
    ).astype(int)
    # pre-shuffle them as we will be using them as a randomising key when we sort by timestamp
    np.random.shuffle(timestamps)

    # generate all user IDs: the full user list repeated once per interaction
    sample_user_ids = np.tile(users_df['id'].values.astype(int), NUM_INTERACTIONS_PER_USER)

    # only one event type
    event_type = ['OfferConverted'] * num_interactions

    # we sort it to ensure there is a correlation between user ID and offer ID.
    # This correlation is what the personalisation will learn.
    sampled_offers = np.sort(
        np.random.choice(offers_df.id.values, num_interactions, p=offer_probs)
    )

    interactions_df = pd.DataFrame({'ITEM_ID': sampled_offers,
                                    'USER_ID': sample_user_ids,
                                    'EVENT_TYPE': event_type,
                                    'TIMESTAMP': timestamps})

    # by sorting by timestamp, other elements get shuffled
    interactions_df = interactions_df.sort_values('TIMESTAMP')

    # newline='' lets pandas control line endings itself; without it, text-mode
    # newline translation produces blank lines between rows on Windows.
    with open(interactions_filename, 'w', newline='') as outfile:
        interactions_df.to_csv(outfile, index=False)

    # Deliberate debugging aid: expose the locals at module level so they can be
    # inspected in a console after the script ran, or if run with ipython.
    # NOTE: this overwrites any module-level names that match a local name.
    globals().update(locals())

    print('Generation script finished - created offers dataset')


if __name__ == '__main__':

    # User info is stored in the repository - it was automatically generated.
    # The gzip stream is read as bytes; json.load detects the encoding itself.
    with gzip.open(IN_USERS_FILENAME, 'r') as f:
        users = json.load(f)
    users_df = pd.DataFrame(users)

    # Offers info is stored in repository. JSON is UTF-8 by specification, so do
    # not depend on the platform's default locale encoding.
    with open(IN_OFFERS_FILENAME, 'r', encoding='utf-8') as f:
        offers = json.load(f)
    offers_df = pd.DataFrame(offers)

    logging.basicConfig(level=logging.INFO)

    generate_data(GENERATED_DATA_ROOT + '/offer_interactions.csv', users_df, offers_df)