# A closer inspection of `flight_sales/run_exp.py`

In [None]:
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'

from __future__ import annotations

import my_nb_path # isort: split
import math
from pathlib import Path

import a2rl as wi
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from smallmatter.ds import SimpleMatrixPlotter
from stable_baselines3 import A2C

from flight_sales.flight_sales_gym import flight_sales_gym, fsigmoid, parameters
from flight_sales.run_exp import (
 ExperimentRewards,
 plot_results,
 run_ppo_agent,
 run_random_agent,
 run_ucb_agent,
 run_whatif,
)

## Simple propensity model

### Parameters of logistic function

The logistic function's parameters are seasonal.

In [None]:
def plot_parameters(f):
    """Plot how the values returned by the parameters function ``f`` vary over a year.

    ``f`` is called once for every day-of-year in ``1..365``. The three values it
    returns are drawn in separate panels, next to a reference cosine curve labelled
    'State "seasonality"'. The figure's super-title is ``f.__name__``.

    Arguments:
        f: callable that takes a day-of-year and returns a 3-item sequence, unpacked
            here as ``(smoothness, mid_price, conversion)``.
    """
    days = np.arange(1, 366)
    # Reference curve: a cosine with a 365-day period, rescaled into [0, 1].
    season = 0.5 * (np.cos(days * (2.0 * np.pi / 365)) + 1)

    # Evaluate the function that was passed in (not a hard-coded one), so the curves
    # always match the name shown in the super-title.
    daily_params = [f(day) for day in days]
    smoothness, mid_price, conversion = zip(*daily_params)

    fig, axes = plt.subplots(1, 4, figsize=(10, 2))
    for ax, a, title in zip(
        axes.flatten(),
        (smoothness, mid_price, conversion, season),
        ("k (smoothness)", "x0 (mid_price)", "L (conversion)", 'State "seasonality"'),
    ):
        # Index by ``days`` so the x-axis really is the day-of-year (1..365) rather
        # than the default 0-based row position.
        pd.DataFrame(a, index=days).plot(title=title, ax=ax, legend=False)
        ax.set_xlabel("Day-of-year")
        ax.set_ylabel("Seasonal Factor")

    fig.suptitle(f.__name__)
    plt.tight_layout()
    # Leave headroom above the panels so the super-title does not overlap their titles.
    fig.subplots_adjust(top=0.75)
    plt.show()


# Render the diagnostic panels for ``parameters`` (imported from flight_sales.flight_sales_gym).
plot_parameters(parameters)

### Propensity to buy

In [None]:
# Parameter triples kept for reference; the curves below are computed by ``fsigmoid``
# from the day-of-year and do not read these arrays.
params_peak_season = np.array([0.5, 10, 0.4])
params_off_season = np.array([0.3, 10, 0.2])
prices = np.linspace(0, 20, 10)

# One curve per representative day: day 2 is labelled "peak season", day 180 "off season".
for day_of_year, season_label in ((2, "peak season"), (180, "off season")):
    conversion_rates = [fsigmoid(fare, day=day_of_year) for fare in prices]
    plt.plot(prices, conversion_rates, label=season_label)

plt.title("Conversion rate vs fare")
plt.xlabel("Fare")
plt.ylabel("Conversion rate")
# Trailing semicolon suppresses the notebook's echo of the last expression.
plt.legend();

## Generate offline data

In RL parlance, offline data is historical data collected by another policy.

In [None]:
# Build the environment, selecting the reward function by the name "revenue_0_05".
env = flight_sales_gym(f_reward="revenue_0_05")
print(f"Reward function: {env.f_reward.__name__}()")

# First training pass: 1000 timesteps of A2C directly on the raw environment.
previous_model = A2C(policy="MlpPolicy", env=env, verbose=False) # type: ignore[call-arg,arg-type]
previous_model.learn(total_timesteps=1000)

# Second pass: wrap the same env in a TransitionRecorder and keep training for 10000
# timesteps; the wrapper's ``df`` attribute is read below as the offline dataset
# (presumably the transitions seen during this pass -- confirm against a2rl docs).
cap_env = wi.TransitionRecorder(env)
previous_model.set_env(cap_env)
previous_model.learn(total_timesteps=10000)

# Re-label the recorded values with explicit column names and tag each column as a
# state, action or reward.
wi_df = wi.WiDataFrame(
 cap_env.df.values,
 columns=["season", "freight_price", "ticket_price", "reward"],
 states=["season", "freight_price"],
 actions=["ticket_price"],
 rewards=["reward"],
)
wi_df.describe()

## Train an A2RL simulator backbone

In [None]:
# Directory name handed to the GPT builder (presumably where model artifacts are kept -- confirm).
model_dir = "model-dyn-pricing"
# Tokenize the offline dataset ``wi_df`` collected above.
tokenizer = wi.AutoTokenizer(wi_df, block_size_row=2)
model_builder = wi.GPTBuilder(tokenizer, model_dir)
model_builder.fit() # ~1.1m on MBP M1

## Evaluation

Run each type of agent, then plot the results.

### Run experiments

In [None]:
# Wall-clock timings for this cell, by number of episodes (``ep``):
# MBP M1, sample_size = 500
# ep=1 => 1m:10s
# ep=5 => 4m:50s
whatif_rewards = run_whatif(env, model_builder, ep=5, sample_size=500)

In [None]:
# Rewards from ``run_random_agent`` on the same ``env``, for comparison in the plots below.
random_agent_rewards = run_random_agent(env)

In [None]:
# Rewards from ``run_ucb_agent`` on the same ``env``, for comparison in the plots below.
ucb_agent_rewards = run_ucb_agent(env)

In [None]:
# NOTE(review): ``previous_model`` is the A2C model trained earlier in this notebook,
# even though the helper and result are named "ppo" -- confirm this is intended.
ppo_agent_rewards = run_ppo_agent(env, previous_model)

### Compile results

In [None]:
# Collect the per-agent results under the keys passed on to ``plot_results``.
rewards: dict[str, ExperimentRewards] = {
 "whatif": whatif_rewards,
 "random_agent": random_agent_rewards,
 "ucb_agent": ucb_agent_rewards,
 "ppo_agent": ppo_agent_rewards,
}

# The super-title names the reward function that ``env`` was created with.
smp = plot_results(rewards, suptitle=f"Reward function: {env.f_reward.__name__}()")