# -*- coding: utf-8 -*-
"""Tests for LightGBM integration"""
from __future__ import print_function
import unittest
import os

import numpy as np
import pytest

import treelite
import treelite.runtime
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

from util import (os_compatible_toolchains, libname, assert_almost_equal,
                  run_pipeline_test, os_platform)

dpath = os.path.abspath(os.path.join(os.getcwd(), 'tests/examples/'))

try:
    import lightgbm
except ImportError:
    # Skip this test suite if LightGBM is not installed
    pytest.skip('LightGBM not installed; skipping', allow_module_level=True)


class TestLightGBMIntegration(unittest.TestCase):

    def test_lightgbm_multiclass_classification(self):
        X, y = load_iris(return_X_y=True)
        X_train, X_test, y_train, y_test = \
            train_test_split(X, y, test_size=0.2, shuffle=False)
        dtrain = lightgbm.Dataset(X_train, y_train, free_raw_data=False)
        dtest = lightgbm.Dataset(X_test, y_test, reference=dtrain,
                                 free_raw_data=False)
        param = {'task': 'train', 'boosting_type': 'gbdt',
                 'metric': 'multi_logloss', 'num_class': 3,
                 'num_leaves': 31, 'learning_rate': 0.05}
        num_round = 10
        watchlist = [dtrain, dtest]
        watchlist_names = ['train', 'test']
        for objective in ['multiclass', 'multiclassova']:
            param['objective'] = objective
            # Train with LightGBM; its predictions serve as the reference
            bst = lightgbm.train(param, dtrain, num_round, watchlist,
                                 watchlist_names)
            bst.save_model('./iris_lightgbm.txt')
            expected_pred = bst.predict(X_test)

            # Round-trip through Treelite and check that predictions agree
            model = treelite.Model.load('./iris_lightgbm.txt',
                                        model_format='lightgbm')
            libpath = libname('./iris_{}{{}}'.format(objective))
            batch = treelite.runtime.Batch.from_npy2d(X_test)
            for toolchain in os_compatible_toolchains():
                model.export_lib(toolchain=toolchain, libpath=libpath,
                                 params={}, verbose=True)
                predictor = treelite.runtime.Predictor(libpath=libpath,
                                                       verbose=True)
                out_pred = predictor.predict(batch)
                assert_almost_equal(out_pred, expected_pred)

    def test_lightgbm_binary_classification(self):
        dtrain_path = os.path.join(dpath, 'mushroom/agaricus.train')
        dtest_path = os.path.join(dpath, 'mushroom/agaricus.test')
        dtrain = lightgbm.Dataset(dtrain_path)
        dtest = lightgbm.Dataset(dtest_path, reference=dtrain)
        watchlist = [dtrain, dtest]
        watchlist_names = ['train', 'test']
        param = {'task': 'train', 'boosting_type': 'gbdt', 'metric': 'auc',
                 'num_leaves': 7, 'learning_rate': 0.1}
        num_round = 10
        for objective in ['binary', 'xentlambda', 'xentropy']:
            param['objective'] = objective
            # Train with LightGBM; its probabilities and raw margins serve
            # as the reference
            bst = lightgbm.train(param, dtrain, num_round, watchlist,
                                 watchlist_names)
            bst.save_model('./mushroom_lightgbm.txt')
            expected_prob = bst.predict(dtest_path)
            expected_margin = bst.predict(dtest_path, raw_score=True)

            # Round-trip through Treelite and compare both probability and
            # margin outputs
            model = treelite.Model.load('./mushroom_lightgbm.txt',
                                        model_format='lightgbm')
            libpath = libname('./agaricus_{}{{}}'.format(objective))
            batch = treelite.runtime.Batch.from_csr(
                treelite.DMatrix(dtest_path))
            for toolchain in os_compatible_toolchains():
                model.export_lib(toolchain=toolchain, libpath=libpath,
                                 params={}, verbose=True)
                predictor = treelite.runtime.Predictor(libpath, verbose=True)
                out_prob = predictor.predict(batch)
                assert_almost_equal(out_prob, expected_prob)
                out_margin = predictor.predict(batch, pred_margin=True)
                assert_almost_equal(out_margin, expected_margin)
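    # Illustrative sketch of how the toy dataset described in
    # test_categorical_data below might be generated, following the formula
    # in that test's docstring. The helper name, sample size, and seed are
    # hypothetical; no test calls this method.
    @staticmethod
    def _generate_toy_categorical(num_row=1000, seed=0):
        rng = np.random.RandomState(seed)
        f = np.array([-20.0, -10.0, 0.0])           # f(x0), x0 in {0, 1, 2}
        g = np.array([-2.0, -1.0, 0.0, 1.0, 2.0])   # g(x1), x1 in {0, ..., 4}
        x0 = rng.randint(0, 3, size=num_row)        # categorical feature 0
        x1 = rng.randint(0, 5, size=num_row)        # categorical feature 1
        y = f[x0] + g[x1] + rng.normal(scale=0.1, size=num_row)
        return np.column_stack((x0, x1)), y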
    def test_categorical_data(self):
        """
        LightGBM can produce categorical splits directly, so categorical
        data don't have to be one-hot encoded. Test whether Treelite is able
        to handle categorical splits.

        This toy example contains two features, both of which are
        categorical. The first has cardinality 3 and the second 5. The label
        was generated using the formula

            y = f(x0) + g(x1) + [noise with std=0.1],

        where f and g are given by the tables

            x0  f(x0)        x1  g(x1)
             0    -20         0    -2
             1    -10         1    -1
             2      0         2     0
                              3     1
                              4     2
        """
        for model_path, dtest_path, libname_fmt, \
            expected_prob_path, expected_margin_path, multiclass in \
            [('toy_categorical/toy_categorical_model.txt',
              'toy_categorical/toy_categorical.test',
              './toycat{}',
              None,
              'toy_categorical/toy_categorical.test.pred',
              False)]:
            model_path = os.path.join(dpath, model_path)
            model = treelite.Model.load(model_path, model_format='lightgbm')
            for use_quantize in [False, True]:
                for use_parallel_comp in [None, 2]:
                    run_pipeline_test(model=model, dtest_path=dtest_path,
                                      libname_fmt=libname_fmt,
                                      expected_prob_path=expected_prob_path,
                                      expected_margin_path=expected_margin_path,
                                      multiclass=multiclass,
                                      use_annotation=None,
                                      use_quantize=use_quantize,
                                      use_parallel_comp=use_parallel_comp)

    @pytest.mark.skipif(os_platform() == 'windows',
                        reason='MSVC cannot handle long if conditional')
    def test_sparse_categorical_model(self):
        """
        LightGBM can produce categorical splits directly, so categorical
        data don't have to be one-hot encoded. Test whether Treelite is able
        to handle categorical splits.

        This example produces a model with high-cardinality categorical
        variables. The training data contain many missing values, so we need
        to match LightGBM's handling of missing values.
        """
        for model_path, dtest_path, libname_fmt, \
            expected_prob_path, expected_margin_path, multiclass in \
            [('sparse_categorical/sparse_categorical_model.txt',
              'sparse_categorical/sparse_categorical.test',
              './sparsecat{}',
              None,
              'sparse_categorical/sparse_categorical.test.margin',
              False)]:
            model_path = os.path.join(dpath, model_path)
            model = treelite.Model.load(model_path, model_format='lightgbm')
            for use_quantize in [False, True]:
                for use_parallel_comp in [None]:
                    run_pipeline_test(model=model, dtest_path=dtest_path,
                                      libname_fmt=libname_fmt,
                                      expected_prob_path=expected_prob_path,
                                      expected_margin_path=expected_margin_path,
                                      multiclass=multiclass,
                                      use_annotation=None,
                                      use_quantize=use_quantize,
                                      use_parallel_comp=use_parallel_comp,
                                      use_toolchains=['gcc'])
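
# Allow running this test file directly; the suite is normally driven by
# pytest, which discovers the TestLightGBMIntegration class on its own.
if __name__ == '__main__':
    unittest.main()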