# SPDX-License-Identifier: Apache-2.0
#
# The OpenSearch Contributors require contributions made to
# this file be licensed under the Apache-2.0 license or a
# compatible open source license.

"""
Script builds a complex dataset with additional attributes from an existing dataset that has only vectors.
Additional attributes are predefined in the script: color, taste, age. Only the HDF5 format of vector
dataset is supported.

The output dataset file will have an additional dataset 'attributes' with multiple columns. Each column
corresponds to one attribute from the attribute set, and each value is generated at random, e.g.:

0: green None 71
1: green bitter 28

There is no explicit index reference in the 'attributes' dataset; the index of a row corresponds to a
document id. For instance, in the example above the two rows of fields map to documents with ids '0' and '1'.

If the 'generate_filters' flag is set, the script generates an additional dataset of neighbours (ground
truth) for each filter type. The output is a new file with several datasets, each corresponding to one
filter. Datasets are named 'neighbors_filter_X', where X is the 1-based index of a particular filter.
Each dataset has rows with arrays of integers, where each integer corresponds to a document id from the
original dataset with additional fields. An array can have -1 values that are treated as null, because the
subset of filtered documents is the same size as or smaller than the original set.

For example, dataset file content may look like:

neighbors_filter_1: [[ 2, 5, -1], [ 3, 1, -1], [ 2, 5, 7]]
neighbors_filter_2: [[-1, -1, -1], [ 5, 6, -1], [ 4, 2, 1]]

In this case we have datasets for two filters, with 3 query results for each. [2, 5, -1] indicates that for
the first query, if filter 1 is used, the most similar document has id 2, the next most similar has id 5,
and the rest do not pass the filter 1 criteria.
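
As an illustration, the outputs can be read back with h5py (file names here are hypothetical; attribute
values are stored as fixed-width byte strings, so they need decoding, and missing values appear as b'None'):

    import h5py

    with h5py.File('data-with-attr.hdf5', 'r') as hf:
        color, taste, age = hf['attributes'][0]    # e.g. b'green', b'None', b'71'
    with h5py.File('data-with-filters.hdf5', 'r') as hf:
        top_ids = hf['neighbors_filter_1'][0]      # e.g. [ 2  5 -1]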

Example of script usage:

    create new hdf5 file with attribute dataset
    add-filters-to-dataset.py ~/dev/opensearch/k-NN/benchmarks/perf-tool/dataset/data.hdf5 ~/dev/opensearch/datasets/data-with-attr True False

    create new hdf5 file with filter datasets
    add-filters-to-dataset.py ~/dev/opensearch/k-NN/benchmarks/perf-tool/dataset/data-with-attr.hdf5 ~/dev/opensearch/datasets/data-with-filters False True
"""
import getopt
import os
import random
import sys

import h5py

from osb.extensions.data_set import HDF5DataSet


class _Dataset:
    """Type of dataset container for data with additional attributes"""
    DEFAULT_TYPE = HDF5DataSet.FORMAT_NAME

    def create_dataset(self, source_dataset_path, out_file_path, generate_attrs: bool,
                       generate_filters: bool) -> None:
        # output files are created next to the source dataset
        data_set_dir = os.path.dirname(os.path.abspath(source_dataset_path))

        # Multiple data sets can be grouped in the same HDF5 file; output data sets
        # are accumulated in a single file handle and written out on flush()/close().

        # read existing dataset
        data_hdf5 = os.path.abspath(source_dataset_path)

        with h5py.File(data_hdf5, "r") as hf:

            if generate_attrs:
                data_set_w_attr = self.create_dataset_file(out_file_path, self.DEFAULT_TYPE, data_set_dir)

                possible_colors = ['red', 'green', 'yellow', 'blue', None]
                possible_tastes = ['sweet', 'salty', 'sour', 'bitter', None]
                max_age = 100

                # copy the original vector data sets as-is
                for key in hf.keys():
                    if key not in ['neighbors', 'test', 'train']:
                        continue
                    data_set_w_attr.create_dataset(key, data=hf[key][()])

                # generate one random [color, taste, age] row per 'train' vector;
                # None values end up stored as the byte string b'None' in the S10 data set
                attributes = []
                for i in range(len(hf['train'])):
                    attr = [random.choice(possible_colors), random.choice(possible_tastes),
                            random.randint(0, max_age)]  # randint bounds are inclusive
                    attributes.append(attr)

                data_set_w_attr.create_dataset('attributes', (len(attributes), 3), 'S10', data=attributes)
                data_set_w_attr.flush()
                data_set_w_attr.close()

            if generate_filters:
                attributes = hf['attributes'][()]
                expected_neighbors = hf['neighbors'][()]

                data_set_filters = self.create_dataset_file(out_file_path, self.DEFAULT_TYPE, data_set_dir)

                # filter 1 - color is red and age >= 20
                def filter1(attributes, vector_idx):
                    return attributes[vector_idx][0].decode() == 'red' \
                           and int(attributes[vector_idx][2].decode()) >= 20

                self.apply_filter(expected_neighbors, attributes, data_set_filters, 'neighbors_filter_1', filter1)

                # filter 2 - color is blue or None and taste is salty
                def filter2(attributes, vector_idx):
                    return (attributes[vector_idx][0].decode() == 'blue'
                            or attributes[vector_idx][0].decode() == 'None') \
                           and attributes[vector_idx][1].decode() == 'salty'

                self.apply_filter(expected_neighbors, attributes, data_set_filters, 'neighbors_filter_2', filter2)

                # filter 3 - color and taste are not None and age is between 20 and 80
                def filter3(attributes, vector_idx):
                    return attributes[vector_idx][0].decode() != 'None' \
                           and attributes[vector_idx][1].decode() != 'None' \
                           and 20 <= int(attributes[vector_idx][2].decode()) <= 80

                self.apply_filter(expected_neighbors, attributes, data_set_filters, 'neighbors_filter_3', filter3)

                # filter 4 - color is green or blue and taste is bitter and age is between 30 and 60
                def filter4(attributes, vector_idx):
                    return (attributes[vector_idx][0].decode() == 'green'
                            or attributes[vector_idx][0].decode() == 'blue') \
                           and attributes[vector_idx][1].decode() == 'bitter' \
                           and 30 <= int(attributes[vector_idx][2].decode()) <= 60

                self.apply_filter(expected_neighbors, attributes, data_set_filters, 'neighbors_filter_4', filter4)

                # filter 5 - color is green, blue or yellow, or taste is sweet, or age is between 30 and 70
                def filter5(attributes, vector_idx):
                    return attributes[vector_idx][0].decode() == 'green' \
                           or attributes[vector_idx][0].decode() == 'blue' \
                           or attributes[vector_idx][0].decode() == 'yellow' \
                           or attributes[vector_idx][1].decode() == 'sweet' \
                           or 30 <= int(attributes[vector_idx][2].decode()) <= 70

                self.apply_filter(expected_neighbors, attributes, data_set_filters, 'neighbors_filter_5', filter5)

                data_set_filters.flush()
                data_set_filters.close()

    def apply_filter(self, expected_neighbors, attributes, data_set_w_filtering, filter_name, filter_func):
        neighbors_filter = []
        filtered_count = 0
        for expected_neighbors_row in expected_neighbors:
            # ids that pass the filter stay in similarity order, left-aligned;
            # remaining slots keep the -1 sentinel that is treated as null
            neighbors_filter_row = [-1] * len(expected_neighbors_row)
            idx = 0
            for vector_idx in expected_neighbors_row:
                if filter_func(attributes, vector_idx):
                    neighbors_filter_row[idx] = vector_idx
                    idx += 1
                    filtered_count += 1
            neighbors_filter.append(neighbors_filter_row)
        overall_count = len(expected_neighbors) * len(expected_neighbors[0])
        perc = filtered_count / overall_count * 100
        print('ground truth size for {} is {}, percentage {:.2f}'.format(filter_name, filtered_count, perc))
        data_set_w_filtering.create_dataset(filter_name, data=neighbors_filter)

    def create_dataset_file(self, file_name, extension, data_set_dir) -> h5py.File:
        data_set_file_name = "{}.{}".format(file_name, extension)
        data_set_path = os.path.join(data_set_dir, data_set_file_name)
        return h5py.File(data_set_path, 'a')


def main(argv):
    opts, args = getopt.getopt(argv, "")
    if len(args) != 4:
        print('Usage: add-filters-to-dataset.py <in_file> <out_file> <generate_attrs> <generate_filters>')
        sys.exit(2)
    in_file_path = args[0]
    out_file_path = args[1]
    generate_attr = str2bool(args[2])
    generate_filters = str2bool(args[3])

    worker = _Dataset()
    worker.create_dataset(in_file_path, out_file_path, generate_attr, generate_filters)


def str2bool(v):
    return v.lower() in ("yes", "true", "t", "1")


if __name__ == "__main__":
    main(sys.argv[1:])
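
# Illustrative sketch only (not invoked by this script): one way a benchmark could
# score retrieved results against a generated ground-truth row, skipping the -1
# padding entries described in the module docstring. The function name and
# signature below are hypothetical.
#
#   def recall_at_k(retrieved_ids, ground_truth_row, k):
#       truth = {i for i in ground_truth_row[:k] if i != -1}
#       if not truth:
#           return None  # no documents pass the filter for this query
#       return len(truth.intersection(retrieved_ids[:k])) / len(truth)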