""" Copyright OpenSearch Contributors SPDX-License-Identifier: Apache-2.0 utils.py contains the small utility functions needed throughout the tool Functions: - validate_filename(): Function that validates the filename is a json, ndjson, or csv file - unzip_file(): Function that unzips a filename if it was zipped. - validate_job_args(): Function that validates various arguments for the startup and refresh jobs - untar_file(): Function that extracts files from a tar.gz file """ from opensearchpy import OpenSearch # Standard libraries from shutil import copyfileobj from os import path import tarfile import gzip import sys # Adds parent directory "/sample_data_tooling" to sys.path sys.path.append(path.abspath(__file__).split("sample_data_tooling")[0]) from sample_data_tooling.sample_data_authentication.sample_data_authentication import Authentication def validate_filename(filename:str): """ Validates that the filename is a string and that the file is a json, ndjson, or csv file Arguments: - filename: The object to validate Raises: - TypeError: filename should be a string - ValueError: filename must be a .json, .ndjson, or .csv file """ if type(filename) is not str: raise TypeError("filename should be a string") if ".json" not in filename and ".csv" not in filename and ".ndjson" not in filename: raise ValueError("filename must be a .json, .ndjson, or .csv file") def unzip_file(filename:str) -> str: """ Given a filename, if it is zipped, unzip into a new file and return the new filename Arguments: - filename: The filename to potentially unzip Returns: - The unzipped filename (or None if the filename is invalid) """ # Ignores non json or ndjson files if type(filename) is not str or (".json" not in filename): return None # For already unzipped files if type(filename) is str and ".gz" not in filename: print("Payload is already unzipped") return filename with gzip.open(filename, 'rt') as fin: with open(filename.split(".gz")[0], 'wt') as fout: copyfileobj(fin, fout) return filename.split(".gz")[0] def validate_job_args(config_path:str = None, url:str = None, header:Authentication = None, client:OpenSearch = None ): """ Given various arguments, validate input type Arguments: - config_path: The directory path in which the plugin config json files are located - url: The base url in which the API can be called - header: The Authentication object used to create and return request headers - client: The OpenSearch Python client object used to create and ingest indices Raises: - TypeError: config_path should be a string - TypeError: url should be a string - TypeError: header should be a subclass of Authentication - TypeError: client should be a OpenSearch Python client object """ if config_path and type(config_path) is not str: raise TypeError("config_path should be a string") if url and type(url) is not str: raise TypeError("url should be a string") if header and not isinstance(header, Authentication): raise TypeError("header should be a subclass of Authentication") if client and not isinstance(client, OpenSearch): raise TypeError("client should be a OpenSearch Python client object") def untar_file(filename:str, destination:str = None) -> list: """ Utility function that extracts tar files and returns the filenames Arguments: - filename: The filename (as a tar.gz file) to extract - destination: The destination directory to store the files Returns: - A list of filenames extracted from the tar file (or none if the filename isn't a tar file) """ # Ignores non-filenames or non-tar.gz files if type(filename) is not str or ".tar.gz" not in filename or type(destination) is not str: return [] t_file = tarfile.open(filename) filename_list = [] # Put files into a desired directory, if specified if destination: t_file.extractall(destination) for file in t_file.getnames(): filename_list.append(path.join(destination, file)) else: t_file.extractall() filename_list = t_file.getnames() return filename_list