"""
Hash calculation utilities for files and directories.
"""
import hashlib
import os
import sys
from typing import Any, List, Optional, cast

# Number of bytes read from a file per read() call while hashing it.
BLOCK_SIZE = 4096
# earliest python version to support usedforsecurity option for hashlib.md5 is 3.9
# https://docs.python.org/3/library/hashlib.html#hash-algorithms
_MAJOR_PYTHON_VERSION = 3
_MINOR_PYTHON_VERSION = 9


def _get_md5() -> Any:
    """
    Create a new, empty md5 hash object.

    On interpreters that support it, the hash is created with ``usedforsecurity=False``.

    Returns
    -------
    hashlib._Hash
        a fresh md5 hash object with no data added to it.

    """
    # Compare the version as a tuple. Checking major and minor independently
    # (major >= 3 and minor >= 9) would wrongly reject versions such as 4.0,
    # whose minor component is smaller than 9.
    if sys.version_info >= (_MAJOR_PYTHON_VERSION, _MINOR_PYTHON_VERSION):
        return hashlib.md5(usedforsecurity=False)
    return hashlib.md5()


def file_checksum(file_name: str, hash_generator: Any = None) -> str:
    """
    Compute the checksum of a file's contents.

    The file is opened in binary mode and fed to the hash object in chunks of
    BLOCK_SIZE bytes, so the whole file is never read into memory at once.

    Parameters
    ----------
    file_name : str
        file name of the file for which the checksum is required.
    hash_generator : hashlib._Hash
        hashlib _Hash object for generating hashes. It is updated in place with the
        file's bytes. Defaults to a new md5 hash object when None.

    Returns
    -------
    str
        hex digest of the hash object after the file's contents have been added.

    """
    # The default is created per call rather than in the signature, where a single
    # hash object would be shared by (and keep accumulating data from) every call.
    # Test against None explicitly so a caller-supplied object is never discarded.
    if hash_generator is None:
        hash_generator = _get_md5()

    # The handle is freshly opened, so it already starts at offset 0 and is closed
    # on exit: no tell()/seek() bookkeeping is needed. Skipping it also avoids
    # failing on files that do not support seeking.
    with open(file_name, "rb") as file_handle:
        block = file_handle.read(BLOCK_SIZE)
        while block:
            hash_generator.update(block)
            block = file_handle.read(BLOCK_SIZE)

    return cast(str, hash_generator.hexdigest())


def dir_checksum(
    directory: str, followlinks: bool = True, ignore_list: Optional[List[str]] = None, hash_generator: Any = None
) -> str:
    """
    Compute a checksum over the files found under a directory.

    Every file that is not ignored contributes two things to the hash: its path
    relative to ``directory`` and the checksum of its contents as returned by
    file_checksum with its default hash. Files are processed in sorted path order,
    so the result does not depend on the order in which os.walk yields them.

    Parameters
    ----------
    directory : str
        A directory with an absolute path
    followlinks : bool
        Follow symbolic links through the given directory
    ignore_list : list(str)
        The list of file/directory names to ignore in checksum. Ignored directories
        are not descended into.
    hash_generator : hashlib._Hash
        The hashing method (hashlib _Hash object) that generates checksum. It is
        updated in place. Defaults to a new md5 hash object when None.

    Returns
    -------
    str
        checksum hash of the directory.

    """
    ignore_set = set(ignore_list or [])
    # Test against None explicitly so a caller-supplied object is never discarded.
    if hash_generator is None:
        hash_generator = _get_md5()

    files = []
    # Walk through given directory and find all directories and files.
    for dirpath, dirnames, filenames in os.walk(directory, followlinks=followlinks):
        # > When topdown is True, the caller can modify the dirnames list in-place
        # > (perhaps using del or slice assignment) and walk() will only recurse
        # > into the subdirectories whose names remain in dirnames
        # > https://docs.python.org/library/os.html#os.walk
        dirnames[:] = [dirname for dirname in dirnames if dirname not in ignore_set]
        # Collect every non-ignored file of this directory.
        files.extend(os.path.join(dirpath, filename) for filename in filenames if filename not in ignore_set)

    # Sort so the files are always hashed in the same order.
    files.sort()
    for file in files:
        # Hash the file's relative path and then the hex checksum of its contents,
        # both encoded as utf-8 bytes.
        hash_generator.update(os.path.relpath(file, directory).encode("utf-8"))
        hash_generator.update(file_checksum(file).encode("utf-8"))

    return cast(str, hash_generator.hexdigest())


def str_checksum(content: str, hash_generator: Any = None) -> str:
    """
    Return a checksum of a given string.

    Parameters
    ----------
    content : str
        the string to be hashed. It is encoded as utf-8 before being hashed.
    hash_generator : hashlib._Hash
        The hashing method (hashlib _Hash object) that generates checksum. It is
        updated in place. Defaults to a new md5 hash object when None.

    Returns
    -------
    str
        hex digest of the hash object after ``content`` has been added (an md5
        checksum of content when the default hash is used).

    """
    # Test against None explicitly so a caller-supplied object is never discarded,
    # even if it evaluates as falsy.
    if hash_generator is None:
        hash_generator = _get_md5()
    hash_generator.update(content.encode("utf-8"))
    return cast(str, hash_generator.hexdigest())