# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

"""Modified from https://github.com/flatironinstitute/DeepFRI/blob/fa9409cca7dc7b475f71ab4bab0aa7b6b1091448/deepfrier/utils.py"""
import csv
import numpy as np

from sklearn import metrics
from joblib import Parallel, delayed

from Bio import SeqIO
from Bio.PDB.PDBParser import PDBParser


def load_FASTA(filename):
    """
    Loads a FASTA file and returns the protein ids and their sequences.

    Args:
        filename: String representing the path to the FASTA file.
    Returns
        Tuple where the first element is a list of protein ids and the second
        element is a list of protein sequences, both in file order.
    """
    proteins = []
    entries = []
    # The legacy "rU" mode was removed in Python 3.11 (it raises ValueError);
    # plain text mode already applies universal newline handling. The context
    # manager makes sure the handle is closed once parsing is done.
    with open(filename, "r") as infile:
        for entry in SeqIO.parse(infile, "fasta"):
            proteins.append(str(entry.id))
            entries.append(str(entry.seq))
    return proteins, entries


def load_GO_annot(filename):
    """
    Loads the GO annotations.

    The file is tab-separated. It starts with a preamble that, for each
    ontology in the order mf, bp, cc, contains a header line, a line of GO
    term ids, another header line and a line of GO term names. After one more
    header line, every remaining row is
    ``protein <TAB> mf terms <TAB> bp terms <TAB> cc terms`` where each terms
    field is a comma-separated (possibly empty) list of GO term ids.

    Args:
        filename: String representing the path to the GO annotations file.
    Returns
        Quadruple where elements are
            1/ a dict of dict with protein annotations: {protein: {'cc': np.array([...])}}
            2/ a dict with metadata of GO terms: {'cc': [goterm1, ...]}
            3/ a dict with metadata of GO names: {'cc': [goname1, ...]}
            4/ a dict with protein counts of GO terms: {'cc': np.array(...)}
    Raises
        ValueError: if a protein row references a GO term that is not listed
            in the preamble of the corresponding ontology.
        IndexError: if a non-empty protein row has fewer than four columns.
    """
    onts = ["mf", "bp", "cc"]
    prot2annot = {}
    goterms = {ont: [] for ont in onts}
    gonames = {ont: [] for ont in onts}
    with open(filename, mode="r") as tsvfile:
        reader = csv.reader(tsvfile, delimiter="\t")

        # Preamble: molecular function, biological process, cellular component.
        for ont in onts:
            next(reader, None)  # skip the headers
            goterms[ont] = next(reader)
            next(reader, None)  # skip the headers
            gonames[ont] = next(reader)
        next(reader, None)  # skip the headers of the annotation table

        # Resolve a GO term to its column with one dict lookup instead of a
        # linear list.index() scan per annotation. setdefault keeps the first
        # position of a duplicated term, exactly like list.index() would.
        term_position = {}
        for ont in onts:
            positions = {}
            for pos, term in enumerate(goterms[ont]):
                positions.setdefault(term, pos)
            term_position[ont] = positions

        counts = {
            ont: np.zeros(len(goterms[ont]), dtype=float) for ont in onts
        }
        for row in reader:
            if not row:
                # csv yields [] for blank lines; there is nothing to annotate.
                continue
            prot, prot_goterms = row[0], row[1:]
            annot = prot2annot[prot] = {}
            for i, ont in enumerate(onts):
                try:
                    goterm_indices = [
                        term_position[ont][goterm]
                        for goterm in prot_goterms[i].split(",")
                        if goterm != ""
                    ]
                except KeyError as err:
                    raise ValueError(
                        f"unknown {ont} GO term {err.args[0]!r} "
                        f"for protein {prot!r}"
                    ) from err
                # Multi-hot label vector over the GO terms of this ontology.
                labels = np.zeros(len(goterms[ont]))
                labels[goterm_indices] = 1.0
                annot[ont] = labels
                counts[ont][goterm_indices] += 1.0
    return prot2annot, goterms, gonames, counts


def norm_adj(A, symm=True):
    """
    Normalize an adjacency matrix after adding self-loops.

    The input array is left untouched; the normalization is carried out on a
    copy. Integer and boolean matrices are converted to float first, while
    floating point inputs keep their dtype.

    Args:
        A: square numpy array representing the adjacency matrix to be normalized.
        symm: Boolean selecting the normalization scheme. If True, the
            symmetric normalization D^-1/2 (A + I) D^-1/2 is applied, where D
            is the diagonal matrix of row sums of A + I. If False, every row
            of A + I is divided by its sum.
    Returns
        Numpy array representing the normalized adjacency matrix.
    """
    # np.array copies, so the in-place operations below never leak into the
    # caller's matrix (which would add the self-loops again on a second call).
    A = np.array(A)
    if not np.issubdtype(A.dtype, np.inexact):
        # In-place float arithmetic is not allowed on int/bool arrays.
        A = A.astype(float)
    A += np.eye(A.shape[1])  # add self-loops
    if symm:
        d = 1.0 / np.sqrt(A.sum(axis=1))
        D = np.diag(d)
        A = D.dot(A.dot(D))
    else:
        A /= A.sum(axis=1)[:, np.newaxis]
    return A


def _micro_aupr(y_true, y_test):
    """
    Micro-averaged area under the precision-recall curve.

    Thin wrapper that forwards both arrays to scikit-learn's
    ``average_precision_score`` with ``average="micro"``.

    Args:
        y_true: array with the ground-truth observations.
        y_test: array with the predictions.
    Returns
        float representing the micro AUPR score
    """
    aupr = metrics.average_precision_score(y_true, y_test, average="micro")
    return aupr


def compute_f1_score_at_threshold(
    y_true: np.ndarray, y_pred: np.ndarray, t: float
):
    """Calculate protein-centric F1 score based on DeepFRI's description.
    ref: https://www.nature.com/articles/nmeth.2340
    Online method -> Evaluation metrics

    Precision is averaged over the proteins that have at least one prediction
    at or above the threshold. Recall is averaged over all proteins, where a
    protein without any ground-truth label contributes a recall of 0.

    The per-protein counts are computed with vectorized numpy operations
    rather than one metric call per protein, which keeps the threshold sweep
    over many proteins cheap.

    Args:
        y_true: [n_proteins, n_functions], binary matrix of ground truth labels
        y_pred: [n_proteins, n_functions], probabilities from model predictions after sigmoid.
        t: Float representing the threshold to use to compute the f1 score.

    Returns:
        float representing the f1 score. NaN is returned when the score is
        undefined: no protein has a prediction at or above ``t``, or the
        averaged precision and recall are both zero.
    """
    truth = np.asarray(y_true) == 1  # positive class is the label 1
    y_pred_bin = np.asarray(y_pred) >= t  # binarize predictions

    true_pos = np.logical_and(truth, y_pred_bin).sum(axis=1)
    n_predicted = y_pred_bin.sum(axis=1)
    n_relevant = truth.sum(axis=1)

    has_prediction = n_predicted > 0
    if not has_prediction.any():
        # Precision cannot be averaged over an empty set of proteins.
        return np.nan

    pr = np.mean(true_pos[has_prediction] / n_predicted[has_prediction])
    # Proteins without ground-truth labels keep the pre-filled recall of 0.
    recall_per_protein = np.divide(
        true_pos,
        n_relevant,
        out=np.zeros(true_pos.shape[0], dtype=float),
        where=n_relevant > 0,
    )
    rc = np.mean(recall_per_protein)

    if pr + rc == 0:
        return np.nan
    return 2 * pr * rc / (pr + rc)


def evaluate_multilabel(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    n_thresholds=100,
    n_jobs=-1,
    verbose=10,
):
    """Calculate protein-centric F_max and micro-averaged AUPR
    based on DeepFRI's description.
    ref: https://www.nature.com/articles/nmeth.2340
    Online method -> Evaluation metrics
    Args:
        y_true: [n_proteins, n_functions], binary matrix of ground truth labels
        y_pred: [n_proteins, n_functions], logits from model predictions
        n_thresholds (int): number of thresholds to estimate F_max
        n_jobs (int): number of joblib workers used for the threshold sweep
            (-1, the default, uses all available cores)
        verbose (int): joblib verbosity level for the threshold sweep

    Returns:
        Tuple where the first element is F_max, i.e. the largest F1 score over
        the evaluated thresholds (thresholds whose F1 is NaN are ignored), and
        the second element is the micro AUPR
    """
    # AUPR is computed on the raw logits, before the sigmoid is applied.
    micro_aupr = _micro_aupr(y_true, y_pred)

    # Apply sigmoid to logits. For very negative logits exp() overflows to
    # inf, which still gives the correct probability of 0, so the overflow
    # warning is only noise.
    with np.errstate(over="ignore"):
        y_prob = 1 / (1 + np.exp(-y_pred))

    # n_thresholds evenly spaced thresholds in [0, 1); 1.0 itself is excluded.
    thresholds = np.linspace(0.0, 1.0, n_thresholds, endpoint=False)
    f_scores = Parallel(n_jobs=n_jobs, verbose=verbose)(
        delayed(compute_f1_score_at_threshold)(y_true, y_prob, threshold)
        for threshold in thresholds
    )

    return np.nanmax(f_scores), micro_aupr