# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0

"""
Helpers for parsing protein structure files and generating contact maps.
"""

import gzip
import boto3
import numpy as np
import pandas as pd
from io import StringIO
from Bio.PDB.Polypeptide import three_to_one, is_aa
from Bio.PDB import PDBParser
from Bio.PDB.PDBIO import PDBIO
from Bio.PDB.Entity import Entity as PDBEntity
from rdkit import Chem
from tqdm import tqdm
from .xpdb import SloppyStructureBuilder


def gunzip_to_ram(gzip_file_path):
    """
    gunzip a gzip file and decode it to a io.StringIO object.

    Args:
        gzip_file_path: String. Gunzip filepath.

    Returns:
        io.StringIO object.
    """
    content = []
    with gzip.open(gzip_file_path, "rb") as f:
        for line in f:
            content.append(line.decode("utf-8"))

    temp_fp = StringIO("".join(content))
    return temp_fp


def _parse_structure(parser, name, file_path):
    """Parse a .pdb or .cif file into a structure object.
    The file can be gzipped.

    Args:
        parser: a Bio.PDB.PDBParser or Bio.PDB.MMCIFParser instance.
        name: String. name of protein
        file_path: String. Filpath of the pdb or cif file to be read.

    Retruns:
        a Bio.PDB.Structure object representing the protein structure.

    """
    if pd.isnull(file_path):
        return None
    if file_path.endswith(".gz"):
        structure = parser.get_structure(name, gunzip_to_ram(file_path))
    else:  # not gzipped
        structure = parser.get_structure(name, file_path)
    return structure


parse_pdb_structure = _parse_structure  # for backward compatiblity


def parse_structure(pdb_parser, cif_parser, name, file_path):
    """Parse a .pdb file or .cif file into a structure object.
    The file can be gzipped.

    Args:
        pdb_parser: a Bio.PDB.PDBParser instance
        cif_parser: Bio.PDB.MMCIFParser instance
        name: String. name of protein
        file_path: String. Filpath of the pdb or cif file to be read.

    Return:
        a Bio.PDB.Structure object representing the protein structure.
    """
    if file_path.rstrip(".gz").endswith("pdb"):
        return _parse_structure(pdb_parser, name, file_path)
    else:
        return _parse_structure(cif_parser, name, file_path)


def three_to_one_standard(res):
    """Encode non-standard AA to X.

    Args:
        res: a Bio.PDB.Residue object representing the residue.

    Return:
        String. One letter code of the residue.
    """
    if not is_aa(res, standard=True):
        return "X"
    return three_to_one(res)


def get_atom_coords(residue, target_atoms=["N", "CA", "C", "O"]):
    """Extract the coordinates of the target_atoms from an AA residue.
    Handles exception where residue doesn't contain certain atoms
    by setting coordinates to np.nan

    Args:
        residue: a Bio.PDB.Residue object.
        target_atoms: Target atoms which residues will be resturned.

    Returns:
        np arrays with target atoms 3D coordinates in the order of target atoms.
    """
    atom_coords = []
    for atom in target_atoms:
        try:
            coord = residue[atom].coord
        except KeyError:
            coord = [np.nan] * 3
        atom_coords.append(coord)
    return np.asarray(atom_coords)


def chain_to_coords(
    chain, target_atoms=["N", "CA", "C", "O"], name="", residue_smiles=False
):
    """Convert a PDB chain in to coordinates of target atoms from all
    AAs

    Args:
        chain: a Bio.PDB.Chain object
        target_atoms: Target atoms which residues will be resturned.
        name: String. Name of the protein.
        residue_smiles: bool. Whether to get a list of smiles strings for the residues
    Returns:
        Dictonary containing protein sequence `seq`, 3D coordinates `coord` and name `name`.

    """
    output = {}
    # get AA sequence in the pdb structure
    pdb_seq = "".join(
        [
            three_to_one_standard(res.get_resname())
            for res in chain.get_residues()
            if is_aa(res)
        ]
    )
    if len(pdb_seq) <= 1:
        # has no or only 1 AA in the chain
        return None
    output["seq"] = pdb_seq
    if residue_smiles:
        residues = []
        for res in chain.get_residues():
            if is_aa(res):
                mol = residue_to_mol(res)
                residues.append(Chem.MolToSmiles(mol))
        output["residues"] = residues
    # get the atom coords
    coords = np.asarray(
        [
            get_atom_coords(res, target_atoms=target_atoms)
            for res in chain.get_residues()
            if is_aa(res)
        ]
    )
    output["coords"] = coords.tolist()
    output["name"] = "{}-{}".format(name, chain.id)
    return output


def read_file_from_s3(bucket: str, prefix: str):
    s3 = boto3.resource("s3")
    obj = s3.Object(bucket, prefix)
    return obj.get()["Body"]


def extract_coords(
    structure, target_atoms=["N", "CA", "C", "O"], residue_smiles=False
):
    """
    Extract the atomic coordinates for all the chains.
    """
    records = {}
    for chain in structure.get_chains():
        record = chain_to_coords(
            chain,
            name=structure.id,
            target_atoms=target_atoms,
            residue_smiles=residue_smiles,
        )
        if record is not None:
            records[chain.id] = record
    return records


def parse_pdb_ids(pdb_ids: list, residue_smiles=False) -> dict:
    """
    Parse a list of PDB ids to structures by first retrieving
    PDB files from AWS OpenData Registry, then parse to structure objects.
    """
    PDB_BUCKET_NAME = "pdbsnapshots"
    pdb_parser = PDBParser(
        QUIET=True,
        PERMISSIVE=True,
        structure_builder=SloppyStructureBuilder(),
    )
    parsed_structures = {}
    for pdb_id in tqdm(pdb_ids):
        try:
            pdb_file = read_file_from_s3(
                PDB_BUCKET_NAME,
                f"20220103/pub/pdb/data/structures/all/pdb/pdb{pdb_id.lower()}.ent.gz",
            )
        except Exception as e:
            print(pdb_id, "caused the following error:")
            print(e)
        else:
            structure = pdb_parser.get_structure(
                pdb_id, gunzip_to_ram(pdb_file)
            )
            rec = extract_coords(structure, residue_smiles=residue_smiles)
            parsed_structures[pdb_id] = rec
    return parsed_structures


def remove_nan_residues(rec: dict) -> dict:
    """
    Remove the residues from a parsed protein chain where coordinates contains nan's
    """
    if len(rec["coords"]) == 0:
        return None
    coords = np.asarray(rec["coords"])  # shape: (n_residues, 4, 3)
    mask = np.isfinite(coords.sum(axis=(1, 2)))
    if mask.sum() == 0:
        # all residues coordinates are nan's
        return None
    if mask.sum() < coords.shape[0]:
        rec["seq"] = "".join(np.asarray(list(rec["seq"]))[mask])
        rec["coords"] = coords[mask].tolist()
    return rec


def residue_to_mol(residue: PDBEntity, **kwargs) -> Chem.rdchem.Mol:
    """Convert a parsed Biopython PDB object (Residue, Chain, Structure) to a
    rdkit Mol object"""
    # Write the PDB object into PDB string
    stream = StringIO()
    pdbio = PDBIO()
    pdbio.set_structure(residue)
    pdbio.save(stream)
    # Parse the PDB string with rdkit
    mol = Chem.MolFromPDBBlock(stream.getvalue(), **kwargs)
    return mol


def mol_to_pdb_structure(
    mol: Chem.rdchem.Mol, pdb_parser=None, protein_id=""
) -> PDBEntity:
    """
    Convert a rdkit Mol object to a Biopython PDB Structure object
    """
    # Write the Mol object into PDB string
    stream = StringIO()
    stream.write(Chem.MolToPDBBlock(mol))
    stream.seek(0)
    # parse the stream into a PDB Structure object
    if not pdb_parser:
        pdb_parser = PDBParser(
            QUIET=True,
            PERMISSIVE=True,
            structure_builder=SloppyStructureBuilder(),
        )
    structure = pdb_parser.get_structure(protein_id, stream)
    return structure