# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Helpers for parsing protein structure files and generating contact maps.
"""
import gzip
from io import StringIO
from typing import Union, List, Optional, Sequence

import numpy as np
import pandas as pd
from Bio.PDB.Polypeptide import three_to_one, is_aa
from Bio.PDB import PDBParser, MMCIFParser, Structure, Chain, Residue

# Default atoms extracted per residue. Kept as an immutable tuple so it can
# be shared safely as a default argument value.
BACKBONE_ATOMS = ("N", "CA", "C", "O")


def gunzip_to_ram(gzip_file_path: str) -> StringIO:
    """Decompress a gzip file and return its UTF-8 text as an io.StringIO.

    Args:
        gzip_file_path: path to the gzip-compressed file.

    Returns:
        An in-memory text buffer holding the whole decompressed content.
    """
    # Read as bytes and decode once so line endings are kept exactly as stored.
    with gzip.open(gzip_file_path, "rb") as f:
        return StringIO(f.read().decode("utf-8"))


def _parse_structure(
    parser: Union[PDBParser, MMCIFParser], name: str, file_path: str
) -> Optional[Structure]:
    """Parse a .pdb or .cif file into a structure object.

    The file can be gzipped, in which case it is decompressed in memory
    before being handed to the parser.

    Args:
        parser: the parser used to read the file.
        name: identifier passed to ``parser.get_structure``.
        file_path: path to the structure file; may be null (None / NaN).

    Returns:
        The parsed structure, or None when ``file_path`` is null.
    """
    if pd.isnull(file_path):
        return None
    if file_path.endswith(".gz"):
        return parser.get_structure(name, gunzip_to_ram(file_path))
    # not gzipped
    return parser.get_structure(name, file_path)


parse_pdb_structure = _parse_structure  # for backward compatibility


def parse_structure(
    pdb_parser: PDBParser, cif_parser: MMCIFParser, name: str, file_path: str
) -> Optional[Structure]:
    """Parse a .pdb file or .cif file into a structure object.

    The file can be gzipped. The parser is chosen from the file name: names
    ending with "pdb" (after an optional ".gz" suffix) use ``pdb_parser``,
    everything else uses ``cif_parser``.

    Args:
        pdb_parser: parser used for PDB files.
        cif_parser: parser used for all other files.
        name: identifier passed to the parser.
        file_path: path to the structure file; may be null (None / NaN).

    Returns:
        The parsed structure, or None when ``file_path`` is null.
    """
    if pd.isnull(file_path):
        return None
    # Remove the ".gz" suffix explicitly: str.rstrip(".gz") would strip any
    # trailing run of the characters '.', 'g' and 'z', not the suffix.
    base_path = file_path[: -len(".gz")] if file_path.endswith(".gz") else file_path
    if base_path.endswith("pdb"):
        return _parse_structure(pdb_parser, name, file_path)
    return _parse_structure(cif_parser, name, file_path)


def three_to_one_standard(res: Union[Residue, str]) -> str:
    """Return the one-letter code of a residue, encoding non-standard AA to X.

    Args:
        res: a residue object (anything exposing ``get_resname()``) or a
            three-letter residue name.

    Returns:
        The one-letter amino acid code, or "X" for non-standard residues.
    """
    resname = res if isinstance(res, str) else res.get_resname()
    if not is_aa(resname, standard=True):
        return "X"
    # NOTE(review): is_aa appears to compare case-insensitively while
    # three_to_one expects the upper-case code, so normalize the case here.
    return three_to_one(resname.upper())


def is_aa_by_target_atoms(res: Residue) -> bool:
    """Tell if a Residue object is AA by checking it has N, CA, C and O atoms."""
    try:
        for atom in BACKBONE_ATOMS:
            res[atom]
    except KeyError:
        return False
    return True


def get_atom_coords(
    residue: Residue, target_atoms: Sequence[str] = BACKBONE_ATOMS
) -> np.ndarray:
    """Extract the coordinates of the target_atoms from an amino acid residue.

    Atoms missing from the residue (lookup raises KeyError) are filled with
    three NaN values instead of raising.

    Args:
        residue: the residue to read atoms from.
        target_atoms: names of the atoms to extract, in output order.

    Returns:
        An array with one row of coordinates per target atom.
    """
    atom_coords = []
    for atom in target_atoms:
        try:
            coord = residue[atom].coord
        except KeyError:
            coord = [np.nan] * 3
        atom_coords.append(coord)
    return np.asarray(atom_coords)


def chain_to_coords(
    chain: Chain,
    target_atoms: Sequence[str] = BACKBONE_ATOMS,
    name: str = "",
) -> Optional[dict]:
    """Convert a protein chain to coordinates of target atoms from all residues.

    Only residues accepted by ``is_aa`` are considered.

    Args:
        chain: the chain to convert.
        target_atoms: names of the atoms to extract for each residue.
        name: prefix used to build the record name.

    Returns:
        A dict with keys "seq" (one-letter sequence), "coords" (nested list
        of per-residue, per-atom coordinates) and "name" ("<name>-<chain id>"),
        or None when the chain has no or only 1 AA.
    """
    # Filter once so the sequence and the coordinates cover the same residues.
    residues = [res for res in chain.get_residues() if is_aa(res)]

    # get AA sequence in the pdb structure
    pdb_seq = "".join(three_to_one_standard(res.get_resname()) for res in residues)
    if len(pdb_seq) <= 1:
        # has no or only 1 AA in the chain
        return None

    # get the atom coords
    coords = np.asarray(
        [get_atom_coords(res, target_atoms=target_atoms) for res in residues]
    )
    return {
        "seq": pdb_seq,
        "coords": coords.tolist(),
        "name": "{}-{}".format(name, chain.id),
    }


def parse_pdb_file_to_json_record(
    pdb_parser: Union[PDBParser, MMCIFParser],
    pdb_file_path: str,
    name: str = "",
) -> List[dict]:
    """Parse a protein structure file to extract all the chains to json records.

    Args:
        pdb_parser: parser used to read the file.
        pdb_file_path: path to the structure file (optionally gzipped).
        name: identifier for the structure; also prefixes each record name.

    Returns:
        A list with one record (see ``chain_to_coords``) per usable chain.
        The list is empty when parsing raises (the error is printed) or when
        no structure is returned for a null path.
    """
    try:
        struct = parse_pdb_structure(pdb_parser, name, pdb_file_path)
    except Exception as e:
        # Deliberate best-effort: report the failure and keep going.
        print(pdb_file_path, "raised an error:")
        print(e)
        return []

    if struct is None:
        # Null file path: there is nothing to extract.
        return []

    records = []
    chain_ids = set()
    for chain in struct.get_chains():
        if chain.id in chain_ids:
            # skip duplicated chains
            continue
        chain_ids.add(chain.id)
        record = chain_to_coords(chain, name=name)
        if record is not None:
            records.append(record)
    return records