# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. # SPDX-License-Identifier: MIT-0 """ # Convert DiffDock's output into the same format with CASF-2016 Docking data. The output from DiffDock is a PDB file, which contains a single chain representing the pose of the small molecule ligand. Example usage: python preprocess_diffdock_output.py \ --data_dir /home/ec2-user/SageMaker/efs/data/DiffDockData/inference1 \ --pdb_data_dir /home/ec2-user/SageMaker/efs/data/DiffDockData/PDBBind_processed \ --thres 6 \ --output_dir /home/ec2-user/SageMaker/efs/data/DiffDockData/inference1_processed_t6 In this pipeline, we perform the following steps to convert the outputs from DiffDock to a format compatible with our affinity prediction model: 1. parse the ligand pose PDB file 2. combine the ligand with protein PDB into the same coordinate system 3. subset the protein chain(s) to only include the residues around the ligand -> pocket-ligand structure 4. save the pocket-ligand structures into pickles; save the RMSD values into text files After these steps, we should be able to run inference using our affinity prediction model in a similar settings in `evaluate_casf2016.py` function `evaluate_docking`. """ import os import pickle import argparse from tqdm import tqdm from rdkit import Chem import pandas as pd import numpy as np from scipy.spatial.distance import cdist from Bio.PDB import PDBParser, Select, PDBIO from Bio.PDB.Polypeptide import is_aa from ppi.data_utils import parse_pdb_structure def get_calpha_coords(residue): try: return residue["CA"].coord except KeyError: return [np.nan] * 3 def get_contact_residues(ligand_mol, chain, thres=10): # get the residue IDs res_ids = np.asarray( [res.id[1] for res in chain.get_residues() if is_aa(res)] ) # ligand coords coords1 = ligand_mol.GetConformers()[0].GetPositions() # extract the C-alpha coordinates of all AA residues coords2 = np.asarray( [get_calpha_coords(res) for res in chain.get_residues() if is_aa(res)] ) # calculate interchain distance dist = cdist(coords1, coords2) dist_bool = dist <= thres res_keep = res_ids[dist_bool.sum(axis=0) > 0] return res_keep def get_contact_residues_across_chains(ligand_mol, protein, thres=10): d_chain_residues = {} for chain in protein.get_chains(): res_keep = get_contact_residues(ligand_mol, chain, thres=thres) d_chain_residues[chain.id] = res_keep return d_chain_residues def subset_pdb_structure(structure, d_chain_residues, outfile): # to subset the protein structure class ResSelect(Select): def accept_residue(self, res): if res.id[1] in d_chain_residues.get(res.parent.id, set()): return True else: return False io = PDBIO() # set the structure as the entire protein io.set_structure(structure) # subset and save the pocket into PDB file io.save(outfile, ResSelect()) return def process_one(row, pdb_parser, args): # 1. Parse ligand poses from PDB files ligand_mol = Chem.MolFromPDBFile( os.path.join(args.data_dir, row["pdb_file"]), sanitize=False ) # 2. combine the ligand with protein PDB into the same coordinate system # parse the corresponding protein protein = parse_pdb_structure( pdb_parser, row.pdb_id, os.path.join( args.pdb_data_dir, row["pdb_id"], f"{row['pdb_id']}_protein_processed.pdb", ), ) # 3. subset the protein chain(s) to only include the residues around the # ligand d_chain_residues = get_contact_residues_across_chains( ligand_mol, protein, thres=args.thres ) subset_pdb_structure( protein, d_chain_residues, os.path.join(args.output_dir, f"{row['pdb_id']}_chopped.pdb"), ) protein_pocket = parse_pdb_structure( pdb_parser, row.pdb_id, os.path.join(args.output_dir, f"{row['pdb_id']}_chopped.pdb"), ) # 4. write to pickle output = (ligand_mol, None, protein_pocket, None) output_file = os.path.join(args.output_dir, "data", row["file_id"]) pickle.dump(output, open(output_file, "wb")) return def main(args): os.makedirs(args.output_dir, exist_ok=True) sub_dirs = ["decoys_docking_rmsd", "data", "keys"] for sub_dir in sub_dirs: os.makedirs(os.path.join(args.output_dir, sub_dir), exist_ok=True) # 0. parse metadata from DiffDock output files meta_df = [] for pdb_file in os.listdir(args.data_dir): if pdb_file.endswith(".pdb"): row = { "id": pdb_file[:-4], "pdb_file": pdb_file, "pdb_id": pdb_file.split("_")[0], "rank": int(pdb_file.split("_")[1]), "rmsd": float(pdb_file.split("_")[2]), "confidence": float(pdb_file.split("_")[3][:-4]), } row["file_id"] = row["pdb_id"] + "_" + row["id"].split("_")[1] meta_df.append(row) meta_df = pd.DataFrame(meta_df).set_index("id", verify_integrity=True) print(meta_df.shape) pdb_parser = PDBParser( QUIET=True, PERMISSIVE=True, ) for _, row in tqdm(meta_df.iterrows(), total=meta_df.shape[0]): process_one(row, pdb_parser, args) # Write RMSD files for pdb_id, sub_df in meta_df.groupby("pdb_id"): out_rmsd_filename = f"{pdb_id}_rmsd.dat" sub_df[["file_id", "rmsd"]].to_csv( os.path.join( args.output_dir, "decoys_docking_rmsd", out_rmsd_filename ), sep="\t", index=False, ) # Write keys and pdb_to_affinity.txt keys = list(meta_df["file_id"]) pickle.dump( keys, open(os.path.join(args.output_dir, "keys/test_keys.pkl"), "wb") ) pdb_to_affinity = meta_df[["file_id"]] pdb_to_affinity.loc[:, "affinity"] = 0 pdb_to_affinity.to_csv( os.path.join(args.output_dir, "pdb_to_affinity.txt"), sep="\t", index=False, header=False, ) return if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "--data_dir", type=str, required=True, help="Directory to the output ligand poses in pdb files from DiffDock", ) parser.add_argument( "--pdb_data_dir", type=str, required=True, help="Directory to oringal protein pdb files", ) parser.add_argument( "--output_dir", type=str, required=True, help="Output directory", ) parser.add_argument( "--thres", type=int, required=True, default=10, help="Threshold for identifying contact residues", ) args = parser.parse_args() main(args)