import argparse import os, sys import json import tarfile import warnings import pymol from pymol import cmd, stored import numpy as np # from sagemaker.experiments.run import load_run def rmsd_cur(mol0, mol1, sel='*'): """ Computes the root mean square deviation from the current coordinates of two pairs of equivalent atoms. Does not perform a superposition. Parameters ---------- mol0 : PyMOL object mol1 : PyMOL object sel : PyMOL selection, atoms used to compute rmsd. e.g. use ca+c+n for the backbone """ model0 = cmd.get_model('%s and name %s' % (mol0, sel)) model1 = cmd.get_model('%s and name %s' % (mol1, sel)) xyz0 = np.array(model0.get_coord_list()) xyz1 = np.array(model1.get_coord_list()) rmsd = (np.sum((xyz0 - xyz1 )**2)/len(xyz0))**0.5 return rmsd def rmsd_fit(mol0, mol1, sel='*', fit=True): """ Computes the root mean square deviation from two pairs of equivalent atoms after superposition. Parameters ---------- mol0 : PyMOL object mol1 : PyMOL object sel : PyMOL selection. atoms used to compute rmsd. e.g. use ca+c+n for the backbone fit : bool. If false computes the rmsd after superposition, but without updating the coordinates """ xyz0 = np.array(cmd.get_model('%s and name %s' % (mol0, sel)).get_coord_list()) xyz1 = np.array(cmd.get_model('%s and name %s' % (mol1, sel)).get_coord_list()) xyz0_all = np.array(cmd.get_model('%s' % mol0).get_coord_list()) xyz1_all = np.array(cmd.get_model('%s' % mol1).get_coord_list()) # Translation X = xyz0 - xyz0.mean(axis=0) Y = xyz1 - xyz1.mean(axis=0) # Covariation matrix Cov_matrix = np.dot(Y.T, X) # Optimal rotation matrix U, S, Wt = np.linalg.svd(Cov_matrix) # Create Rotation matrix R R = np.dot(U, Wt) # Ensure a right-handed coordinate system if np.linalg.det(R) < 0.: S[-1] = -S[-1] Wt[-1] *= -1 R = np.dot(U, Wt) if fit: # center the first molecule stored.sel0 = list(xyz0_all - xyz0.mean(axis=0)) # rotate and translate the second molecule stored.sel1 = list(np.dot((xyz1_all - xyz1.mean(0)), R)) #update the changes to the coordinates cmd.alter_state(1, mol0,"(x,y,z)=stored.sel0.pop(0)") cmd.alter_state(1, mol1,"(x,y,z)=stored.sel1.pop(0)") # We compute the RMSD after superposition by using the matrix S. The advantage is # we do not need to actually do the superposition before computing the RMSD. #rmsd = (np.exp(np.log(np.sum(X ** 2) + np.sum(Y ** 2)) - 2.0 * np.log(np.sum(S)))/len(X))**0.5 rmsd = ((np.sum(X ** 2) + np.sum(Y ** 2) - 2.0 * np.sum(S))/len(X))**0.5 # scales and translates the window to show a selection cmd.zoom() return rmsd def tm_score(mol0, mol1, sel='*'): #Check if TM-align use all atoms! """ Compute TM-score between two set of coordinates Parameters ---------- mol0 : PyMOL object mol1 : PyMOL object sel : PyMOL selection, atoms used to compute rmsd. e.g. use ca+c+n for the backbone """ xyz0 = np.array(cmd.get_model('%s and name %s' % (mol0, sel)).get_coord_list()) xyz1 = np.array(cmd.get_model('%s and name %s' % (mol1, sel)).get_coord_list()) L = len(xyz0) # d0 is less than 0.5 for L < 22 # and nan for L < 15 (root of a negative number) d0 = 1.24 * np.power(L - 15, 1/3) - 1.8 d0 = max(0.5, d0) # compute the distance for each pair of atoms di = np.sum((xyz0 - xyz1) ** 2, 1) # sum along first axis return np.sum(1 / (1 + (di / d0) ** 2)) / L # what to be the input? # two model.tar.gz S3 URI? Then they will be available in the instance /opt/ml/processing/{method1,method2} # but how do you access the pdb file? when each method has its own folder structure and naming # perhaps another argument/ENV to specify the pdb file to analyze if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument('--pdb1', type=str, help='PDB file name for the first structure to compare that can be found in the corresponding tar.gz file, e.g. ranked_0.pdb.') parser.add_argument('--pdb2', type=str, help='PDB file name for the second structure to compare that can be found in the corresponding tar.gz file, e.g. XXX_model_1_ptm_relaxed.pdb.') args, _ = parser.parse_known_args() print("Received arguments {}".format(args)) processing_dir='/opt/ml/processing' # processing_dir='/root/protein-folding-on-sagemaker' # open file tarfile1 = os.path.join(processing_dir, "input-pdb1", "model.tar.gz") file1 = tarfile.open(tarfile1) # print file names print(file1.getnames()) foi1=[i for i in file1.getnames() if i.endswith(args.pdb1)] # extract files file1.extract(foi1[0], path=os.path.join(processing_dir, "input-pdb1")) # close file file1.close() # for the second file tarfile2 = os.path.join(processing_dir, "input-pdb2", "model.tar.gz") file2 = tarfile.open(tarfile2) print(file2.getnames()) foi2=[i for i in file2.getnames() if i.endswith(args.pdb2)] # extract files file2.extract(foi2[0], path=os.path.join(processing_dir, "input-pdb2")) file2.close() pdbfile1=os.path.join(processing_dir, "input-pdb1", foi1[0]) pdbfile2=os.path.join(processing_dir, "input-pdb2", foi2[0]) cmd.load(pdbfile1) cmd.load(pdbfile2) cmd.remove('not polymer or hydro') object1 = cmd.get_names()[0] object2 = cmd.get_names()[1] print(object1) print(object2) # compute rmsd_cur rmsd = rmsd_cur(object1, object2, sel='ca') #'ca+c+n' print('%.2f' % rmsd) # compute rmsd_fit rmsd_sp = rmsd_fit(object1, object2, sel='ca', fit=True) print('%.2f' % rmsd_sp) # compute TM score tmscore = tm_score(object1, object2, sel='ca') print('%.4f' % tmscore) report_dict = { "structure_metrics": { "rmsd": {"value": rmsd}, "rmsd_superposition": {"value": rmsd_sp}, "tm_score": {"value": tmscore} }, } output_dir = os.path.join(processing_dir, 'evaluation') # output_dir = '/root/protein-folding-on-sagemaker/evaluation' os.makedirs(output_dir, exist_ok=True) evaluation_path = f"{output_dir}/evaluation.json" with open(evaluation_path, "w") as f: f.write(json.dumps(report_dict))