# -*- coding: utf-8 -*- import pandas as pd """ BEFORE THIS STEP: (Jasleen added): 1a. Prepare pepbdb-2020/pepbdb directory from .tgz download at http://huanglab.phys.hust.edu.cn/pepbdb/db/download/ [DONE] - /home/ec2-user/SageMaker/efs/data/CAMP/paper/pepbdb 1b. Run crawl.py to generate crawl_results.csv [ONGOING] 2. Next, run query-mapping.py that uses inputs crawl_results.csv, and the peptide.pdb files in pepbdb-20200318/pepbdb/{pdbid} - This script outputs peptide-mapping.txt, query_peptide_sequence_index.txt """ # Step 1: According to the "PDB ID-Peptide Chain-Protein Chain" obtained in "step1_pdb_process.py" , retrieve the interacting information with following fields: # ("Peptide ID","Interacting peptide residues","Peptide sequence","Interacting receptor residues","Receptor sequence(s)") a # nd downloading the corresponding "peptide.pdb" files (please put under ./pepbdb-2020/pepbdb/$pdb_id$/peptide.pdb) # Step 2: To map the peptide sequences from PepBDB to the peptide sequences from the peptide sequences from the RCSB PDB() generated in "step1_pdb_process.py"). # Generate query (PepBDB version) sequence file called "query_peptide.fasta" & target (RSCB PDB) fasta sequence files called "target_peptide.fasta" for peptides # We use scripts under ./smith-waterman-src/ to align two versions of peptide sequences. The output is "alignment_result.txt" #python query_mapping.py #to get peptide sequence vectors (the output is "peptide-mapping.txt ") #python target_mapping.py #to get target sequence vector # Step 3: Loading and mapping labels of binding residues for peptide sequences # load peptide-protein pairs & pepBDB files (target : PDB fasta, query : pepBDB) df_train = pd.read_csv('step1/pdb_chain_uniprot-processed.tsv', header=0, sep='\t', comment='#') # The output of "step1_pdb_process.py" df_zy_pep = pd.read_csv('step2/peptide-mapping.txt',header=None,sep='\t') df_zy_pep.columns= ['bdb_id','bdb_pep_seq','pep_binding_vec'] df_zy_pep['pdb_id'] = df_zy_pep.bdb_id.apply(lambda x: x.split('_')[0]) df_zy_pep['pep_chain'] = df_zy_pep.bdb_id.apply(lambda x: x.split('_')[1].lower()) df_zy_pep['prot_chain'] = df_zy_pep.bdb_id.apply(lambda x: x.split('_')[2].upper()) df_zy_pep.drop_duplicates(['bdb_id'],inplace=True) # Since we did not run uniprot based MHC filtering step in Step 1, # We don't have the protein chain column defined. Instead, we do that # modification here (same as 'plip_prot_chain' in skipped step). df_train['prot_chain'] = df_train.predicted_chain.apply(lambda x:\ x.upper()) df_join = pd.merge(df_train, df_zy_pep, how='left', left_on=['pdb_id','pep_chain','prot_chain'],right_on=['pdb_id','pep_chain','prot_chain']) #df_v1 = df_join[['pdb_id','pep_chain','prot_chain','pep_seq','SP_PRIMARY','prot_seq','Protein_families','pep_binding_vec']] df_v1 = df_join[['pdb_id','pep_chain','prot_chain','pep_seq','SP_PRIMARY','prot_seq','pep_binding_vec']] print(df_v1.shape) # impute records that don't have bs information with -99999 def extract_inter_idx(pep_seq,binding_vec): if binding_vec==binding_vec: if len(binding_vec) != len(pep_seq): print('Error length') return '-99999' else: binding_lst = [] for idx in range(len(binding_vec)): if binding_vec[idx]=='1': binding_lst.append(idx) binding_str = ','.join(str(e) for e in binding_lst) return binding_str else: return '-99999' df_v1['binding_idx'] = df_v1.apply(lambda x: extract_inter_idx(x.pep_seq,x.pep_binding_vec),axis=1) #df_part_pair = df_part_all[['pep_seq','prot_seq','binding_idx']] df_part_pair = df_v1[['pep_seq','prot_seq','binding_idx']] df_pos_bs = pd.merge(df_v1,df_part_pair,how='left',left_on=['pep_seq','prot_seq'],right_on=['pep_seq','prot_seq']).drop_duplicates().reset_index() df_pos_bs.to_csv('step2/pdb_pairs_bindingsites', encoding = 'utf-8', index = False, sep = ',') ofile = open('step2/peptide_sequences.fasta', 'w') for i in range(df_pos_bs.shape[0]): ofile.write('>' + df_pos_bs.loc[i, 'pdb_id'] + '_' + df_pos_bs.loc[i, 'pep_chain'] + '\n' + df_pos_bs.loc[i, 'pep_seq'] + '\n') ofile.close() ofile = open('step2/prot_sequences.fasta', 'w') for i in range(df_pos_bs.shape[0]): ofile.write('>' + df_pos_bs.loc[i, 'pdb_id'] + '_' + df_pos_bs.loc[i, 'prot_chain'] + '\n' + df_pos_bs.loc[i, 'prot_seq'] + '\n') ofile.close()