"""
This script will 
a) generate .fasta and .pssm files for each individual protein sequence. 
b) process the pssms into a dict with the sequence IDs. 

# May first need to run perl /home/ec2-user/SageMaker/efs/data/CAMP/tools/ncbi-blast-2.13.0+/bin/update_blastdb.pl swissprot.fa

This script will run the following psiblast command for each sequence, to generate the PSSM: 
$psiblast -db swissprot -query $outdir/tmp/1a61_i.fasta -num_iterations 3 -evalue 0.001 -out_ascii_pssm test.pssm
"""

import pandas as pd
import numpy as np
import os
import subprocess
import pickle

### Generate Protein PSSM Files  
psiblast="/home/ec2-user/SageMaker/efs/data/CAMP/tools/ncbi-blast-2.13.0+/bin/psiblast "
blastdb='export BLASTDB=/home/ec2-user/SageMaker/efs/data/CAMP/tools/ncbi-blast-2.13.0+/bin/; '
psiblast_opts = '-db ' + swissprotdb + ' -num_iterations 3 -evalue 0.001 '
swissprotdb='swissprot'

querytype = 'prot' # prot or peptide

fasta_file = './step2/' + querytype + '_sequences.fasta'

outdir = os.path.abspath('./step3/')
output_pssm_dict = outdir + 'pssm_' + querytype + '.pkl'

if not os.path.exists(os.path.abspath(outdir + '/tmp/')):
    os.makedirs(os.path.abspath(outdir + '/tmp'))  

def get_pssm(fasta_file, outdir, psiblast=psiblast, psiblast_opts=psiblast_opts):
    ## This command runs psiblast for each sequence 
    ## (each sequence must be a separate fasta file, so we create temp subset files from main fasta)
    # First we get the input ids and sequences
    with open(fasta_file, 'r') as f:
        id_list = []
        seq_list = []
        for line in f.readlines():
            line = line.strip()
            if line.startswith('>'):
                id_list.append(line)
            else:
                if(len(line) > 0):
                    seq_list.append(line)
    # Then run psiblast for each sequence
    longseq_pred = {}
    for ix in range(len(id_list)):
        # Write out each sequence as a fasta file
        outfile_seq = os.path.abspath(outdir + '/tmp/' + id_list[ix].strip('>') + '.fasta')
        outfile_pssm = os.path.abspath(outdir + '/tmp/' + id_list[ix].strip('>') + '.pssm')
        if not os.path.exists(outfile_pssm):
            with open(outfile_seq, 'w') as outfile:
                for elem in [id_list[ix], seq_list[ix]]:
                    outfile.write(elem + '\n')
            # Run that sequence through psiblast prediction (long, short)
            process = subprocess.Popen([
                blastdb + 
                psiblast + psiblast_opts + '-query ' + outfile_seq + ' -out_ascii_pssm ' + outfile_pssm], stdout=subprocess.PIPE,stderr=subprocess.PIPE, shell=True)
            stdout, stderr = (process.communicate())
    return id_list

proteins_list = get_pssm(fasta_file = fasta_file, outdir = outdir)

### Load Protein PSSM Files (first change the value of protein_number)
# prot_pssm_dict : key is protein sequence, value is protein PSSM Matrix
prot_pssm_dict_all={}
prot_pssm_dict={}
protein_num = len(proteins_list) ### NEED TO BE CHANGED TO the total number of protein sequences
inputs_dir = os.path.abspath(outdir + "tmp/")
for protid in proteins_list:
    filename_pssm = protid.strip('>') + '.pssm' # need to name each individual fasta and pssm file with the same prefix
    filename_fasta = protid.strip('>') + '.fasta'
    prot_key = protid.strip('>') # 'new_prot_'+str(i)
    pssm_line_list= []
    
    with open(inputs_dir+'/'+filename_fasta,'r') as f: # directory to store fasta files (single file of each protein)
        for line in f.readlines():
            prot_seq = line.strip()
    
    with open(inputs_dir+'/'+filename_pssm,'r') as f:  # directory to store pssm files (single file of each protein)
        for line in f.readlines()[3:-6]:
            line_list = line.strip().split(' ')
            line_list = [x for x in line_list if x!=''][2:22]
            line_list = [int(x) for x in line_list]
            if len(line_list)!=20:
                print('Error line:')
                print(line_list)
            pssm_line_list.append(line_list)
        pssm_array = np.array(pssm_line_list)
        if pssm_array.shape[1]!=20:
            print('Error!')
            print(filename_pssm)
        else:
            prot_pssm_dict_all[prot_key] = (prot_seq,pssm_array)
            prot_pssm_dict[prot_seq]=pssm_array

with open(output_pssm_dict,'wb') as f:  # 'output_pssm_dict' is the name of the output dict you like
    pickle.dump(prot_pssm_dict,f)