import re import os def parse_fasta(data): data = re.sub('>$', '', data, flags=re.M) lines = [ l.replace('\n', '') for prot in data.split('>') for l in prot.strip().split('\n', 1) ][1:] tags, seqs = lines[::2], lines[1::2] tags = [t.split()[0]+'_'+t.split()[6] for t in tags] return tags, seqs cameo_path = '/fsx-shared/openfold/cameo/' test_squences_path = cameo_path + '/cameo_protein_targets.fasta' # Gather input sequences with open(test_squences_path, "r") as fp: data = fp.read() tags, seqs = parse_fasta(data) if not os.path.isdir(cameo_path+"/cameo-fastas"): os.mkdir(cameo_path+"/cameo-fastas") for tag, seq in zip(tags, seqs): tmp_fasta_path = os.path.join(cameo_path , 'cameo-fastas', f"tmp_{tag}.fasta") with open(tmp_fasta_path, "w") as fp: fp.write(f">{tag}\n{seq}")