#!/usr/bin/env python3 import os import sys import numpy import json import gzip import subprocess import SequencePairing as sp from string import Template import time import pickle import itertools import math from config import para_json from utils import job_queue, submitjob, command_header, fasta2len from utils import read_sequence_from_fasta chainstr='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz' docstring=''' MSA_combination.py MSA combination for all component chains. Usage: MSA_combination.py [option] required options: -o=/home/simth/test (/home/simith/test should contain the fasta sequence file seq.fasta) ''' def getseqs(seq_path,seqName): ''' return no-reduandant sequences and mapping ''' titles=[] mapping={} ### mapping['title']=seqName+'-A/B/C' etc A2B3 five chain the A copy 2 B copy 3, but AF2 need A (ch1) B (ch1) C (ch2) D (ch2) E (ch2) mapping sequences={} #### sequences[seqName-A]='xxxxxx' copys={} seq_file=open(seq_path,'r') seqtext='\n'+seq_file.read() seq_file.close() seqblocks=seqtext.split('\n>') for block in seqblocks[1:]: #print(block) tmpstrs=block.split('\n') title=tmpstrs[0] seq='' for seqstr in tmpstrs[1:]: seq+=seqstr if not titles.__contains__(title): titles.append(title) mapping[title]=seqName+'-'+chainstr[len(titles)-1] copys[mapping[title]]=1 if list(sequences.values()).__contains__(seq): print("check your sequence title") exit(0) else: sequences[mapping[title]]=seq else: copys[mapping[title]]+=1 #print(titles) #print(mapping) #print(sequences) #print(copys) return titles,mapping,sequences,copys def getjsonseqs(myjson,seqName): ''' return no-reduandant sequences and mapping ''' titles={} mapping={} ### mapping['title']=seqName+'-A/B/C' etc A2B3 five chain the A copy 2 B copy 3, but AF2 need A (ch1) B (ch1) C (ch2) D (ch2) E (ch2) mapping sequences={} #### sequences[seqName-A]='xxxxxx' copys={} chains=[] mapping2={} for key in myjson.keys(): chain=key title=myjson[key]['description'] sequence=myjson[key]['sequence'] chains.append(chain) titles[chain]=title if not list(sequences.values()).__contains__(sequence): mapping[chain]=seqName+'-'+chain mapping2[chain]=chain copys[chain]=1 else: for ch in sequences.keys(): if sequences[ch]==sequence: mapping[chain]=seqName+'-'+ch mapping2[chain]=ch copys[ch]+=1 break sequences[chain]=sequence print(titles) print(chains) print(mapping) print(mapping2) print(sequences) print(copys) return chains,titles,sequences,mapping,mapping2,copys def getlinearseqs(file_path): seqfile=open(file_path,'r') text='\n'+seqfile.read() seqfile.close() titles=[] seqs=[] blocks=text.split('\n>')[1:] #print len(blocks) for block in blocks: tmpstrs=block.split('\n') titles.append(tmpstrs[0]) seq='' for tmpstr in tmpstrs[1:]: seq+=tmpstr seqs.append(seq) return titles,seqs def writeseq(title,seq,seq_path): seq_file=open(seq_path,'w') seq_file.write('>'+title+'\n') seq_file.write(seq+'\n') seq_file.close() def getids(idfile_path,title=False): ids=[] idfile=open(idfile_path,'r') lines=idfile.readlines() idfile.close() start=0 if title: start=1 for line in lines[start:]: ids.append(line.strip('\n').split()[0]) return ids def getids2(idfile_path,topN,title=False): ids=[] idfile=open(idfile_path,'r') lines=idfile.readlines() idfile.close() start=0 if title: start=1 for line in lines[start:]: ids.append(line.strip('\n').split()[0]) return ids[0:min(topN,len(ids))] def getlines(idfile_path,title=False): ids=[] idfile=open(idfile_path,'r') lines=idfile.readlines() idfile.close() start=0 if title: start=1 for line in lines[start:]: ids.append(line.strip('\n')) return ids def getlines2(idfile_path,topN,title=False): ids=[] idfile=open(idfile_path,'r') lines=idfile.readlines() idfile.close() start=0 if title: start=1 for line in lines[start:]: ids.append(line.strip('\n')) return ids[0:min(topN,len(ids))] def combineMSAs(chains,MSA_Tags): #dict combined_MSA_Tags=[] x=[] for chain in chains: x.append(MSA_Tags[chain]) combined_MSA_Tags=list(itertools.product(*x)) return combined_MSA_Tags def getTopN(maxP,chainN,ctype="heteromer"): topN=10 if ctype=="oligomer": topN=50 if ctype=="heteromer": topN=math.floor(math.pow(maxP,1.0/chainN)) return topN def readjson(json_path): f=open(json_path,'r') myjson=json.load(f) f.close() return myjson if __name__=="__main__": datadir = "" argv = [] for arg in sys.argv[1:]: if arg.startswith("-o="): datadir = os.path.abspath(arg[len("-o="):]) elif arg.startswith('-'): sys.stderr.write("ERROR! No such option %s\n" % arg) exit() else: argv.append(arg) if len(argv) != 0: sys.stderr.write(docstring) exit() if not os.path.exists(os.path.join(datadir,"seq.fasta")): print("There is no sequence file seq.fasta in the datadir %s"%datadir) sys.stderr.write(docstring) exit(1) _,psequences=read_sequence_from_fasta(os.path.join(datadir,"seq.fasta")) protein_type="multimer" if len(psequences)==1: protein_type="monomer" seqName=os.path.basename(datadir) print("protein type=%s"%protein_type) if protein_type=="multimer": json_path=os.path.join(datadir,"ChainMapping","seq","msas","chain_id_map.json") Version_Tags=['multi2.2-v4'] MSA_list="MSA_ranking.info" heteromer_maxP=para_json['alphafoldm_MaxPairs'] alphafoldm_Max_MSA_filter=para_json['alphafoldm_Max_MSA_filter'] print(os.path.join(datadir,"seq.fasta")) if not os.path.exists(os.path.join(datadir,"seq.fasta")): print("sequence file does not exist!") exit(0) myjson=readjson(json_path) chains,titles,sequences,mapping,mapping2,copys=getjsonseqs(myjson,seqName) Modelsdir=os.path.join(datadir,"CombinedMSAs") os.system("mkdir -p "+Modelsdir) if len(copys)==1: #### oligomer print("oligomer") AllMSAlist_path=os.path.join(datadir,seqName+'-'+list(copys.keys())[0],'finalMSAs',MSA_list) if not os.path.exists(AllMSAlist_path): print("All MSA list for oligomer target %s is not yet generated!"%seqName) exit(0) topN=getTopN(heteromer_maxP,1,'oligomer') MSA_Tags=getids2(AllMSAlist_path,topN,title=True) alllist=open(os.path.join(Modelsdir,'all.list'),'w') for msatag in MSA_Tags: alllist.write(msatag+'\n') alllist.close() for version_tag in Version_Tags: for MSA in MSA_Tags: print(MSA) a3m_path=os.path.join(datadir,seqName+'-'+list(copys.keys())[0],"AllMSAs",MSA,"seq.a3m") if os.path.exists(a3m_path): os.system("mkdir -p %s"%os.path.join(Modelsdir,MSA)) os.system("mkdir -p %s"%os.path.join(Modelsdir,MSA,version_tag,"seq","msas","A")) os.system("cp -f %s %s "%(a3m_path,os.path.join(Modelsdir,MSA,version_tag,"seq","msas","A","alphafold2.a3m"))) os.system("cp -f %s %s "%(os.path.join(datadir,"seq.fasta"),os.path.join(Modelsdir,MSA,version_tag,"seq.fasta"))) else: print("a3m file %s does not exist"%a3m_path) ###### copy MSAs to finalMSAs folder final_list_file=os.path.join(Modelsdir,'all.list') finalMSAs_dir=os.path.join(datadir,"finalMSAs") os.system("mkdir -p %s"%finalMSAs_dir) final_msa_tags=getids(final_list_file) final_msa_lines=getlines(os.path.join(datadir,seqName+"-A","finalMSAs","MSA_ranking.info")) for final_msa_tag in final_msa_tags: os.system("mkdir -p %s/%s/"%(finalMSAs_dir,final_msa_tag)) os.system("cp -r %s/%s/multi2.2-v4/seq/msas/* %s/%s/"%(Modelsdir,final_msa_tag,finalMSAs_dir,final_msa_tag)) MSA_rank_file=open(os.path.join(finalMSAs_dir,'MSA_ranking.info'),'w') #MSA_rank_file.write("MSA_tags pLDDT\n") for line in final_msa_lines: MSA_rank_file.write(line+'\n') MSA_rank_file.close() else: # for hetromer MSA_Tags={} nrchains=[] topN=getTopN(heteromer_maxP,len(copys),"heteromer") print("topN=%s"%str(topN)) #exit(1) for key in list(copys.keys()): chain=key nrchains.append(key) AllMSAlist_path=os.path.join(datadir,seqName+'-'+chain,'finalMSAs',MSA_list) if not os.path.exists(AllMSAlist_path): print("MSA list for %s of the target %s is not yet generated!"%(chainName,seqName)) exit(0) MSA_Tags[chain]=getids2(AllMSAlist_path,topN,title=True) ######## combine MSAs ####### combined_MSA_Tags=combineMSAs(nrchains,MSA_Tags) print(len(combined_MSA_Tags)) print(combined_MSA_Tags) #exit(2) combinedMSAlist=open(os.path.join(Modelsdir,'all.list'),'w') for combined_MSA_Tag in combined_MSA_Tags: # c(qMSA,DeepMSA.hhb,DeepJGI3) MSA='' for i in range(0,len(nrchains)): MSA+=combined_MSA_Tag[i]+'-' MSA=MSA.strip('-') print(MSA) combinedMSAlist.write(MSA+'\n') os.system("mkdir -p %s"%os.path.join(Modelsdir,MSA)) combined_MSA_Tag_dict={} for jjj in range(0,len(list(copys.keys()))): combined_MSA_Tag_dict[list(copys.keys())[jjj]]=combined_MSA_Tag[jjj] #print(combined_MSA_Tag_dict) for version_tag in Version_Tags: os.system("mkdir -p %s"%os.path.join(Modelsdir,MSA,version_tag)) os.system("cp -f %s %s "%(os.path.join(datadir,"seq.fasta"),os.path.join(Modelsdir,MSA,version_tag,"seq.fasta"))) #mj=0 for mkey in list(mapping.keys()): os.system("mkdir -p %s"%os.path.join(Modelsdir,MSA,version_tag,"seq","msas",mkey)) a3m_path=os.path.join(datadir,mapping[mkey],"AllMSAs",combined_MSA_Tag_dict[mapping2[mkey]],"seq.a3m") os.system("cp -f %s %s "%(a3m_path,os.path.join(Modelsdir,MSA,version_tag,"seq","msas",mkey,"alphafold2.a3m"))) #mj+=1 combinedMSAlist.close() ############## MSA filter ####### MSA_filter=para_json['joint_MSA_filter'] pkgdir=para_json['programrootpath'] calNf=os.path.join(pkgdir,'bin','calNf') if True: #MSA_filter==True: Neff_dict={} #{key=joint_msatag, value=joint_neff} pLDDT_dict={} #{key=ch_id, value={key=single_msatag, value=pLDDT}} Fscores=[] joint_MSAs=[] for ch in copys.keys(): pLDDT_dict[ch]={} plddtfile=open(os.path.join(datadir,seqName+'-'+ch,'finalMSAs','MSA_ranking.info'),'r') for plddtline in plddtfile.readlines()[1:]: Mtag=plddtline.strip('\n').split()[0] pLDDT_dict[ch][Mtag]=float(plddtline.strip('\n').split()[1]) plddtfile.close() for combined_MSA_Tag in combined_MSA_Tags: # c(qMSA,DeepMSA.hhb,DeepJGI3) MSA='' for i in range(0,len(nrchains)): MSA+=combined_MSA_Tag[i]+'-' MSA=MSA.strip('-') joint_MSAs.append(MSA) msa_datadir=os.path.join(Modelsdir,MSA,version_tag,"seq","msas") ######## link sequences print("Pairing Sequences for joint MSA %s"%MSA) sp.sequence_pairing_from_data(json_path,msa_datadir) sp.full_sequence_pairing_from_data(json_path,msa_datadir,add_nonpaired=False) sp.full_sequence_pairing_from_data(json_path,msa_datadir,add_nonpaired=True) print("Calculate Neff for joint MSA %s"%MSA) joint_MSA_path=os.path.join(Modelsdir,MSA,version_tag,"seq","msas","paired.aln") Neff_MSA_path=os.path.join(Modelsdir,MSA,version_tag,"seq","msas","paired.neff") cmd="%s %s >%s"%(calNf,joint_MSA_path,Neff_MSA_path) #print(cmd) sub=subprocess.run(cmd, stdout=subprocess.PIPE, shell=True).stdout.decode("utf-8").strip('\n') #### read neff nefffile=open(Neff_MSA_path,'r') neff=float(nefffile.readline()) nefffile.close() Neff_dict[MSA]=neff fscore=0.0 Nchain=0 for i in range(0,len(nrchains)): fscore+=copys[nrchains[i]]*pLDDT_dict[nrchains[i]][combined_MSA_Tag[i]] Nchain+=copys[nrchains[i]] fscore=fscore/Nchain fscore=math.log(Neff_dict[MSA],10)*fscore Fscores.append(fscore) #if MSA_filter==True: ############ sort MSAs ######## print("Sorting joint MSAs!") np_Fscores=numpy.array(Fscores) indexes=numpy.argsort(-np_Fscores) outfile_path=os.path.join(Modelsdir,'all.sort') outfile=open(outfile_path,'w') for index in indexes: print("%s %s"%(joint_MSAs[index],str(Fscores[index]))) outfile.write("%s %s\n"%(joint_MSAs[index],str(Fscores[index]))) outfile.close() ###### copy MSAs to finalMSAs folder final_list_file=os.path.join(Modelsdir,'all.list') finalMSAs_dir=os.path.join(datadir,"finalMSAs") os.system("mkdir -p %s"%finalMSAs_dir) final_msa_tags=getids(final_list_file) final_msa_lines=getlines(final_list_file) if len(copys)>=1: if MSA_filter==True: #### oligomer final_list_file=os.path.join(Modelsdir,'all.sort') final_msa_tags=getids2(final_list_file,alphafoldm_Max_MSA_filter) final_msa_lines=getlines2(final_list_file,alphafoldm_Max_MSA_filter) else: final_list_file=os.path.join(Modelsdir,'all.sort') final_msa_tags=getids(final_list_file) final_msa_lines=getlines(final_list_file) for final_msa_tag in final_msa_tags: os.system("mkdir -p %s/%s/"%(finalMSAs_dir,final_msa_tag)) os.system("cp -r %s/%s/multi2.2-v4/seq/msas/* %s/%s/"%(Modelsdir,final_msa_tag,finalMSAs_dir,final_msa_tag)) MSA_rank_file=open(os.path.join(finalMSAs_dir,'MSA_ranking.info'),'w') MSA_rank_file.write("MSA_tags Mscore\n") for line in final_msa_lines: MSA_rank_file.write(line+'\n') MSA_rank_file.close() #os.system("cp -r %s %s"%(final_list_file,finalMSAs_dir)) else: print("monomer protein, no MSA combination and pairing is needed!")