#!/nfs/amino-home/zhng/local_library/anaconda3-tf/bin/python
"""Pair per-chain A3M alignments of a complex by species.

For every chain listed in a JSON description the script reads
``<datadir>/<chain>/alphafold2.a3m``, assigns a species code to each hit from
its header, ranks the hits of every species by sequence identity to the chain
query and concatenates equally ranked hits of the same species across chains.
Chains are joined with a ``*`` marker internally; the marker is removed when
the paired alignments are written.
"""
import json
import os
import re
import sys

import numpy

# Sequences coming from UniProtKB database come in the
# `db|UniqueIdentifier|EntryName` format, e.g. `tr|A0A146SKV9|A0A146SKV9_FUNHE`
# or `sp|P0C2L1|A3X1_LOXLA` (for TREMBL/Swiss-Prot respectively).
# NOTE(review): the first group name was lost in the damaged source and is
# reconstructed as `AccessionIdentifier` (nothing in this file reads it);
# `SpeciesIdentifier` is the name parse_sequence_specie() looks up.
UNIPROT_PATTERN = re.compile(
    r"""
    ^
    # UniProtKB/TrEMBL or UniProtKB/Swiss-Prot
    (?:tr|sp)
    \|
    # A primary accession number of the UniProtKB entry.
    (?P<AccessionIdentifier>[A-Za-z0-9]{6,10})
    # Occasionally there is a _0 or _1 isoform suffix, which we ignore.
    (?:_\d)?
    \|
    # TREMBL repeats the accession ID here. Swiss-Prot has a mnemonic
    # protein ID code.
    (?:[A-Za-z0-9]+)
    _
    # A mnemonic species identification code.
    (?P<SpeciesIdentifier>([A-Za-z0-9]){1,5})
    # Small BFD uses a final value after an underscore, which we ignore.
    (?:_\d+)?
    $
    """,
    re.VERBOSE)

# Sequences coming from Uniref database come in the
# `RepID=xxx_xxx` format, e.g. `RepID=A0A660T7U3_9SPIR`
# or `RepID=UPI00068451ED` (the latter has no species code and never matches).
UNIREF_PATTERN = re.compile(
    r"""
    ^
    # Uniref
    (RepID=)
    # A primary accession number of the UniProtKB entry.
    (?P<AccessionIdentifier>[A-Za-z0-9]{6,10})
    _
    # A mnemonic species identification code.
    (?P<SpeciesIdentifier>([A-Za-z0-9]){1,5})
    $
    """,
    re.VERBOSE)

# A3M marks insertions relative to the query with lowercase letters.
_A3M_INSERTION = re.compile('[a-z]')


def parse_sequence_specie(msa_sequence_identifier: str, dbtype='uniprot'):
    """Return the species code found in an MSA sequence identifier.

    Args:
        msa_sequence_identifier: identifier such as
            `tr|A0A146SKV9|A0A146SKV9_FUNHE` or `RepID=A0A660T7U3_9SPIR`.
        dbtype: 'uniref' selects UNIREF_PATTERN; any other value selects
            UNIPROT_PATTERN.

    Returns:
        The species code as a string, or '' when the identifier does not
        match the selected pattern.
    """
    pattern = UNIREF_PATTERN if dbtype == 'uniref' else UNIPROT_PATTERN
    match = pattern.search(msa_sequence_identifier.strip())
    return match.group('SpeciesIdentifier') if match else ''


def extract_sequence_ids(id: str):
    """Extract the identifier token and database type from a header line.

    When the first whitespace-separated token contains 'uniref' (case
    insensitive) the LAST token is returned with type 'uniref'; otherwise the
    first token is returned with type 'uniprot'. Anything from the first '/'
    onwards (e.g. a residue range) is dropped from the token.

    Returns:
        (identifier, dbtype), or (None, None) for a blank description.
    """
    tokens = id.split()
    if not tokens:
        return None, None
    if 'uniref' in tokens[0].lower():
        return tokens[-1].partition('/')[0], 'uniref'
    return tokens[0].partition('/')[0], 'uniprot'


def get_species(id: str):
    """Return the species code for a header line, or '' if none is found."""
    sequence_identifier, dbtype = extract_sequence_ids(id)
    if sequence_identifier is None:
        return ''
    return parse_sequence_specie(sequence_identifier, dbtype=dbtype)


def _read_a3m_records(a3m_path):
    """Read an A3M/FASTA file into a list of (header, sequence) tuples."""
    with open(a3m_path, 'r') as a3m_file:
        text = a3m_file.read()
    records = []
    # Prefixing '\n' lets the very first '>' be handled like all the others.
    for block in ('\n' + text).split('\n>')[1:]:
        header, *sequence_lines = block.split('\n')
        records.append((header, ''.join(sequence_lines)))
    return records


def _parse_a3m_records(a3m_path, convert=None):
    """Shared body of parsea3m()/parsea3m2aln(); `convert` maps each sequence."""
    seqids = []
    species = {}     # seqid -> species code ('' when unknown)
    sequences = {}   # seqid -> sequence
    records = _read_a3m_records(a3m_path)
    if not records:
        # Same exception type the original `in_seqs[0]` produced.
        raise IndexError('no sequences found in %s' % a3m_path)
    query_seq = None
    for seqid, sequence in records:
        if convert is not None:
            sequence = convert(sequence)
        if query_seq is None:
            query_seq = sequence
        seqids.append(seqid)
        # Repeated headers overwrite earlier entries in both dicts.
        sequences[seqid] = sequence
        species[seqid] = get_species(seqid)
    return query_seq, seqids, species, sequences


def parsea3m(a3m_path):
    """Parse an A3M file, keeping the sequences in A3M form.

    Returns:
        (query_seq, seqids, species, sequences): the first sequence of the
        file, the header lines in file order, and dicts mapping each header
        to its species code and to its sequence.

    Raises:
        IndexError: if the file contains no '>' record.
    """
    return _parse_a3m_records(a3m_path)


def parsea3m2aln(a3m_path):
    """Parse an A3M file and strip lowercase insertions from every sequence.

    Returns the same tuple as parsea3m(), with sequences converted by
    single_sequence_a3m2aln().

    Raises:
        IndexError: if the file contains no '>' record.
    """
    return _parse_a3m_records(a3m_path, convert=single_sequence_a3m2aln)


def single_sequence_a3m2aln(a3m_seq):
    """Remove lowercase letters (a-z) from one sequence."""
    return _A3M_INSERTION.sub('', a3m_seq)


def group_by_species(query_seq, seqids, species, sequences):
    """Group sequence ids and sequences by species code.

    Sequences are assumed to be A3M style. Entries whose species is '' are
    left out. Species are listed in order of first appearance so the result
    is reproducible between runs. `query_seq` is unused and kept only for
    interface compatibility.

    Returns:
        (nr_species_list, grouped_seqids, grouped_sequences) where both dicts
        map species -> list, in `seqids` order.
    """
    nr_species_list = [s for s in dict.fromkeys(species.values()) if s != '']
    grouped_seqids = {specie: [] for specie in nr_species_list}
    grouped_sequences = {specie: [] for specie in nr_species_list}
    for seqid in seqids:
        specie = species[seqid]
        if specie != '':
            grouped_seqids[specie].append(seqid)
            grouped_sequences[specie].append(sequences[seqid])
    return nr_species_list, grouped_seqids, grouped_sequences


def cal_seqid(query_seq, aln_seq):
    """Return the fraction of query positions identical in `aln_seq`.

    Gap ('-') and chain-separator ('*') columns never count as matches; the
    denominator is the query length without '*'. An empty query yields 0.0.
    Prints a message and exits with status 1 when the lengths differ.
    """
    if len(query_seq) != len(aln_seq):
        print("The length of the two sequence used for sequence identity calculation is not same!")
        sys.exit(1)
    same = sum(1 for q, a in zip(query_seq, aln_seq)
               if q == a and q != '-' and q != '*')
    effective_length = len(query_seq) - query_seq.count('*')
    if effective_length == 0:
        return 0.0
    return 1.0 * same / effective_length


def _identities_to_query(query_seq, a3m_sequences):
    """Sequence identity of every A3M sequence against the query."""
    return [cal_seqid(query_seq, single_sequence_a3m2aln(seq))
            for seq in a3m_sequences]


def _descending_order(values):
    """Indices that sort `values` from highest to lowest (numpy default sort)."""
    return numpy.argsort(-numpy.array(values))


def sort_by_sequenceID_list(query_seq, nr_species_list, grouped_seqids, grouped_sequences):
    """Flatten all species groups and sort by identity to the query.

    Sequences are assumed to be A3M style; lowercase insertions are removed
    before the identity is computed.

    Returns:
        (seqids, sequences, identities) as flat lists, highest identity first.
    """
    sequence_ids = []
    sequences = []
    for specie_key in nr_species_list:
        group = grouped_sequences[specie_key]
        sequences.extend(group)
        sequence_ids.extend(grouped_seqids[specie_key][i] for i in range(len(group)))
    sequence_identities = _identities_to_query(query_seq, sequences)
    order = _descending_order(sequence_identities)
    return ([sequence_ids[i] for i in order],
            [sequences[i] for i in order],
            [sequence_identities[i] for i in order])


def sort_by_sequenceID(query_seq, nr_species_list, grouped_seqids, grouped_sequences):
    """Sort the members of every species group by identity to the query.

    Sequences are assumed to be A3M style; lowercase insertions are removed
    before the identity is computed.

    Returns:
        (nr_species_list, seqids, sequences, identities) where the last three
        are dicts species -> list, highest identity first.
    """
    sorted_grouped_seqids = {}
    sorted_grouped_sequences = {}
    sorted_grouped_sequence_identites = {}
    for specie_key in nr_species_list:
        group = grouped_sequences[specie_key]
        identities = _identities_to_query(query_seq, group)
        order = _descending_order(identities)
        sorted_grouped_sequence_identites[specie_key] = [identities[i] for i in order]
        sorted_grouped_seqids[specie_key] = [grouped_seqids[specie_key][i] for i in order]
        sorted_grouped_sequences[specie_key] = [group[i] for i in order]
    return (nr_species_list, sorted_grouped_seqids, sorted_grouped_sequences,
            sorted_grouped_sequence_identites)


def pair_node(common_species, node1_seqids, node1_sequences, node2_seqids, node2_sequences):
    """Join equally ranked entries of two nodes for every common species.

    For each species the first min(len1, len2) entries are paired; ids and
    sequences are joined with '*'.

    Returns:
        (comm_seqids, comm_sequences): dicts species -> list of joined values.
    """
    comm_seqids = {}
    comm_sequences = {}
    for specie in common_species:
        pair_count = min(len(node1_sequences[specie]), len(node2_sequences[specie]))
        comm_seqids[specie] = [
            node1_seqids[specie][i] + '*' + node2_seqids[specie][i]
            for i in range(pair_count)]
        comm_sequences[specie] = [
            node1_sequences[specie][i] + '*' + node2_sequences[specie][i]
            for i in range(pair_count)]
    return comm_seqids, comm_sequences


def full_pair_node(common_species, common_species_seqN, node1_query_N, node1_seqids,
                   node1_sequences, node2_query_N, node2_seqids, node2_sequences):
    """Join two nodes per species, padding a node that lacks entries.

    For every species exactly common_species_seqN[specie] rows are produced.
    A node without that species contributes 'NA' ids and all-gap sequences of
    its query length; a node with too few entries is padded the same way.

    Returns:
        (comm_seqids, comm_sequences, node1_query_N + node2_query_N).
    """
    comm_seqids = {}
    comm_sequences = {}
    for specie in common_species:
        seqN = common_species_seqN[specie]
        if specie in node1_seqids:
            ids1, seqs1 = createNewNode(node1_seqids[specie], node1_sequences[specie],
                                        seqN, node1_query_N)
        else:
            ids1, seqs1 = createEmptyNode(seqN, node1_query_N)
        if specie in node2_seqids:
            ids2, seqs2 = createNewNode(node2_seqids[specie], node2_sequences[specie],
                                        seqN, node2_query_N)
        else:
            ids2, seqs2 = createEmptyNode(seqN, node2_query_N)
        comm_seqids[specie] = [ids1[i] + '*' + ids2[i] for i in range(seqN)]
        comm_sequences[specie] = [seqs1[i] + '*' + seqs2[i] for i in range(seqN)]
    return comm_seqids, comm_sequences, node1_query_N + node2_query_N


def createEmptyNode(N, query_N):
    """Return N placeholder rows: 'NA' ids and gap-only sequences of length query_N."""
    return ['NA'] * N, ['-' * query_N] * N


def createNewNode(node_seqids, node_sequences, N, query_N):
    """Return exactly N rows from a node, truncating or padding as needed.

    The first N entries are kept; when the node has fewer, the remainder is
    filled with 'NA' ids and gap-only sequences of length query_N.
    """
    wanted = max(N, 0)
    new_node_seqids = list(node_seqids[:wanted])
    new_node_sequences = list(node_sequences[:len(new_node_seqids)])
    missing = wanted - len(new_node_seqids)
    new_node_seqids.extend(['NA'] * missing)
    new_node_sequences.extend(['-' * query_N] * missing)
    return new_node_seqids, new_node_sequences


def sequence_pairing(List_query_seqs, List_nr_species, List_sorted_grouped_seqids,
                     List_sorted_grouped_sequences):
    """Pair chains using only the species present in EVERY chain.

    Args:
        List_query_seqs: query sequence of each chain.
        List_nr_species: species list of each chain.
        List_sorted_grouped_seqids: per chain, dict species -> ranked ids.
        List_sorted_grouped_sequences: per chain, dict species -> ranked seqs.

    Returns:
        (Paired_query, common_species, Paired_seqids, Paired_sequences); the
        query and all paired values are '*'-joined across chains. Species
        keep the order of the first chain.
    """
    other_species = [set(node) for node in List_nr_species[1:]]
    common_species = [specie for specie in dict.fromkeys(List_nr_species[0])
                      if all(specie in node for node in other_species)]
    # join() keeps exactly one separator per chain boundary, matching how the
    # paired sequences themselves are assembled.
    Paired_query = '*'.join(List_query_seqs)
    Paired_seqids = List_sorted_grouped_seqids[0]
    Paired_sequences = List_sorted_grouped_sequences[0]
    for i in range(1, len(List_sorted_grouped_seqids)):
        Paired_seqids, Paired_sequences = pair_node(
            common_species, Paired_seqids, Paired_sequences,
            List_sorted_grouped_seqids[i], List_sorted_grouped_sequences[i])
    return Paired_query, common_species, Paired_seqids, Paired_sequences


def full_sequence_pairing(List_query_seqs, List_nr_species, List_sorted_grouped_seqids,
                          List_sorted_grouped_sequences):
    """Pair chains using every species present in at least two chains.

    Chains lacking a species are padded with 'NA'/gap rows (see
    full_pair_node). The number of rows per species is the smallest non-zero
    hit count of that species over all chains.

    Returns:
        (Paired_query, paired_common_species, Paired_seqids, Paired_sequences).
    """
    # Union of all species, in order of first appearance.
    common_species = list(dict.fromkeys(
        specie for node in List_nr_species for specie in node))
    min_nonzero_seqN = dict.fromkeys(common_species, 0)   # smallest non-zero hit count
    species_nonzero_N = dict.fromkeys(common_species, 0)  # chains having the species

    # NOTE(review): the source text between `len(node[key])` and `=2:` was
    # lost; the running-minimum update and the `>= 2` chain filter below are
    # reconstructed from the variable names used afterwards -- confirm
    # against the upstream script.
    for node in List_sorted_grouped_sequences:
        for key in node:
            hit_count = len(node[key])
            if hit_count > 0:
                species_nonzero_N[key] += 1
                if min_nonzero_seqN[key] == 0 or hit_count < min_nonzero_seqN[key]:
                    min_nonzero_seqN[key] = hit_count
    paired_common_species = []
    paired_min_nonzero_seqN = {}
    for key in common_species:
        if species_nonzero_N[key] >= 2:
            paired_common_species.append(key)
            paired_min_nonzero_seqN[key] = min_nonzero_seqN[key]

    Paired_query = '*'.join(List_query_seqs)
    Paired_seqids = List_sorted_grouped_seqids[0]
    Paired_sequences = List_sorted_grouped_sequences[0]
    Paired_node_query_N = len(List_query_seqs[0])
    for i in range(1, len(List_sorted_grouped_seqids)):
        Paired_seqids, Paired_sequences, Paired_node_query_N = full_pair_node(
            paired_common_species, paired_min_nonzero_seqN,
            Paired_node_query_N, Paired_seqids, Paired_sequences,
            len(List_query_seqs[i]),
            List_sorted_grouped_seqids[i], List_sorted_grouped_sequences[i])
    return Paired_query, paired_common_species, Paired_seqids, Paired_sequences


def write_paired_msa(msa_path, paired_query, sorted_paired_seqids, sorted_paired_sequences,
                     msa_type='a3m', starmark_remove=True):
    """Write a paired alignment to `msa_path`.

    With msa_type 'a3m' every sequence is preceded by a '>' header (the query
    header is 'seq'). Any other msa_type writes one sequence per line without
    headers, with lowercase insertions removed from the hits. When
    `starmark_remove` is true the '*' chain separators are stripped from the
    sequences (ids are written unchanged).

    Returns:
        1, always.
    """
    is_a3m = msa_type == 'a3m'

    def _clean(sequence):
        return sequence.replace('*', '') if starmark_remove else sequence

    lines = []
    if is_a3m:
        lines.append('>seq\n')
    lines.append(_clean(paired_query) + '\n')
    for seqid, seq in zip(sorted_paired_seqids, sorted_paired_sequences):
        if is_a3m:
            lines.append('>%s\n' % seqid)
        else:
            seq = single_sequence_a3m2aln(seq)
        lines.append('%s\n' % _clean(seq))
    with open(msa_path, 'w') as msa_file:
        msa_file.writelines(lines)
    return 1


def write_paired_msa_separate(datadir, chains, paired_query, sorted_paired_seqids,
                              sorted_paired_sequences, base_name='paired_sep', msa_type='a3m'):
    """Split a paired alignment at '*' and write one file per chain.

    Each file goes to <datadir>/<chain>/<base_name>.<msa_type>; the chain
    directories must already exist. Format rules follow write_paired_msa().

    Returns:
        1, always.
    """
    is_a3m = msa_type == 'a3m'
    outfile_name = base_name + "." + msa_type
    querys = paired_query.split('*')
    for i, ch in enumerate(chains):
        lines = []
        if is_a3m:
            lines.append('>seq\n')
            lines.append(querys[i] + '\n')
        else:
            lines.append(single_sequence_a3m2aln(querys[i]) + '\n')
        for joint_id, joint_seq in zip(sorted_paired_seqids, sorted_paired_sequences):
            chain_seq = joint_seq.split('*')[i]
            if is_a3m:
                lines.append('>' + joint_id.split('*')[i] + '\n')
                lines.append(chain_seq + '\n')
            else:
                lines.append(single_sequence_a3m2aln(chain_seq) + '\n')
        with open(os.path.join(datadir, ch, outfile_name), 'w') as outfile:
            outfile.writelines(lines)
    return 1


def readjson(json_path):
    """Load and return the JSON document stored at `json_path`."""
    with open(json_path, 'r') as json_file:
        return json.load(json_file)


def getjsonseqs(myjson, seqName):
    """Collect per-chain information from the chain description JSON.

    `myjson` maps chain name -> {'description': ..., 'sequence': ...}.

    Returns:
        chains:    chain names in JSON order.
        titles:    chain -> description.
        sequences: chain -> sequence, for every chain.
        mapping:   chain -> '<seqName>-<first chain with the same sequence>'.
        copys:     first chain of each distinct sequence -> number of copies.
    """
    titles = {}
    mapping = {}
    sequences = {}
    copys = {}
    chains = []
    first_chain_of = {}  # sequence -> first chain that carried it
    for chain, entry in myjson.items():
        title = entry['description']
        sequence = entry['sequence']
        chains.append(chain)
        titles[chain] = title
        representative = first_chain_of.setdefault(sequence, chain)
        mapping[chain] = seqName + '-' + representative
        copys[representative] = copys.get(representative, 0) + 1
        sequences[chain] = sequence
    return chains, titles, sequences, mapping, copys


def _load_chain_alignments(chains, datadir):
    """Parse, group and rank <datadir>/<chain>/alphafold2.a3m for every chain."""
    query_seqs = []
    nr_species = []
    seqids = []
    sequences = []
    sorted_grouped_seqids = []
    sorted_grouped_sequences = []
    for ch in chains:
        a3m_path = os.path.join(datadir, ch, 'alphafold2.a3m')
        query_seq, chain_seqids, chain_species, chain_sequences = parsea3m(a3m_path)
        species_list, grouped_ids, grouped_seqs = group_by_species(
            query_seq, chain_seqids, chain_species, chain_sequences)
        species_list, ranked_ids, ranked_seqs, _ = sort_by_sequenceID(
            query_seq, species_list, grouped_ids, grouped_seqs)
        query_seqs.append(query_seq)
        nr_species.append(species_list)
        seqids.append(chain_seqids)
        sequences.append(chain_sequences)
        sorted_grouped_seqids.append(ranked_ids)
        sorted_grouped_sequences.append(ranked_seqs)
    return (query_seqs, nr_species, seqids, sequences,
            sorted_grouped_seqids, sorted_grouped_sequences)


def sequence_pairing_from_data(json_path, datadir):
    """Write <datadir>/paired.a3m and paired.aln (species common to all chains).

    Returns:
        1, always.
    """
    chains, _, _, _, _ = getjsonseqs(readjson(json_path), "protein")
    query_seqs, nr_species, _, _, ranked_ids, ranked_seqs = _load_chain_alignments(
        chains, datadir)
    Paired_query, common_species, Paired_seqids, Paired_sequences = sequence_pairing(
        query_seqs, nr_species, ranked_ids, ranked_seqs)
    sorted_paired_seqids, sorted_paired_sequences, _ = sort_by_sequenceID_list(
        Paired_query, common_species, Paired_seqids, Paired_sequences)
    for msa_type in ('a3m', 'aln'):
        write_paired_msa(os.path.join(datadir, 'paired.' + msa_type), Paired_query,
                         sorted_paired_seqids, sorted_paired_sequences,
                         msa_type=msa_type, starmark_remove=True)
    return 1


def full_sequence_pairing_from_data(json_path, datadir, add_nonpaired=False):
    """Write the padded pairing (species present in at least two chains).

    With add_nonpaired false the output is <datadir>/full_paired.{a3m,aln}.
    With add_nonpaired true every hit of every chain that was not used in a
    pair is appended as its own row, gap-padded over the other chains, and
    the output is <datadir>/full_paired_pad.{a3m,aln}.

    Returns:
        1, always.
    """
    chains, _, _, _, _ = getjsonseqs(readjson(json_path), "protein")
    (query_seqs, nr_species, List_seqids, List_sequences,
     ranked_ids, ranked_seqs) = _load_chain_alignments(chains, datadir)
    List_query_N = [len(query_seq) for query_seq in query_seqs]

    Paired_query, common_species, Paired_seqids, Paired_sequences = full_sequence_pairing(
        query_seqs, nr_species, ranked_ids, ranked_seqs)
    sorted_paired_seqids, sorted_paired_sequences, _ = sort_by_sequenceID_list(
        Paired_query, common_species, Paired_seqids, Paired_sequences)

    if not add_nonpaired:
        base_name = 'full_paired'
        all_seqids = sorted_paired_seqids
        all_sequences = sorted_paired_sequences
    else:
        base_name = 'full_paired_pad'
        all_seqids = list(sorted_paired_seqids)
        all_sequences = list(sorted_paired_sequences)
        chain_count = len(chains)
        for i in range(chain_count):
            paired_ids = {joint_id.split('*')[i] for joint_id in sorted_paired_seqids}
            # Skip the query (first record); keep file order so the output is
            # reproducible between runs.
            for seqid in dict.fromkeys(List_seqids[i][1:]):
                if seqid in paired_ids:
                    continue
                row_ids = ['NA'] * chain_count
                row_ids[i] = seqid
                row_seqs = ['-' * query_N for query_N in List_query_N]
                row_seqs[i] = List_sequences[i][seqid]
                all_seqids.append('*'.join(row_ids))
                all_sequences.append('*'.join(row_seqs))

    for msa_type in ('a3m', 'aln'):
        write_paired_msa(os.path.join(datadir, base_name + '.' + msa_type), Paired_query,
                         all_seqids, all_sequences,
                         msa_type=msa_type, starmark_remove=True)
    return 1


def _main(argv):
    """Command-line entry point: <script> <json_path> <datadir>."""
    if len(argv) < 3:
        sys.stderr.write('Usage: %s <json_path> <datadir>\n' % argv[0])
        return 2
    json_path = argv[1]
    datadir = argv[2]
    sequence_pairing_from_data(json_path, datadir)
    full_sequence_pairing_from_data(json_path, datadir, add_nonpaired=False)
    full_sequence_pairing_from_data(json_path, datadir, add_nonpaired=True)
    return 0


if __name__ == '__main__':
    sys.exit(_main(sys.argv))