#!/usr/bin/env python3 import os import sys import numpy import json from config import para_json from utils import read_sequence_from_fasta docstring = ''' Parse-sequence.py Parse chain mapping. Usage: Parse-sequence.py [option] required options: -o=/home/simth/test (/home/simith/test should contain the fasta sequence file seq.fasta) ''' def readjson(json_path): f=open(json_path,'r') myjson=json.load(f) f.close() return myjson def getjsonseqs(myjson,seqName): ''' return no-reduandant sequences and mapping ''' titles={} mapping={} ### mapping['title']=seqName+'-A/B/C' etc A2B3 five chain the A copy 2 B copy 3, but AF2 need A (ch1) B (ch1) C (ch2) D (ch2) E (ch2) mapping sequences={} #### sequences[seqName-A]='xxxxxx' copys={} chains=[] for key in myjson.keys(): chain=key title=myjson[key]['description'] sequence=myjson[key]['sequence'] chains.append(chain) titles[chain]=title if not list(sequences.values()).__contains__(sequence): mapping[chain]=seqName+'-'+chain copys[chain]=1 else: for ch in sequences.keys(): if sequences[ch]==sequence: mapping[chain]=seqName+'-'+ch copys[ch]+=1 break sequences[chain]=sequence #print(titles) #print(chains) #print(mapping) #print(sequences) #print(copys) return chains,titles,sequences,mapping,copys if __name__=="__main__": datadir = "" argv = [] for arg in sys.argv[1:]: if arg.startswith("-o="): datadir = os.path.abspath(arg[len("-o="):]) elif arg.startswith('-'): sys.stderr.write("ERROR! No such option %s\n" % arg) exit() else: argv.append(arg) if len(argv) != 0: sys.stderr.write(docstring) exit() if not os.path.exists(os.path.join(datadir,"seq.fasta")): print("There is no sequence file seq.fasta in the datadir %s"%datadir) sys.stderr.write(docstring) exit(1) _,psequences=read_sequence_from_fasta(os.path.join(datadir,"seq.fasta")) protein_type="multimer" if len(psequences)==1: protein_type="monomer" ######### parse sequence AF2Mdir=para_json['alphafoldm_pkgdir'] AF2Mlib=para_json['alphafold_libdir'] AF2Menv=para_json['alphafold_env'] seqName=os.path.basename(datadir) if protein_type=="multimer": ChainMapDir=os.path.join(datadir,"ChainMapping") print("%s/run_alphafold_mymonomermsa_parsechain.sh -o %s -f %s -t 2099-01-01 -g false -r false -m multimer -c full_dbs -p false -l 5 -D %s -E %s -L %s "%(AF2Mdir,ChainMapDir,os.path.join(datadir,"seq.fasta"),AF2Mdir,AF2Menv,AF2Mlib)) os.system("%s/run_alphafold_mymonomermsa_parsechain.sh -o %s -f %s -t 2099-01-01 -g false -r false -m multimer -c full_dbs -p false -l 5 -D %s -E %s -L %s "%(AF2Mdir,ChainMapDir,os.path.join(datadir,"seq.fasta"),AF2Mdir,AF2Menv,AF2Mlib)) json_path=os.path.join(ChainMapDir,"seq","msas","chain_id_map.json") chainfile_path=os.path.join(datadir,"chainnr.list") myjson=readjson(json_path) #print(myjson) #print(myjson.keys()) #print(myjson['A']) #print(myjson['A']['description']) #print(myjson['A']['sequence']) chains,titles,sequences,mapping,copys=getjsonseqs(myjson,seqName) chainfile=open(chainfile_path,'w') for key in copys.keys(): print(seqName+'-'+key) chainfile.write(seqName+'-'+key+'\n') os.system("mkdir -p %s/%s"%(datadir,seqName+'-'+key)) #os.system("chmod -R 777 %s/%s"%(datadir,seqName+'-'+key)) seqfile=open(os.path.join(datadir,seqName+'-'+key,'seq.fasta'),'w') seqfile.write('>'+key+'\n') seqfile.write(sequences[key]+'\n') seqfile.close() chainfile.close() #os.system("chmod -R 777 %s"%datadir) else: os.system("mkdir -p %s/%s"%(datadir,seqName+'-A')) os.system("cp %s/seq.fasta %s/%s/"%(datadir,datadir,seqName+"-A/")) chainfile=open(os.path.join(datadir,'chainnr.list'),'w') chainfile.write(seqName+'-A\n') chainfile.close()