// hhsearch.C: // Search for a multiple alignment (transformed into HMM) in a profile HMM database // Error codes: 0: ok 1: file format error 2: file access error 3: memory error 4: command line error 6: internal logic error 7: internal numeric error // (C) Johannes Soeding and Michael Remmert 2012 // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // You should have received a copy of the GNU General Public License // along with this program. If not, see . // We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de // Reference: // Remmert M., Biegert A., Hauser A., and Soding J. // HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment. // Nat. Methods 9:173-175 (2011); epub Dec 25, doi: 10.1038/NMETH.1818 // // TODOs // // * Allow using the base name (without extension) in hhsearch to make it compatible with the ussage of hhblits // ////#define WINDOWS #define PTHREAD #define MAIN #include // cin, cout, cerr #include // ofstream, ifstream #include // printf #include // min,max #include // exit #include // strcmp, strstr #include // sqrt, pow #include // INT_MIN #include // FLT_MIN #include // islower, isdigit etc #include // clock_gettime etc. (in realtime library (-lrt compiler option)) #include // perror() #include #include #include // access() #ifdef PTHREAD #include // POSIX pthread functions and data structures #endif #include //#include //#include "efence.h" //#include "efence.c" #ifdef HH_SSE41 #include // SSSE3 #include // SSE4.1 #define HH_SSE3 #endif #ifdef HH_SSE3 #include // SSE3 #define HH_SSE2 #endif #ifdef HH_SSE2 #ifndef __SUNPRO_C #include // SSE2 #else #include #endif #endif using std::cout; using std::cerr; using std::endl; using std::ios; using std::ifstream; using std::ofstream; #include "cs.h" // context-specific pseudocounts #include "context_library.h" #include "library_pseudocounts-inl.h" #include "util.C" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc. #include "list.C" // list data structure #include "hash.C" // hash data structure #include "hhdecl.C" // Constants, global variables, struct Parameters #include "hhutil.C" // MatchChr, InsertChr, aa2i, i2aa, log2, fast_log2, ScopID, WriteToScreen, #include "hhmatrices.C" // BLOSUM50, GONNET, HSDM #include "hhhmm.h" // class Hit #include "hhhit.h" // class Hit #include "hhalignment.h" // class Alignment #include "hhhalfalignment.h" // class HalfAlignment #include "hhfullalignment.h" // class FullAlignment #include "hhhitlist.h" // class Hit #include "hhhmm.C" // class HMM #include "hhalignment.C" // class Alignment #include "hhhit.C" // class Hit #include "hhhalfalignment.C" // class HalfAlignment #include "hhfullalignment.C" // class FullAlignment #include "hhhitlist.C" // class HitList #include "hhfunc.C" // some functions common to hh programs ///////////////////////////////////////////////////////////////////////////////////// // Global variables //////////////////////////////////////////////////////////////////////////////////// const char HHSEARCH_REFERENCE[]="Soding, J. Protein homology detection by HMM-HMM comparison. Bioinformatics 21:951-960 (2005).\n"; const int MAXTHREADS=256; // maximum number of threads (i.e. CPUs) for parallel computation const int MAXBINS=384; // maximum number of bins (positions in thread queue) enum bin_states {FREE=0, SUBMITTED=1, RUNNING=2}; int v1; // verbose mode int threads=0; // number of threads (apart from the main thread which reads from the databases file) 0:no multithreading int bins; // number of bins; jobs gets allocated to a FREE bin were they are waiting for exection by a thread char bin_status[MAXBINS]; // The status for each bin is FREE, SUBMITTED, or RUNNING int jobs_running; // number of active jobs, i.e. number of bins set to RUNNING int jobs_submitted; // number of submitted jobs, i.e. number of bins set to SUBMITTED char reading_dbs; // 1: still HMMs to read in a database; 0: finshed reading all HMMs, no db left const char DEBUG_THREADS=0; // Debugging flag HMM* q = new HMM; // Create query HMM with maximum of par.maxres match states HMM* t[MAXBINS]; // Each bin has a template HMM allocated that was read from the database file Hit* hit[MAXBINS]; // Each bin has an object of type Hit allocated with a separate dynamic programming matrix (memory!!) HitList hitlist; // list of hits with one Hit object for each pairwise comparison done Hit hit_cur; // current Hit element read from hitlist int* format; // format[bin] = 0 if in HHsearch format => add pcs; format[bin] = 1 if in HMMER format => no pcs int read_from_db; // The value of this flag is returned from HMM::Read(); 0:end of file 1:ok 2:skip HMM int N_searched; // Number of HMMs searched char line[LINELEN]=""; // input line int bin; // bin index struct Thread_args // data to hand to WorkerLoop thread { int thread_id; // id of thread (for debugging) void (*function)(int); // pointer to function (=job) to execute by WorkerLoop once old job is done }; #ifdef PTHREAD struct Thread_args thread_data[MAXTHREADS]; // store a threads thread_id and function to call (AlignByWorker, RealignByWorker) pthread_t pthread[MAXTHREADS]; // info on thread's structures (needed by system) pthread_attr_t joinable; // attribute set for describing threads int rc; // return code for threading commands // With this condition variable the main thread signals to the worker threads that it has submitted a new job pthread_cond_t new_job = PTHREAD_COND_INITIALIZER; // Mutex assures exclusive access to bin_status[], jobs_sumitted, jobs_running, and new_job by threads pthread_mutex_t bin_status_mutex = PTHREAD_MUTEX_INITIALIZER; // Mutex assures exclusive access to hitlist pthread_mutex_t hitlist_mutex = PTHREAD_MUTEX_INITIALIZER; // With this condition variable a worker thread signals to the main thread that it has finished a job pthread_cond_t finished_job = PTHREAD_COND_INITIALIZER; #endif /////////////////////////////////////////////////////////////////////////////////////// //// For multi-threading: return a bin with the desired status, return -1 if no such bin found ////////////////////////////////////////////////////////////////////////////////////// inline int PickBin(char status) { for (int b=0; b input/query multiple sequence alignment (a2m, a3m, FASTA) or HMM\n"); printf(" -d HMM database of concatenated HMMs in hhm, HMMER, or a3m format,\n"); printf(" OR, if file has extension pal, list of HMM file names, one per\n"); printf(" line. Multiple dbs, HMMs, or pal files with -d ' ...'\n"); if (all) { printf("\n"); printf(" may be 'stdin' or 'stdout' throughout.\n"); } printf("\n"); printf("Output options: \n"); printf(" -o write results in standard format to file (default=)\n"); if (all) { printf(" -Ofas write pairwise alignments of significant matches in FASTA format\n"); printf(" Analogous for output in a3m, a2m, and psi format (e.g. -Oa3m)\n"); printf(" -oa3m write MSA of significant matches in a3m format\n"); printf(" Analogous for output in a2m, psi, and hhm format (e.g. -ohhm)\n"); printf(" -e [0,1] E-value cutoff for inclusion in multiple alignment (def=%G) \n",par.e); printf(" -seq max. number of query/template sequences displayed (def=%i) \n",par.nseqdis); printf(" Beware of overflows! All these sequences are stored in memory.\n"); printf(" -cons show consensus sequence as master sequence of query MSA \n"); } printf(" -nocons don't show consensus sequence in alignments (default=show) \n"); printf(" -nopred don't show predicted 2ndary structure in alignments (default=show)\n"); printf(" -nodssp don't show DSSP 2ndary structure in alignments (default=show) \n"); printf(" -ssconf show confidences for predicted 2ndary structure in alignments\n"); printf(" -p minimum probability in summary and alignment list (def=%G) \n",par.p); printf(" -E maximum E-value in summary and alignment list (def=%G) \n",par.E); printf(" -Z maximum number of lines in summary hit list (def=%i) \n",par.Z); printf(" -z minimum number of lines in summary hit list (def=%i) \n",par.z); printf(" -B maximum number of alignments in alignment list (def=%i) \n",par.B); printf(" -b minimum number of alignments in alignment list (def=%i) \n",par.b); if (all) { printf(" -aliw [40,..[ number of columns per line in alignment list (def=%i)\n",par.aliwidth); printf(" -dbstrlen max length of database string to be printed in hhr file\n"); } printf("\n"); printf("Filter query multiple sequence alignment \n"); printf(" -id [0,100] maximum pairwise sequence identity (%%) (def=%i) \n",par.max_seqid); printf(" -diff [0,inf[ filter MSA by selecting most diverse set of sequences, keeping \n"); printf(" at least this many seqs in each MSA block of length 50 (def=%i) \n",par.Ndiff); printf(" -cov [0,100] minimum coverage with query (%%) (def=%i) \n",par.coverage); printf(" -qid [0,100] minimum sequence identity with query (%%) (def=%i) \n",par.qid); printf(" -qsc [0,100] minimum score per column with query (def=%.1f)\n",par.qsc); printf(" -neff [1,inf] target diversity of alignment (default=off)\n"); printf("\n"); printf("Input alignment format: \n"); printf(" -M a2m use A2M/A3M (default): upper case = Match; lower case = Insert;\n"); printf(" '-' = Delete; '.' = gaps aligned to inserts (may be omitted) \n"); printf(" -M first use FASTA: columns with residue in 1st sequence are match states\n"); printf(" -M [0,100] use FASTA: columns with fewer than X%% gaps are match states \n"); if (all) { printf(" -tags do NOT neutralize His-, C-myc-, FLAG-tags, and trypsin \n"); printf(" recognition sequence to background distribution \n"); } printf("\n"); printf("HMM-HMM alignment options: \n"); printf(" -norealign do NOT realign displayed hits with MAC algorithm (def=realign) \n"); printf(" -mact [0,1[ posterior probability threshold for MAC re-alignment (def=%.3f)\n",par.mact); printf(" Parameter controls alignment greediness: 0:global >0.1:local\n"); printf(" -glob/-loc use global/local alignment mode for searching/ranking (def=local)\n"); // printf(" -vit use Viterbi algorithm for searching/ranking (default) \n"); // printf(" -mac use Maximum Accuracy MAC algorithm for searching/ranking\n"); // printf(" -forward use Forward probability for searching \n"); printf(" -alt show up to this many significant alternative alignments(def=%i)\n",par.altali); if (all) { printf(" -vit use Viterbi algorithm for searching/ranking (default) \n"); printf(" -mac use Maximum Accuracy (MAC) algorithm for searching/ranking\n"); printf(" -forward use Forward probability for searching \n"); printf(" -excl exclude query positions from the alignment, e.g. '1-33,97-168' \n"); printf(" -shift [-1,1] score offset (def=%-.2f) \n",par.shift); printf(" -corr [0,1] weight of term for pair correlations (def=%.2f) \n",par.corr); printf(" -sc amino acid score (tja: template HMM at column j) (def=%i)\n",par.columnscore); printf(" 0 = log2 Sum(tja*qia/pa) (pa: aa background frequencies) \n"); printf(" 1 = log2 Sum(tja*qia/pqa) (pqa = 1/2*(pa+ta) ) \n"); printf(" 2 = log2 Sum(tja*qia/ta) (ta: av. aa freqs in template) \n"); printf(" 3 = log2 Sum(tja*qia/qa) (qa: av. aa freqs in query) \n"); printf(" 5 local amino acid composition correction \n"); } printf(" -ssm {0,..,4} 0: no ss scoring \n"); printf(" 1,2: ss scoring after or during alignment [default=%1i] \n",par.ssm); printf(" 3,4: ss scoring after or during alignment, predicted vs. predicted \n"); if (all) { printf(" -ssw [0,1] weight of ss score compared to column score (def=%-.2f) \n",par.ssw); printf(" -ssa [0,1] SS substitution matrix = (1-ssa)*I + ssa*full-SS-substition-matrix [def=%-.2f)\n",par.ssa); printf("\n"); printf("Gap cost options: \n"); printf(" -gapb [0,inf[ Transition pseudocount admixture (def=%-.2f) \n",par.gapb); printf(" -gapd [0,inf[ Transition pseudocount admixture for open gap (default=%-.2f) \n",par.gapd); printf(" -gape [0,1.5] Transition pseudocount admixture for extend gap (def=%-.2f) \n",par.gape); printf(" -gapf ]0,inf] factor to increase/reduce the gap open penalty for deletes (def=%-.2f) \n",par.gapf); printf(" -gapg ]0,inf] factor to increase/reduce the gap open penalty for inserts (def=%-.2f) \n",par.gapg); printf(" -gaph ]0,inf] factor to increase/reduce the gap extend penalty for deletes(def=%-.2f)\n",par.gaph); printf(" -gapi ]0,inf] factor to increase/reduce the gap extend penalty for inserts(def=%-.2f)\n",par.gapi); printf(" -egq [0,inf[ penalty (bits) for end gaps aligned to query residues (def=%-.2f) \n",par.egq); printf(" -egt [0,inf[ penalty (bits) for end gaps aligned to template residues (def=%-.2f) \n",par.egt); printf("\n"); printf("Pseudocount (pc) options: \n"); printf(" -pcm {0,..,3} position dependence of pc admixture 'tau' (pc mode, default=%-i) \n",par.pcm); printf(" 0: no pseudo counts: tau = 0 \n"); printf(" 1: constant tau = a \n"); printf(" 2: diversity-dependent: tau = a/(1 + ((Neff[i]-1)/b)^c) \n"); printf(" (Neff[i]: number of effective seqs in local MSA around column i) \n"); printf(" 3: constant diversity pseudocounts \n"); printf(" -pca [0,1] overall pseudocount admixture (def=%-.1f) \n",par.pca); printf(" -pcb [1,inf[ Neff threshold value for -pcm 2 (def=%-.1f) \n",par.pcb); printf(" -pcc [0,3] extinction exponent c for -pcm 2 (def=%-.1f) \n",par.pcc); // printf(" -pcw [0,3] weight of pos-specificity for pcs (def=%-.1f) \n",par.pcw); } printf("\n"); printf("Context-specific pseudo-counts: \n"); printf(" -nocontxt use substitution-matrix instead of context-specific pseudocounts \n"); printf(" -contxt context file for computing context-specific pseudocounts (default=%s)\n",par.clusterfile); printf(" -cslib column state file for fast database prefiltering (default=%s)\n",par.cs_library); if (all) { printf(" -csw [0,inf] weight of central position in cs pseudocount mode (def=%.1f)\n", par.csw); printf(" -csb [0,1] weight decay parameter for positions in cs pc mode (def=%.1f)\n", par.csb); } printf("\n"); printf("Other options: \n"); printf(" -cpu number of CPUs to use (for shared memory SMPs) (default=1)\n"); #ifndef PTHREAD printf("(The -cpu option is inactive since POSIX threads ae not supported on your platform)\n"); #endif printf(" -v verbose mode: 0:no screen output 1:only warings 2: verbose \n"); if (all) { printf(" -maxres max number of HMM columns (def=%5i) \n",par.maxres); printf(" -maxmem [1,inf[ max available memory in GB (def=%.1f) \n",par.maxmem); printf(" -scores write scores for all pairwise comparisions to file \n"); printf(" -calm {0,..,3} empirical score calibration of 0:query 1:template 2:both \n"); printf(" default 3: neural network-based estimation of EVD params \n"); // printf(" -opt parameter optimization mode (def=off): return sum of ranks \n"); // printf(" of true positives (same superfamily) for minimization \n"); // printf(" and write result into file \n"); printf("\n"); } else { printf("An extended list of options can be obtained by calling 'hhblits -help'\n"); } printf("\n"); printf("Example: %s -i a.1.1.1.a3m -d scop70_1.71.hhm \n",program_name); cout<=4) cout<=argc || argv[i][0]=='-') {help() ; cerr<=argc || argv[i][0]=='-') {help() ; cerr<=argc || argv[i][0]=='-') {help() ; cerr<=argc || argv[i][0]=='-') {help() ; cerr<=argc || argv[i][0]=='-') {help() ; cerr<=argc || argv[i][0]=='-') {help() ; cerr<=argc || argv[i][0]=='-') {help() ; cerr<=argc || argv[i][0]=='-') {help() ; cerr<=argc || argv[i][0]=='-') {help() ; cerr<=argc || argv[i][0]=='-') {help() ; cerr<=argc || argv[i][0]=='-') {help(); cerr<=argc) {help(); exit(4);} par.exclstr = new(char[strlen(argv[i])+1]); strcpy(par.exclstr,argv[i]); } else if (!strcmp(argv[i],"-v") && (i0.35 && par.mact<0.351) {par.mact=0;} } else if (!strncmp(argv[i],"-loc",4)) par.loc=1; else if (!strncmp(argv[i],"-alt",4) && (i='0' && argv[i][0]<='9') {par.Mgaps=atoi(argv[i]); par.M=2;} else cerr<=argc || argv[i][0]=='-') {help() ; cerr<=4) cout<Log2LinTransitionProbs(1.0); // transform transition freqs to lin space if not already done int nhits=0; int N_aligned=0; // Longest allowable length of database HMM (backtrace: 5 chars, fwd: 1 double, bwd: 1 double long int Lmaxmem=((par.maxmem-0.5)*1024*1024*1024)/(2*sizeof(double)+8)/q->L/bins; long int Lmax=0; // length of longest HMM to be realigned // phash_plist_realignhitpos->Show(dbfile) is pointer to list with template indices and their ftell positions. // This list can be sorted by ftellpos to access one template after the other efficiently during realignment Hash< List* >* phash_plist_realignhitpos; phash_plist_realignhitpos = new Hash< List* > (2311,NULL); // Some templates have several (suboptimal) alignments in hitlist. For realignment, we need to efficiently // access all hit objects in hitlist belonging to one template (because we don't want to read templates twice) // We therefore need for each template (identified by its index between 0 and N_searched-1) a list of elements // in hitlist that store the alignments with the template of that index. // This list is pointed to by array_plist_phits[index]. List** array_plist_phits; array_plist_phits = new List*[N_searched]; for (int index=0; index= par.realign_max && nhits>=imax(par.B,par.Z)) break; if (hit_cur.Eval > par.e) { if (nhits>=imax(par.B,par.Z)) continue; if (nhits>=imax(par.b,par.z) && hit_cur.Probab < par.p) continue; if (nhits>=imax(par.b,par.z) && hit_cur.Eval > par.E) continue; } if (hit_cur.L>Lmax) Lmax=hit_cur.L; if (hit_cur.L>Lmaxmem) {nhits++; continue;} // fprintf(stderr,"hit.name=%-15.15s hit.index=%-5i hit.ftellpos=%-8i hit.dbfile=%s\n",hit_cur.name,hit_cur.index,(unsigned int)hit_cur.ftellpos,hit_cur.dbfile); if (nhits>=par.premerge || hit_cur.irep>1) // realign the first premerge hits consecutively to query profile { if (hit_cur.irep==1) { // For each template (therefore irep==1), store template index and position on disk in a list Realign_hitpos realign_hitpos; realign_hitpos.ftellpos = hit_cur.ftellpos; // stores position on disk of template for current hit realign_hitpos.index = hit_cur.index; // stores index of template of current hit if (! phash_plist_realignhitpos->Contains(hit_cur.dbfile)) { List* newlist = new List; phash_plist_realignhitpos->Add(hit_cur.dbfile,newlist); } // Add template index and ftellpos to list which belongs to key dbfile in hash phash_plist_realignhitpos->Show(hit_cur.dbfile)->Push(realign_hitpos); } if (! array_plist_phits[hit_cur.index]) // pointer at index is still NULL { List* newlist = new List; // create new list of pointers to all aligments of a template array_plist_phits[hit_cur.index] = newlist; // set array[index] to newlist } // Push(hitlist.ReadCurrentAddress()) : Add address of current hit in hitlist to list... // array_plist_phits[hit_cur.index]-> : pointed to by hit_cur.index'th element of array_plist_phits array_plist_phits[hit_cur.index]->Push(hitlist.ReadCurrentAddress()); } nhits++; } if (Lmax>Lmaxmem) { Lmax=Lmaxmem; if (v>=1) { cerr<<"WARNING: Realigning sequences only up to length "< option (currently "<forward_allocated) hit[bin]->AllocateForwardMatrix(q->L+2,Lmax+1); if (!hit[bin]->backward_allocated) hit[bin]->AllocateBackwardMatrix(q->L+2,Lmax+1); } if (v>=2) printf("Realigning %i database HMMs using HMM-HMM Maximum Accuracy algorithm\n",nhits); v1 = v; if (v>0 && v<=3) v=1; else v-=2; // Supress verbose output during iterative realignment and realignment ///////////////////////////////////////////////////////////////////////////////////////////////////// // Start premerge: align the first par.premerge templates? if (par.premerge>0) { // Read query alignment into Qali Alignment Qali; // output A3M generated by merging A3M alignments for significant hits to the query alignment char qa3mfile[NAMELEN]; char ta3mfile[NAMELEN]; RemoveExtension(qa3mfile,par.infile); // directory?? strcat(qa3mfile,".a3m"); FILE* qa3mf=fopen(qa3mfile,"r"); if (!qa3mf) OpenFileError(qa3mfile); Qali.Read(qa3mf,qa3mfile); fclose(qa3mf); delete[] Qali.longname; Qali.longname = new(char[strlen(q->longname)+1]); strcpy(Qali.longname,q->longname); strcpy(Qali.name,q->name); strcpy(Qali.fam,q->fam); RemovePathAndExtension(Qali.file,par.hhmfile); // If par.append==1 do not print query alignment if (par.append) Qali.MarkSeqsAsNonPrintable(); if (v>=2) printf("Merging best hits to query alignment %s ...\n",qa3mfile); bin=0; nhits=0; hitlist.Reset(); while (!hitlist.End() && nhits=imax(par.B,par.Z)) break; if (nhits>=imax(par.b,par.z) && hit_cur.Probab < par.p) break; if (nhits>=imax(par.b,par.z) && hit_cur.Eval > par.E) continue; if (hit_cur.irep>1) continue; // Align only the best hit of the first par.premerge templates if (hit_cur.L>Lmaxmem) {nhits++; continue;} //Don't align to long sequences due to memory limit // Open HMM database file dbfiles[idb] FILE* dbf=fopen(hit_cur.dbfile,"rb"); if (!dbf) OpenFileError(hit_cur.dbfile); read_from_db=1; // Forward stream position to start of next database HMM to be realigned hit[bin]->index = hit_cur.index; // give hit a unique index for HMM hit[bin]->ftellpos = hit_cur.ftellpos; fseek(dbf,hit_cur.ftellpos,SEEK_SET); hit[bin]->dbfile = new(char[strlen(hit_cur.dbfile)+1]); strcpy(hit[bin]->dbfile,hit_cur.dbfile); // record db file name from which next HMM is read hit[bin]->irep = 1; // Needed for min_overlap calculation in InitializeForAlignment in hhhit.C char path[NAMELEN]; Pathname(path,hit_cur.dbfile); /////////////////////////////////////////////////// // Read next HMM from database file if (!fgetline(line,LINELEN,dbf)) {fprintf(stderr,"Error in %s: end of file %s reached prematurely!\n",par.argv[0],hit_cur.dbfile); exit(1);} while (strscn(line)==NULL && fgetline(line,LINELEN,dbf)) {} // skip lines that contain only white space if (!strncmp(line,"HMMER3",6)) // read HMMER3 format { format[bin] = 1; read_from_db = t[bin]->ReadHMMer3(dbf,hit_cur.dbfile); par.hmmer_used = true; } else if (!strncmp(line,"HMMER",5)) // read HMMER format { format[bin] = 1; read_from_db = t[bin]->ReadHMMer(dbf,hit_cur.dbfile); par.hmmer_used = true; } else if (!strncmp(line,"HH",2)) // read HHM format { format[bin] = 0; read_from_db = t[bin]->Read(dbf,path); } else if (!strncmp(line,"NAME",4)) // The following lines are for backward compatibility of HHM format version 1.2 with 1.1 { format[bin] = 0; fseek(dbf,hit_cur.ftellpos,SEEK_SET); // rewind to beginning of line read_from_db = t[bin]->Read(dbf,path); } else if (line[0]=='#') // read a3m alignment { Alignment tali; tali.Read(dbf,hit_cur.dbfile,line); tali.Compress(hit_cur.dbfile); // qali.FilterForDisplay(par.max_seqid,par.coverage,par.qid,par.qsc,par.nseqdis); tali.N_filtered = tali.Filter(par.max_seqid_db,par.coverage_db,par.qid_db,par.qsc_db,par.Ndiff_db); t[bin]->name[0]=t[bin]->longname[0]=t[bin]->fam[0]='\0'; tali.FrequenciesAndTransitions(t[bin]); format[bin] = 0; } else { cerr<=2) fprintf(stderr,"Realigning with %s ***** \n",t[bin]->name); /////////////////////////////////////////////////// N_aligned++; if (v1>=2 && !(N_aligned%10)) { cout<<"."; if (!(N_aligned%500)) printf(" %-4i HMMs aligned\n",N_aligned); cout.flush(); } // Prepare MAC comparison(s) PrepareTemplateHMM(q,t[bin],format[bin]); t[bin]->Log2LinTransitionProbs(1.0); // Align q to template in *hit[bin] hit[bin]->Forward(q,t[bin]); hit[bin]->Backward(q,t[bin]); hit[bin]->MACAlignment(q,t[bin]); hit[bin]->BacktraceMAC(q,t[bin]); // Overwrite *hit[bin] with Viterbi scores, Probabilities etc. of hit_cur hit[bin]->score = hit_cur.score; hit[bin]->score_aass = hit_cur.score_aass; hit[bin]->score_ss = hit_cur.score_ss; hit[bin]->Pval = hit_cur.Pval; hit[bin]->Pvalt = hit_cur.Pvalt; hit[bin]->logPval = hit_cur.logPval; hit[bin]->logPvalt = hit_cur.logPvalt; hit[bin]->Eval = hit_cur.Eval; hit[bin]->logEval = hit_cur.logEval; hit[bin]->Probab = hit_cur.Probab; hit[bin]->irep = hit_cur.irep; // Replace original hit in hitlist with realigned hit //hitlist.ReadCurrent().Delete(); hitlist.Delete().Delete(); // delete the list record and hit object hitlist.Insert(*hit[bin]); // Read a3m alignment of hit and merge with Qali according to Q-T-alignment in hit[bin] strcpy(ta3mfile,hit[bin]->file); // copy filename including path but without extension strcat(ta3mfile,".a3m"); Alignment Tali; FILE* ta3mf=fopen(ta3mfile,"r"); if (!ta3mf) OpenFileError(ta3mfile); Tali.Read(ta3mf,ta3mfile); // Read template alignment into Tali fclose(ta3mf); Tali.Compress(ta3mfile); // Filter database alignment Qali.MergeMasterSlave(*hit[bin],Tali,ta3mfile); // Convert ASCII to int (0-20),throw out all insert states, record their number in I[k][i] Qali.Compress("merged A3M file"); // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two) Qali.N_filtered = Qali.Filter(par.max_seqid,par.coverage,par.qid,par.qsc,par.Ndiff); // Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a] Qali.FrequenciesAndTransitions(q); // Comput substitution matrix pseudocounts if (par.nocontxt) { // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a] q->PreparePseudocounts(); // Add amino acid pseudocounts to query: p[i][a] = (1-tau)*f[i][a] + tau*g[i][a] q->AddAminoAcidPseudocounts(); } else { // Generate an amino acid frequency matrix from f[i][a] with full context specific pseudocount admixture (tau=1) -> g[i][a] // q->PrepareContextSpecificPseudocounts(); //OLD q->AddContextSpecificPseudocounts(); } // Transform transition freqs to lin space if not already done q->AddTransitionPseudocounts(); q->Log2LinTransitionProbs(1.0); // transform transition freqs to lin space if not already done q->CalculateAminoAcidBackground(); nhits++; } } // End premerge: align the first par.premerge templates? ///////////////////////////////////////////////////////////////////////////////////////////////////// #ifdef PTHREAD // Start threads for realignment for (int j=0; jContains(dbfiles[idb])) continue; // phash_plist_realignhitpos->Show(dbfile) is pointer to list with template indices and their ftell positions. // This list is now sorted by ftellpos in ascending order to access one template after the other efficiently phash_plist_realignhitpos->Show(dbfiles[idb])->SortList(); // Open HMM database file dbfiles[idb] FILE* dbf=fopen(dbfiles[idb],"rb"); if (!dbf) OpenFileError(dbfiles[ndb]); read_from_db=1; /////////////////////////////////////////////////////////////////////////////////////// // The loop (reads HMMs from the database file and) submits jobs into free bins as soon as they become available phash_plist_realignhitpos->Show(dbfiles[idb])->Reset(); while (! phash_plist_realignhitpos->Show(dbfiles[idb])->End()) { // Submit jobs until no bin is free anymore while (! phash_plist_realignhitpos->Show(dbfiles[idb])->End() && jobs_submitted+jobs_runningShow(dbfiles[idb])->ReadNext(); hit[bin]->index = hitpos_curr.index; // give hit[bin] a unique index for HMM fseek(dbf,hitpos_curr.ftellpos,SEEK_SET); // start to read at ftellpos for template // Give hit[bin] the pointer to the list of pointers to hitlist elements of same template (for realignment) hit[bin]->plist_phits = array_plist_phits[hitpos_curr.index]; // fprintf(stderr,"dbfile=%-40.40s index=%-5i ftellpos=%l\n",dbfiles[idb],hitpos_curr.index,hitpos_curr.ftellpos); char path[NAMELEN]; Pathname(path,dbfiles[idb]); /////////////////////////////////////////////////// // Read next HMM from database file if (!fgetline(line,LINELEN,dbf)) {fprintf(stderr,"Error in %s: end of file %s reached prematurely!\n",par.argv[0],dbfiles[idb]); exit(1);} while (strscn(line)==NULL && fgetline(line,LINELEN,dbf)) {} // skip lines that contain only white space if (!strncmp(line,"HMMER3",6)) // read HMMER3 format { format[bin] = 1; read_from_db = t[bin]->ReadHMMer3(dbf,dbfiles[idb]); par.hmmer_used = true; } else if (!strncmp(line,"HMMER",5)) // read HMMER format { format[bin] = 1; read_from_db = t[bin]->ReadHMMer(dbf,dbfiles[idb]); par.hmmer_used = true; } else if (!strncmp(line,"HH",2)) // read HHM format { format[bin] = 0; read_from_db = t[bin]->Read(dbf,path); } else if (!strncmp(line,"NAME",4)) // The following lines are for backward compatibility of HHM format version 1.2 with 1.1 { format[bin] = 0; fseek(dbf,hitpos_curr.ftellpos,SEEK_SET); // rewind to beginning of line read_from_db = t[bin]->Read(dbf,path); } else if (line[0]=='#') // read a3m alignment { Alignment tali; tali.Read(dbf,dbfiles[idb],line); tali.Compress(dbfiles[idb]); // qali.FilterForDisplay(par.max_seqid,par.coverage,par.qid,par.qsc,par.nseqdis); tali.N_filtered = tali.Filter(par.max_seqid_db,par.coverage_db,par.qid_db,par.qsc_db,par.Ndiff_db); t[bin]->name[0]=t[bin]->longname[0]=t[bin]->fam[0]='\0'; tali.FrequenciesAndTransitions(t[bin]); format[bin] = 0; } else { cerr<=2) fprintf(stderr,"Realigning with %s\n",t[bin]->name); /////////////////////////////////////////////////// hit[bin]->dbfile = new(char[strlen(dbfiles[idb])+1]); strcpy(hit[bin]->dbfile,dbfiles[idb]); // record db file name from which next HMM is read N_aligned++; if (v1>=2 && !(N_aligned%10)) { cout<<"."; if (!(N_aligned%500)) printf(" %-4i HMMs aligned\n",N_aligned); cout.flush(); } #ifdef PTHREAD // Lock access to bin_status if (threads>0) rc = pthread_mutex_lock(&bin_status_mutex); #endif // Send the job in bin to a thread bin_status[bin] = SUBMITTED; jobs_submitted++; if (threads==0) // if no multi-threading mode, main thread executes job itself { RealignByWorker(bin); bin_status[bin] = FREE; jobs_submitted--; break; } #ifdef PTHREAD // Restart threads waiting for a signal rc = pthread_cond_signal(&new_job); // Unlock access to bin_status rc = pthread_mutex_unlock(&bin_status_mutex); if (DEBUG_THREADS) fprintf(stderr,"Main: put job into bin %i name=%-7.7s Jobs running: %i jobs_submitted:%i \n",bin,t[bin]->name,jobs_running,jobs_submitted); #endif } #ifdef PTHREAD if (threads>0) { // Lock mutex rc = pthread_mutex_lock(&bin_status_mutex); // Wait until job finishes and a bin becomes free if (jobs_submitted+jobs_running>=bins) { if (DEBUG_THREADS) fprintf(stderr,"Master thread is waiting for jobs to be finished...\n"); #ifdef WINDOWS rc = pthread_cond_wait(&finished_job, &bin_status_mutex); #else #ifdef HH_MAC // If no submitted jobs are in the queue we have to wait for a new job ... struct timespec ts; struct timeval tv; gettimeofday(&tv, NULL); ts.tv_sec = tv.tv_sec + 1; rc = pthread_cond_timedwait(&finished_job, &bin_status_mutex,&ts); #else // If no submitted jobs are in the queue we have to wait for a new job ... struct timespec ts; clock_gettime(CLOCK_REALTIME,&ts); ts.tv_sec += 1; rc = pthread_cond_timedwait(&finished_job, &bin_status_mutex,&ts); #endif #endif } // Unlock mutex rc = pthread_mutex_unlock(&bin_status_mutex); } #endif } // End while(1) /////////////////////////////////////////////////////////////////////////////////////// fclose(dbf); } reading_dbs=0; #ifdef PTHREAD if (threads>0) { // No more HMMs in database => wait until all threads have finished if (DEBUG_THREADS) fprintf(stderr,"No more jobs read from database Jobs running:%i jobs_submitted:%i \n",jobs_running,jobs_submitted); // Free all threads still waiting for jobs rc = pthread_mutex_lock(&bin_status_mutex); rc = pthread_cond_broadcast(&new_job); rc = pthread_mutex_unlock(&bin_status_mutex); // Unlock mutex // Wait for all jobs to finish => join all jobs to main for (int j=0; j=2) cout<<"\n"; v=v1; // Print for each HMM: n score -log2(Pval) L name (n=5:same name 4:same fam 3:same sf...) if (*par.scorefile) { if (v>=3) printf("Printing scores file ...\n"); hitlist.PrintScoreFile(q); } // Delete all hitlist entries with too short alignments nhits=0; hitlist.Reset(); while (!hitlist.End()) { hit_cur = hitlist.ReadNext(); // printf("Deleting alignment of %s with length %i? nhits=%-2i par.B=%-3i par.Z=%-3i par.e=%.2g par.b=%-3i par.z=%-3i par.p=%.2g\n",hit_cur.name,hit_cur.matched_cols,nhits,par.B,par.Z,par.e,par.b,par.z,par.p); if (nhits>=imax(par.B,par.Z)) break; if (nhits>=imax(par.b,par.z) && hit_cur.Probab < par.p) break; if (nhits>=imax(par.b,par.z) && hit_cur.Eval > par.E) continue; if (hit_cur.matched_cols < MINCOLS_REALIGN) { if (v>=3) printf("Deleting alignment of %s with length %i\n",hit_cur.name,hit_cur.matched_cols); hitlist.Delete().Delete(); // delete the list record and hit object // // Make sure only realigned alignments get displayed! JS: Why? better unrealigned than none. // if (par.B>par.Z) par.B--; else if (par.B==par.Z) {par.B--; par.Z--;} else par.Z--; // if (par.b>par.z) par.b--; else if (par.b==par.z) {par.b--; par.z--;} else par.z--; } nhits++; } // Delete hash phash_plist_realignhitpos with lists phash_plist_realignhitpos->Reset(); while (!phash_plist_realignhitpos->End()) delete(phash_plist_realignhitpos->ReadNext()); // delete list to which phash_plist_realignhitpos->ReadNext() points delete(phash_plist_realignhitpos); // Delete array_plist_phits with lists for (int index=0; index1 && !strcmp(argv[i],"-v0")) v=0; else if (argc>1 && !strcmp(argv[i],"-v1")) v=1; else if (argc>2 && !strcmp(argv[i],"-v")) v=atoi(argv[i+1]); } par.SetDefaultPaths(program_path); // Read .hhdefaults file? if (par.readdefaultsfile) { // Process default otpions from .hhconfig file ReadDefaultsFile(argc_conf,argv_conf,program_path); ProcessArguments(argc_conf,argv_conf); } // Process command line options (they override defaults from .hhdefaults file) ProcessArguments(argc,argv); // Check command line input and default values if (!*par.infile || !strcmp(par.infile,"")) // infile not given {help(); cerr<MAXTHREADS) { threads=MAXTHREADS; if (v>=1) fprintf(stderr,"WARNING: number of CPUs set to maximum value of %i\n",MAXTHREADS); } RemoveExtension(q->file,par.infile); // get rootname of infile (no directory path, no extension) Extension(inext,par.infile); // get extension of infile if (!*par.outfile) // outfile not given? Name it basename.hhm { RemoveExtension(par.outfile,par.infile); strcat(par.outfile,".hhr"); if (v>=2) cout<<"Search results will be written to "<MAXSEQDIS-3-par.showcons) par.nseqdis=MAXSEQDIS-3-par.showcons; //3 reserved for secondary structure if (par.aliwidth<20) par.aliwidth=20; if (par.pca<0.001) par.pca=0.001; // to avoid log(0) if (par.b>par.B) par.B=par.b; if (par.z>par.Z) par.Z=par.z; if (par.maxmem<1.0) {cerr<<"Warning: setting -maxmem to its minimum allowed value of 1.0\n"; par.maxmem=1.0;} // Input parameters if (v>=3) { cout<<"Input file : "<(fin); fclose(fin); cs::TransformToLog(*context_lib); lib_pc = new cs::LibraryPseudocounts(*context_lib, par.csw, par.csb); } // Set secondary structure substitution matrix if (par.ssm) SetSecStrucSubstitutionMatrix(); // Set (global variable) substitution matrix and derived matrices SetSubstitutionMatrix(); // Read input file (HMM, HHM, or alignment format), and add pseudocounts etc. char input_format=0; ReadQueryFile(par.infile,input_format,q); PrepareQueryHMM(input_format,q); // // Rescale matrix according to query aa composition? (Two iterations are sufficient) // if (par.pcm==4) // { // q->RescaleMatrix(); // q->PreparePseudocounts(); // q->AddAminoAcidPseudocounts(); // SetSubstitutionMatrix(); // q->RescaleMatrix(); // q->PreparePseudocounts(); // q->AddAminoAcidPseudocounts(); // } // Reset lamda? if (par.calibrate>0) {q->lamda=LAMDA; q->mu=0.0;} // Set query columns in His-tags etc to Null model distribution if (par.notags) q->NeutralizeTags(); if (par.forward>=1) { if (v>=2 && par.forward==1) printf("Using Forward algorithm ...\n"); if (v>=2 &&par.forward==2) printf("Using maximum accuracy (MAC) alignment algorithm ...\n"); } else if (v>=3) printf("Using Viterbi algorithm ...\n"); // Prepare multi-threading - reserve memory for threads, intialize, etc. if (threads==0) bins=1; else bins=iround(threads*1.2+0.5); jobs_running = 0; jobs_submitted = 0; reading_dbs=1; // needs to be set to 1 before threads are created for (bin=0; binAllocateBacktraceMatrix(q->L+2,par.maxres); // ...with a separate dynamic programming matrix (memory!!) if (par.forward>=1) hit[bin]->AllocateForwardMatrix(q->L+2,par.maxres); if (par.forward==2) hit[bin]->AllocateBackwardMatrix(q->L+2,par.maxres); bin_status[bin] = FREE; } #ifdef PTHREAD // Start threads for database search for (int j=0; j* doubled; doubled = new(Hash); doubled->New(16381,0); char** dbfiles = new char*[par.maxnumdb+1]; char* dbfile_cur=strscn_(par.dbfiles); // current file name char* dbfile_next; // next file name after current while (*dbfile_cur && ndbContains(dbfile_cur)) { doubled->Add(dbfile_cur); if (!strcmp(dbfile_cur+strlen(dbfile_cur)-4,".pal")) { char dbfile[NAMELEN]=""; // input line FILE* palf = NULL; if (!strcmp(dbfile,"stdin")) palf=stdin; else { palf = fopen(dbfile_cur,"rb"); if (!palf) OpenFileError(dbfile_cur); } while(fgetline(dbfile,NAMELEN,palf) && ndbContains(dbfile)) { doubled->Add(dbfile); dbfiles[ndb]=new(char[strlen(dbfile)+1]); strcpy(dbfiles[ndb],dbfile); if (ndb<5 && ndb>0 && access(dbfiles[ndb],R_OK)) OpenFileError(dbfiles[ndb]); // file not readable? // printf("dbfiles[%i]='%s'\n",ndb,dbfiles[ndb]); ndb++; } else if (v>=1) fprintf(stderr,"WARNING: skipping doubled datbase file %s\n",dbfile); } fclose(palf); } else { dbfiles[ndb]=new(char[strlen(dbfile_cur)+1]); strcpy(dbfiles[ndb],dbfile_cur); if (ndb<5 && ndb>0 && access(dbfiles[ndb],R_OK)) OpenFileError(dbfiles[ndb]); // file not readable? // printf("dbfiles[%i]='%s'\n",ndb,dbfiles[ndb]); ndb++; } } else if (v>=1) fprintf(stderr,"WARNING: skipping doubled datbase file %s\n",dbfile_cur); dbfile_cur=dbfile_next; } //fprintf (stderr,"ndb: %i par.maxnumdb: %i\n",ndb,par.maxnumdb); if (v>=1 && ndb>=par.maxnumdb && *dbfiles[ndb-1]) fprintf (stderr,"WARNING: maximum of %i allowed databases surpassed. Skipping the rest.\n",par.maxnumdb); /////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Search databases // Initialize N_searched=0; v1=v; if (v>0 && v<=3) v=1; else v-=2; if (print_elapsed) ElapsedTimeSinceLastCall("(preparing for search)"); // For all the databases given in -d '...' option ... for (int idb=0; idbindex = N_searched; // give hit a unique index for HMM hit[bin]->ftellpos = ftell(dbf); // record position in dbfile of next HMM to be read char path[NAMELEN]; Pathname(path,dbfiles[idb]); /////////////////////////////////////////////////// // Read next HMM from database file if (!fgetline(line,LINELEN,dbf)) {read_from_db=0; break;} while (strscn(line)==NULL && fgetline(line,LINELEN,dbf)) {} // skip lines that contain only white space if (strscn(line)==NULL) break; if (!strncmp(line,"HMMER3",6)) // read HMMER3 format { format[bin] = 1; read_from_db = t[bin]->ReadHMMer3(dbf,dbfiles[idb]); par.hmmer_used = true; } else if (!strncmp(line,"HMMER",5)) // read HMMER format { format[bin] = 1; read_from_db = t[bin]->ReadHMMer(dbf,dbfiles[idb]); par.hmmer_used = true; } else if (!strncmp(line,"HH",2)) // read HHM format { format[bin] = 0; read_from_db = t[bin]->Read(dbf,path); } else if (!strncmp(line,"NAME",4)) // The following lines are for backward compatibility of HHM format version 1.2 with 1.1 { fseek(dbf,hit[bin]->ftellpos,SEEK_SET); // rewind to beginning of line format[bin] = 0; read_from_db = t[bin]->Read(dbf,path); } else if (line[0]=='#') // read a3m alignment { Alignment tali; tali.Read(dbf,dbfiles[idb],line); tali.Compress(dbfiles[idb]); // qali.FilterForDisplay(par.max_seqid,par.coverage,par.qid,par.qsc,par.nseqdis); tali.N_filtered = tali.Filter(par.max_seqid_db,par.coverage_db,par.qid_db,par.qsc_db,par.Ndiff_db); char wg=par.wg; par.wg=1; // use global weights t[bin]->name[0]=t[bin]->longname[0]=t[bin]->fam[0]='\0'; tali.FrequenciesAndTransitions(t[bin]); par.wg=wg; //reset global weights format[bin] = 0; } else { cerr<=4) printf("Aligning with %s\n",t[bin]->name); /////////////////////v>=4 /////////////////////////////////////////////////// hit[bin]->dbfile = new(char[strlen(dbfiles[idb])+1]); strcpy(hit[bin]->dbfile,dbfiles[idb]); // record db file name from which next HMM is read ++N_searched; if (v1>=2 && !(N_searched%20)) { cout<<"."; if (!(N_searched%1000)) printf(" %-4i HMMs searched\n",N_searched); cout.flush(); } #ifdef PTHREAD // Lock access to bin_status if (threads>0) rc = pthread_mutex_lock(&bin_status_mutex); #endif // Send the job in bin to a thread bin_status[bin] = SUBMITTED; jobs_submitted++; if (threads==0) // if no multi-threading mode, main thread executes job itself { AlignByWorker(bin); bin_status[bin] = FREE; jobs_submitted--; break; } #ifdef PTHREAD // Restart threads waiting for a signal rc = pthread_cond_signal(&new_job); // Unlock access to bin_status rc = pthread_mutex_unlock(&bin_status_mutex); if (DEBUG_THREADS) fprintf(stderr,"Main: put job into bin %i name=%-7.7s Jobs running: %i jobs_submitted:%i \n",bin,t[bin]->name,jobs_running,jobs_submitted); #endif } #ifdef PTHREAD if (threads>0) { // Lock mutex rc = pthread_mutex_lock(&bin_status_mutex); // Wait until job finishes and a bin becomes free if (jobs_submitted+jobs_running>=bins) { if (DEBUG_THREADS) fprintf(stderr,"Master thread is waiting for jobs to be finished...\n"); #ifdef WINDOWS rc = pthread_cond_wait(&finished_job, &bin_status_mutex); #else #ifdef HH_MAC // If no submitted jobs are in the queue we have to wait for a new job ... struct timespec ts; struct timeval tv; gettimeofday(&tv, NULL); ts.tv_sec = tv.tv_sec + 1; rc = pthread_cond_timedwait(&finished_job, &bin_status_mutex,&ts); #else // If no submitted jobs are in the queue we have to wait for a new job ... struct timespec ts; clock_gettime(CLOCK_REALTIME,&ts); ts.tv_sec += 1; rc = pthread_cond_timedwait(&finished_job, &bin_status_mutex,&ts); #endif #endif } // Unlock mutex rc = pthread_mutex_unlock(&bin_status_mutex); } #endif } // end while(read_from_db!=0) /////////////////////////////////////////////////////////////////////////////////////// fclose(dbf); } reading_dbs=0; #ifdef PTHREAD if (threads>0) { // No more HMMs in database => wait until all threads have finished if (DEBUG_THREADS) fprintf(stderr,"No more jobs read from database Jobs running:%i jobs_submitted:%i \n",jobs_running,jobs_submitted); // Free all threads still waiting for jobs rc = pthread_mutex_lock(&bin_status_mutex); rc = pthread_cond_broadcast(&new_job); rc = pthread_mutex_unlock(&bin_status_mutex); // Unlock mutex // Wait for all jobs to finish => join all jobs to main for (int j=0; j=2) cout<<"\n"; v=v1; if (print_elapsed) ElapsedTimeSinceLastCall("(search through database)"); // Sort list according to sortscore if (v>=3) printf("Sorting hit list ...\n"); hitlist.SortList(); // Fit EVD (with lamda, mu) to score distribution? if (par.calm==3) { hitlist.CalculatePvalues(q); // Use NN prediction of lamda and mu } else if ((par.calm!=1 && q->lamda==0) || par.calibrate>0) { if (v>=2 && par.loc) printf("Fitting scores with EVD (first round) ...\n"); hitlist.MaxLikelihoodEVD(q,3); // first ML fit: exclude 3 best superfamilies from fit if (v>=3) printf("Number of families present in database: %i\n",hitlist.fams); // DEBUG if (hitlist.fams>=100) { if (par.loc) { if (v>=2) printf("Fitting scores with EVD (second round) ...\n"); hitlist.MaxLikelihoodEVD(q,0); // second ML fit: exclude superfamilies with E-value=2) fprintf(stderr,"WARNING: E-values for global alignment option may be unreliable.\n"); hitlist.ResortList(); } } else { if (v) { fprintf(stderr,"\nWARNING: no E-values could be calculated.\n"); fprintf(stderr,"To calculate E-values you have two options:\n"); fprintf(stderr,"1. Calibrate your query profile HMM with a calibration database:\n"); fprintf(stderr," > hhsearch -i yourHMM.hhm -d cal.hhm -cal\n"); fprintf(stderr," This will insert a line in yourHMM.hhm with lamda and mu of the score distribution.\n"); fprintf(stderr," cal.hhm contains 1220 HMMs from different SCOP superfamilies and is supplied with HHsearch.\n"); fprintf(stderr," Instead of cal.hhm you may also use any SCOP database file, e.g. scop70_1.69\n"); fprintf(stderr," Note that your HMM needs to be recalibrated when changing HMM-HMM alignment options.\n"); fprintf(stderr,"2. Append cal.hhm to your own database:\n"); fprintf(stderr," > cat cal.hhm >> yourDB.hhm\n"); fprintf(stderr," But note that HMMs contained in cal.hmm will pop up among your hits.\n"); } } if (par.calm==2) hitlist.GetPvalsFromCalibration(q); } else hitlist.GetPvalsFromCalibration(q); // Optimization mode? if (par.opt) hitlist.Optimize(q); // Set new ss weight for realign par.ssw = par.ssw_realign; // Realign hits with MAC algorithm if (par.realign && par.forward!=2) { perform_realign(dbfiles,ndb); } else { // Print for each HMM: n score -log2(Pval) L name (n=5:same name 4:same fam 3:same sf...) if (*par.scorefile) { if (v>=3) printf("Printing scores file ...\n"); } } // // Print for each HMM: n score -log2(Pval) L name (n=5:same name 4:same fam 3:same sf...) // if (*par.scorefile) { // if (v>=3) printf("Printing scores file ...\n"); // hitlist.PrintScoreFile(q); // } // Print FASTA or A2M alignments? if (*par.pairwisealisfile) { if (v>=2) cout<<"Printing alignments in "<<(par.outformat==1? "FASTA" : par.outformat==2?"A2M" :"A3M")<<" format to "<=1) fprintf(stderr,"WARNING: Using HMMER files results in a drastically reduced sensitivity (>10%%).\nWe recommend to use HHMs build by hhmake.\n"); // Print summary listing of hits if (v>=3) printf("Printing hit list ...\n"); hitlist.PrintHitList(q,par.outfile); // Write only hit list to screen? if (v==2 && strcmp(par.outfile,"stdout")) WriteToScreen(par.outfile,1009); // write only hit list to screen // Print alignments of query sequences against hit sequences hitlist.PrintAlignments(q,par.outfile); // Write whole output file to screen? (max 10000 lines) if (v>=3 && strcmp(par.outfile,"stdout")) WriteToScreen(par.outfile,10009); // Write HMM to output file without pseudocounts if (par.calibrate) q->InsertCalibration(par.infile); // Generate result alignment or HMM file? if (*par.alnfile || *par.psifile || *par.hhmfile) { Alignment Qali; // output A3M generated by merging A3M alignments for significant hits to the query alignment Hit hit; int nhits=0; // Read query alignment into Qali char qa3mfile[NAMELEN]; char ta3mfile[NAMELEN]; RemoveExtension(qa3mfile,par.infile); // directory?? strcat(qa3mfile,".a3m"); FILE* qa3mf=fopen(qa3mfile,"r"); if (!qa3mf) OpenFileError(qa3mfile); Qali.Read(qa3mf,qa3mfile); fclose(qa3mf); // If par.append==1 do not print query alignment if (par.append) Qali.MarkSeqsAsNonPrintable(); if (v>=2) printf("Merging hits to query alignment %s ...\n",qa3mfile); // If query consists of only one sequence: // realign templates to query HMM enforced by sequences from up to the 10 best templates v1=v--; // For each template below threshold hitlist.Reset(); while (!hitlist.End()) { hit = hitlist.ReadNext(); if (hit.Eval > 100.0*par.e) break; // E-value much too large if (hit.Eval > par.e) continue; // E-value too large // Read a3m alignment of hit from .a3m file and merge into Qali alignment strcpy(ta3mfile,hit.file); // copy filename including path but without extension strcat(ta3mfile,".a3m"); Alignment Tali; FILE* ta3mf=fopen(ta3mfile,"r"); if (!ta3mf) OpenFileError(ta3mfile); Tali.Read(ta3mf,ta3mfile); // Read template alignment into Tali fclose(ta3mf); Tali.Compress(ta3mfile); // Filter database alignment Qali.MergeMasterSlave(hit,Tali,ta3mfile); nhits++; } // Convert ASCII to int (0-20),throw out all insert states, record their number in I[k][i] Qali.Compress("merged A3M file"); // Sort out the nseqdis most dissimilar sequences for display in the result alignments Qali.FilterForDisplay(par.max_seqid,par.coverage,par.qid,par.qsc,par.nseqdis); v=v1; // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two) float const COV_ABS = 25; // min. number of aligned residues int cov_tot = imax(imin((int)(COV_ABS / Qali.L * 100 + 0.5), 70), par.coverage); if (v>2) printf("Filter new alignment with cov %3i%%\n", cov_tot); Qali.N_filtered = Qali.Filter(par.max_seqid,cov_tot,par.qid,par.qsc,par.Ndiff); // Calculate (and write) output HMM? if (*par.hhmfile) { strcpy(Qali.longname,q->longname); strcpy(Qali.name,q->name); strcpy(Qali.fam,q->fam); RemovePathAndExtension(Qali.file,par.hhmfile); // Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a] HMM* Q = new HMM; // output HMM: generated from Qali Qali.FrequenciesAndTransitions(Q); // Add *no* amino acid pseudocounts to query. This is necessary to copy f[i][a] to p[i][a] Q->AddAminoAcidPseudocounts(0, 0.0, 0.0, 1.0); Q->CalculateAminoAcidBackground(); // Write HMM to output file in HHsearch format? Q->WriteToFile(par.hhmfile); delete Q; } // Write output A3M alignment? if (*par.alnfile) Qali.WriteToFile(par.alnfile,"a3m"); // Write output PSI-BLAST-formatted alignment? if (*par.psifile) Qali.WriteToFile(par.psifile,"psi"); } // Write alignments with posteriors etc. to alitabfile? if (*par.alitabfile) { FILE* alitabf=NULL; if (strcmp(par.alitabfile,"stdout")) alitabf = fopen(par.alitabfile, "w"); else alitabf = stdout; if (!alitabf) OpenFileError(par.alitabfile); // Store all dbfiles and ftell positions of templates to be displayed and realigned int nhits=0; hitlist.Reset(); while (!hitlist.End()) { hit_cur = hitlist.ReadNext(); if (nhits>=imax(par.B,par.Z)) break; if (nhits>=imax(par.b,par.z) && hit_cur.Probab < par.p) break; if (nhits>=imax(par.b,par.z) && hit_cur.Eval > par.E) continue; fprintf(alitabf, ">%s\n", hit_cur.longname); WriteToAlifile(alitabf,&hit_cur); nhits++; } fclose(alitabf); } // Delete memory for dynamic programming matrix for (bin=0; binDeleteBacktraceMatrix(q->L+2); if (par.forward>=1 || par.realign) hit[bin]->DeleteForwardMatrix(q->L+2); if (par.forward==2 || par.realign) hit[bin]->DeleteBackwardMatrix(q->L+2); delete hit[bin]; delete t[bin]; } if (par.dbfiles) delete[] par.dbfiles; if (dbfiles) { for (int idb=0; idb=2) printf("Done\n"); } exit(0); } //end main ////////////////////////////////////////////////////////////////////////////////////////////////////// // END OF MAIN //////////////////////////////////////////////////////////////////////////////////////////////////////