// hhblits.C:
// Iterative search for a multiple alignment in a profile HMM database
//
// Error codes: 0: ok 1: file format error 2: file access error 3: memory error 4: command line error 6: internal logic error 7: internal numeric error
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program. If not, see .
// We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de
// Reference:
// Remmert M., Biegert A., Hauser A., and Soding J.
// HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
// Nat. Methods 9:173-175 (2011); epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
//////////////////////////////////////////////////////////////////////////////////////////
// This program contains, in file hhprefilter.C, code adapted from Michael Farrar
// (http://sites.google.com/site/farrarmichael/smith-waterman). His code is marked
// in the file hhprefilter.C.
// The copy right of his code is shown below:
// Copyright 2006, by Michael Farrar. All rights reserved. The SWSSE2
// program and documentation may not be sold or incorporated into a
// commercial product, in whole or in part, without written consent of
// Michael Farrar.
//
// For further information regarding permission for use or reproduction,
// please contact Michael Farrar at:
//
// farrar.michael@gmail.com
//
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
// Note by J. Soeding: Michael Farrar died unexpectedly in December 2010.
// Many thanks posthumously for your great code!
#define PTHREAD
#define MAIN
#define HHBLITS
// #define HHDEBUG
// #define DEBUG_THREADS
// #define WINDOWS
#include // cin, cout, cerr
#include // ofstream, ifstream
#include // printf
#include // min,max
#include // exit
#include // strcmp, strstr
#include
#include
#include // sqrt, pow
#include // INT_MIN
#include // FLT_MIN
#include // islower, isdigit etc
#include // clock_gettime etc. (in realtime library (-lrt compiler option))
#include // perror(), strerror(errno)
#include
#include
#ifdef PTHREAD
#include // POSIX pthread functions and data structures
#endif
#ifdef _OPENMP
#include
#endif
#include
#ifdef HH_SSE41
#include // SSSE3
#include // SSE4.1
#define HH_SSE3
#endif
#ifdef HH_SSE3
#include // SSE3
#define HH_SSE2
#endif
#ifdef HH_SSE2
#ifndef __SUNPRO_C
#include // SSE2
#else
#include
#endif
#endif
using std::cout;
using std::cerr;
using std::endl;
using std::ios;
using std::ifstream;
using std::ofstream;
using std::string;
using std::stringstream;
using std::vector;
using std::pair;
extern "C" {
#include // fast index-based database reading
}
#include "cs.h" // context-specific pseudocounts
#include "context_library.h"
#include "library_pseudocounts-inl.h"
#include "abstract_state_matrix.h"
cs::ContextLibrary *cs_lib;
#include "util.C" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
#include "list.C" // list data structure
#include "hash.C" // hash data structure
#include "hhdecl.C" // Constants, global variables, struct Parameters
#include "hhutil.C" // MatchChr, InsertChr, aa2i, i2aa, log2, fast_log2, ScopID, WriteToScreen,
#include "hhmatrices.C" // BLOSUM50, GONNET, HSDM
#include "hhhmm.h" // class HMM
#include "hhhit.h" // class Hit
#include "hhalignment.h" // class Alignment
#include "hhhalfalignment.h" // class HalfAlignment
#include "hhfullalignment.h" // class FullAlignment
#include "hhhitlist.h" // class Hit
#include "hhhmm.C" // class HMM
#include "hhalignment.C" // class Alignment
#include "hhhit.C" // class Hit
#include "hhhalfalignment.C" // class HalfAlignment
#include "hhfullalignment.C" // class FullAlignment
#include "hhhitlist.C" // class HitList
#include "hhfunc.C" // some functions common to hh programs
/////////////////////////////////////////////////////////////////////////////////////
// Global variables
////////////////////////////////////////////////////////////////////////////////////
char line[LINELEN]=""; // input line
int bin; // bin index
const char print_elapsed=0; // debug output for runtimes
char tmp_file[]="/tmp/hhblitsXXXXXX"; // for runtime secondary structure prediction (only with -addss option)
// HHblits variables
const char HHBLITS_REFERENCE[] = "Remmert M., Biegert A., Hauser A., and Soding J.\nHHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.\nNat. Methods 9:173-175 (2011)\n";
int v1=v; // verbose mode
int num_rounds = 2; // number of iterations
bool last_round = false; // set to true in last iteration
bool already_seen_filter = true; // Perform filtering of already seen HHMs
bool block_filter = true; // Perform viterbi and forward algorithm only on unshaded tube given by prefiltering
bool realign_old_hits = false; // Realign old hits in last round or use previous alignments
char input_format = 0; // Set to 1 if input in HMMER format (has already pseudocounts)
float neffmax = 10; // Break if Neff > Neffmax
int cpu = 2; // default: use 2 cores
char config_file[NAMELEN];
char infile[NAMELEN];
char alis_basename[NAMELEN];
char query_hhmfile[NAMELEN]; // -qhmm output file
bool alitab_scop = false; // Write only SCOP alignments in alitabfile
char db_ext[NAMELEN];
// Needed for fast index reading
size_t data_size;
FILE *dba3m_data_file;
FILE *dba3m_index_file;
FILE *dbhhm_data_file;
FILE *dbhhm_index_file;
char* dba3m_data;
char* dbhhm_data;
ffindex_index_t* dbhhm_index = NULL;
ffindex_index_t* dba3m_index = NULL;
char db_base[NAMELEN]; // database basename
char db[NAMELEN]; // database with context-state sequences
char dba3m[NAMELEN]; // database with A3M-files
char dbhhm[NAMELEN]; // database with HHM-files
char** dbfiles_new;
char** dbfiles_old;
int ndb_new=0;
int ndb_old=0;
Hash* previous_hits;
Hash* premerged_hits;
// HHsearch variables
const int MAXTHREADS=256; // maximum number of threads (i.e. CPUs) for parallel computation
const int MAXBINS=384; // maximum number of bins (positions in thread queue)
enum bin_states {FREE=0, SUBMITTED=1, RUNNING=2};
int threads=0; // number of threads (apart from the main thread which reads from the databases file) 0:no multithreading
int bins; // number of bins; jobs gets allocated to a FREE bin were they are waiting for execution by a thread
char bin_status[MAXBINS]; // The status for each bin is FREE, SUBMITTED, or RUNNING
int jobs_running; // number of active jobs, i.e. number of bins set to RUNNING
int jobs_submitted; // number of submitted jobs, i.e. number of bins set to SUBMITTED
char reading_dbs; // 1: still HMMs to read in a database; 0: finshed reading all HMMs, no db left
const char DEBUG_THREADS=0; // Debugging flag
HMM* q; // Create query HMM with maximum of par.maxres match states
HMM* q_tmp; // Create query HMM with maximum of par.maxres match states (needed for prefiltering)
HMM* t[MAXBINS]; // Each bin has a template HMM allocated that was read from the database file
Hit* hit[MAXBINS]; // Each bin has an object of type Hit allocated with a separate dynamic programming matrix (memory!!)
Hit hit_cur; // Current hit when going through hitlist
HitList hitlist; // list of hits with one Hit object for each pairwise comparison done
int* format; // format[bin] = 0 if in HHsearch format => add pcs; format[bin] = 1 if in HMMER format => no pcs
int read_from_db; // The value of this flag is returned from HMM::Read(); 0:end of file 1:ok 2:skip HMM
int N_searched; // Number of HMMs searched
Alignment Qali; // output A3M generated by merging A3M alignments for significant hits to the query alignment
Alignment Qali_allseqs; // output A3M alignment with no sequence filtered out (only active with -all option)
struct Thread_args // data to hand to WorkerLoop thread
{
int thread_id; // id of thread (for debugging)
void (*function)(int); // pointer to function (=job) to execute by WorkerLoop once old job is done
};
#ifdef PTHREAD
struct Thread_args thread_data[MAXTHREADS]; // store a threads thread_id and function to call (AlignByWorker, RealignByWorker)
pthread_t pthread[MAXTHREADS]; // info on thread's structures (needed by system)
pthread_attr_t joinable; // attribute set for describing threads
int rc; // return code for threading commands
// With this condition variable the main thread signals to the worker threads that it has submitted a new job
pthread_cond_t new_job = PTHREAD_COND_INITIALIZER;
// Mutex assures exclusive access to bin_status[], jobs_sumitted, jobs_running, and new_job by threads
pthread_mutex_t bin_status_mutex = PTHREAD_MUTEX_INITIALIZER;
// Mutex assures exclusive access to hitlist
pthread_mutex_t hitlist_mutex = PTHREAD_MUTEX_INITIALIZER;
// With this condition variable a worker thread signals to the main thread that it has finished a job
pthread_cond_t finished_job = PTHREAD_COND_INITIALIZER;
#endif
inline int PickBin(char status);
// Include hhworker.C and hhprefilter.C here, because it needs some of the above variables
#include "hhworker.C" // functions: AlignByWorker, RealignByWorker, WorkerLoop
#include "hhprefilter.C" // some prefilter functions
/////////////////////////////////////////////////////////////////////////////////////
// Help functions
/////////////////////////////////////////////////////////////////////////////////////
void help(char all=0)
{
// ----+----1----+----2----+----3----+----4----+----5----+----6----+----7----+---8-----+----9----+----0
printf("\n");
printf("HHblits %s:\nHMM-HMM-based lightning-fast iterative sequence search\n",VERSION_AND_DATE);
printf("HHblits is a sensitive, general-purpose, iterative sequence search tool that represents\n");
printf("both query and database sequences by HMMs. You can search HHblits databases starting\n");
printf("with a single query sequence, a multiple sequence alignment (MSA), or an HMM. HHblits\n");
printf("prints out a ranked list of database HMMs/MSAs and can also generate an MSA by merging\n");
printf("the significant database HMMs/MSAs onto the query MSA.\n");
printf("\n");
printf("%s",HHBLITS_REFERENCE);
printf("%s",COPYRIGHT);
printf("\n");
printf("Usage: %s -i query [options] \n",program_name);
printf(" -i input/query: single sequence or multiple sequence alignment (MSA)\n");
printf(" in a3m, a2m, or FASTA format, or HMM in hhm format\n");
if (all) {
printf("\n");
printf(" may be 'stdin' or 'stdout' throughout.\n");
}
printf("\n");
printf("Options: \n");
printf(" -d database name (e.g. uniprot20_29Feb2012) (default=%s) \n",db_base);
printf(" -n [1,8] number of iterations (default=%i) \n",num_rounds);
printf(" -e [0,1] E-value cutoff for inclusion in result alignment (def=%G) \n",par.e);
printf("\n");
printf("Input alignment format: \n");
printf(" -M a2m use A2M/A3M (default): upper case = Match; lower case = Insert;\n");
printf(" ' -' = Delete; '.' = gaps aligned to inserts (may be omitted) \n");
printf(" -M first use FASTA: columns with residue in 1st sequence are match states\n");
printf(" -M [0,100] use FASTA: columns with fewer than X%% gaps are match states \n");
printf("\n");
printf("Output options: \n");
printf(" -o write results in standard format to file (default=)\n");
printf(" -oa3m write result MSA with significant matches in a3m format\n");
if (!all) {
printf(" Analogous for -opsi and -ohhm\n");
}
if (all) {
printf(" -opsi write result MSA of significant matches in PSI-BLAST format\n");
printf(" -ohhm write HHM file for result MSA of significant matches\n");
}
printf(" -oalis write MSAs in A3M format after each iteration\n");
if (all) {
printf(" -Ofas write pairwise alignments of significant matches in FASTA format\n");
printf(" Analogous for output in a3m and a2m format (e.g. -Oa3m)\n");
printf(" -qhhm write query input HHM file of last iteration (default=off) \n");
printf(" -seq max. number of query/template sequences displayed (default=%i) \n",par.nseqdis);
printf(" -aliw number of columns per line in alignment list (default=%i) \n",par.aliwidth);
printf(" -p [0,100] minimum probability in summary and alignment list (default=%G) \n",par.p);
printf(" -E [0,inf[ maximum E-value in summary and alignment list (default=%G) \n",par.E);
printf(" -Z maximum number of lines in summary hit list (default=%i) \n",par.Z);
printf(" -z minimum number of lines in summary hit list (default=%i) \n",par.z);
printf(" -B maximum number of alignments in alignment list (default=%i) \n",par.B);
printf(" -b minimum number of alignments in alignment list (default=%i) \n",par.b);
printf("\n");
printf("Prefilter options \n");
printf(" -noprefilt disable all filter steps \n");
printf(" -noaddfilter disable all filter steps (except for fast prefiltering) \n");
printf(" -nodbfilter disable additional filtering of prefiltered HMMs \n");
printf(" -noblockfilter search complete matrix in Viterbi \n");
printf(" -maxfilt max number of hits allowed to pass 2nd prefilter (default=%i) \n",par.maxnumdb);
printf("\n");
}
printf("Filter options applied to query MSA, database MSAs, and result MSA \n");
printf(" -all show all sequences in result MSA; do not filter result MSA \n");
printf(" -id [0,100] maximum pairwise sequence identity (def=%i)\n",par.max_seqid);
printf(" -diff [0,inf[ filter MSAs by selecting most diverse set of sequences, keeping \n");
printf(" at least this many seqs in each MSA block of length 50 (def=%i) \n",par.Ndiff);
printf(" -cov [0,100] minimum coverage with master sequence (%%) (def=%i) \n",par.coverage);
printf(" -qid [0,100] minimum sequence identity with master sequence (%%) (def=%i) \n",par.qid);
printf(" -qsc [0,100] minimum score per column with master sequence (default=%.1f) \n",par.qsc);
printf(" -neff [1,inf] target diversity of multiple sequence alignment (default=off) \n");
printf("\n");
printf("HMM-HMM alignment options: \n");
printf(" -norealign do NOT realign displayed hits with MAC algorithm (def=realign) \n");
printf(" -mact [0,1[ posterior probability threshold for MAC re-alignment (def=%.3f) \n",par.mact);
printf(" Parameter controls alignment greediness: 0:global >0.1:local \n");
printf(" -glob/-loc use global/local alignment mode for searching/ranking (def=local)\n");
if (all) {
printf(" -realign_max realign max. hits (default=%i) \n",par.realign_max);
printf(" -alt show up to this many significant alternative alignments(def=%i) \n",par.altali);
printf(" -premerge merge hits to query MSA before aligning remaining hits (def=%i)\n",par.premerge);
printf(" -shift [-1,1] profile-profile score offset (def=%-.2f) \n",par.shift);
printf(" -ssm {0,..,4} 0: no ss scoring \n");
printf(" 1,2: ss scoring after or during alignment [default=%1i] \n",par.ssm);
printf(" 3,4: ss scoring after or during alignment, predicted vs. predicted\n");
printf(" -ssw [0,1] weight of ss score (def=%-.2f) \n",par.ssw);
printf("\n");
printf("Gap cost options: \n");
printf(" -gapb [0,inf[ Transition pseudocount admixture (def=%-.2f) \n",par.gapb);
printf(" -gapd [0,inf[ Transition pseudocount admixture for open gap (default=%-.2f) \n",par.gapd);
printf(" -gape [0,1.5] Transition pseudocount admixture for extend gap (def=%-.2f) \n",par.gape);
printf(" -gapf ]0,inf] factor to increase/reduce gap open penalty for deletes (def=%-.2f) \n",par.gapf);
printf(" -gapg ]0,inf] factor to increase/reduce gap open penalty for inserts (def=%-.2f) \n",par.gapg);
printf(" -gaph ]0,inf] factor to increase/reduce gap extend penalty for deletes(def=%-.2f)\n",par.gaph);
printf(" -gapi ]0,inf] factor to increase/reduce gap extend penalty for inserts(def=%-.2f)\n",par.gapi);
printf(" -egq [0,inf[ penalty (bits) for end gaps aligned to query residues (def=%-.2f) \n",par.egq);
printf(" -egt [0,inf[ penalty (bits) for end gaps aligned to template residues (def=%-.2f)\n",par.egt);
printf("\n");
printf("Pseudocount (pc) options: \n");
printf(" -pcm {0,..,2} position dependence of pc admixture 'tau' (pc mode, default=%-i) \n",par.pcm);
printf(" 0: no pseudo counts: tau = 0 \n");
printf(" 1: constant tau = a \n");
printf(" 2: diversity-dependent: tau = a/(1 + ((Neff[i]-1)/b)^c) \n");
printf(" (Neff[i]: number of effective seqs in local MSA around column i) \n");
printf(" -pca [0,1] overall pseudocount admixture (def=%-.1f) \n",par.pca);
printf(" -pcb [1,inf[ Neff threshold value for -pcm 2 (def=%-.1f) \n",par.pcb);
printf(" -pcc [0,3] extinction exponent c for -pcm 2 (def=%-.1f) \n",par.pcc);
// printf(" -pcw [0,3] weight of pos-specificity for pcs (def=%-.1f) \n",par.pcw);
printf(" -pre_pca [0,1] PREFILTER pseudocount admixture (def=%-.1f) \n",par.pre_pca);
printf(" -pre_pcb [1,inf[ PREFILTER threshold for Neff (def=%-.1f) \n",par.pre_pcb);
printf("\n");
printf("Context-specific pseudo-counts: \n");
printf(" -nocontxt use substitution-matrix instead of context-specific pseudocounts \n");
printf(" -contxt context file for computing context-specific pseudocounts (default=%s)\n",par.clusterfile);
printf(" -cslib column state file for fast database prefiltering (default=%s)\n",par.cs_library);
printf("\n");
printf("Predict secondary structure\n");
printf(" -addss add 2ndary structure predicted with PSIPRED to result MSA \n");
printf(" -psipred directory with PSIPRED executables (default=%s) \n",par.psipred);
printf(" -psipred_data directory with PSIPRED data (default=%s) \n",par.psipred_data);
printf("\n");
}
printf("Other options: \n");
printf(" -v verbose mode: 0:no screen output 1:only warings 2: verbose (def=%i)\n",v);
printf(" -neffmax ]1,20] skip further search iterations when diversity Neff of query MSA \n");
printf(" becomes larger than neffmax (default=%.1f)\n",neffmax);
#ifdef PTHREAD
printf(" -cpu number of CPUs to use (for shared memory SMPs) (default=%i) \n",cpu);
#endif
if (all) {
printf(" -scores write scores for all pairwise comparisions to file \n");
printf(" -atab write all alignments in tabular layout to file \n");
printf(" -maxres max number of HMM columns (def=%5i) \n",par.maxres);
printf(" -maxmem [1,inf[ max available memory in GB (def=%.1f) \n",par.maxmem);
}
#ifndef PTHREAD
printf("(The -cpu option is inactive since HHblits was not compiled with POSIX thread support)\n");
#endif
printf("\n");
if (!all) {
printf("An extended list of options can be obtained by calling 'hhblits -help'\n");
}
printf("\n");
printf("Example: %s -i query.fas -oa3m query.a3m -n 1 \n",program_name);
cout<=4) cout<=argc || argv[i][0]=='-')
{help() ; cerr<=argc || argv[i][0]=='-')
{help() ; cerr<=argc || argv[i][0]=='-')
{help() ; cerr<=argc || argv[i][0]=='-')
{help() ; cerr<=argc || argv[i][0]=='-')
{help() ; cerr<=argc || argv[i][0]=='-')
{help() ; cerr<=argc || argv[i][0]=='-')
{help() ; cerr<=argc || argv[i][0]=='-')
{help() ; cerr<=argc || argv[i][0]=='-')
{help() ; cerr<=argc || argv[i][0]=='-')
{help() ; cerr<=argc || argv[i][0]=='-')
{help() ; cerr<=argc || argv[i][0]=='-')
{help() ; cerr<=argc || argv[i][0]=='-')
{help() ; cerr<=argc || argv[i][0]=='-')
{help() ; cerr<=argc || argv[i][0]=='-')
{help() ; cerr<=argc || argv[i][0]=='-')
{help() ; cerr<=argc || argv[i][0]=='-')
{help() ; cerr<=argc || argv[i][0]=='-')
{help() ; cerr<='0' && argv[i][0]<='9') {par.Mgaps=atoi(argv[i]); par.M=2;}
else cerr<length=atoi(argv[++i]);
else if (!strcmp(argv[i],"-filtercut") && (ithresh=(double)atof(argv[++i]);
else if (!strcmp(argv[i],"-noprefilt") || !strcmp(argv[i],"-nofilter") )
{par.prefilter=false; already_seen_filter=false; block_filter=false; par.early_stopping_filter=false; early_stopping->thresh=0;}
else if (!strcmp(argv[i],"-noaddfilter"))
{already_seen_filter=false; block_filter=false; par.early_stopping_filter=false; early_stopping->thresh=0;}
else if (!strcmp(argv[i],"-nodbfilter")) {early_stopping->thresh=0;}
else if (!strcmp(argv[i],"-noblockfilter")) {block_filter=false;}
else if (!strcmp(argv[i],"-noearlystoppingfilter")) {par.early_stopping_filter=false;}
else if (!strcmp(argv[i],"-block_len") && (i0.35 && par.mact<0.351) {par.mact=0;} }
else if (!strncmp(argv[i],"-loc",4)) par.loc=1;
else if (!strncmp(argv[i],"-alt",4) && (i=4) cout<irep=1; hit[bin]->irep <= par.altali; hit[bin]->irep++)
{
// Break, if no previous_hit with irep is found
hit[bin]->Viterbi(q,t[bin]);
if (hit[bin]->irep>1 && hit[bin]->score <= SMIN) break;
hit[bin]->Backtrace(q,t[bin]);
hit[bin]->score_sort = hit[bin]->score_aass;
//printf("PerformViterbiByWorker: %-12.12s %-12.12s irep=%-2i score=%6.2f\n",hit[bin]->name,hit[bin]->fam,hit[bin]->irep,hit[bin]->score);
#ifdef PTHREAD
pthread_mutex_lock(&hitlist_mutex); // lock access to hitlist
#endif
stringstream ss_tmp;
ss_tmp << hit[bin]->name << "__" << hit[bin]->irep;
if (previous_hits->Contains((char*)ss_tmp.str().c_str()))
{
//printf("Previous hits contains %s!\n",(char*)ss_tmp.str().c_str());
hit_cur = previous_hits->Remove((char*)ss_tmp.str().c_str());
previous_hits->Add((char*)ss_tmp.str().c_str(), *(hit[bin]));
// Overwrite *hit[bin] with alignment, etc. of hit_cur
hit_cur.score = hit[bin]->score;
hit_cur.score_aass = hit[bin]->score_aass;
hit_cur.score_ss = hit[bin]->score_ss;
hit_cur.Pval = hit[bin]->Pval;
hit_cur.Pvalt = hit[bin]->Pvalt;
hit_cur.logPval = hit[bin]->logPval;
hit_cur.logPvalt = hit[bin]->logPvalt;
hit_cur.Eval = hit[bin]->Eval;
hit_cur.logEval = hit[bin]->logEval;
hit_cur.Probab = hit[bin]->Probab;
hitlist.Push(hit_cur); // insert hit at beginning of list (last repeats first!)
}
else
{
// don't save alignments which where not found in previous rounds
//printf("Don't save %s!\n",(char*)ss_tmp.str().c_str());
//hitlist.Push(*(hit[bin])); // insert hit at beginning of list (last repeats first!)
}
#ifdef PTHREAD
pthread_mutex_unlock(&hitlist_mutex); // unlock access to hitlist
#endif
if (hit[bin]->score <= SMIN) break; // break if score for first hit is already worse than SMIN
}
return;
}
/////////////////////////////////////////////////////////////////////////////////////
// This function is called if an HMM (.hhm) instead of an MSA is given as query.
// It reads the query MSA either from .a3m if it exists, or from the
// representative sequences in the HHM/HMM file
/////////////////////////////////////////////////////////////////////////////////////
void ReadQueryA3MFile()
{
// Open query a3m MSA
char qa3mfile[NAMELEN];
RemoveExtension(qa3mfile,par.infile);
strcat(qa3mfile,".a3m");
FILE* qa3mf=fopen(qa3mfile,"r");
if (!qa3mf)
{
// .a3m does not exist => extract query MSA from representative sequences in HHM
if (v>=1 && input_format==0) // HHM format
printf("Extracting representative sequences from %s to merge later with matched database sequences\n",par.infile);
if (v>=1 && input_format==1) // HMMER format
printf("Extracting consensus sequence from %s to merge later with matched database sequences\n",par.infile);
Qali.GetSeqsFromHMM(q);
Qali.Compress(par.infile);
if (Qali.L != q->L) {
cerr<<"Error in "<L<<" match states, while its representative sequences have "< 1 || *par.alnfile || *par.psifile || *par.hhmfile || *alis_basename)
// {
// if (input_format==0 && v>=1) // HHM format
// cerr<<"WARNING: No alignment file found! Using only representative sequences in HHM file as starting MSA.\n";
// else if (input_format==1 && v>=1)
// cerr<<"WARNING: No alignment file found! Using only consensus sequence from HMMER file as starting MSA.\n";
// }
}
else
{
// .a3m does exist => read it into Qali
if (v>=1) printf("Reading query MSA from %s to merge later with matched database sequences\n",qa3mfile);
Qali.Read(qa3mf,qa3mfile);
Qali.Compress(qa3mfile);
// Copy names from query HMM (don't use name of master sequence of MSA)
delete[] Qali.longname;
Qali.longname = new(char[strlen(q->longname)+1]);
strcpy(Qali.longname,q->longname);
strcpy(Qali.name,q->name);
strcpy(Qali.fam,q->fam);
if (Qali.L != q->L) {
printf("Error in %s: query hhm %s has %i match states whereas query a3m %s has %i!\n",par.argv[0],qa3mfile,Qali.L,par.infile,q->L);
exit(4);
}
fclose(qa3mf);
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Perform Viterbi HMM-HMM search on all db HMM names in dbfiles
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
void DoViterbiSearch(char *dbfiles[], int ndb, bool alignByWorker=true)
{
// Search databases
for (bin=0; binrealign_around_viterbi=false;}
double filter_cutoff = early_stopping->length * early_stopping->thresh;
early_stopping->sum = early_stopping->length;
early_stopping->counter = 0;
for (int a=0; alength; a++) {early_stopping->evals[a]=1.0;}
// For all the databases comming through prefilter
for (int idb=0; idbsum < filter_cutoff)
{
if (v>=4)
printf("Stop after DB-HHM %i from %i (filter_sum: %8.4f cutoff: %8.4f)\n",idb,ndb,early_stopping->sum,filter_cutoff);
printf("\n");
break;
}
// Open HMM database
FILE* dbf;
char filename[NAMELEN];
strcpy(filename,dbfiles[idb]);
dbf = ffindex_fopen_by_name(dbhhm_data, dbhhm_index, dbfiles[idb]);
if (dbf == NULL) {
RemoveExtension(filename, dbfiles[idb]);
strcat(filename,".a3m");
if(dba3m_index_file==NULL) {
cerr<index = N_searched; // give hit a unique index for HMM
hit[bin]->ftellpos = ftell(dbf); // record position in dbfile of next HMM to be read
// fprintf(stderr,"dbfile=%-40.40s index=%-5i ftellpos=%i\n",dbfiles[idb],N_searched,(int) hit[bin]->ftellpos);
char path[NAMELEN];
Pathname(path,dbfiles[idb]);
///////////////////////////////////////////////////
// Read next HMM from database file
if (!fgetline(line,LINELEN,dbf)) {continue;}
while (strscn(line)==NULL) fgetline(line,LINELEN,dbf); // skip lines that contain only white space
if (!strncmp(line,"HMMER3",6)) // read HMMER3 format
{
format[bin] = 1;
t[bin]->ReadHMMer3(dbf,filename);
par.hmmer_used = true;
}
else if (!strncmp(line,"HMMER",5)) // read HMMER format
{
format[bin] = 1;
t[bin]->ReadHMMer(dbf,filename);
par.hmmer_used = true;
}
else if (!strncmp(line,"HH",2)) // read HHM format
{
format[bin] = 0;
t[bin]->Read(dbf,path);
}
else if (!strncmp(line,"NAME",4)) // The following lines are for backward compatibility of HHM format version 1.2 with 1.1
{
fseek(dbf,hit[bin]->ftellpos,SEEK_SET); // rewind to beginning of line
format[bin] = 0;
t[bin]->Read(dbf,path);
}
else if (line[0]=='#' || line[0]=='>') // read a3m alignment
{
Alignment tali;
tali.Read(dbf,filename,line);
tali.Compress(filename);
// qali.FilterForDisplay(par.max_seqid,par.coverage,par.qid,par.qsc,par.nseqdis);
tali.N_filtered = tali.Filter(par.max_seqid_db,par.coverage_db,par.qid_db,par.qsc_db,par.Ndiff_db);
char wg=par.wg; par.wg=1; // use global weights
t[bin]->name[0]=t[bin]->longname[0]=t[bin]->fam[0]='\0';
tali.FrequenciesAndTransitions(t[bin]);
par.wg=wg; //reset global weights
format[bin] = 0;
}
else
{
cerr<=4) printf("Aligning with %s\n",t[bin]->name); /////////////////////v>=4
///////////////////////////////////////////////////
hit[bin]->dbfile = new(char[strlen(dbfiles[idb])+1]);
strcpy(hit[bin]->dbfile,dbfiles[idb]); // record db file name from which next HMM is read
++N_searched;
if (v1>=2 && !((idb+1)%20))
{
cout<<".";
if (!((idb+1)%1000)) printf(" %-4i HMMs searched\n",(idb+1));
cout.flush();
}
#ifdef PTHREAD
// Lock access to bin_status
if (threads>0) rc = pthread_mutex_lock(&bin_status_mutex);
#endif
// Send the job in bin to a thread
bin_status[bin] = SUBMITTED;
jobs_submitted++;
if (threads==0) // if no multi-threading mode, main thread executes job itself
{
if (alignByWorker)
AlignByWorker(bin);
else
PerformViterbiByWorker(bin);
bin_status[bin] = FREE;
jobs_submitted--;
}
#ifdef PTHREAD
// Restart threads waiting for a signal
rc = pthread_cond_signal(&new_job);
// Unlock access to bin_status
rc = pthread_mutex_unlock(&bin_status_mutex);
if (DEBUG_THREADS)
fprintf(stderr,"Main: put job into bin %i name=%-7.7s Jobs running: %i jobs_submitted:%i \n",bin,t[bin]->name,jobs_running,jobs_submitted);
#endif
}
#ifdef PTHREAD
if (threads>0)
{
// Lock mutex
rc = pthread_mutex_lock(&bin_status_mutex);
// Wait until job finishes and a bin becomes free
if (jobs_submitted+jobs_running>=bins)
{
if (DEBUG_THREADS) fprintf(stderr,"Master thread is waiting for jobs to be finished...\n");
#ifdef WINDOWS
rc = pthread_cond_wait(&finished_job, &bin_status_mutex);
#else
#ifdef HH_MAC
// If no submitted jobs are in the queue we have to wait for a new job ...
struct timespec ts;
struct timeval tv;
gettimeofday(&tv, NULL);
ts.tv_sec = tv.tv_sec + 1;
rc = pthread_cond_timedwait(&finished_job, &bin_status_mutex,&ts);
#else
// If no submitted jobs are in the queue we have to wait for a new job ...
struct timespec ts;
clock_gettime(CLOCK_REALTIME,&ts);
ts.tv_sec += 1;
rc = pthread_cond_timedwait(&finished_job, &bin_status_mutex,&ts);
#endif
#endif
}
// Unlock mutex
rc = pthread_mutex_unlock(&bin_status_mutex);
}
#endif
fclose(dbf);
}
// Finished searching all database HHMs
reading_dbs=0;
#ifdef PTHREAD
if (threads>0)
{
// No more HMMs in database => wait until all threads have finished
if (DEBUG_THREADS)
fprintf(stderr,"No more jobs read from database Jobs running:%i jobs_submitted:%i \n",jobs_running,jobs_submitted);
// Free all threads still waiting for jobs
rc = pthread_mutex_lock(&bin_status_mutex);
rc = pthread_cond_broadcast(&new_job);
rc = pthread_mutex_unlock(&bin_status_mutex); // Unlock mutex
// Wait for all jobs to finish => join all jobs to main
for (int j=0; j0 && v<=3) v=1; else v-=2;
if (print_elapsed) ElapsedTimeSinceLastCall("(preparing for search)");
hitlist.N_searched=db_size; //hand over number of HMMs scanned to hitlist (for E-value calculation)
//////////////////////////////////////////////////////////
// Start Viterbi search through db HMMs listed in dbfiles
DoViterbiSearch(dbfiles,ndb);
if (v1>=2) cout<<"\n";
v=v1;
// Sort list according to sortscore
if (v>=3) printf("Sorting hit list ...\n");
hitlist.SortList();
// Use NN prediction of lamda and mu
hitlist.CalculatePvalues(q);
// Calculate E-values as combination of P-value for Viterbi HMM-HMM comparison and prefilter E-value: E = Ndb P (Epre/Ndb)^alpha
if (par.prefilter)
hitlist.CalculateHHblitsEvalues(q);
}
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Variant of ViterbiSearch() function for rescoring previously found HMMs with Viterbi algorithm.
// Perform Viterbi search on each hit object in global hash previous_hits, but keep old alignment
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
void RescoreWithViterbiKeepAlignment(int db_size)
{
// Initialize and allocate space for dynamic programming
jobs_running = 0;
jobs_submitted = 0;
reading_dbs=1; // needs to be set to 1 before threads are created
for (bin=0; bin0 && v<=3) v=1; else v-=2;
char *dbfiles[db_size+1];
int ndb = 0;
par.block_shading->Reset();
while (!par.block_shading->End())
delete[] (par.block_shading->ReadNext());
par.block_shading->New(16381,NULL);
// Get dbfiles of previous hits
previous_hits->Reset();
while (!previous_hits->End())
{
hit_cur = previous_hits->ReadNext();
if (hit_cur.irep==1)
{
dbfiles[ndb]=new(char[strlen(hit_cur.dbfile)+1]);
strcpy(dbfiles[ndb],hit_cur.dbfile);
++ndb;
}
// Seach only around previous HMM-HMM alignment (not the prefilter alignment as in ViterbiSearch())
if (block_filter)
{
// Hash par.block_shading contains pointer to int array which is here called block.
// This array contains start and stop positions of alignment for shading
// Hash par.block_shading_counter contains currently next free position in par.block_shading int array
//printf("Viterbi hit %s q: %i-%i t: %i-%i\n",hit_cur.name, hit_cur.i1, hit_cur.i2, hit_cur.j1, hit_cur.j2);
int* block;
int counter;
if (par.block_shading->Contains(hit_cur.name))
{
block = par.block_shading->Show(hit_cur.name);
counter = par.block_shading_counter->Remove(hit_cur.name);
}
else
{
block = new(int[400]);
counter = 0;
}
block[counter++] = hit_cur.i1;
block[counter++] = hit_cur.i2;
block[counter++] = hit_cur.j1;
block[counter++] = hit_cur.j2;
par.block_shading_counter->Add(hit_cur.name,counter);
if (!par.block_shading->Contains(hit_cur.name))
par.block_shading->Add(hit_cur.name,block);
// printf("Add to block shading key: %s data:",hit_cur.name);
// for (int i = 0; i < counter; i++)
// printf(" %i,",block[i]);
// printf("\n");
}
}
hitlist.N_searched=db_size; //hand over number of HMMs scanned to hitlist (for E-value calculation)
//////////////////////////////////////////////////////////
// Start Viterbi search through db HMMs listed in dbfiles
DoViterbiSearch(dbfiles,ndb,false);
if (v1>=2) cout<<"\n";
v=v1;
for (int n=0;n=3) printf("Sorting hit list ...\n");
hitlist.SortList();
hitlist.CalculatePvalues(q); // Use NN prediction of lamda and mu
if (par.prefilter)
hitlist.CalculateHHblitsEvalues(q);
}
///////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Realign hits with MAC algorithm
/////////////////////////////////////////////////////////////////////////////////////////////////////////////
void perform_realign(char *dbfiles[], int ndb)
{
q->Log2LinTransitionProbs(1.0); // transform transition freqs to lin space if not already done
int nhits=0;
int N_aligned=0;
// Longest allowable length of database HMM (backtrace: 5 chars, fwd: 1 double, bwd: 1 double
long int Lmaxmem=((par.maxmem-0.5)*1024*1024*1024)/(2*sizeof(double)+8)/q->L/bins;
long int Lmax=0; // length of longest HMM to be realigned
par.block_shading->Reset();
while (!par.block_shading->End())
delete[] (par.block_shading->ReadNext());
par.block_shading->New(16381,NULL);
par.block_shading_counter->New(16381,0);
// phash_plist_realignhitpos->Show(dbfile) is pointer to list with template indices and their ftell positions.
// This list can be sorted by ftellpos to access one template after the other efficiently during realignment
Hash< List* >* phash_plist_realignhitpos;
phash_plist_realignhitpos = new Hash< List* > (30031,NULL);
// Some templates have several (suboptimal) alignments in hitlist. For realignment, we need to efficiently
// access all hit objects in hitlist belonging to one template (because we don't want to read templates twice)
// We therefore need for each template (identified by its index between 0 and N_searched-1) a list of elements
// in hitlist that store the alignments with the template of that index.
// This list is pointed to by array_plist_phits[index].
List** array_plist_phits;
array_plist_phits = new List*[N_searched];
for (int index=0; index= par.realign_max && nhits>=imax(par.B,par.Z)) break;
if (hit_cur.Eval > par.e)
{
if (nhits>=imax(par.B,par.Z)) continue;
if (nhits>=imax(par.b,par.z) && hit_cur.Probab < par.p) continue;
if (nhits>=imax(par.b,par.z) && hit_cur.Eval > par.E) continue;
}
if (hit_cur.L>Lmax) Lmax=hit_cur.L;
if (hit_cur.L>Lmaxmem) {nhits++; continue;}
// Seach only around viterbi hit
if (block_filter)
{
// Hash par.block_shading contains pointer to int array which is here called block.
// This array contains start and stop positions of alignment for shading
// Hash par.block_shading_counter contains currently next free position in par.block_shading int array
//printf("Viterbi hit %s q: %i-%i t: %i-%i\n",hit_cur.name, hit_cur.i1, hit_cur.i2, hit_cur.j1, hit_cur.j2);
int* block;
int counter;
if (par.block_shading->Contains(hit_cur.name))
{
block = par.block_shading->Show(hit_cur.name);
counter = par.block_shading_counter->Remove(hit_cur.name);
}
else
{
block = new(int[400]);
counter = 0;
}
block[counter++] = hit_cur.i1;
block[counter++] = hit_cur.i2;
block[counter++] = hit_cur.j1;
block[counter++] = hit_cur.j2;
par.block_shading_counter->Add(hit_cur.name,counter);
if (!par.block_shading->Contains(hit_cur.name))
par.block_shading->Add(hit_cur.name,block);
// printf("Add to block shading in realign key: %s data:",hit_cur.name);
// for (int i = 0; i < counter; i++)
// printf(" %i,",block[i]);
// printf("\n");
}
//fprintf(stderr,"hit.name=%-15.15s hit.index=%-5i hit.ftellpos=%-8i hit.dbfile=%s\n",hit_cur.name,hit_cur.index,(unsigned int)hit_cur.ftellpos,hit_cur.dbfile);
if (nhits>=par.premerge) // realign the first premerge hits consecutively to query profile
{
if (hit_cur.irep==1)
{
// For each template (therefore irep==1), store template index and position on disk in a list
Realign_hitpos realign_hitpos;
realign_hitpos.ftellpos = hit_cur.ftellpos; // stores position on disk of template for current hit
realign_hitpos.index = hit_cur.index; // stores index of template of current hit
if (!phash_plist_realignhitpos->Contains(hit_cur.dbfile))
{
List* newlist = new List;
phash_plist_realignhitpos->Add(hit_cur.dbfile,newlist);
}
// Add template index and ftellpos to list which belongs to key dbfile in hash
phash_plist_realignhitpos->Show(hit_cur.dbfile)->Push(realign_hitpos);
}
if (! array_plist_phits[hit_cur.index]) // pointer at index is still NULL
{
List* newlist = new List; // create new list of pointers to all aligments of a template
array_plist_phits[hit_cur.index] = newlist; // set array[index] to newlist
}
// Push(hitlist.ReadCurrentAddress()) : Add address of current hit in hitlist to list...
// array_plist_phits[hit_cur.index]-> : pointed to by hit_cur.index'th element of array_plist_phits
array_plist_phits[hit_cur.index]->Push(hitlist.ReadCurrentAddress());
}
nhits++;
}
if (Lmax>Lmaxmem)
{
Lmax=Lmaxmem;
if (v>=1)
{
cerr<<"WARNING: Realigning sequences only up to length "< option (currently "<forward_allocated) hit[bin]->DeleteForwardMatrix(q->L+2);
if (hit[bin]->backward_allocated) hit[bin]->DeleteBackwardMatrix(q->L+2);
// Allocate memory for matrices and set to 0
hit[bin]->AllocateForwardMatrix(q->L+2,Lmax+1);
hit[bin]->AllocateBackwardMatrix(q->L+2,Lmax+1);
bin_status[bin] = FREE;
}
// // If the above reallocation and reseting to 0 is time-critical, it could be avoided by
// // replacing the above block with this block: (JS)
// int Lmaxprev = 0;
// if (hit[bin]->forward_allocated)
// Lmaxprev = sizeof(hit[bin]->F_MM[0]) / sizeof(hit[bin]->F_MM[0][0]);
// for (bin=0; binforward_allocated) hit[bin]->AllocateForwardMatrix(q->L+2,Lmax+1);
// else if (LmaxprevDeleteForwardMatrix(q->L+2);
// hit[bin]->AllocateForwardMatrix(q->L+2,Lmax+1);
// }
// if (!hit[bin]->backward_allocated) hit[bin]->AllocateBackwardMatrix(q->L+2,Lmax+1);
// else if (LmaxprevDeleteBackwardMatrix(q->L+2);
// hit[bin]->AllocateBackwardMatrix(q->L+2,Lmax+1);
// }
// bin_status[bin] = FREE;
// }
// // This is just an idea. This code would need to be tested!!!
// // First, it would need to be tested whether reseting F_MM and B_MM to 0 during the allocation is needed at all.
if (print_elapsed) ElapsedTimeSinceLastCall("(reallocate/reset forward/backwad matrices)");
// if (print_elapsed) ElapsedTimeSinceLastCall("(prepare realign)");
if (v>=2)
printf("Realigning %i HMMs using HMM-HMM Maximum Accuracy algorithm\n",phash_plist_realignhitpos->Size());
v1=v;
if (v>0 && v<=3) v=1; else v-=2; // Supress verbose output during iterative realignment and realignment
//////////////////////////////////////////////////////////////////////////////////
// start premerge:
// Align the first par.premerge templates
if (par.premerge>0)
{
if (v>=2) printf("Merging %i best hits to query alignment ...\n",par.premerge);
bin=0;
nhits=0;
hitlist.Reset();
while (!hitlist.End() && nhits=imax(par.B,par.Z)) break;
if (nhits>=imax(par.b,par.z) && hit_cur.Probab < par.p) break;
if (nhits>=imax(par.b,par.z) && hit_cur.Eval > par.E) continue;
nhits++;
if (hit_cur.L>Lmaxmem) continue; // Don't align to long sequences due to memory limit
if (hit_cur.Eval > par.e) continue; // Don't align hits with an E-value below the inclusion threshold
// Open HMM database file dbfiles[idb]
FILE* dbf;
dbf = ffindex_fopen_by_name(dbhhm_data, dbhhm_index, hit_cur.dbfile);
if (dbf == NULL) {
char filename[NAMELEN];
RemoveExtension(filename, hit_cur.file);
strcat(filename,".a3m");
if(dba3m_index_file!=NULL) {
dbf = ffindex_fopen_by_name(dba3m_data, dba3m_index, filename);
} else {
cerr<index = hit_cur.index; // give hit a unique index for HMM
hit[bin]->ftellpos = hit_cur.ftellpos;
fseek(dbf,hit_cur.ftellpos,SEEK_SET);
hit[bin]->dbfile = new(char[strlen(hit_cur.dbfile)+1]);
strcpy(hit[bin]->dbfile,hit_cur.dbfile); // record db file name from which next HMM is read
hit[bin]->irep = 1; // Needed for min_overlap calculation in InitializeForAlignment in hhhit.C
char path[NAMELEN];
Pathname(path,hit_cur.dbfile);
///////////////////////////////////////////////////
// Read next HMM from database file
if (!fgetline(line,LINELEN,dbf)) {fprintf(stderr,"Error in %s: end of file %s reached prematurely!\n",par.argv[0],hit_cur.dbfile); exit(1);}
while (strscn(line)==NULL && fgetline(line,LINELEN,dbf)) {} // skip lines that contain only white space
if (!strncmp(line,"HMMER3",5)) // read HMMER3 format
{
format[bin] = 1;
read_from_db = t[bin]->ReadHMMer3(dbf,hit_cur.dbfile);
par.hmmer_used = true;
}
else if (!strncmp(line,"HMMER",5)) // read HMMER format
{
format[bin] = 1;
read_from_db = t[bin]->ReadHMMer(dbf,hit_cur.dbfile);
par.hmmer_used = true;
}
else if (!strncmp(line,"HH",2)) // read HHM format
{
format[bin] = 0;
read_from_db = t[bin]->Read(dbf,path);
}
else if (!strncmp(line,"NAME",4)) // The following lines are for backward compatibility of HHM format version 1.2 with 1.1
{
format[bin] = 0;
fseek(dbf,hit_cur.ftellpos,SEEK_SET); // rewind to beginning of line
read_from_db = t[bin]->Read(dbf,path);
}
else if (line[0]=='#' || line[0]=='>') // read a3m alignment
{
Alignment tali;
tali.Read(dbf,hit_cur.dbfile,line);
tali.Compress(hit_cur.dbfile);
// qali.FilterForDisplay(par.max_seqid,par.coverage,par.qid,par.qsc,par.nseqdis);
tali.N_filtered = tali.Filter(par.max_seqid_db,par.coverage_db,par.qid_db,par.qsc_db,par.Ndiff_db);
t[bin]->name[0]=t[bin]->longname[0]=t[bin]->fam[0]='\0';
tali.FrequenciesAndTransitions(t[bin]);
format[bin] = 0;
}
else {
cerr<=2) fprintf(stderr,"Realigning with %s ***** \n",t[bin]->name);
///////////////////////////////////////////////////
N_aligned++;
if (v1>=2 && !(N_aligned%10))
{
cout<<".";
if (!(N_aligned%500)) printf(" %-4i HMMs aligned\n",N_aligned);
cout.flush();
}
// Prepare MAC comparison(s)
PrepareTemplateHMM(q,t[bin],format[bin]);
t[bin]->Log2LinTransitionProbs(1.0);
// Realign only around previous Viterbi hit
hit[bin]->i1 = hit_cur.i1;
hit[bin]->i2 = hit_cur.i2;
hit[bin]->j1 = hit_cur.j1;
hit[bin]->j2 = hit_cur.j2;
hit[bin]->nsteps = hit_cur.nsteps;
hit[bin]->i = hit_cur.i;
hit[bin]->j = hit_cur.j;
hit[bin]->realign_around_viterbi=true;
// Align q to template in *hit[bin]
hit[bin]->Forward(q,t[bin]);
hit[bin]->Backward(q,t[bin]);
hit[bin]->MACAlignment(q,t[bin]);
hit[bin]->BacktraceMAC(q,t[bin]);
// Overwrite *hit[bin] with Viterbi scores, Probabilities etc. of hit_cur
hit[bin]->score = hit_cur.score;
hit[bin]->score_ss = hit_cur.score_ss;
hit[bin]->score_aass = hit_cur.score_aass;
hit[bin]->score_sort = hit_cur.score_sort;
hit[bin]->Pval = hit_cur.Pval;
hit[bin]->Pvalt = hit_cur.Pvalt;
hit[bin]->logPval = hit_cur.logPval;
hit[bin]->logPvalt = hit_cur.logPvalt;
hit[bin]->Eval = hit_cur.Eval;
hit[bin]->logEval = hit_cur.logEval;
hit[bin]->Probab = hit_cur.Probab;
hit[bin]->irep = hit_cur.irep;
// Replace original hit in hitlist with realigned hit
//hitlist.ReadCurrent().Delete();
hitlist.Delete().Delete(); // delete the list record and hit object
hitlist.Insert(*hit[bin]);
// merge only when hit length > MINCOLS_REALIGN (don't merge 1 column matches)
if (hit[bin]->matched_cols < MINCOLS_REALIGN) continue;
// Read a3m alignment of hit and merge with Qali according to Q-T-alignment in hit[bin]
char ta3mfile[NAMELEN];
//strcpy(ta3mfile,hit[bin]->file); // copy filename including path but without extension
RemoveExtension(ta3mfile,hit[bin]->dbfile);
strcat(ta3mfile,".a3m");
// Reading in next db MSA and merging it onto Qali
Alignment Tali;
FILE* ta3mf = ffindex_fopen_by_name(dba3m_data, dba3m_index, ta3mfile);
if (ta3mf == NULL) OpenFileError(ta3mfile);
Tali.Read(ta3mf,ta3mfile); // Read template alignment into Tali
fclose(ta3mf);
Tali.Compress(ta3mfile); // Filter database alignment
if (par.allseqs) // need to keep *all* sequences in Qali_allseqs? => merge before filtering
Qali_allseqs.MergeMasterSlave(*hit[bin],Tali,ta3mfile);
Tali.N_filtered = Tali.Filter(par.max_seqid_db,par.coverage_db,par.qid_db,par.qsc_db,par.Ndiff_db);
Qali.MergeMasterSlave(*hit[bin],Tali,ta3mfile);
// //?????????????????????????????
// // JS: Reading the db MSA twice for par.all seems pretty inefficient!!!!!!!!!!!
// FILE* ta3mf;
// ta3mf = ffindex_fopen(dba3m_data, dba3m_index, ta3mfile);
// if (ta3mf == NULL) OpenFileError(ta3mfile);
// Qali.MergeMasterSlave(*hit[bin],ta3mfile, ta3mf);
// fclose(ta3mf);
// if (par.allseqs)
// {
// ta3mf = ffindex_fopen(dba3m_data, dba3m_index, ta3mfile);
// Qali_allseqs.MergeMasterSlave(*hit[bin],ta3mfile, ta3mf, false); // filter db MSA = false
// fclose(ta3mf);
// }
// //?????????????????????????????
// Convert ASCII to int (0-20),throw out all insert states, record their number in I[k][i]
Qali.Compress("merged A3M file");
// Remove sequences with seq. identity larger than seqid percent (remove the shorter of two)
Qali.N_filtered = Qali.Filter(par.max_seqid,par.coverage,par.qid,par.qsc,par.Ndiff);
// Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a]
Qali.FrequenciesAndTransitions(q);
stringstream ss_tmp;
ss_tmp << hit[bin]->name << "__" << hit[bin]->irep;
premerged_hits->Add((char*)ss_tmp.str().c_str());
if (par.notags) q->NeutralizeTags();
// Compute substitution matrix pseudocounts?
if (par.nocontxt) {
// Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a]
q->PreparePseudocounts();
// Add amino acid pseudocounts to query: p[i][a] = (1-tau)*f[i][a] + tau*g[i][a]
q->AddAminoAcidPseudocounts();
} else {
// Add full context specific pseudocounts to query
q->AddContextSpecificPseudocounts();
}
q->CalculateAminoAcidBackground();
if (par.columnscore == 5 && !q->divided_by_local_bg_freqs) q->DivideBySqrtOfLocalBackgroundFreqs(par.half_window_size_local_aa_bg_freqs);
// Transform transition freqs to lin space if not already done
q->AddTransitionPseudocounts();
q->Log2LinTransitionProbs(1.0); // transform transition freqs to lin space if not already done
}
}
if (print_elapsed) ElapsedTimeSinceLastCall("(premerge)");
// end premerge
//////////////////////////////////////////////////////////////////////////////////
#ifdef PTHREAD
// Start threads for realignment
for (int j=0; jContains(dbfiles[idb])) continue;
// phash_plist_realignhitpos->Show(dbfile) is pointer to list with template indices and their ftell positions.
// This list is now sorted by ftellpos in ascending order to access one template after the other efficiently
phash_plist_realignhitpos->Show(dbfiles[idb])->SortList();
// Open HMM database file dbfiles[idb]
FILE* dbf;
dbf = ffindex_fopen_by_name(dbhhm_data, dbhhm_index, dbfiles[idb]);
if (dbf == NULL) {
char filename[NAMELEN];
RemoveExtension(filename, dbfiles[idb]);
strcat(filename,".a3m");
if(dba3m_index_file!=NULL) {
dbf = ffindex_fopen_by_name(dba3m_data, dba3m_index, filename);
} else {
cerr<Show(dbfiles[idb])->Reset();
while (! phash_plist_realignhitpos->Show(dbfiles[idb])->End())
{
// Submit jobs until no bin is free anymore
while (! phash_plist_realignhitpos->Show(dbfiles[idb])->End() && jobs_submitted+jobs_runningShow(dbfiles[idb])->ReadNext();
hit[bin]->index = hitpos_curr.index; // give hit[bin] a unique index for HMM
fseek(dbf,hitpos_curr.ftellpos,SEEK_SET); // start to read at ftellpos for template
// Give hit[bin] the pointer to the list of pointers to hitlist elements of same template (for realignment)
hit[bin]->plist_phits = array_plist_phits[hitpos_curr.index];
// fprintf(stderr,"dbfile=%-40.40s index=%-5i ftellpos=%l\n",dbfiles[idb],hitpos_curr.index,hitpos_curr.ftellpos);
char path[NAMELEN];
Pathname(path,dbfiles[idb]);
///////////////////////////////////////////////////
// Read next HMM from database file
if (!fgetline(line,LINELEN,dbf)) {fprintf(stderr,"Error in %s: end of file %s reached prematurely!\n",par.argv[0],dbfiles[idb]); exit(1);}
while (strscn(line)==NULL && fgetline(line,LINELEN,dbf)) {} // skip lines that contain only white space
if (!strncmp(line,"HMMER3",5)) // read HMMER3 format
{
format[bin] = 1;
read_from_db = t[bin]->ReadHMMer3(dbf,dbfiles[idb]);
par.hmmer_used = true;
}
else if (!strncmp(line,"HMMER",5)) // read HMMER format
{
format[bin] = 1;
read_from_db = t[bin]->ReadHMMer(dbf,dbfiles[idb]);
par.hmmer_used = true;
}
else if (!strncmp(line,"HH",2)) // read HHM format
{
format[bin] = 0;
read_from_db = t[bin]->Read(dbf,path);
}
else if (!strncmp(line,"NAME",4)) // The following lines are for backward compatibility of HHM format version 1.2 with 1.1
{
format[bin] = 0;
fseek(dbf,hitpos_curr.ftellpos,SEEK_SET); // rewind to beginning of line
read_from_db = t[bin]->Read(dbf,path);
}
else if (line[0]=='#' || line[0]=='>') // read a3m alignment
{
Alignment tali;
tali.Read(dbf,dbfiles[idb],line);
tali.Compress(dbfiles[idb]);
// qali.FilterForDisplay(par.max_seqid,par.coverage,par.qid,par.qsc,par.nseqdis);
tali.N_filtered = tali.Filter(par.max_seqid_db,par.coverage_db,par.qid_db,par.qsc_db,par.Ndiff_db);
t[bin]->name[0]=t[bin]->longname[0]=t[bin]->fam[0]='\0';
tali.FrequenciesAndTransitions(t[bin]);
format[bin] = 0;
}
else {
cerr<=2) fprintf(stderr,"Realigning with %s\n",t[bin]->name);
///////////////////////////////////////////////////
hit[bin]->dbfile = new(char[strlen(dbfiles[idb])+1]);
strcpy(hit[bin]->dbfile,dbfiles[idb]); // record db file name from which next HMM is read
N_aligned++;
if (v1>=2 && !(N_aligned%10))
{
cout<<".";
if (!(N_aligned%500)) printf(" %-4i HMMs aligned\n",N_aligned);
cout.flush();
}
#ifdef PTHREAD
// Lock access to bin_status
if (threads>0) rc = pthread_mutex_lock(&bin_status_mutex);
#endif
// Send the job in bin to a thread
bin_status[bin] = SUBMITTED;
jobs_submitted++;
if (threads==0) // if no multi-threading mode, main thread executes job itself
{
RealignByWorker(bin);
bin_status[bin] = FREE;
jobs_submitted--;
break;
}
#ifdef PTHREAD
// Restart threads waiting for a signal
rc = pthread_cond_signal(&new_job);
// Unlock access to bin_status
rc = pthread_mutex_unlock(&bin_status_mutex);
if (DEBUG_THREADS)
fprintf(stderr,"Main: put job into bin %i name=%-7.7s Jobs running: %i jobs_submitted:%i \n",bin,t[bin]->name,jobs_running,jobs_submitted);
#endif
}
#ifdef PTHREAD
if (threads>0)
{
// Lock mutex
rc = pthread_mutex_lock(&bin_status_mutex);
// Wait until job finishes and a bin becomes free
if (jobs_submitted+jobs_running>=bins)
{
if (DEBUG_THREADS) fprintf(stderr,"Master thread is waiting for jobs to be finished...\n");
#ifdef WINDOWS
rc = pthread_cond_wait(&finished_job, &bin_status_mutex);
#else
#ifdef HH_MAC
// If no submitted jobs are in the queue we have to wait for a new job ...
struct timespec ts;
struct timeval tv;
gettimeofday(&tv, NULL);
ts.tv_sec = tv.tv_sec + 1;
rc = pthread_cond_timedwait(&finished_job, &bin_status_mutex,&ts);
#else
// If no submitted jobs are in the queue we have to wait for a new job ...
struct timespec ts;
clock_gettime(CLOCK_REALTIME,&ts);
ts.tv_sec += 1;
rc = pthread_cond_timedwait(&finished_job, &bin_status_mutex,&ts);
#endif
#endif
}
// Unlock mutex
rc = pthread_mutex_unlock(&bin_status_mutex);
}
#endif
}
// End while(1)
///////////////////////////////////////////////////////////////////////////////////////
fclose(dbf);
}
reading_dbs=0;
#ifdef PTHREAD
if (threads>0)
{
// No more HMMs in database => wait until all threads have finished
if (DEBUG_THREADS)
fprintf(stderr,"No more jobs read from database Jobs running:%i jobs_submitted:%i \n",jobs_running,jobs_submitted);
// Free all threads still waiting for jobs
rc = pthread_mutex_lock(&bin_status_mutex);
rc = pthread_cond_broadcast(&new_job);
rc = pthread_mutex_unlock(&bin_status_mutex); // Unlock mutex
// Wait for all jobs to finish => join all jobs to main
for (int j=0; j=2) cout<<"\n";
v=v1;
// Delete all hitlist entries with too short alignments
nhits=0;
hitlist.Reset();
while (!hitlist.End())
{
hit_cur = hitlist.ReadNext();
//printf("Deleting alignment of %s with length %i? irep=%i nhits=%-2i par.B=%-3i par.Z=%-3i par.e=%.2g par.b=%-3i par.z=%-3i par.p=%.2g\n",hit_cur.name,hit_cur.matched_cols,hit_cur.irep,nhits,par.B,par.Z,par.e,par.b,par.z,par.p);
if (nhits > par.realign_max && nhits>=imax(par.B,par.Z)) break;
if (hit_cur.Eval > par.e)
{
if (nhits>=imax(par.B,par.Z)) continue;
if (nhits>=imax(par.b,par.z) && hit_cur.Probab < par.p) continue;
if (nhits>=imax(par.b,par.z) && hit_cur.Eval > par.E) continue;
}
if (hit_cur.matched_cols < MINCOLS_REALIGN)
{
if (v>=3) printf("Deleting alignment of %s with length %i\n",hit_cur.name,hit_cur.matched_cols);
hitlist.Delete().Delete(); // delete the list record and hit object
// // Make sure only realigned alignments get displayed! JS: Why? better unrealigned than none.
// if (last_round)
// if (par.B>par.Z) par.B--; else if (par.B==par.Z) {par.B--; par.Z--;} else par.Z--;
}
nhits++;
}
// Delete hash phash_plist_realignhitpos with lists
phash_plist_realignhitpos->Reset();
while (!phash_plist_realignhitpos->End())
delete(phash_plist_realignhitpos->ReadNext()); // delete list to which phash_plist_realignhitpos->ReadNext() points
delete(phash_plist_realignhitpos);
// Delete array_plist_phits with lists
for (int index=0; indexlength = 200;
early_stopping->thresh = 0.01;
strcpy(par.outfile,"");
strcpy(db_ext,"hhm");
N_searched=0;
previous_hits = new Hash(1631,hit_cur);
premerged_hits = new Hash(1631);
// Make command line input globally available
par.argv=argv;
par.argc=argc;
RemovePathAndExtension(program_name,argv[0]);
Pathname(program_path, argv[0]);
// Enable changing verbose mode before command line are processed
for (int i=1; i1 && !strcmp(argv[i],"-v0")) v=0;
else if (argc>1 && !strcmp(argv[i],"-v1")) v=1;
else if (argc>2 && !strcmp(argv[i],"-v")) v=atoi(argv[i+1]);
}
par.SetDefaultPaths(program_path);
// Process default otpions from .hhdefaults file
ReadDefaultsFile(argc_conf,argv_conf,program_path);
ProcessArguments(argc_conf,argv_conf);
// Process command line options (they override defaults from .hhdefaults file)
ProcessArguments(argc,argv);
// Check needed files
if (!*par.infile || !strcmp(par.infile,"")) // infile not given
{help(); cerr<=2 && v>=1) cerr<<"WARNING: using -global alignment for iterative searches is deprecated since non-homologous sequence segments can easily enter the MSA and corrupt it.\n";
if (num_rounds < 1) num_rounds=1;
else if (num_rounds > 8)
{
if (v>=1) cerr<<"WARNING: Number of iterations ("< Set to 8 iterations\n";
num_rounds=8;
}
// Premerging can be very time-consuming on large database a3ms, such as from pdb70.
// Hence it is only done when iteratively searching against uniprot20 or nr20 with their much smaller MSAs:
if (! (num_rounds > 1 || *par.alnfile || *par.psifile || *par.hhmfile || *alis_basename) ) par.premerge=0;
// No outfile given? Name it basename.hhm
if (!*par.outfile) // outfile not given? Name it basename.hhm
{
RemoveExtension(par.outfile,par.infile);
strcat(par.outfile,".hhr");
if (v>=2) cout<<"Search results will be written to "<2GB on 32bit system?):"<< endl;
exit(errno);
}
if (num_rounds > 1 ||*par.alnfile || *par.psifile || *par.hhmfile || *alis_basename)
{
cerr<;
par.block_shading_counter = new Hash;
par.block_shading->New(16381,NULL);
par.block_shading_counter->New(16381,0);
dbfiles_new = new char*[par.maxnumdb_no_prefilter+1];
dbfiles_old = new char*[par.maxnumdb+1];
early_stopping->evals = new double[early_stopping->length];
// Prepare index-based databases
char filename[NAMELEN];
dbhhm_data_file = fopen(dbhhm, "r");
if (!dbhhm_data_file) OpenFileError(dbhhm);
strcpy(filename, dbhhm);
strcat(filename, ".index");
int filesize;
filesize = CountLinesInFile(filename);
dbhhm_index_file = fopen(filename, "r");
if (!dbhhm_index_file) OpenFileError(filename);
dbhhm_index = ffindex_index_parse(dbhhm_index_file, filesize);
if (dbhhm_index==NULL) {
cerr<<"Error in "<MAXTHREADS)
{
threads=MAXTHREADS;
if (v>=1) fprintf(stderr,"WARNING: number of CPUs set to maximum value of %i\n",MAXTHREADS);
}
// Set OpenMP threads
#ifdef _OPENMP
cpu = imin(cpu,omp_get_max_threads());
omp_set_num_threads(cpu);
#endif
// Check option compatibilities
if (par.nseqdis>MAXSEQDIS-3-par.showcons) par.nseqdis=MAXSEQDIS-3-par.showcons; //3 reserved for secondary structure
if (par.aliwidth<20) par.aliwidth=20;
if (par.pca<0.001) par.pca=0.001; // to avoid log(0)
if (par.pre_pca<0.001) par.pre_pca=0.001; // to avoid log(0)
if (par.b>par.B) par.B=par.b;
if (par.z>par.Z) par.Z=par.z;
if (par.maxmem<1.0) {cerr<<"WARNING: setting -maxmem to its minimum allowed value of 1.0\n"; par.maxmem=1.0;}
// Set (global variable) substitution matrix and derived matrices
SetSubstitutionMatrix();
// Set secondary structure substitution matrix
if (par.ssm) SetSecStrucSubstitutionMatrix();
// Prepare context state pseudocounts lib
if (!par.nocontxt) {
fin = fopen(par.clusterfile, "r");
if (!fin) OpenFileError(par.clusterfile);
context_lib = new cs::ContextLibrary(fin);
fclose(fin);
cs::TransformToLog(*context_lib);
lib_pc = new cs::LibraryPseudocounts(*context_lib, par.csw, par.csb);
}
// Prepare column state lib (context size =1 )
fin = fopen(par.cs_library, "r");
if (!fin) OpenFileError(par.cs_library);
cs_lib = new cs::ContextLibrary(fin);
fclose(fin);
cs::TransformToLin(*cs_lib);
if (print_elapsed) ElapsedTimeSinceLastCall("(prepare CS pseudocounts)");
v1=v;
if (v>0 && v<=2) v=1; else v--;
// Read query input file (HHM, HMMER, or alignment format) without adding pseudocounts
Qali.N_in=0;
ReadQueryFile(par.infile,input_format,q,&Qali);
// If input file was not a sequence file (and hence a HHM or HMMER file) AND result MSA/HMM nee to be written,
// read in query sequences from a3m file or from representative seqs in HHM file or consensus seq in HMMER file
if (Qali.N_in==0)
if (num_rounds > 1 || *par.alnfile || *par.psifile || *par.hhmfile || *alis_basename || par.premerge>0)
ReadQueryA3MFile();
if (Qali.N_in - Qali.N_ss > 1) par.premerge=0;
par.M=1; // all database MSAs must be in A3M format
if (par.allseqs)
{
Qali_allseqs = Qali; // make a *deep* copy of Qali!
for(int k=0; k=3)
{
cout<<"Input file : "<NeutralizeTags();
// Prepare multi-threading - reserve memory for threads, intialize, etc.
if (threads==0) bins=1; else bins=iround(threads*1.2+0.5);
for (bin=0; binAllocateBacktraceMatrix(q->L +2,par.maxres); // ...with a separate dynamic programming matrix (memory!!)
}
format = new(int[bins]);
if (print_elapsed) ElapsedTimeSinceLastCall("(finished init)");
//////////////////////////////////////////////////////////////////////////////////
// Main loop overs search iterations
//////////////////////////////////////////////////////////////////////////////////
for (int round = 1; round <= num_rounds; round++) {
if (v>=2) printf("\nIteration %i\n",round);
// Settings for different rounds
if (par.premerge > 0 && round > 1 && previous_hits->Size() >= par.premerge)
{
if (v>3) printf("Set premerge to 0! (premerge: %i iteration: %i hits.Size: %i)\n",par.premerge,round,previous_hits->Size());
par.premerge = 0;
}
else
{
par.premerge -= previous_hits->Size();
}
// Save HMM without pseudocounts for prefilter query-profile
*q_tmp = *q;
// Write query HHM file? (not the final HMM, which will be written to par.hhmfile)
if (*query_hhmfile)
{
v1=v;
if (v>0 && v<=3) v=1; else v-=2;
// Add *no* amino acid pseudocounts to query. This is necessary to copy f[i][a] to p[i][a]
q->AddAminoAcidPseudocounts(0, 0.0, 0.0, 1.0);
q->CalculateAminoAcidBackground();
q->WriteToFile(query_hhmfile);
v=v1;
}
PrepareQueryHMM(input_format, q);
if (print_elapsed) ElapsedTimeSinceLastCall("(before prefiltering (pseudocounts))");
////////////////////////////////////////////
// Prefiltering
////////////////////////////////////////////
if (par.prefilter)
if (v>=2) printf("Prefiltering database\n");
prefilter_db(); // in hhprefilter.C
if (print_elapsed) ElapsedTimeSinceLastCall("(prefiltering)");
if (ndb_new == 0) {
printf("No HMMs pass prefilter => Stop searching!\n");
break;
}
// Search datbases
if (v>=2) {
printf("HMMs passed 2nd prefilter (gapped profile-profile alignment) : %6i\n", (ndb_new+ndb_old));
printf("HMMs passed 2nd prefilter and not found in previous iterations : %6i\n", ndb_new);
printf("Scoring %i HMMs using HMM-HMM Viterbi alignment\n", ndb_new);
}
// Main Viterbi HMM-HMM search
// Starts with empty hitlist (hits of previous iterations were deleted) and creates a hitlist with the hits of this iteration
ViterbiSearch(dbfiles_new,ndb_new,(ndb_new + ndb_old));
if (print_elapsed) ElapsedTimeSinceLastCall("(Viterbi search)");
// check for new hits or end with iteration
int new_hits = 0;
hitlist.Reset();
while (!hitlist.End())
{
hit_cur = hitlist.ReadNext();
if (hit_cur.Eval > 100.0*par.e) break; // E-value much too large
if (hit_cur.Eval > par.e) continue; // E-value too large
new_hits++;
}
if (new_hits == 0 || round == num_rounds)
{
last_round = true;
if (round < num_rounds && v>=2)
printf("No new hits found in iteration %i => Stop searching\n",round);
if (ndb_old > 0 && realign_old_hits)
{
if(v > 0) {
printf("Rescoring previously found HMMs with Viterbi algorithm\n");
}
ViterbiSearch(dbfiles_old,ndb_old,(ndb_new + ndb_old));
// Add dbfiles_old to dbfiles_new for realign
for (int a = 0; a < ndb_old; a++)
{
dbfiles_new[ndb_new]=new(char[strlen(dbfiles_old[a])+1]);
strcpy(dbfiles_new[ndb_new],dbfiles_old[a]);
ndb_new++;
}
}
else if (!realign_old_hits && previous_hits->Size() > 0)
{
if(v > 0) {
printf("Rescoring previously found HMMs with Viterbi algorithm\n");
}
RescoreWithViterbiKeepAlignment(ndb_new+previous_hits->Size());
if (print_elapsed) ElapsedTimeSinceLastCall("(Rescoring with Viterbi)");
}
}
// Realign hits with MAC algorithm
if (par.realign) perform_realign(dbfiles_new,ndb_new);
if (print_elapsed) ElapsedTimeSinceLastCall("(realigning with MAC)");
// Generate alignment for next iteration
if (round < num_rounds || *par.alnfile || *par.psifile || *par.hhmfile || *alis_basename)
{
v1=v;
if (v>0 && v<=3) v=1; else v-=2;
// If new hits found, merge hits to query alignment
if (new_hits != 0)
{
char ta3mfile[NAMELEN];
if (v>=1) printf("Merging hits to query profile\n");
// For each template below threshold
hitlist.Reset();
while (!hitlist.End())
{
hit_cur = hitlist.ReadNext();
if (hit_cur.Eval > 100.0*par.e) break; // E-value much too large
if (hit_cur.Eval > par.e) continue; // E-value too large
if (hit_cur.matched_cols < MINCOLS_REALIGN) continue; // leave out too short alignments
stringstream ss_tmp;
ss_tmp << hit_cur.name << "__" << hit_cur.irep;
if (previous_hits->Contains((char*)ss_tmp.str().c_str())) continue; // Already in alignment
// Add number of sequences in this cluster to total found
seqs_found += SequencesInCluster(hit_cur.name); // read number after second '|'
cluster_found++;
// Skip merging this hit if hit alignment was already merged during premerging
if (premerged_hits->Contains((char*)ss_tmp.str().c_str())) continue;
// Read a3m alignment of hit from .a3m file
RemoveExtension(ta3mfile,hit_cur.dbfile);
strcat(ta3mfile,".a3m");
// Reading in next db MSA and merging it onto Qali
FILE* ta3mf;
Alignment Tali;
ta3mf = ffindex_fopen_by_name(dba3m_data, dba3m_index, ta3mfile);
if (ta3mf == NULL) OpenFileError(ta3mfile);
Tali.Read(ta3mf,ta3mfile); // Read template alignment into Tali
fclose(ta3mf);
Tali.Compress(ta3mfile); // Filter database alignment
if (par.allseqs) // need to keep *all* sequences in Qali_allseqs? => merge before filtering
Qali_allseqs.MergeMasterSlave(hit_cur,Tali,ta3mfile);
Tali.N_filtered = Tali.Filter(par.max_seqid_db,par.coverage_db,par.qid_db,par.qsc_db,par.Ndiff_db);
Qali.MergeMasterSlave(hit_cur,Tali,ta3mfile);
// //?????????????????????????????
// // JS: Reading the db MSA twice for par.all seems pretty inefficient!!!!!!!!!!!
// FILE* ta3mf;
// ta3mf = ffindex_fopen(dba3m_data, dba3m_index, ta3mfile);
// if (ta3mf == NULL) OpenFileError(ta3mfile);
// Qali.MergeMasterSlave(hit_cur,ta3mfile, ta3mf);
// fclose(ta3mf);
// if (par.allseqs)
// {
// ta3mf = ffindex_fopen(dba3m_data, dba3m_index, ta3mfile);
// Qali_allseqs.MergeMasterSlave(hit_cur,ta3mfile, ta3mf, false); // filter db MSA = false
// fclose(ta3mf);
// }
// //?????????????????????????????
if (Qali.N_in>=MAXSEQ) break; // Maximum number of sequences reached
}
// Convert ASCII to int (0-20),throw out all insert states, record their number in I[k][i]
Qali.Compress("merged A3M file");
// Sort out the nseqdis most dissimilacd r sequences for display in the result alignments
Qali.FilterForDisplay(par.max_seqid,par.coverage,par.qid,par.qsc,par.nseqdis);
// Remove sequences with seq. identity larger than seqid percent (remove the shorter of two)
float const COV_ABS = 25; // min. number of aligned residues
int cov_tot = imax(imin((int)(COV_ABS / Qali.L * 100 + 0.5), 70), par.coverage);
if (v>2) printf("Filter new alignment with cov %3i%%\n", cov_tot);
Qali.N_filtered = Qali.Filter(par.max_seqid,cov_tot,par.qid,par.qsc,par.Ndiff);
if (print_elapsed) ElapsedTimeSinceLastCall("(merge hits to Qali)");
}
// Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a]
Qali.FrequenciesAndTransitions(q,NULL,true);
if (par.notags) q->NeutralizeTags();
// Calculate SSpred if we need to print out alis after each iteration or if last iteration
if (par.addss && (*alis_basename || round == num_rounds || new_hits == 0))
{
char ss_pred[par.maxres];
char ss_conf[par.maxres];
CalculateSS(q, ss_pred, ss_conf);
Qali.AddSSPrediction(ss_pred, ss_conf);
if (print_elapsed) ElapsedTimeSinceLastCall("(calculate SS_Pred)");
}
if (print_elapsed) ElapsedTimeSinceLastCall("(Calculate AA frequencies and transitions)");
if (*alis_basename)
{
stringstream ss_tmp;
ss_tmp << alis_basename << "_" << round << ".a3m";
if (par.allseqs)
Qali_allseqs.WriteToFile(ss_tmp.str().c_str(),"a3m");
else
Qali.WriteToFile(ss_tmp.str().c_str(),"a3m");
}
v=v1;
}
else if (round == num_rounds) // Update counts for log
{
hitlist.Reset();
while (!hitlist.End())
{
hit_cur = hitlist.ReadNext();
if (hit_cur.Eval > 100.0*par.e) break; // E-value much too large
if (hit_cur.Eval > par.e) continue; // E-value too large
stringstream ss_tmp;
ss_tmp << hit_cur.name << "__" << hit_cur.irep;
if (previous_hits->Contains((char*)ss_tmp.str().c_str())) continue; // Already in alignment
// Add number of sequences in this cluster to total found
seqs_found += SequencesInCluster(hit_cur.name); // read number after second '|'
cluster_found++;
}
}
if (v>=2)
printf("%i sequences belonging to %i database HMMs found with an E-value < %-12.4g\n",seqs_found,cluster_found, par.e);
if (v>=2 && (round < num_rounds || *par.alnfile || *par.psifile || *par.hhmfile || *alis_basename))
printf("Number of effective sequences of resulting query HMM: Neff = %4.2f\n", q->Neff_HMM);
if (q->Neff_HMM > neffmax && round < num_rounds) {
printf("Diversity is above threshold (%4.2f). Stop searching! (Change threshold using -neffmax .)\n", neffmax);
}
if (Qali.N_in>=MAXSEQ)
printf("Maximun number of sequences in query alignment reached (%i). Stop searching!\n", MAXSEQ);
if (new_hits == 0 || round == num_rounds || q->Neff_HMM > neffmax || Qali.N_in>=MAXSEQ)
break;
// Write good hits to previous_hits hash and clear hitlist
hitlist.Reset();
while (!hitlist.End())
{
hit_cur = hitlist.ReadNext();
char strtmp[NAMELEN+6];
sprintf(strtmp,"%s__%i%c",hit_cur.name,hit_cur.irep,'\0');
if (!already_seen_filter || hit_cur.Eval > par.e || previous_hits->Contains(strtmp))
hit_cur.Delete(); // Delete hit object (deep delete with Hit::Delete())
else
previous_hits->Add(strtmp,hit_cur);
// // Old version by Michael => Delete
// hit_cur = hitlist.ReadNext();
// stringstream ss_tmp;
// ss_tmp << hit_cur.name << "__" << hit_cur.irep;
// if (!already_seen_filter || hit_cur.Eval > par.e || previous_hits->Contains((char*)ss_tmp.str().c_str()))
// hit_cur.Delete(); // Delete hit object (deep delete with Hit::Delete())
// else
// previous_hits->Add((char*)ss_tmp.str().c_str(), hit_cur);
hitlist.Delete(); // Delete list record (flat delete)
}
if (print_elapsed) ElapsedTimeSinceLastCall("(end of this round)");
} // end main loop overs search iterations
//////////////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////
// Result section
//////////////////////////////////////////////////////////////////////////////////
// Warn, if HMMER files were used
if (par.hmmer_used && v>=1)
fprintf(stderr,"WARNING: Using HMMER files results in a drastically reduced sensitivity (>10%%).\nWe recommend to use HHMs build by hhmake.\n");
// Print for each HMM: n score -log2(Pval) L name (n=5:same name 4:same fam 3:same sf...)
if (*par.scorefile) {
if (v>=3) printf("Printing scores file ...\n");
hitlist.PrintScoreFile(q);
}
// Print FASTA or A2M alignments?
if (*par.pairwisealisfile) {
if (v>=2) cout<<"Printing alignments in "<<(par.outformat==1? "FASTA" : par.outformat==2?"A2M" :"A3M")<<" format to "<=3) printf("Printing hit list ...\n");
hitlist.PrintHitList(q_tmp,par.outfile);
// Write only hit list to screen?
if (v==2 && strcmp(par.outfile,"stdout")) WriteToScreen(par.outfile,109); // write only hit list to screen
// Print alignments of query sequences against hit sequences
hitlist.PrintAlignments(q_tmp,par.outfile);
// Write whole output file to screen? (max 10000 lines)
if (v>=3 && strcmp(par.outfile,"stdout")) WriteToScreen(par.outfile,10009);
// Generate result alignment or HMM file?
if (*par.alnfile || *par.psifile || *par.hhmfile)
{
// Write output PSI-BLAST-formatted alignment?
if (*par.psifile)
{
if (par.allseqs)
Qali_allseqs.WriteToFile(par.psifile,"psi");
else
Qali.WriteToFile(par.psifile,"psi");
}
// Write output HHM file?
if (*par.hhmfile)
{
// Add *no* amino acid pseudocounts to query. This is necessary to copy f[i][a] to p[i][a]
q->AddAminoAcidPseudocounts(0, 0.0, 0.0, 1.0);
q->CalculateAminoAcidBackground();
q->WriteToFile(par.hhmfile);
}
// Write output A3M alignment?
if (*par.alnfile)
{
if (par.allseqs)
Qali_allseqs.WriteToFile(par.alnfile,"a3m");
else
Qali.WriteToFile(par.alnfile,"a3m");
}
}
////////////////////////////////////////////////////
// Clean up
////////////////////////////////////////////////////
fclose(dbhhm_data_file);
fclose(dbhhm_index_file);
if (dba3m_index_file!=NULL) {
fclose(dba3m_data_file);
fclose(dba3m_index_file);
}
free(dbhhm_index);
free(dba3m_index);
// Delete memory for dynamic programming matrix
for (bin=0; binDeleteBacktraceMatrix(q->L+2);
if (hit[bin]->forward_allocated)
hit[bin]->DeleteForwardMatrix(q->L+2);
if (hit[bin]->backward_allocated)
hit[bin]->DeleteBackwardMatrix(q->L+2);
delete hit[bin];
delete t[bin];
}
delete q;
delete q_tmp;
if (format) delete[](format);
if (par.exclstr) delete[] par.exclstr;
delete[] early_stopping->evals;
delete early_stopping;
for (int n = 1; n < argc_conf; n++)
delete[] argv_conf[n];
if (par.dbfiles) delete[] par.dbfiles;
for (int idb=0; idbReset();
while (!par.block_shading->End())
delete[] (par.block_shading->ReadNext());
delete par.block_shading;
delete par.block_shading_counter;
if (par.prefilter)
{
free(X);
free(length);
free(first);
for (int n = 0; n < num_dbs; n++)
delete[](dbnames[n]);
delete[](dbnames);
}
// Comput substitution matrix pseudocounts
if (!par.nocontxt) {
delete context_lib;
delete lib_pc;
}
delete cs_lib;
// Delete content of hits in hitlist
hitlist.Reset();
while (!hitlist.End())
hitlist.Delete().Delete(); // Delete list record and hit object
previous_hits->Reset();
while (!previous_hits->End())
previous_hits->ReadNext().Delete(); // Delete hit object
delete previous_hits;
delete premerged_hits;
#ifdef PTHREAD
pthread_attr_destroy(&joinable);
pthread_mutex_destroy(&bin_status_mutex);
pthread_cond_destroy(&new_job);
pthread_cond_destroy(&finished_job);
#endif
if (print_elapsed) ElapsedTimeSinceLastCall("(sorting and formatting)");
// Print 'Done!'
FILE* outf=NULL;
if (!strcmp(par.outfile,"stdout")) printf("Done!\n");
else
{
if (*par.outfile)
{
outf=fopen(par.outfile,"a"); //open for append
fprintf(outf,"Done!\n");
fclose(outf);
}
if (v>=2) printf("Done\n");
}
exit(0);
} //end main
//////////////////////////////////////////////////////////////////////////////////////////////////////
// END OF MAIN
//////////////////////////////////////////////////////////////////////////////////////////////////////