// hhmake.C: build profile HMM from input alignment for HMM-HMM comparison // (C) Johannes Soeding and Michael Remmert 2012 // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, either version 3 of the License, or // (at your option) any later version. // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // You should have received a copy of the GNU General Public License // along with this program. If not, see . // We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de // Reference: // Remmert M., Biegert A., Hauser A., and Soding J. // HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment. // Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011). #define MAIN #include // cin, cout, cerr #include // ofstream, ifstream #include // printf #include // exit #include // strcmp, strstr #include // sqrt, pow #include // INT_MIN #include // FLT_MIN #include // clock #include // perror(), strerror(errno) #include // islower, isdigit etc #include #include #include #include #ifdef HH_SSE41 #include // SSSE3 #include // SSE4.1 #define HH_SSE3 #endif #ifdef HH_SSE3 #include // SSE3 #define HH_SSE2 #endif #ifdef HH_SSE2 #ifndef __SUNPRO_C #include // SSE2 #else #include #endif #endif using std::cout; using std::cerr; using std::endl; using std::ios; using std::ifstream; using std::ofstream; #include "cs.h" // context-specific pseudocounts #include "context_library.h" #include "library_pseudocounts-inl.h" #include "list.h" // list data structure #include "hash.h" // hash data structure #include "util.C" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc. #include "hhdecl.C" // Constants, global variables, struct Parameters #include "hhutil.C" // MatchChr, InsertChr, aa2i, i2aa, log2, fast_log2, ScopID, WriteToScreen, #include "hhmatrices.C" // BLOSUM50, GONNET, HSDM #include "hhhmm.h" // class HMM #include "hhhit.h" // class Hit #include "hhalignment.h" // class Alignment #include "hhhmm.C" // class HMM #include "hhalignment.C" // class Alignment #include "hhfunc.C" // some functions common to hh programs ///////////////////////////////////////////////////////////////////////////////////// // Global variables ///////////////////////////////////////////////////////////////////////////////////// HMM* q = new HMM; //Create a HMM with maximum of par.maxres match states ///////////////////////////////////////////////////////////////////////////////////// // Help functions ///////////////////////////////////////////////////////////////////////////////////// void help(char all=0) { printf("\n"); printf("HHmake_batch %s\n",VERSION_AND_DATE); printf("Build a set of HMM from a set of input alignment in A2M, A3M, or FASTA format,\n"); printf("or convert between HMMER format (.hmm) and HHsearch format (.hhm). \n"); printf("%s",REFERENCE); printf("%s",COPYRIGHT); printf("\n"); printf("Usage: %s -i file [options] \n",program_name); printf(" -i list of query alignment (A2M, A3M, or FASTA), or query HMM\n"); printf("\n"); printf("Output options: \n"); printf(" -o list of HMM file to be written to (default=) \n"); printf(" -a HMM file to be appended to \n"); printf(" -v verbose mode: 0:no screen output 1:only warings 2: verbose\n"); printf(" -seq max. number of query/template sequences displayed (def=%i) \n",par.nseqdis); printf(" Beware of overflows! All these sequences are stored in memory.\n"); printf(" -cons make consensus sequence master sequence of query MSA \n"); printf(" -name use this name for HMM (default: use name of first sequence) \n"); printf("\n"); printf("Filter query multiple sequence alignment \n"); printf(" -id [0,100] maximum pairwise sequence identity (%%) (def=%i) \n",par.max_seqid); printf(" -diff [0,inf[ filter MSA by selecting most diverse set of sequences, keeping \n"); printf(" at least this many seqs in each MSA block of length 50 (def=%i) \n",par.Ndiff); printf(" -cov [0,100] minimum coverage with query (%%) (def=%i) \n",par.coverage); printf(" -qid [0,100] minimum sequence identity with query (%%) (def=%i) \n",par.qid); printf(" -qsc [0,100] minimum score per column with query (def=%.1f)\n",par.qsc); printf(" -neff [1,inf] target diversity of alignment (default=off)\n"); printf("\n"); printf("Input alignment format: \n"); printf(" -M a2m use A2M/A3M (default): upper case = Match; lower case = Insert;\n"); printf(" '-' = Delete; '.' = gaps aligned to inserts (may be omitted) \n"); printf(" -M first use FASTA: columns with residue in 1st sequence are match states\n"); printf(" -M [0,100] use FASTA: columns with fewer than X%% gaps are match states \n"); printf("\n"); if (all) { printf("Pseudocount (pc) options: \n"); printf(" -pcm 0-2 position dependence of pc admixture 'tau' (pc mode, default=%-i) \n",par.pcm); printf(" 0: no pseudo counts: tau = 0 \n"); printf(" 1: constant tau = a \n"); printf(" 2: diversity-dependent: tau = a/(1 + ((Neff[i]-1)/b)^c) \n"); printf(" (Neff[i]: number of effective seqs in local MSA around column i) \n"); printf(" 3: constant diversity pseudocounts \n"); printf(" -pca [0,1] overall pseudocount admixture (def=%-.1f) \n",par.pca); printf(" -pcb [1,inf[ Neff threshold value for -pcm 2 (def=%-.1f) \n",par.pcb); printf(" -pcc [0,3] extinction exponent c for -pcm 2 (def=%-.1f) \n",par.pcc); // printf(" -pcw [0,3] weight of pos-specificity for pcs (def=%-.1f) \n",par.pcw); printf(" -pre_pca [0,1] PREFILTER pseudocount admixture (def=%-.1f) \n",par.pre_pca); printf(" -pre_pcb [1,inf[ PREFILTER threshold for Neff (def=%-.1f) \n",par.pre_pcb); printf("\n"); printf("Context-specific pseudo-counts: \n"); printf(" -nocontxt use substitution-matrix instead of context-specific pseudocounts \n"); printf(" -contxt context file for computing context-specific pseudocounts (default=%s)\n",par.clusterfile); printf(" -cslib column state file for fast database prefiltering (default=%s)\n",par.cs_library); printf("\n"); } printf("Example: %s -i test_list \n",program_name); printf("\n"); } ///////////////////////////////////////////////////////////////////////////////////// //// Processing input options from command line and .hhdefaults file ///////////////////////////////////////////////////////////////////////////////////// void ProcessArguments(int argc,char** argv) { // Read command line options for (int i=1; i<=argc-1; i++) { if (v>=4) cout<=argc || argv[i][0]=='-') {help(); cerr<=argc) {help(); cerr<=argc) {help(); cerr<name,argv[++i],NAMELEN-1); //copy longname to name... strmcpy(q->longname,argv[i],DESCLEN-1); //copy full name to longname } else if (!strcmp(argv[i],"-id") && (i='0' && argv[i][0]<='9') {par.Mgaps=atoi(argv[i]); par.M=2;} else cerr<=argc || argv[i][0]=='-') {help() ; cerr<=4) cout<0 weighs columns according to their intra-clomun similarity par.gapb=0.0; // default values for transition pseudocounts; 0.0: add no transition pseudocounts! par.wg=0; // 0: use local sequence weights 1: use local ones // Make command line input globally available par.argv=argv; par.argc=argc; RemovePathAndExtension(program_name,argv[0]); // Enable changing verbose mode before defaults file and command line are processed for (int i=1; i1 && !strcmp(argv[i],"-v0")) v=0; else if (argc>1 && !strcmp(argv[i],"-v1")) v=1; else if (argc>2 && !strcmp(argv[i],"-v")) v=atoi(argv[i+1]); } par.SetDefaultPaths(program_path); // Read .hhdefaults file? if (par.readdefaultsfile) { // Process default otpions from .hhconfig file ReadDefaultsFile(argc_conf,argv_conf); ProcessArguments(argc_conf,argv_conf); } // Process command line options (they override defaults from .hhdefaults file) ProcessArguments(argc,argv); std::vector inList; std::vector outList; std::string line; ifstream fp; fp.open(par.infile,ios::in); while (fp.good()) { getline(fp,line); if (line.size()) inList.push_back(line); } fp.close(); fp.open(par.outfile,ios::in); while (fp.good()) { getline(fp,line); if (line.size()) outList.push_back(line); } fp.close(); if (inList.size()!=outList.size()) { std::cerr<<"ERROR! "<(fin); fclose(fin); cs::TransformToLog(*context_lib); lib_pc = new cs::LibraryPseudocounts(*context_lib, par.csw, par.csb); } // Set substitution matrix; adjust to query aa distribution if par.pcm==3 SetSubstitutionMatrix(); for (size_t f=0;fMAXSEQDIS-3) par.nseqdis=MAXSEQDIS-3; //3 reserve for secondary structure // Get basename RemoveExtension(q->file,par.infile); //Get basename of infile (w/o extension): // Outfile not given? Name it basename.hhm if (!*par.outfile) { RemoveExtension(par.outfile,par.infile); strcat(par.outfile,".hhm"); } // Read input file (HMM, HHM, or alignment format), and add pseudocounts etc. char input_format=0; ReadQueryFile(par.infile,input_format,q); PrepareQueryHMM(input_format,q); // Write HMM to output file in HHsearch format q->WriteToFile(par.outfile); if (v>=3) WriteToScreen(par.outfile,1000); // (max 1000 lines) // Print 'Done!' FILE* outf=NULL; if (!strcmp(par.outfile,"stdout")) printf("Done!\n"); else { if (!*par.outfile) { outf=fopen(par.outfile,"a"); //open for append fprintf(outf,"Done!\n"); fclose(outf); } if (v>=2) printf("Done\n"); } q->longname[0]=0; } if (!par.nocontxt) { delete context_lib; delete lib_pc; } delete q; std::vector ().swap(inList); std::vector ().swap(outList); line.clear(); exit(0); } //end main