// hhmake.C: build profile HMM from input alignment for HMM-HMM comparison
// (C) Johannes Soeding and Michael Remmert 2012
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program. If not, see .
// We are very grateful for bug reports! Please contact us at soeding@genzentrum.lmu.de
// Reference:
// Remmert M., Biegert A., Hauser A., and Soding J.
// HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
// Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
#define MAIN
#include // cin, cout, cerr
#include // ofstream, ifstream
#include // printf
#include // exit
#include // strcmp, strstr
#include // sqrt, pow
#include // INT_MIN
#include // FLT_MIN
#include // clock
#include // perror(), strerror(errno)
#include // islower, isdigit etc
#include
#include
#include
#include
#ifdef HH_SSE41
#include // SSSE3
#include // SSE4.1
#define HH_SSE3
#endif
#ifdef HH_SSE3
#include // SSE3
#define HH_SSE2
#endif
#ifdef HH_SSE2
#ifndef __SUNPRO_C
#include // SSE2
#else
#include
#endif
#endif
using std::cout;
using std::cerr;
using std::endl;
using std::ios;
using std::ifstream;
using std::ofstream;
#include "cs.h" // context-specific pseudocounts
#include "context_library.h"
#include "library_pseudocounts-inl.h"
#include "list.h" // list data structure
#include "hash.h" // hash data structure
#include "util.C" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
#include "hhdecl.C" // Constants, global variables, struct Parameters
#include "hhutil.C" // MatchChr, InsertChr, aa2i, i2aa, log2, fast_log2, ScopID, WriteToScreen,
#include "hhmatrices.C" // BLOSUM50, GONNET, HSDM
#include "hhhmm.h" // class HMM
#include "hhhit.h" // class Hit
#include "hhalignment.h" // class Alignment
#include "hhhmm.C" // class HMM
#include "hhalignment.C" // class Alignment
#include "hhfunc.C" // some functions common to hh programs
/////////////////////////////////////////////////////////////////////////////////////
// Global variables
/////////////////////////////////////////////////////////////////////////////////////
HMM* q = new HMM; //Create a HMM with maximum of par.maxres match states
/////////////////////////////////////////////////////////////////////////////////////
// Help functions
/////////////////////////////////////////////////////////////////////////////////////
void help(char all=0)
{
printf("\n");
printf("HHmake_batch %s\n",VERSION_AND_DATE);
printf("Build a set of HMM from a set of input alignment in A2M, A3M, or FASTA format,\n");
printf("or convert between HMMER format (.hmm) and HHsearch format (.hhm). \n");
printf("%s",REFERENCE);
printf("%s",COPYRIGHT);
printf("\n");
printf("Usage: %s -i file [options] \n",program_name);
printf(" -i list of query alignment (A2M, A3M, or FASTA), or query HMM\n");
printf("\n");
printf("Output options: \n");
printf(" -o list of HMM file to be written to (default=) \n");
printf(" -a HMM file to be appended to \n");
printf(" -v verbose mode: 0:no screen output 1:only warings 2: verbose\n");
printf(" -seq max. number of query/template sequences displayed (def=%i) \n",par.nseqdis);
printf(" Beware of overflows! All these sequences are stored in memory.\n");
printf(" -cons make consensus sequence master sequence of query MSA \n");
printf(" -name use this name for HMM (default: use name of first sequence) \n");
printf("\n");
printf("Filter query multiple sequence alignment \n");
printf(" -id [0,100] maximum pairwise sequence identity (%%) (def=%i) \n",par.max_seqid);
printf(" -diff [0,inf[ filter MSA by selecting most diverse set of sequences, keeping \n");
printf(" at least this many seqs in each MSA block of length 50 (def=%i) \n",par.Ndiff);
printf(" -cov [0,100] minimum coverage with query (%%) (def=%i) \n",par.coverage);
printf(" -qid [0,100] minimum sequence identity with query (%%) (def=%i) \n",par.qid);
printf(" -qsc [0,100] minimum score per column with query (def=%.1f)\n",par.qsc);
printf(" -neff [1,inf] target diversity of alignment (default=off)\n");
printf("\n");
printf("Input alignment format: \n");
printf(" -M a2m use A2M/A3M (default): upper case = Match; lower case = Insert;\n");
printf(" '-' = Delete; '.' = gaps aligned to inserts (may be omitted) \n");
printf(" -M first use FASTA: columns with residue in 1st sequence are match states\n");
printf(" -M [0,100] use FASTA: columns with fewer than X%% gaps are match states \n");
printf("\n");
if (all) {
printf("Pseudocount (pc) options: \n");
printf(" -pcm 0-2 position dependence of pc admixture 'tau' (pc mode, default=%-i) \n",par.pcm);
printf(" 0: no pseudo counts: tau = 0 \n");
printf(" 1: constant tau = a \n");
printf(" 2: diversity-dependent: tau = a/(1 + ((Neff[i]-1)/b)^c) \n");
printf(" (Neff[i]: number of effective seqs in local MSA around column i) \n");
printf(" 3: constant diversity pseudocounts \n");
printf(" -pca [0,1] overall pseudocount admixture (def=%-.1f) \n",par.pca);
printf(" -pcb [1,inf[ Neff threshold value for -pcm 2 (def=%-.1f) \n",par.pcb);
printf(" -pcc [0,3] extinction exponent c for -pcm 2 (def=%-.1f) \n",par.pcc);
// printf(" -pcw [0,3] weight of pos-specificity for pcs (def=%-.1f) \n",par.pcw);
printf(" -pre_pca [0,1] PREFILTER pseudocount admixture (def=%-.1f) \n",par.pre_pca);
printf(" -pre_pcb [1,inf[ PREFILTER threshold for Neff (def=%-.1f) \n",par.pre_pcb);
printf("\n");
printf("Context-specific pseudo-counts: \n");
printf(" -nocontxt use substitution-matrix instead of context-specific pseudocounts \n");
printf(" -contxt context file for computing context-specific pseudocounts (default=%s)\n",par.clusterfile);
printf(" -cslib column state file for fast database prefiltering (default=%s)\n",par.cs_library);
printf("\n");
}
printf("Example: %s -i test_list \n",program_name);
printf("\n");
}
/////////////////////////////////////////////////////////////////////////////////////
//// Processing input options from command line and .hhdefaults file
/////////////////////////////////////////////////////////////////////////////////////
void ProcessArguments(int argc,char** argv)
{
// Read command line options
for (int i=1; i<=argc-1; i++)
{
if (v>=4) cout<=argc || argv[i][0]=='-')
{help(); cerr<=argc)
{help(); cerr<=argc)
{help(); cerr<name,argv[++i],NAMELEN-1); //copy longname to name...
strmcpy(q->longname,argv[i],DESCLEN-1); //copy full name to longname
}
else if (!strcmp(argv[i],"-id") && (i='0' && argv[i][0]<='9') {par.Mgaps=atoi(argv[i]); par.M=2;}
else cerr<=argc || argv[i][0]=='-')
{help() ; cerr<=4) cout<0 weighs columns according to their intra-clomun similarity
par.gapb=0.0; // default values for transition pseudocounts; 0.0: add no transition pseudocounts!
par.wg=0; // 0: use local sequence weights 1: use local ones
// Make command line input globally available
par.argv=argv;
par.argc=argc;
RemovePathAndExtension(program_name,argv[0]);
// Enable changing verbose mode before defaults file and command line are processed
for (int i=1; i1 && !strcmp(argv[i],"-v0")) v=0;
else if (argc>1 && !strcmp(argv[i],"-v1")) v=1;
else if (argc>2 && !strcmp(argv[i],"-v")) v=atoi(argv[i+1]);
}
par.SetDefaultPaths(program_path);
// Read .hhdefaults file?
if (par.readdefaultsfile)
{
// Process default otpions from .hhconfig file
ReadDefaultsFile(argc_conf,argv_conf);
ProcessArguments(argc_conf,argv_conf);
}
// Process command line options (they override defaults from .hhdefaults file)
ProcessArguments(argc,argv);
std::vector inList;
std::vector outList;
std::string line;
ifstream fp;
fp.open(par.infile,ios::in);
while (fp.good())
{
getline(fp,line);
if (line.size()) inList.push_back(line);
}
fp.close();
fp.open(par.outfile,ios::in);
while (fp.good())
{
getline(fp,line);
if (line.size()) outList.push_back(line);
}
fp.close();
if (inList.size()!=outList.size())
{
std::cerr<<"ERROR! "<(fin);
fclose(fin);
cs::TransformToLog(*context_lib);
lib_pc = new cs::LibraryPseudocounts(*context_lib, par.csw, par.csb);
}
// Set substitution matrix; adjust to query aa distribution if par.pcm==3
SetSubstitutionMatrix();
for (size_t f=0;fMAXSEQDIS-3) par.nseqdis=MAXSEQDIS-3; //3 reserve for secondary structure
// Get basename
RemoveExtension(q->file,par.infile); //Get basename of infile (w/o extension):
// Outfile not given? Name it basename.hhm
if (!*par.outfile)
{
RemoveExtension(par.outfile,par.infile);
strcat(par.outfile,".hhm");
}
// Read input file (HMM, HHM, or alignment format), and add pseudocounts etc.
char input_format=0;
ReadQueryFile(par.infile,input_format,q);
PrepareQueryHMM(input_format,q);
// Write HMM to output file in HHsearch format
q->WriteToFile(par.outfile);
if (v>=3) WriteToScreen(par.outfile,1000); // (max 1000 lines)
// Print 'Done!'
FILE* outf=NULL;
if (!strcmp(par.outfile,"stdout"))
printf("Done!\n");
else
{
if (!*par.outfile)
{
outf=fopen(par.outfile,"a"); //open for append
fprintf(outf,"Done!\n");
fclose(outf);
}
if (v>=2) printf("Done\n");
}
q->longname[0]=0;
}
if (!par.nocontxt)
{
delete context_lib;
delete lib_pc;
}
delete q;
std::vector ().swap(inList);
std::vector ().swap(outList);
line.clear();
exit(0);
} //end main