// hhmake.C: build profile HMM from input alignment for HMM-HMM comparison
// (C) Johannes Soeding and Michael Remmert 2012
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
// You should have received a copy of the GNU General Public License
// along with this program. If not, see .
// We are very grateful for bug reports! Please contact us at soeding@mpibpc.mpg.de
// Reference:
// Remmert M., Biegert A., Hauser A., and Soding J.
// HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
// Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
#include // ofstream, ifstream
#include // printf
#include // exit
#include // strcmp, strstr
#include // sqrt, pow
#include // INT_MIN
#include // FLT_MIN
#include // clock
#include // perror(), strerror(errno)
#include // islower, isdigit etc
#include
#include "hhsuite_config.h"
#include "cs.h" // context-specific pseudocounts
#include "context_library.h"
#include "library_pseudocounts-inl.h"
#include "crf_pseudocounts-inl.h"
#include "list.h" // list data structure
#include "hash.h" // hash data structure
#include "util.h" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc.
#include "hhdecl.h" // Constants, global variables, struct Parameters
#include "hhutil.h" // MatchChr, InsertChr, aa2i, i2aa, log2, fast_log2, ScopID
#include "hhmatrices.h" // BLOSUM50, GONNET, HSDM
#include "hhhmm.h" // class HMM
#include "hhhit.h" // class Hit
#include "hhalignment.h" // class Alignment
#include "hhfunc.h" // some functions common to hh programs
// Help functions
void help(Parameters& par, char all = 0) {
printf("HHmake %i.%i.%i\n", HHSUITE_VERSION_MAJOR, HHSUITE_VERSION_MINOR, HHSUITE_VERSION_PATCH);
printf("Build an HMM from an input alignment in A2M, A3M, or FASTA format,\n");
printf("or convert between HMMER format (.hmm) and HHsearch format (.hhm).\n");
printf("%s", REFERENCE);
printf("%s", COPYRIGHT);
printf("\n");
printf("Usage: hhmake -i -o [options]\n");
printf(" -i query alignment (A2M, A3M, or FASTA), or query HMM \n");
if (all) {
printf("\n");
printf(" may be 'stdin' or 'stdout' throughout.\n");
}
printf("\n");
printf("Output options: \n");
printf(" -o HMM file to be written to (default=) \n");
printf(" -a HMM file to be appended to \n");
printf(" -v verbose mode: 0:no screen output 1:only warings 2: verbose\n");
printf(" -seq max. number of query/template sequences displayed (def=%i) \n", par.nseqdis);
printf(" Beware of overflows! All these sequences are stored in memory.\n");
printf(" -add_cons make consensus sequence master sequence of query MSA \n");
printf(" -name use this name for HMM (default: use name of first sequence) \n");
printf("\n");
printf("Filter query multiple sequence alignment \n");
printf(" -id [0,100] maximum pairwise sequence identity (%%) (def=%i) \n", par.max_seqid);
printf(" -diff [0,inf[ filter MSA by selecting most diverse set of sequences, keeping \n");
printf(" at least this many seqs in each MSA block of length 50 (def=%i) \n", par.Ndiff);
printf(" -cov [0,100] minimum coverage with query (%%) (def=%i) \n", par.coverage);
printf(" -qid [0,100] minimum sequence identity with query (%%) (def=%i) \n", par.qid);
printf(" -qsc [0,100] minimum score per column with query (def=%.1f)\n", par.qsc);
printf(" -neff [1,inf] target diversity of alignment (default=off)\n");
printf("\n");
printf("Input alignment format: \n");
printf(" -M a2m use A2M/A3M (default): upper case = Match; lower case = Insert;\n");
printf(" '-' = Delete; '.' = gaps aligned to inserts (may be omitted) \n");
printf(" -M first use FASTA: columns with residue in 1st sequence are match states\n");
printf(" -M [0,100] use FASTA: columns with fewer than X%% gaps are match states \n");
printf("\n");
if (all) {
printf("Pseudocount (pc) options: \n");
printf(" Context specific hhm pseudocounts:\n");
printf(" -pc_hhm_contxt_mode {0,..,3} position dependence of pc admixture 'tau' (pc mode, default=%-i) \n",par.pc_hhm_context_engine.admix);
printf(" 0: no pseudo counts: tau = 0 \n");
printf(" 1: constant tau = a \n");
printf(" 2: diversity-dependent: tau = a/(1+((Neff[i]-1)/b)^c) \n");
printf(" 3: CSBlast admixture: tau = a(1+b)/(Neff[i]+b) \n");
printf(" (Neff[i]: number of effective seqs in local MSA around column i) \n");
printf(" -pc_hhm_contxt_a [0,1] overall pseudocount admixture (def=%-.1f) \n",par.pc_hhm_context_engine.pca);
printf(" -pc_hhm_contxt_b [1,inf[ Neff threshold value for mode 2 (def=%-.1f) \n",par.pc_hhm_context_engine.pcb);
printf(" -pc_hhm_contxt_c [0,3] extinction exponent c for mode 2 (def=%-.1f) \n\n",par.pc_hhm_context_engine.pcc);
printf(" Context independent hhm pseudocounts (used for templates; used for query if contxt file is not available):\n");
printf(" -pc_hhm_nocontxt_mode {0,..,3} position dependence of pc admixture 'tau' (pc mode, default=%-i) \n",par.pc_hhm_nocontext_mode);
printf(" 0: no pseudo counts: tau = 0 \n");
printf(" 1: constant tau = a \n");
printf(" 2: diversity-dependent: tau = a/(1+((Neff[i]-1)/b)^c) \n");
// printf(" 3: CSBlast admixture: tau = a(1+b)/(Neff[i]+b) \n");
printf(" (Neff[i]: number of effective seqs in local MSA around column i) \n");
printf(" -pc_hhm_nocontxt_a [0,1] overall pseudocount admixture (def=%-.1f) \n",par.pc_hhm_nocontext_a);
printf(" -pc_hhm_nocontxt_b [1,inf[ Neff threshold value for mode 2 (def=%-.1f) \n",par.pc_hhm_nocontext_b);
printf(" -pc_hhm_nocontxt_c [0,3] extinction exponent c for mode 2 (def=%-.1f) \n\n",par.pc_hhm_nocontext_c);
printf(" Context-specific pseudo-counts: \n");
printf(" -nocontxt use substitution-matrix instead of context-specific pseudocounts \n");
printf(" -contxt context file for computing context-specific pseudocounts (default=%s)\n\n", par.clusterfile.c_str());
printf("Other options:\n");
printf(" -maxseq max number of input rows (def=%5i)\n", par.maxseq);
printf(" -maxres max number of HMM columns (def=%5i)\n", par.maxres);
printf("\n");
}
printf("Example: hhmake -i test.a3m -o stdout \n");
printf("\n");
}
/////////////////////////////////////////////////////////////////////////////////////
//// Processing input options from command line
/////////////////////////////////////////////////////////////////////////////////////
void ProcessArguments(Parameters& par, std::string& name) {
const int argc = par.argc;
const char** argv = par.argv;
// Read command line options
for (int i = 1; i <= argc - 1; i++) {
HH_LOG(DEBUG1) << i << " " << argv[i] << std::endl;
if (!strcmp(argv[i], "-i")) {
if (++i >= argc || argv[i][0] == '-') {
help(par);
HH_LOG(ERROR) << "No input file following -i" << std::endl;
exit(4);
}
else
strcpy(par.infile, argv[i]);
}
else if (!strcmp(argv[i], "-o")) {
par.append = 0;
if (++i >= argc) {
help(par);
HH_LOG(ERROR) << "No output file following -o" << std::endl;
exit(4);
}
else
strcpy(par.outfile, argv[i]);
}
else if (!strcmp(argv[i], "-a")) {
par.append = 1;
if (++i >= argc) {
help(par);
HH_LOG(ERROR) << "No output file following -a" << std::endl;
exit(4);
}
else
strcpy(par.outfile, argv[i]);
}
else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help")) {
help(par, 1);
exit(0);
}
else if (!strcmp(argv[i], "-v") && (i < argc - 1) && argv[i + 1][0] != '-') {
int v = atoi(argv[++i]);
par.v = Log::from_int(v);
Log::reporting_level() = par.v;
}
else if (!strcmp(argv[i], "-seq") && (i < argc - 1))
par.nseqdis = atoi(argv[++i]);
else if (!strncmp(argv[i], "-add_cons", 5))
par.cons = 1;
else if (!strncmp(argv[i], "-mark", 5))
par.mark = 1;
else if (!strcmp(argv[i], "-name") && (i < argc - 1)) {
name = argv[++i];
}
else if (!strcmp(argv[i], "-id") && (i < argc - 1))
par.max_seqid = atoi(argv[++i]);
else if (!strcmp(argv[i], "-qid") && (i < argc - 1))
par.qid = atoi(argv[++i]);
else if (!strcmp(argv[i], "-qsc") && (i < argc - 1))
par.qsc = atof(argv[++i]);
else if (!strcmp(argv[i], "-cov") && (i < argc - 1))
par.coverage = atoi(argv[++i]);
else if (!strcmp(argv[i], "-diff") && (i < argc - 1))
par.Ndiff = atoi(argv[++i]);
else if (!strcmp(argv[i], "-neff") && (i < argc - 1))
par.Neff = atof(argv[++i]);
else if (!strcmp(argv[i], "-Neff") && (i < argc - 1))
par.Neff = atof(argv[++i]);
else if (!strcmp(argv[i], "-M") && (i < argc - 1)) {
if (!strcmp(argv[++i], "a2m") || !strcmp(argv[i], "a3m"))
par.M = 1;
else if (!strcmp(argv[i], "first"))
par.M = 3;
else if (argv[i][0] >= '0' && argv[i][0] <= '9') {
par.Mgaps = atoi(argv[i]);
par.M = 2;
}
else
HH_LOG(WARNING) << "Ignoring unknown argument: -M " << argv[i] << std::endl;
}
else if (!strcmp(argv[i], "-Gonnet"))
par.matrix = 0;
else if (!strncmp(argv[i], "-BLOSUM", 7)
|| !strncmp(argv[i], "-Blosum", 7)) {
if (!strcmp(argv[i] + 7, "30"))
par.matrix = 30;
else if (!strcmp(argv[i] + 7, "40"))
par.matrix = 40;
else if (!strcmp(argv[i] + 7, "50"))
par.matrix = 50;
else if (!strcmp(argv[i] + 7, "65"))
par.matrix = 65;
else if (!strcmp(argv[i] + 7, "80"))
par.matrix = 80;
else
HH_LOG(WARNING) << "Ignoring unknown option " << argv[i] << std::endl;
}
else if (!strcmp(argv[i], "-wg"))
par.wg = 1;
else if (!strcmp(argv[i], "-maxseq") && (i < argc - 1))
par.maxseq = atoi(argv[++i]);
else if (!strcmp(argv[i], "-maxres") && (i < argc - 1)) {
par.maxres = atoi(argv[++i]);
par.maxcol = 2 * par.maxres;
}
else if (!strcmp(argv[i], "-pcm") && (i < argc - 1))
par.pc_hhm_context_engine.admix = (Pseudocounts::Admix) atoi(argv[++i]);
else if (!strcmp(argv[i], "-pca") && (i < argc - 1))
par.pc_hhm_context_engine.pca = atof(argv[++i]);
else if (!strcmp(argv[i], "-pcb") && (i < argc - 1))
par.pc_hhm_context_engine.pcb = atof(argv[++i]);
else if (!strcmp(argv[i], "-pcc") && (i < argc - 1))
par.pc_hhm_context_engine.pcc = atof(argv[++i]);
else if (!strcmp(argv[i], "-gapb") && (i < argc - 1)) {
par.gapb = atof(argv[++i]);
if (par.gapb <= 0.01)
par.gapb = 0.01;
}
else if (!strcmp(argv[i], "-gapd") && (i < argc - 1))
par.gapd = atof(argv[++i]);
else if (!strcmp(argv[i], "-gape") && (i < argc - 1))
par.gape = atof(argv[++i]);
else if (!strcmp(argv[i], "-gapf") && (i < argc - 1))
par.gapf = atof(argv[++i]);
else if (!strcmp(argv[i], "-gapg") && (i < argc - 1))
par.gapg = atof(argv[++i]);
else if (!strcmp(argv[i], "-gaph") && (i < argc - 1))
par.gaph = atof(argv[++i]);
else if (!strcmp(argv[i], "-gapi") && (i < argc - 1))
par.gapi = atof(argv[++i]);
else if (!strcmp(argv[i], "-maxseq") && (i < argc - 1))
par.maxseq = atoi(argv[++i]);
else if (!strcmp(argv[i], "-maxres") && (i < argc - 1))
par.maxres = par.maxcol = atoi(argv[++i]);
else if (!strcmp(argv[i], "-nocontxt")) {
par.nocontxt = 1;
}
else if (!strcmp(argv[i], "-csb") && (i < argc - 1))
par.csb = atof(argv[++i]);
else if (!strcmp(argv[i], "-csw") && (i < argc - 1))
par.csw = atof(argv[++i]);
else if (!strcmp(argv[i], "-cs")) {
if (++i >= argc || argv[i][0] == '-') {
help(par);
HH_LOG(ERROR) << "No query file following -cs" << std::endl;
exit(4);
}
else
par.clusterfile = argv[i];
}
else {
HH_LOG(WARNING) << "Ignoring unknown option " << argv[i] << std::endl;
}
HH_LOG(DEBUG1) << i << " " << argv[i] << std::endl;
} // end of for-loop for command line input
}
int main(int argc, const char **argv) {
Parameters par(argc, argv);
strcpy(par.infile, "");
strcpy(par.outfile, "");
strcpy(par.alnfile, "");
//Default parameter settings
par.showcons = 1; // write consensus sequence into hhm file
par.append = 0; // overwrite output file
par.nseqdis = 10; // maximum number of query or template sequences to be recoreded in HMM and diplayed in output alignments
par.mark = 0; // 1: only marked sequences (or first) get displayed; 0: most divergent ones get displayed
par.max_seqid = 90; // default for maximum sequence identity threshold
par.qid = 0; // default for maximum sequence identity threshold
par.qsc = -20.0f; // default for minimum score per column with query
par.coverage = 0; // default for minimum coverage threshold
par.Ndiff = 100; // pick Ndiff most different sequences from alignment
par.M = 1; // match state assignment is by A2M/A3M
par.Mgaps = 50; // above this percentage of gaps, columns are assigned to insert states
par.matrix = 0; // Subst.matrix 0: Gonnet, 1: HSDM, 2: BLOSUM50 3: BLOSUM62
par.pc_hhm_context_engine.admix = (Pseudocounts::Admix) 0; // no amino acid and transition pseudocounts added
par.gapb = 0.0; // default values for transition pseudocounts; 0.0: add no transition pseudocounts!
par.wg = 0; // 0: use local sequence weights 1: use local ones
std::string name;
ProcessArguments(par, name);
// Check command line input and default values
if (!*par.infile) {
help(par);
HH_LOG(ERROR) << "Input file is missing" << std::endl;
exit(4);
}
if (par.nseqdis > MAXSEQDIS - 3) {
// 3 reserve for secondary structure
par.nseqdis = MAXSEQDIS - 3;
}
// Outfile not given? Name it basename.hhm
if (!*par.outfile) {
RemoveExtension(par.outfile, par.infile);
strcat(par.outfile, ".hhm");
}
cs::ContextLibrary* context_lib = NULL;
cs::Crf* crf = NULL;
cs::Pseudocounts* pc_hhm_context_engine = NULL;
cs::Admix* pc_hhm_context_mode = NULL;
cs::Pseudocounts* pc_prefilter_context_engine = NULL;
cs::Admix* pc_prefilter_context_mode = NULL;
// Prepare CS pseudocounts lib
if (!par.nocontxt) {
InitializePseudocountsEngine(par, context_lib, crf, pc_hhm_context_engine, pc_hhm_context_mode, pc_prefilter_context_engine, pc_prefilter_context_mode);
}
// substitution matrix flavours
float __attribute__((aligned(16))) P[20][20];
float __attribute__((aligned(16))) R[20][20];
float __attribute__((aligned(16))) Sim[20][20];
float __attribute__((aligned(16))) S[20][20];
float __attribute__((aligned(16))) pb[21];
float __attribute__((aligned(16))) qav[21];
// secondary structure matrices
float S73[NDSSP][NSSPRED][MAXCF];
float S33[NSSPRED][MAXCF][NSSPRED][MAXCF];
// Set substitution matrix; adjust to query aa distribution if par.pcm==3
SetSubstitutionMatrix(par.matrix, pb, P, R, S, Sim);
// Read input file (HMM, HHM, or alignment format), and add pseudocounts etc.
char input_format = 0;
Alignment Qali(par.maxseq, par.maxres);
// Write HMM to output file in HHsearch format
HMM q(par.nseqdis, par.maxres);
strmcpy(q.name, name.c_str(), NAMELEN - 1);
strmcpy(q.longname, name.c_str(), DESCLEN - 1);
//Get basename of infile (w/o extension)
RemoveExtension(q.file, par.infile);
ReadQueryFile(par, par.infile, input_format, par.wg, &q, &Qali, pb, S, Sim);
PrepareQueryHMM(par, input_format, &q, pc_hhm_context_engine, pc_hhm_context_mode, pb, R);
q.WriteToFile(par.outfile, par.append, par.max_seqid, par.coverage, par.qid, par.Ndiff, par.qsc, par.argc, par.argv, pb);
DeletePseudocountsEngine(context_lib, crf, pc_hhm_context_engine, pc_hhm_context_mode, pc_prefilter_context_engine, pc_prefilter_context_mode);
}