// hhalign.C: // Align a multiple alignment to an alignment or HMM // Print out aligned input sequences in a3m format // Error codes: 0: ok 1: file format error 2: file access error 3: memory error 4: internal numeric error 5: command line error /* (C) Johannes Soeding 2012 This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . We are very grateful for bug reports! Please contact us at soeding@mpibpc.mpg.de Reference: Remmert M., Biegert A., Hauser A., and Soding J. HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment. Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011). */ #include "hhalign.h" #include "hhsuite_config.h" std::vector empty; HHalign::HHalign(Parameters& par) : HHblits(par, empty), tfiles(par.tfiles) { } HHalign::~HHalign() { } void HHalign::help(Parameters& par, char all) { printf("HHalign %i.%i.%i\n", HHSUITE_VERSION_MAJOR, HHSUITE_VERSION_MINOR, HHSUITE_VERSION_PATCH); printf("Align a query alignment/HMM to a template alignment/HMM by HMM-HMM alignment\n"); printf("If only one alignment/HMM is given it is compared to itself and the best\n"); printf("off-diagonal alignment plus all further non-overlapping alignments above \n"); printf("significance threshold are shown.\n"); printf("%s", REFERENCE); printf("%s", COPYRIGHT); printf("\n"); printf("Usage: hhalign -i query -t template [options] \n"); printf(" -i input/query: single sequence or multiple sequence alignment (MSA)\n"); printf(" in a3m, a2m, or FASTA format, or HMM in hhm format\n"); printf(" -t input/template: single sequence or multiple sequence alignment (MSA)\n"); printf(" in a3m, a2m, or FASTA format, or HMM in hhm format\n"); if (all) { printf("\n"); printf(" may be 'stdin' or 'stdout' throughout.\n"); } printf("\n"); printf("Input alignment format: \n"); printf(" -M a2m use A2M/A3M (default): upper case = Match; lower case = Insert;\n"); printf(" '-' = Delete; '.' = gaps aligned to inserts (may be omitted) \n"); printf(" -M first use FASTA: columns with residue in 1st sequence are match states\n"); printf(" -M [0,100] use FASTA: columns with fewer than X%% gaps are match states \n"); printf(" -tags/-notags do NOT / do neutralize His-, C-myc-, FLAG-tags, and trypsin \n"); printf(" recognition sequence to background distribution (def=-notags) \n"); printf("\n"); printf("Output options: \n"); printf(" -o write results in standard format to file (default=)\n"); printf(" -oa3m write query alignment in a3m or PSI-BLAST format (-opsi) to file (default=none)\n"); printf(" -aa3m append query alignment in a3m (-aa3m) or PSI-BLAST format (-apsi )to file (default=none)\n"); printf(" -Ofas write pairwise alignments in FASTA xor A2M (-Oa2m) xor A3M (-Oa3m) format \n"); printf(" -add_cons generate consensus sequence as master sequence of query MSA (default=don't)\n"); printf(" -hide_cons don't show consensus sequence in alignments (default=show) \n"); printf(" -hide_pred don't show predicted 2ndary structure in alignments (default=show)\n"); printf(" -hide_dssp don't show DSSP 2ndary structure in alignments (default=show) \n"); printf(" -show_ssconf show confidences for predicted 2ndary structure in alignments\n"); if (all) { printf(" -seq max. number of query/template sequences displayed (default=%i) \n", par.nseqdis); printf(" -aliw number of columns per line in alignment list (default=%i) \n", par.aliwidth); printf(" -p [0,100] minimum probability in summary and alignment list (default=%G) \n", par.p); printf(" -E [0,inf[ maximum E-value in summary and alignment list (default=%G) \n", par.E); printf(" -Z maximum number of lines in summary hit list (default=%i) \n", par.Z); printf(" -z minimum number of lines in summary hit list (default=%i) \n", par.z); printf(" -B maximum number of alignments in alignment list (default=%i) \n", par.B); printf(" -b minimum number of alignments in alignment list (default=%i) \n", par.b); } printf("\n"); printf("Filter options applied to query MSA, template MSA, and result MSA \n"); printf(" -id [0,100] maximum pairwise sequence identity (def=%i)\n", par.max_seqid); printf(" -diff [0,inf[ filter MSAs by selecting most diverse set of sequences, keeping \n"); printf(" at least this many seqs in each MSA block of length 50 \n"); printf(" Zero and non-numerical values turn off the filtering. (def=%i) \n", par.Ndiff); printf(" -cov [0,100] minimum coverage with master sequence (%%) (def=%i) \n", par.coverage); printf(" -qid [0,100] minimum sequence identity with master sequence (%%) (def=%i) \n", par.qid); printf(" -qsc [0,100] minimum score per column with master sequence (default=%.1f) \n", par.qsc); printf(" -mark do not filter out sequences marked by \">@\"in their name line \n"); printf("\n"); printf("HMM-HMM alignment options: \n"); printf(" -norealign do NOT realign displayed hits with MAC algorithm (def=realign) \n"); printf(" -mact [0,1[ posterior prob threshold for MAC realignment controlling greedi- \n"); printf(" ness at alignment ends: 0:global >0.1:local (default=%.2f) \n", par.mact); printf(" -glob/-loc use global/local alignment mode for searching/ranking (def=local)\n"); if (all) { printf(" -realign realign displayed hits with max. accuracy (MAC) algorithm \n"); printf(" -excl exclude query positions from the alignment, e.g. '1-33,97-168' \n"); printf(" -template_excl exclude template positions from the alignment, e.g. '1-33,97-168' \n"); printf(" -ovlp banded alignment: forbid largest diagonals |i-j| of DP matrix (def=%i)\n", par.min_overlap); printf(" -alt show up to this many alternative alignments with raw score > smin(def=%i) \n", par.altali); printf(" -smin minimum raw score for alternative alignments (def=%.1f) \n", par.smin); printf(" -shift [-1,1] profile-profile score offset (def=%-.2f) \n", par.shift); printf(" -corr [0,1] weight of term for pair correlations (def=%.2f) \n", par.corr); printf(" -sc amino acid score (tja: template HMM at column j) (def=%i)\n", par.columnscore); printf(" 0 = log2 Sum(tja*qia/pa) (pa: aa background frequencies) \n"); printf(" 1 = log2 Sum(tja*qia/pqa) (pqa = 1/2*(pa+ta) ) \n"); printf(" 2 = log2 Sum(tja*qia/ta) (ta: av. aa freqs in template) \n"); printf(" 3 = log2 Sum(tja*qia/qa) (qa: av. aa freqs in query) \n"); printf(" 5 local amino acid composition correction \n"); printf(" -ssm {0,..,4} secondary structure scoring [default=%1i] \n", par.ssm); printf(" 0: = no ss scoring \n"); printf(" 1,2: = ss scoring after or during alignment \n"); printf(" 3,4: = ss scoring after or during alignment, predicted vs. predicted\n"); printf(" -ssw [0,1] weight of ss score (def=%-.2f) \n", par.ssw); printf(" -ssa [0,1] ss confusion matrix = (1-ssa)*I + ssa*psipred-confusion-matrix [def=%-.2f)\n", par.ssa); printf(" -wg use global sequence weighting for realignment! \n"); printf("\n"); printf("Gap cost options: \n"); printf(" -gapb [0,inf[ Transition pseudocount admixture (def=%-.2f) \n", par.gapb); printf(" -gapd [0,inf[ Transition pseudocount admixture for open gap (default=%-.2f) \n", par.gapd); printf(" -gapd [0,inf[ Transition pseudocount admixture for open gap (default=%-.2f) \n", par.gapd); printf(" -gape [0,1.5] Transition pseudocount admixture for extend gap (def=%-.2f) \n", par.gape); printf(" -gapf ]0,inf] factor to increase/reduce the gap open penalty for deletes (def=%-.2f) \n", par.gapf); printf(" -gapg ]0,inf] factor to increase/reduce the gap open penalty for inserts (def=%-.2f) \n", par.gapg); printf(" -gaph ]0,inf] factor to increase/reduce the gap extend penalty for deletes(def=%-.2f)\n", par.gaph); printf(" -gapi ]0,inf] factor to increase/reduce the gap extend penalty for inserts(def=%-.2f)\n", par.gapi); printf(" -egq [0,inf[ penalty (bits) for end gaps aligned to query residues (def=%-.2f) \n", par.egq); printf(" -egt [0,inf[ penalty (bits) for end gaps aligned to template residues (def=%-.2f) \n", par.egt); printf("\n"); printf("Pseudocount (pc) options: \n"); printf(" Context specific hhm pseudocounts:\n"); printf(" -pc_hhm_contxt_mode {0,..,3} position dependence of pc admixture 'tau' (pc mode, default=%-i) \n", par.pc_hhm_context_engine.admix); printf(" 0: no pseudo counts: tau = 0 \n"); printf(" 1: constant tau = a \n"); printf(" 2: diversity-dependent: tau = a/(1+((Neff[i]-1)/b)^c) \n"); printf(" 3: CSBlast admixture: tau = a(1+b)/(Neff[i]+b) \n"); printf(" (Neff[i]: number of effective seqs in local MSA around column i) \n"); printf(" -pc_hhm_contxt_a [0,1] overall pseudocount admixture (def=%-.1f) \n", par.pc_hhm_context_engine.pca); printf(" -pc_hhm_contxt_b [1,inf[ Neff threshold value for mode 2 (def=%-.1f) \n", par.pc_hhm_context_engine.pcb); printf(" -pc_hhm_contxt_c [0,3] extinction exponent c for mode 2 (def=%-.1f) \n", par.pc_hhm_context_engine.pcc); printf("\n"); printf(" Context independent hhm pseudocounts (used for templates; used for query if contxt file is not available):\n"); printf(" -pc_hhm_nocontxt_mode {0,..,3} position dependence of pc admixture 'tau' (pc mode, default=%-i) \n", par.pc_hhm_nocontext_mode); printf(" 0: no pseudo counts: tau = 0 \n"); printf(" 1: constant tau = a \n"); printf(" 2: diversity-dependent: tau = a/(1+((Neff[i]-1)/b)^c) \n"); printf(" (Neff[i]: number of effective seqs in local MSA around column i) \n"); printf(" -pc_hhm_nocontxt_a [0,1] overall pseudocount admixture (def=%-.1f) \n", par.pc_hhm_nocontext_a); printf(" -pc_hhm_nocontxt_b [1,inf[ Neff threshold value for mode 2 (def=%-.1f) \n", par.pc_hhm_nocontext_b); printf(" -pc_hhm_nocontxt_c [0,3] extinction exponent c for mode 2 (def=%-.1f) \n", par.pc_hhm_nocontext_c); printf("\n"); printf(" Context-specific pseudo-counts: \n"); printf(" -nocontxt use substitution-matrix instead of context-specific pseudocounts \n"); printf(" -contxt context file for computing context-specific pseudocounts (default=%s)\n", par.clusterfile.c_str()); printf(" -csw [0,inf] weight of central position in cs pseudocount mode (def=%.1f)\n", par.csw); printf(" -csb [0,1] weight decay parameter for positions in cs pc mode (def=%.1f)\n", par.csb); } printf("\n"); printf("Other options: \n"); printf(" -v verbose mode: 0:no screen output 1:only warings 2: verbose (def=%i)\n", par.v); if (all) { printf(" -atab write all alignments in tabular layout to file \n"); printf(" -maxseq max number of input rows (def=%5i)\n", par.maxseq); printf(" -maxres max number of HMM columns (def=%5i)\n", par.maxres); printf(" -maxmem [1,inf[ limit memory for realignment (in GB) (def=%.1f) \n", par.maxmem); } printf("\n"); if (!all) { printf("An extended list of options can be obtained by calling 'hhalign -h all'\n"); } printf("Example: hhalign -i T0187.a3m -t d1hz4a_.hhm -o result.hhr\n"); } ///////////////////////////////////////////////////////////////////////////////////// //// Processing input options from command line ///////////////////////////////////////////////////////////////////////////////////// void HHalign::ProcessAllArguments(Parameters& par) { par.tfiles.resize(0); strcpy(par.alnfile, ""); par.p = 0.0; // minimum threshold for inclusion in hit list and alignment listing par.E = 1e6; // maximum threshold for inclusion in hit list and alignment listing par.b = 1; // min number of alignments par.B = 100; // max number of alignments par.z = 1; // min number of lines in hit list par.Z = 100; // max number of lines in hit list par.append = 0; // append alignment to output file with -a option par.altali = 1; // find only ONE (possibly overlapping) subalignment par.outformat = 3; // default output format for alignment is a3m par.realign = 1; // default: realign par.num_rounds = 1; ProcessArguments(par); // Check option compatibilities if (par.nseqdis > MAXSEQDIS - 3 - par.showcons) par.nseqdis = MAXSEQDIS - 3 - par.showcons; //3 reserved for secondary structure if (par.aliwidth < 20) par.aliwidth = 20; if (par.pc_hhm_context_engine.pca < 0.001) par.pc_hhm_context_engine.pca = 0.001; // to avoid log(0) if (par.b > par.B) par.B = par.b; if (par.z > par.Z) par.Z = par.z; if (par.mact >= 1.0) par.mact = 0.999; else if (par.mact < 0) par.mact = 0.0; if (par.altali < 1) par.altali = 1; } void HHalign::ProcessArguments(Parameters& par) { const int argc = par.argc; const char** argv = par.argv; for (int i = 1; i < argc; i++) { HH_LOG(DEBUG1) << i << " " << argv[i] << std::endl; if (!strcmp(argv[i], "-i")) { if (++i >= argc || argv[i][0] == '-') { help(par); HH_LOG(ERROR) << "No query file following -i" << std::endl; exit(4); } else strcpy(par.infile, argv[i]); } else if (!strcmp(argv[i], "-t")) { if (++i >= argc || argv[i][0] == '-') { help(par); HH_LOG(ERROR) << "No template file following -t" << std::endl; exit(4); } else { std::string tfile(argv[i]); par.tfiles.push_back(tfile); } } else if (!strcmp(argv[i], "-o")) { if (++i >= argc) { help(par); HH_LOG(ERROR) << "No filename following -o" << std::endl; exit(4); } else strcpy(par.outfile, argv[i]); } else if (!strcmp(argv[i], "-Ofas")) { par.outformat = 1; if (++i >= argc || argv[i][0] == '-') { help(par); HH_LOG(ERROR) << "No output file following -Ofas" << std::endl; exit(4); } else strcpy(par.pairwisealisfile, argv[i]); } else if (!strcmp(argv[i], "-Oa2m")) { par.outformat = 2; if (++i >= argc || argv[i][0] == '-') { help(par); HH_LOG(ERROR) << "No output file following -Oa2m" << std::endl; exit(4); } else strcpy(par.pairwisealisfile, argv[i]); } else if (!strcmp(argv[i], "-Oa3m")) { par.outformat = 3; if (++i >= argc || argv[i][0] == '-') { help(par); HH_LOG(ERROR) << "No output file following -Oa3m" << std::endl; exit(4); } else strcpy(par.pairwisealisfile, argv[i]); } else if (!strcmp(argv[i], "-oa3m")) { par.append = 0; if (++i >= argc || argv[i][0] == '-') { help(par); HH_LOG(ERROR) << "No output file following -oa3m" << std::endl; exit(4); } else strcpy(par.alnfile, argv[i]); } else if (!strcmp(argv[i], "-opsi")) { par.append = 0; if (++i >= argc || argv[i][0] == '-') { help(par); HH_LOG(ERROR) << "No output file following -opsi" << std::endl; exit(4); } else strcpy(par.psifile, argv[i]); } else if (!strcmp(argv[i], "-aa3m")) { par.append = 1; if (++i >= argc || argv[i][0] == '-') { help(par); HH_LOG(ERROR) << "No output file following -aa3m" << std::endl; exit(4); } else strcpy(par.alnfile, argv[i]); } else if (!strcmp(argv[i], "-apsi")) { par.append = 1; if (++i >= argc || argv[i][0] == '-') { help(par); HH_LOG(ERROR) << "No output file following -apsi" << std::endl; exit(4); } else strcpy(par.psifile, argv[i]); } else if (!strcmp(argv[i], "-wg")) { par.wg = 1; } else if (!strcmp(argv[i], "-atab")) { if (++i >= argc || argv[i][0] == '-') { help(par); HH_LOG(ERROR) << "No query file following -atab" << std::endl; exit(4); } else strmcpy(par.alitabfile, argv[i], NAMELEN - 1); } else if (!strcmp(argv[i], "-index")) { if (++i >= argc || argv[i][0] == '-') { help(par); HH_LOG(ERROR) << "No index file following -index" << std::endl; exit(4); } else strcpy(par.indexfile, argv[i]); } else if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help")) { if (++i >= argc) { help(par); exit(0); } if (!strcmp(argv[i], "all")) { help(par, 1); exit(0); } else { help(par); exit(0); } } else if (!strcmp(argv[i], "-v") && (i < argc - 1) && argv[i + 1][0] != '-') { int v = atoi(argv[++i]); par.v = Log::from_int(v); Log::reporting_level() = par.v; } else if (!strcmp(argv[i], "-p") && (i < argc - 1)) par.p = atof(argv[++i]); else if (!strcmp(argv[i], "-e") && (i < argc - 1)) par.E = atof(argv[++i]); else if (!strcmp(argv[i], "-E") && (i < argc - 1)) par.E = atof(argv[++i]); else if (!strcmp(argv[i], "-b") && (i < argc - 1)) par.b = atoi(argv[++i]); else if (!strcmp(argv[i], "-B") && (i < argc - 1)) par.B = atoi(argv[++i]); else if (!strcmp(argv[i], "-z") && (i < argc - 1)) par.z = atoi(argv[++i]); else if (!strcmp(argv[i], "-Z") && (i < argc - 1)) par.Z = atoi(argv[++i]); else if (!strncmp(argv[i], "-add_cons", 7)) par.cons = 0; else if (!strncmp(argv[i], "-hide_cons", 7)) par.showcons = 0; else if (!strncmp(argv[i], "-hide_pred", 7)) par.showpred = 0; else if (!strncmp(argv[i], "-hide_dssp", 7)) par.showdssp = 0; else if (!strncmp(argv[i], "-show_ssconf", 7)) par.showconf = 1; else if (!strncmp(argv[i], "-mark", 7)) par.mark = 1; else if (!strcmp(argv[i], "-seq") && (i < argc - 1)) par.nseqdis = atoi(argv[++i]); else if (!strcmp(argv[i], "-aliw") && (i < argc - 1)) par.aliwidth = atoi(argv[++i]); else if (!strcmp(argv[i], "-id") && (i < argc - 1)) par.max_seqid = atoi(argv[++i]); else if (!strcmp(argv[i], "-qid") && (i < argc - 1)) par.qid = atoi(argv[++i]); else if (!strcmp(argv[i], "-qsc") && (i < argc - 1)) par.qsc = atof(argv[++i]); else if (!strcmp(argv[i], "-cov") && (i < argc - 1)) par.coverage = atoi(argv[++i]); else if (!strcmp(argv[i], "-diff") && (i < argc - 1)) par.Ndiff = atoi(argv[++i]); //no help required else if (!strcmp(argv[i], "-Gonnet")) par.matrix = 0; //no help required else if (!strcmp(argv[i], "-Blosum50")) par.matrix = 50; //no help required else if (!strcmp(argv[i], "-Blosum62")) par.matrix = 62; else if (!strcmp(argv[i], "-pc_hhm_contxt_mode") && (i < argc - 1)) par.pc_hhm_context_engine.admix = (Pseudocounts::Admix) atoi(argv[++i]); else if (!strcmp(argv[i], "-pc_hhm_contxt_a") && (i < argc - 1)) par.pc_hhm_context_engine.pca = atof(argv[++i]); else if (!strcmp(argv[i], "-pc_hhm_contxt_b") && (i < argc - 1)) par.pc_hhm_context_engine.pcb = atof(argv[++i]); else if (!strcmp(argv[i], "-pc_hhm_contxt_c") && (i < argc - 1)) par.pc_hhm_context_engine.pcc = atof(argv[++i]); else if (!strcmp(argv[i], "-pc_hhm_nocontxt_mode") && (i < argc - 1)) par.pc_hhm_nocontext_mode = atoi(argv[++i]); else if (!strcmp(argv[i], "-pc_hhm_nocontxt_a") && (i < argc - 1)) par.pc_hhm_nocontext_a = atof(argv[++i]); else if (!strcmp(argv[i], "-pc_hhm_nocontxt_b") && (i < argc - 1)) par.pc_hhm_nocontext_b = atof(argv[++i]); else if (!strcmp(argv[i], "-pc_hhm_nocontxt_c") && (i < argc - 1)) par.pc_hhm_nocontext_c = atof(argv[++i]); else if (!strcmp(argv[i], "-gapb") && (i < argc - 1)) { par.gapb = atof(argv[++i]); if (par.gapb <= 0.01) par.gapb = 0.01; } else if (!strcmp(argv[i], "-gapd") && (i < argc - 1)) par.gapd = atof(argv[++i]); else if (!strcmp(argv[i], "-gape") && (i < argc - 1)) par.gape = atof(argv[++i]); else if (!strcmp(argv[i], "-gapf") && (i < argc - 1)) par.gapf = atof(argv[++i]); else if (!strcmp(argv[i], "-gapg") && (i < argc - 1)) par.gapg = atof(argv[++i]); else if (!strcmp(argv[i], "-gaph") && (i < argc - 1)) par.gaph = atof(argv[++i]); else if (!strcmp(argv[i], "-gapi") && (i < argc - 1)) par.gapi = atof(argv[++i]); else if (!strcmp(argv[i], "-egq") && (i < argc - 1)) par.egq = atof(argv[++i]); else if (!strcmp(argv[i], "-egt") && (i < argc - 1)) par.egt = atof(argv[++i]); else if (!strcmp(argv[i], "-ssm") && (i < argc - 1)) par.ssm = atoi(argv[++i]); else if (!strcmp(argv[i], "-ssw") && (i < argc - 1)) par.ssw = atof(argv[++i]); else if (!strcmp(argv[i], "-ssa") && (i < argc - 1)) par.ssa = atof(argv[++i]); else if (!strncmp(argv[i], "-glob", 5)) { par.loc = 0; if (par.mact > 0.35 && par.mact < 0.3502) { par.mact = 0; } } else if (!strncmp(argv[i], "-loc", 3)) par.loc = 1; else if (!strncmp(argv[i], "-alt", 4) && (i < argc - 1)) par.altali = atoi(argv[++i]); else if (!strncmp(argv[i], "-smin", 4) && (i < argc - 1)) par.smin = atof(argv[++i]); else if (!strcmp(argv[i], "-realign")) par.realign = 1; else if (!strcmp(argv[i], "-norealign")) par.realign = 0; else if (!strcmp(argv[i], "-M") && (i < argc - 1)) { //TODO: M a3m not defined in the help if (!strcmp(argv[++i], "a2m") || !strcmp(argv[i], "a3m")){ par.M = 1; par.M_template = 1; } else if (!strcmp(argv[i], "first")) { par.M = 3; par.M_template = 3; } else if (argv[i][0] >= '0' && argv[i][0] <= '9') { par.Mgaps = atoi(argv[i]); par.M = 2; par.M_template = 2; } else HH_LOG(WARNING) << "Ignoring unknown argument: -M " << argv[i] << std::endl; } else if (!strcmp(argv[i], "-shift") && (i < argc - 1)) par.shift = atof(argv[++i]); else if (!strcmp(argv[i], "-mact") && (i < argc - 1)) { par.mact = atof(argv[++i]); } //not in the help - but intended else if (!strcmp(argv[i], "-scwin") && (i < argc - 1)) { par.columnscore = 5; par.half_window_size_local_aa_bg_freqs = imax(1, atoi(argv[++i])); } else if (!strcmp(argv[i], "-sc") && (i < argc - 1)) par.columnscore = atoi(argv[++i]); else if (!strcmp(argv[i], "-maxseq") && (i < argc - 1)) par.maxseq = atoi(argv[++i]); else if (!strcmp(argv[i], "-maxres") && (i < argc - 1)) { par.maxres = atoi(argv[++i]); par.maxcol = 2 * par.maxres; } else if (!strcmp(argv[i], "-maxmem") && (i < argc - 1)) { par.maxmem = atof(argv[++i]); } else if (!strcmp(argv[i], "-corr") && (i < argc - 1)) par.corr = atof(argv[++i]); else if (!strcmp(argv[i], "-ovlp") && (i < argc - 1)) par.min_overlap = atoi(argv[++i]); else if (!strcmp(argv[i], "-tags")) par.notags = 0; else if (!strcmp(argv[i], "-notags")) par.notags = 1; else if (!strcmp(argv[i], "-nocontxt")) par.nocontxt = 1; else if (!strcmp(argv[i], "-csb") && (i < argc - 1)) par.csb = atof(argv[++i]); else if (!strcmp(argv[i], "-csw") && (i < argc - 1)) par.csw = atof(argv[++i]); else if (!strcmp(argv[i], "-contxt")) { if (++i >= argc || argv[i][0] == '-') { help(par); HH_LOG(ERROR) << "No query file following -contxt" << std::endl; exit(4); } else par.clusterfile = argv[i]; } else if (!strcmp(argv[i],"-excl")) { if (++i>=argc) { help(par); HH_LOG(ERROR) << "No expression following -excl" << std::endl; exit(4); } par.exclstr = new char[strlen(argv[i])+1]; strcpy(par.exclstr,argv[i]); } else if (!strcmp(argv[i],"-template_excl")) { if (++i>=argc) { help(par); HH_LOG(ERROR) << "No expression following -excl" << std::endl; exit(4); } par.template_exclstr = new char[strlen(argv[i])+1]; strcpy(par.template_exclstr,argv[i]); } else { HH_LOG(WARNING) << "Ignoring unknown option " << argv[i] << std::endl; } HH_LOG(DEBUG1) << i << " " << argv[i] << std::endl; } // end of for-loop for command line input } void HHalign::run(FILE* query_fh, char* query_path) { HH_LOG(DEBUG) << "Query file : " << query_path << "\n"; HH_LOG(DEBUG) << "Template files:"; for (size_t i = 0; i < tfiles.size(); ++i) { HH_LOG(DEBUG) << " " << tfiles[i]; } HH_LOG(DEBUG) << "\n"; int cluster_found = 0; int seqs_found = 0; Hit hit_cur; Hash* previous_hits = new Hash(1631, hit_cur); Qali = new Alignment(par.maxseq, par.maxres); Qali_allseqs = new Alignment(par.maxseq, par.maxres); q = new HMM(MAXSEQDIS, par.maxres); HMMSimd q_vec(par.maxres); q_tmp = new HMM(MAXSEQDIS, par.maxres); // Read input file (HMM, HHM, or alignment format), and add pseudocounts etc. Qali->N_in = 0; char input_format = 0; ReadQueryFile(par, query_fh, input_format, par.wg, q, Qali, query_path, pb, S, Sim); PrepareQueryHMM(par, input_format, q, pc_hhm_context_engine, pc_hhm_context_mode, pb, R); q_vec.MapOneHMM(q); *q_tmp = *q; // Set query columns in His-tags etc to Null model distribution if (par.notags) q->NeutralizeTags(pb); std::vector new_entries; for (size_t i = 0; i < tfiles.size(); i++) { HHEntry* template_entry = new HHFileEntry(tfiles[i].c_str(), par.maxres); new_entries.push_back(template_entry); } int max_template_length = getMaxTemplateLength(new_entries); for(int i = 0; i < par.threads; i++) { viterbiMatrices[i]->AllocateBacktraceMatrix(q->L, max_template_length); } ViterbiRunner viterbirunner(viterbiMatrices, dbs, par.threads); std::vector hits_to_add = viterbirunner.alignment(par, &q_vec, new_entries, par.qsc_db, pb, S, Sim, R, par.ssm, S73, S33, S37); hitlist.N_searched = new_entries.size(); add_hits_to_hitlist(hits_to_add, hitlist); // Set new ss weight for realign par.ssw = par.ssw_realign; // Realign hits with MAC algorithm if (par.realign) { perform_realign(q_vec, input_format, new_entries, 1); } mergeHitsToQuery(previous_hits, seqs_found, cluster_found, 1); // Calculate pos-specific weights, AA frequencies and transitions -> f[i][a], tr[i][a] Qali->FrequenciesAndTransitions(q, par.wg, par.mark, par.cons, par.showcons, pb, Sim, NULL, true); if (par.notags) q->NeutralizeTags(pb); for(size_t i = 0; i < new_entries.size(); i++) { delete new_entries[i]; } new_entries.clear(); previous_hits->Reset(); while (!previous_hits->End()) previous_hits->ReadNext().Delete(); // Delete hit object delete previous_hits; }