// hhconsensus.C: read A3M/FASTA file and calculate consensus sequence // Compile with g++ hhconsensus.C -o hhconsensus -O3 -static -fno-strict-aliasing -L/home/soeding/programs/electric-fence-2.1.13/ ////#define WINDOWS #define MAIN #include // cin, cout, cerr #include // ofstream, ifstream #include // printf #include // min,max #include // exit #include // strcmp, strstr #include // sqrt, pow #include // INT_MIN #include // FLT_MIN #include // islower, isdigit etc #include // clock_gettime etc. (in realtime library (-lrt compiler option)) #include // perror() #include #include //#include //#include "efence.h" //#include "efence.c" #ifdef HH_SSE41 #include // SSSE3 #include // SSE4.1 #define HH_SSE3 #endif #ifdef HH_SSE3 #include // SSE3 #define HH_SSE2 #endif #ifdef HH_SSE2 #ifndef __SUNPRO_C #include // SSE2 #else #include #endif #endif using std::cout; using std::cerr; using std::endl; using std::ios; using std::ifstream; using std::ofstream; #include "cs.h" // context-specific pseudocounts #include "context_library.h" #include "library_pseudocounts-inl.h" #include "list.h" // list data structure #include "hash.h" // hash data structure #include "util.C" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc. #include "hhdecl.C" // Constants, global variables, struct Parameters #include "hhutil.C" // MatchChr, InsertChr, aa2i, i2aa, log2, fast_log2, ScopID, WriteToScreen, #include "hhmatrices.C" // BLOSUM50, GONNET, HSDM #include "hhhmm.h" // class HMM #include "hhhit.h" // class Hit #include "hhalignment.h" // class Alignment #include "hhhalfalignment.h" // class HalfAlignment #include "hhhmm.C" // class HMM #include "hhalignment.C" // class Alignment #include "hhhalfalignment.C" // class HalfAlignment #include "hhfunc.C" // some functions common to hh programs ///////////////////////////////////////////////////////////////////////////////////// // Global variables ///////////////////////////////////////////////////////////////////////////////////// HMM* q = new HMM; //Create a HMM with maximum of par.maxres match states ///////////////////////////////////////////////////////////////////////////////////// // Help functions ///////////////////////////////////////////////////////////////////////////////////// void help() { printf("\n"); printf("HHconsensus %s\n",VERSION_AND_DATE); printf("Calculate the consensus sequence for an A3M/FASTA input file. \n"); printf("%s",COPYRIGHT); printf("%s",REFERENCE); printf("\n"); printf("Usage: %s -i [options] \n",program_name); printf(" -i query alignment (A2M, A3M, or FASTA), or query HMM \n"); printf("\n"); printf("Output options: \n"); printf(" -s append consensus sequence in FASTA (default=) \n"); printf(" -o write alignment with consensus sequence in A3M \n"); printf(" -oa3m same \n"); printf(" -oa2m write alignment with consensus sequence in A2M \n"); printf(" -ofas write alignment with consensus sequence in FASTA \n"); printf(" -v verbose mode: 0:no screen output 1:only warings 2: verbose\n"); printf("\n"); printf("Filter input alignment (options can be combined): \n"); printf(" -id [0,100] maximum pairwise sequence identity (%%) (def=%i) \n",par.max_seqid); printf(" -diff [0,inf[ filter most diverse set of sequences, keeping at least this \n"); printf(" many sequences in each block of >50 columns (def=%i)\n",par.Ndiff); printf(" -cov [0,100] minimum coverage with query (%%) (def=%i) \n",par.coverage); printf(" -qid [0,100] minimum sequence identity with query (%%) (def=%i) \n",par.qid); printf(" -qsc [0,100] minimum score per column with query (def=%.1f)\n",par.qsc); printf("\n"); printf("Input alignment format: \n"); printf(" -M a2m use A2M/A3M (default): upper case = Match; lower case = Insert;\n"); printf(" '-' = Delete; '.' = gaps aligned to inserts (may be omitted) \n"); printf(" -M first use FASTA: columns with residue in 1st sequence are match states\n"); printf(" -M [0,100] use FASTA: columns with fewer than X%% gaps are match states \n"); printf("\n"); printf("Other options: \n"); printf(" -addss add predicted secondary structure information from PSIPRED \n"); printf("\n"); printf("Example: %s -i stdin -s stdout\n",program_name); printf("\n"); } ///////////////////////////////////////////////////////////////////////////////////// //// Processing input options from command line and .hhdefaults file ///////////////////////////////////////////////////////////////////////////////////// void ProcessArguments(int argc,char** argv) { // Read command line options for (int i=1; i<=argc-1; i++) { if (v>=4) cout<=argc || argv[i][0]=='-') {help(); cerr<=argc) {help(); cerr<=argc) {help(); cerr<=argc || argv[i][0]=='-') {help() ; cerr<=argc || argv[i][0]=='-') {help() ; cerr<=argc || argv[i][0]=='-') {help() ; cerr<name,argv[++i],NAMELEN-1); //copy longname to name... strmcpy(q->longname,argv[i],DESCLEN-1); //copy full name to longname } else if (!strcmp(argv[i],"-id") && (i='0' && argv[i][0]<='9') {par.Mgaps=atoi(argv[i]); par.M=2;} else cerr<=argc || argv[i][0]=='-') {help() ; cerr<=4) cout<1 && !strcmp(argv[i],"-v0")) v=0; else if (argc>1 && !strcmp(argv[i],"-v1")) v=1; else if (argc>2 && !strcmp(argv[i],"-v")) v=atoi(argv[i+1]); } par.SetDefaultPaths(program_path); // Read .hhdefaults file? if (par.readdefaultsfile) { // Process default otpions from .hhconfig file ReadDefaultsFile(argc_conf,argv_conf); ProcessArguments(argc_conf,argv_conf); } // Process command line options (they override defaults from .hhdefaults file) ProcessArguments(argc,argv); // Check command line input and default values if (!*par.infile) {help(); cerr<file,par.infile); //Get basename of infile (w/o extension): // Outfile not given? Name it basename.hhm if (!*par.outfile && !*par.alnfile) { RemoveExtension(par.outfile,par.infile); strcat(par.outfile,".seq"); } // Prepare CS pseudocounts lib if (!par.nocontxt) { FILE* fin = fopen(par.clusterfile, "r"); context_lib = new cs::ContextLibrary(fin); fclose(fin); cs::TransformToLog(*context_lib); lib_pc = new cs::LibraryPseudocounts(*context_lib, par.csw, par.csb); } // Set substitution matrix; adjust to query aa distribution if par.pcm==3 SetSubstitutionMatrix(); // Read input file (HMM, HHM, or alignment format), and add pseudocounts etc. char input_format=0; ReadQueryFile(par.infile,input_format,q,qali); // Same code as in PrepareQueryHMM(par.infile,input_format,q,qali), except that we add SS prediction // Add Pseudocounts, if no HMMER input if (input_format == 0) { // Transform transition freqs to lin space if not already done q->AddTransitionPseudocounts(); // Comput substitution matrix pseudocounts if (par.nocontxt) { // Generate an amino acid frequency matrix from f[i][a] with full pseudocount admixture (tau=1) -> g[i][a] q->PreparePseudocounts(); // Add amino acid pseudocounts to query: p[i][a] = (1-tau)*f[i][a] + tau*g[i][a] q->AddAminoAcidPseudocounts(); } else { // Add full context specific pseudocounts to query q->AddContextSpecificPseudocounts(); } } else { q->AddAminoAcidPseudocounts(0); } q->CalculateAminoAcidBackground(); if (par.addss==1) CalculateSS(q); // !! if (par.columnscore == 5 && !q->divided_by_local_bg_freqs) q->DivideBySqrtOfLocalBackgroundFreqs(par.half_window_size_local_aa_bg_freqs); // Write consensus sequence to sequence file // Consensus sequence is calculated in hhalignment.C, Alignment::FrequenciesAndTransitions() if (*par.outfile) { FILE* outf=NULL; if (strcmp(par.outfile,"stdout")) { outf=fopen(par.outfile,"a"); if (!outf) OpenFileError(par.outfile); } else outf = stdout; // OLD //// ">name_consensus" -> ">name consensus" //strsubst(q->sname[q->nfirst],"_consensus"," consensus"); //fprintf(outf,">%s\n%s\n",q->sname[q->nfirst],q->seq[q->nfirst]+1); // NEW (long header needed for NR30cons database) fprintf(outf,">%s\n%s\n",q->longname,q->seq[q->nfirst]+1); fclose(outf); } // Print A3M/A2M/FASTA output alignment if (*par.alnfile) { HalfAlignment qa; int n = imin(q->n_display,par.nseqdis+(q->nss_dssp>=0)+(q->nss_pred>=0)+(q->nss_conf>=0)+(q->ncons>=0)); qa.Set(q->name,q->seq,q->sname,n,q->L,q->nss_dssp,q->nss_pred,q->nss_conf,q->nsa_dssp,q->ncons); if (par.outformat==1) qa.BuildFASTA(); else if (par.outformat==2) qa.BuildA2M(); else if (par.outformat==3) qa.BuildA3M(); if (qali->readCommentLine) qa.Print(par.alnfile,qali->longname); // print alignment to outfile else qa.Print(par.alnfile); // print alignment to outfile } delete qali; delete q; // Print 'Done!' printf("Done!\n"); exit(0); } //end main