/* hmmstat: display summary statistics for an HMM database. * * Example: * ./hmmstat Pfam * * SRE, Thu May 24 11:18:20 2007 * SVN $Id: hmmstat.c 4321 2012-11-27 16:15:47Z wheelert $ */ #include "p7_config.h" #include #include #include #include #include "easel.h" #include "esl_getopts.h" #include "esl_exponential.h" #include "hmmer.h" static ESL_OPTIONS options[] = { /* name type default env range toggles reqs incomp help docgroup*/ { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 }, { "--eval2score", eslARG_NONE, FALSE, NULL, NULL, NULL, "-E", NULL, "compute score for E-value (E) for database of (Z) sequences", 0 }, { "--score2eval", eslARG_NONE, FALSE, NULL, NULL, NULL, "-S", NULL, "compute E-value for score (S) for database of (Z) sequences", 0 }, { "-Z", eslARG_INT, "1", NULL, "n>0", NULL, NULL, "--baseZ1,--baseZ", "database size (in seqs) for --eval2score or --score2eval", 0 }, { "--baseZ", eslARG_INT, "0", NULL, NULL, NULL, NULL, "--baseZ1,-Z", "database size (M bases) (DNA only, if search on both strands)", 0 }, { "--baseZ1", eslARG_INT, "0", NULL, NULL, NULL, NULL, "--baseZ,-Z", "database size (M bases) (DNA only, if search on single strand)", 0 }, { "-E", eslARG_REAL, "0.01", NULL, NULL, NULL, "--eval2score", NULL, "E-value threshold, for --eval2score", 0 }, { "-S", eslARG_REAL, "0.01", NULL, NULL, NULL, "--score2eval", NULL, "Score input for --score2eval", 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, }; static char usage[] = "[-options] "; static char banner[] = "display summary statistics for a profile file"; static int output_header(FILE *ofp, const ESL_GETOPTS *go) { p7_banner(ofp, go->argv[0], banner); if (esl_opt_IsUsed(go, "--eval2score")) { if ( fprintf(ofp, "# show score required to reach E-value: %.2g\n", esl_opt_GetReal(go, "-E")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } if (esl_opt_IsUsed(go, "--score2eval")) { if ( fprintf(ofp, "# show E-value corresponding to score: %.2g\n", esl_opt_GetReal(go, "-S")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } if (esl_opt_IsUsed(go, "--eval2score") || esl_opt_IsUsed(go, "--score2eval")) { if (esl_opt_IsUsed(go, "--baseZ") ) { if ( fprintf(ofp, "# using base count (search both strands): %d Mb\n", esl_opt_GetInteger(go, "--baseZ")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } else if (esl_opt_IsUsed(go, "--baseZ1") ) { if ( fprintf(ofp, "# using base count (search single strand): %d Mb\n", esl_opt_GetInteger(go, "--baseZ1")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } else { if ( fprintf(ofp, "# using database sequence count: %d\n", esl_opt_GetInteger(go, "-Z")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); } } if (fprintf(ofp, "# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); return eslOK; } int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; /* command line processing */ ESL_ALPHABET *abc = NULL; char *hmmfile = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; int nhmm; double x; float KL; int status; char errbuf[eslERRBUFSIZE]; int do_eval2score = 0; int do_score2eval = 0; long z_val; float e_val; float s_val; float nseq; /* Process the command line options. */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") == TRUE) { p7_banner(stdout, argv[0], banner); esl_usage(stdout, argv[0], usage); puts("\nOptions:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=docgroup, 2 = indentation; 80=textwidth*/ exit(0); } if (esl_opt_ArgNumber(go) != 1) { puts("Incorrect number of command line arguments."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if ((hmmfile = esl_opt_GetArg(go, 1)) == NULL) { puts("Failed to read argument from command line."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } output_header(stdout, go); if ( esl_opt_IsOn(go, "--eval2score") ) { do_eval2score = TRUE; e_val = esl_opt_GetReal(go, "-E"); } else if ( esl_opt_IsOn(go, "--score2eval") ) { do_score2eval = TRUE; s_val = esl_opt_GetReal(go, "-S"); } else if ( esl_opt_IsUsed(go, "--baseZ") || esl_opt_IsUsed(go, "--baseZ1") || esl_opt_IsUsed(go, "-Z") ) { puts("The flags -Z, --baseZ, and --baseZ1 are for use with --eval2score and --score2eval."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_IsUsed(go, "--baseZ") ) { z_val = 1000000 * 2 * (long)(esl_opt_GetInteger(go, "--baseZ")); } else if (esl_opt_IsUsed(go, "--baseZ1") ) { z_val = 1000000 * (long)(esl_opt_GetInteger(go, "--baseZ1")); } else { z_val = esl_opt_GetInteger(go, "-Z"); } /* Initializations: open the HMM file */ status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf); if (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status == eslEFORMAT) p7_Fail("File format problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf); else if (status != eslOK) p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n", status, hmmfile, errbuf); /* Main body: read HMMs one at a time, print one line of stats */ printf("#\n"); printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s", "idx", "name", "accession", "nseq", "eff_nseq", "M", "relent", "info", "p relE", "compKL"); if (do_eval2score) printf (" %6s %6.2g", "sc for", e_val); if (do_score2eval) printf (" %6s %6.2f", "E-val for", s_val); printf("\n"); printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s", "----", "--------------------", "------------", "--------", "--------", "------", "------", "------", "------", "------"); if (do_eval2score) printf (" %13s", "-------------"); if (do_score2eval) printf (" %13s", "-------------"); printf("\n"); nhmm = 0; while ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslEOF) { if (status == eslEOD) esl_fatal("read failed, HMM file %s may be truncated?", hmmfile); else if (status == eslEFORMAT) esl_fatal("bad file format in HMM file %s", hmmfile); else if (status == eslEINCOMPAT) esl_fatal("HMM file %s contains different alphabets", hmmfile); else if (status != eslOK) esl_fatal("Unexpected error in reading HMMs from %s", hmmfile); nhmm++; if ( esl_opt_IsOn(go, "--eval2score") || esl_opt_IsOn(go, "--score2eval") ) { if (esl_opt_IsUsed(go, "--baseZ") || esl_opt_IsUsed(go, "--baseZ1" ) ) { if ( hmm->abc->type != eslRNA && hmm->abc->type != eslDNA) { puts("The flags --baseZ and --baseZ1 can't be used with non-nucleotide models."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } } else if ( hmm->abc->type != eslAMINO && hmm->abc->type != eslRNA && hmm->abc->type != eslDNA) { puts("The flags --eval2score and --score2eval can't be used with non-sequence models."); esl_usage(stdout, argv[0], usage); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } } if (esl_opt_IsUsed(go, "--baseZ") ) { nseq = (float)z_val / (float)(hmm->max_length); } else if (esl_opt_IsUsed(go, "--baseZ1") ) { nseq = (float)z_val / (float)(hmm->max_length); } else { nseq = z_val; } if (bg == NULL) bg = p7_bg_Create(abc); p7_MeanPositionRelativeEntropy(hmm, bg, &x); p7_hmm_CompositionKLDist(hmm, bg, &KL, NULL); printf("%-6d %-20s %-12s %8d %8.2f %6d %6.2f %6.2f %6.2f %6.2f", nhmm, hmm->name, hmm->acc == NULL ? "-" : hmm->acc, hmm->nseq, hmm->eff_nseq, hmm->M, p7_MeanMatchRelativeEntropy(hmm, bg), p7_MeanMatchInfo(hmm, bg), x, KL); if ( esl_opt_IsOn(go, "--eval2score") ) { float sc; sc = esl_exp_invsurv( e_val / nseq , hmm->evparam[p7_FTAU], hmm->evparam[p7_FLAMBDA]); printf("%13.2f", sc); } else if ( esl_opt_IsOn(go, "--score2eval") ) { float e; e = nseq * esl_exp_surv( s_val , hmm->evparam[p7_FTAU], hmm->evparam[p7_FLAMBDA]); printf("%13.2g", e); } printf("\n"); /* p7_MeanForwardScore(hmm, bg)); */ p7_hmm_Destroy(hmm); } p7_bg_Destroy(bg); esl_alphabet_Destroy(abc); p7_hmmfile_Close(hfp); esl_getopts_Destroy(go); exit(0); }