/* Clusters tabular data. * * SRE, Mon Nov 3 13:43:29 2008 [Janelia] * SVN $Id: esl-cluster.c 685 2011-05-23 14:27:52Z eddys $ */ #include "esl_config.h" #include #include #include #include #include "easel.h" #include "esl_fileparser.h" #include "esl_getopts.h" #include "esl_keyhash.h" #include "esl_stack.h" #include "esl_tree.h" #include "esl_vectorops.h" static ESL_OPTIONS options[] = { /* name type default env range toggles reqs incomp help docgroup*/ { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL,NULL, NULL, "show brief help on version and usage", 0 }, { "-q", eslARG_INT, "8", NULL, "n>0", NULL,NULL, NULL, "field to read as query name, 1..n", 0 }, { "-t", eslARG_INT, "5", NULL, "n>0", NULL,NULL, NULL, "field to read as target name, 1..n", 0 }, { "-v", eslARG_INT, "1", NULL, "n>0", NULL,NULL, NULL, "field to read as distance value, 1..n", 0 }, { "-x", eslARG_REAL, "1e-4", NULL, "x>0", NULL,NULL, NULL, "clustering threshold", 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, }; static char usage[] = "[-options] "; static char banner[] = "clusters tabular data file"; static void read_keyfile (ESL_GETOPTS *go, char *keyfile, ESL_KEYHASH *kh); static void read_tabfile (ESL_GETOPTS *go, char *tabfile, ESL_KEYHASH *kh, ESL_DMATRIX *D); static void output_clusters(ESL_GETOPTS *go, ESL_TREE *T, ESL_KEYHASH *kh); static void cmdline_failure(char *argv0, ESL_GETOPTS *go, char *format, ...) { va_list argp; va_start(argp, format); vfprintf(stderr, format, argp); va_end(argp); esl_usage(stdout, argv0, usage); puts("\n options:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* printf("\nTo see more help on available options, do %s -h\n\n", argv0); */ exit(1); } static void cmdline_help(char *argv0, ESL_GETOPTS *go) { esl_banner(stdout, argv0, banner); esl_usage (stdout, argv0, usage); puts("\n options:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); exit(0); } int main(int argc, char **argv) { ESL_GETOPTS *go = NULL; char *keyfile = NULL; char *tabfile = NULL; ESL_KEYHASH *kh = esl_keyhash_Create(); int nkeys = 0; ESL_DMATRIX *D = NULL; ESL_TREE *T = NULL; go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], go, "Failed to parse command line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], go, "Error in app configuration: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") ) cmdline_help (argv[0], go); if (esl_opt_ArgNumber(go) != 2) cmdline_failure(argv[0], go, "Incorrect number of command line arguments.\n"); keyfile = esl_opt_GetArg(go, 1); tabfile = esl_opt_GetArg(go, 2); read_keyfile(go, keyfile, kh); nkeys = esl_keyhash_GetNumber(kh); D = esl_dmatrix_Create(nkeys, nkeys); read_tabfile(go, tabfile, kh, D); esl_tree_SingleLinkage(D, &T); //esl_tree_WriteNewick(stdout, T); output_clusters(go, T, kh); esl_tree_Destroy(T); esl_dmatrix_Destroy(D); esl_keyhash_Destroy(kh); esl_getopts_Destroy(go); return 0; } static void read_keyfile(ESL_GETOPTS *go, char *keyfile, ESL_KEYHASH *kh) { ESL_FILEPARSER *efp = NULL; int nline = 0; char *tok = NULL; int toklen; int status; if (esl_fileparser_Open(keyfile, NULL, &efp) != eslOK) esl_fatal("File open failed"); esl_fileparser_SetCommentChar(efp, '#'); while (esl_fileparser_NextLine(efp) == eslOK) { nline++; if (esl_fileparser_GetTokenOnLine(efp, &tok, &toklen) != eslOK) esl_fatal("No token found on line %d", nline); status = esl_keyhash_Store(kh, tok, toklen, NULL); if (status == eslEDUP) esl_fatal("Saw key %s twice: keys must be unique", tok); else if (status != eslOK) esl_fatal("unknown error in storing key %s\n", tok); } esl_fileparser_Close(efp); } static void read_tabfile(ESL_GETOPTS *go, char *tabfile, ESL_KEYHASH *kh, ESL_DMATRIX *D) { ESL_FILEPARSER *efp = NULL; int nline = 0; int vfield = esl_opt_GetInteger(go, "-v"); int qfield = esl_opt_GetInteger(go, "-q"); int tfield = esl_opt_GetInteger(go, "-t"); char *tok; int toklen; int ntok; double value; int qidx, tidx; if (esl_fileparser_Open(tabfile, NULL, &efp) != eslOK) esl_fatal("File open failed"); esl_fileparser_SetCommentChar(efp, '#'); esl_dmatrix_Set(D, eslINFINITY); while (esl_fileparser_NextLine(efp) == eslOK) { nline++; ntok = 0; qidx = tidx = -1; value = eslNaN; while (esl_fileparser_GetTokenOnLine(efp, &tok, &toklen) == eslOK) { ntok++; if (ntok == vfield) value = atof(tok); if (ntok == qfield && esl_keyhash_Lookup(kh, tok, toklen, &qidx) != eslOK) esl_fatal("failed to find query key %s", tok); if (ntok == tfield && esl_keyhash_Lookup(kh, tok, toklen, &tidx) != eslOK) esl_fatal("failed to find target key %s", tok); } if (qidx == -1) esl_fatal("Failed to find query name on line %d (looking for field %d)\n", nline, qfield); if (tidx == -1) esl_fatal("Failed to find target name on line %d (looking for field %d)\n", nline, tfield); if (isnan(value)) esl_fatal("Failed to find value on line %d (looking for field %d)\n", nline, vfield); D->mx[qidx][tidx] = value; if (D->mx[tidx][qidx] == eslINFINITY) D->mx[tidx][qidx] = value; } esl_fileparser_Close(efp); } void output_clusters(ESL_GETOPTS *go, ESL_TREE *T, ESL_KEYHASH *kh) { ESL_STACK *ns = esl_stack_ICreate(); ESL_STACK *s2 = esl_stack_ICreate(); double threshold = esl_opt_GetReal(go, "-x"); int v,v2; int nc = 0; esl_stack_IPush(ns, 0); while (esl_stack_IPop(ns, &v) == eslOK) { /* v may only be an internal node. */ if (T->ld[v] < threshold) { nc++; printf("Cluster %d: %g\n", nc, T->ld[v]); esl_stack_IPush(s2, T->right[v]); esl_stack_IPush(s2, T->left[v]); while (esl_stack_IPop(s2, &v2) == eslOK) { if (v2 <= 0) printf("= %s \t%d\t%g\n", esl_keyhash_Get(kh, -v2), nc, T->ld[v]); else { esl_stack_IPush(s2, T->right[v2]); esl_stack_IPush(s2, T->left[v2]); } } printf("\n\n"); continue; } if (T->right[v] > 0) esl_stack_IPush(ns, T->right[v]); else printf("Singleton:\n= %s\t%c\t%c\n\n", esl_keyhash_Get(kh, -T->right[v]), '-', '-'); if (T->left[v] > 0) esl_stack_IPush(ns, T->left[v]); else printf("Singleton:\n= %s\t%c\t%c\n\n", esl_keyhash_Get(kh, -T->left[v]), '-', '-'); } esl_stack_Destroy(ns); esl_stack_Destroy(s2); }