/* Defining domain number and coordinates in a significant hit by * posterior sampling and clustering. * * SRE, Wed Jan 9 07:26:34 2008 [Janelia] * SVN $Id: p7_spensemble.c 3474 2011-01-17 13:25:32Z eddys $ */ #include "p7_config.h" #include "easel.h" #include "esl_cluster.h" #include "esl_vectorops.h" #include "hmmer.h" /* Function: p7_spensemble_Create() * Synopsis: Allocates a * Incept: SRE, Wed Jan 9 10:00:14 2008 [Janelia] * * Purpose: Create a new with specified initial * allocation sizes: for the number of sampled * segment pairs, for the range over * which one of a domain's (i,j,k,m) sampled endpoints * falls, and for the number of significant * clusters (domains) that will be defined. * * The values of these initial allocations are only * relevant to tuning memory performance, because the * object is reallocated/grown as needed. You can make * guesses, and the better your guess, the fewer * reallocations will be needed; but everything will work * fine regardless of what these initial allocations are. * * A is designed to be reused for many * target sequences and/or models, to minimize alloc/free * calls. * * Args: init_n - initial allocation for # of sampled segment pairs * init_epc - initial allocation for maximum endpoint range * init_sigc - initial allocation for # of significant clusters, domains * * Returns: a pointer to the new . * * Throws: on allocation failure. */ P7_SPENSEMBLE * p7_spensemble_Create(int init_n, int init_epc, int init_sigc) { P7_SPENSEMBLE *sp = NULL; int status; ESL_ALLOC(sp, sizeof(P7_SPENSEMBLE)); sp->sp = NULL; sp->workspace = NULL; sp->assignment = NULL; sp->epc = NULL; sp->sigc = NULL; sp->nalloc = init_n; sp->epc_alloc = init_epc; sp->nsigc_alloc = init_sigc; ESL_ALLOC(sp->sp, sizeof(struct p7_spcoord_s) * sp->nalloc); ESL_ALLOC(sp->workspace, sizeof(int) * sp->nalloc * 2); /* workspace is 2n */ ESL_ALLOC(sp->assignment, sizeof(int) * sp->nalloc); ESL_ALLOC(sp->epc, sizeof(int) * sp->epc_alloc); ESL_ALLOC(sp->sigc, sizeof(struct p7_spcoord_s) * sp->nsigc_alloc); sp->nsamples = 0; sp->n = 0; sp->nc = 0; sp->nsigc = 0; return sp; ERROR: p7_spensemble_Destroy(sp); return NULL; } /* Function: p7_spensemble_Reuse() * Synopsis: Reinitializes a . * Incept: SRE, Wed Jan 9 10:26:36 2008 [Janelia] * * Purpose: Reinitialize so it can be used again to collect * and process a new segment pair ensemble, without * having to free and reallocate. * * Returns: on success. */ int p7_spensemble_Reuse(P7_SPENSEMBLE *sp) { sp->nsamples = 0; sp->n = 0; sp->nc = 0; sp->nsigc = 0; return eslOK; } /* Function: p7_spsensemble_Add() * Synopsis: Add a new segment pair to a growing ensemble. * Incept: SRE, Wed Jan 9 10:28:04 2008 [Janelia] * * Purpose: Adds a new segment pair to a growing ensemble . * The segment pair is defined by start/end positions * , on a target sequence (1..L), and start/end * positions , on a query model (1..M). * * You also provide the index of which sampled * traceback this segment pair came from; each traceback * contains one or more segment pairs. These * indices start at 0 and they must arrive sequentially: * that is, the caller must all the segment pairs * from traceback sample 0, then all the segment * pairs from traceback sample 1, and so on. * * The reason to enforce sequential addition has to do with * the internals of the ensemble clustering algorithm; * specifically with how it calculates the posterior * probability of a cluster in the ensemble. You can't * calculate the posterior probability of a cluster simply * by dividing the number of segment pairs in a cluster by * the total number of traces, because you can get * "probabilities" of greater than one: sometimes more than * one pair from the same trace get clustered together * (because one domain got split into two or more segment * pairs). Rather, what it does is to count the total * number of traces that have one or more segments in the * cluster, divided by the total number of traces. An * efficient way to implement this is, when counting * segments that belong to a cluster, only increment the * numerator if the segment has a different traceback index * than the last segment we counted in this cluster. (We'd * rather not have to keep track of a table of all the * traceback indices we've seen so far.) * * Args: sp - ensemble to add this segment pair to * sampleidx - index of traceback that this seg pair came from (0..nsamples-1) * i,j - start,end position on target sequence (1..L) * k,m - start,end position on query model (1..M) * * Returns: on success. * * Throws: if the is out of order. * if a reallocation fails. */ int p7_spensemble_Add(P7_SPENSEMBLE *sp, int sampleidx, int i, int j, int k, int m) { int status; if (sampleidx > sp->nsamples) ESL_EXCEPTION(eslEINVAL, "seg pair's is out of order"); else if (sampleidx == sp->nsamples) sp->nsamples++; if (sp->n >= sp->nalloc) { void *p; ESL_RALLOC(sp->sp, p, sizeof(struct p7_spcoord_s) * sp->nalloc * 2); ESL_RALLOC(sp->workspace, p, sizeof(int) * sp->nalloc * 4); /* remember, workspace is 2n */ ESL_RALLOC(sp->assignment, p, sizeof(int) * sp->nalloc * 2); sp->nalloc *= 2; } sp->sp[sp->n].idx = sampleidx; sp->sp[sp->n].i = i; sp->sp[sp->n].j = j; sp->sp[sp->n].k = k; sp->sp[sp->n].m = m; sp->n++; return eslOK; ERROR: return status; } /* struct p7_linkparam_s: * used just within this .c, as part of setting up the clustering problem in * the form that Easel's general SLC algorithm will take it. */ struct p7_linkparam_s { float min_overlap; /* 0.8 means >= 80% overlap of (smaller/larger) segment is required, both in seq and hmm */ int of_smaller; /* TRUE means overlap fraction is w.r.t. smaller segment; FALSE means w.r.t. larger segment */ int max_diagdiff; /* 4 means either start or endpoints of two segments must be within <= 4 diagonals of each other */ float min_posterior; /* 0.25 means a cluster must occur w/ >= 25% posterior probability in the sample to be "significant" */ float min_endpointp; /* 0.02 means choose widest endpoint with post. prob. of at least 2% */ }; /* link_spsamples(): * * Defines the rule used for single linkage clustering of sampled * domain coordinates. (API is dictated by Easel's general single * linkage clustering routine.) */ static int link_spsamples(const void *v1, const void *v2, const void *prm, int *ret_link) { struct p7_spcoord_s *h1 = (struct p7_spcoord_s *) v1; struct p7_spcoord_s *h2 = (struct p7_spcoord_s *) v2; struct p7_linkparam_s *param = (struct p7_linkparam_s *) prm; int nov, n; int d1, d2; /* seq overlap test */ nov = ESL_MIN(h1->j, h2->j) - ESL_MAX(h1->i, h2->i) + 1; /* overlap */ n = (param->of_smaller ? ESL_MIN(h1->j - h1->i + 1, h2->j - h2->i + 1) : /* min length of the two hits */ ESL_MAX(h1->j - h1->i + 1, h2->j - h2->i + 1)); /* max length of the two hits */ if ((float) nov / (float) n < param->min_overlap) { *ret_link = FALSE; return eslOK; } /* hmm overlap test */ nov = ESL_MIN(h1->m, h2->m) - ESL_MAX(h1->k, h2->k); n = (param->of_smaller ? ESL_MIN(h1->m - h1->k + 1, h2->m - h2->k + 1) : ESL_MAX(h1->m - h1->k + 1, h2->m - h2->k + 1)); if ((float) nov / (float) n < param->min_overlap) { *ret_link = FALSE; return eslOK; } /* nearby diagonal test */ d1 = (h1->i - h1->k); d2 = (h2->i - h2->k); if (abs(d1-d2) <= param->max_diagdiff) { *ret_link = TRUE; return eslOK; } d1 = (h1->j - h1->m); d2 = (h2->j - h2->m); if (abs(d1-d2) <= param->max_diagdiff) { *ret_link = TRUE; return eslOK; } *ret_link = FALSE; return eslOK; } /* cluster_orderer() * is the routine that gets passed to qsort() to sort * the significant clusters by order of occurrence on * the target sequence */ static int cluster_orderer(const void *v1, const void *v2) { struct p7_spcoord_s *h1 = (struct p7_spcoord_s *) v1; struct p7_spcoord_s *h2 = (struct p7_spcoord_s *) v2; if (h1->i < h2->i) return -1; else if (h1->i > h2->i) return 1; else return 0; } /* Function: p7_spensemble_Cluster() * Synopsis: Cluster a seg pair ensemble and define domains. * Incept: SRE, Wed Jan 9 11:04:07 2008 [Janelia] * * Purpose: Given a collected segment pair ensemble , cluster it; * identify significant clusters with high posterior probability; * and define consensus endpoints for each significant cluster. * * Clustering is single-linkage. The linkage rule is * controlled by the , , and * parameters. To be linked, two segments * must overlap by a fraction $\geq$ , * relative to either the smaller or larger of the two * segments ( or ), in both their * sequence and model coords, and either the start or end of both * segments must lie within $\leq$ diagonals * of each other. * * The threshold for cluster "significance" is controlled * by the parameter. Clusters with * posterior probability $\geq$ this threshold are called * significant. * * Consensus endpoint definition within a cluster is * controlled by the parameter. The widest * endpoint that has a posterior probability of $\geq * min_endpointp> is chosen; this is done independently for * each coordinate (i,j,k,m). * * A reasonable (and tested) parameterization is * , , , , . * * Args: sp - segment pair ensemble to cluster * min_overlap - linkage requires fractional overlap >= this, in both seq and hmm segments * of_smaller - overlap fraction denominators uses either the smaller (if TRUE) or larger (if FALSE) segment * max_diagdiff - linkage requires that start, end points of both seg pairs are <= this * min_posterior - clusters with posterior prob >= this are defined as significant * min_endpointp - widest endpoint with post prob >= this is defined as consensus endpoint coord * * Returns: the number of significant clusters in <*ret_nclusters>. * The caller can then obtain consensus endpoints for each cluster * by making a series of calls. * */ int p7_spensemble_Cluster(P7_SPENSEMBLE *sp, float min_overlap, int of_smaller, int max_diagdiff, float min_posterior, float min_endpointp, int *ret_nclusters) { struct p7_linkparam_s param; int status; int c; int h; int idx_of_last; int *ninc = NULL; int cwindow_width; int epc_threshold; int imin, jmin, kmin, mmin; int imax, jmax, kmax, mmax; int best_i, best_j, best_k, best_m; /* set up a single linkage clustering problem for Easel's general routine */ param.min_overlap = min_overlap; param.of_smaller = of_smaller; param.max_diagdiff = max_diagdiff; param.min_posterior = min_posterior; param.min_endpointp = min_endpointp; if ((status = esl_cluster_SingleLinkage(sp->sp, sp->n, sizeof(struct p7_spcoord_s), link_spsamples, (void *) ¶m, sp->workspace, sp->assignment, &(sp->nc))) != eslOK) goto ERROR; ESL_ALLOC(ninc, sizeof(int) * sp->nc); /* Look at each cluster in turn; most will be too small to worry about. */ for (c = 0; c < sp->nc; c++) { /* Calculate posterior probability of each cluster. * The extra wrinkle here is that this probability is w.r.t the number of sampled traces; * but the clusters might contain more than one seg pair from a given trace. * That's what the idx_of_last logic is doing, avoiding double-counting. */ idx_of_last = -1; for (ninc[c] = 0, h = 0; h < sp->n; h++) { if (sp->assignment[h] == c) { if (sp->sp[h].idx != idx_of_last) ninc[c]++; idx_of_last = sp->sp[h].idx; } } /* Reject low probability clusters: */ if ((float) ninc[c] / (float) sp->nsamples < min_posterior) continue; /* Find the maximum extent of all seg pairs in this cluster in i,j k,m */ for (imin = 0, h = 0; h < sp->n; h++) if (sp->assignment[h] == c) { if (imin == 0) { imin = imax = sp->sp[h].i; jmin = jmax = sp->sp[h].j; kmin = kmax = sp->sp[h].k; mmin = mmax = sp->sp[h].m; } else { imin = ESL_MIN(imin, sp->sp[h].i); imax = ESL_MAX(imax, sp->sp[h].i); jmin = ESL_MIN(jmin, sp->sp[h].j); jmax = ESL_MAX(jmax, sp->sp[h].j); kmin = ESL_MIN(kmin, sp->sp[h].k); kmax = ESL_MAX(kmax, sp->sp[h].k); mmin = ESL_MIN(mmin, sp->sp[h].m); mmax = ESL_MAX(mmax, sp->sp[h].m); } } /* Set up a window in which we can examine the end point distributions for i,j,k,m in turn, independently */ cwindow_width = ESL_MAX(ESL_MAX(imax-imin+1, jmax-jmin+1), ESL_MAX(kmax-kmin+1, mmax-mmin+1)); if (cwindow_width > sp->epc_alloc) { void *p; ESL_RALLOC(sp->epc, p, sizeof(int) * cwindow_width); sp->epc_alloc = cwindow_width; } epc_threshold = (int) ceilf((float) ninc[c] * min_endpointp); /* round up. freq of >= epc_threshold means we're >= min_p */ /* Identify the leftmost i that has enough endpoints. */ esl_vec_ISet(sp->epc, imax-imin+1, 0); for (h = 0; h < sp->n; h++) if (sp->assignment[h] == c) sp->epc[sp->sp[h].i-imin]++; for (best_i = imin; best_i <= imax; best_i++) if (sp->epc[best_i-imin] >= epc_threshold) break; if (best_i > imax) best_i = imin + esl_vec_IArgMax(sp->epc, imax-imin+1); /* Identify the leftmost k that has enough endpoints */ esl_vec_ISet(sp->epc, kmax-kmin+1, 0); for (h = 0; h < sp->n; h++) if (sp->assignment[h] == c) sp->epc[sp->sp[h].k-kmin]++; for (best_k = kmin; best_k <= kmax; best_k++) if (sp->epc[best_k-kmin] >= epc_threshold) break; if (best_k > kmax) best_k = kmin + esl_vec_IArgMax(sp->epc, kmax-kmin+1); /* Identify the rightmost j that has enough endpoints. */ esl_vec_ISet(sp->epc, jmax-jmin+1, 0); for (h = 0; h < sp->n; h++) if (sp->assignment[h] == c) sp->epc[sp->sp[h].j-jmin]++; for (best_j = jmax; best_j >= jmin; best_j--) if (sp->epc[best_j-jmin] >= epc_threshold) break; if (best_j < jmin) best_j = jmin + esl_vec_IArgMax(sp->epc, jmax-jmin+1); /* Identify the rightmost m that has enough endpoints. */ esl_vec_ISet(sp->epc, mmax-mmin+1, 0); for (h = 0; h < sp->n; h++) if (sp->assignment[h] == c) sp->epc[sp->sp[h].m-mmin]++; for (best_m = mmax; best_m >= mmin; best_m--) if (sp->epc[best_m-mmin] >= epc_threshold) break; if (best_m < mmin) best_m = mmin + esl_vec_IArgMax(sp->epc, mmax-mmin+1); /* If there's no well-defined domain (say, a long stretch of biased composition), the coords above might come out inconsistent; in this case, just reject the domain. */ if (best_i > best_j || best_k > best_m) continue; if (sp->nsigc >= sp->nsigc_alloc) { void *p; ESL_RALLOC(sp->sigc, p, sizeof(struct p7_spcoord_s) * sp->nsigc_alloc * 2); sp->nsigc_alloc *= 2; } sp->sigc[sp->nsigc].i = best_i; sp->sigc[sp->nsigc].j = best_j; sp->sigc[sp->nsigc].k = best_k; sp->sigc[sp->nsigc].m = best_m; sp->sigc[sp->nsigc].idx = c; sp->sigc[sp->nsigc].prob = (float) ninc[c] / (float) sp->nsamples; sp->nsigc++; } /* Now we need to make sure those domains are ordered by start point, * because later we're going to calculate overlaps by i_cur - j_prv */ qsort((void *) sp->sigc, sp->nsigc, sizeof(struct p7_spcoord_s), cluster_orderer); free(ninc); *ret_nclusters = sp->nsigc; return eslOK; ERROR: if (ninc != NULL) free(ninc); *ret_nclusters = 0; return status; } /* Function: p7_spensemble_GetClusterCoords() * Synopsis: Retrieve consensus coords of one significant segment pair cluster. * Incept: SRE, Wed Jan 9 11:39:27 2008 [Janelia] * * Purpose: Retrieve the consensus coords of significant segment pair cluster * from the ensemble , which has already been clustered with * . * * Returns: on success, and the consensus coords are in <*opt_i>, <*opt_j>, * <*opt_k>, and <*opt_m>; the (sampled) posterior probability of the * cluster is in <*opt_p>. All of these returned values are optional; * the caller can pass a for any value it's not interested in * retrieving. */ int p7_spensemble_GetClusterCoords(P7_SPENSEMBLE *sp, int which, int *opt_i, int *opt_j, int *opt_k, int *opt_m, float *opt_p) { if (opt_i != NULL) *opt_i = sp->sigc[which].i; if (opt_j != NULL) *opt_j = sp->sigc[which].j; if (opt_k != NULL) *opt_k = sp->sigc[which].k; if (opt_m != NULL) *opt_m = sp->sigc[which].m; if (opt_p != NULL) *opt_p = sp->sigc[which].prob; return eslOK; } /* Function: p7_spensemble_Destroy() * Synopsis: Deallocate a * Incept: SRE, Wed Jan 9 11:42:01 2008 [Janelia] * * Purpose: Destroys a . */ void p7_spensemble_Destroy(P7_SPENSEMBLE *sp) { if (sp == NULL) return; if (sp->sp != NULL) free(sp->sp); if (sp->workspace != NULL) free(sp->workspace); if (sp->assignment != NULL) free(sp->assignment); if (sp->epc != NULL) free(sp->epc); if (sp->sigc != NULL) free(sp->sigc); free(sp); } /***************************************************************** * Benchmark and example. *****************************************************************/ #ifdef p7SPENSEMBLE_EXAMPLE /* gcc -g -I. -L. -I ../easel -L ../easel -Dp7SPENSEMBLE_EXAMPLE -o example p7_spensemble.c -lhmmer -leasel -lm */ #include "p7_config.h" #include #include #include "easel.h" #include "esl_getopts.h" #include "esl_random.h" #include "esl_alphabet.h" #include "esl_dmatrix.h" #include "esl_sq.h" #include "esl_sqio.h" #include "esl_cluster.h" #include "esl_vectorops.h" #include "esl_stopwatch.h" #include "hmmer.h" static ESL_OPTIONS options[] = { /* name type default env range toggles reqs incomp help docgroup*/ { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 0 }, { "-s", eslARG_INT, "42", NULL, NULL, NULL, NULL, NULL, "set random number seed to ", 0 }, { "-N", eslARG_INT, "1000", NULL, NULL, NULL, NULL, NULL, "number of trace samples to take", 0 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, }; static char usage[] = "[-options] "; static char banner[] = "example, test, benchmark of defining domains by posterior sampling"; int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage); ESL_STOPWATCH *w = esl_stopwatch_Create(); ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); char *hmmfile = esl_opt_GetArg(go, 1); char *seqfile = esl_opt_GetArg(go, 2); int format = eslSQFILE_FASTA; ESL_ALPHABET *abc = NULL; P7_HMMFILE *hfp = NULL; P7_HMM *hmm = NULL; P7_BG *bg = NULL; P7_PROFILE *gm = NULL; P7_OPROFILE *om = NULL; P7_OMX *fwd = NULL; ESL_SQ *sq = NULL; ESL_SQFILE *sqfp = NULL; P7_TRACE *tr = NULL; P7_SPENSEMBLE *sp = NULL; int N = esl_opt_GetInteger(go, "-N"); int t,d,nc; int i,j,k,m; float sc; float prob; int status; /* Read in one HMM */ if (p7_hmmfile_OpenE(hmmfile, NULL, &hfp, NULL) != eslOK) p7_Fail("Failed to open HMM file %s", hmmfile); if (p7_hmmfile_Read(hfp, &abc, &hmm) != eslOK) p7_Fail("Failed to read HMM"); p7_hmmfile_Close(hfp); /* Read in one sequence */ sq = esl_sq_CreateDigital(abc); status = esl_sqfile_Open(seqfile, format, NULL, &sqfp); if (status == eslENOTFOUND) p7_Fail("No such file."); else if (status == eslEFORMAT) p7_Fail("Format unrecognized."); else if (status == eslEINVAL) p7_Fail("Can't autodetect stdin or .gz."); else if (status != eslOK) p7_Fail("Open failed, code %d.", status); if (esl_sqio_Read(sqfp, sq) != eslOK) p7_Fail("Failed to read sequence"); esl_sqfile_Close(sqfp); /* Configure a profile from the HMM */ bg = p7_bg_Create(abc); p7_bg_SetLength(bg, sq->n); gm = p7_profile_Create(hmm->M, abc); om = p7_oprofile_Create(hmm->M, abc); p7_ProfileConfig(hmm, bg, gm, sq->n, p7_LOCAL); p7_oprofile_Convert(gm, om); p7_oprofile_ReconfigLength(om, sq->n); /* Allocate DP matrix for Forward, and run a Forward calculation in it */ fwd = p7_omx_Create(gm->M, sq->n, sq->n); p7_Forward (sq->dsq, sq->n, om, fwd, &sc); /* Allocate a trace container, and an spensemble */ tr = p7_trace_Create(); sp = p7_spensemble_Create(1024, 64, 32); /* Start the stopwatch. Now we're in domain processing steps. */ esl_stopwatch_Start(w); /* Collect N traces, add their domain coords to the ensemble, and cluster */ for (t = 0; t < N; t++) { p7_StochasticTrace(r, sq->dsq, sq->n, om, fwd, tr); p7_trace_Index(tr); for (d = 0; d < tr->ndom; d++) p7_spensemble_Add(sp, t, tr->sqfrom[d], tr->sqto[d], tr->hmmfrom[d], tr->hmmto[d]); p7_trace_Reuse(tr); } p7_spensemble_Cluster(sp, 0.8, TRUE, 4, 0.25, 0.02, &nc); for (d = 0; d < nc; d++) { p7_spensemble_GetClusterCoords(sp, d, &i, &j, &k, &m, &prob); printf("domain %-4d : %6d %6d %6d %6d p=%.4f\n", d, i, j, k, m, prob); } /* Done. */ esl_stopwatch_Stop(w); esl_stopwatch_Display(stdout, w, "# CPU time: "); p7_spensemble_Destroy(sp); p7_trace_Destroy(tr); esl_sq_Destroy(sq); p7_omx_Destroy(fwd); p7_profile_Destroy(gm); p7_oprofile_Destroy(om); p7_bg_Destroy(bg); p7_hmm_Destroy(hmm); esl_stopwatch_Destroy(w); esl_randomness_Destroy(r); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; } #endif /*p7SPENSEMBLE_EXAMPLE*/