#include "p7_config.h" #include #include "easel.h" #include "esl_alphabet.h" #include "esl_gumbel.h" #include "esl_sq.h" #include "hmmer.h" /* hit_sorter(): qsort's pawn, below */ static int FM_hit_sorter(const void *a, const void *b) { // return 2 * (((FM_DIAG*)a)->sortkey > ((FM_DIAG*)b)->sortkey) - 1; // same as the test below if ( ((FM_DIAG*)a)->sortkey > ((FM_DIAG*)b)->sortkey) return 1; else return -1; } /* Function: FM_mergeSeeds() * * Synopsis: Given collection of seeds, sort and merge overlapping ones * * Returns: on success. */ static int FM_mergeSeeds(FM_DIAGLIST *seeds, int N, int ssv_length) { int i; int j = 0; FM_DIAG next; int tmp; int next_is_complement; int curr_is_complement; int curr_n; int curr_k; int curr_len; int curr_end; int curr_diagval; FM_DIAG *diags = seeds->diags; if (seeds->count == 0) return eslOK; //sort, first by direction, then N (position on database sequence), then K (model position) qsort(diags, seeds->count, sizeof(FM_DIAG), FM_hit_sorter); next = diags[0]; curr_is_complement = (next.complementarity == p7_COMPLEMENT); curr_n = next.n; curr_k = next.k; curr_len = next.length; curr_end = curr_n + curr_len - 1; curr_diagval = next.n - next.k; for( i=1; icount; i++) { next = diags[i]; next_is_complement = (next.complementarity == p7_COMPLEMENT); if ( next_is_complement == curr_is_complement //same direction && ( next.n - next.k) == curr_diagval //overlapping diagonals will share the same value of (n - k) && next.n + next.length < curr_n + curr_len + ssv_length //overlapping, or close to it ) { //overlapping diags; extend, if appropriate tmp = next.n + next.length - 1; if (tmp > curr_end) { curr_end = tmp; curr_len = curr_end - curr_n + 1; } } else { //doesn't overlap current diagonal, so store current... diags[j].n = curr_n; diags[j].k = curr_k; diags[j].length = curr_end - curr_n + 1; diags[j].complementarity = curr_is_complement; diags[j].score = 0.0; // ... then start up a new one curr_n = next.n; curr_k = next.k; curr_len = next.length; curr_end = curr_n + curr_len - 1; curr_diagval = next.n - next.k; curr_is_complement = next_is_complement; j++; } } // store final entry diags[j].n = curr_n; diags[j].k = curr_k; diags[j].length = curr_end - curr_n + 1; diags[j].score = 0.0; diags[j].complementarity = curr_is_complement; seeds->count = j+1; return eslOK; } /* Function: FM_backtrackSeed() * * Synopsis: Find position(s) in the FM index for a diagonal that meets score threshold * * Details: Follows the BWT/FM-index until finding an entry of the implicit * suffix array that is found in the sampled SA. * * Args: fmf - FM index for finding matches to the input sequence * fm_cfg - FM-index meta data * i - Single position in the BWT * * Returns: on success. */ static uint32_t FM_backtrackSeed(const FM_DATA *fmf, const FM_CFG *fm_cfg, int i) { int j = i; int len = 0; int c; while ( j != fmf->term_loc && (j % fm_cfg->meta->freq_SA)) { //go until we hit a position in the full SA that was sampled during FM index construction c = fm_getChar( fm_cfg->meta->alph_type, j, fmf->BWT); j = fm_getOccCount (fmf, fm_cfg, j-1, c); j += abs(fmf->C[c]); len++; } return len + (j==fmf->term_loc ? 0 : fmf->SA[ j / fm_cfg->meta->freq_SA ]) ; // len is how many backward steps we had to take to find a sampled SA position } /* Function: FM_getPassingDiags() * * Synopsis: Find position(s) in the FM index for a seed that meets score threshold, keep list * * Details: This step determines the location of each instance of the seed * and creates a diagonal for that instance * * Args: fmf - FM index for finding matches to the input sequence * fmb - FM index for finding matches to the reverse of the input sequence * fm_cfg - FM-index meta data * k - Position of the diagonal in the model * M - Length of the model * depth - Length of the diagonal * fm_direction - which FM is in use for this diag * model_direction - forward or reverse path over the model * complementarity - top or bottom strand * interval - FM-index interval * seeds - RETURN: collection of threshold-passing windows * * Returns: on success. */ static int FM_getPassingDiags(const FM_DATA *fmf, const FM_CFG *fm_cfg, int k, int M, float sc, int depth, int fm_direction, int model_direction, int complementarity, FM_INTERVAL *interval, FM_DIAGLIST *seeds ) { int i; FM_DIAG *seed; //iterate over the forward interval, for each entry backtrack until hitting a sampled suffix array entry for (i = interval->lower; i<= interval->upper; i++) { seed = fm_newSeed(seeds); seed->k = k; seed->length = depth; if (complementarity == p7_NOCOMPLEMENT ) seed->n = fmf->N - FM_backtrackSeed(fmf, fm_cfg, i) - depth - 1; else seed->n = FM_backtrackSeed(fmf, fm_cfg, i) ; seed->complementarity = complementarity; /* seed->n corresponds to the start of the seed in terms of the * target sequence, in a forward direction. seed->k holds the * model position at the beginning of that seed. If model_direction * is fm_reverse, the ->n value is from the beginning of the revcomp, */ if (model_direction == fm_forward) seed->k -= (depth - 1) ; seed->sortkey = (int)( complementarity == p7_COMPLEMENT ? fmf->N + 1 : 0) // makes complement seeds cover a different score range than non-complements + ((int)(seed->n) - (int)(seed->k) ) // unique diagonal within the complement/non-complement score range + ((double)(seed->k)/(double)(M+1)) ; // fractional part, used to sort seeds sharing a diagonal } return eslOK; } /* Function: FM_Recurse() * * Synopsis: Recursively traverse/prune a string trie, testing all strings vs the model * * Details: This is the heart of the FM SSV method. Given a path P on the * trie, we keep track of a compact list of all not-yet-pruned * diagonals in the DP table of the string S corresponding to * P against the model. The preserved diagonals might be for * either a forward or backwards pass over the model and a pass * over either the top or bottom (reverse complemented) strand * of the target sequences. * * Args: depth - how long is the current path * Kp - alphabet size (including ambiguity) * fmf - FM index for finding matches to the input sequence * fmb - FM index for finding matches to the reverse of the input sequence * fm_cfg - FM-index meta data * ssvdata - compact data required for computing SSV scores * sc_threshFM - Score that a short diagonal must pass to warrant extension to a full diagonal * dp_pairs - Compact representation of the surviving diagonals in the DP table * first - The index of the first entry in dp_pairs for the current column of the DP table * last - The index of the last entry in dp_pairs for the current column of the DP table * interval_1 - FM-index interval - used for the standard backwards pass along the BWT (fmf) * interval_2 - FM-index interval - used for the forward pass along the BWT (fmb) * seeds - RETURN: collection of threshold-passing windows * seq - preallocated char* used to capture and print the string for the current path - for debugging only * * Returns: on success. */ static int FM_Recurse( int depth, int Kp, int fm_direction, const FM_DATA *fmf, const FM_DATA *fmb, const FM_CFG *fm_cfg, const P7_SCOREDATA *ssvdata, uint8_t *consensus, float sc_threshFM, FM_DP_PAIR *dp_pairs, int first, int last, FM_INTERVAL *interval_1, FM_INTERVAL *interval_2, FM_DIAGLIST *seeds , char *seq ) { float sc, next_score; int c, i, k; FM_INTERVAL interval_1_new, interval_2_new; uint8_t positive_run = 0; uint8_t consec_consensus = 0; for (c=0; c< fm_cfg->meta->alph_size; c++) {//acgt int dppos = last; seq[depth-1] = fm_cfg->meta->alph[c]; seq[depth] = '\0'; for (i=first; i<=last; i++) { // for each surviving diagonal from the previous round if (dp_pairs[i].model_direction == fm_forward) k = dp_pairs[i].pos + 1; else //fm_backward k = dp_pairs[i].pos - 1; if (dp_pairs[i].complementarity == p7_COMPLEMENT) { next_score = ssvdata->ssv_scores_f[k*Kp + fm_cfg->meta->compl_alph[c]]; } else { next_score = ssvdata->ssv_scores_f[k*Kp + c]; } sc = dp_pairs[i].score + next_score; positive_run = (next_score > 0 ? dp_pairs[i].consec_pos + 1 : 0); consec_consensus = (c == consensus[k] ? dp_pairs[i].consec_consensus+1 : 0); if ( sc >= sc_threshFM || (fm_cfg->consensus_match_req > 0 && consec_consensus == fm_cfg->consensus_match_req) ) { // this is a seed I want to extend interval_1_new.lower = interval_1->lower; interval_1_new.upper = interval_1->upper; if (fm_direction == fm_forward) { if ( interval_1_new.lower >= 0 && interval_1_new.lower <= interval_1_new.upper ) //no use extending a non-existent string fm_updateIntervalReverse( fmf, fm_cfg, c, &interval_1_new); if ( interval_1_new.lower >= 0 && interval_1_new.lower <= interval_1_new.upper ) { //no use passing a non-existent string FM_getPassingDiags(fmf, fm_cfg, k, ssvdata->M, sc, depth, fm_forward, dp_pairs[i].model_direction, dp_pairs[i].complementarity, &interval_1_new, seeds); } } else { // fm_direction == fm_reverse //searching for forward matches on the FM-index interval_2_new.lower = interval_2->lower; interval_2_new.upper = interval_2->upper; //searching for reverse matches on the FM-index if ( interval_1_new.lower >= 0 && interval_1_new.lower <= interval_1_new.upper ) //no use extending a non-existent string fm_updateIntervalForward( fmb, fm_cfg, c, &interval_1_new, &interval_2_new); if ( interval_2_new.lower >= 0 && interval_2_new.lower <= interval_2_new.upper ) { //no use passing a non-existent string FM_getPassingDiags(fmf, fm_cfg, k, ssvdata->M, sc, depth, fm_backward, dp_pairs[i].model_direction, dp_pairs[i].complementarity, &interval_2_new, seeds); } } } else if ( sc <= 0 //some other path in the string enumeration tree will do the job || depth == fm_cfg->max_depth //can't extend anymore, 'cause we've reached the pruning length || ( dp_pairs[i].model_direction == fm_forward && k == ssvdata->M) //can't extend anymore, 'cause we're at the end of the model, going forward || ( dp_pairs[i].model_direction == fm_backward && k == 1 ) //can't extend anymore, 'cause we're at the beginning of the model, going backwards || (depth == dp_pairs[i].score_peak_len + fm_cfg->drop_max_len) //too many consecutive positions with a negative total score contribution (sort of like Xdrop) || (depth > 4 && depth > consec_consensus && (float)sc/(float)depth < fm_cfg->score_density_req) //score density is too low (don't bother checking in the first couple slots || (depth >= 0.7*fm_cfg->max_depth && depth > consec_consensus && (float)sc/(float)depth < sc_threshFM/(float)(fm_cfg->max_depth)) // if we're most of the way across the sequence, and score density is too low, abort -- if the density on the other side is high enough, I'll find it on the reverse sweep || (dp_pairs[i].max_consec_pos < fm_cfg->consec_pos_req && //a seed is expected to have at least one run of positive-scoring matches at least length consec_pos_req; if it hasn't, (see Tue Nov 23 09:39:54 EST 2010) (fm_cfg->consec_pos_req - positive_run) == (fm_cfg->max_depth - depth + 1) // if we're close to the end of the sequence, abort -- if that end does have sufficiently long all-positive run, I'll find it on the reverse sweep ) || (dp_pairs[i].model_direction == fm_forward && ( (depth > (fm_cfg->max_depth - 10)) && sc + ssvdata->opt_ext_fwd[k][fm_cfg->max_depth-depth-1] < sc_threshFM) //can't hit threshold, even with best possible forward extension up to length ssv_req ) || (dp_pairs[i].model_direction == fm_backward && ( (depth > (fm_cfg->max_depth - 10)) && sc + ssvdata->opt_ext_rev[k-1][fm_cfg->max_depth-depth-1] < sc_threshFM ) //can't hit threshold, even with best possible extension up to length ssv_req ) ) { //do nothing - it's been pruned } else { // it's possible to extend this diagonal and reach the threshold score dppos++; dp_pairs[dppos].pos = k; dp_pairs[dppos].score = sc; dp_pairs[dppos].model_direction = dp_pairs[i].model_direction; dp_pairs[dppos].complementarity = dp_pairs[i].complementarity; if (sc > dp_pairs[i].max_score) { dp_pairs[dppos].max_score = sc; dp_pairs[dppos].score_peak_len = depth; } else { dp_pairs[dppos].max_score = dp_pairs[i].max_score; if (sc >= dp_pairs[i].max_score - fm_cfg->drop_lim) dp_pairs[dppos].score_peak_len = depth; // close enough to call it a new peak else dp_pairs[dppos].score_peak_len = dp_pairs[i].score_peak_len; } dp_pairs[dppos].consec_pos = positive_run; dp_pairs[dppos].max_consec_pos = ESL_MAX( positive_run, dp_pairs[i].max_consec_pos); dp_pairs[dppos].consec_consensus = consec_consensus; } } if ( dppos > last ){ // at least one diagonal that might reach threshold score, but hasn't yet, so extend interval_1_new.lower = interval_1->lower; interval_1_new.upper = interval_1->upper; if (fm_direction == fm_forward) { if ( interval_1_new.lower >= 0 && interval_1_new.lower <= interval_1_new.upper ) //no use extending a non-existent string fm_updateIntervalReverse( fmf, fm_cfg, c, &interval_1_new); if ( interval_1_new.lower < 0 || interval_1_new.lower > interval_1_new.upper ) { //that string doesn't exist in fwd index continue; } FM_Recurse(depth+1, Kp, fm_direction, fmf, fmb, fm_cfg, ssvdata, consensus, sc_threshFM, dp_pairs, last+1, dppos, &interval_1_new, NULL, seeds , seq ); } else { // fm_direction == fm_reverse interval_2_new.lower = interval_2->lower; interval_2_new.upper = interval_2->upper; if ( interval_1_new.lower >= 0 && interval_1_new.lower <= interval_1_new.upper ) //no use extending a non-existent string fm_updateIntervalForward( fmb, fm_cfg, c, &interval_1_new, &interval_2_new); if ( interval_1_new.lower < 0 || interval_1_new.lower > interval_1_new.upper ) { //that string doesn't exist in reverse index continue; } FM_Recurse(depth+1, Kp, fm_direction, fmf, fmb, fm_cfg, ssvdata, consensus, sc_threshFM, dp_pairs, last+1, dppos, &interval_1_new, &interval_2_new, seeds , seq ); } } } return eslOK; } /* Function: FM_getSeeds() * * Synopsis: Find short diagonal seeds with score above a modest threshold. * * Details: Given FM configuration , model scoring data , * both forward and backward FM indexes (, ), and * a score threshold , find all seeds in the FMs * that meet the threshold, and place them in the container * . * * This involves building diagonals in both forward and reverse * orientation relative to the model, because the pruning method * includes a score density calculation - sometimes that density * is only found on one end of the hit. This function merely * kickstarts the task of traversing over a trie of all strings * up to some fixed length looking for threshold-passing * diagonals - FM_Recurse() does the hard work. * * Args: fmf - FM index for finding matches to the input sequence * fmb - FM index for finding matches to the reverse of the input sequence * fm_cfg - FM-index meta data * ssvdata - compact data required for computing SSV scores * Kp - Alphabet size (including ambiguity chars) * sc_threshFM - Score that a short diagonal must pass to warrant extension to a full diagonal * strands - p7_STRAND_TOPONLY | p7_STRAND_BOTTOMONLY | p7_STRAND_BOTH * seeds - RETURN: collection of threshold-passing windows * * Returns: on success. */ static int FM_getSeeds ( const FM_DATA *fmf, const FM_DATA *fmb, const FM_CFG *fm_cfg, const P7_SCOREDATA *ssvdata, uint8_t *consensus, int Kp, float sc_threshFM, int strands, FM_DIAGLIST *seeds ) { FM_INTERVAL interval_f1, interval_f2, interval_bk; int i, k; int status; float sc; char *seq; FM_DP_PAIR *dp_pairs_fwd; FM_DP_PAIR *dp_pairs_rev; ESL_ALLOC(dp_pairs_fwd, ssvdata->M * fm_cfg->max_depth * sizeof(FM_DP_PAIR)); // guaranteed to be enough to hold all diagonals ESL_ALLOC(dp_pairs_rev, ssvdata->M * fm_cfg->max_depth * sizeof(FM_DP_PAIR)); ESL_ALLOC(seq, 50*sizeof(char)); for (i=0; imeta->alph_size; i++) { int fwd_cnt=0; int rev_cnt=0; interval_f1.lower = interval_f2.lower = interval_bk.lower = fmf->C[i]; interval_f1.upper = interval_f2.upper = interval_bk.upper = abs(fmf->C[i+1])-1; if (interval_f1.lower<0 ) //none of that character found continue; // This is here for debugging purposes only. Feel free to comment out. seq[0] = fm_cfg->meta->alph[i]; seq[1] = '\0'; // Fill in a DP column for the character c, (compressed so that only positive-scoring entries are kept) // There will be 4 DP columns for each character, (1) fwd-std, (2) fwd-complement, (3) rev-std, (4) rev-complement for (k = 1; k <= ssvdata->M; k++) // there's no need to bother keeping an entry starting at the last position (gm->M) { if (strands != p7_STRAND_BOTTOMONLY) { sc = ssvdata->ssv_scores_f[k*Kp + i]; if (sc>0) { // we'll extend any positive-scoring diagonal /* fwd on model, fwd on FM (really, reverse on FM, but the FM is on a reversed string, so its fwd*/ if (k < ssvdata->M-3) { // don't bother starting a forward diagonal so close to the end of the model //Forward pass on the FM-index dp_pairs_fwd[fwd_cnt].pos = k; dp_pairs_fwd[fwd_cnt].score = sc; dp_pairs_fwd[fwd_cnt].max_score = sc; dp_pairs_fwd[fwd_cnt].score_peak_len = 1; dp_pairs_fwd[fwd_cnt].consec_pos = 1; dp_pairs_fwd[fwd_cnt].max_consec_pos = 1; dp_pairs_fwd[fwd_cnt].consec_consensus = (i==consensus[k] ? 1 : 0); dp_pairs_fwd[fwd_cnt].complementarity = p7_NOCOMPLEMENT; dp_pairs_fwd[fwd_cnt].model_direction = fm_forward; fwd_cnt++; } /* rev on model, rev on FM (the FM is on the unreversed string)*/ if (k > 4) { // don't bother starting a reverse diagonal so close to the start of the model dp_pairs_rev[rev_cnt].pos = k; dp_pairs_rev[rev_cnt].score = sc; dp_pairs_rev[rev_cnt].max_score = sc; dp_pairs_rev[rev_cnt].score_peak_len = 1; dp_pairs_rev[rev_cnt].consec_pos = 1; dp_pairs_rev[rev_cnt].max_consec_pos = 1; dp_pairs_rev[rev_cnt].consec_consensus = (i==consensus[k] ? 1: 0); dp_pairs_rev[rev_cnt].complementarity = p7_NOCOMPLEMENT; dp_pairs_rev[rev_cnt].model_direction = fm_backward; rev_cnt++; } } } // Now do the reverse complement if (strands != p7_STRAND_TOPONLY) { sc = ssvdata->ssv_scores_f[k*Kp + fm_cfg->meta->compl_alph[i]]; if (sc>0) { // we'll extend any positive-scoring diagonal /* rev on model, fwd on FM (really, reverse on FM, but the FM is on a reversed string, so its fwd*/ if (k > 4) { // don't bother starting a reverse diagonal so close to the start of the model dp_pairs_fwd[fwd_cnt].pos = k; dp_pairs_fwd[fwd_cnt].score = sc; dp_pairs_fwd[fwd_cnt].max_score = sc; dp_pairs_fwd[fwd_cnt].score_peak_len = 1; dp_pairs_fwd[fwd_cnt].consec_pos = 1; dp_pairs_fwd[fwd_cnt].max_consec_pos = 1; dp_pairs_fwd[fwd_cnt].consec_consensus = (i==consensus[k] ? 1: 0); dp_pairs_fwd[fwd_cnt].complementarity = p7_COMPLEMENT; dp_pairs_fwd[fwd_cnt].model_direction = fm_backward; fwd_cnt++; } /* fwd on model, rev on FM (the FM is on the unreversed string - complemented)*/ if (k < ssvdata->M-3) { // don't bother starting a forward diagonal so close to the end of the model dp_pairs_rev[rev_cnt].pos = k; dp_pairs_rev[rev_cnt].score = sc; dp_pairs_rev[rev_cnt].max_score = sc; dp_pairs_rev[rev_cnt].score_peak_len = 1; dp_pairs_rev[rev_cnt].consec_pos = 1; dp_pairs_rev[rev_cnt].max_consec_pos = 1; dp_pairs_rev[rev_cnt].consec_consensus = (i==consensus[k] ? 1: 0); dp_pairs_rev[rev_cnt].complementarity = p7_COMPLEMENT; dp_pairs_rev[rev_cnt].model_direction = fm_forward; rev_cnt++; } } } } FM_Recurse ( 2, Kp, fm_forward, fmf, fmb, fm_cfg, ssvdata, consensus, sc_threshFM, dp_pairs_fwd, 0, fwd_cnt-1, &interval_f1, NULL, seeds , seq ); FM_Recurse ( 2, Kp, fm_backward, fmf, fmb, fm_cfg, ssvdata, consensus, sc_threshFM, dp_pairs_rev, 0, rev_cnt-1, &interval_bk, &interval_f2, seeds , seq ); } //merge duplicates FM_mergeSeeds(seeds, fmf->N, fm_cfg->ssv_length); free (dp_pairs_fwd); free (dp_pairs_rev); if (seq) free(seq); return eslOK; ERROR: return eslEMEM; } /* Function: FM_window_from_diag() * * Synopsis: Create a hit window, with sequence-based coordinates, from a diagonal * holding FM-based coordinates * * Details: The submitted diagonal is in FM-based coordinates. Since a single * FM index might be the concatenation of many sequences in the * original, this needs to be converted to coordinates in the * original sequence space (get sequence ID and positions). A diag * might span multiple input strings, so it is broken up as * necessary (usually, only one of these will pan out as a legit * diagonal, but we'll let the next stage sort that out). * * Args: diag - The FM-based diagonal * fm - Data for the FM-index. * meta - FM metadata from the config * windowlist - RETURN: collection of SSV-passing windows, with meta data required for downstream stages. * * Returns: on success. */ static int FM_window_from_diag (FM_DIAG *diag, const FM_DATA *fm, const FM_METADATA *meta, P7_HMM_WINDOWLIST *windowlist) { // if diag->complementarity == p7_NOCOMPLEMENT, these positions are in context of FM->T // otherwise, they're in context of revcomp(FM->T). int status; uint32_t seg_id; uint64_t seg_pos; status = fm_getOriginalPosition (fm, meta, 0, diag->length, diag->complementarity, diag->n, &seg_id, &seg_pos); p7_hmmwindow_new(windowlist, seg_id, seg_pos, diag->n, diag->k+diag->length-1, diag->length, diag->score, diag->complementarity, meta->seq_data[seg_id].length); return eslOK; } /* Function: FM_extendSeed() * Synopsis: Extend seed in both diagonal directions, capturing the score * * Details: Given a diagonal seed found using FM-index traversal (typically * around length 16, with a modest score, but not necessarily enough * to pass the SSV threshold), establish a window around that seed, * and extend it to maximize score (with the constraint of not going * through a long negative scoring stretch). Capture the score of * this extended diagonal. * * Args: diag - The initial seed * fm - Data for the FM-index (only need the forward FM from the * calling function). * ssvdata - Compact data required for computing MSV (SSV) scores * cfg - FM-index meta data * tmp_sq - Sequence object that this function uses for calculations. * Must be pre-allocated. * * Returns: on success. */ static int FM_extendSeed(FM_DIAG *diag, const FM_DATA *fm, const P7_SCOREDATA *ssvdata, FM_CFG *cfg, ESL_SQ *tmp_sq) { uint64_t k,n; int32_t model_start, model_end; int64_t target_start, target_end; int64_t hit_start, max_hit_start, max_hit_end; float sc; float max_sc = 0.0; int c; // this will allow a diagonal to be extended at least 10 bases in each direction, an up to as much as required to allow a diag of length cfg->ssv_length int extend = ESL_MAX(10, cfg->ssv_length - diag->length); //this determines the start and end of the window that we think it's possible we'll extend to the window to (which determines the sequence we'll extract) model_start = ESL_MAX ( 1, diag->k - extend + 1) ; model_end = ESL_MIN( ssvdata->M, diag->k + diag->length + extend - 1 ); target_start = diag->n - (diag->k - model_start); target_end = diag->n + (model_end - diag->k); if (target_start < 0) { model_start -= target_start; target_start = 0; } if (target_end > fm->N-2) { model_end -= target_end - (fm->N-2); target_end = fm->N-2; } fm_convertRange2DSQ(fm, cfg->meta, target_start, target_end-target_start+1, diag->complementarity, tmp_sq, FALSE ); //This finds the highest-scoring sub-diag in the just-determined potential diagonal range. k = model_start; n = 1; sc = 0.0; hit_start = n; for ( ; k <= model_end; k++, n++) { c = tmp_sq->dsq[n]; sc += ssvdata->ssv_scores_f[k*tmp_sq->abc->Kp + c]; if (sc < 0) { sc = 0; hit_start = n+1; } else if (sc > max_sc) { max_sc = sc; max_hit_start = hit_start; max_hit_end = n; } } diag->n = target_start + max_hit_start - 1; diag->k = model_start + max_hit_start - 1; diag->length = max_hit_end - max_hit_start + 1; diag->score = max_sc; return eslOK; } /* Function: p7_SSVFM_longlarget() * Synopsis: Finds windows with SSV scores above given threshold, using FM-index * * Details: Uses FM-index to find high-scoring diagonals (seeds), then extends those * seeds to maximal scoring diagonals (no gaps). Windows meeting the SSV * scoring threshold (usually score s.t. p=0.02) are captured, and passed * on to the Viterbi and Forward stages of the pipeline. * * Args: om - optimized profile * nu - configuration: expected number of hits (use 2.0 as a default) * bg - the background model, required for translating a P-value threshold into a score threshold * F1 - p-value below which a window is captured as being above threshold * fmf - data for forward traversal of the FM-index * fmb - data for backward traversal of the FM-index * fm_cfg - FM-index meta data * ssvdata - compact data required for computing SSV scores * strands - p7_STRAND_TOPONLY | p7_STRAND_BOTTOMONLY | p7_STRAND_BOTH * windowlist - RETURN: collection of SSV-passing windows, with meta data required for downstream stages. * * Returns: on success. * * Throws: if trouble allocating memory for seeds */ int p7_SSVFM_longlarget( P7_OPROFILE *om, float nu, P7_BG *bg, double F1, const FM_DATA *fmf, const FM_DATA *fmb, FM_CFG *fm_cfg, const P7_SCOREDATA *ssvdata, int strands, P7_HMM_WINDOWLIST *windowlist) { float sc_thresh, sc_threshFM; float invP; //, invP_FM; float nullsc; int i; float tloop = logf((float) om->max_length / (float) (om->max_length+3)); float tloop_total = tloop * om->max_length; float tmove = logf( 3.0f / (float) (om->max_length+3)); float tbmk = logf( 2.0f / ((float) om->M * (float) (om->M+1))); float tec = logf(1.0f / nu); FM_DIAG *diag; ESL_SQ *tmp_sq; uint8_t *consensus; FM_DIAGLIST seeds; int status; status = fm_initSeeds(&seeds); if (status != eslOK) ESL_EXCEPTION(eslEMEM, "Error allocating memory for seed list\n"); /* convert the consensus to a collection of ints, so I can test for runs of identity to the consensus */ ESL_ALLOC(consensus, (om->M+1)*sizeof(uint8_t) ); for (i=1; i<=om->M; i++) consensus[i] = om->abc->inmap[(int)(om->consensus[i])]; /* Set false target length. This is a conservative estimate of the length of window that'll * soon be passed on to later phases of the pipeline; used to recover some bits of the score * that we would miss if we left length parameters set to the full target length */ p7_oprofile_ReconfigMSVLength(om, om->max_length); p7_bg_SetLength(bg, om->max_length); p7_bg_NullOne (bg, NULL, om->max_length, &nullsc); tmp_sq = esl_sq_CreateDigital(om->abc); /* * Computing the score required to let P meet the F1 prob threshold * In original code, converting from an SSV score S (the score getting * to state C) to a probability goes like this: * S = XMX(L,p7G_C) * usc = S + tmove + tloop_total * P = f ( (usc - nullsc) / eslCONST_LOG2 , mu, lambda) * and XMX(C) was the diagonal score + tmove + tbmk + tec * and we're computing the threshold score S, so reverse it: * (usc - nullsc) / eslCONST_LOG2 = inv_f( P, mu, lambda) * usc = nullsc + eslCONST_LOG2 * inv_f( P, mu, lambda) * S = usc - tmove - tloop_total - tmove - tbmk - tec * * * Here, I compute threshold with length model based on max_length. Usually, the * length of a window returned by this scan will be 2*max_length-1 or longer. Doesn't * really matter - in any case, both the bg and om models will change with roughly * 1 bit for each doubling of the length model, so they offset. */ invP = esl_gumbel_invsurv(F1, om->evparam[p7_MMU], om->evparam[p7_MLAMBDA]); sc_thresh = (invP * eslCONST_LOG2) + nullsc - (tmove + tloop_total + tmove + tbmk + tec); // invP_FM = esl_gumbel_invsurv(0.5, om->evparam[p7_MMU], om->evparam[p7_MLAMBDA]); // sc_threshFM = ESL_MAX(fm_cfg->scthreshFM, (invP_FM * eslCONST_LOG2) + nullsc - (tmove + tloop_total + tmove + tbmk + tec) ) ; sc_threshFM = fm_cfg->scthreshFM * fm_cfg->sc_thresh_ratio; //get diagonals that score above sc_threshFM status = FM_getSeeds(fmf, fmb, fm_cfg, ssvdata, consensus, om->abc->Kp, sc_threshFM, strands, &seeds ); if (status != eslOK) ESL_EXCEPTION(eslEMEM, "Error allocating memory for seed computation\n"); //now extend those diagonals to find ones scoring above sc_thresh for(i=0; iscore >= sc_thresh) FM_window_from_diag(diag, fmf, fm_cfg->meta, windowlist ); } esl_sq_Destroy(tmp_sq); free(seeds.diags); free(consensus); return eslEOF; ERROR: ESL_EXCEPTION(eslEMEM, "Error allocating memory for SSVFM longtarget\n"); } /*------------------ end, FM_MSV() ------------------------*/ /***************************************************************** * HMMER - Biological sequence analysis with profile HMMs * Version 3.1b2; February 2015 * Copyright (C) 2015 Howard Hughes Medical Institute. * Other copyrights also apply. See the COPYRIGHT file for a full list. * * HMMER is distributed under the terms of the GNU General Public License * (GPLv3). See the LICENSE file for details. *****************************************************************/