/* Merge alignments into a single alignment based on their reference (RF) annotation. * * Special care is taken to be consistent with missing data '~' * annotation in two different contexts. * * First, from Infernal 1.1rc2 and later cmalign and cmsearch -A * output alignments: '~' can occur in GC RF and SS_cons * annotation. The '~' after a given nongap/nonmissing RF position x * (and before nongap/nonmissing RF position x+1) must all occur 5' of * all inserted positions (gap: '.') that occur between x and x+1. * * Second, from HMMER3, hmmbuild reads and writes '~' in sequences to * mark the ends of fragments. Any fragment must begin and end with a * contiguous string of 1 or more '~'. '~' characters are not allowed * within sequences anywhere except within these leading and trailing * contiguous stretches. * * esl-alimerge will accept as input and produce as output alignments * that are consistent with both of these conventions. * * EPN, Fri Nov 20 16:28:59 2009 */ #include "esl_config.h" #include #include #include #include "easel.h" #include "esl_alphabet.h" #include "esl_distance.h" #include "esl_fileparser.h" #include "esl_getopts.h" #include "esl_msa.h" #include "esl_msafile.h" #include "esl_msafile2.h" #include "esl_sqio.h" #include "esl_stopwatch.h" #include "esl_vectorops.h" static char banner[] = "merge alignments based on their reference (RF) annotation"; static char usage1[] = "[options] "; static char usage2[] = "[options] --list 1 ali files to merge>\n\ \n\ Input alignments must be in Stockholm or Pfam format.\n\ Ouput format choices\n\ --------------------\n\ stockholm [default]\n\ pfam\n\ a2m\n\ psiblast\n\ afa"; static void read_list_file(char *listfile, char ***ret_alifile_list, int *ret_nalifile); static int update_maxgap_and_maxmis(ESL_MSA *msa, char *errbuf, int clen, int64_t alen, int *maxgap, int *maxmis); static int validate_and_copy_msa_annotation(const ESL_GETOPTS *go, int outfmt, ESL_MSA *mmsa, ESL_MSA **msaA, int clen, int nmsa, int alen_merged, int *maxgap, int *maxmis, char *errbuf); static int add_msa(ESL_MSA *mmsa, char *errbuf, ESL_MSA *msa_to_add, int *maxgap, int *maxmis, int clen, int alen_merged); static int inflate_string_with_gaps_and_missing(char *src_str, int64_t src_len, int64_t dst_len, int *ngapA, char gapchar, int *nmisA, char mischar, char **ret_dst_str); static int inflate_seq_with_gaps(const ESL_ALPHABET *abc, char *src_str, int64_t src_len, int64_t dst_len, int *ngapA, char gapchar, char **ret_dst_str); static int validate_no_nongaps_in_rf_gaps(const ESL_ALPHABET *abc, char *rf_str, char *other_str, int64_t len); static int determine_gap_columns_to_add(ESL_MSA *msa, int *maxgap, int *maxmis, int clen, int **ret_ngapA, int **ret_nmisA, int **ret_neitherA, char *errbuf); static void write_pfam_msa_top(FILE *fp, ESL_MSA *msa); static void write_pfam_msa_gc(FILE *fp, ESL_MSA *msa, int maxwidth); static int64_t maxwidth(char **s, int n); static int rfchar_is_nongap_nonmissing(const ESL_ALPHABET *abc, char rfchar); static ESL_OPTIONS options[] = { /* name type default env range togs reqs incomp help docgroup */ { "--list", eslARG_NONE, FALSE, NULL, NULL, NULL,NULL, NULL, "command-line argument is a file that lists ali files to merge", 99 }, { "-h", eslARG_NONE, FALSE, NULL, NULL, NULL,NULL, NULL, "help; show brief info on version and usage", 1 }, { "-o", eslARG_OUTFILE, NULL, NULL, NULL, NULL,NULL, NULL, "output the final alignment to file , not stdout", 1 }, { "-v", eslARG_NONE, FALSE, NULL, NULL, NULL,"-o", NULL, "print info on merge to stdout; requires -o", 1 }, { "--small", eslARG_NONE, FALSE, NULL, NULL, NULL,NULL, NULL, "use minimal RAM (RAM usage will be independent of aln sizes)", 1 }, { "--rfonly", eslARG_NONE, FALSE, NULL, NULL, NULL,NULL, NULL, "remove all columns that are gaps in GC RF annotation", 1 }, { "--informat", eslARG_STRING, FALSE, NULL, NULL, NULL,NULL, NULL, "NOT YET DISPLAYED", 99 }, { "--outformat",eslARG_STRING, FALSE, NULL, NULL, NULL,NULL, NULL, "specify that output aln be format (see choices above)", 1 }, { "--rna", eslARG_NONE, FALSE, NULL, NULL, NULL,NULL,"--amino,--rna", "alignments to merge are RNA alignments", 1 }, { "--dna", eslARG_NONE, FALSE, NULL, NULL, NULL,NULL,"--amino,--rna", "alignments to merge are DNA alignments", 1 }, { "--amino", eslARG_NONE, FALSE, NULL, NULL, NULL,NULL,"--dna,--rna", "alignments to merge are protein alignments", 1 }, { "--stall", eslARG_NONE, FALSE, NULL, NULL, NULL,NULL, NULL, "arrest after start: for debugging under gdb", 99 }, { 0,0,0,0,0,0,0,0,0,0 }, }; int main(int argc, char **argv) { int status; /* easel return code */ ESL_GETOPTS *go = NULL; /* application configuration */ ESL_ALPHABET *abc = NULL; /* biological alphabet */ char *msafile1 = NULL; /* msa file 1 (stays NULL if --list) */ char *msafile2 = NULL; /* msa file 2 (stays NULL if --list) */ char *listfile = NULL; /* list file name (stays NULL unless --list) */ int infmt = eslMSAFILE_UNKNOWN; /* format code for input alifiles */ int outfmt = eslMSAFILE_UNKNOWN; /* format code for output ali */ ESLX_MSAFILE *afp = NULL; /* open alignment file, normal interface */ ESL_MSAFILE2 *afp2 = NULL; /* open alignment file, small-mem interface */ FILE *ofp; /* output file (default is stdout) */ char **alifile_list = NULL; /* list of alignment files to merge */ int nalifile; /* size of alifile_list */ int do_stall; /* used to stall when debugging */ int fi; /* counter over alignment files */ int ai, ai2; /* counters over alignments */ int nali_tot; /* number of alignments in all files */ int *nali_per_file = NULL; /* [0..nalifile-1] number of alignments per file */ int nseq_tot; /* number of sequences in all alignments */ int nseq_cur; /* number of sequences in current alignment */ int64_t alen_cur; /* length of current alignment */ int64_t *alenA = NULL; /* [0..nali_tot-1] alignment length of input msas (even after * potentially removingeinserts (--rfonly)) */ ESL_MSA **msaA = NULL; /* [0..nali_tot-1] all msas read from all files */ int *maxgap = NULL; /* [0..cpos..cm->clen+1] max number of gap columns * before each consensus position in all alignments */ int *maxmis = NULL; /* [0..cpos..cm->clen+1] max number of missing data columns * before each consensus position in all alignments */ int nalloc = 0; /* current size of msaA */ int chunksize = 10; /* size to increase nalloc by when realloc'ing */ void *tmp; /* for ESL_RALLOC() */ int clen; /* consensus length (non-gap #=GC RF length) of all alignments */ int cur_clen; /* consensus length (non-gap #=GC RF length) of current alignments */ int apos; /* alignment position */ ESL_MSA *mmsa = NULL; /* the merged alignment created by merging all alignments in msaA */ int alen_mmsa; /* number of columns in merged MSA */ char errbuf[eslERRBUFSIZE]; /* buffer for error messages */ char *tmpstr; /* used if -v, for printing file names */ int **usemeA = NULL; /* [0..nali_tot-1][0..alen] used only if --rfonly enabled, for removing gap RF columns */ ESL_STOPWATCH *w = NULL; /* for timing the merge, only used if -o enabled */ int do_small; /* TRUE if --small, operate in special small memory mode, aln seq data is not stored */ int do_rfonly; /* TRUE if --rfonly, output only non-gap RF columns (remove all insert columns) */ int *ngapA = NULL; /* [0..alen] number of insert gap columns to add after each alignment column when merging */ int *nmisA = NULL; /* [0..alen] number of missing data ('~') gap columns to add after each alignment column when merging */ int *neitherA = NULL; /* [0..apos..alen] = ngapA[apos] + nmisA[apos] */ /* output formatting, only relevant if -v */ char *namedashes = NULL; /* string of dashes, an underline */ int ni; /* counter */ int namewidth; /* max width of file name */ int tmp_len; /* current width of merged aln */ /* variables only used in small mode (--small) */ int ngs_cur; /* number of GS lines in current alignment (only used if do_small) */ int gs_exists = FALSE; /* set to TRUE if do_small and any input aln has >= 1 GS line */ int maxname, maxgf, maxgc, maxgr; /* max length of seqname, GF tag, GC tag, GR tag in all input alignments */ int maxname_cur, maxgf_cur, maxgc_cur, maxgr_cur; /* max length of seqname, GF tag, GC tag, GR tag in current input alignment */ int margin; /* total margin length for output msa */ /*********************************************** * Parse command line ***********************************************/ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK || esl_opt_VerifyConfig(go) != eslOK) { printf("Failed to parse command line: %s\n", go->errbuf); esl_usage(stdout, argv[0], usage1); esl_usage(stdout, argv[0], usage2); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if (esl_opt_GetBoolean(go, "-h") ) { esl_banner(stdout, argv[0], banner); esl_usage (stdout, argv[0], usage1); esl_usage (stdout, argv[0], usage2); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, 1, 2, 80); exit(0); } if(((! esl_opt_GetBoolean(go, "--list")) && (esl_opt_ArgNumber(go) != 2)) || (( esl_opt_GetBoolean(go, "--list")) && (esl_opt_ArgNumber(go) != 1))) { printf("Incorrect number of command line arguments.\n"); esl_usage(stdout, argv[0], usage1); esl_usage(stdout, argv[0], usage2); printf("\nTo see more help on available options, do %s -h\n\n", argv[0]); exit(1); } if(esl_opt_GetBoolean(go, "--list")) { listfile = esl_opt_GetArg(go, 1); } else { msafile1 = esl_opt_GetArg(go, 1); msafile2 = esl_opt_GetArg(go, 2); } do_small = (esl_opt_IsOn(go, "--small")) ? TRUE : FALSE; do_rfonly = (esl_opt_IsOn(go, "--rfonly")) ? TRUE : FALSE; /* open output file */ if (esl_opt_GetString(go, "-o") != NULL) { if ((ofp = fopen(esl_opt_GetString(go, "-o"), "w")) == NULL) esl_fatal("Failed to open -o output file %s\n", esl_opt_GetString(go, "-o")); } else ofp = stdout; if (esl_opt_IsOn(go, "--informat")) { infmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--informat")); if (infmt == eslMSAFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --informat", esl_opt_GetString(go, "--informat")); if (do_small && infmt != eslMSAFILE_PFAM) esl_fatal("small memory mode requires Pfam formatted alignments"); } if (esl_opt_IsOn(go, "--outformat")) { outfmt = eslx_msafile_EncodeFormat(esl_opt_GetString(go, "--outformat")); if (outfmt == eslMSAFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --outformat", esl_opt_GetString(go, "--outformat")); if (do_small && outfmt != eslMSAFILE_PFAM) esl_fatal("we can only output Pfam formatted alignments in small memory mode"); } else outfmt = eslMSAFILE_STOCKHOLM; if (do_small) { infmt = eslMSAFILE_PFAM; /* this must be true, else we can't do small memory mode */ outfmt = eslMSAFILE_PFAM; } do_stall = esl_opt_GetBoolean(go, "--stall"); /* a stall point for attaching gdb */ while (do_stall); /* determine file names to merge */ if(listfile != NULL) { /* read list file */ read_list_file(listfile, &alifile_list, &nalifile); if(nalifile == 0) esl_fatal("Failed to read a single alignment file name from %s\n", listfile); } else { /* we're merging two alignment files from command-line */ nalifile = 2; ESL_ALLOC(alifile_list, sizeof(char *) * nalifile); if((status = esl_strdup(msafile1, -1, &(alifile_list[0]))) != eslOK) esl_fatal("Error storing alignment file name %s, error status: %d\n", msafile1, status); if((status = esl_strdup(msafile2, -1, &(alifile_list[1]))) != eslOK) esl_fatal("Error storing alignment file name %s, error status: %d\n", msafile2, status); } /* create and start stopwatch */ if(ofp != stdout) { w = esl_stopwatch_Create(); esl_stopwatch_Start(w); } if(esl_opt_GetBoolean(go, "-v")) { /* determine the longest file name in alifile_list */ namewidth = 9; /* length of 'file name' */ for(fi = 0; fi < nalifile; fi++) { if((status = esl_FileTail(alifile_list[fi], FALSE, &tmpstr)) != eslOK) esl_fatal("Memory allocation error."); namewidth = ESL_MAX(namewidth, strlen(tmpstr)); free(tmpstr); } ESL_ALLOC(namedashes, sizeof(char) * (namewidth+1)); namedashes[namewidth] = '\0'; for(ni = 0; ni < namewidth; ni++) namedashes[ni] = '-'; fprintf(stdout, "# Reading %d alignment files...\n", nalifile); fprintf(stdout, "#\n"); fprintf(stdout, "# %7s %-*s %7s %9s %9s %13s %8s\n", "", namewidth,"", "", "", "", "", "ncols"); fprintf(stdout, "# %7s %-*s %7s %9s %9s %13s %8s\n", "file #", namewidth,"file name", "ali #", "#seq/ali", "ncols/ali", "# seq total", "required"); fprintf(stdout, "# %7s %*s %7s %9s %9s %13s %8s\n", "-------", namewidth, namedashes, "-------", "---------", "---------", "-------------", "--------"); } /* Allocate and initialize */ nalloc = chunksize; ESL_ALLOC(msaA, sizeof(ESL_MSA *) * nalloc); ESL_ALLOC(alenA, sizeof(int64_t) * nalloc); if(do_rfonly) ESL_ALLOC(usemeA, sizeof(int *) * nalloc); /* abc handling is weird. We only use alphabet to define gap characters in this miniapp. * msa's are actually read in text mode. Thus eslRNA suffices for anything. */ if (esl_opt_GetBoolean(go, "--amino")) abc = esl_alphabet_Create(eslAMINO); else if (esl_opt_GetBoolean(go, "--dna")) abc = esl_alphabet_Create(eslDNA); else if (esl_opt_GetBoolean(go, "--rna")) abc = esl_alphabet_Create(eslRNA); else if (do_small) abc = esl_alphabet_Create(eslRNA); /* alphabet is only used to define gap characters, so (in this miniapp) we're okay specifying RNA for any alignment (even non-RNA ones) */ else abc = esl_alphabet_Create(eslRNA); /* ditto */ /**************************************************************** * Read alignments one at a time, storing them all, separately * ****************************************************************/ ai = 0; nseq_tot = 0; maxname = maxgf = maxgc = maxgr = 0; ESL_ALLOC(nali_per_file, sizeof(int) * nalifile); esl_vec_ISet(nali_per_file, nalifile, 0); for (fi = 0; fi < nalifile; fi++) { if (do_small) { status = esl_msafile2_Open(alifile_list[fi], NULL, &afp2); if (status == eslENOTFOUND) esl_fatal("Alignment file %s doesn't exist or is not readable\n", alifile_list[fi]); else if (status != eslOK) esl_fatal("Alignment file %s open failed with error %d\n", alifile_list[fi], status); } else { status = eslx_msafile_Open(NULL, alifile_list[fi], NULL, infmt, NULL, &afp); if (status != eslOK) eslx_msafile_OpenFailure(afp, status); } /* while loop: while we have an alignment in current alignment file, (statement looks weird b/c we use a different function if --small) */ while((status = (do_small) ? esl_msafile2_ReadInfoPfam(afp2, NULL, NULL, -1, NULL,NULL, &(msaA[ai]), &nseq_cur, &alen_cur, &ngs_cur, &maxname_cur, &maxgf_cur, &maxgc_cur, &maxgr_cur, NULL, NULL, NULL, NULL, NULL) : eslx_msafile_Read (afp, &(msaA[ai]))) == eslOK) { if(msaA[ai]->rf == NULL) esl_fatal("Error, all alignments must have #=GC RF annotation; alignment %d of file %d does not (%s)\n", nali_per_file[fi], (fi+1), alifile_list[fi]); msaA[ai]->abc = abc; /* msa's are read in text mode, so this is (currently) only used to define gap characters, it doesn't even have to be the correct alphabet. if --small, this is set as RNA regardless of input */ if (do_small) { maxname = ESL_MAX(maxname, maxname_cur); maxgf = ESL_MAX(maxgf, maxgf_cur); maxgc = ESL_MAX(maxgc, maxgc_cur); maxgr = ESL_MAX(maxgr, maxgr_cur); msaA[ai]->alen = alen_cur; if(ngs_cur > 0) gs_exists = TRUE; } else { nseq_cur = msaA[ai]->nseq; } alenA[ai] = msaA[ai]->alen; /* impt if --small and --rfonly, to remember total width of aln to expect in second pass */ nali_per_file[fi]++; nseq_tot += nseq_cur; /* reallocate per-alignment data, if nec */ if((ai+1) == nalloc) { nalloc += chunksize; ESL_RALLOC(msaA, tmp, sizeof(ESL_MSA *) * nalloc); ESL_RALLOC(alenA, tmp, sizeof(int64_t) * nalloc); for(ai2 = ai+1; ai2 < nalloc; ai2++) { msaA[ai2] = NULL; } if(do_rfonly) { ESL_RALLOC(usemeA, tmp, sizeof(int *) * nalloc); for(ai2 = ai+1; ai2 < nalloc; ai2++) { usemeA[ai2] = NULL; } } } /* either store consensus (non-gap RF) length (if first aln), or verify it is what we expect */ cur_clen = 0; for(apos = 0; apos < (int) msaA[ai]->alen; apos++) { if(rfchar_is_nongap_nonmissing(msaA[ai]->abc, msaA[ai]->rf[apos])) cur_clen++; } if(ai == 0) { /* first alignment, store clen, allocate maxgap, maxmis */ clen = cur_clen; ESL_ALLOC(maxgap, sizeof(int) * (clen+1)); esl_vec_ISet(maxgap, (clen+1), 0); ESL_ALLOC(maxmis, sizeof(int) * (clen+1)); esl_vec_ISet(maxmis, (clen+1), 0); /* these will all stay 0 unless we see '~' in the alignments */ } else if(cur_clen != clen) { esl_fatal("Error, all alignments must have identical non-gap #=GC RF lengths; expected (RF length of first ali read): %d,\nalignment %d of file %d length is %d (%s))\n", clen, nali_per_file[fi], (fi+1), cur_clen, alifile_list[fi]); } if(do_rfonly) { /* Remove all columns that are gaps/missing data in the RF annotation, we keep an array of usemes, * one per aln, in case of --small, so we know useme upon second pass of alignment files */ ESL_ALLOC(usemeA[ai], sizeof(int) * (msaA[ai]->alen)); for(apos = 0; apos < msaA[ai]->alen; apos++) { usemeA[ai][apos] = (rfchar_is_nongap_nonmissing(abc, msaA[ai]->rf[apos])) ? TRUE : FALSE; } if((status = esl_msa_ColumnSubset(msaA[ai], errbuf, usemeA[ai])) != eslOK) { esl_fatal("status code: %d removing gap RF columns for msa %d from file %s:\n%s", status, (ai+1), alifile_list[fi], errbuf); } } else { /* --rfonly not enabled, determine max number inserts between each position */ /* update_maxgap_and_maxmis checks to make sure the msa follows our rule for missing data and gaps: * for all positions x..y b/t any two adjacent nongap RF positions (x-1 and y+1 are nongap RF positions) * all missing data columns '~' must come before all gap positions ('.', '-', or '_') */ if((status = update_maxgap_and_maxmis(msaA[ai], errbuf, clen, msaA[ai]->alen, maxgap, maxmis)) != eslOK) esl_fatal(errbuf); } if(esl_opt_GetBoolean(go, "-v")) { if((status = esl_FileTail(alifile_list[fi], FALSE, &tmpstr)) != eslOK) esl_fatal("Memory allocation error."); tmp_len = clen + esl_vec_ISum(maxgap, clen+1) + esl_vec_ISum(maxmis, clen+1); fprintf(stdout, " %7d %-*s %7d %9d %9" PRId64 " %13d %8d\n", (fi+1), namewidth, tmpstr, (ai+1), nseq_cur, msaA[ai]->alen, nseq_tot, tmp_len); free(tmpstr); } ai++; } /* end of while eslx_msafile_Read() loop */ if (do_small) { if (status == eslEFORMAT) esl_fatal("Alignment file %s, parse error:\n%s\n", alifile_list[fi], afp2->errbuf); else if (status == eslEINVAL) esl_fatal("Alignment file %s, parse error:\n%s\n", alifile_list[fi], afp2->errbuf); else if (status != eslEOF) esl_fatal("Alignment file %s, read failed with error code %d\n", alifile_list[fi], status); } else if (status != eslEOF) eslx_msafile_ReadFailure(afp, status); if(nali_per_file[fi] == 0) esl_fatal("Failed to read any alignments from file %s\n", alifile_list[fi]); if (do_small) esl_msafile2_Close(afp2); else eslx_msafile_Close(afp); } /* end of for (fi=0; fi < nalifile; fi++) */ nali_tot = ai; /********************************************* * Merge all alignments into the merged MSA * *********************************************/ /* We allocate space for all sequences, but leave sequences as NULL (nseq = -1). * If (do_small) we didn't store the sequences on the first pass through the * alignment files, and we'll never allocate space for the sequences in mmsa, * we'll just output them as we reread them on another pass through the * individual alignments. If we read >= 1 GS line in any of the input alignments, * we need to do an additional pass through the files, outputting only GS * data. Then, in a final (3rd) pass we'll output aligned data. * * if (!do_small), we have the sequences in memory, we'll copy these * to the merged alignment, freeing them in the orignal msaA alignments * as we go so we never need to allocate the full mmsa while we still have * the individual msas (in msaA[]) in memory. */ mmsa = esl_msa_Create(nseq_tot, -1); alen_mmsa = clen + esl_vec_ISum(maxgap, clen+1) + esl_vec_ISum(maxmis, clen+1); /* Determine what annotation from the input alignments * we will include in the merged MSA. * See comments in header of validate_and_copy_msa_annotation() * for rules on what we include. */ if((status = validate_and_copy_msa_annotation(go, outfmt, mmsa, msaA, nali_tot, clen, alen_mmsa, maxgap, maxmis, errbuf)) != eslOK) esl_fatal("Error while checking and copying individual MSA annotation to merged MSA:%s\n", errbuf); if (do_small) { /* Small memory mode, do up to 2 more passes through the input alignments: * pass 2 will output only GS data at top of output alignment file (only performed if >= 1 GS line read in input files * pass 3 will output all aligned data at to output alignment file (always performed) */ /* output header, comments, and #=GF data */ write_pfam_msa_top(ofp, mmsa); if(ofp != stdout) { if(esl_opt_GetBoolean(go, "-v")) { fprintf(stdout, "#\n"); } fprintf(stdout, "# Outputting merged alignment to file %s ... ", esl_opt_GetString(go, "-o")); fflush(stdout); } /* if there was any GS annotation in any of the individual alignments, * do second pass through alignment files, outputting GS annotation as we go. */ if(gs_exists) { ai = 0; for(fi = 0; fi < nalifile; fi++) { /* we're in small memory mode... */ status = esl_msafile2_Open(alifile_list[fi], NULL, &afp2); /* this should work b/c it did on the first pass */ if (status == eslENOTFOUND) esl_fatal("Second pass, alignment file %s doesn't exist or is not readable\n", alifile_list[fi]); else if (status == eslEFORMAT) esl_fatal("Second pass, couldn't determine format of alignment %s\n", alifile_list[fi]); else if (status != eslOK) esl_fatal("Second pass, alignment file %s open failed with error %d\n", alifile_list[fi], status); for(ai2 = 0; ai2 < nali_per_file[fi]; ai2++) { status = esl_msafile2_RegurgitatePfam(afp2, ofp, maxname, maxgf, maxgc, maxgr, /* max width of a seq name, gf tag, gc tag, gr tag (irrelevant here) */ FALSE, /* regurgitate stockholm header ? */ FALSE, /* regurgitate // trailer ? */ FALSE, /* regurgitate blank lines */ FALSE, /* regurgitate comments */ FALSE, /* regurgitate GF ? */ TRUE, /* regurgitate GS ? */ FALSE, /* regurgitate GC ? */ FALSE, /* regurgitate GR ? */ FALSE, /* regurgitate aseq ? */ NULL, /* regurgitate all seqs, not a subset */ NULL, /* regurgitate all seqs, not a subset */ NULL, /* we're not keeping a subset of columns */ NULL, /* we're not adding all gap columns */ alenA[ai], /* alignment length, as we read it in first pass (inserts may have been removed since then) */ '.', /* gap char, irrelevant */ NULL, /* don't return num seqs read */ NULL); /* don't return num seqs regurgitated */ if(status == eslEOF) esl_fatal("Second pass, error out of alignments too soon, when trying to read aln %d of file %s", ai2, alifile_list[fi]); if(status != eslOK) esl_fatal("Second pass, error reading alignment %d of file %s: %s", ai2, alifile_list[fi], afp2->errbuf); ai++; fflush(ofp); } esl_msafile2_Close(afp2); } fprintf(ofp, "\n"); /* a single blank line to separate GS annotation from aligned data */ } /* do another (either second or third) pass through alignment files, outputting aligned sequence data (and GR) as we go */ ai = 0; for(fi = 0; fi < nalifile; fi++) { status = esl_msafile2_Open(alifile_list[fi], NULL, &afp2); /* this should work b/c it did on the first pass */ if (status == eslENOTFOUND) esl_fatal("Second pass, alignment file %s doesn't exist or is not readable\n", alifile_list[fi]); else if (status == eslEFORMAT) esl_fatal("Second pass, couldn't determine format of alignment %s\n", alifile_list[fi]); else if (status != eslOK) esl_fatal("Second pass, alignment file %s open failed with error %d\n", alifile_list[fi], status); for(ai2 = 0; ai2 < nali_per_file[fi]; ai2++) { /* determine how many all gap columns to insert after each alignment position * of the child msa when copying it to the merged msa */ if(! do_rfonly) { if((status = determine_gap_columns_to_add(msaA[ai], maxgap, maxmis, clen, &(ngapA), &(nmisA), &(neitherA), errbuf)) != eslOK) esl_fatal("error determining number of all gap columns to add to alignment %d of file %s", ai2, alifile_list[fi]); } status = esl_msafile2_RegurgitatePfam(afp2, ofp, maxname, maxgf, maxgc, maxgr, /* max width of a seq name, gf tag, gc tag, gr tag */ FALSE, /* regurgitate stockholm header ? */ FALSE, /* regurgitate // trailer ? */ FALSE, /* regurgitate blank lines */ FALSE, /* regurgitate comments */ FALSE, /* regurgitate GF ? */ FALSE, /* regurgitate GS ? */ FALSE, /* regurgitate GC ? */ TRUE, /* regurgitate GR ? */ TRUE, /* regurgitate aseq ? */ NULL, /* regurgitate all seqs, not a subset */ NULL, /* regurgitate all seqs, not a subset */ (do_rfonly) ? usemeA[ai] : NULL, (do_rfonly) ? NULL : neitherA, alenA[ai], /* alignment length, as we read it in first pass (inserts may have been removed since then) */ '.', NULL, /* don't return num seqs read */ NULL); /* don't return num seqs regurgitated */ if(status == eslEOF) esl_fatal("Second pass, error out of alignments too soon, when trying to read aln %d of file %s", ai2, alifile_list[fi]); if(status != eslOK) esl_fatal("Second pass, error reading alignment %d of file %s: %s", ai2, alifile_list[fi], afp2->errbuf); free(ngapA); free(nmisA); free(neitherA); esl_msa_Destroy(msaA[ai]); msaA[ai] = NULL; ai++; fflush(ofp); } esl_msafile2_Close(afp2); } /* finally, write GC annotation and '//' line */ margin = maxname + 1; if (maxgc > 0 && maxgc+6 > margin) margin = maxgc+6; if (maxgr > 0 && maxname+maxgr+7 > margin) margin = maxname+maxgr+7; write_pfam_msa_gc(ofp, mmsa, margin); } /* end of if(do_small) */ else { /* ! do_small: for each input alignment in msaA[], add all aligned data to mmsa, then free it */ if(esl_opt_GetBoolean(go, "-v")) { fprintf(stdout, "#\n# Merging alignments ... "); fflush(stdout); } for(ai = 0; ai < nali_tot; ai++) { if((status = add_msa(mmsa, errbuf, msaA[ai], maxgap, maxmis, clen, alen_mmsa)) != eslOK) esl_fatal("Error, merging alignment %d of %d:\n%s.", (ai+1), nali_tot, errbuf); esl_msa_Destroy(msaA[ai]); /* note: the aligned sequences will have already been freed in add_msa() */ msaA[ai] = NULL; } if(esl_opt_GetBoolean(go, "-v")) { fprintf(stdout, "done.\n#\n"); fflush(stdout); } mmsa->alen = alen_mmsa; /* it was -1, b/c we filled in each seq as we marched through each msaA[] alignment */ if(ofp != stdout) { fprintf(stdout, "# Saving alignment to file %s ... ", esl_opt_GetString(go, "-o")); } status = eslx_msafile_Write(ofp, mmsa, outfmt); if(status != eslOK) esl_fatal("Error, during alignment output; status code: %d\n", status); } if(ofp != stdout) { fflush(stdout); fclose(ofp); fprintf(stdout, "done\n#\n"); fflush(stdout); esl_stopwatch_Stop(w); esl_stopwatch_Display(stdout, w, "# CPU time: "); } /* clean up and exit */ if(alifile_list != NULL) { for(fi = 0; fi < nalifile; fi++) { if(alifile_list[fi] != NULL) free(alifile_list[fi]); } free(alifile_list); } if(usemeA != NULL) { for(ai = 0; ai < nali_tot; ai++) { free(usemeA[ai]); } free(usemeA); } if(nali_per_file != NULL) free(nali_per_file); if(alenA != NULL) free(alenA); if(namedashes != NULL) free(namedashes); if(msaA != NULL) free(msaA); if(maxgap != NULL) free(maxgap); if(maxmis != NULL) free(maxmis); if(mmsa != NULL) esl_msa_Destroy(mmsa); if(abc != NULL) esl_alphabet_Destroy(abc); if(w != NULL) esl_stopwatch_Destroy(w); esl_getopts_Destroy(go); return 0; ERROR: esl_fatal("Out of memory. Reformat to Pfam with esl-reformat and try esl-alimerge --small."); return eslEMEM; /*NEVERREACHED*/ } /* Function: read_list_file * Date: EPN, Fri Nov 20 16:41:32 2009 * * Read a file listing alignment files to merge. * Store file names in *ret_alifile_list and return it, * return number of files in ret_nalifile and return it. * Each white-space delimited token is considered a * different alignment name. * * Dies if we encounter an error. * * Returns: void. */ void read_list_file(char *listfile, char ***ret_alifile_list, int *ret_nalifile) { int status; ESL_FILEPARSER *efp; char *tok; int nalloc = 10; int chunksize = 10; char **alifile_list = NULL; int n = 0; void *tmp; ESL_ALLOC(alifile_list, sizeof(char *) * nalloc); status = esl_fileparser_Open(listfile, NULL, &efp); if (status == eslENOTFOUND) esl_fatal("List file %s does not exist or is not readable\n", listfile); else if(status == eslEMEM) esl_fatal("Ran out of memory when opening list file %s\n", listfile); else if(status != eslOK) esl_fatal("Error opening list file %s\n", listfile); esl_fileparser_SetCommentChar(efp, '#'); while((status = esl_fileparser_GetToken(efp, &tok, NULL)) != eslEOF) { if(n == nalloc) { nalloc += chunksize; ESL_RALLOC(alifile_list, tmp, sizeof(char *) * nalloc); } if((status = esl_strdup(tok, -1, &(alifile_list[n++]))) != eslOK) { esl_fatal("Error storing alignment file name while reading list file %s, error status: %d\n", listfile, status); } } esl_fileparser_Close(efp); *ret_alifile_list = alifile_list; *ret_nalifile = n; return; ERROR: esl_fatal("Out of memory."); return; /*NOTREACHED*/ } /* Function: update_maxgap_and_maxmis * Date: EPN, Sun Nov 22 09:40:48 2009 * (stolen from Infernal's cmalign.c) * * Update maxgap[] and maxmis[], arrays that keeps track of the max * number of gap columns and missing data columns ('~' gap #=GC RF) * columns before each cpos (consensus (non-gap #=GC RF) column) * * We require that all missing columns between cpos x and x+1 occur before * all gap columns between cpos x and x+1. * * Consensus columns are index [0..cpos..clen]. * * max{gap,mis}[0] is number of gap/missing columns before 1st cpos. * max{gap,mis}[clen-1] is number of gap/missing columns before final cpos. * max{gap,mis}[clen] is number of gap/missing columns after final cpos. * * Returns: eslOK on success. * eslEINVAL if msa->rf is NULL, nongap RF length is not clen of * for any two cpos x and x+1, a gap column precedes a * missing data column. * */ int update_maxgap_and_maxmis(ESL_MSA *msa, char *errbuf, int clen, int64_t alen, int *maxgap, int *maxmis) { int apos; int cpos = 0; int ngap = 0; int nmis = 0; for(apos = 0; apos < alen; apos++) { if(esl_abc_CIsGap(msa->abc, msa->rf[apos])) { ngap++; } else if (esl_abc_CIsMissing(msa->abc, msa->rf[apos])) { nmis++; if(ngap > 0) ESL_FAIL(eslEINVAL, errbuf, "after nongap RF pos %d, %d gap columns precede a missing data column (none should)", cpos, ngap); } else { maxgap[cpos] = ESL_MAX(maxgap[cpos], ngap); maxmis[cpos] = ESL_MAX(maxmis[cpos], nmis); cpos++; ngap = 0; nmis = 0; } } /* update final value, max{ins,el}[clen+1], the number of inserts * after the final consensus position */ maxgap[cpos] = ESL_MAX(maxgap[cpos], ngap); maxmis[cpos] = ESL_MAX(maxmis[cpos], nmis); if(cpos != clen) ESL_FAIL(eslEINVAL, errbuf, "second pass expected clen (%d) not equal to actual clen (%d).\n", clen, cpos); return eslOK; } /* Function: validate_and_copy_msa_annotation * Date: EPN, Tue Nov 24 05:35:50 2009 * * Decide what individual MSA annotation from * the input alignments in msaA[], if any, will be * included in the merged alignment (mmsa) and * add that info to it. * * Rules for what to include in mmsa: * * We include name,desc,acc,author annotation in merged alignment * if it is identical in all msaA[] input alignments. * * We include comments and per-file (GF) annotation if they * are present and identical in all input msaA[] alignments. * * We include per-column (GC) annotation if it is present and * identical *with-respect-to* #=GC RF annotation AND all * the annotation in gap #=GC RF columns in all msaA[] are gaps * ('.'). This also pertains to the following parsed per-column * annotation: ss_cons, sa_cons, pp_cons, and rf. With rf, * de-gapped rf annotation must be identical in all input * alignments period, if it is not we'll die with an error message. * * Per-sequence information and per-residue information is always * included in merged alignment. This is done by add_msa() function. * * Returns: eslOK on success. * eslEINCONCEIVABLE if input/msa is corrupt in some way (example: ngf>0 but gf_tag[0] == NULL) * eslEMEM on memory error * if !eslOK, errbuf is filled before return */ int validate_and_copy_msa_annotation(const ESL_GETOPTS *go, int outfmt, ESL_MSA *mmsa, ESL_MSA **msaA, int nmsa, int clen, int alen_merged, int *maxgap, int *maxmis, char *errbuf) { int status; int j; /* counter over alignment annotations */ int j2; /* counter over alignment annotations */ int ai; /* counter over alignments */ char *dealigned = NULL; /* a temporary, dealigned string */ char *dealigned2 = NULL; /* another temporary, dealigned string */ char *gapped_out = NULL; /* a temporary string with gaps added to fit into merged aln */ int *ngapA = NULL; /* [0..alen] number of insert gap columns to add after each alignment column when merging */ int *nmisA = NULL; /* [0..alen] number of missing data ('~') gap columns to add after each alignment column when merging */ int *neitherA = NULL; /* [0..apos..alen] = ngapA[apos] + nmisA[apos] */ int do_add; int found_tag; int be_verbose = FALSE; /* we only print info about annotation if -v AND we'll actually * output it (as stockholm or pfam) * (we actually don't even need to be in this function if we're * not output in stockholm or pfam...) */ if((esl_opt_GetBoolean(go, "-v")) && (outfmt == eslMSAFILE_STOCKHOLM || outfmt == eslMSAFILE_PFAM)) { be_verbose = TRUE; } if(be_verbose) fprintf(stdout, "#\n"); if(nmsa == 0) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "in validate_and_copy_msa_annotation(): zero child alignments."); /* First, determine how many all gap columns to insert after each alignment position * of the first child msa, so we can (possibly) gap out GC,SS_cons,SA_cons,PP_cons annotation * to appropriate length when adding it to the merged MSA. */ if((status = determine_gap_columns_to_add(msaA[0], maxgap, maxmis, clen, &(ngapA), &(nmisA), &(neitherA), errbuf)) != eslOK) return status; /* Note: esl_strcmp() can handle NULL strings (they are not identical to non-NULL strings) */ /*********************************************************************/ /* Check if name annotation is identical in all alignments */ do_add = TRUE; /* until proven otherwise */ if(msaA[0]->name != NULL) { for(ai = 1; ai < nmsa; ai++) { if(esl_strcmp(msaA[0]->name, msaA[ai]->name) != 0) { do_add = FALSE; break; } } if(do_add) { if(be_verbose) fprintf(stdout, "# Identical name annotation from all alignments transferred to merged alignment.\n"); if((status = esl_strdup(msaA[0]->name, -1, &(mmsa->name))) != eslOK) goto ERROR; } else if(be_verbose) fprintf(stdout, "# Name annotation is not identical in all alignments; not included in merged alignment.\n"); } else if(be_verbose) fprintf(stdout, "# Name annotation absent from (at least) first alignment; not included in merged alignment.\n"); /*********************************************************************/ /* Check if description annotation is identical in all alignments */ do_add = TRUE; /* until proven otherwise */ if(msaA[0]->desc != NULL) { for(ai = 1; ai < nmsa; ai++) { if(esl_strcmp(msaA[0]->desc, msaA[ai]->desc) != 0) { do_add = FALSE; break; } } if(do_add) { if(be_verbose) fprintf(stdout, "# Identical description annotation from all alignments transferred to merged alignment.\n"); if((status = esl_strdup(msaA[0]->desc, -1, &(mmsa->desc))) != eslOK) goto ERROR; } else if(be_verbose) fprintf(stdout, "# Description annotation is not identical in all alignments; not included in merged alignment.\n"); } else if(be_verbose) fprintf(stdout, "# Description annotation absent from (at least) first alignment; not included in merged alignment.\n"); /*********************************************************************/ /* Check if accession annotation is identical in all alignments */ do_add = TRUE; /* until proven otherwise */ if(msaA[0]->acc != NULL) { for(ai = 1; ai < nmsa; ai++) { if(esl_strcmp(msaA[0]->acc, msaA[ai]->acc) != 0) { do_add = FALSE; break; } } if(do_add) { if(be_verbose) fprintf(stdout, "# Identical accession annotation from all alignments transferred to merged alignment.\n"); if((status = esl_strdup(msaA[0]->acc, -1, &(mmsa->acc))) != eslOK) goto ERROR; } else if(be_verbose) fprintf(stdout, "# Accession annotation is not identical in all alignments; not included in merged alignment.\n"); } else if(be_verbose) fprintf(stdout, "# Accession annotation absent from (at least) first alignment; not included in merged alignment.\n"); /*********************************************************************/ /* Check if author annotation is identical in all alignments */ do_add = TRUE; /* until proven otherwise */ if(msaA[0]->au != NULL) { for(ai = 1; ai < nmsa; ai++) { if(esl_strcmp(msaA[0]->au, msaA[ai]->au) != 0) { do_add = FALSE; break; } } if(do_add) { if(be_verbose) fprintf(stdout, "# Identical author annotation from all alignments transferred to merged alignment.\n"); if((status = esl_strdup(msaA[0]->au, -1, &(mmsa->au))) != eslOK) goto ERROR; } else if(be_verbose) fprintf(stdout, "# Author annotation is not identical in all alignments; not included in merged alignment.\n"); } else if(be_verbose) fprintf(stdout, "# Author annotation absent from (at least) first alignment; not included in merged alignment.\n"); /*********************************************************************/ /* Check per-file (GF) annotation, must be present and identical in all msaA[] alignments to be included */ if(msaA[0]->ngf > 0) { for(j = 0; j < msaA[0]->ngf; j++) { do_add = TRUE; /* until proven otherwise */ /* verify that what we think is true is true */ if(msaA[0]->gf_tag[j] == NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpectedly, GF tag %d of msaA[0] is NULL, but msaA[0]->ngf is %d.\n", j, msaA[0]->ngf); if(msaA[0]->gf[j] == NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpectedly, GF annotation %d of msaA[0] is NULL, but msaA[0]->ngf is %d.\n", j, msaA[0]->ngf); for(ai = 1; ai < nmsa; ai++) { found_tag = FALSE; for(j2 = 0; j2 < msaA[ai]->ngf; j2++) { if(esl_strcmp(msaA[0]->gf_tag[j], msaA[ai]->gf_tag[j2]) == 0) { found_tag = TRUE; if(esl_strcmp(msaA[0]->gf[j], msaA[ai]->gf_tag[j2]) != 0) { do_add = FALSE; } break; /* if we found a match, do_add remains TRUE */ } } if(found_tag && do_add) { if(be_verbose) fprintf(stdout, "# Identical GF tag %s annotation from all alignments transferred to merged alignment.\n", msaA[0]->gf_tag[j]); if((status = esl_msa_AddGF(mmsa, msaA[0]->gf_tag[j], -1, msaA[0]->gf[j], -1)) != eslOK) goto ERROR; } else { if(be_verbose) fprintf(stdout, "# GF tag %s annotation from first alignment absent from >= 1 other alignments; not included in merged alignment.\n", msaA[0]->gf_tag[j]); } } } } else if(be_verbose) fprintf(stdout, "# Unparsed GF annotation absent from (at least) first alignment; not included in merged alignment.\n"); /*********************************************************************/ /* Check comments, all must be identically ordered and identical in all msaA[] aligments to include them */ if(msaA[0]->ncomment > 0) { do_add = TRUE; /* until proven otherwise */ /* make sure all alignments have same number of comments */ for(ai = 1; ai < nmsa; ai++) { if(msaA[ai]->ncomment != msaA[0]->ncomment) { do_add = FALSE; break; } } if(do_add) { /* make sure all alignments have identical comments */ for(j = 0; j < msaA[0]->ncomment; j++) { /* verify that what we think is true is true */ if(msaA[0]->comment[j] == NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpectedly, comment %d of msaA[0] is NULL, but msaA[0]->ncomment is %d.\n", j, msaA[0]->ncomment); for(ai = 1; ai < nmsa; ai++) { if(esl_strcmp(msaA[0]->comment[j], msaA[ai]->comment[j]) != 0) { /* comment doesn't match */ do_add = FALSE; break; } } } } if(do_add) { for(j = 0; j < msaA[0]->ncomment; j++) { if((status = esl_msa_AddComment(mmsa, msaA[0]->comment[j], -1))!= eslOK) goto ERROR; } if(be_verbose) fprintf(stdout, "# All alignments have identical comments in the same order. These were transferred to merged alignment.\n"); } else { if(be_verbose) fprintf(stdout, "# Comments are not identical in all alignments; not included in merged alignment.\n"); } } else if(be_verbose) fprintf(stdout, "# No comments in (at least) first alignment; not included in merged alignment.\n"); /*********************************************************************/ /* Check unparsed per-column (GC) annotation, it must include all gaps in gap RF columns and * be identical once gap RF columns are removed in all msaA[] alignments to be included. */ if(msaA[0]->ngc > 0) { for(j = 0; j < msaA[0]->ngc; j++) { do_add = TRUE; /* until proven otherwise */ /* verify that what we think is true is true */ if(msaA[0]->gc_tag[j] == NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpectedly, GC tag %d of msaA[0] is NULL, but msaA[0]->ngf is %d.\n", j, msaA[0]->ngc); if(msaA[0]->gc[j] == NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpectedly, GC annotation %d of msaA[0] is NULL, but msaA[0]->ngf is %d.\n", j, msaA[0]->ngc); /* ensure it does not have non-gaps in gap RF columns */ if(validate_no_nongaps_in_rf_gaps(msaA[0]->abc, msaA[0]->rf, msaA[0]->gc[j], msaA[0]->alen)) { /* returns TRUE if gc[j] has 0 non-gap characters in gap columns of RF annotation */ /* dealign gc line */ if((status = esl_strdup(msaA[0]->gc[j], msaA[0]->alen, &(dealigned))) != eslOK) goto ERROR; if((status = esl_strdealign(dealigned, msaA[0]->rf, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning GC tag %s of msaA[0]", msaA[0]->gc_tag[j]); for(ai = 1; ai < nmsa; ai++) { found_tag = FALSE; for(j2 = 0; j2 < msaA[ai]->ngc; j2++) { if(esl_strcmp(msaA[0]->gc_tag[j], msaA[ai]->gc_tag[j2]) == 0) { found_tag = TRUE; /* ensure it does not have non-gaps in gap RF columns */ if(validate_no_nongaps_in_rf_gaps(msaA[ai]->abc, msaA[ai]->rf, msaA[ai]->gc[j2], msaA[ai]->alen)) { /* returns TRUE if gc[j2] has 0 non-gap characters in gap columns of RF annotation */ /* dealign */ if((status = esl_strdup(msaA[ai]->gc[j2], msaA[ai]->alen, &(dealigned2))) != eslOK) goto ERROR; if((status = esl_strdealign(dealigned2, msaA[ai]->rf, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning GC tag %s of msaA[%d]", msaA[ai]->gc_tag[j2], ai); /* check identity */ if(esl_strcmp(dealigned, dealigned2) != 0) { do_add = FALSE; } free(dealigned2); dealigned2 = NULL; break; /* if we matched, do_add remains TRUE */ } } } } /* end of (for(ai = 1...)) */ if(dealigned != NULL) { free(dealigned); dealigned = NULL; } if(found_tag && do_add) { /* gap out the the GC annotation to fit in merged alignment */ if((status = inflate_string_with_gaps_and_missing(msaA[0]->gc[j], msaA[0]->alen, alen_merged, neitherA, '.', NULL, '~', &(gapped_out))) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding gaps to create GC tag %s annotation for merged alignment.", msaA[0]->gc_tag[j]); if((status = esl_msa_AppendGC(mmsa, msaA[0]->gc_tag[j], gapped_out)) != eslOK) goto ERROR; free(gapped_out); gapped_out = NULL; if(be_verbose) fprintf(stdout, "# Identical GC tag %s annotation from all alignments transferred to merged alignment.\n", msaA[0]->gc_tag[j]); } else { if(be_verbose) fprintf(stdout, "# GC tag %s annotation from first alignment absent from or different in >= 1 other alignments; not included in merged alignment.\n", msaA[0]->gc_tag[j]); } } } /* end of for(j = 0 j < msaA[0]->ngc... */ } /* end of if(msaA[0]->ngc > 0) */ else if(be_verbose) fprintf(stdout, "# Unparsed GC annotation absent from (at least) first alignment; not included in merged alignment.\n"); /*********************************************************************/ /* Check ss_cons: it must include all gaps in gap RF columns and be identical once gap RF columns are removed in all * msaA[] alignments to be included. (Same requirements as unparsed GC annotation, so code block below is analogous to one above). */ if(msaA[0]->ss_cons != NULL) { do_add = TRUE; /* until proven otherwise */ /* ensure it does not have non-gaps in gap RF columns */ if(validate_no_nongaps_in_rf_gaps(msaA[0]->abc, msaA[0]->rf, msaA[0]->ss_cons, msaA[0]->alen)) { /* returns TRUE if ss_cons has 0 non-gap characters in gap columns of RF annotation */ /* dealign ss_cons */ if((status = esl_strdup(msaA[0]->ss_cons, msaA[0]->alen, &(dealigned))) != eslOK) goto ERROR; if((status = esl_strdealign(dealigned, msaA[0]->rf, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning ss_cons of msaA[0]"); for(ai = 1; ai < nmsa; ai++) { if(msaA[ai]->ss_cons == NULL) { do_add = FALSE; break; } /* ss_cons != NULL, ensure it does not have non-gaps in gap RF columns */ if(validate_no_nongaps_in_rf_gaps(msaA[ai]->abc, msaA[ai]->rf, msaA[ai]->ss_cons, msaA[ai]->alen)) { /* returns TRUE if ss_cons has 0 non-gap characters in gap columns of RF annotation */ /* dealign */ if((status = esl_strdup(msaA[ai]->ss_cons, msaA[ai]->alen, &(dealigned2))) != eslOK) goto ERROR; if((status = esl_strdealign(dealigned2, msaA[ai]->rf, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning ss_cons of msaA[%d]", ai); /* check identity */ if(esl_strcmp(dealigned, dealigned2) != 0) { do_add = FALSE; } free(dealigned2); dealigned2 = NULL; break; /* if we matched, do_add remains TRUE */ } } /* end of (for(ai = 1...)) */ if(dealigned != NULL) { free(dealigned); dealigned = NULL; } if(do_add) { /* gap out the the ss_cons to fit in merged alignment, * as a special case, we use '~' in SS_cons as we add missing data and gaps * like we do in RF (for all other annotation we only add gaps) */ if((status = inflate_string_with_gaps_and_missing(msaA[0]->ss_cons, msaA[0]->alen, alen_merged, ngapA, '.', nmisA, '~', &(gapped_out))) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding gaps to create SS_cons annotation for merged alignment."); if(mmsa->ss_cons != NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding SS_cons to merged alignment, it is already non-NULL."); if((status = esl_strdup(gapped_out, alen_merged, &(mmsa->ss_cons))) != eslOK) goto ERROR; free(gapped_out); gapped_out = NULL; if(be_verbose) fprintf(stdout, "# Identical SS_cons annotation from all alignments transferred to merged alignment.\n"); } else { if(be_verbose) fprintf(stdout, "# SS_cons annotation from first alignment absent from or different in >= 1 other alignments; not included in merged alignment.\n"); } } } /* end of if(msaA[0]->ss_cons != NULL) */ else if(be_verbose) fprintf(stdout, "# SS_cons annotation absent from (at least) first alignment; not included in merged alignment.\n"); /*********************************************************************/ /* Check sa_cons: it must include all gaps in gap RF columns and be identical once gap RF columns are removed in all * msaA[] alignments to be included. (Same requirements as unparsed GC annotation, so code block below is analogous to one above). */ if(msaA[0]->sa_cons != NULL) { do_add = TRUE; /* until proven otherwise */ /* ensure it does not have non-gaps in gap RF columns */ if(validate_no_nongaps_in_rf_gaps(msaA[0]->abc, msaA[0]->rf, msaA[0]->sa_cons, msaA[0]->alen)) { /* returns TRUE if sa_cons has 0 non-gap characters in gap columns of RF annotation */ /* dealign sa_cons */ if((status = esl_strdup(msaA[0]->sa_cons, msaA[0]->alen, &(dealigned))) != eslOK) goto ERROR; if((status = esl_strdealign(dealigned, msaA[0]->rf, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning sa_cons of msaA[0]"); for(ai = 1; ai < nmsa; ai++) { if(msaA[ai]->sa_cons == NULL) { do_add = FALSE; break; } /* sa_cons != NULL, ensure it does not have non-gaps in gap RF columns */ if(validate_no_nongaps_in_rf_gaps(msaA[ai]->abc, msaA[ai]->rf, msaA[ai]->sa_cons, msaA[ai]->alen)) { /* returns TRUE if sa_cons has 0 non-gap characters in gap columns of RF annotation */ /* dealign */ if((status = esl_strdup(msaA[ai]->sa_cons, msaA[ai]->alen, &(dealigned2))) != eslOK) goto ERROR; if((status = esl_strdealign(dealigned2, msaA[ai]->rf, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning sa_cons of msaA[%d]", ai); /* check identity */ if(esl_strcmp(dealigned, dealigned2) != 0) { do_add = FALSE; } free(dealigned2); dealigned2 = NULL; break; /* if we matched, do_add remains TRUE */ } } /* end of (for(ai = 1...)) */ if(dealigned != NULL) { free(dealigned); dealigned = NULL; } if(do_add) { /* gap out the the sa_cons to fit in merged alignment */ if((status = inflate_string_with_gaps_and_missing(msaA[0]->sa_cons, msaA[0]->alen, alen_merged, neitherA, '.', NULL, '~', &(gapped_out))) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding gaps to create SA_cons annotation for merged alignment."); if(mmsa->sa_cons != NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding SA_cons to merged alignment, it is already non-NULL."); if((status = esl_strdup(gapped_out, alen_merged, &(mmsa->sa_cons))) != eslOK) goto ERROR; free(gapped_out); gapped_out = NULL; if(be_verbose) fprintf(stdout, "# Identical SA_cons annotation from all alignments transferred to merged alignment.\n"); } else { if(be_verbose) fprintf(stdout, "# SA_cons annotation from first alignment absent from or different in >= 1 other alignments; not included in merged alignment.\n"); } } } /* end of if(msaA[0]->sa_cons != NULL) */ else if(be_verbose) fprintf(stdout, "# SA_cons annotation absent from (at least) first alignment; not included in merged alignment.\n"); /*********************************************************************/ /* Check pp_cons: it must include all gaps in gap RF columns and be identical once gap RF columns are removed in all * msaA[] alignments to be included. (Same requirements as unparsed GC annotation, so code block below is analogous to one above). */ if(msaA[0]->pp_cons != NULL) { do_add = TRUE; /* until proven otherwise */ /* ensure it does not have non-gaps in gap RF columns */ if(validate_no_nongaps_in_rf_gaps(msaA[0]->abc, msaA[0]->rf, msaA[0]->pp_cons, msaA[0]->alen)) { /* returns TRUE if pp_cons has 0 non-gap characters in gap columns of RF annotation */ /* dealign pp_cons */ if((status = esl_strdup(msaA[0]->pp_cons, msaA[0]->alen, &(dealigned))) != eslOK) goto ERROR; if((status = esl_strdealign(dealigned, msaA[0]->rf, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning pp_cons of msaA[0]"); for(ai = 1; ai < nmsa; ai++) { if(msaA[ai]->pp_cons == NULL) { do_add = FALSE; break; } /* pp_cons != NULL, ensure it does not have non-gaps in gap RF columns */ if(validate_no_nongaps_in_rf_gaps(msaA[ai]->abc, msaA[ai]->rf, msaA[ai]->pp_cons, msaA[ai]->alen)) { /* returns TRUE if pp_cons has 0 non-gap characters in gap columns of RF annotation */ /* dealign */ if((status = esl_strdup(msaA[ai]->pp_cons, msaA[ai]->alen, &(dealigned2))) != eslOK) goto ERROR; if((status = esl_strdealign(dealigned2, msaA[ai]->rf, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning pp_cons of msaA[%d]", ai); /* check identity */ if(esl_strcmp(dealigned, dealigned2) != 0) { do_add = FALSE; } free(dealigned2); dealigned2 = NULL; break; /* if we matched, do_add remains TRUE */ } } /* end of (for(ai = 1...)) */ if(dealigned != NULL) { free(dealigned); dealigned = NULL; } if(do_add) { /* gap out the the pp_cons to fit in merged alignment */ if((status = inflate_string_with_gaps_and_missing(msaA[0]->pp_cons, msaA[0]->alen, alen_merged, neitherA, '.', NULL, '~', &(gapped_out))) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding gaps to create PP_cons annotation for merged alignment."); if(mmsa->pp_cons != NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding PP_cons to merged alignment, it is already non-NULL."); if((status = esl_strdup(gapped_out, alen_merged, &(mmsa->pp_cons))) != eslOK) goto ERROR; free(gapped_out); gapped_out = NULL; if(be_verbose) fprintf(stdout, "# Identical PP_cons annotation from all alignments transferred to merged alignment.\n"); } else { if(be_verbose) fprintf(stdout, "# PP_cons annotation from first alignment absent from or different in >= 1 other alignments; not included in merged alignment.\n"); } } } /* end of if(msaA[0]->pp_cons != NULL) */ else if(be_verbose) fprintf(stdout, "# PP_cons annotation absent from (at least) first alignment; not included in merged alignment.\n"); /*********************************************************************/ /* Finally, validate that RF annotation is identical in all alignments after removing gaps. */ if(msaA[0]->rf == NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "All alignments must have #= GC RF annotation."); /* dealign rf */ if((status = esl_strdup(msaA[0]->rf, msaA[0]->alen, &(dealigned))) != eslOK) goto ERROR; if((status = esl_strdealign(dealigned, dealigned, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning RF of msaA[0]"); for(ai = 1; ai < nmsa; ai++) { if(msaA[ai]->rf == NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "All alignments must have #= GC RF annotation."); /* dealign */ if((status = esl_strdup(msaA[ai]->rf, msaA[ai]->alen, &(dealigned2))) != eslOK) goto ERROR; if((status = esl_strdealign(dealigned2, dealigned2, "-_.~", NULL)) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "unexpected error dealigning RF of msaA[%d]", ai); /* check identity */ if(esl_strcmp(dealigned, dealigned2) != 0) { printf("%s\n%s\n", dealigned, dealigned2); ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "All alignments must have identical #=GC RF annotation, once gaps (\".\",\"-\",\"_\") are removed.\nAlignment %d de-gapped RF annotation differs from that of alignment 1.\n%s\n%s", ai+1, dealigned, dealigned2); } if(dealigned2 != NULL) { free(dealigned2); dealigned2 = NULL; } } if(dealigned != NULL) { free(dealigned); dealigned = NULL; } /* gap out the the RF to fit in merged alignment */ if((status = inflate_string_with_gaps_and_missing(msaA[0]->rf, msaA[0]->alen, alen_merged, ngapA, '.', nmisA, '~', &(gapped_out))) != eslOK) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding gaps to create RF annotation for merged alignment."); if(mmsa->rf != NULL) ESL_XFAIL(eslEINCONCEIVABLE, errbuf, "Error adding RF to merged alignment, it is already non-NULL."); if((status = esl_strdup(gapped_out, alen_merged, &(mmsa->rf))) != eslOK) goto ERROR; free(gapped_out); gapped_out = NULL; if(be_verbose) fprintf(stdout, "# Identical RF annotation from all alignments transferred to merged alignment.\n"); if(dealigned != NULL) free(dealigned); if(dealigned2 != NULL) free(dealigned2); if(gapped_out != NULL) free(gapped_out); if(ngapA != NULL) free(ngapA); if(nmisA != NULL) free(nmisA); if(neitherA != NULL) free(neitherA); return eslOK; ERROR: if(dealigned != NULL) free(dealigned); if(dealigned2 != NULL) free(dealigned2); if(gapped_out != NULL) free(gapped_out); if(ngapA != NULL) free(ngapA); if(nmisA != NULL) free(nmisA); if(neitherA != NULL) free(neitherA); return status; } /* Function: add_msa * Date: EPN, Mon Nov 23 05:54:37 2009 * * Add a "child" MSA we read from a file to the merged * MSA - the merged alignment that we'll eventually * output. We free each string in the child as soon * as we've added it to the merged, to save memory. * * We add all sequence data (aseq), and per sequence * annotation, including sqname, sqdesc, sqacc, pp, ss, * sa, as well as non-parsed GS and GR annotation. * * [0..clen] is an array specifying the * number of inserted columns necessary between * each consensus position. * * max{gap,mis}[0] is number of gaps/missing columns before 1st cpos. * max{gap,mis}[clen-1] is number of gaps/missing columns before final cpos. * max{gap,mis}[clen] is number of gaps/missing columns after final cpos. * * is the number of columns in the merged * alignment. This is the non-gap RF length plus the * sum of the maxgap vector. * * Returns: eslOK on success. * eslEMEM on memory allocation failure. */ int add_msa(ESL_MSA *mmsa, char *errbuf, ESL_MSA *msa_to_add, int *maxgap, int *maxmis, int clen, int alen_merged) { int status; int i; /* counter over sequences in msa_to_add */ int j; /* counter over alignment annotations */ int mi; /* counter over sequences in mmsa */ void *tmp; /* for reallocations */ char *tmpstr; /* used for copying GR annotation */ int nseq_existing; /* number of sequences already added to mmsa, by previous calls of this function */ int *ngapA = NULL; /* [0..alen] number of insert gap columns to add after each alignment column when merging */ int *nmisA = NULL; /* [0..alen] number of missing data ('~') gap columns to add after each alignment column when merging */ int *neitherA = NULL; /* [0..apos..alen] = ngapA[apos] + nmisA[apos] */ nseq_existing = mmsa->nseq; /* determine how many all gap columns to insert after each alignment position * of the child msa when copying it to the merged msa */ if((status = determine_gap_columns_to_add(msa_to_add, maxgap, maxmis, clen, &(ngapA), &(nmisA), &(neitherA), errbuf)) != eslOK) return status; nseq_existing = mmsa->nseq; /* Append msa_to_add's sequence data and per-sequence annotation to mmsa after adding necessary gaps */ /* sequence names and aligned sequence data (digitized) */ for(i = 0, mi = nseq_existing; i < msa_to_add->nseq; i++, mi++) { esl_strdup(msa_to_add->sqname[i], -1, &(mmsa->sqname[mi])); status = inflate_seq_with_gaps(msa_to_add->abc, msa_to_add->aseq[i], msa_to_add->alen, alen_merged, neitherA, '.', &(mmsa->aseq[mi])); if (status == eslEMEM) ESL_XFAIL(status, errbuf, "Out of memory adding sequence %s", mmsa->sqname[mi]); else if(status != eslOK) ESL_XFAIL(status, errbuf, "Found internal missing data symbols in seq: %s", mmsa->sqname[mi]); free(msa_to_add->aseq[i]); /* free immediately */ msa_to_add->aseq[i] = NULL; } /* parsed annotation that is optional */ /* sqacc */ if(msa_to_add->sqacc != NULL) { if(mmsa->sqacc == NULL) { /* allocate for all sequences, even ones added in previous calls to add_msa() */ ESL_ALLOC(mmsa->sqacc, sizeof(char *) * (nseq_existing + msa_to_add->nseq)); for(mi = 0; mi < nseq_existing; mi++) { mmsa->sqacc[mi] = NULL; } } else { /* reallocate; to add space for new seqs */ ESL_RALLOC(mmsa->sqacc, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq)); } for(i = 0, mi = nseq_existing; i < msa_to_add->nseq; i++, mi++) { if(msa_to_add->sqacc[i] != NULL) { if((status = esl_strdup(msa_to_add->sqacc[i], -1, &(mmsa->sqacc[mi]))) != eslOK) ESL_XFAIL(status, errbuf, "Memory allocation error when copying sequence accession number %d.\n", i+1); free(msa_to_add->sqacc[i]); /* free immediately */ msa_to_add->sqacc[i] = NULL; } else { mmsa->sqacc[mi] = NULL; } } } /* end of if(msa_to_add->sqacc != NULL */ else if(mmsa->sqacc != NULL) { /* msa_to_add->sqacc == NULL, but mmsa->sqacc != NULL, reallocate mmsa->sqacc, and set new ones to NULL */ ESL_RALLOC(mmsa->sqacc, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq)); for(mi = nseq_existing; mi < nseq_existing + msa_to_add->nseq; mi++) { mmsa->sqacc[mi] = NULL; } } /* sqdesc */ if(msa_to_add->sqdesc != NULL) { if(mmsa->sqdesc == NULL) { /* allocate for all sequences, even ones added in previous calls to add_msa() */ ESL_ALLOC(mmsa->sqdesc, sizeof(char *) * (nseq_existing + msa_to_add->nseq)); for(mi = 0; mi < nseq_existing; mi++) { mmsa->sqdesc[mi] = NULL; } } else { /* reallocate; to add space for new seqs */ ESL_RALLOC(mmsa->sqdesc, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq)); } for(i = 0, mi = nseq_existing; i < msa_to_add->nseq; i++, mi++) { if(msa_to_add->sqdesc[i] != NULL) { if((status = esl_strdup(msa_to_add->sqdesc[i], -1, &(mmsa->sqdesc[mi]))) != eslOK) ESL_XFAIL(status, errbuf, "Memory allocation error when copying sequence description number %d.\n", i+1); free(msa_to_add->sqdesc[i]); /* free immediately */ msa_to_add->sqdesc[i] = NULL; } else { mmsa->sqdesc[mi] = NULL; } } } /* end of if(msa_to_add->sqdesc != NULL */ else if(mmsa->sqdesc != NULL) { /* msa_to_add->sqdesc == NULL, but mmsa->sqdesc != NULL, reallocate mmsa->sqdesc, and set new ones to NULL */ ESL_RALLOC(mmsa->sqdesc, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq)); for(mi = nseq_existing; mi < nseq_existing + msa_to_add->nseq; mi++) { mmsa->sqdesc[mi] = NULL; } } /* per-seq posterior probabilities */ if(msa_to_add->pp != NULL) { if(mmsa->pp == NULL) { /* allocate for all sequences, even ones added in previous calls to add_msa() */ ESL_ALLOC(mmsa->pp, sizeof(char *) * (nseq_existing + msa_to_add->nseq)); for(mi = 0; mi < nseq_existing; mi++) { mmsa->pp[mi] = NULL; } } else { /* reallocate; to add space for new seqs */ ESL_RALLOC(mmsa->pp, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq)); } for(i = 0, mi = nseq_existing; i < msa_to_add->nseq; i++, mi++) { if(msa_to_add->pp[i] != NULL) { if((status = inflate_string_with_gaps_and_missing(msa_to_add->pp[i], msa_to_add->alen, alen_merged, neitherA, '.', NULL, '~', &(mmsa->pp[mi]))) != eslOK) ESL_XFAIL(status, errbuf, "Memory allocation error when copying sequence number %d posterior probabilities.\n", i+1); free(msa_to_add->pp[i]); /* free immediately */ msa_to_add->pp[i] = NULL; } else { mmsa->pp[mi] = NULL; } } } /* end of if(msa_to_add->pp != NULL */ else if(mmsa->pp != NULL) { /* msa_to_add->pp == NULL, but mmsa->pp != NULL, reallocate mmsa->pp, and set new ones to NULL */ ESL_RALLOC(mmsa->pp, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq)); for(mi = nseq_existing; mi < nseq_existing + msa_to_add->nseq; mi++) { mmsa->pp[mi] = NULL; } } /* per-seq secondary structure */ if(msa_to_add->ss != NULL) { if(mmsa->ss == NULL) { /* allocate for all sequences, even ones added in previous calls to add_msa() */ ESL_ALLOC(mmsa->ss, sizeof(char *) * (nseq_existing + msa_to_add->nseq)); for(mi = 0; mi < nseq_existing; mi++) { mmsa->ss[mi] = NULL; } } else { /* reallocate; to add space for new seqs */ ESL_RALLOC(mmsa->ss, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq)); } for(i = 0, mi = nseq_existing; i < msa_to_add->nseq; i++, mi++) { if(msa_to_add->ss[i] != NULL) { if((status = inflate_string_with_gaps_and_missing(msa_to_add->ss[i], msa_to_add->alen, alen_merged, neitherA, '.', NULL, '~', &(mmsa->ss[mi]))) != eslOK) ESL_XFAIL(status, errbuf, "Memory allocation error when copying sequence number %d secondary structure.\n", i+1); free(msa_to_add->ss[i]); /* free immediately */ msa_to_add->ss[i] = NULL; } else { mmsa->ss[mi] = NULL; } } } /* end of if(msa_to_add->ss != NULL */ else if(mmsa->ss != NULL) { /* msa_to_add->ss == NULL, but mmsa->ss != NULL, reallocate mmsa->ss, and set new ones to NULL */ ESL_RALLOC(mmsa->ss, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq)); for(mi = nseq_existing; mi < nseq_existing + msa_to_add->nseq; mi++) { mmsa->ss[mi] = NULL; } } /* per-seq surface accessibility */ if(msa_to_add->sa != NULL) { if(mmsa->sa == NULL) { /* allocate for all sequences, even ones added in previous calls to add_msa() */ ESL_ALLOC(mmsa->sa, sizeof(char *) * (nseq_existing + msa_to_add->nseq)); for(mi = 0; mi < nseq_existing; mi++) { mmsa->sa[mi] = NULL; } } else { /* reallocate; to add space for new seqs */ ESL_RALLOC(mmsa->sa, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq)); } for(i = 0, mi = nseq_existing; i < msa_to_add->nseq; i++, mi++) { if(msa_to_add->sa[i] != NULL) { if((status = inflate_string_with_gaps_and_missing(msa_to_add->sa[i], msa_to_add->alen, alen_merged, neitherA, '.', NULL, '~', &(mmsa->sa[mi]))) != eslOK) ESL_XFAIL(status, errbuf, "Memory allocation error when copying sequence number %d surface accessibility.\n", i+1); free(msa_to_add->sa[i]); /* free immediately */ msa_to_add->sa[i] = NULL; } else { mmsa->sa[mi] = NULL; } } } /* end of if(msa_to_add->sa != NULL */ else if(mmsa->sa != NULL) { /* msa_to_add->sa == NULL, but mmsa->sa != NULL, reallocate mmsa->sa, and set new ones to NULL */ ESL_RALLOC(mmsa->sa, tmp, sizeof(char *) * (nseq_existing + msa_to_add->nseq)); for(mi = nseq_existing; mi < nseq_existing + msa_to_add->nseq; mi++) { mmsa->sa[mi] = NULL; } } /* Unparsed per-sequence (GS) annotation */ if(msa_to_add->ngs > 0) { for(j = 0; j < msa_to_add->ngs; j++) { for(i = 0, mi = nseq_existing; i < msa_to_add->nseq; i++, mi++) { if(msa_to_add->gs[j][i] != NULL) if((status =esl_msa_AddGS(mmsa, msa_to_add->gs_tag[j], -1, mi, msa_to_add->gs[j][i], -1)) != eslOK) ESL_XFAIL(status, errbuf, "Memory allocation error when copying sequence number %d GS annotation.\n", i+1); } free(msa_to_add->gs[j][i]); /* free immediately */ msa_to_add->gs[j][i] = NULL; } } /* caller will free the rest of gs via esl_msa_Destroy() */ /* unparsed per-residue (GR) annotation */ if(msa_to_add->gr != NULL) { for(j = 0; j < msa_to_add->ngr; j++) { for(i = 0, mi = nseq_existing; i < msa_to_add->nseq; i++, mi++) { if(msa_to_add->gr[j][i] != NULL) { if((status = inflate_string_with_gaps_and_missing(msa_to_add->gr[j][i], msa_to_add->alen, alen_merged, neitherA, '.', NULL, '~', &(tmpstr))) != eslOK) ESL_XFAIL(status, errbuf, "Memory allocation error when copying sequence number %d GR annotation.\n", i+1); if((status = esl_msa_AppendGR(mmsa, msa_to_add->gr_tag[j], mi, tmpstr)) != eslOK) ESL_XFAIL(status, errbuf, "Memory allocation error when copying sequence number %d GR annotation.\n", i+1); free(tmpstr); free(msa_to_add->gr[j][i]); /* free immediately */ msa_to_add->gr[j][i] = NULL; } } } } /* caller will free the rest of gr in esl_msa_Destroy() */ /* msa_to_add should be destroyed by caller */ /* update nseq in mmsa */ mmsa->nseq += msa_to_add->nseq; if(ngapA != NULL) free(ngapA); if(nmisA != NULL) free(nmisA); if(neitherA != NULL) free(neitherA); return eslOK; ERROR: if(ngapA != NULL) free(ngapA); return status; } /* inflate_string_with_gaps_and_missing * * Given a string, create a new one that is a copy of it, * but with missing data and gaps added before each position (apos) * as specified by n{gap,mis}A[0..apos..len]. and * specify the gap character and missing data character. Either * ngapA or nmisA can be either be NULL, if so they're treated as if * they're all zeroes (no gaps/missing data will be added anywhere). * * n{gap,mis}A[0] - number of gaps/missing to add before first posn * n{gap,mis}A[apos] - number of gaps/missing to add before posn apos * n{gap,mis}A[src_len] - number of gaps/missing to add after final posn * * By convention, missing data symbols are always put before gap symbols * when more than 1 of each are to be placed after the some nongap RF position. * * ret_str is allocated here. * * Returns eslOK on success. * eslEMEM on memory error. */ int inflate_string_with_gaps_and_missing(char *src_str, int64_t src_len, int64_t dst_len, int *ngapA, char gapchar, int *nmisA, char mischar, char **ret_dst_str) { int status; int src_apos = 0; int dst_apos = 0; int i; char *dst_str; ESL_ALLOC(dst_str, sizeof(char) * (dst_len+1)); dst_str[dst_len] = '\0'; /* add gaps before first position */ if(nmisA) for(i = 0; i < nmisA[0]; i++) dst_str[dst_apos++] = mischar; if(ngapA) for(i = 0; i < ngapA[0]; i++) dst_str[dst_apos++] = gapchar; /* add gaps after every position */ for(src_apos = 0; src_apos < src_len; src_apos++) { dst_str[dst_apos++] = src_str[src_apos]; if(nmisA) for(i = 0; i < nmisA[(src_apos+1)]; i++) dst_str[dst_apos++] = mischar; if(ngapA) for(i = 0; i < ngapA[(src_apos+1)]; i++) dst_str[dst_apos++] = gapchar; } *ret_dst_str = dst_str; return eslOK; ERROR: return eslEMEM; } /* inflate_seq_with_gaps() * * Given an aligned sequence, create a new one that is a copy of it, * but with gaps added before each position (apos) as specified by * ngapA[0..apos..len]. specifies the gap character. * * This function follows the HMMER3 convention of using '~' to mark * fragment sequences. If a sequence is a fragment, the first and final * string of gaps (contiguous gaps) will be marked as '~'. That is, * the 1st column will necessarily be a '~' and the last will be a '~'. * This function takes care to add '~' as gap characters in fragments, * where appropriate, to follow HMMER3 convention. * * This function is importantly different from inflate_string_with_gaps_and_missing() * in that it does not allow internal '~' missing data characters to * be added (which are dictated by nmisA[] in inflate_string_with_gaps_and_missing()) * Those '~' are possible in Infernal output alignments only in * #=GC RF and #=GC SS_cons alignment. * * It only adds '~' as necessary to the beginning and ends of fragments. * * ngapA[0] - number of gaps/missing to add before first posn * ngapA[apos] - number of gaps/missing to add before posn apos * ngapA[src_len] - number of gaps/missing to add after final posn * * ret_str is allocated here. * * Returns eslOK on success. * eslEMEM on memory error. * eslEINVAL if sequence (src_str) has an internal missing * data symbol that violates HMMER3 convention. */ int inflate_seq_with_gaps(const ESL_ALPHABET *abc, char *src_str, int64_t src_len, int64_t dst_len, int *ngapA, char gapchar, char **ret_dst_str) { int status; int src_apos = 0; int dst_apos = 0; int src_flpos = 0; /* position of rightmost '~' in contiguous string that begins at position 0 */ int src_frpos = 0; /* position of leftmost '~' in contiguous string that ends at position src_len-1 */ int i; char *dst_str; char char2add; int i_am_fragment = FALSE; ESL_ALLOC(dst_str, sizeof(char) * (dst_len+1)); dst_str[dst_len] = '\0'; /* determine if sequence is a fragment, and determine src_flpos and src_frpos */ src_flpos = 0; while(src_flpos < src_len && esl_abc_CIsMissing(abc, src_str[src_flpos])) { src_flpos++; } src_flpos--; /* we overshot by 1 */ /* src_flpos is now rightmost '~' in stretch of '~' that begin at position 0, so * it's -1 if sequence is not a fragment (there are no leading '~' in src_str) */ src_frpos = src_len-1; while(src_frpos > -1 && esl_abc_CIsMissing(abc, src_str[src_frpos])) { src_frpos--; } src_frpos++; /* we overshot by 1 */ /* src_frpos is now leftmost '~' in stretch of '~' that end at position src_len-1, so * it's src_len if sequence is not a fragment (there are no trailing '~' in src_str) */ i_am_fragment = (src_flpos != -1 || src_frpos != src_len) ? TRUE : FALSE; /* Now verify that we don't have any internal '~' characters */ for(src_apos = src_flpos+1; src_apos <= src_frpos-1; src_apos++) { if(esl_abc_CIsMissing(abc, src_str[src_apos])) return eslEINVAL; } /* add gaps before first position */ src_apos = 0; if(i_am_fragment) { char2add = ((src_flpos >= src_apos) || (src_frpos <= src_apos)) ? esl_abc_CGetMissing(abc) : gapchar; } else { char2add = gapchar; } if(ngapA) for(i = 0; i < ngapA[0]; i++) dst_str[dst_apos++] = char2add; /* add gaps or missing character after every position */ for(src_apos = 0; src_apos < src_len; src_apos++) { dst_str[dst_apos++] = src_str[src_apos]; if(i_am_fragment) { char2add = ((src_flpos >= src_apos) || (src_frpos <= src_apos)) ? esl_abc_CGetMissing(abc) : gapchar; } else { char2add = gapchar; } for(i = 0; i < ngapA[(src_apos+1)]; i++) dst_str[dst_apos++] = char2add; } *ret_dst_str = dst_str; return eslOK; ERROR: return eslEMEM; } /* validate_no_nongaps_in_rf_gaps * * Given an RF string with gaps defined as by alphabet * and another string of same length. Make sure none of the * positions that are gaps in the RF string are non-gaps in the * other string. Return TRUE if none are. Return FALSE if at * least one is. * * Returns TRUE if 0 characters in in same position * as a gap in are non-gaps. FALSE otherwise. */ int validate_no_nongaps_in_rf_gaps(const ESL_ALPHABET *abc, char *rf_str, char *other_str, int64_t len) { int64_t i; for(i = 0; i < len; i++) { if((! rfchar_is_nongap_nonmissing(abc, rf_str[i])) && (rfchar_is_nongap_nonmissing(abc, other_str[i]))) return FALSE; } return TRUE; } /* determine_gap_columns_to_add * (stolen and slightly modified from Infernal 1.1rc1's cmalign.c) * * Given and , two arrays of the number of gap RF * positions and '~' RF (missing data) after each non-gap RF * (consensus) position in the eventual final merged alignment, * calculate how many inserts and missing data inserts * we need to add at each position of to expand it out to the * appropriate size of the eventual merged alignment. * * max{gap,mis}[0] is number of gaps/missing before 1st cpos in merged aln * max{gap,mis}[cpos] is number of gaps/missing after cpos in merged aln * for cpos = 1..clen * clen is the number of non-gap RF positions in msa (and in eventual merged msa). * * We allocate fill and return ret_ngapA[0..msa->alen], ret_nmisA[0..msa->alen], * and ret_neitherA[0..msa->alen] here. * * ret_n{gap,mis}gapA[0] is number of gaps/missing to add before 1st position of msa * ret_n{gap,mis}gapA[apos] is number of gaps/missing to add after alignment position apos * for apos = 1..msa->alen * * ret_neitherA[apos] = ngapA[apos] + nmisA[apos] * * This is similar to the esl_msa.c helper function of the same name, * but that function does not bother with missing data '~'. * * Returns eslOK on success. * eslEMEM on memory alloaction error * eslEINVAL if gaps occur before missing data b/t any two nongap RF posns * eslERANGE if a value exceeds what we expected (based on earlier * checks before this function was entered). * if !eslOK, errbuf if filled. */ int determine_gap_columns_to_add(ESL_MSA *msa, int *maxgap, int *maxmis, int clen, int **ret_ngapA, int **ret_nmisA, int **ret_neitherA, char *errbuf) { int status; int apos; int prv_cpos = 0; /* alignment position corresponding to consensus position cpos-1 */ int cpos = 0; int ngap = 0; int nmis = 0; int *ngapA = NULL; int *nmisA = NULL; int *neitherA = NULL; /* contract check */ if(maxmis[0] != 0) ESL_FAIL(eslEINVAL, errbuf, "missing characters exist prior to first cpos, this shouldn't happen.\n"); ESL_ALLOC(ngapA, sizeof(int) * (msa->alen+1)); ESL_ALLOC(nmisA, sizeof(int) * (msa->alen+1)); ESL_ALLOC(neitherA, sizeof(int) * (msa->alen+1)); esl_vec_ISet(ngapA, (msa->alen+1), 0); esl_vec_ISet(nmisA, (msa->alen+1), 0); esl_vec_ISet(neitherA, (msa->alen+1), 0); for(apos = 0; apos < msa->alen; apos++) { if(esl_abc_CIsMissing(msa->abc, msa->rf[apos])) { nmis++; if(ngap > 0) ESL_FAIL(eslEINVAL, errbuf, "after nongap RF pos %d, %d gap columns precede a missing data column (none should)", cpos, ngap); } else if(esl_abc_CIsGap(msa->abc, msa->rf[apos])) { ngap++; } else { /* a consensus position */ /* a few sanity checks */ if(ngap > maxgap[cpos]) ESL_FAIL(eslEINCONCEIVABLE, errbuf, "%d inserts before cpos %d greater than max expected (%d).\n", ngap, cpos, maxgap[cpos]); if(nmis > maxmis[cpos]) ESL_FAIL(eslEINCONCEIVABLE, errbuf, "%d EL inserts before cpos %d greater than max expected (%d).\n", nmis, cpos, maxmis[cpos]); if (cpos == 0) { /* gaps and/or missing data before first consensus position: flush right */ if(maxgap[cpos] > 0 && maxmis[cpos] == 0) { /* most common case */ ngapA[prv_cpos] = maxgap[cpos] - ngap; /* flush right gaps (no missing) */ } else if(maxgap[cpos] == 0 && maxmis[cpos] > 0) { nmisA[prv_cpos] = maxmis[cpos] - nmis; /* flush right missing (no gaps) */ } else if(maxgap[cpos] > 0 && maxmis[cpos] > 0) { /* missing data (ELs) is always 5' of gaps */ nmisA[prv_cpos] = maxmis[cpos] - nmis; /* flush right */ ngapA[prv_cpos+nmis] = maxgap[cpos] - ngap; /* flush right, after missing */ } } else { /* gaps and/missing data after interior consensus position (i.e. not before 1st cpos or after last) * Determine where to place inserts and/or missing data. * Handle each of 4 possibilities separately, note that if * maxgap[cpos] == 0 then ngap == 0, and if maxmis[cpos] == 0 then nmis == 0 (see sanity check above). */ if(maxgap[cpos] > 0 && maxmis[cpos] == 0) { /* most common case */ ngapA[prv_cpos + 1 + (ngap/2)] = maxgap[cpos] - ngap; /* internal cpos: split */ } else if(maxgap[cpos] == 0 && maxmis[cpos] > 0) { nmisA[prv_cpos + 1 + (nmis/2)] = maxmis[cpos] - nmis; /* internal cpos: split */ } else if(maxgap[cpos] > 0 && maxmis[cpos] > 0) { /* missing data is always 5' of gaps */ nmisA[prv_cpos + 1 + (nmis/2)] = maxmis[cpos] - nmis; /* internal cpos: split */ ngapA[prv_cpos + 1 + nmis + (ngap/2)] = maxgap[cpos] - ngap; /* internal cpos: split */ } /* final case is if (maxgap[cpos] == 0 && maxmis[cpos] == 0) * in this case we do nothing. */ } cpos++; prv_cpos = apos; ngap = 0; nmis = 0; } } /* deal with gaps and missing data after final consensus position */ /* first, validate that clen is what it should be */ if(cpos != clen) { if(ngapA != NULL) free(ngapA); if(nmisA != NULL) free(nmisA); if(neitherA != NULL) free(neitherA); ESL_FAIL(eslEINCONCEIVABLE, errbuf, "consensus length (%d) is not the expected length (%d).", cpos, clen); } if(maxgap[cpos] > 0 && maxmis[cpos] == 0) { /* most common case */ ngapA[prv_cpos + 1 + ngap] = maxgap[cpos] - ngap; /* flush left gaps (no missing) */ } else if(maxgap[cpos] == 0 && maxmis[cpos] > 0) { nmisA[prv_cpos + 1 + nmis] = maxmis[cpos] - nmis; /* flush left missing (no gaps) */ } else if(maxgap[cpos] > 0 && maxmis[cpos] > 0) { /* missing data (ELs) is always 5' of gaps */ nmisA[prv_cpos + 1 + nmis] = maxmis[cpos] - nmis; /* flush left */ ngapA[prv_cpos + 1 + nmis + ngap] = maxgap[cpos] - ngap; /* flush left, after missing */ } /* determine neitherA[], the number of gaps due to either inserts or missing data after each apos */ for(apos = 0; apos <= msa->alen; apos++) { neitherA[apos] = ngapA[apos] + nmisA[apos]; } *ret_ngapA = ngapA; *ret_nmisA = nmisA; *ret_neitherA = neitherA; return eslOK; ERROR: if(ngapA != NULL) free(ngapA); if(nmisA != NULL) free(nmisA); if(neitherA != NULL) free(neitherA); ESL_FAIL(status, errbuf, "Memory allocation error."); return status; /*NEVERREACHED*/ } /* write_pfam_msa_top * * Highly specialized function for printing out the 'top' of a * Stockholm alignment file, i.e. the Stockholm header line, comments * and GF annotation to an msa file. This function is necessary when * printing the merged alignment file in small memory mode (when * --small enabled). is an alignment with no sequence * information (no aseq, ax, GS, or GR data). */ void write_pfam_msa_top(FILE *fp, ESL_MSA *msa) { int i, maxgf; /* rest of code in this function was stolen verbatim from actually_write_stockholm in esl_msa.c */ /* Magic Stockholm header */ fprintf(fp, "# STOCKHOLM 1.0\n"); /* Free text comments */ for (i = 0; i < msa->ncomment; i++) fprintf(fp, "# %s\n", msa->comment[i]); if (msa->ncomment > 0) fprintf(fp, "\n"); maxgf = maxwidth(msa->gf_tag, msa->ngf); if (maxgf < 2) maxgf = 2; /* GF section: per-file annotation */ if (msa->name != NULL) fprintf(fp, "#=GF %-*s %s\n", maxgf, "ID", msa->name); if (msa->acc != NULL) fprintf(fp, "#=GF %-*s %s\n", maxgf, "AC", msa->acc); if (msa->desc != NULL) fprintf(fp, "#=GF %-*s %s\n", maxgf, "DE", msa->desc); if (msa->au != NULL) fprintf(fp, "#=GF %-*s %s\n", maxgf, "AU", msa->au); /* Thresholds are hacky. Pfam has two. Rfam has one. */ if (msa->cutset[eslMSA_GA1] && msa->cutset[eslMSA_GA2]) fprintf(fp, "#=GF %-*s %.1f %.1f\n", maxgf, "GA", msa->cutoff[eslMSA_GA1], msa->cutoff[eslMSA_GA2]); else if (msa->cutset[eslMSA_GA1]) fprintf(fp, "#=GF %-*s %.1f\n", maxgf, "GA", msa->cutoff[eslMSA_GA1]); if (msa->cutset[eslMSA_NC1] && msa->cutset[eslMSA_NC2]) fprintf(fp, "#=GF %-*s %.1f %.1f\n", maxgf, "NC", msa->cutoff[eslMSA_NC1], msa->cutoff[eslMSA_NC2]); else if (msa->cutset[eslMSA_NC1]) fprintf(fp, "#=GF %-*s %.1f\n", maxgf, "NC", msa->cutoff[eslMSA_NC1]); if (msa->cutset[eslMSA_TC1] && msa->cutset[eslMSA_TC2]) fprintf(fp, "#=GF %-*s %.1f %.1f\n", maxgf, "TC", msa->cutoff[eslMSA_TC1], msa->cutoff[eslMSA_TC2]); else if (msa->cutset[eslMSA_TC1]) fprintf(fp, "#=GF %-*s %.1f\n", maxgf, "TC", msa->cutoff[eslMSA_TC1]); for (i = 0; i < msa->ngf; i++) fprintf(fp, "#=GF %-*s %s\n", maxgf, msa->gf_tag[i], msa->gf[i]); fprintf(fp, "\n"); return; } /* write_pfam_msa_gc * * Highly specialized function for printing out the GC annotation to a * Pfam Stockholm alignment file (1 line/seq). This function is * necessary when printing the merged alignment file in small memory * mode (when --small enabled). is an alignment with no sequence * information (no aseq, ax, GS, nor GR data). */ void write_pfam_msa_gc(FILE *fp, ESL_MSA *msa, int margin) { int i; if (msa->ss_cons != NULL) { fprintf(fp, "#=GC %-*s %s\n", margin-6, "SS_cons", msa->ss_cons); } if (msa->sa_cons != NULL) { fprintf(fp, "#=GC %-*s %s\n", margin-6, "SA_cons", msa->sa_cons); } if (msa->pp_cons != NULL) { fprintf(fp, "#=GC %-*s %s\n", margin-6, "PP_cons", msa->pp_cons); } if (msa->rf != NULL) { fprintf(fp, "#=GC %-*s %s\n", margin-6, "RF", msa->rf); } for (i = 0; i < msa->ngc; i++) { fprintf(fp, "#=GC %-*s %s\n", margin-6, msa->gc_tag[i], msa->gc[i]); } fprintf(fp, "//\n"); } /* maxwidth() * Return the length of the longest string in * an array of strings. */ static int64_t maxwidth(char **s, int n) { int64_t max,len; int i; max = 0; for (i = 0; i < n; i++) if (s[i] != NULL) { len = strlen(s[i]); if (len > max) max = len; } return max; } /* rfchar_is_nongap_nonmissing() * Return FALSE if a character from RF annotation is * a gap or a missing character, else return TRUE. */ static int rfchar_is_nongap_nonmissing(const ESL_ALPHABET *abc, char rfchar) { if(esl_abc_CIsGap (abc, rfchar)) return FALSE; if(esl_abc_CIsMissing(abc, rfchar)) return FALSE; return TRUE; }