/*
Copyright 2009-2012 Andreas Biegert, Christof Angermueller
This file is part of the CS-BLAST package.
The CS-BLAST package is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
The CS-BLAST package is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see .
*/
#include "cs.h"
#include "alignment-inl.h"
#include "application.h"
#include "blosum_matrix.h"
#include "context_library.h"
#include "crf_pseudocounts-inl.h"
#include "crf-inl.h"
#include "count_profile-inl.h"
#include "getopt_pp.h"
#include "library_pseudocounts-inl.h"
#include "pssm.h"
#include "sequence-inl.h"
#include "a3m_compress.h"
#include "ffindexdatabase.h"
#include "ext/fmemopen.h"
#include "context_data.lib.h"
//include "context_data.crf.h"
#include "cs219.lib.h"
#ifdef OPENMP
#include
#endif
#include
using namespace GetOpt;
using std::string;
using std::vector;
namespace cs {
struct CSTranslateAppOptions {
static const int kAssignMatchColsByQuery = -1;
CSTranslateAppOptions() { Init(); }
virtual ~CSTranslateAppOptions() { }
// Set csbuild default parameters
void Init() {
informat = "auto";
outformat = "seq";
modelfile = "internal";
alphabetfile = "internal";
pc_admix = 0.90;
pc_ali = 12.0;
match_assign = kAssignMatchColsByQuery;
weight_center = 1.6;
weight_decay = 0.85;
weight_as = 1000.0;
ffindex = false;
verbose = true;
}
// Validates the parameter settings and throws exception if needed.
void Validate() {
if (infile.empty()) throw Exception("No input file provided!");
if (alphabetfile.empty()) throw Exception("No abstract states provided!");
}
// The input alignment file with training data.
string infile;
// The output file.
string outfile;
// The file to which the output should be appended.
string appendfile;
// Input file with context profile library or HMM for generating pseudocounts
string modelfile;
// Input file with profile library to be used as abstract state alphabet
string alphabetfile;
// Input file format
string informat;
// Output file format (abstract state sequence or abstract state profile)
string outformat;
// Overall pseudocount admixture
double pc_admix;
// Constant in pseudocount calculation for alignments
double pc_ali;
// Match column assignment for FASTA alignments
int match_assign;
// Weight of central column in multinomial emission
double weight_center;
// Exponential decay of window weights
double weight_decay;
// Weight in emission calculation of abstract states
double weight_as;
// verbose output
bool verbose;
// ffindex
bool ffindex;
}; // CSTranslateAppOptions
char GetMatchSymbol(double pp) {
char rv = '=';
if (pp > 0.8) rv = '|';
else if (pp > 0.6) rv = '+';
else if (pp > 0.4) rv = '.';
else if (pp > 0.2) rv = '-';
return rv;
}
inline int GetConfidence(double pp) {
return static_cast(floor((pp - DBL_EPSILON) * 10));
}
template
class CSTranslateApp : public Application {
public:
// Runs the csbuild application.
virtual int Run() {
SetupEmissions();
SetupPseudocountEngine();
SetupAbstractStateEngine();
if (!opts_.ffindex) {
FILE *fin;
if (strcmp(opts_.infile.c_str(), "stdin") == 0)
fin = stdin;
else
fin = fopen(opts_.infile.c_str(), "r");
if (!fin)
throw Exception("Unable to read input file '%s'!", opts_.infile.c_str());
string header;
CountProfile profile; // input profile we want to translate
ReadProfile(fin, header, profile);
size_t profile_counts_lenght = profile.counts.length();
// Prepare abstract sequence in AS219 format
Sequence as_seq(profile_counts_lenght);
as_seq.set_header(header);
// Translate count profile into abstract state count profile (Neff is one)
if (opts_.verbose)
fputs("Translating count profile to abstract state alphabet AS219 ...\n", out_);
CountProfile as_profile(profile_counts_lenght); // output profile
Translate(profile, as_profile);
BuildSequence(as_profile, profile_counts_lenght, as_seq);
// Build pseudo-alignment for output
const size_t nseqs = 5;
const size_t header_width = 4;
const size_t width = 100;
vector ali(nseqs, "");
vector labels(nseqs, "");
labels[0] = "Pos";
labels[1] = "Cons";
labels[3] = "AS219";
labels[4] = "Conf";
BlosumMatrix sm;
ali[1] = ConservationSequence(profile, sm);
for (size_t i = 0; i < profile.counts.length(); ++i) {
ali[0].append(strprintf("%d", (i + 1) % 10));
ali[2].push_back(GetMatchSymbol(as_profile.counts[i][as_seq[i]]));
ali[3].push_back(as_seq.chr(i));
ali[4].append(strprintf("%d", GetConfidence(as_profile.counts[i][as_seq[i]])));
}
// Print pseudo alignment in blocks if verbose
if (opts_.verbose) {
fputc('\n', out_); // blank line before alignment
while (!ali.front().empty()) {
for (size_t k = 0; k < nseqs; ++k) {
string label = labels[k];
label += string(header_width - label.length() + 1, ' ');
fputs(label.c_str(), out_);
fputc(' ', out_); // separator between header and sequence
size_t len = MIN(width, ali[k].length());
fputs(ali[k].substr(0, len).c_str(), out_);
fputc('\n', out_);
ali[k].erase(0, len);
}
fputc('\n', out_); // blank line after each block
}
}
// Write abstract-state sequence or profile to outfile
if (opts_.outformat == "seq") {
if (!opts_.outfile.empty()) {
WriteStateSequence(as_seq, opts_.outfile, false);
}
if (!opts_.appendfile.empty()) {
WriteStateSequence(as_seq, opts_.appendfile, true);
}
} else {
if (!opts_.outfile.empty()) {
WriteStateProfile(as_profile, opts_.outfile, false);
}
if (!opts_.appendfile.empty()) {
WriteStateProfile(as_profile, opts_.appendfile, true);
}
}
}
else {
const bool isCa3m = this->opts_.informat == "ca3m";
std::string input_data_file = opts_.infile + ".ffdata";
std::string input_index_file = opts_.infile + ".ffindex";
FFindexDatabase *header_db = NULL;
FFindexDatabase *sequence_db = NULL;
if (isCa3m) {
// infile has to be the ffindex basepath with no suffices
input_data_file = this->opts_.infile + "_ca3m.ffdata";
input_index_file = this->opts_.infile + "_ca3m.ffindex";
std::string input_header_data_file = this->opts_.infile + "_header.ffdata";
std::string input_header_index_file = this->opts_.infile + "_header.ffindex";
header_db = new FFindexDatabase(const_cast(input_header_data_file.c_str()),
const_cast(input_header_index_file.c_str()), false);
std::string input_sequence_data_file = this->opts_.infile + "_sequence.ffdata";
std::string input_sequence_index_file = this->opts_.infile + "_sequence.ffindex";
sequence_db = new FFindexDatabase(const_cast(input_sequence_data_file.c_str()),
const_cast(input_sequence_index_file.c_str()), false);
}
FFindexDatabase input(const_cast(input_data_file.c_str()),
const_cast(input_index_file.c_str()), isCa3m);
input.ensureLinearAccess();
//prepare output ffindex cs219 database
std::string output_data_file = opts_.outfile + ".ffdata";
std::string output_index_file = opts_.outfile + ".ffindex";
FILE *output_data_fh = fopen(output_data_file.c_str(), "w");
FILE *output_index_fh = fopen(output_index_file.c_str(), "w");
if (output_data_fh == NULL) {
LOG(ERROR) << "Could not open ffindex output data file! (" << output_data_file << ")!" << std::endl;
exit(1);
}
if (output_index_fh == NULL) {
LOG(ERROR) << "Could not open ffindex output index file! (" << output_index_file << ")!" << std::endl;
exit(1);
}
size_t output_offset = 0;
size_t input_range_start = 0;
size_t input_range_end = input.db_index->n_entries;
// Foreach entry
#pragma omp parallel for shared(input, sequence_db, header_db)
for (size_t entry_index = input_range_start; entry_index < input_range_end; entry_index++) {
ffindex_entry_t *entry = ffindex_get_entry_by_index(input.db_index, entry_index);
if (entry == NULL) {
LOG(WARNING) << "Could not open entry " << entry_index << " from input ffindex!" << std::endl;
continue;
}
if (opts_.verbose) {
fprintf(out_, "Processing entry: %s\n", entry->name);
}
std::ostringstream a3m_buffer;
std::string a3m_string;
FILE *inf;
if (isCa3m) {
char *data = ffindex_get_data_by_entry(input.db_data, entry);
compressed_a3m::extract_a3m(data, entry->length,
sequence_db->db_index, sequence_db->db_data,
header_db->db_index, header_db->db_data,
&a3m_buffer);
a3m_string = a3m_buffer.str();
inf = fmemopen(static_cast(const_cast(a3m_string.c_str())), a3m_string.length(), "r");
} else {
inf = ffindex_fopen_by_entry(input.db_data, entry);
}
if (inf == NULL) {
LOG(WARNING) << "Could not open input entry (" << entry->name << ")!" << std::endl;
continue;
}
string header;
CountProfile profile; // input profile we want to translate
try {
ReadProfile(inf, header, profile);
} catch (const Exception &e) {
fprintf(out_, "Could not read entry: %s, Message: %s\n", entry->name, e.what());
continue;
}
size_t profile_counts_length = profile.counts.length();
CountProfile as_profile(profile_counts_length); // output profile
Translate(profile, as_profile);
// Prepare abstract sequence in AS219 format
Sequence as_seq(profile_counts_length);
as_seq.set_header(header);
BuildSequence(as_profile, profile_counts_length, as_seq);
std::stringstream out_buffer;
if (opts_.outformat == "seq") {
WriteStateSequence(as_seq, out_buffer);
} else {
WriteStateProfile(as_profile, out_buffer);
}
std::string out_string = out_buffer.str();
#pragma omp critical
{
ffindex_insert_memory(output_data_fh, output_index_fh, &output_offset,
const_cast(out_string.c_str()),
out_string.size(), entry->name);
}
// FIXME: we are leaking inf, but if we fclose we get weird crashes
//fclose(inf);
}
fclose(output_index_fh);
fclose(output_data_fh);
ffsort_index(output_index_file.c_str());
if (isCa3m) {
delete sequence_db;
delete header_db;
}
}
return 0;
};
// Parses command line options.
virtual void ParseOptions(GetOpt_pp &ops) {
ops >> Option('i', "infile", opts_.infile, opts_.infile);
ops >> Option('o', "outfile", opts_.outfile, opts_.outfile);
ops >> Option('a', "appendfile", opts_.appendfile, opts_.appendfile);
ops >> Option('I', "informat", opts_.informat, opts_.informat);
ops >> Option('O', "outformat", opts_.outformat, opts_.outformat);
ops >> Option('M', "match-assign", opts_.match_assign, opts_.match_assign);
ops >> Option('x', "pc-admix", opts_.pc_admix, opts_.pc_admix);
ops >> Option('c', "pc-ali", opts_.pc_ali, opts_.pc_ali);
ops >> Option('A', "alphabet", opts_.alphabetfile, opts_.alphabetfile);
ops >> Option('D', "context-data", opts_.modelfile, opts_.modelfile);
ops >> Option('w', "weight", opts_.weight_as, opts_.weight_as);
ops >> OptionPresent('f', "ffindex", opts_.ffindex);
ops >> Option('v', "verbose", opts_.verbose, opts_.verbose);
opts_.Validate();
if (opts_.outfile.compare("stdout") == 0)
opts_.verbose = false;
if (opts_.outfile.empty() && opts_.appendfile.empty())
opts_.outfile = GetBasename(opts_.infile, false) + ".as";
if (opts_.informat == "auto")
opts_.informat = GetFileExt(opts_.infile);
};
// Prints options summary to stream.
virtual void PrintOptions() const {
fprintf(out_, " %-30s %s\n", "-i, --infile ",
"Input file with alignment or sequence");
fprintf(out_, " %-30s %s\n", "-o, --outfile ",
"Output file for generated abstract state sequence (def: .as)");
fprintf(out_, " %-30s %s\n", "-a, --append ", "Append generated abstract state sequence to this file");
fprintf(out_, " %-30s %s (def=%s)\n", "-I, --informat prf|seq|fas|...",
"Input format: prf, seq, fas, a2m, a3m or ca3m", opts_.informat.c_str());
fprintf(out_, " %-30s %s (def=%s)\n", "-O, --outformat seq|prf", "Outformat: abstract state sequence or profile",
opts_.outformat.c_str());
fprintf(out_, " %-30s %s\n", "-M, --match-assign [0:100]",
"Make all FASTA columns with less than X% gaps match columns");
fprintf(out_, " %-30s %s\n", "", "(def: make columns with residue in first sequence match columns)");
fprintf(out_, " %-30s %s (def=internal)\n", "-A, --alphabet ",
"Abstract state alphabet consisting of exactly 219 states");
fprintf(out_, " %-30s %s (def=internal)\n", "-D, --context-data ",
"Add context-specific pseudocounts using given context-data");
fprintf(out_, " %-30s %s (def=%-.2f)\n", "-x, --pc-admix [0,1]",
"Pseudocount admix for context-specific pseudocounts", opts_.pc_admix);
fprintf(out_, " %-30s %s (def=%-.1f)\n", "-c, --pc-ali [0,inf[",
"Constant in pseudocount calculation for alignments", opts_.pc_ali);
fprintf(out_, " %-30s %s (def=%-.2f)\n", "-w, --weight [0,inf[",
"Weight of abstract state column in emission calculation", opts_.weight_as);
fprintf(out_, " %-30s %s (def=off)\n", "-f, --ffindex",
"Read from -i , write to -o (do not include _ca3m suffix for ca3m informat); enables openmp if possible");
};
// Prints short application description.
virtual void PrintBanner() const {
fputs("Translate a sequence/alignment into an abstract state alphabet.\n", out_);
};
// Prints usage banner to stream.
virtual void PrintUsage() const {
fputs("Usage: cstranslate -i [options]\n", out_);
};
// Setup pseudocount engine
void SetupPseudocountEngine() {
if (opts_.modelfile.empty())
return;
std::string engine;
FILE *fin;
if (opts_.modelfile == "internal") {
// fin = fmemopen((void*)context_data_crf, context_data_crf_len, "r");
// engine = "crf";
fin = fmemopen((void*)context_data_lib, context_data_lib_len, "r");
engine = "lib";
} else {
fin = fopen(opts_.modelfile.c_str(), "r");
engine = GetFileExt(opts_.modelfile);
}
if (!fin) {
throw Exception("Unable to read file '" + opts_.modelfile + "'!\n");
}
if (engine == "lib") {
// Older generative approach by Biegert & Soeding PNAS 2009
if (opts_.verbose) {
fprintf(out_, "Reading context library for pseudocounts from %s ...\n", GetBasename(opts_.modelfile).c_str());
}
pc_lib_.reset(new ContextLibrary(fin));
TransformToLog(*pc_lib_);
pc_.reset(new LibraryPseudocounts(*pc_lib_, opts_.weight_center,opts_.weight_decay));
} else if (engine == "crf") {
// Newer discriminative approach by Angermueller & Soeding Bioinformatics 2012
if (opts_.verbose) {
fprintf(out_, "Reading CRF for pseudocounts from %s ...\n", GetBasename(opts_.modelfile).c_str());
}
pc_crf_.reset(new Crf(fin));
pc_.reset(new CrfPseudocounts(*pc_crf_));
} else {
throw Exception("Invalid pseudocount engine: " + engine);
}
fclose(fin);
};
// Setup abstract state engine
void SetupAbstractStateEngine() {
FILE *fin;
if (opts_.alphabetfile == "internal") {
fin = fmemopen((void*)cs219_lib, cs219_lib_len, "r");
} else {
fin = fopen(opts_.alphabetfile.c_str(), "r");
}
if (!fin) {
throw Exception("Unable to read file '" + opts_.alphabetfile + "'!\n");
}
if (opts_.verbose) {
fprintf(out_, "Reading abstract state alphabet from %s ...\n", GetBasename(opts_.alphabetfile).c_str());
}
as_lib_.reset(new ContextLibrary(fin));
if (as_lib_->size() != AS219::kSize) {
fprintf(out_, "Abstract state alphabet should have %zu states but actually has %zu states!\n", AS219::kSize, as_lib_->size());
exit(1);
}
if (static_cast(as_lib_->wlen()) != 1) {
fprintf(out_, "Abstract state alphabet should have a window length of %d but actually has %zu!\n", 1, as_lib_->wlen());
exit(1);
}
TransformToLog(*as_lib_);
fclose(fin);
};
void SetupEmissions() {
emission_.reset(new Emission(1, this->opts_.weight_as, 1.0));
}
// Writes abstract state sequence to outfile
void WriteStateSequence(const Sequence &seq, string outfile, bool append = false) const {
FILE *fout;
if (outfile.compare("stdout") == 0)
fout = stdout;
else
fout = fopen(outfile.c_str(), append ? "a" : "w");
if (!fout) throw Exception("Can't %s to file '%s'!", append ? "append" : "write", outfile.c_str());
for (size_t i = 0; i < seq.length(); ++i) {
fputc((char) seq[i], fout);
}
fclose(fout);
if (opts_.verbose)
fprintf(out_, "%s abstract state sequence to %s\n", append ? "Appended" : "Wrote", outfile.c_str());
};
void WriteStateSequence(const Sequence &seq, std::stringstream &ss) {
for (size_t i = 0; i < seq.length(); ++i) {
ss.put((char) seq[i]);
}
};
// Writes abstract state profile to outfile
void WriteStateProfile(const CountProfile &prof, string outfile, bool append = false) const {
FILE *fout;
if (outfile.compare("stdout") == 0)
fout = stdout;
else
fout = fopen(outfile.c_str(), append ? "a" : "w");
if (!fout) throw Exception("Can't %s to output file '%s'!", append ? "append" : "write", outfile.c_str());
prof.Write(fout);
fclose(fout);
if (opts_.verbose)
fprintf(out_, "%s abstract state count profile to %s\n", append ? "Appended" : "Wrote", outfile.c_str());
};
void WriteStateProfile(const CountProfile &prof, std::stringstream &ss) {
prof.Write(ss);
};
void ReadProfile(FILE *fin, string &header, CountProfile &profile) {
if (opts_.informat == "prf") { // read count profile from infile
profile = CountProfile(fin);;
if (profile.name.empty()) header = GetBasename(opts_.infile, false);
else header = profile.name;
if (pc_) {
if (opts_.verbose)
fprintf(out_, "Adding cs-pseudocounts (admix=%.2f) ...\n", opts_.pc_admix);
CSBlastAdmix admix(opts_.pc_admix, opts_.pc_ali);
profile.counts = pc_->AddTo(profile, admix);
Normalize(profile.counts, profile.neff);
}
} else if (opts_.informat == "seq") { // build profile from sequence
Sequence seq(fin);
header = seq.header();
profile = CountProfile(seq);
if (pc_) {
if (opts_.verbose)
fprintf(out_, "Adding cs-pseudocounts (admix=%.2f) ...\n", opts_.pc_admix);
ConstantAdmix admix(opts_.pc_admix);
profile.counts = pc_->AddTo(seq, admix);
}
} else { // build profile from alignment
AlignmentFormat f = AlignmentFormatFromString(opts_.informat);
Alignment ali(fin, f);
header = ali.name();
if (f == FASTA_ALIGNMENT) {
if (opts_.match_assign == CSTranslateAppOptions::kAssignMatchColsByQuery)
ali.AssignMatchColumnsBySequence(0);
else
ali.AssignMatchColumnsByGapRule(opts_.match_assign);
}
profile = CountProfile(ali);
if (pc_) {
if (opts_.verbose)
fprintf(out_, "Adding cs-pseudocounts (admix=%.2f) ...\n", opts_.pc_admix);
CSBlastAdmix admix(opts_.pc_admix, opts_.pc_ali);
profile.counts = pc_->AddTo(profile, admix);
Normalize(profile.counts, profile.neff);
}
}
fclose(fin); // close input file
};
void Translate(CountProfile &profile, CountProfile &as_profile) {
for (size_t i = 0; i < as_profile.length(); ++i) {
CalculatePosteriorProbs(*as_lib_, *emission_, profile, i, as_profile.counts[i]);
}
as_profile.name = GetBasename(opts_.infile, false);
as_profile.name = as_profile.name.substr(0, as_profile.name.length() - 1);
};
void BuildSequence(CountProfile &as_profile, size_t profile_counts_length, Sequence &as_seq) {
// We also build an abstract state sequence, either just for pretty printing or
// even because this is the actual output that the user wants
for (size_t i = 0; i < profile_counts_length; ++i) {
// Find state with maximal posterior prob and assign it to as_seq[i]
size_t k_max = 0;
double p_max = as_profile.counts[i][0];
for (size_t k = 1; k < AS219::kSize; ++k) {
if (as_profile.counts[i][k] > p_max) {
k_max = k;
p_max = as_profile.counts[i][k];
}
}
as_seq[i] = k_max;
}
};
// Parameter wrapper
CSTranslateAppOptions opts_;
// Profile library with abstract states
scoped_ptr > as_lib_;
// Profile library for context pseudocounts
scoped_ptr > pc_lib_;
// CRF for CRF context pseudocounts
scoped_ptr > pc_crf_;
// Pseudocount engine
scoped_ptr > pc_;
// Emissions
scoped_ptr > emission_;
}; // class CSTranslateApp
} // namespace cs