/* Copyright 2009-2012 Andreas Biegert, Christof Angermueller This file is part of the CS-BLAST package. The CS-BLAST package is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. The CS-BLAST package is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see . */ #ifndef CS_COUNT_PROFILE_INL_H_ #define CS_COUNT_PROFILE_INL_H_ #include "count_profile.h" #include "alignment-inl.h" #include "profile-inl.h" #include "sequence-inl.h" namespace cs { template CountProfile::CountProfile(const Alignment& ali, bool pos_weights, bool neff_sum_pairs) : counts(ali.nmatch(), 0.0f), neff(ali.nmatch()) { // Add counts and neff from alignment to count profile if (pos_weights) { // use position-specific sequence weights Matrix w; Vector neff_tmp(PositionSpecificWeightsAndDiversity(ali, w)); assert(neff.size()); for (size_t i = 0; i < counts.length(); ++i) { neff[i] = neff_tmp[i]; for (size_t k = 0; k < ali.nseqs(); ++k) if (ali[i][k] < Abc::kAny) counts[i][ali[i][k]] += w[i][k]; } } else { // use faster global sequence weights Vector wg; double neff_glob = GlobalWeightsAndDiversity(ali, wg, neff_sum_pairs); for (size_t i = 0; i < counts.length(); ++i) { neff[i] = neff_glob; for (size_t k = 0; k < ali.nseqs(); ++k) if (ali[i][k] < Abc::kAny) counts[i][ali[i][k]] += wg[k]; } } // Normalize counts to effective number of sequences Normalize(counts, neff); } template void CountProfile::Read(FILE* fin) { // Parse and check header information if (!StreamStartsWith(fin, "CountProfile")) throw Exception("Stream does not start with class id 'CountProfile'!"); char buffer[KB]; cs::fgetline(buffer, KB, fin); if (strstr(buffer, "NAME")) { name = ReadString(buffer, "NAME", "Unable to parse count profile 'NAME'!"); cs::fgetline(buffer, KB, fin); } size_t len = ReadInt(buffer, "LENG", "Unable to parse count profile 'LENG'!"); cs::fgetline(buffer, KB, fin); size_t nalph = ReadInt(buffer, "ALPH", "Unable to parse count profile 'ALPH'!"); if (nalph != Abc::kSize) throw Exception("Alphabet size of serialized count profile should be %d " "but is actually %d!", Abc::kSize, nalph); // If everything went fine we can resize our data memmbers counts.Resize(len); neff.Resize(len); // Read counts and effective number of sequences for each column size_t i = 0; const char* ptr = buffer; cs::fgetline(buffer, KB, fin); // skip alphabet description line while (cs::fgetline(buffer, KB, fin) && buffer[0] != '/' && buffer[1] != '/') { ptr = buffer; i = strtoi(ptr) - 1; assert(i < len); // TODO: include ANY char in seialization for (size_t a = 0; a < Abc::kSize; ++a) // TODO: save counts as log base e instead of log base 2 counts[i][a] = pow(2, static_cast(-strastoi(ptr)) / kScale); counts[i][Abc::kAny] = 0.0f; neff[i] = static_cast(strtoi(ptr)) / kScale; } Normalize(counts, neff); // normalize probs to counts if (i != len - 1) throw Exception("Count profile should have %i columns but actually has %i!", len, i+1); } template void CountProfile::Write(FILE* fout) const { // Print header section fputs("CountProfile\n", fout); if (!name.empty()) fprintf(fout, "NAME\t%s\n", name.c_str()); fprintf(fout, "LENG\t%zu\n", counts.length()); fprintf(fout, "ALPH\t%zu\n", Abc::kSize); // Print alphabet description line fputs("COUNTS", fout); for (size_t a = 0; a < Abc::kSize; ++a) fprintf(fout, "\t%c", Abc::kIntToChar[a]); fputs("\tNEFF\n", fout); // Print counts matrix and neff vector as negative logs scaled by 'kScale' for (size_t i = 0; i < counts.length(); ++i) { fprintf(fout, "%zu", i+1); // TODO: include ANY char in seialization for (size_t a = 0; a < Abc::kSize; ++a) { // TODO: save counts as log base e instead of log base 2 if (counts[i][a] == 0.0) fputs("\t*", fout); else fprintf(fout, "\t%d", -iround(log2(counts[i][a] / neff[i]) * kScale)); } fprintf(fout, "\t%d\n", iround(neff[i] * kScale)); } fputs("//\n", fout); } template void CountProfile::Write(std::stringstream& ss) const { // Print header section char buffer [4000]; sprintf(buffer, "CountProfile\n"); ss << buffer; if (!name.empty()) { sprintf(buffer, "NAME\t%s\n", name.c_str()); ss << buffer; } sprintf(buffer, "LENG\t%zu\n", counts.length()); ss << buffer; sprintf(buffer, "ALPH\t%zu\n", Abc::kSize); ss << buffer; // Print alphabet description line sprintf(buffer, "COUNTS"); ss << buffer; for (size_t a = 0; a < Abc::kSize; ++a) { sprintf(buffer, "\t%c", Abc::kIntToChar[a]); ss << buffer; } sprintf(buffer, "\tNEFF\n"); ss << buffer; // Print counts matrix and neff vector as negative logs scaled by 'kScale' for (size_t i = 0; i < counts.length(); ++i) { sprintf(buffer, "%zu", i+1); ss << buffer; // TODO: include ANY char in seialization for (size_t a = 0; a < Abc::kSize; ++a) { // TODO: save counts as log base e instead of log base 2 if (counts[i][a] == 0.0) { sprintf(buffer, "\t*"); ss << buffer; } else { sprintf(buffer, "\t%d", -iround(log2(counts[i][a] / neff[i]) * kScale)); ss << buffer; } } sprintf(buffer, "\t%d\n", iround(neff[i] * kScale)); ss << buffer; } sprintf(buffer, "//\n"); ss << buffer; } // Returns the average Neff in given count profile. template inline double Neff(const CountProfile& cp) { double neff_sum = 0.0; for (size_t i = 0; i < cp.neff.size(); ++i) neff_sum += cp.neff[i]; return neff_sum / cp.neff.size(); } // Builds and returns a consensus string of the given count profile by // calculating at each position the alphabet character that deviates most strongly // from its background probability. template std::string ConsensusSequence(const CountProfile& cp, const SubstitutionMatrix& sm) { Profile prof(cp.counts); Normalize(prof, 1.0); std::string cons(prof.length(), ' '); for (size_t i = 0; i < prof.length(); ++i) { double maxw = 0.0; size_t maxa = 0; for (size_t a = 0; a < Abc::kSize; ++a) { if (prof[i][a] - sm.p(a) > maxw) { maxw = prof[i][a] - sm.p(a); maxa = a; } } cons[i] = Abc::kIntToChar[maxa]; } return cons; } // Builds and returns a conservation string for given count profile that // indicates conservation of residues by uppercase, lowercase, and '~' template std::string ConservationSequence(const CountProfile& cp, const SubstitutionMatrix& sm) { static const char kMixedColChar = '~'; // Precompute similarity matrix for amino acid pairs Matrix sim(Abc::kSize, Abc::kSize); for (size_t a = 0; a < Abc::kSize; ++a) for (size_t b = 0; b < Abc::kSize; ++b) sim[a][b] = sm.q(a,b) * sm.q(a,b) / sm.q(a,a) / sm.q(b,b); // Normalize count profile Profile prof(cp.counts); Normalize(prof, 1.0); std::string cons(prof.length(), ' '); // Now compute conservation string for (size_t i = 0; i < prof.length(); ++i) { double maxw = 0.0; size_t maxa = 0; for (size_t a = 0; a < Abc::kSize; ++a) { if (prof[i][a] - sm.p(a) > maxw) { maxw = prof[i][a] - sm.p(a); maxa = a; } } maxw = 0.0; for (size_t b = 0; b < Abc::kSize; ++b) maxw += prof[i][b] * sim[maxa][b] * sim[maxa][b]; if (maxw > 0.6) cons[i] = toupper(Abc::kIntToChar[maxa]); else if (maxw > 0.4) cons[i] = tolower(Abc::kIntToChar[maxa]); else cons[i] = kMixedColChar; } return cons; } // Prints counts and neff in human-readable format for debugging. template std::ostream& operator<< (std::ostream& out, const CountProfile& cp) { out << "CountProfile" << std::endl; out << "name:\t" << cp.name << std::endl; for (size_t a = 0; a < Abc::kSizeAny; ++a) out << "\t" << Abc::kIntToChar[a]; out << "\tNeff" << std::endl; for (size_t i = 0; i < cp.counts.length(); ++i) { out << i+1; for (size_t a = 0; a < Abc::kSizeAny; ++a) out << strprintf("\t%6.4f", cp.counts[i][a]); out << strprintf("\t%-5.2f\n", cp.neff[i]); } return out; } } // namespace cs #endif // CS_COUNT_PROFILE_INL_H_