// Copyright 2009, Andreas Biegert #ifndef CS_COUNT_PROFILE_INL_H_ #define CS_COUNT_PROFILE_INL_H_ #include "count_profile.h" #include "alignment-inl.h" #include "profile-inl.h" #include "sequence-inl.h" namespace cs { template CountProfile::CountProfile(const Alignment& ali, bool pos_weights, bool neff_sum_pairs) : counts(ali.nmatch(), 0.0f), neff(ali.nmatch()) { // Add counts and neff from alignment to count profile if (pos_weights) { // use position-specific sequence weights Matrix w; Vector neff_tmp(PositionSpecificWeightsAndDiversity(ali, w)); assert(neff.size()); for (size_t i = 0; i < counts.length(); ++i) { neff[i] = neff_tmp[i]; for (size_t k = 0; k < ali.nseqs(); ++k) if (ali[i][k] < Abc::kAny) counts[i][ali[i][k]] += w[i][k]; } } else { // use faster global sequence weights Vector wg; double neff_glob = GlobalWeightsAndDiversity(ali, wg, neff_sum_pairs); for (size_t i = 0; i < counts.length(); ++i) { neff[i] = neff_glob; for (size_t k = 0; k < ali.nseqs(); ++k) if (ali[i][k] < Abc::kAny) counts[i][ali[i][k]] += wg[k]; } } // Normalize counts to effective number of sequences Normalize(counts, neff); } template void CountProfile::Read(FILE* fin) { // Parse and check header information if (!StreamStartsWith(fin, "CountProfile")) throw Exception("Stream does not start with class id 'CountProfile'!"); char buffer[KB]; fgetline(buffer, KB, fin); if (strstr(buffer, "NAME")) { name = ReadString(buffer, "NAME", "Unable to parse count profile 'NAME'!"); fgetline(buffer, KB, fin); } size_t len = ReadInt(buffer, "LENG", "Unable to parse count profile 'LENG'!"); fgetline(buffer, KB, fin); size_t nalph = ReadInt(buffer, "ALPH", "Unable to parse count profile 'ALPH'!"); if (nalph != Abc::kSize) throw Exception("Alphabet size of serialized count profile should be %d " "but is actually %d!", Abc::kSize, nalph); // If everything went fine we can resize our data memmbers counts.Resize(len); neff.Resize(len); // Read counts and effective number of sequences for each column size_t i = 0; const char* ptr = buffer; fgetline(buffer, KB, fin); // skip alphabet description line while (fgetline(buffer, KB, fin) && buffer[0] != '/' && buffer[1] != '/') { ptr = buffer; i = strtoi(ptr) - 1; assert(i < len); // TODO: include ANY char in seialization for (size_t a = 0; a < Abc::kSize; ++a) // TODO: save counts as log base e instead of log base 2 counts[i][a] = pow(2, static_cast(-strastoi(ptr)) / kScale); counts[i][Abc::kAny] = 0.0f; neff[i] = static_cast(strtoi(ptr)) / kScale; } Normalize(counts, neff); // normalize probs to counts if (i != len - 1) throw Exception("Count profile should have %i columns but actually has %i!", len, i+1); } template void CountProfile::Write(FILE* fout) const { // Print header section fputs("CountProfile\n", fout); if (!name.empty()) fprintf(fout, "NAME\t%s\n", name.c_str()); fprintf(fout, "LENG\t%zu\n", counts.length()); fprintf(fout, "ALPH\t%zu\n", Abc::kSize); // Print alphabet description line fputs("COUNTS", fout); for (size_t a = 0; a < Abc::kSize; ++a) fprintf(fout, "\t%c", Abc::kIntToChar[a]); fputs("\tNEFF\n", fout); // Print counts matrix and neff vector as negative logs scaled by 'kScale' for (size_t i = 0; i < counts.length(); ++i) { fprintf(fout, "%zu", i+1); // TODO: include ANY char in seialization for (size_t a = 0; a < Abc::kSize; ++a) { // TODO: save counts as log base e instead of log base 2 if (counts[i][a] == 0.0) fputs("\t*", fout); else fprintf(fout, "\t%d", -iround(log2(counts[i][a] / neff[i]) * kScale)); } fprintf(fout, "\t%d\n", iround(neff[i] * kScale)); } fputs("//\n", fout); } // Returns the average Neff in given count profile. template inline double Neff(const CountProfile& cp) { double neff_sum = 0.0; for (size_t i = 0; i < cp.neff.size(); ++i) neff_sum += cp.neff[i]; return neff_sum / cp.neff.size(); } // Builds and returns a consensus string of the given count profile by // calculating at each position the alphabet character that deviates most strongly // from its background probability. template std::string ConsensusSequence(const CountProfile& cp, const SubstitutionMatrix& sm) { Profile prof(cp.counts); Normalize(prof, 1.0); std::string cons(prof.length(), ' '); for (size_t i = 0; i < prof.length(); ++i) { double maxw = 0.0; size_t maxa = 0; for (size_t a = 0; a < Abc::kSize; ++a) { if (prof[i][a] - sm.p(a) > maxw) { maxw = prof[i][a] - sm.p(a); maxa = a; } } cons[i] = Abc::kIntToChar[maxa]; } return cons; } // Builds and returns a conservation string for given count profile that // indicates conservation of residues by uppercase, lowercase, and '~' template std::string ConservationSequence(const CountProfile& cp, const SubstitutionMatrix& sm) { static const char kMixedColChar = '~'; // Precompute similarity matrix for amino acid pairs Matrix sim(Abc::kSize, Abc::kSize); for (size_t a = 0; a < Abc::kSize; ++a) for (size_t b = 0; b < Abc::kSize; ++b) sim[a][b] = sm.q(a,b) * sm.q(a,b) / sm.q(a,a) / sm.q(b,b); // Normalize count profile Profile prof(cp.counts); Normalize(prof, 1.0); std::string cons(prof.length(), ' '); // Now compute conservation string for (size_t i = 0; i < prof.length(); ++i) { double maxw = 0.0; size_t maxa = 0; for (size_t a = 0; a < Abc::kSize; ++a) { if (prof[i][a] - sm.p(a) > maxw) { maxw = prof[i][a] - sm.p(a); maxa = a; } } maxw = 0.0; for (size_t b = 0; b < Abc::kSize; ++b) maxw += prof[i][b] * sim[maxa][b] * sim[maxa][b]; if (maxw > 0.6) cons[i] = toupper(Abc::kIntToChar[maxa]); else if (maxw > 0.4) cons[i] = tolower(Abc::kIntToChar[maxa]); else cons[i] = kMixedColChar; } return cons; } // Prints counts and neff in human-readable format for debugging. template std::ostream& operator<< (std::ostream& out, const CountProfile& cp) { out << "CountProfile" << std::endl; out << "name:\t" << cp.name << std::endl; for (size_t a = 0; a < Abc::kSizeAny; ++a) out << "\t" << Abc::kIntToChar[a]; out << "\tNeff" << std::endl; for (size_t i = 0; i < cp.counts.length(); ++i) { out << i+1; for (size_t a = 0; a < Abc::kSizeAny; ++a) out << strprintf("\t%6.4f", cp.counts[i][a]); out << strprintf("\t%-5.2f\n", cp.neff[i]); } return out; } } // namespace cs #endif // CS_COUNT_PROFILE_INL_H_