// Copyright 2009, Andreas Biegert #ifndef CS_SUBSTITUTION_MATRIX_INL_H_ #define CS_SUBSTITUTION_MATRIX_INL_H_ #include "substitution_matrix.h" namespace cs { template SubstitutionMatrix::SubstitutionMatrix(double l) : q_(Abc::kSizeAny, Abc::kSizeAny, 0.0), s_(Abc::kSizeAny, Abc::kSizeAny, 0.0), rx_(Abc::kSizeAny, Abc::kSizeAny, 0.0), ry_(Abc::kSizeAny, Abc::kSizeAny, 0.0), px_(Abc::kSizeAny, 0.0), py_(Abc::kSizeAny, 0.0), lambda_(l) {} template SubstitutionMatrix::~SubstitutionMatrix() {} template void SubstitutionMatrix::InitFromTargetFreqs() { // Check transition probability matrix, renormalize P double sumab = 0.0; for (size_t a = 0; a < Abc::kSize; a++) for (size_t b = 0; b < Abc::kSize; ++b) sumab += q_[a][b]; for (size_t a = 0; a < Abc::kSize; ++a) for (size_t b = 0; b < Abc::kSize; ++b) q_[a][b] /= sumab; // Calculate background frequencies for (size_t a = 0; a < Abc::kSize; ++a) { px_[a] = 0.0; for (size_t b = 0; b < Abc::kSize; ++b) px_[a] += q_[a][b]; } Normalize(&px_[0], Abc::kSize); for (size_t b = 0; b < Abc::kSize; ++b) { py_[b] = 0.0; for (size_t a = 0; a < Abc::kSize; ++a) py_[b] += q_[a][b]; } Normalize(&py_[0], Abc::kSize); // Precompute conditional probability matrix Px for (size_t a = 0; a < Abc::kSize; ++a) for (size_t b = 0; b < Abc::kSize; ++b) rx_[b][a] = q_[a][b] / px_[a]; // Px(b|a) for (size_t b = 0; b < Abc::kSize; ++b) rx_[b][Abc::kAny] = py_[b]; // Px(b|X) = Py(b) for (size_t a = 0; a < Abc::kSize; ++a) rx_[Abc::kAny][a] = 1.0; // Px(X|a) = 1.0 // Precompute conditional probability matrix Py for (size_t a = 0; a < Abc::kSize; ++a) for (size_t b = 0; b < Abc::kSize; ++b) ry_[a][b] = q_[a][b] / py_[b]; // Py(a|b) for (size_t a = 0; a < Abc::kSize; ++a) ry_[a][Abc::kAny] = px_[a]; // Py(a|X) = Px(a) for (size_t b = 0; b < Abc::kSize; ++b) ry_[Abc::kAny][b] = 1.0; // Py(X|b) = 1.0 // Calculate scoring matrix as S[a][b] = (1 / lambda) * log(P(a,b) / (P(a)*P(b))) for (size_t a = 0; a < Abc::kSize; ++a) for (size_t b = 0; b < Abc::kSize; ++b) s_[a][b] = (1.0 / lambda_) * log(q_[a][b] / (px_[a] * py_[b])); LOG(DEBUG1) << *this; } // template // void SubstitutionMatrix::InitFromScoresAndBackgroundFreqs() { // // Calculate target frequencies // for (size_t a = 0; a < Abc::kSize; ++a) // for (size_t b = 0; b < Abc::kSize; ++b) // p_[a][b] = pow(2.0, s_[a][b]) * f_[a] * f_[b]; // // Check transition probability matrix, renormalize P // double sumab = 0.0f; // for (size_t a = 0; a < Abc::kSize; a++) // for (size_t b = 0; b < Abc::kSize; ++b) sumab += p_[a][b]; // for (size_t a = 0; a < Abc::kSize; ++a) // for (size_t b = 0; b < Abc::kSize; ++b) p_[a][b] /= sumab; // // Precompute matrix R for amino acid pseudocounts: // for (size_t a = 0; a < Abc::kSize; ++a) // for (size_t b = 0; b < Abc::kSize; ++b) // r_[a][b] = p_[a][b] / f_[b]; // R[a][b] = P(a|b) // LOG(DEBUG1) << *this; // } template void SubstitutionMatrix::Print(std::ostream& out) const { out << "Background frequencies on query side (in %):\n"; for (size_t a = 0; a < Abc::kSize; ++a) out << Abc::kIntToChar[a] << "\t"; out << std::endl; for (size_t a = 0; a < Abc::kSize; ++a) out << strprintf("%-.1f\t", 100.0f * px_[a]); out << "\nBackground frequencies on database side (in %):\n"; for (size_t a = 0; a < Abc::kSize; ++a) out << Abc::kIntToChar[a] << "\t"; out << std::endl; for (size_t a = 0; a < Abc::kSize; ++a) out << strprintf("%-.1f\t", 100.0f * py_[a]); out << "\nSubstitution matrix log( q(a,b) / p(a)*p(b) ) (scaled by 1/lambda):\n"; for (size_t a = 0; a < Abc::kSize; ++a) out << Abc::kIntToChar[a] << "\t"; out << std::endl; for (size_t a = 0; a < Abc::kSize; ++a) { for (size_t b = 0; b < Abc::kSize; ++b) out << strprintf("%+5.1f\t", s_[a][b]); out << std::endl; } out << "Target frequency matrix q(a,b) (in %):\n"; for (size_t a = 0; a < Abc::kSize; ++a) out << Abc::kIntToChar[a] << "\t"; out << std::endl; for (size_t a = 0; a < Abc::kSize; ++a) { for (size_t b = 0; b < Abc::kSize; ++b) out << strprintf("%-.2f\t", 100.0f * q_[a][b]); out << std::endl; } out << "Matrix of conditional probs on query side P(a|b) = P(a,b)/p(b) (in %):\n"; for (size_t a = 0; a < Abc::kSize; ++a) out << Abc::kIntToChar[a] << "\t"; out << std::endl; for (size_t a = 0; a < Abc::kSize; ++a) { for (size_t b = 0; b < Abc::kSize; ++b) out << strprintf("%-.1f\t", 100.0f * rx_[a][b]); out << std::endl; } out << "Matrix of conditional probs on database side P(a|b) = P(a,b)/p(b) (in %):\n"; for (size_t a = 0; a < Abc::kSize; ++a) out << Abc::kIntToChar[a] << "\t"; out << std::endl; for (size_t a = 0; a < Abc::kSize; ++a) { for (size_t b = 0; b < Abc::kSize; ++b) out << strprintf("%-.1f\t", 100.0f * ry_[a][b]); out << std::endl; } } template double Neff(const SubstitutionMatrix& sm) { double neff = 0.0; for (size_t b = 0; b < Abc::kSize; ++b) { double tmp = 0.0; for (size_t a = 0; a < Abc::kSize; ++a) if (sm.r(a,b) > FLT_MIN) tmp -= sm.r(a,b) * log2(sm.r(a,b)); neff += sm.p(b) * pow(2.0, tmp); } return neff; } } // namespace cs #endif // CS_SUBSTITUTION_MATRIX_INL_H_