// Substitution matrices and their background frequencies #include "hhmatrices.h" void SetBlosumMatrix(const char matrix, const float BlosumXX[], float* pb, float P[20][20]) { int a,b,n=0; HH_LOG(DEBUG) << "Using the BLOSUM " << matrix << " matrix" << std::endl; for (a=0; a<20; ++a) for (pb[a]=0.0f, b=0; b<=a; ++b,++n) P[a][b] = BlosumXX[n]; for (a=0; a<19; a++) for (b=a+1; b<20; ++b) P[a][b] = P[b][a]; } ///////////////////////////////////////////////////////////////////////////////////// // Set (global variable) substitution matrix with derived matrices and background frequencies ///////////////////////////////////////////////////////////////////////////////////// void SetSubstitutionMatrix(const char matrix, float* pb, float P[20][20], float R[20][20], float S[20][20], float Sim[20][20]) { int a,b; switch (matrix) { default: case 0: //Gonnet matrix HH_LOG(DEBUG) << "Using the Gonnet matrix" << std::endl; for (a=0; a<20; ++a) for (pb[a]=0.0f, b=0; b<20; ++b) P[a][b] = 0.000001f*Gonnet[a*20+b]; break; case 30: //BLOSUM30 SetBlosumMatrix(matrix, Blosum30, pb, P); break; case 40: //BLOSUM40 SetBlosumMatrix(matrix, Blosum40, pb, P); break; case 50: //BLOSUM50 SetBlosumMatrix(matrix, Blosum50, pb, P); break; case 62: //BLOSUM62 SetBlosumMatrix(matrix, Blosum62, pb, P); break; case 65: //BLOSUM65 SetBlosumMatrix(matrix, Blosum65, pb, P); break; case 80: //BLOSUM80 SetBlosumMatrix(matrix, Blosum80, pb, P); break; } // Check transition probability matrix, renormalize P and calculate pb[a] float sumab=0.0f; for (a=0; a<20; a++) for (b=0; b<20; ++b) sumab+=P[a][b]; for (a=0; a<20; a++) for (b=0; b<20; ++b) P[a][b]/=sumab; for (a=0; a<20; a++) for (pb[a]=0.0f, b=0; b<20; ++b) pb[a]+=P[a][b]; //Compute similarity matrix for amino acid pairs (for calculating consensus sequence) for (a=0; a<20; ++a) for (b=0; b<20; ++b) Sim[a][b] = P[a][b]*P[a][b]/P[a][a]/P[b][b]; //Precompute matrix R for amino acid pseudocounts: for (a=0; a<20; ++a) for (b=0; b<20; ++b) R[a][b] = P[a][b]/pb[b]; //R[a][b]=P(a|b) //Precompute matrix R for amino acid pseudocounts: for (a=0; a<20; ++a) for (b=0; b<20; ++b) S[a][b] = log2(R[a][b]/pb[a]); // S[a][b] = log2(P(a,b)/P(a)/P(b)) // Evaluate sequence identity underlying substitution matrix if (Log::reporting_level() >= DEBUG) { float id=0.0f; float entropy=0.0f; float entropy_pb=0.0f; float mut_info=0.0f; for (a=0; a<20; ++a) id+=P[a][a]; for (a=0; a<20; ++a) entropy_pb-=pb[a]*log2(pb[a]); for (a=0; a<20; ++a) for (b=0; b<20; ++b) { entropy-=P[a][b]*log2(R[a][b]); mut_info += P[a][b]*S[a][b]; } HH_LOG(DEBUG) << "sequence identity = " << 100*id << " %; entropy per column = " << entropy << " bits (out of " << entropy_pb << "); mutual information = " << mut_info << " bits" << std::endl; } //Debugging: probability matrix and dissimilarity matrix if (Log::reporting_level() >= DEBUG1) { HH_LOG(DEBUG) << "Check matrix: before renormalization sum P(a,b)= "<