// hhhmm.C #ifndef MAIN #define MAIN #include // cin, cout, cerr #include // ofstream, ifstream #include // printf using std::cout; using std::cerr; using std::endl; using std::ios; using std::ifstream; using std::ofstream; #include // exit #include // strcmp, strstr #include // sqrt, pow #include // INT_MIN #include // FLT_MIN #include // clock #include // islower, isdigit etc #include "cs.h" // context-specific pseudocounts #include "context_library.h" #include "library_pseudocounts-inl.h" #include "util.C" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc. #include "list.C" // list data structure #include "hash.C" // hash data structure #include "hhdecl.C" // Constants, global variables, struct Parameters #include "hhutil.C" // MatchChr, InsertChr, aa2i, i2aa, log2, fast_log2, ScopID, WriteToScreen, #include "hhmatrices.C" // BLOSUM50, GONNET, HSDM #include "hhhmm.h" // class Hit #include "hhhit.h" // class Hit #include "hhalignment.h" // class Alignment #include "hhhalfalignment.h" // class HalfAlignment #include "hhfullalignment.h" // class FullAlignment #include "hhhitlist.h" // class Hit #endif ///////////////////////////////////////////////////////////////////////////////////// //// Class HMM ///////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////// // Object constructor ///////////////////////////////////////////////////////////////////////////////////// HMM::HMM(int maxseqdis, int maxres) { sname = new char*[maxseqdis]; // names of stored sequences seq = new char*[maxseqdis]; // residues of stored sequences (first at pos 1!) Neff_M = new float[maxres]; // Neff_M[i] = diversity of subalignment of seqs that have residue in col i Neff_I = new float[maxres]; // Neff_I[i] = diversity of subalignment of seqs that have insert in col i Neff_D = new float[maxres]; // Neff_D[i] = diversity of subalignment of seqs that have delete in col i longname = new char[DESCLEN]; // Full name of first sequence of original alignment (NAME field) ss_dssp = new char[maxres]; // secondary structure determined by dssp 0:- 1:H 2:E 3:C 4:S 5:T 6:G 7:B sa_dssp = new char[maxres]; // solvent accessibility state determined by dssp 0:- 1:A (absolutely buried) 2:B 3:C 4:D 5:E (exposed) ss_pred = new char[maxres]; // predicted secondary structure 0:- 1:H 2:E 3:C ss_conf = new char[maxres]; // confidence value of prediction 0:- 1:0 ... 10:9 l = new int[maxres]; // l[i] = pos. of j'th match state in aligment f = new float*[maxres]; // f[i][a] = prob of finding amino acid a in column i WITHOUT pseudocounts g = new float*[maxres]; // f[i][a] = prob of finding amino acid a in column i WITH pseudocounts p = new float*[maxres]; // p[i][a] = prob of finding amino acid a in column i WITH OPTIMUM pseudocounts tr = new float*[maxres]; // log2 of transition probabilities M2M M2I M2D I2M I2I D2M D2D // tr_lin = new float*[maxres]; // linear transition probabilities M2M M2I M2D I2M I2I D2M D2D for (int i=0; i n_display) { for (int k=n_display; k n_display) { for (int k=n_display; k n_display) { for (int k=n_display; k=4) cout<<"Reading in HMM "< k=n int h; // index for character in input line int l=1; // index of character in sequence seq[k] int i=1; // index of match states in ss_dssp[i] and ss_pred[i] sequence int n_seq=0; // number of sequences to be displayed EXCLUDING ss sequences cur_seq[0]='-'; // overwrite '\0' character at beginning to be able to do strcpy(*,cur_seq) k=-1; while (fgetline(line,LINELEN-1,dbf) && line[0]!='#') { if (v>=4) cout<<"Read from file:"<') //line contains sequence name { if (k>=MAXSEQDIS-1) //maximum number of allowable sequences exceeded { if (v>=2) cerr<<"WARNING in "<ss_dssp",8)) nss_dssp=k; else if (!strncmp(line,">sa_dssp",8)) nsa_dssp=k; else if (!strncmp(line,">ss_pred",8)) nss_pred=k; else if (!strncmp(line,">ss_conf",8)) nss_conf=k; else if (!strncmp(line,">Cons-",6) || !strncmp(line,">Consensus",10)) ncons=k; else { if (nfirst==-1) nfirst=k; // if (n_seq>=par.nseqdis) // {while (fgetline(line,LINELEN-1,dbf) && line[0]!='#'); k--; break;} n_seq++; } //If this is not the first sequence then store residues of previous sequence if (k>0) { seq[k-1]=new(char[strlen(cur_seq)+1]); if (!seq[k-1]) MemoryError("array of sequences to display"); strcpy(seq[k-1],cur_seq); } // store sequence name strcut(line+1); //find next white-space character and overwrite it with end-of-string character sname[k] = new (char[strlen(line+1)+1]); //+1 for terminating '\0' if (!sname[k]) MemoryError("array of names for sequences to display"); strcpy(sname[k],line+1); //store sequence name in **name l=1; i=1; } else //line contains sequence residues { if (k==-1) { cerr<'\0' && l=0 && line[h]!='.') { char c=ss2ss(line[h]); cur_seq[l]=c; if (c!='.' && !(c>='a' && c<='z')) ss_dssp[i++]=ss2i(c); l++; } else if (v && ss2i(line[h])==-2) cerr<'\0' && l=0) { char c=line[h]; cur_seq[l]=c; if (c!='.' && !(c>='a' && c<='z')) sa_dssp[i++]=sa2i(c); l++; } else if (v && sa2i(line[h])==-2) cerr<'\0' && l=0 && ss2i(line[h])<=3 && line[h]!='.') { char c=ss2ss(line[h]); cur_seq[l]=c; if (c!='.' && !(c>='a' && c<='z')) ss_pred[i++]=ss2i(c); l++; } else if (v && ss2i(line[h])==-2) cerr<'\0' && l='0' && line[h]<='9')) { cur_seq[l]=line[h]; ss_conf[l]=cf2i(line[h]); l++; } else if (v && cf2i(line[h])==-2) cerr<'\0' && l=0 && line[h]!='.') // ignore '.' and white-space characters ' ', \t and \n (aa2i()==-1) {cur_seq[l]=line[h]; l++;} else if (aa2i(line[h])==-2 && v) cerr<=0) { seq[k]=new(char[strlen(cur_seq)+1]); if (!seq[k]) MemoryError("array of sequences to display"); strcpy(seq[k],cur_seq); } n_seqs=k+1; // DEBUG if (v>=4) { printf("nss_dssp=%i nsa_dssp=%i nss_pred=%i nss_conf=%i nfirst=%i\n",nss_dssp,nsa_dssp,nss_pred,nss_conf,nfirst); for (k=0; k"< 1.1 to generate hhm files.\n"); else if (!strcmp("AVER",str4)) {} // AVER line scrapped else if (!strcmp("NULL",str4)) { ptr=line+4; for (a=0; a<20 && ptr; ++a) //s2[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids pb[s2a[a]] = (float) fpow2(float(-strinta(ptr))/HMMSCALE); if (!ptr) return Warning(dbf,line,name); if (v>=4) { printf("\nNULL "); for (a=0; a<20; ++a) printf("%5.1f ",100.*pb[s2a[a]]); printf("\n"); } } ///////////////////////////////////////////////////////////////////////////////////// // Read transition probabilities from start state else if (!strcmp("HMM",str3)) { fgetline(line,LINELEN-1,dbf); // Skip line with amino acid labels fgetline(line,LINELEN-1,dbf); // Skip line with transition labels ptr=line; for (a=0; a<=D2D && ptr; ++a) tr[0][a] = float(-strinta(ptr))/HMMSCALE; //store transition probabilites as log2 values // strinta returns next integer in string and puts ptr to first char // after the integer. Returns -99999 if '*' is found. // ptr is set to 0 if no integer is found after ptr. Neff_M[0] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with M->? transition Neff_I[0] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with I->? transition Neff_D[0] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with D->? transition if (!ptr) return Warning(dbf,line,name); ///////////////////////////////////////////////////////////////////////////////////// // Read columns of HMM int next_i=0; // index of next column while (fgetline(line,LINELEN-2,dbf) && !(line[0]=='/' && line[1]=='/') && line[0]!='#') { if (strscn(line)==NULL) continue; // skip lines that contain only white space // Read in AA probabilities ptr=line+1; int prev_i = next_i; next_i = strint(ptr); ++i; if (v && next_i!=prev_i+1) if (++warn<=5) { cerr<L) { cerr<par.maxres-2) { fgetline(line,LINELEN-1,dbf); // Skip line continue; } for (a=0; a<20 && ptr; ++a) // f[i][s2a[a]] = (float)pow(2.,float(-strinta(ptr))/HMMSCALE); f[i][s2a[a]] = fpow2(float(-strinta(ptr))/HMMSCALE); // speed-up ~5 s for 10000 SCOP domains //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids l[i]=strint(ptr); if (!ptr) return Warning(dbf,line,name); if (v>=4) { printf("%s",line); printf("%6i ",i); for (a=0; a<20; ++a) printf("%5.1f ",100*f[i][s2a[a]]); printf("%5i",l[i]); printf("\n"); } // Read transition probabilities fgetline(line,LINELEN-1,dbf); // Skip line with amino acid labels if (line[0]!=' ' && line[0]!='\t') return Warning(dbf,line,name); ptr=line; for (a=0; a<=D2D && ptr; ++a) tr[i][a] = float(-strinta(ptr))/HMMSCALE; //store transition prob's as log2-values Neff_M[i] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with M->? transition if (Neff_M[i] == 0) { Neff_M[i] = 1; } Neff_I[i] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with I->? transition Neff_D[i] = float(strinta(ptr))/HMMSCALE; // Read eff. number of sequences with D->? transition if (!ptr) return Warning(dbf,line,name); if (v>=4) { printf(" "); for (a=0; a<=D2D; ++a) printf("%5.1f ",100*fpow2(tr[i][a])); printf("%5.1f %5.1f %5.1f \n",Neff_M[i],Neff_I[i],Neff_D[i]); } } if (line[0]=='/' && line[1]=='/') break; } else if (v) cerr< stop reading in // Set coefficients of EVD (= 0.0 if not calibrated for these parameters) // lamda = lamda_hash.Show(par.Key()); // mu = mu_hash.Show(par.Key()); if (lamda && v>=3) printf("HMM %s is already calibrated: lamda=%-5.3f, mu=%-5.2f\n",name,lamda,mu); if (v && i!=L) cerr<par.maxres-2) {i=par.maxres-2; cerr<=2) cout<<"Read in HMM "<=4) cout<<"Reading in HMM "<=(unsigned)(par.maxres)) printf("WARNING: HMM %s has SADSS records with more than %i residues.\n",name,par.maxres); else strcat(seq[nsa_dssp],ptr); } } else if (!strncmp("SSPRD",line,5)) { if (nss_pred<0) { nss_pred=k++; seq[nss_pred] = new(char[par.maxres+2]); sname[nss_pred] = new(char[15]); strcpy(seq[nss_pred]," "); strcpy(sname[nss_pred],"ss_pred"); } ptr=strscn(line+5); if (ptr) { strcut(ptr); if (strlen(seq[nss_pred])+strlen(ptr)>=(unsigned)(par.maxres)) printf("WARNING: HMM %s has SSPRD records with more than %i residues.\n",name,par.maxres); else strcat(seq[nss_pred],ptr); } } else if (!strncmp("SSCON",line,5)) { if (nss_conf<0) { nss_conf=k++; seq[nss_conf] = new(char[par.maxres+2]); sname[nss_conf] = new(char[15]); strcpy(seq[nss_conf]," "); strcpy(sname[nss_conf],"ss_conf"); } ptr=strscn(line+5); if (ptr) { strcut(ptr); if (strlen(seq[nss_conf])+strlen(ptr)>=(unsigned)(par.maxres)) printf("WARNING: HMM %s has SSPRD records with more than %i residues.\n",name,par.maxres); else strcat(seq[nss_conf],ptr); } } else if (!strncmp("SSCIT",line,5)) continue; else if (!strcmp("XT ",str4)) continue; else if (!strcmp("NULT",str4)) continue; else if (!strcmp("NULE",str4)) { ptr=line+4; for (a=0; a<20 && ptr; ++a) //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids pb[s2a[a]] = (float) 0.05 * fpow2(float(strinta(ptr,-99999))/HMMSCALE); if (!ptr) return Warning(dbf,line,name); if (v>=4) { printf("\nNULL "); for (a=0; a<20; ++a) printf("%6.3g ",100.*pb[s2a[a]]); printf("\n"); } } else if (!strcmp("EVD ",str4)) { char* ptr=line+4; ptr = strscn(ptr); sscanf(ptr,"%f",&lamda); ptr = strscn(ptr); sscanf(ptr,"%f",&mu); if (lamda<0) { if (v>=2 && ignore_hmmer_cal==0) cerr<=4) { printf(" "); for (a=0; a<=D2D && ptr; ++a) printf("%6.3g ",100*fpow2(tr[i][a])); printf("\n"); } // Prepare to store DSSP states (if there are none, delete afterwards) nss_dssp=k++; seq[nss_dssp] = new(char[par.maxres+2]); sname[nss_dssp] = new(char[15]); strcpy(sname[nss_dssp],"ss_dssp"); ///////////////////////////////////////////////////////////////////////////////////// // Read columns of HMM int next_i=0; // index of next column while (fgetline(line,LINELEN-1,dbf) && !(line[0]=='/' && line[1]=='/') && line[0]!='#') { if (strscn(line)==NULL) continue; // skip lines that contain only white space // Read in AA probabilities ptr=line; int prev_i = next_i; next_i = strint(ptr); ++i; if (v && next_i!=prev_i+1) if (++warn<5) { cerr<L) { cerr<L && v) cerr<=par.maxres-2) { fgetline(line,LINELEN-1,dbf); // Skip two lines fgetline(line,LINELEN-1,dbf); continue; } for (a=0; a<20 && ptr; ++a) f[i][s2a[a]] = (float) pb[s2a[a]]*fpow2(float(strinta(ptr,-99999))/HMMSCALE); //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids if (!ptr) return Warning(dbf,line,name); if (v>=4) { printf("%6i ",i); for (a=0; a<20; ++a) printf("%6.3g ",100*f[i][s2a[a]]); printf("\n"); } // Read insert emission line fgetline(line,LINELEN-1,dbf); ptr = strscn(line); if (!ptr) return Warning(dbf,line,name); annotchr[i]=uprchr(*ptr); if (*ptr!='-' && *ptr!=' ' && *ptr!='X' && *ptr!='x') annot=1; // Read annotation character and seven transition probabilities fgetline(line,LINELEN-1,dbf); ptr = strscn(line); switch (*ptr) { case 'H': ss_dssp[i]=1; seq[nss_dssp][i]=*ptr; dssp=1; break; case 'E': ss_dssp[i]=2; seq[nss_dssp][i]=*ptr; dssp=1; break; case 'C': ss_dssp[i]=3; seq[nss_dssp][i]=*ptr; dssp=1; break; case 'S': ss_dssp[i]=4; seq[nss_dssp][i]=*ptr; dssp=1; break; case 'T': ss_dssp[i]=5; seq[nss_dssp][i]=*ptr; dssp=1; break; case 'G': ss_dssp[i]=6; seq[nss_dssp][i]=*ptr; dssp=1; break; case 'B': ss_dssp[i]=7; seq[nss_dssp][i]=*ptr; dssp=1; break; case 'I': dssp=1; case '~': ss_dssp[i]=3; seq[nss_dssp][i]=*ptr; break; case '-': // no SS available from any template case '.': // no clear consensus SS structure case 'X': // no clear consensus SS structure ss_dssp[i]=0; seq[nss_dssp][i]='-'; break; default: ss_dssp[i]=0; seq[nss_dssp][i]=*ptr; break; } ptr+=2; for (a=0; a<=D2D && ptr; ++a) tr[i][a] = float(strinta(ptr,-99999))/HMMSCALE; //store transition prob's as log2-values if (!ptr) return Warning(dbf,line,name); if (v>=4) { printf(" "); for (a=0; a<=D2D; ++a) printf("%6.3g ",100*fpow2(tr[i][a])); printf("\n"); } } if (line[0]=='/' && line[1]=='/') break; } } //while(getline) if (L==0) return 0; //End of db file -> stop reading in // Set coefficients of EVD (= 0.0 if not calibrated for these parameters) // lamda = lamda_hash.Show(par.Key()); // mu = mu_hash.Show(par.Key()); if (lamda && v>=2) printf("HMM %s is already calibrated: lamda=%-5.3f, mu=%-5.2f\n",name,lamda,mu); if (v && i!=L) cerr<=par.maxres-2) {i=par.maxres-2; cerr<0) strcat(longname," "); strncat(longname,name,DESCLEN-strlen(longname)-1); // longname = ACC NAME DESC if (strlen(name)>0) strcat(longname," "); strncat(longname,desc,DESCLEN-strlen(longname)-1); longname[DESCLEN-1]='\0'; ScopID(cl,fold,sfam,fam);// get scop classification from basename (e.g. a.1.2.3.4) RemoveExtension(file,filestr); // copy name of dbfile without extension into 'file' // Secondary structure if (!dssp) { // remove dssp sequence delete[] seq[nss_dssp]; // memory that had been allocated in case ss_dssp was given needs to be freed delete[] sname[nss_dssp]; // memory that had been allocated in case ss_dssp was given needs to be freed nss_dssp=-1; k--; } else { seq[nss_dssp][0]='-'; seq[nss_dssp][L+1]='\0'; } if (nss_pred>=0) { for (i=1; i<=L; ++i) ss_pred[i] = ss2i(seq[nss_pred][i]); if (nss_conf>=0) for (i=1; i<=L; ++i) ss_conf[i] = cf2i(seq[nss_conf][i]); else for (i=1; i<=L; ++i) ss_conf[i] = 5; } // Copy query (first sequence) and consensus residues? if (par.showcons) { sname[k]=new(char[10]); strcpy(sname[k],"Consensus"); sname[k+1]=new(char[strlen(longname)+1]); strcpy(sname[k+1],longname); seq[k]=new(char[L+2]); seq[k][0]=' '; seq[k][L+1]='\0'; seq[k+1]=new(char[L+2]); seq[k+1][0]=' '; seq[k+1][L+1]='\0'; for (i=1; i<=L; ++i) { float pmax=0.0; int amax=0; for (a=0; apmax) {amax=a; pmax=f[i][a];} if (pmax>0.6) seq[k][i]=i2aa(amax); else if (pmax>0.4) seq[k][i]=lwrchr(i2aa(amax)); else seq[k][i]='x'; seq[k+1][i]=i2aa(amax); } ncons=k++; // nfirst is set later! } else { sname[k]=new(char[strlen(longname)+1]); strcpy(sname[k],longname); seq[k]=new(char[L+2]); seq[k][0]=' '; seq[k][L+1]='\0'; } if (annot) // read in some annotation characters? { annotchr[0]=' '; annotchr[L+1]='\0'; strcpy(seq[k],annotchr); // overwrite the consensus sequence with the annotation characters } else if (!par.showcons) // we have not yet calculated the consensus, but we need it now as query (first sequence) { for (i=1; i<=L; ++i) { float pmax=0.0; int amax=0; for (a=0; apmax) {amax=a; pmax=f[i][a];} seq[k][i]=i2aa(amax); } } // printf("%i query name=%s seq=%s\n",n,sname[n],seq[n]); nfirst=k++; n_display=k; n_seqs=k; // Calculate overall Neff_HMM Neff_HMM=0; for (i=1; i<=L; ++i) { float S=0.0; for (a=0; a<20; ++a) if (f[i][a]>1E-10) S-=f[i][a]*fast_log2(f[i][a]); Neff_HMM+=(float) fpow2(S); } Neff_HMM/=L; for (i=0; i<=L; ++i) Neff_M[i] = Neff_I[i] = Neff_D[i] = 10.0; // to add only little additional pseudocounts! Neff_M[L+1]=1.0f; Neff_I[L+1]=Neff_D[L+1]=0.0f; if (v>=2) cout<<"Read in HMM "<=4) cout<<"Reading in HMM "<=(unsigned)(par.maxres)) printf("WARNING: HMM %s has SADSS records with more than %i residues.\n",name,par.maxres); else strcat(seq[nsa_dssp],ptr); } } else if (!strncmp("SSPRD",line,5)) { if (nss_pred<0) { nss_pred=k++; seq[nss_pred] = new(char[par.maxres+2]); sname[nss_pred] = new(char[15]); strcpy(seq[nss_pred]," "); strcpy(sname[nss_pred],"ss_pred"); } ptr=strscn(line+5); if (ptr) { strcut(ptr); if (strlen(seq[nss_pred])+strlen(ptr)>=(unsigned)(par.maxres)) printf("WARNING: HMM %s has SSPRD records with more than %i residues.\n",name,par.maxres); else strcat(seq[nss_pred],ptr); } } else if (!strncmp("SSCON",line,5)) { if (nss_conf<0) { nss_conf=k++; seq[nss_conf] = new(char[par.maxres+2]); sname[nss_conf] = new(char[15]); strcpy(seq[nss_conf]," "); strcpy(sname[nss_conf],"ss_conf"); } ptr=strscn(line+5); if (ptr) { strcut(ptr); if (strlen(seq[nss_conf])+strlen(ptr)>=(unsigned)(par.maxres)) printf("WARNING: HMM %s has SSPRD records with more than %i residues.\n",name,par.maxres); else strcat(seq[nss_conf],ptr); } } else if (!strncmp("SSCIT",line,5)) continue; else if (!strcmp("XT ",str4)) continue; ////////////////////////////////////////////////////////////////////////////////////////////////////// else if (!strncmp("STATS LOCAL",line,11)) continue; else if (!strcmp("EFFN",str4)) { ptr=line+4; float effn = strflt(ptr); // Calculate Neff_HMM by using f(x) = ax^0.1 + bx^0.5 + cx + d (fitted with scop25 dataset) Neff_HMM = -1.403534 * pow(effn, 0.1) + 4.428118 * pow(effn, 0.5) - 0.2885410 * effn - 1.108568; } ///////////////////////////////////////////////////////////////////////////////////// // Read transition probabilities from start state else if (!strncmp("HMM",line,3)) { fgetline(line,LINELEN-1,dbf); // Skip line with amino acid labels fgetline(line,LINELEN-1,dbf); // Skip line with transition labels ptr=strscn(line); if (!strncmp("COMPO",ptr,5)) { ptr=ptr+5; for (a=0; a<20 && ptr; ++a) //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids pb[s2a[a]] = (float) exp(-1.0*strflta(ptr,99999)); if (!ptr) return Warning(dbf,line,name); if (v>=4) { printf("\nNULL "); for (a=0; a<20; ++a) printf("%6.3g ",100.*pb[s2a[a]]); printf("\n"); } fgetline(line,LINELEN-1,dbf); // Read next line } fgetline(line,LINELEN-1,dbf); // Skip line with 0-states insert probabilities ptr = strscn(line); for (a=0; a<=D2D && ptr; ++a) tr[0][a] = log2((float) exp(-1.0*strflta(ptr,99999))); //store transition probabilites as log2 values // strinta returns next integer in string and puts ptr to first char // after the integer. Returns -99999 if '*' is found. // ptr is set to 0 if no integer is found after ptr. if (!ptr) return Warning(dbf,line,name); if (v>=4) { printf(" "); for (a=0; a<=D2D && ptr; ++a) printf("%6.3g ",100*fpow2(tr[i][a])); printf("\n"); } // Prepare to store DSSP states (if there are none, delete afterwards) nss_dssp=k++; seq[nss_dssp] = new(char[par.maxres+2]); sname[nss_dssp] = new(char[15]); strcpy(sname[nss_dssp],"ss_dssp"); ///////////////////////////////////////////////////////////////////////////////////// // Read columns of HMM int next_i=0; // index of next column while (fgetline(line,LINELEN-1,dbf) && !(line[0]=='/' && line[1]=='/') && line[0]!='#') { if (strscn(line)==NULL) continue; // skip lines that contain only white space // Read in AA probabilities ptr=line; int prev_i = next_i; next_i = strint(ptr); ++i; if (v && next_i!=prev_i+1) if (++warn<5) { cerr<L) { cerr<L && v) cerr<=par.maxres-2) { fgetline(line,LINELEN-1,dbf); // Skip two lines fgetline(line,LINELEN-1,dbf); continue; } for (a=0; a<20 && ptr; ++a) f[i][s2a[a]] = (float) exp(-1.0*strflta(ptr,99999)); //s2a[a]: transform amino acids Sorted by alphabet -> internal numbers for amino acids if (!ptr) return Warning(dbf,line,name); if (v>=4) { printf("%6i ",i); for (a=0; a<20; ++a) printf("%6.3g ",100*f[i][s2a[a]]); printf("\n"); } // Ignore MAP annotation ptr = strscn(ptr); //find next word ptr = strscn_ws(ptr); // ignore word // Read RF and CS annotation ptr = strscn(ptr); if (!ptr) return Warning(dbf,line,name); annotchr[i]=uprchr(*ptr); if (*ptr!='-' && *ptr!=' ' && *ptr!='X' && *ptr!='x') annot=1; ptr = strscn(ptr); switch (*ptr) { case 'H': ss_dssp[i]=1; seq[nss_dssp][i]=*ptr; dssp=1; break; case 'E': ss_dssp[i]=2; seq[nss_dssp][i]=*ptr; dssp=1; break; case 'C': ss_dssp[i]=3; seq[nss_dssp][i]=*ptr; dssp=1; break; case 'S': ss_dssp[i]=4; seq[nss_dssp][i]=*ptr; dssp=1; break; case 'T': ss_dssp[i]=5; seq[nss_dssp][i]=*ptr; dssp=1; break; case 'G': ss_dssp[i]=6; seq[nss_dssp][i]=*ptr; dssp=1; break; case 'B': ss_dssp[i]=7; seq[nss_dssp][i]=*ptr; dssp=1; break; case 'I': dssp=1; case '~': ss_dssp[i]=3; seq[nss_dssp][i]=*ptr; break; case '-': // no SS available from any template case '.': // no clear consensus SS structure case 'X': // no clear consensus SS structure ss_dssp[i]=0; seq[nss_dssp][i]='-'; break; default: ss_dssp[i]=0; seq[nss_dssp][i]=*ptr; break; } // Read insert emission line fgetline(line,LINELEN-1,dbf); // Read seven transition probabilities fgetline(line,LINELEN-1,dbf); ptr=line; for (a=0; a<=D2D && ptr; ++a) tr[i][a] = log2((float) exp(-1.0*strflta(ptr,99999))); //store transition prob's as log2-values if (!ptr) return Warning(dbf,line,name); if (v>=4) { printf(" "); for (a=0; a<=D2D; ++a) printf("%6.3g ",100*fpow2(tr[i][a])); printf("\n"); } } if (line[0]=='/' && line[1]=='/') break; } } //while(getline) if (L==0) return 0; //End of db file -> stop reading in if (v && i!=L) cerr<=par.maxres-2) {i=par.maxres-2; cerr<0) strcat(longname," "); strncat(longname,name,DESCLEN-strlen(longname)-1); // longname = ACC NAME DESC if (strlen(name)>0) strcat(longname," "); strncat(longname,desc,DESCLEN-strlen(longname)-1); longname[DESCLEN-1]='\0'; ScopID(cl,fold,sfam,fam);// get scop classification from basename (e.g. a.1.2.3.4) RemoveExtension(file,filestr); // copy name of dbfile without extension into 'file' // Secondary structure if (!dssp) { // remove dssp sequence delete[] seq[nss_dssp]; // memory that had been allocated in case ss_dssp was given needs to be freed delete[] sname[nss_dssp]; // memory that had been allocated in case ss_dssp was given needs to be freed nss_dssp=-1; k--; } else { seq[nss_dssp][0]='-'; seq[nss_dssp][L+1]='\0'; } if (nss_pred>=0) { for (i=1; i<=L; ++i) ss_pred[i] = ss2i(seq[nss_pred][i]); if (nss_conf>=0) for (i=1; i<=L; ++i) ss_conf[i] = cf2i(seq[nss_conf][i]); else for (i=1; i<=L; ++i) ss_conf[i] = 5; } // Copy query (first sequence) and consensus residues? if (par.showcons) { sname[k]=new(char[10]); strcpy(sname[k],"Consensus"); sname[k+1]=new(char[strlen(longname)+1]); strcpy(sname[k+1],longname); seq[k]=new(char[L+2]); seq[k][0]=' '; seq[k][L+1]='\0'; seq[k+1]=new(char[L+2]); seq[k+1][0]=' '; seq[k+1][L+1]='\0'; for (i=1; i<=L; ++i) { float pmax=0.0; int amax=0; for (a=0; apmax) {amax=a; pmax=f[i][a];} if (pmax>0.6) seq[k][i]=i2aa(amax); else if (pmax>0.4) seq[k][i]=lwrchr(i2aa(amax)); else seq[k][i]='x'; seq[k+1][i]=i2aa(amax); } ncons=k++; // nfirst is set later! } else { sname[k]=new(char[strlen(longname)+1]); strcpy(sname[k],longname); seq[k]=new(char[L+2]); seq[k][0]=' '; seq[k][L+1]='\0'; } if (annot) // read in some annotation characters? { annotchr[0]=' '; annotchr[L+1]='\0'; strcpy(seq[k],annotchr); // overwrite the consensus sequence with the annotation characters } else if (!par.showcons) // we have not yet calculated the consensus, but we need it now as query (first sequence) { for (i=1; i<=L; ++i) { float pmax=0.0; int amax=0; for (a=0; apmax) {amax=a; pmax=f[i][a];} seq[k][i]=i2aa(amax); } } // printf("%i query name=%s seq=%s\n",n,sname[n],seq[n]); nfirst=k++; n_display=k; n_seqs=k; // If no effektive number of sequences is given, calculate Neff_HMM by given profile if (Neff_HMM == 0) { for (i=1; i<=L; ++i) { float S=0.0; for (a=0; a<20; ++a) if (f[i][a]>1E-10) S-=f[i][a]*fast_log2(f[i][a]); Neff_HMM+=(float) fpow2(S); } Neff_HMM/=L; } for (i=0; i<=L; ++i) Neff_M[i] = Neff_I[i] = Neff_D[i] = 10.0; // to add only little additional pseudocounts! Neff_M[L+1]=1.0f; Neff_I[L+1]=Neff_D[L+1]=0.0f; if (v>=2) cout<<"Read in HMM "< pI2I=0 gape=1 -> pI2I=0.75 gape=inf -> pI2I=1. pI2I=1.0*gape/(gape-1+1.0/0.75); pI2M=1-pI2I; // gape=0 -> pD2D=0 gape=1 -> pD2D=0.75 gape=inf -> pD2D=1. pD2D=1.0*gape/(gape-1+1.0/0.75); pD2M=1-pD2D; for (i=0; i<=L; ++i) //for all columns in HMM { // Transitions from M state p0 = (Neff_M[i]-1)*fpow2(tr[i][M2M]) + gapb*pM2M; p1 = (Neff_M[i]-1)*fpow2(tr[i][M2D]) + gapb*pM2D; p2 = (Neff_M[i]-1)*fpow2(tr[i][M2I]) + gapb*pM2I; if (i==0) p1=p2=0; //from M(0) no transition to D(1) and I(0) possible if (i==L) p1=p2=0; //from M(L) no transition to D(L+1) and I(L+1) possible sum = p0+p1+p2+FLT_MIN; tr[i][M2M] = fast_log2(p0/sum); tr[i][M2D] = fast_log2(p1/sum)*gapf; tr[i][M2I] = fast_log2(p2/sum)*gapg; // Transitions from I state p0 = Neff_I[i]*fpow2(tr[i][I2M]) + gapb*pI2M; p1 = Neff_I[i]*fpow2(tr[i][I2I]) + gapb*pI2I; sum = p0+p1+FLT_MIN; tr[i][I2M] = fast_log2(p0/sum); tr[i][I2I] = fast_log2(p1/sum)*gapi; // Transitions from D state p0 = Neff_D[i]*fpow2(tr[i][D2M]) + gapb*pD2M; p1 = Neff_D[i]*fpow2(tr[i][D2D]) + gapb*pD2D; if (i==L) p1=0; //from D(L) no transition to D(L+1) possible sum = p0+p1+FLT_MIN; tr[i][D2M] = fast_log2(p0/sum); tr[i][D2D] = fast_log2(p1/sum)*gaph; } if (v>=4) { printf("\nPseudocount transition probabilities:\n"); printf("pM2M=%4.1f%%, pM2I=%4.1f%%, pM2D=%4.1f%%, ",100*pM2M,100*pM2I,100*pM2D); printf("pI2M=%4.1f%%, pI2I=%4.1f%%, ",100*pI2M,100*pI2I); printf("pD2M=%4.1f%%, pD2D=%4.1f%% ",100*pD2M,100*pD2D); printf("tau = %4.1f%%\n\n",100.*gapb/(Neff_HMM-1+gapb)); printf("Listing transition probabilities WITH pseudocounts:\n"); printf(" i dssp pred sacc M->M M->I M->D I->M I->I D->M D->D\n"); for (i=1; i<=L; ++i) //for all columns in HMM { printf("%4i %1c %1c %1c %6.3f %6.3f %6.3f ",i,i2ss(ss_dssp[i]),i2ss(ss_pred[i]),i2sa(sa_dssp[i]),fpow2(tr[i][M2M]),fpow2(tr[i][M2I]),fpow2(tr[i][M2D])); printf("%6.3f %6.3f ",fpow2(tr[i][I2M]),fpow2(tr[i][I2I])); printf("%6.3f %6.3f ",fpow2(tr[i][D2M]),fpow2(tr[i][D2D])); printf("%1i %2i %1i\n",ss_pred[i],ss_conf[i],ss_dssp[i]); } printf("\n"); printf("nss_dssp=%i nss_pred=%i\n",nss_dssp,nss_pred); } return; } ///////////////////////////////////////////////////////////////////////////////////// // Generate an amino acid frequency matrix g[][] with full pseudocount admixture (tau=1) ///////////////////////////////////////////////////////////////////////////////////// void HMM::PreparePseudocounts() { for (int i=0; i<=L+1; ++i) for (int a=0; a<20; ++a) g[i][a] = ScalarProd20(R[a],f[i]); } ///////////////////////////////////////////////////////////////////////////////////// // Generate an AA frequency matrix g[][] with full context specific pseudocount admixture (tau=1) ///////////////////////////////////////////////////////////////////////////////////// void HMM::AddContextSpecificPseudocounts(char pcm, float pca, float pcb, float pcc) { int i; //position in HMM int a; //amino acid (0..19) if (has_pseudocounts) { pcm = 0; } cs::CountProfile ali_profile(L); fillCountProfile(&ali_profile); cs::Admix *admix = NULL; switch (pcm) { case 1: admix = new cs::ConstantAdmix(pca); break; case 2: case 4: admix = new cs::HHsearchAdmix(pca, pcb, pcc); break; case 3: // TODO break; case 5: // use CS-BLAST Admix for prefiltering admix = new cs::CSBlastAdmix(pca, pcb); break; } if (admix == NULL) { // write cs-pseudocounts in HMM.p for (i=1; i<=L; ++i) for (a=0; a<20; ++a) p[i][a] = f[i][a]; } else { cs::Profile profile(lib_pc->AddTo(ali_profile, *admix)); delete admix; // write cs-pseudocounts in HMM.p for (i=1; i<=L; ++i) for (a=0; a<20; ++a) p[i][a] = profile[i-1][a]; } // DEBUGGING output if (v>=3) { float sum; cout<<"Context specific pseudocounts added!\n\n"; switch (pcm) { case 0: cout<<"No pseudocounts added (-pcm 0)\n"; return; case 1: cout<<"Adding constant AA pseudocount admixture of "<=4) { cout<<"\nAmino acid frequencies WITHOUT pseudocounts:\n A R N D C Q E G H I L K M F P S T W Y V\n"; for (i=1; i<=L; ++i) { printf("%3i: ",i); sum=0; for (a=0; a<20; ++a) { sum+=f[i][a]; printf("%4.1f ",100*f[i][a]); } printf(" sum=%5.3f Neff_M=%4.2f\n",sum,Neff_M[i]); } cout<<"\nAmino acid frequencies WITH pseudocounts:\n A R N D C Q E G H I L K M F P S T W Y V\n"; for (i=1; i<=L; ++i) { printf("%3i: ",i); sum=0; for (a=0; a<20; ++a) { sum+=p[i][a]; printf("%4.1f ",100*p[i][a]); } printf(" sum=%5.3f\n",sum); } } } } ///////////////////////////////////////////////////////////////////////////////////// // Fill CountProfile with actual aa-frequencies ///////////////////////////////////////////////////////////////////////////////////// void HMM::fillCountProfile(cs::CountProfile *csProfile) { for (int i=0; ineff[i] = Neff_M[i+1]; for (int a=0; a<20; ++a) csProfile->counts[i][a] = f[i+1][a] * Neff_M[i+1]; } } ///////////////////////////////////////////////////////////////////////////////////// // Calculate amino acid background frequencies in HMM ///////////////////////////////////////////////////////////////////////////////////// void HMM::CalculateAminoAcidBackground() { int a, i; // initialize vector of average aa freqs with pseudocounts for (a=0; a<20; ++a) pav[a]=pb[a]*100.0f/Neff_HMM; // calculate averages for (i=1; i<=L; ++i) for (a=0; a<20; ++a) pav[a] += p[i][a]; // Normalize vector of average aa frequencies pav[a] NormalizeTo1(pav,NAA); for (a=0; a<20; ++a) p[0][a] = p[L+1][a] = pav[a]; } ///////////////////////////////////////////////////////////////////////////////////// // Add amino acid pseudocounts to HMM and calculate average protein aa probabilities pav[a] // Pseudocounts: p[i][a] = (1-tau)*f[i][a] + tau*g[i][a] ///////////////////////////////////////////////////////////////////////////////////// void HMM::AddAminoAcidPseudocounts(char pcm, float pca, float pcb, float pcc) { int i; //position in HMM int a; //amino acid (0..19) float sum; float tau; //tau = pseudocount admixture if (has_pseudocounts) { pcm = 0; } // Calculate amino acid frequencies p[i][a] = (1-tau(i))*f[i][a] + tau(i)*g[i][a] switch (pcm) { case 0: //no pseudocounts whatsoever: tau=0 for (i=1; i<=L; ++i) for (a=0; a<20; ++a) p[i][a]=f[i][a]; break; case 1: //constant pseudocounts (for optimization): tau = pca tau = pca; for (i=1; i<=L; ++i) for (a=0; a<20; ++a) p[i][a] = (1.-tau)*f[i][a] + tau * g[i][a]; break; case 2: // diversity-dependent (i.e, Neff_M[i]-dependent) pseudocounts if (par.pcc==1.0f) for (i=1; i<=L; ++i) { tau = fmin(1.0, pca/(1. + Neff_M[i]/pcb ) ); for (a=0; a<20; ++a) p[i][a] = (1.-tau)*f[i][a] + tau * g[i][a]; } else for (i=1; i<=L; ++i) { tau = fmin(1.0, pca/(1. + pow((Neff_M[i])/pcb,pcc))); for (a=0; a<20; ++a) p[i][a] = (1.-tau)*f[i][a] + tau * g[i][a]; } break; case 3: // constant-diversity pseudocounts // Is this still used? => scrap? (JS) for (i=1; i<=L; ++i) { float x = Neff_M[i]/pcb; pca = 0.793 + 0.048*(pcb-10.0); tau = fmax(0.0, pca*(1-x + pcc*x*(1-x)) ); for (a=0; a<20; ++a) p[i][a] = (1.-tau)*f[i][a] + tau * g[i][a]; } if (v>=2) { printf("Divergence before / after addition of amino acid pseudocounts: %5.2f / %5.2f\n",Neff_HMM, CalcNeff()); } break; } //end switch (pcm) //turn on pseudocount switch to indicate that HMM contains pseudocounts if (pcm!=0) has_pseudocounts=true; // DEBUGGING output if (v>=3) { switch (pcm) { case 0: cout<<"No pseudocounts added (-pcm 0)\n"; return; case 1: cout<<"Adding constant AA pseudocount admixture of "<=4) { cout<<"\nAmino acid frequencies WITHOUT pseudocounts:\n A R N D C Q E G H I L K M F P S T W Y V\n"; for (i=1; i<=L; ++i) { printf("%3i: ",i); sum=0; for (a=0; a<20; ++a) { sum+=f[i][a]; printf("%4.1f ",100*f[i][a]); } printf(" sum=%5.3f\n",sum); } cout<<"\nAmino acid frequencies WITH pseudocounts:\n A R N D C Q E G H I L K M F P S T W Y V\n"; for (i=1; i<=L; ++i) { printf("%3i: ",i); sum=0; for (a=0; a<20; ++a) { sum+=p[i][a]; printf("%4.1f ",100*p[i][a]); } printf(" sum=%5.3f\n",sum); } } } return; } ///////////////////////////////////////////////////////////////////////////////////// // Divide aa probabilties by square root of locally averaged background frequencies // !!!!! ATTENTION!!!!!!! after this p is not the same as after adding pseudocounts !!! ///////////////////////////////////////////////////////////////////////////////////// void HMM::DivideBySqrtOfLocalBackgroundFreqs(int D) // 2*D+1 is window size { if (divided_by_local_bg_freqs) {cerr<<"WARNING: already divided probs by local aa frequencies!\n"; return;} divided_by_local_bg_freqs=1; int i; // query and template match state indices int a; // amino acid index float fac=1.0/(2*D+1); // 1 / window size float** pnul = new float*[L+1]; // null model probabilities for (i=0; i<=L; ++i) pnul[i] = new(float[NAA]); for (a=0; a average over entire length L if (L <= 2*D+1) { for (i=1; i<=L; ++i) for (a=0; a average over window size 2*D+1 else { // Calculate local amino acid background frequencies in leftmost window (1,..,2*D+1) for (i=1; i<=2*D+1; ++i) for (a=0; a=3) { cout<<"\nLocal amino acid background frequencies\n"; cout<<" A R N D C Q E G H I L K M F P S T W Y V sum\n"; for (i=1; i<=L; ++i) { printf("%-5i ",i); float sum = 0.0; for (a=0; a<20; ++a) { printf("%4.1f ",100*fac*pnul[i][a]); sum += fac*pnul[i][a]; } printf("%4.1f\n",100*sum); } } for (i=0; i<=L; ++i) delete[] pnul[i]; delete[] pnul; return; } ///////////////////////////////////////////////////////////////////////////////////// // Factor Null model into HMM t // !!!!! ATTENTION!!!!!!! after this t->p is not the same as after adding pseudocounts !!! ///////////////////////////////////////////////////////////////////////////////////// void HMM::IncludeNullModelInHMM(HMM* q, HMM* t, int columnscore ) { int i,j; //query and template match state indices int a; //amino acid index // Multiply template frequencies with amino acid weights = 1/background_freq(a) (for all but SOP scores) switch (columnscore) { default: case 0: // Null model with background prob. from database for (j=0; j<=t->L+1; ++j) for (a=0; a<20; ++a) t->p[j][a] /= pb[a]; break; case 1: // Null model with background prob. equal average from query and template float pnul[NAA]; // null model probabilities used in comparison (only set in template/db HMMs) for (a=0; a<20; ++a) pnul[a] = 0.5*(q->pav[a]+t->pav[a]); for (j=0; j<=t->L+1; ++j) for (a=0; a<20; ++a) t->p[j][a] /= pnul[a]; break; case 2: // Null model with background prob. from template protein for (j=0; j<=t->L+1; ++j) for (a=0; a<20; ++a) t->p[j][a] /= t->pav[a]; break; case 3: // Null model with background prob. from query protein for (j=0; j<=t->L+1; ++j) for (a=0; a<20; ++a) t->p[j][a] /= q->pav[a]; break; case 5: // Null model with local background prob. from template and query protein // if (!q->divided_by_local_bg_freqs) q->DivideBySqrtOfLocalBackgroundFreqs(par.half_window_size_local_aa_bg_freqs); if (!q->divided_by_local_bg_freqs) InternalError("No local amino acid bias correction on query HMM!"); if (!t->divided_by_local_bg_freqs) t->DivideBySqrtOfLocalBackgroundFreqs(par.half_window_size_local_aa_bg_freqs); break; case 10: // Separated column scoring for Stochastic Backtracing (STILL USED??) for (i=0; i<=q->L+1; ++i) { float sum = 0.0; for (a=0; a<20; ++a) sum += pb[a]*q->p[i][a]; sum = 1.0/sqrt(sum); for (a=0; a<20; ++a) q->p[i][a]*=sum; } for (j=0; j<=t->L+1; j++) { float sum = 0.0; for (a=0; a<20; ++a) sum += pb[a]*t->p[j][a]; sum = 1.0/sqrt(sum); for (a=0; a<20; ++a) t->p[j][a]*=sum; } break; case 11: // log co-emission probability (no null model) for (a=0; a<20; ++a) pnul[a]=0.05; break; } if (v>=4) { cout<<"\nAverage amino acid frequencies\n"; cout<<" A R N D C Q E G H I L K M F P S T W Y V\n"; cout<<"Q: "; for (a=0; a<20; ++a) printf("%4.1f ",100*q->pav[a]); cout<<"\nT: "; for (a=0; a<20; ++a) printf("%4.1f ",100*t->pav[a]); cout<<"\npb: "; for (a=0; a<20; ++a) printf("%4.1f ",100*pb[a]); } // cout<<"\nWeighted amino acid frequencies WITH pseudocounts:\n A R N D C Q E G H I L K M F P S T W Y V\n"; // for (i=1; i<=L; ++i) // { // printf("%3i: ",i); // float sum=0; // for (a=0; a<20; ++a) // { // sum+=t->p[i][a]; // printf("%4.1f ",100*t->t->p[i][a]); // } // printf(" sum=%5.3f\n",sum); // } return; } ///////////////////////////////////////////////////////////////////////////////////// // Write HMM to output file ///////////////////////////////////////////////////////////////////////////////////// void HMM::WriteToFile(char* outfile) { const int SEQLEN=100; // number of residues per line for sequences to be displayed int i,a; if (trans_lin==1) InternalError("tried to write HMM file with transition pseudocounts in linear representation"); if (divided_by_local_bg_freqs) InternalError("tried to write HMM file with amino acid probabilities divided by sqrt of local background frequencies\n"); FILE *outf=NULL; if (strcmp(outfile,"stdout")) { if (par.append) outf=fopen(outfile,"a"); else outf=fopen(outfile,"w"); if (!outf) OpenFileError(outfile); } else outf = stdout; if (v>=2) cout<<"Writing HMM to "< ",(int)strlen(par.argv[i])); fprintf(outf,"\n"); // print out date stamp time_t* tp=new(time_t); *tp=time(NULL); fprintf(outf,"DATE %s",ctime(tp)); delete tp; // Print out some statistics of alignment fprintf(outf,"LENG %i match states, %i columns in multiple alignment\n",L,l[L]); fprintf(outf,"FILT %i out of %i sequences passed filter (-id %i -cov %i -qid %i -qsc %.2f -diff %i)\n",N_filtered,N_in,par.max_seqid,par.coverage,par.qid,par.qsc,par.Ndiff); fprintf(outf,"NEFF %-4.1f\n",Neff_HMM); if (has_pseudocounts) { fprintf(outf,"PCT true\n"); } // Print selected sequences from alignment (including secondary structure and confidence values, if known) fprintf(outf,"SEQ\n"); for (int n=0; n%s\n",sname[n]); //first sequence character starts at 1; 0 not used. for(unsigned int j=0; jM\tM->I\tM->D\tI->M\tI->I\tD->M\tD->D\tNeff\tNeff_I\tNeff_D\n"); // print out transition probabilities from begin state (virtual match state) fprintf(outf," "); for (a=0; a<=D2D; ++a) fout(outf,-iround(tr[0][a]*HMMSCALE)); fout(outf,iround(Neff_M[0]*HMMSCALE)); fout(outf,iround(Neff_I[0]*HMMSCALE)); fout(outf,iround(Neff_D[0]*HMMSCALE)); fprintf(outf,"\n"); // Start loop for printing HMM columns int h=1; for (i=1; i<=L; ++i) { while(islower(seq[nfirst][h]) && seq[nfirst][h]) h++; fprintf(outf,"%1c %-4i ",seq[nfirst][h++],i); // Print emission probabilities for match state for (a=0; a<20; ++a) fout(outf,-iround(fast_log2(p[i][s2a[a]])*HMMSCALE )); fprintf(outf,"%-i",l[i]); fprintf(outf,"\n"); // Print transition probabilities fprintf(outf," "); for (a=0; a<=D2D; ++a) fout(outf,-iround(tr[i][a]*HMMSCALE)); fout(outf,iround(Neff_M[i]*HMMSCALE)); fout(outf,iround(Neff_I[i]*HMMSCALE)); fout(outf,iround(Neff_D[i]*HMMSCALE)); fprintf(outf,"\n\n"); } // end for(i)-loop for printing HMM columns fprintf(outf,"//\n"); if (strcmp(outfile,"stdout")) fclose(outf); } ///////////////////////////////////////////////////////////////////////////////////// // Write HMM to output file ///////////////////////////////////////////////////////////////////////////////////// void HMM::InsertCalibration(char* infile) { char* line = new(char[LINELEN]); // input line char** lines = new(char*[3*L+100000]); int nline=0; int l; char done=0; // inserted new 'EVD mu sigma' line? // Read from infile all lines and insert the EVD line with lamda and mu coefficients ifstream inf; inf.open(infile, ios::in); if (!inf) OpenFileError(infile); if (v>=2) cout<<"Recording calibration coefficients in "< remove, i.e., read next line while (!done && !strncmp(line,"EVD ",3) && !(line[0]=='/' && line[1]=='/') && nline<2*par.maxres) inf.getline(line,LINELEN); if ((line[0]=='/' && line[1]=='/') || nline>=2*par.maxres) FormatError(infile,"Expecting hhm format"); // Found the SEQ line? -> insert calibration before this line if (!done && (!strncmp("SEQ",line,3) || !strncmp("HMM",line,3)) && (isspace(line[3]) || line[3]=='\0')) { done=1; lines[nline]=new(char[128]); if (!lines[nline]) MemoryError("space to read in HHM file for calibration"); sprintf(lines[nline],"EVD %-7.4f %-7.4f",lamda,mu); nline++; } lines[nline]=new(char[strlen(line)+1]); if (!lines[nline]) MemoryError("space to read in HHM file for calibration"); strcpy (lines[nline],line); nline++; } inf.close(); // Write to infile all lines FILE* infout=fopen(infile,"w"); if (!infout) { cerr<=2) printf("Neutralized His-tag between positions %i and %i\n",imax(i0-8,1),i-1); } // Neutralize C-myc tag if ( (pt=strstr(qseq,"EQKLISEEDL")) ) { if (v>=2) printf("Neutralized C-myc-tag at position %i\n",int(pt-qseq)+1); for (i=pt-qseq+1; i<=pt-qseq+10; ++i) for (a=0; a=2) printf("Neutralized FLAG-tag at position %i\n",int(pt-qseq)+1); for (i=pt-qseq+1; i<=pt-qseq+8; ++i) for (a=0; a1E-10) Neff-=p[i][a]*fast_log2(p[i][a]); return fpow2(Neff/L); } ///////////////////////////////////////////////////////////////////////////////////// // Add secondary structure prediction to alignment (overwrite existing prediction) ///////////////////////////////////////////////////////////////////////////////////// void HMM::AddSSPrediction(char seq_pred[], char seq_conf[]) { unsigned int i; if ((int)strlen(seq_pred)!=L+1) { cerr<<"WARNING: Could not add secondary struture prediction - unequal length!\n"; return; } if (nss_pred >= 0 && nss_conf >= 0) { strcpy(seq[nss_pred],seq_pred); for (i=0; i=0; --k) { seq[k+2] = seq[k]; sname[k+2] = sname[k]; } if (nss_dssp >= 0) { nss_dssp += 2; } if (nsa_dssp >= 0) { nsa_dssp += 2; } if (ncons >= 0) { ncons += 2; } if (nfirst >= 0) { nfirst += 2; } nss_pred=0; seq[nss_pred]=new(char[L+2]); strcpy(seq[nss_pred],seq_pred); for (i=0; i