// hhalignment.C ///////////////////////////////////////////////////////////////////////////////////// //// Class Alignment ///////////////////////////////////////////////////////////////////////////////////// // hhalignment.C #ifndef MAIN #define MAIN #include // cin, cout, cerr #include // ofstream, ifstream #include // printf using std::cout; using std::cerr; using std::endl; using std::ios; using std::ifstream; using std::ofstream; #include // exit #include // strcmp, strstr #include // sqrt, pow #include // INT_MIN #include // FLT_MIN #include // clock #include // islower, isdigit etc #include "util.C" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc. #include "list.h" // list data structure #include "hash.h" // hash data structure #include "hhdecl.C" #include "hhutil.C" // imax, fmax, iround, iceil, ifloor, strint, strscn, strcut, substr, uprstr, uprchr, Basename etc. #include "hhhmm.h" #endif ///////////////////////////////////////////////////////////////////////////////////// // Class Alignment ///////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////// // Object constructor ///////////////////////////////////////////////////////////////////////////////////// Alignment::Alignment(int maxseq, int maxres) { longname = new(char[DESCLEN]); sname = new(char*[maxseq+2]); seq = new(char*[maxseq+2]); l = new(int[maxres]); X = new(char*[maxseq+2]); I = new(short unsigned int*[maxseq+2]); keep = new(char[maxseq+2]); display = new(char[maxseq+2]); wg = new(float[maxseq+2]); nseqs = new(int[maxres+2]); N_in=L=0; nres=NULL; // number of residues per sequence k first=NULL; // first residue in sequence k last=NULL; // last residue in sequence k ksort=NULL; // sequence indices sorted by descending nres[k] name[0]='\0'; // no name defined yet longname[0]='\0'; // no name defined yet fam[0]='\0'; // no name defined yet file[0]='\0'; // no name defined yet readCommentLine = '0'; } ///////////////////////////////////////////////////////////////////////////////////// // Object destructor ///////////////////////////////////////////////////////////////////////////////////// Alignment::~Alignment() { delete[] longname; for(int k=0; k') //line contains sequence name { if (k>=MAXSEQ-1) { if (v>=1 && k>=MAXSEQ) cerr<=0) //if this is at least the second name line { if (strlen(cur_seq)<=1) // 1, because the sequence in cur_seq starts at position 1 => no residues = length 1 { cerr<no_name_123 { cur_name = no_name; sprintf(cur_name,"no_name_%i",k); } else if (cur_name[0]=='@') cur_name = strscn(cur_name+1); //skip @-character in name l=1; //position in current sequence (first=1) cur_seq[l]='\0'; // avoids taking wrong sequence in case the input alignment is corrupted (two header lines with no sequence line between) // display[k]= 0: do not show in Q-T alignments 1: show if not filtered out later 2: show in any case (do not filter out) // keep[k] = 0: do not include in profile 1: include if not filtered out later 2: include in any case (do not filter out) if (!strncmp(line,">ss_dssp",8)) { if (kss_dssp<0) {display[k]=2; n_display++; keep[k]=0; kss_dssp=k; N_ss++;} else {skip_sequence=1; k--; continue;} } else if (!strncmp(line,">sa_dssp",8)) { if (ksa_dssp<0) {display[k]=2; n_display++; keep[k]=0; ksa_dssp=k; N_ss++;} else {skip_sequence=1; k--; continue;} } else if (!strncmp(line,">ss_pred",8)) { if (kss_pred<0) {display[k]=2; n_display++; keep[k]=0; kss_pred=k; N_ss++;} else {skip_sequence=1; k--; continue;} } else if (!strncmp(line,">ss_conf",8)) { if (kss_conf<0) {display[k]=2; n_display++; keep[k]=0; kss_conf=k; N_ss++;} else {skip_sequence=1; k--; continue;} } else if (!strncmp(line,">ss_",4) || !strncmp(line,">sa_",4)) { display[k]=2; n_display++; keep[k]=0; N_ss++; } else if (!strncmp(line,">aa_",4)) { // ignore sequences beginning with ">aa_" skip_sequence=1; k--; continue; } //store first real seq else if (kfirst<0) { char word[NAMELEN]; strwrd(word,line,NAMELEN); // Copies first word in ptr to str if (strstr(word,"_consensus")) {display[k]=2; keep[k]=0; n_display++; kfirst=k;} else {display[k]=keep[k]=2; n_display++; kfirst=k;} } //store all sequences else if (par.mark==0) {display[k]=keep[k]=1; n_display++;} //store sequences up to nseqdis else if (line[1]=='@'&& n_display-N_ss=4) printf("Reading seq %-16.16s k=%3i n_displ=%3i display[k]=%i keep[k]=%i\n",cur_name,k,n_display,display[k],keep[k]); sname[k] = new(char[strlen(cur_name)+1]); if (!sname[k]) {MemoryError("array for sequence names");} strcpy(sname[k],cur_name); } // end if(line contains sequence name) else if (line[0]=='#') // Commentary line? { // #PF01367.9 5_3_exonuc: 5'-3' exonuclease, C-terminal SAM fold; PDB 1taq, 1bgx (T:271-174), 1taq (271-174) if (name[0]) continue; // if already name defined: skip commentary line char *ptr1; ptr1=strscn_(line+1); // set ptr1 to first non-whitespace character after '#' -> AC number strncpy(longname,ptr1,DESCLEN-1); // copy whole commentary line after '# ' into longname longname[DESCLEN-1]='\0'; strtr(longname,""," "); strcut_(ptr1); // cut after AC number and set ptr2 to first non-whitespace character after AC number // char* ptr2=strcut_(ptr1); // **instead** of previous line // strcpy(fam,ptr1); // copy AC number to fam // if (!strncmp(fam,"PF",2)) strcut_(fam,'.'); // if PFAM identifier contains '.' cut it off // strcut_(ptr2); // cut after first word ... strmcpy(name,ptr1,NAMELEN-1); // ... and copy first word into name readCommentLine = '1'; } //line contains sequence residues or SS information and does not belong to a >aa_ sequence else if (!skip_sequence) { if (v>=4) cout<'\0' && l=0) // ignore white-space characters ' ', \t and \n (aa2i()==-1) {cur_seq[l]=line[h]; l++;} else if (aa2i(line[h])==-2 && v) cerr<'\0' && l=0 && ss2i(line[h])<=7) {cur_seq[l]=ss2ss(line[h]); l++;} else if (v) cerr<'\0' && l=0) cur_seq[l++]=line[h]; else if (v) cerr<'\0' && l=0 && ss2i(line[h])<=3) {cur_seq[l]=ss2ss(line[h]); l++;} else if (v) cerr<'\0' && l='0' && line[h]<='9')) {cur_seq[l]=line[h]; l++;} else if (v) cerr<sa_pred etc { while (h'\0' && l='0' && line[h]<='9') || (line[h]>='A' && line[h]<='B')) {cur_seq[l]=line[h]; l++;} else if (v) cerr<=par.maxcol-1) { cerr<=0) //if at least one sequence was read in { seq[k]=new(char[strlen(cur_seq)+2]); if (!seq[k]) MemoryError("array for input sequences"); X[k]=new(char[strlen(cur_seq)+2]); if (!X[k]) MemoryError("array for input sequences"); I[k]=new(short unsigned int[strlen(cur_seq)+2]); if (!I[k]) MemoryError("array for input sequences"); strcpy(seq[k],cur_seq); } else {cerr< extract from first sequence { char* ptr; // strtr(sname[kfirst],"~"," "); // 'transpose': replaces the tilde with a blanc everywhere in sname[kfirst] strncpy(longname,sname[kfirst],DESCLEN-1); // longname is name of first sequence longname[DESCLEN-1]='\0'; strncpy(name,sname[kfirst],NAMELEN-1); // Shortname is first word of longname... name[NAMELEN-1]='\0'; ptr = strcut(name); // ...until first white-space character if (ptr && islower(ptr[0]) && ptr[1]=='.' && isdigit(ptr[2])) //Scop family code present as second word? { lwrstr(name); // Transform upper case to lower case strcut(ptr); // Non-white-space characters until next white-space character.. strcpy(fam,ptr); // ...are the SCOP familiy code } else if (name[0]=='P' && name[1]=='F' && isdigit(name[2]) && isdigit(name[3]) ) //Pfam code { strcpy(fam,name); // set family name = Pfam code } } // Checking for warning messages if (v==0) return; if (v>=2) cout<<"Read "<=3) cout<<"Query sequence for alignment has number "< WARNING static short unsigned int h[MAXSEQ]; //points to next character in seq[k] to be written // for (k=0;k%s\n%s\n",k,sname[k],seq[k]); // DEBUG // Initialize for (k=0;k=3) { if (par.M==1) cout<<"Using match state assignment by capital letters (a2m/a3m format)\n"; else if (par.M==2) cout<<"Using percentage-rule match state assignment\n"; else if (par.M==3) cout<<"Using residues of first sequence as match states\n"; } // Warn, if there are gaps in a single sequence if (v>=1 && N_in-N_ss==1 && par.M!= 2 && strchr(seq[kfirst]+1,'-')!=NULL) fprintf(stderr, "WARNING: File %s has a single sequence containing gaps, which will be ignored.\nIf you want to treat the gaps as match states, use the '-M 100' option.\n",infile); // Too few match states? if (par.M==1) { int match_states = strcount(seq[kfirst],'A','Z') + strcount(seq[kfirst]+1,'-','-'); if (match_states < 6) { if (N_in-N_ss<=1) { par.M=3; // if only single sequence in input file, use par.M=3 (match states by first seq) fprintf(stderr, "WARNING: single sequence in file %s contains only %i match_states! Switching to option -M first\n seq=%s\n",infile,match_states,seq[kfirst]); } else if (v>=1) fprintf(stderr, "WARNING: Master sequence in file %s contains only %i match_states!\nseq=%s\n",infile,match_states,seq[kfirst]); } } // Create matrices X and I with amino acids represented by integer numbers switch(par.M) { ///////////////////////////////////////////////////////////////////////////////////// // a2m/a3m format: match states capital case, inserts lower case, delete states '-', inserted gaps '.' // The confidence values for ss prediction are interpreted as follows: 0-9:match states(!) '-' :match state '.':insert case 1: default: // Warn if alignment is ment to be -M first or -M <%> instead of A2M/A3M if (v>=2 && strchr(seq[kfirst],'-') ) // Seed/query sequence contains a gap ... { unsigned int len = strlen(seq[kfirst])-1; for (k=1; k=N_in) // ... but alignment contains no lower case residue fprintf(stderr,"WARNING: input alignment %s looks like aligned FASTA instead of A2M/A3M format. Consider using '-M first' or '-M 50'\n",infile); } // Remove '.' characters from seq[k] for(k=0; kss_dssp, >ss_pred, >ss_conf, >aa_... sequences { while((c=seq[k][l++])) // assign residue to c at same time { if (c>='a' && c<='z') I[k][i-1]++;//insert state = lower case character else if (c!='.') //match state = upper case character { X[k][i]=aa2i(c); I[k][i]=0; ++i; } } } else if (k==kss_dssp || k==kss_pred) // does alignment contain sequence of secondary structure states? { while((c=seq[k][l++])) // assign residue to c at same time if (c!='.' && !(c>='a' && c<='z')) X[k][i++]=ss2i(c); //match state = upper case character } else if (k==ksa_dssp) // does alignment contain sequence of prediction confidence values? { while((c=seq[k][l++])) // assign residue to c at same time if (c!='.' && !(c>='a' && c<='z')) X[k][i++]=sa2i(c); //match state = upper case character } else if (k==kss_conf) // does alignment contain sequence of prediction confidence values? { while((c=seq[k][l++])) // assign residue to c at same time if (c!='.') X[k][i++]=cf2i(c); //match state = 0-9 or '-' } else if (k==kfirst) // does alignment contain sequence of prediction confidence values? { while((c=seq[k][l++])) // assign residue to c at same time if (c!='.') { X[k][i]=aa2i(c); I[k][i]=0; ++i; } } else continue; i--; if (L!=i && L!=par.maxres-2 && !unequal_lengths) unequal_lengths=k; //sequences have different lengths L=imin(L,i); } if (unequal_lengths) break; //Replace GAP with ENDGAP for all end gaps for (k=0; k=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP; } for (i=1; i<=L; ++i) this->l[i]=i; //assign column indices to match states if (L<=0) { cerr<<"\nError in "< option"<=2) { fprintf(stderr,"WARNING: Number of match columns too large. Only first %i match columns will be kept!\n",L); break; } if (v>=2) cout<<"Alignment in "<=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP; } // Add up percentage of gaps for (l=1; l<=L; l++) { float res=0; float gap=0; for (k=0; k< N_in; ++k) if (keep[k]) { if (X[k][l]=4) cout<<"percent gaps["<=par.maxres-2) { if (v>=1) fprintf(stderr,"WARNING: Number of match columns too large. Only first %i match columns will be kept!\n",i); break; } ++i; this->l[i]=l; for (k=0; k=2) cout<<"Alignment in "<=3) printf("Length of first seq = %i\n",L); // Check for sequences with unequal lengths for (k=1; k match state for (l=1; l<=L; l++) if (isalpha(seq[kfirst][l])) match_state[l]=1; else match_state[l]=0; // Throw out insert states and keep only match states for (k=0; k=par.maxres-2) { if (v>=1) fprintf(stderr,"WARNING: Number of match columns too large. Only first %i match columns will be kept!\n",i); break; } ++i; this->l[i]=l; for (k=0; k=2) cout<<"Alignment in "<=1 && X[k][i]==GAP; i--) X[k][i]=ENDGAP; } delete[] match_state; break; } } //end switch() ///////////////////////////////////////////////////////////////////////////////////// // Error if (unequal_lengths) { strcut(sname[unequal_lengths]); cerr<=2 && !strncmp(infile,"merged A3M",10)) { fprintf(stderr,"Merged MSA:\n"); for (k=0; k<=unequal_lengths; ++k) fprintf(stderr,"%3i\n>%s\n%s\n",k,sname[k],seq[k]); } exit(1); } if (L == 0) { cerr<=2 && !par.cons) { for (i=1; i<=L; ++i) if (X[kfirst][i]==GAP) { printf("NOTE: Use the '-cons' option to calculate a consensus sequence as first sequence of the alignment.\n"); break; } } // DEBUG if (v>=4) for (k=0; k"<9) cout<<"X"; else cout<n_seqs; ++qk) { if (qk==q->ncons) {continue;} // Create space for residues and paste new sequence in seq[k]=new(char[strlen(q->seq[qk])+1]); if (!seq[k]) MemoryError("array for input sequences"); strcpy(seq[k],q->seq[qk]); X[k]=new(char[strlen(q->seq[qk])+1]); if (!X[k]) MemoryError("array for input sequences"); I[k]=new(short unsigned int[strlen(q->seq[qk])+1]); if (!I[k]) MemoryError("array for input sequences"); if (qk==q->nss_dssp) { display[k]=2; n_display++; keep[k]=0; kss_dssp=k; N_ss++; } else if (qk==q->nsa_dssp) { display[k]=2; n_display++; keep[k]=0; ksa_dssp=k; N_ss++; } else if (qk==q->nss_pred) { display[k]=2; n_display++; keep[k]=0; kss_pred=k; N_ss++; } else if (qk==q->nss_conf) { display[k]=2; n_display++; keep[k]=0; kss_conf=k; N_ss++; } else { if (kfirst<0) { display[k]=keep[k]=2; n_display++; kfirst=k; } else { display[k]=keep[k]=1; n_display++; } } // store sequence name if (v>=3) printf("\nReading seq %-16.16s k=%3i n_displ=%3i display[k]=%i keep[k]=%i\n",q->sname[qk],k,n_display,display[k],keep[k]); sname[k] = new(char[strlen(q->sname[qk])+1]); if (!sname[k]) {MemoryError("array for sequence names");} strcpy(sname[k],q->sname[qk]); ++k; } N_in = k; strcpy(longname,q->longname); strmcpy(name,q->name,NAMELEN-1); strmcpy(fam,q->fam,NAMELEN-1); return; } ///////////////////////////////////////////////////////////////////////////////////// // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two) or coverage=0) display[kss_dssp]=0; if (ksa_dssp>=0) display[ksa_dssp]=0; if (kss_pred>=0) display[kss_pred]=0; if (kss_conf>=0) display[kss_conf]=0; for (seqid=imin(10,max_seqid); n_displayN) { for (int k=0; k=0) {display[kss_dssp]=1; n_display++;} if (ksa_dssp>=0) {display[ksa_dssp]=1; n_display++;} if (kss_pred>=0) {display[kss_pred]=1; n_display++;} if (kss_conf>=0) {display[kss_conf]=1; n_display++;} delete[] dummy; return n_display; } ///////////////////////////////////////////////////////////////////////////////////// // Remove sequences with seq. identity larger than seqid percent (remove the shorter of two) or coverage0, remove sequences with minimum-sequence-identity filter of between seqid1 // and seqid2 (%), where the minimum seqid threshold is determined such that, // in all column blocks of at least WMIN=25 residues, at least Ndiff sequences are left. // This ensures that in multi-domain proteins sequences covering one domain are not // removed completely because sequences covering other domains are more diverse. // // Allways the shorter of two compared sequences is removed (=> sort sequences by length first). // Please note: sequence identity of sequence x with y when filtering x is calculated as // number of residues in sequence x that are identical to an aligned residue in y / number of residues in x // Example: two sequences x and y are 100% identical in their overlapping region but one overlaps by 10% of its // length on the left and the other by 20% on the right. Then x has 10% seq.id with y and y has 20% seq.id. with x. ///////////////////////////////////////////////////////////////////////////////////// int Alignment::Filter2(char keep[], int coverage, int qid, float qsc, int seqid1, int seqid2, int Ndiff) { // In the beginnning, keep[k] is 1 for all regular amino acid sequences and 0 for all others (ss_conf, ss_pred,...) // In the end, keep[k] will be 1 for all regular representative sequences kept in the alignment, 0 for all others char* in=new(char[N_in+1]); // in[k]=1: seq k has been accepted; in[k]=0: seq k has not yet been accepted at current seqid char* inkk=new(char[N_in+1]); // inkk[k]=1 iff in[ksort[k]]=1 else 0; int* Nmax=new(int[L+2]); // position-dependent maximum-sequence-identity threshold for filtering? (variable used in former version was idmax) int* idmaxwin=new(int[L+2]); // minimum value of idmax[i-WFIL,i+WFIL] int* seqid_prev=new(int[N_in+1]); // maximum-sequence-identity threshold used in previous round of filtering (with lower seqid) int* N=new(int[L+2]); // N[i] number of already accepted sequences at position i const int WFIL=25; // see previous line int diffNmax=Ndiff; // current maximum difference of Nmax[i] and Ndiff int diffNmax_prev=0; // previous maximum difference of Nmax[i] and Ndiff int seqid; // current maximum value for the position-dependent maximum-sequence-identity thresholds in idmax[] int seqid_step=0; // previous increment of seqid float diff_min_frac; // minimum fraction of differing positions between sequence j and k needed to accept sequence k float qdiff_max_frac=0.9999-0.01*qid; // maximum allowable number of residues different from query sequence int diff; // number of differing positions between sequences j and k (counted so far) int diff_suff; // number of differing positions between sequences j and k that would be sufficient int qdiff_max; // maximum number of residues required to be different from query int cov_kj; // upper limit of number of positions where both sequence k and j have a residue int first_kj; // first non-gap position in sequence j AND k int last_kj; // last non-gap position in sequence j AND k int kk, jj; // indices for sequence from 1 to N_in int k, j; // kk=ksort[k], jj=ksort[j] int i; // counts residues int n; // number of sequences accepted so far // Initialize in[k] for (n=k=0; k=1; i--) if (X[k][i]=N_in) {seqid1=seqid2; Ndiff=N_in; diffNmax=Ndiff;} // Check coverage and sim-to-query criteria for each sequence k for (k=0; k reject once and for all float qsc_sum=0.0; // Check if score-per-column with query is at least qsc if (qsc>-10) { float qsc_min = qsc*nres[k]; // minimum total score of seq k with query int gapq=0, gapk=0; // number of consecutive gaps in query or k'th sequence at position i for (int i=first[k]; i<=last[k]; ++i) { if (X[k][i]<20) { gapk=0; if (X[kfirst][i]<20) { gapq=0; qsc_sum += S[(int)X[kfirst][i]][(int)X[k][i]]; } else if (gapq++) qsc_sum-=PLTY_GAPEXTD; else qsc_sum-=PLTY_GAPOPEN; } else if (X[kfirst][i]<20) { gapq=0; if (gapk++) qsc_sum-=PLTY_GAPEXTD; else qsc_sum-=PLTY_GAPOPEN; } } // printf("k=%3i qsc=%6.2f\n",k,qsc_sum); if (qsc_sum reject once and for all } //Check if sequence similarity with query at least qid? if (qdiff_max_frac<0.999) { qdiff_max=int(qdiff_max_frac*nres[k]+0.9999); // printf("k=%-4i nres=%-4i qdiff_max=%-4i first=%-4i last=%-4i",k,nres[k],qdiff_max,first[k],last[k]); diff=0; for (int i=first[k]; i<=last[k]; ++i) // enough different residues to reject based on minimum qid with query? => break if (X[k][i]<20 && X[k][i]!=X[kfirst][i] && ++diff>=qdiff_max) break; // printf(" diff=%4i\n",diff); if (diff>=qdiff_max) {keep[k]=0; continue;} // too different from query? => reject once and for all } // printf(" qsc=%6.2f qid=%6.2f \n",qsc_sum/nres[k],100.0*(1.0-(float)(diff)/nres[k])); } if (seqid1>seqid2) { for (n=k=0; k0) n++; return n; } // Successively increment idmax[i] at positons where N[i]max) max=N[j]; if (Nmax[i] k) for (kk=0; kk=100) {in[k]=inkk[kk]=1; n++; continue;} float seqidk=seqid1; for (i=first[k]; i<=last[k]; ++i) if (idmaxwin[i]>seqidk) seqidk=idmaxwin[i]; if (seqid==seqid_prev[k]) continue; // sequence has already been rejected at this seqid threshold => reject this time seqid_prev[k]=seqid; diff_min_frac =0.9999-0.01*seqidk; // min fraction of differing positions between sequence j and k needed to accept sequence k // Loop over already accepted sequences for (jj=0; jjnres[k] anyway because of sorting diff=0; for (int i=first_kj; i<=last_kj; ++i) { // enough different residues to accept? => break if (X[k][i]>=NAA || X[j][i]>=NAA) cov_kj--; else if (X[k][i]!=X[j][i] && ++diff>=diff_suff) break; // accept (k,j) } // // DEBUG // printf("%20.20s with %20.20s: diff=%i diff_min_frac*cov_kj=%f diff_suff=%i nres=%i cov_kj=%i\n",sname[k],sname[j],diff,diff_min_frac*cov_kj,diff_suff,nres[k],cov_kj); // printf("%s\n%s\n\n",seq[k],seq[j]); if (diff=kk) // did loop reach end? => accept k. Otherwise reject k (the shorter of the two) { in[k]=inkk[kk]=1; n++; for (i=first[k]; i<=last[k]; ++i) N[i]++; // update number of sequences at position i // printf("%i %20.20s accepted\n",k,sname[k]); } // else // { // printf("%20.20s rejected: too similar with seq %20.20s diff=%i diff_min_frac*cov_kj=%f diff_suff=%i nres=%i cov_kj=%i\n",sname[k],sname[j],diff,diff_min_frac*cov_kj,diff_suff,nres[k],cov_kj); // printf("%s\n%s\n\n",seq[k],seq[j]); // } } // End Loop over all candidate sequences kk // // DEBUG // printf("\n"); // printf("seqid_prev[k]= \n"); // for (k=0; k=2) { printf("%i out of %i sequences passed filter (",n,N_in-N_ss); if (par.coverage) printf("%i%% min coverage, ",coverage); if (qid) printf("%i%% min sequence identity to query, ",qid); if (qsc>-10) printf("%.2f bits min score per column to query, ",qsc); if (Ndiff0) printf("up to %i%% position-dependent max pairwise sequence identity)\n",seqid); else printf("%i%% max pairwise sequence identity)\n",seqid1); } for (k=0; k=1; i--) if (X[k][i]p[i][a]/pb[a]); logodds[i][ANY]=-0.5; // half a bit penalty for X // printf(" A R N D C Q E G H I L K M F P S T W Y V\n"); // printf("%6i ",i); // for (a=0; a<20; ++a) fprintf(stdout,"%5.1f ",100 * qcore->f[i][a]); // printf("\n"); // printf(" "); // for (a=0; a<20; ++a) fprintf(stdout,"%5.1f ",100 * qcore->g[i][a]); // printf("\n"); // printf(" "); // for (a=0; a<20; ++a) fprintf(stdout,"%5.1f ",100 * qcore->p[i][a]); // printf("\n"); // printf(" "); // for (a=0; a<20; ++a) fprintf(stdout,"%5.1f ",100*pb[a]); // printf("\n"); // printf(" "); // for (a=0; a<20; ++a) fprintf(stdout,"%5.2f ",fast_log2(qcore->p[i][a]/pb[a])); // printf("\n"); } // Main loop: test all sequences k for (k=kfirst+1; ktr[i][D2M]; else score+=qcore->tr[i][M2M]; gap=0; } else if (X[k][i]==GAP) // current state is Delete (ignore ENDGAPs) { if (gap) score+=qcore->tr[i][D2D]; else score+=qcore->tr[i][M2D]; gap=1; } if (I[k][i]) score+=qcore->tr[i][M2I]+(I[k][i]-1) * qcore->tr[i][I2I]+qcore->tr[i][I2M]; // if (k==2) printf("i=%3i %c:%c score_M=%6.2f score=%6.2f score_sum=%6.2f \n",i,i2aa(X[kfirst][i]),i2aa(X[k][i]),score_M,score-score_prev,score); // score_prev=score; } printf("k=%3i score=%6.2f\n",k,score); if (score=2) printf("Diversity of unfiltered alignment %.2f is below target diversity %.2f.\n",y0,par.Neff); return; } y1=filter_by_qsc(x1,keep_orig); while (fabs(par.Neff-y)>TOLY && x1-x0>TOLX) { x = x0 + (par.Neff-y0)*(x1-x0)/(y1-y0); // linear interpolation between (x0,y0) and (x1,y1) y = filter_by_qsc(x,keep_orig); if (v>=2) printf(" %3i x0=%6.3f -> %6.3f x=%6.3f -> %6.3f x1=%6.3f -> %6.3f \n",++i,x0,y0,x,y,x1,y1); if (y>par.Neff) {x0=x; y0=y;} else {x1=x; y1=y;} } v=v1; // Write filtered alignment WITH insert states (lower case) to alignment file if (v>=2) printf("Found Neff=%6.3f at filter threshold qsc=%6.3f\n",y,x); return; } float Alignment::filter_by_qsc(float qsc, char* keep_orig) { HMM q; for (int k=0; kNeff_HMM); return q.Neff_HMM; } ///////////////////////////////////////////////////////////////////////////////////// // Calculate AA frequencies q->p[i][a] and transition probabilities q->tr[i][a] from alignment ///////////////////////////////////////////////////////////////////////////////////// void Alignment::FrequenciesAndTransitions(HMM* q, char* in, bool time) { int k; // index of sequence int i; // position in alignment int a; // amino acid (0..19) int ni[NAA+3]; // number of times amino acid a occurs at position i int naa; // number of different amino acids //if (time) { ElapsedTimeSinceLastCall("begin freq and trans"); } //Delete name and seq matrices of old HMM q if (!q->dont_delete_seqs) // don't delete sname and seq if flat copy to hit object has been made { for (k=0; kn_seqs; k++) delete [] q->sname[k]; for (k=0; kn_seqs; k++) delete [] q->seq[k]; } else // Delete all not shown sequences (lost otherwise) { if (q->n_seqs > q->n_display) { for (k=q->n_display; kn_seqs; k++) delete [] q->sname[k]; for (k=q->n_display; kn_seqs; k++) delete [] q->seq[k]; } } if (v>=3) cout<<"Calculating position-dependent weights on subalignments\n"; if (in==NULL) in=keep; //why not in declaration? if (N_filtered>1) { for (k=0; kNeff_HMM=1.0f; for (i=0; i<=L+1; ++i) // for all positions i in alignment { q->Neff_M[i]=1.0f; q->Neff_I[i]=q->Neff_D[i]=0.0f; for (a=0; a<20; ++a) q->f[i][a]=0.0; if (X[kfirst][i] < ANY) q->f[i][(unsigned int) X[kfirst][i] ] = 1.0; else for (a=0; a<20; ++a) q->f[i][a]=pb[a]; q->tr[i][M2M]=0; q->tr[i][M2I]=-100000.0; q->tr[i][M2D]=-100000.0; q->tr[i][I2M]=-100000.0; q->tr[i][I2I]=-100000.0; q->tr[i][D2M]=-100000.0; q->tr[i][D2D]=-100000.0; } q->tr[0][I2M]=0; q->tr[L][I2M]=0; q->tr[0][D2M]=0; q->Neff_M[0]=q->Neff_I[0]=q->Neff_D[0]=99.999; // Neff_av[0] is used for calculation of transition pseudocounts for the start state } if (v>=3) { printf("\nMatches:\n"); printf("col Neff nseqs\n"); for (i=1; i<=imin(L,100); ++i) printf("%3i %5.2f %3i\n",i,q->Neff_M[i],nseqs[i]); printf("\nInserts:\n"); printf("col Neff nseqs\n"); for (i=1; i<=imin(L,100); ++i) printf("%3i %5.2f %3i\n",i,q->Neff_I[i],nseqs[i]); printf("\nDeletes:\n"); printf("col Neff nseqs\n"); for (i=1; i<=imin(L,100); ++i) printf("%3i %5.2f %3i\n",i,q->Neff_D[i],nseqs[i]); } // Copy column information into HMM q q->L=L; q->N_in=N_in; q->N_filtered=N_filtered; for (i=1; i<=L; ++i) q->l[i]=l[i]; // Set names in HMM q if (strlen(q->name)==0) strcpy(q->name,name); if (strlen(q->longname)==0) strcpy(q->longname,longname); if (strlen(q->fam)==0) strcpy(q->fam,fam); ScopID(q->cl,q->fold,q->sfam,q->fam); // derive superfamily, fold and class code from family name strcpy(q->file,file); // Store basename of alignment file name in q->file // Copy sequences to be displayed into HMM q->nss_dssp=q->nsa_dssp=q->nss_pred=q->nss_conf=q->nfirst=-1; int n=0; if (kss_dssp>=0) q->nss_dssp=n++; // copy dssp sequence? if (ksa_dssp>=0) q->nsa_dssp=n++; // copy dssp sequence? if (kss_pred>=0) q->nss_pred=n++; // copy psipred sequence? if (kss_conf>=0) q->nss_conf=n++; // copy confidence value sequence? //if (time) { ElapsedTimeSinceLastCall("Copy to HMM"); } // Calculate consensus sequence? if (par.showcons || par.cons) { float maxw; int maxa; if (par.showcons) { // Reserve space for consensus/conservation sequence as Q-T alignment mark-up q->ncons=n++; q->sname[q->ncons]=new(char[10]); if (!q->sname[q->ncons]) {MemoryError("array of names for displayed sequences");} strcpy(q->sname[q->ncons],"Consensus"); q->seq[q->ncons]=new(char[L+2]); if (!q->seq[q->ncons]) {MemoryError("array of names for displayed sequences");} } if (par.cons) { // Reserve space for consensus sequence as first sequence in alignment q->nfirst=n++; kfirst=-1; q->sname[q->nfirst]=new(char[strlen(name)+11]); if (!q->sname[q->nfirst]) {MemoryError("array of names for displayed sequences");} strcpy(q->sname[q->nfirst],name); strcat(q->sname[q->nfirst],"_consensus"); q->seq[q->nfirst]=new(char[L+2]); if (!q->seq[q->nfirst]) {MemoryError("array of names for displayed sequences");} } // Calculate consensus amino acids using similarity matrix for (i=1; i<=L; ++i) { maxw=0.0; maxa=0; for (a=0; a<20; ++a) if (q->f[i][a]-pb[a]>maxw) {maxw = q->f[i][a]-pb[a]; maxa = a;} if (par.showcons) { maxw =0.0; for (int b=0; b<20; b++) maxw += q->f[i][b]*Sim[maxa][b]*Sim[maxa][b]; maxw *= q->Neff_M[i]/(q->Neff_HMM+1); // columns with many gaps don't get consensus symbol if (maxw>0.6) q->seq[q->ncons][i] = uprchr(i2aa(maxa)); else if (maxw>0.4) q->seq[q->ncons][i] = lwrchr(i2aa(maxa)); else q->seq[q->ncons][i] = 'x'; } if (par.cons) q->seq[q->nfirst][i] = uprchr(i2aa(maxa)); } if (par.showcons) { q->seq[q->ncons][0]='-'; q->seq[q->ncons][L+1]='\0'; } if (par.cons) { q->seq[q->nfirst][0]='-'; q->seq[q->nfirst][L+1]='\0'; } } //if (time) { ElapsedTimeSinceLastCall("Calc consensus sequence"); } // Copy sequences to be displayed from alignment to HMM for (k=0; k=MAXSEQDIS) { if (par.mark) cerr<<"WARNING: maximum number "<nss_dssp; // copy dssp sequence to nss_dssp else if (k==ksa_dssp) nn=q->nsa_dssp; else if (k==kss_pred) nn=q->nss_pred; else if (k==kss_conf) nn=q->nss_conf; else if (k==kfirst) nn=q->nfirst=n++; else nn=n++; // strcut(sname[k]," "); // delete rest of name line beginning with two spaces " " // Why this?? Problem for pdb seqs without chain q->sname[nn]=new(char[strlen(sname[k])+1]); if (!q->sname[nn]) {MemoryError("array of names for displayed sequences");} strcpy(q->sname[nn],sname[k]); q->seq[nn]=new(char[strlen(seq[k])+1]); if (!q->seq[nn]) {MemoryError("array of names for displayed sequences");} strcpy(q->seq[nn],seq[k]); } } q->n_display=n; // how many sequences to be displayed in alignments? q->n_seqs=n; // Copy secondary structure information into HMM if (kss_dssp>=0) for (i=1; i<=L; ++i) q->ss_dssp[i]=X[kss_dssp][i]; if (ksa_dssp>=0) for (i=1; i<=L; ++i) q->sa_dssp[i]=X[ksa_dssp][i]; if (kss_pred>=0) { for (i=1; i<=L; ++i) q->ss_pred[i]=X[kss_pred][i]; if (kss_conf>=0) for (i=1; i<=L; ++i) q->ss_conf[i]=X[kss_conf][i]; else for (i=1; i<=L; ++i) q->ss_conf[i]=5; } q->lamda=0.0; q->mu=0.0; q->trans_lin=0; // transition probs in log space q->has_pseudocounts=false; q->dont_delete_seqs=false; q->divided_by_local_bg_freqs=false; //if (time) { ElapsedTimeSinceLastCall("Copy sequences and SS"); } // Debug: print occurence of amino acids for each position i if (v>=2) printf("Effective number of sequences exp(entropy) = %-4.1f\n",q->Neff_HMM); //PRINT if (v>=3) { cout<<"\nMatr: "; for (a=0; a<20; ++a) printf("%4.1f ",100*pb[a]); cout<<"\nAmino acid frequencies without pseudocounts:\n"; cout<<" A R N D C Q E G H I L K M F P S T W Y V\n"; for (i=1; i<=L; ++i) { printf("%3i: ",i); for (a=0; a<20; ++a) printf("%4.0f ",100 * q->f[i][a]); cout<M M->I M->D I->M I->I D->M D->D Neff_M Neff_I Neff_D\n"); for (i=0; i<=L; ++i) { printf("%4i %6.3f %6.3f %6.3f ",i,pow(2.0,q->tr[i][M2M]),pow(2.0,q->tr[i][M2I]),pow(2.0,q->tr[i][M2D])); printf("%6.3f %6.3f ",pow(2.0,q->tr[i][I2M]),pow(2.0,q->tr[i][I2I])); printf("%6.3f %6.3f ",pow(2.0,q->tr[i][D2M]),pow(2.0,q->tr[i][D2D])); printf("%6.3f %6.3f %6.3f\n",q->Neff_M[i],q->Neff_I[i],q->Neff_D[i]); } } return; } ///////////////////////////////////////////////////////////////////////////////////// // Calculate freqs q->f[i][a] and transitions q->tr[i][a] (a=MM,MI,MD) with pos-specific subalignments // Pos-specific weights are calculated like in "GetPositionSpecificWeights()" ///////////////////////////////////////////////////////////////////////////////////// void Alignment::Amino_acid_frequencies_and_transitions_from_M_state(HMM* q, char* in) { // Calculate position-dependent weights wi[k] for each i. // For calculation of weights in column i use sub-alignment // over sequences which have a *residue* in column i (no gap, no end gap) // and over columns where none of these sequences has an end gap. // This is done by updating the arrays n[j][a] at each step i-1->i while letting i run from 1 to L. // n[j][a] = number of occurences of amino acid a at column j of the subalignment, // => only columns with n[j][ENDGAP]=0 are contained in the subalignment! // If no sequences enter or leave the subalignment at the step i-1 -> i (i.e. change=0) // then the old values wi[k], Neff[i-1], and ncol are used for the new position i. // Index a can be an amino acid (0-19), ANY=20, GAP=21, or ENDGAP=22 int k; // index of sequence int i,j; // position in alignment int a; // amino acid (0..19) int naa; // number of different amino acids int** n; // n[j][a] = number of seq's with some residue at column i AND a at position j float wi[MAXSEQ]; // weight of sequence k in column i, calculated from subalignment i float Neff[par.maxres]; // diversity of subalignment i int nseqi=0; // number of sequences in subalignment i int ncol=0; // number of columns j that contribute to Neff[i] char change; // has the set of sequences in subalignment changed? 0:no 1:yes float fj[NAA+3]; // to calculate entropy float sum; // Global weights? if (par.wg==1) for (k=0; kNeff_HMM=0.0f; Neff[0]=0.0; // if the first column has no residues (i.e. change==0), Neff[i]=Neff[i-1]=Neff[0] n = new(int*[L+2]); for (j=1; j<=L; ++j) n[j]=new(int[NAA+3]); for (j=1; j<=L; ++j) for (a=0; a=ANY && X[k][i]=ANY) { // ... if sequence k WAS included in i-1 and has to be thrown out for column i change=1; nseqi--; for (int j=1; j<=L; ++j) n[j][ (int)X[k][j]]--; } } //end for (k) nseqs[i]=nseqi; // If subalignment changed: update weights wi[k] and Neff[i] if (change) { // Initialize weights and numbers of residues for subalignment i ncol=0; for (k=0; kMAXENDGAPFRAC*nseqi) continue; naa=0; for (a=0; a<20; ++a) if(n[j][a]) naa++; if (naa==0) continue; ncol++; for (k=0; kMAXENDGAPFRAC*nseqi) continue; for (a=0; a<20; ++a) fj[a]=0; for (k=0; k1E-10) Neff[i]-=fj[a]*fast_log2(fj[a]); } if (ncol>0) Neff[i]=pow(2.0,Neff[i]/ncol); else Neff[i]=1.0; } else //no update was necessary; copy values for i-1 { Neff[i]=Neff[i-1]; } } // Calculate amino acid frequencies q->f[i][a] from weights wi[k] for (a=0; a<20; ++a) q->f[i][a]=0; for (k=0; kf[i][ (int)X[k][i] ]+=wi[k]; NormalizeTo1(q->f[i],NAA,pb); // Calculate transition probabilities from M state q->tr[i][M2M]=q->tr[i][M2D]=q->tr[i][M2I]=0.0; for (k=0; ktr[i][M2I]+=wi[k]; else if (X[k][i+1]<=ANY) //next state is M q->tr[i][M2M]+=wi[k]; else if (X[k][i+1]==GAP) //next state is D q->tr[i][M2D]+=wi[k]; } } // end for(k) // Normalize and take log sum = q->tr[i][M2M]+q->tr[i][M2I]+q->tr[i][M2D]+FLT_MIN; q->tr[i][M2M]=log2(q->tr[i][M2M]/sum); q->tr[i][M2I]=log2(q->tr[i][M2I]/sum); q->tr[i][M2D]=log2(q->tr[i][M2D]/sum); // for (k=0; ktr[0][M2M]=0; q->tr[0][M2I]=-100000; q->tr[0][M2D]=-100000; q->tr[L][M2M]=0; q->tr[L][M2I]=-100000; q->tr[L][M2D]=-100000; q->Neff_M[0]=99.999; // Neff_av[0] is used for calculation of transition pseudocounts for the start state // Set emission probabilities of zero'th (begin) state and L+1st (end) state to background probabilities for (a=0; a<20; ++a) q->f[0][a]=q->f[L+1][a]=pb[a]; // Assign Neff_M[i] and calculate average over alignment, Neff_M[0] if (par.wg==1) { for (i=1; i<=L; ++i) { float sum=0.0f; for (a=0; a<20; ++a) if (q->f[i][a]>1E-10) sum -= q->f[i][a]*fast_log2(q->f[i][a]); q->Neff_HMM+=pow(2.0,sum); } q->Neff_HMM/=L; float Nlim=fmax(10.0,q->Neff_HMM+1.0); // limiting Neff float scale=log2((Nlim-q->Neff_HMM)/(Nlim-1.0)); // for calculating Neff for those seqs with inserts at specific pos for (i=1; i<=L; ++i) { float w_M=-1.0/N_filtered; for (k=0; kNeff_M[i]=1.0; else q->Neff_M[i] = Nlim - (Nlim-1.0)*fpow2(scale*w_M); // fprintf(stderr,"M i=%3i ncol=--- Neff_M=%5.2f Nlim=%5.2f w_M=%5.3f Neff_M=%5.2f\n",i,q->Neff_HMM,Nlim,w_M,q->Neff_M[i]); } } else { for (i=1; i<=L; ++i) { q->Neff_HMM+=Neff[i]; q->Neff_M[i]=Neff[i]; if (q->Neff_M[i] == 0) { q->Neff_M[i] = 1; } } q->Neff_HMM/=L; } return; } ///////////////////////////////////////////////////////////////////////////////////// // Calculate transitions q->tr[i][a] (a=DM,DD) with pos-specific subalignments ///////////////////////////////////////////////////////////////////////////////////// void Alignment::Transitions_from_I_state(HMM* q, char* in) { // Calculate position-dependent weights wi[k] for each i. // For calculation of weights in column i use sub-alignment // over sequences which have a INSERT in column i // and over columns where none of these sequences has an end gap. // This is done by calculating the arrays n[j][a] and rj[j] at each step i-1->i while letting i run from 1 to L. // n[j][a] = number of occurences of amino acid a at column j of the subalignment, // => only columns with n[j][ENDGAP]=0 are contained in the subalignment! // If no sequences enter or leave the subalignment at the step i-1 -> i (i.e. change=0) // then the old values wi[k], Neff[i-1], and ncol are used for the new position i. // Index a can be an amino acid (0-19), ANY=20, GAP=21, or ENDGAP=22 int k; // index of sequence int i,j; // position in alignment int a; // amino acid (0..19) int naa; // number of different amino acids int** n; // n[j][a] = number of seq's with some residue at column i AND a at position j float wi[MAXSEQ]; // weight of sequence k in column i, calculated from subalignment i float Neff[par.maxres]; // diversity of subalignment i int nseqi; // number of sequences in subalignment i int ncol; // number of columns j that contribute to Neff[i] float fj[NAA+3]; // to calculate entropy float sum; float Nlim=0.0; // only for global weights float scale=0.0; // only for global weights // Global weights? //if (par.wg==1) if (1) { for (k=0; kNeff_HMM+1.0); // limiting Neff scale=log2((Nlim-q->Neff_HMM)/(Nlim-1.0)); // for calculating Neff for those seqs with inserts at specific pos } // Initialization n = new(int*[L+2]); for (j=1; j<=L; ++j) n[j]=new(int[NAA+3]); ////////////////////////////////////////////////////////////////////////////////////////////// // Main loop through alignment columns for (i=1; i<=L; ++i) // Calculate wi[k] at position i as well as Neff[i] { //if (par.wg==0) // local weights? if (0) { // Calculate n[j][a] and ri[j] nseqi=0; for (k=0; k0) { if (nseqi==0) // Initialize only if inserts present! Otherwise O(L*L) even for single sequences! { // Initialization of n[j][a] for (j=1; j<=L; ++j) for (a=0; atr[i][I2M]=-100000; q->tr[i][I2I]=-100000; continue; } // update weights wi[k] and Neff[i] // if (1) { // Initialize weights and numbers of residues for subalignment i ncol=0; for (k=0; kMAXENDGAPFRAC*nseqi) continue; naa=0; for (a=0; a<20; ++a) if(n[j][a]) naa++; if (naa==0) continue; ncol++; for (k=0; k0 && X[k][j]=NCOLMIN) // Take global weights for (k=0; k0) wi[k]=wg[k]; else wi[k]=0.0; // Calculate Neff[i] Neff[i]=0.0; for (j=1; j<=L; ++j) { if (n[j][ENDGAP]>MAXENDGAPFRAC*nseqi) continue; for (a=0; a<20; ++a) fj[a]=0; for (k=0; k0 && X[k][j]1E-10) Neff[i]-=fj[a]*fast_log2(fj[a]); } if (ncol>0) Neff[i]=pow(2.0,Neff[i]/ncol); else Neff[i]=1.0; } // Calculate transition probabilities from I state q->tr[i][I2M]=q->tr[i][I2I]=0.0; for (k=0; k0) //current state is I { q->tr[i][I2M]+=wi[k]; q->tr[i][I2I]+=wi[k]*(I[k][i]-1); } } // end for(k) } else // fast global weights? { float w_I=-1.0/N_filtered; ncol=0; q->tr[i][I2M]=q->tr[i][I2I]=0.0; // Calculate amino acid frequencies fj[a] from weights wg[k] for (k=0; k0) { ncol++; w_I+=wg[k]; q->tr[i][I2M]+=wi[k]; q->tr[i][I2I]+=wi[k]*(I[k][i]-1); } if (ncol>0) { if (w_I<0) Neff[i]=1.0; else Neff[i] = Nlim - (Nlim-1.0)*fpow2(scale*w_I); // fprintf(stderr,"I i=%3i ncol=%3i Neff_M=%5.2f Nlim=%5.2f w_I=%5.3f Neff_I=%5.2f\n",i,ncol,q->Neff_HMM,Nlim,w_I,Neff[i]); } else { Neff[i]=0.0; q->tr[i][I2M]=-100000; q->tr[i][I2I]=-100000; continue; } } // Normalize and take log sum = q->tr[i][I2M]+q->tr[i][I2I]; q->tr[i][I2M]=log2(q->tr[i][I2M]/sum); q->tr[i][I2I]=log2(q->tr[i][I2I]/sum); } // end loop through alignment columns i ////////////////////////////////////////////////////////////////////////////////////////////// // delete n[][] for (j=1; j<=L; ++j) delete[](n[j]); delete[](n); q->tr[0][I2M]=0; q->tr[0][I2I]=-100000; q->tr[L][I2M]=0; q->tr[L][I2I]=-100000; q->Neff_I[0]=99.999; // Assign Neff_I[i] for (i=1; i<=L; ++i) // Calculate wi[k] at position i as well as Neff[i] and Neff[i] q->Neff_I[i]=Neff[i]; return; } ///////////////////////////////////////////////////////////////////////////////////// // Calculate transitions q->tr[i][a] (a=DM,DD) with pos-specific subalignments ///////////////////////////////////////////////////////////////////////////////////// void Alignment::Transitions_from_D_state(HMM* q, char* in) { // Calculate position-dependent weights wi[k] for each i. // For calculation of weights in column i use sub-alignment // over sequences which have a DELETE in column i // and over columns where none of these sequences has an end gap. // This is done by updating the arrays n[j][a] and rj[j] at each step i-1->i while letting i run from 1 to L. // n[j][a] = number of occurences of index a at column j of the subalignment, // => only columns with n[j][ENDGAP]=0 are contained in the subalignment! // If no sequences enter or leave the subalignment at the step i-1 -> i (i.e. change=0) // then the old values wi[k], Neff[i-1], and ncol are used for the new position i. // Index a can be an amino acid (0-19), ANY=20, GAP=21, or ENDGAP=22 int k; // index of sequence int i,j; // position in alignment int a; // amino acid (0..19) int naa; // number of different amino acids int** n; // n[j][a] = number of seq's with some residue at column i AND a at position j float wi[MAXSEQ]; // weight of sequence k in column i, calculated from subalignment i float Neff[par.maxres]; // diversity of subalignment i int nseqi=0; // number of sequences in subalignment i (for DEBUGGING) int ncol=0; // number of columns j that contribute to Neff[i] char change; // has the set of sequences in subalignment changed? 0:no 1:yes float fj[NAA+3]; // to calculate entropy float sum; float Nlim=0.0; // only for global weights float scale=0.0; // only for global weights // Global weights? //if (par.wg==1) if(1) { for (k=0; kNeff_HMM+1.0); // limiting Neff scale=log2((Nlim-q->Neff_HMM)/(Nlim-1.0)); // for calculating Neff for those seqs with dels at specific pos } // Initialization n = new(int*[L+2]); for (j=1; j<=L; ++j) n[j]=new(int[NAA+3]); for (j=1; j<=L; ++j) for (a=0; atr[i][D2M]=-100000; q->tr[i][D2D]=-100000; continue; } // If subalignment changed: update weights wi[k] and Neff[i] if (change) { // Initialize weights and numbers of residues for subalignment i ncol=0; for (k=0; kMAXENDGAPFRAC*nseqi) continue; naa=0; for (a=0; a<20; ++a) if(n[j][a]) naa++; if (naa==0) continue; ncol++; for (k=0; kMAXENDGAPFRAC*nseqi) continue; for (a=0; a<20; ++a) fj[a]=0; for (k=0; k1E-10) Neff[i]-=fj[a]*fast_log2(fj[a]); } if (ncol>0) Neff[i]=pow(2.0,Neff[i]/ncol); else Neff[i]=1.0; } else //no update was necessary; copy values for i-1 { Neff[i]=Neff[i-1]; } // Calculate transition probabilities from D state q->tr[i][D2M]=q->tr[i][D2D]=0.0; for (k=0; ktr[i][D2D]+=wi[k]; else if (X[k][i+1]<=ANY) //next state is M q->tr[i][D2M]+=wi[k]; } } // end for(k) } else // fast global weights? { float w_D=-1.0/N_filtered; ncol=0; q->tr[i][D2M]=q->tr[i][D2D]=0.0; // Calculate amino acid frequencies fj[a] from weights wg[k] for (k=0; ktr[i][D2D]+=wi[k]; else if (X[k][i+1]<=ANY) //next state is M q->tr[i][D2M]+=wi[k]; } if (ncol>0) { if (w_D<0) Neff[i]=1.0; else Neff[i] = Nlim - (Nlim-1.0)*fpow2(scale*w_D); // fprintf(stderr,"D i=%3i ncol=%3i Neff_M=%5.2f Nlim=%5.2f w_D=%5.3f Neff_D=%5.2f\n",i,ncol,q->Neff_HMM,Nlim,w_D,Neff[i]); } else { Neff[i]=0.0; // effective number of sequences = 0! q->tr[i][D2M]=-100000; q->tr[i][D2D]=-100000; continue; } } // Normalize and take log sum = q->tr[i][D2M]+q->tr[i][D2D]; q->tr[i][D2M]=log2(q->tr[i][D2M]/sum); q->tr[i][D2D]=log2(q->tr[i][D2D]/sum); } // end loop through alignment columns i ////////////////////////////////////////////////////////////////////////////////////////////// q->tr[0][D2M]=0; q->tr[0][D2D]=-100000; q->Neff_D[0]=99.999; // Assign Neff_D[i] for (i=1; i<=L; ++i) q->Neff_D[i]=Neff[i]; // delete n[][] for (j=1; j<=L; ++j) delete[](n[j]); delete[](n); return; } ///////////////////////////////////////////////////////////////////////////////////// // Write alignment without insert states (lower case) to alignment file? ///////////////////////////////////////////////////////////////////////////////////// void Alignment::WriteWithoutInsertsToFile(const char* alnfile) { if (v>=2) cout<<"Writing alignment to "<=2) cout<<"Writing alignment to "<%s\n",sname[k]); for (int i=1; i<=L; ++i) fprintf(alnf,"%c",i2aa(X[k][i])); fprintf(alnf,"\n"); } // Write other sequences for (int k=0; k0) or display is obligatory (display[k]==2) { fprintf(alnf,">%s\n",sname[k]); for (int i=1; i<=L; ++i) fprintf(alnf,"%c",i2aa(X[k][i])); fprintf(alnf,"\n"); } fclose(alnf); } ///////////////////////////////////////////////////////////////////////////////////// // Write stored, filtered sequences WITH insert states (lower case) to alignment file? ///////////////////////////////////////////////////////////////////////////////////// void Alignment::WriteToFile(const char* alnfile, const char format[]) { FILE* alnf=NULL; char* tmp_name = new(char[NAMELEN]); if(strcmp(alnfile, "stdout") != 0) { if (!par.append) alnf = fopen(alnfile, "w"); else alnf = fopen(alnfile, "a"); } else { alnf = stdout; } if (!alnf) OpenFileError(alnfile); if (!format || !strcmp(format,"a3m")) { if (v>=2) cout<<"Writing A3M alignment to "<%s\n%s\n",sname[k],seq[k]+1); // Write other sequences for (int k=0; k0) or display is obligatory (display[k]==2) fprintf(alnf,">%s\n%s\n",sname[k],seq[k]+1); } else // PSI-BLAST format { if (v>=2) cout<<"Writing PSI-BLAST-formatted alignment to "<0) or display is obligatory (display[k]==2) { strwrd(tmp_name,sname[k],NAMELEN); fprintf(alnf,"%-20.20s ",tmp_name); // for (int i=1; i<=L; ++i) fprintf(alnf,"%c",i2aa(X[k][i])); // fprintf(alnf,"\n"); char* ptr=seq[k]; for (; *ptr!='\0'; ptr++) if (*ptr==45 || (*ptr>=65 && *ptr<=90)) fprintf(alnf,"%c",*ptr); fprintf(alnf,"\n"); } } delete[] tmp_name; if (strcmp(alnfile, "stdout")) fclose(alnf); } ///////////////////////////////////////////////////////////////////////////////////// // Read a3m slave alignment of hit from file and merge into (query) master alignment ///////////////////////////////////////////////////////////////////////////////////// void Alignment::MergeMasterSlave(Hit& hit, Alignment& Tali, char* ta3mfile) { char* cur_seq = new(char[par.maxcol]); // Sequence currently read in int maxcol=par.maxcol; int l,ll; // position in unaligned template (T) sequence Tali.seq[l] int i; // counts match states in query (Q) HMM int j; // counts match states in T sequence Tali.seq[l] int h; // position in aligned T sequence cur_seq[h] int k; // sequence index char c; // if (v>=3) printf("Merging %s to query alignment\n",ta3mfile); // Record imatch[j] int* imatch=new(int[hit.j2+1]); int step = hit.nsteps; for (j=hit.j1; j<=hit.j2; ++j) { // Advance to position of next T match state j while (hit.j[step]3) { printf("step=%-3i i=%-3i j=%-3i i1: %4i i2: %4i j1: %4i j2: %4i\n",step,imatch[j],j,hit.i1,hit.i2,hit.j1,hit.j2); } } // Determine number of match states of Qali for (L=0,l=1; seq[kfirst][l]>'\0'; l++) if ((seq[kfirst][l]>='A' && seq[kfirst][l]<='Z') || seq[kfirst][l]=='-') L++; // For each sequence in T alignment: align to Qali for (k=0; k=MAXSEQ) { fprintf(stderr,"WARNING in %s: maximum number of %i sequences exceeded while reading %s. Skipping all following sequences of this MSA\n",program_name,MAXSEQ,ta3mfile); break; } cur_seq[0]=' '; // 0'th position not used // Add the hit.i1-1 left end gaps to aligned sequence for (h=1; h'\0'; l++) if ((c>='A' && c<='Z') || c=='-') // match state at position l? if ((++j)==hit.j1) break; // yes: increment j. Reached hit,j1? yes: break if (j'\0' && ((c>='a' && c<='z') || c=='.')) ; int di=i-iprev; // number of Match states in Q between T match state j-1 and j int dl=l-lprev; // 1 + number of inserted residues in T sequence between T match state j-1 and j if (di==1) { // One Q match state for one T match state (treated as special case for speed reasons) // i: i-1 i di=1 // Q: XXXXXX.....XXXXXX // T: YYYYYYyyyyyYYYYYY // j: j-1 j // l: lprev l dl=6 // Inserts in lower case for (ll=lprev+1; ll upper case cur_seq[h++] = Tali.seq[k][ll]; } else if (di==0) { // Gap in query: no Q match state for on T match state (special case for speed reasons) // i: i-1 i-1 di=0 // Q: XXXXXX.....~~~XXX // T: YYYYYYyyyyyYYYYYY // j: j-1 j // l: lprev l dl=6 // All T residues (including T match state) in lower case for (ll=lprev+1; ll<=l; ll++) if (Tali.seq[k][ll]!='-' && Tali.seq[k][ll]!='.') cur_seq[h++] = lwrchr(Tali.seq[k][ll]); } else if (di>=dl) { // More Match states in Q than Inserts in the T sequence // => half T inserts y left, half right-aligned in uc, gaps to fill up // Number of T insert residues to be left-aligned: (int)(dl/2) // i: iprev i di=7 // Q: XXXXXXXXXXXXXXXXXX // T: YYYYYYYyyy-yyYYYYY // j: j-1 j // l: lprev l dl=6 // Add left-bounded template residues for (ll=lprev+1; ll<=lprev+(int)(dl/2); ll++) cur_seq[h++]=uprchr(Tali.seq[k][ll]); // Add central gaps for (int gap=1; gap<=di-dl; gap++) cur_seq[h++]='-'; // Add right-bounded residues for (; ll<=l; ll++) cur_seq[h++]=uprchr(Tali.seq[k][ll]); } else if (di half of available space di for left- half for right-aligned T inserts, rest in lc // number of T inserts to be left-aligned in uc: (int)(di/2), // i: iprev i di=5 // Q: XXXXXXXXX.XXXXXXX // T: YYYYYYYyyyyyYYYYY // j: j-1 j // l: lprev l dl=6 // Add left-bounded template residues for (ll=lprev+1; ll<=lprev+(int)(di/2); ll++) cur_seq[h++]=uprchr(Tali.seq[k][ll]); // Add central inserts for (int ins=1; ins<=dl-di; ins++,ll++) if (Tali.seq[k][ll]!='-' && Tali.seq[k][ll]!='.') cur_seq[h++] = lwrchr(Tali.seq[k][ll]); // Add right-bounded residues for (; ll<=l; ll++) cur_seq[h++]=uprchr(Tali.seq[k][ll]); } if (v>4) { printf("i=%-3i j=%-3i l=%-3i cur_seq=%s\n",i,j,l,cur_seq); } iprev=i; lprev=l; if (h>=maxcol-1000) // too few columns? Reserve double space { char* new_seq=new(char[2*maxcol]); strncpy(new_seq,cur_seq,maxcol); //////// check: maxcol-1 ???? delete[](cur_seq); cur_seq=new_seq; maxcol*=2; } } // Add the remaining gaps '-' to the end of the template sequence for (i=hit.i2+1; i<=L; ++i) cur_seq[h++]='-'; cur_seq[h++]='\0'; keep[N_in] = display[N_in] = 1; seq[N_in]=new(char[h]); if (!seq[N_in]) MemoryError("array for input sequences"); strcpy(seq[N_in],cur_seq); X[N_in]=new(char[h]); if (!X[N_in]) MemoryError("array for input sequences"); I[N_in]=new(short unsigned int[h]); if (!I[N_in]) MemoryError("array for input sequences"); sname[N_in]=new(char[strlen(Tali.sname[k])+1]); if (!sname[N_in]) MemoryError("array for input sequences"); strcpy(sname[N_in],Tali.sname[k]); N_in++; if (v>3) { printf("k=%-3i %s\n",k,Tali.seq[k]); printf("Query %s\n",seq[kfirst]); printf("k=%-3i %s\n\n",k,cur_seq); } } // end for (k) // printf("N_in=%-5i HMM=%s with %i sequences\n",N_in,ta3mfile,Tali.N_filtered); delete[] cur_seq; delete[] imatch; delete[] ksort; ksort=NULL; // if ksort already existed it will be to short for merged alignment delete[] first; first=NULL; // if first already existed it will be to short for merged alignment delete[] last; last=NULL; // if last already existed it will be to short for merged alignment delete[] nres; nres=NULL; // if nres already existed it will be to short for merged alignment } ///////////////////////////////////////////////////////////////////////////////////// // Add a sequence to Qali ///////////////////////////////////////////////////////////////////////////////////// void Alignment::AddSequence(char Xk[], int Ik[]) { int i; // position in query and template if (L<=0) InternalError("L is not set in AddSequence()"); X[N_in]=new(char[L+2]); for (i=0; i<=L+1; ++i) X[N_in][i]=Xk[i]; if (Ik==NULL) for (i=0; i<=L+1; ++i) I[N_in][i]=0; else for (i=0; i<=L+1; ++i) I[N_in][i]=Ik[i]; N_in++; delete[] ksort; ksort=NULL; // if ksort already existed it will be to short for merged alignment delete[] first; first=NULL; // if first already existed it will be to short for merged alignment delete[] last; last=NULL; // if last already existed it will be to short for merged alignment } ///////////////////////////////////////////////////////////////////////////////////// // Add secondary structure prediction to alignment (overwrite existing prediction) ///////////////////////////////////////////////////////////////////////////////////// void Alignment::AddSSPrediction(char seq_pred[], char seq_conf[]) { unsigned int i; if ((int)strlen(seq_pred)!=L+1) { cerr<<"WARNING: Could not add secondary struture prediction - unequal length!\n"; return; } if (kss_pred < 0 || kss_conf < 0) // At least one sequence is added { delete[] ksort; ksort=NULL; // if ksort already existed it will be to short for merged alignment delete[] first; first=NULL; // if first already existed it will be to short for merged alignment delete[] last; last=NULL; // if last already existed it will be to short for merged alignment } if (kss_pred < 0) // No ss prediction exists { kss_pred=N_in; keep[N_in]=0; display[N_in]=1; seq[N_in]=new(char[L+2]); strcpy(seq[N_in],seq_pred); X[N_in]=new(char[L+2]); for (i=0; ii while letting i run from 1 to L. // n[j][a] = number of occurences of amino acid a at column j of the subalignment, // => only columns with n[j][ENDGAP]=0 are contained in the subalignment! // If no sequences enter or leave the subalignment at the step i-1 -> i (i.e. change=0) // then the old values w[k][i] and ncol are used for the new position i. // Index a can be an amino acid (0-19), ANY=20, GAP=21, or ENDGAP=22 char* in=keep; // to keep the code similar to Amino_acid_frequencies_and_transitions_from_M_state() int k; // index of sequence int i,j; // position in alignment int a; // amino acid (0..19) int naa; // number of different amino acids int** n; // n[j][a] = number of seq's with some residue at column i AND a at position j int nseqi=0; // number of sequences in subalignment i int ncol=0; // number of columns j that contribute to Neff[i] char change; // has the set of sequences in subalignment changed? 0:no 1:yes // Global weights? if (par.wg==1) { for (k=0; k=ANY && X[k][i]=ANY) { // ... if sequence k WAS included in i-1 and has to be thrown out for column i change=1; nseqi--; for (int j=1; j<=L; ++j) n[j][ (int)X[k][j]]--; } } //end for (k) nseqs[i]=nseqi; // If subalignment changed: update weights w[k][i] and Neff[i] if (change) { // Initialize weights and numbers of residues for subalignment i ncol=0; for (k=0; kMAXENDGAPFRAC*nseqi) continue; naa=0; for (a=0; a<20; ++a) if(n[j][a]) naa++; if (naa==0) continue; ncol++; for (k=0; k