const char* docstring="" "AlnAaProb seq.aln 0.8\n" " Calculate the composition of 20 amino acids at each position for\n" " PSICOV format MSA file 'seq.aln'. GREMLIN sequence weight is applied\n" " to weight each sequence, using 0.8 (default) sequence identity cutoff.\n" "\n" ; #include #include #include #include #include #include #include #include using namespace std; string aa_list="ACDEFGHIKLMNPQRSTVWY"; float AlnAaProb(const string infile, float id_cut, vector > &AaProb_mat, vector &seq_weight_vec, vector &AaProb_vec) { if (id_cut>1) id_cut/=100.0; // percentage seqID /* read aln file */ vector aln; string sequence; int L=0; ifstream fp; if (infile!="-") fp.open(infile.c_str(),ios::in); while ((infile!="-")?fp.good():cin.good()) { if (infile!="-") getline(fp,sequence); else getline(cin,sequence); if (sequence.length()==0) continue; if (L==0) { L=sequence.length(); } else if (sequence.length()!=L) { cerr<<"ERROR! length not match for sequence "< homo_num_vec(seq_num,1); // seq is always similar to itself int m,n; // sequence index int i; // residue position int Ldiff=0; // length of different residues, including gaps int max_Ldiff=L*(1.-id_cut); // max num of different residues for // two seq to be considered homolog for (m=0;mmax_Ldiff) break; } homo_num_vec[m]+=(Ldiff<=max_Ldiff); homo_num_vec[n]+=(Ldiff<=max_Ldiff); } } /* compute seq weight */ seq_weight_vec.assign(seq_num,0.); for (m=0;m aa2int_dict; int a; for (a=0;a2) id_cut=atof(argv[2]); vector > AaProb_mat; vector seq_weight_vec; vector AaProb_vec; float neff=AlnAaProb(infile,id_cut,AaProb_mat,seq_weight_vec,AaProb_vec); /* print Aa Prob */ cout<<"#neff="<