const char* docstring="" "rmRedundantSeq id_cut cov_cut metaclust.aln\n" " remove sequences with < cov_cut sequence coverage or with > id_cut\n" " sequence identity to other sequences in alignment file metaclust.aln\n" "\n" "rmRedundantSeq id_cut cov_cut uniclust.aln metaclust.aln\n" " remove sequences in metaclust.aln with < cov_cut sequence coverage or\n" " or with > id_cut sequence identity to other sequences in either\n" " uniclust.aln or metaclust.aln\n" ; #include #include #include #include #include using namespace std; string aa_list="ACDEFGHIKLMNPQRSTVWY"; /* parse single MSA at seqID & coverage cutoff id_cut & cov_cut */ int parse_single_aln(const char *filename, vector & aln, const float id_cut, const float cov_cut) { int L=0; // alignment length int Ldiff=0,maxLdiff=0; // length of different positions int i,n; // position, sequence index int Lali; // number of non-gap positions float seqID1,seqID2; // seqID with previous sequences vector is_not_gap_vec; // if a positions is an amino acid vector > is_not_gap_mat; // if a position is an amino acid ifstream fp; fp.open(filename,ios::in); string sequence; while (fp.good()) { getline(fp,sequence); if (sequence.length()==0) continue; if (L==0) { L=sequence.length(); is_not_gap_vec.assign(L,0); } Lali=0; for (i=0;i1) { aln.push_back(sequence); } else { maxLdiff=(1.-id_cut)*Lali; for (n=0;nmaxLdiff) break; } //if (seqID1>id_cut*Lali) break; if (Ldiff<=maxLdiff) break; } if (n==aln.size()) { aln.push_back(sequence); is_not_gap_mat.push_back(is_not_gap_vec); } } } fp.close(); is_not_gap_mat.clear(); return aln.size(); } /* parse second MSA at seqID & coverage cutoff id_cut & cov_cut */ int parse_two_aln(const char *query_filename, const char *filename, vector & query_aln, vector & aln, const float id_cut, const float cov_cut) { string sequence; if (query_filename[0]!='-') { ifstream fp; fp.open(query_filename); while(fp.good()) { getline(fp,sequence); query_aln.push_back(sequence); } fp.close(); } else { while(cin.good()) { getline(cin,sequence); query_aln.push_back(sequence); } } int L=query_aln[0].length(); // alignment length int query_nseqs=query_aln.size(); // number of sequences int i,n; // position, sequence index int Lali; // number of non-gap positions int Ldiff=0,maxLdiff=0; // length of different positions float max_seqID,seqID1,seqID2; // seqID with previous sequences vector is_not_gap_vec(L,0); // if a positions is an amino acid vector > is_not_gap_mat; // if a position is an amino acid ifstream fp; fp.open(filename,ios::in); while (fp.good()) { getline(fp,sequence); if (sequence.length()==0) continue; Lali=0; for (i=0;i1) { aln.push_back(sequence); } else { //max_seqID=0; maxLdiff=(1.-id_cut)*Lali; for (n=0;nmaxLdiff) break; } if (Ldiff<=maxLdiff) break; //if (seqID1>id_cut*Lali) //{ //max_seqID=seqID1; //break; //} } if (Ldiff<=maxLdiff) continue; //if (max_seqID>id_cut*Lali) continue; for (n=0;nmaxLdiff) break; } if (Ldiff<=maxLdiff) break; //if (seqID1>id_cut*Lali) break; } if (n==aln.size()) { aln.push_back(sequence); is_not_gap_mat.push_back(is_not_gap_vec); } } } fp.close(); is_not_gap_mat.clear(); return aln.size(); } int main(int argc, char **argv) { /* parse commad line argument */ if(argc<4) { cerr<1) id_cut/=100.; if (cov_cut>1) cov_cut/=100.; int nseqs; vector query_aln; vector aln; if (argc<5) // one MSA nseqs=parse_single_aln(argv[3],aln,id_cut,cov_cut); else nseqs=parse_two_aln(argv[3],argv[4],query_aln,aln,id_cut,cov_cut); for (int n=0;n