/* FreeContact - program to predict protein residue contacts from a sufficiently large multiple alignment * Copyright (C) 2012-2013 Laszlo Kajan Rost Lab, Technical University of Munich, Germany * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #include #include #include #include #include #include "config.h" #include "fc.h" #include "freecontact.h" namespace bo = boost; namespace cls = boost::program_options::command_line_style; namespace po = boost::program_options; using namespace std; namespace fc = freecontact; void _print_help( const po::options_description& __opts ); fc::ali_t _read_ali(std::istream& __is, resn_mapper_t &__resn, bool dbg ); std::string _xml_flatali(std::istream& __is, bool dbg); void _write_bioxsd(const fc::ali_t& __ali, const resn_mapper_t &__resn, const map >& __res, bool dbg); inline std::ostream &operator<<( std::ostream &os, const fc::parset_t& pp ) { os<<"--apply-gapth="<()->default_value(num_threads), "Threads to use [0-). 0 means as many as cores.\n") ("apply-gapth", po::value(), "When true, exclude residue columns and rows with a weighted gap frequency > --gapth from the covariance matrix [Boolean].\n") ("clustpc,c", po::value(), "BLOSUM clustering percentage [0-100].\n") ("cov20", po::value(), "If true, leave one amino acid off the covariance matrix, making it non-overdetermined [Boolean].\n") ("density,d", po::value(), "Target precision matrix density [0-1]. 0: do not control density.\n") ("debug", "Turn on debugging.\n") ("estimate-ivcov", po::value(), "Use inverse covariance matrix estimation instead of matrix inversion [Boolean].\n") ("ali,f", po::value()->default_value("-"), "Alignment file [path]. If '-', standard input.\n") ("gapth,g", po::value(), "Weighted gap frequency threshold (0-1].\n") ("help,h", "Produce this help message.\n") ("input-format,i", po::value()->default_value("flat"), "Input format [flat|xml].\n") ("icme-timeout", po::value()->default_value(icme_timeout), "Inverse covariance matrix estimation timeout in seconds [0-).\n") ("mincontsep", po::value(), "Minimum sequence separation (j-i>=arg) for reporting contacts [1-).\n") ("output-format,o", po::value(), "Output format [evfold|pfrmat_rr|bioxsd].\n") ("parprof", po::value()->default_value("default"), "Parameter profile (optional) [default|evfold|psicov|psicov-sd].\n") ("pep", "Print effective parameters on standard error.\n") ("pseudocnt,p", po::value(), "Pseudocount [0-).\n") ("quiet", po::value()->default_value(false), "Do not print warnings.\n") ("rho", po::value(), "Initial value of Glasso regularization parameter [0-). If negative, choose value automatically.\n") ("veczw", po::value()->default_value(veczw), "Use vectorized sequence weighting when available [Boolean].\n") ("version", "Print version.\n") ("pscount-weight,w", po::value(), "Pseudocount weight [0-1].\n") ; po::variables_map vm; po::store( po::parse_command_line(argc, argv, cmd_args, cls::default_style & ~cls::allow_guessing ), vm ); po::notify(vm); if(vm.count("help")) { _print_help( cmd_args ); return 0; } if(vm.count("version")) { cout << PACKAGE_VERSION << "\n"; return 0; } bool dbg = vm.count("debug"); bool quiet = !vm.count("quiet"); // parameter defaults fc::parset_t parset = fc::ps_evfold; ofmt = fc_of_evfold; // parameter profiles if(vm["parprof"].as() == "evfold"){ parset = fc::ps_evfold; ofmt = fc_of_evfold; } else if(vm["parprof"].as() == "psicov"){ parset = fc::ps_psicov; ofmt = fc_of_pfrmat_rr; } else if(vm["parprof"].as() == "psicov-sd"){ parset = fc::ps_psicov_sd; ofmt = fc_of_pfrmat_rr; } else if(vm["parprof"].as() != "default") cerr << "Warning: parprof value '" << vm["parprof"].as() << "' is invalid and will be ignored.\n"; // cmd arguments sanity check if(vm.count("clustpc") && ( vm["clustpc"].as() < 0 || vm["clustpc"].as() > 100 )) { cerr << "Error: clustpc " << vm["clustpc"].as() << " is outside allowed range [0-100].\n"; return 1; } if(vm.count("density") && ( vm["density"].as() < 0 || vm["density"].as() > 1 )) { cerr << "Error: density " << vm["density"].as() << " is outside allowed range [0-1].\n"; return 1; } if(vm.count("gapth") && ( vm["gapth"].as() <= 0 || vm["gapth"].as() > 1 )) { cerr << "Error: gapth " << vm["gapth"].as() << " is outside allowed range [0-1].\n"; return 1; } if(vm.count("mincontsep") && vm["mincontsep"].as() < 1 ){ cerr << "Error: mincontsep is outside allowed range [1-).\n"; return 1; } if(vm.count("pseudocnt") && vm["pseudocnt"].as() < 0 ){ cerr << "Error: pseudocnt is outside allowed range [0-).\n"; return 1; } if(vm.count("pscount-weight") && ( vm["pscount-weight"].as() < 0 || vm["pscount-weight"].as() > 1 )) { cerr << "Error: pscount-weight " << vm["pscount-weight"].as() << " is outside allowed range [0-1].\n"; return 1; } if (vm.count("threads") && vm["threads"].as() < 0 ){ cerr << "Error: threads is outside allowed range [0-).\n"; return 1; } if (vm.count("icme-timeout") && vm["icme-timeout"].as() < 0 ){ cerr << "Error: icme-timeout is outside allowed range [0-).\n"; return 1; } // cmd line arguments copy to vars if(vm.count("apply-gapth")) parset.apply_gapth = vm["apply-gapth"].as(); if(vm.count("clustpc")) parset.clustpc = vm["clustpc"].as()/100; if(vm.count("density")) parset.density = vm["density"].as(); if(vm.count("estimate-ivcov")) parset.estimate_ivcov = vm["estimate-ivcov"].as(); if(vm.count("gapth")) parset.gapth = vm["gapth"].as(); if(vm.count("icme-timeout")) icme_timeout = vm["icme-timeout"].as(); if(vm.count("mincontsep")) parset.mincontsep = vm["mincontsep"].as(); if(vm.count("pseudocnt")) parset.pseudocnt = vm["pseudocnt"].as(); if(vm.count("pscount-weight")) parset.pscnt_weight = vm["pscount-weight"].as(); if(vm.count("rho")) parset.rho = vm["rho"].as(); if(vm.count("threads")) num_threads = vm["threads"].as(); if(vm.count("cov20")) parset.cov20 = vm["cov20"].as(); if(vm.count("veczw")) veczw = vm["veczw"].as(); if(vm.count("output-format")) { if(vm["output-format"].as() == "evfold") ofmt = fc_of_evfold; else if(vm["output-format"].as() == "pfrmat_rr") ofmt = fc_of_pfrmat_rr; else if(vm["output-format"].as() == "bioxsd") { #ifndef HAVE_XSDCXX cerr << "Error: unsupported output-format: freecontact has not been compiled with XML support.\n"; return 1; #else ofmt = fc_of_bioxsd; #endif } else cerr << "Warning: output-format value '" << vm["output-format"].as() << "' is invalid and will be ignored.\n"; } if(dbg || vm.count("pep")) cerr << "Effective parameters: " << parset << " --output-format="<< ofmt <<"\n"; // read alignment file fc::ali_t ali; resn_mapper_t resn; { auto_ptr ali_ifs_p; if(vm["ali"].as() != "-") { ali_ifs_p = auto_ptr(new std::ifstream(vm["ali"].as().c_str())); if(!ali_ifs_p.get() || !*ali_ifs_p) throw runtime_error( string("failed to open '" + vm["ali"].as() + "': " + strerror(errno) ) ); } if(vm["input-format"].as() == "xml") { #ifndef HAVE_XSDCXX cerr << "Error: unsupported input-format: freecontact has not been compiled with XML support.\n"; return 1; #else // If input-format is xml and content is , then create a string stream for the text of and read from that. std::istringstream flatis(_xml_flatali(ali_ifs_p.get() ? *ali_ifs_p : cin, dbg)); ali = _read_ali(flatis, resn, dbg); #endif } else if(vm["input-format"].as() == "flat") ali = _read_ali(ali_ifs_p.get() ? *ali_ifs_p : cin, resn, dbg); else cerr << "Error: input-format value '" << vm["input-format"].as() << "' is invalid.\n"; } if (ali.seqcnt < 2){cerr << "Error: there are not enough sequences (" << ali.seqcnt << ") in this alignment.\n"; return 1;} if (!quiet && ali.seqcnt < ali.alilen) cerr << "Warning: there are fewer sequences (" << ali.seqcnt << ") in this alignment than recommended (" << ali.alilen << ").\n"; if (ali.alilen < 4){cerr << "Error: alignment length " << ali.alilen << " is smaller than minimum allowed (4).\n"; return 1;} if (ali.alilen < parset.mincontsep+1) {cerr << "Error: alignment (" << ali.alilen << ") shorter than mincontsep (" << parset.mincontsep << ") + 1.\n"; return 1;} if (ali.alilenpad > 4080){cerr << "Error: padded alignment length " << ali.alilenpad << " is bigger than maximum allowed (4080).\n"; return 1;} map > res; fc::predictor fcp(dbg); try { res = fcp.run(ali, parset, veczw, num_threads, icme_timeout); } catch(fc::icme_timeout_error& e) { cerr << "Error: " << e.what() << ".\n"; return 2; } // print output // lkajan: TODO: output format and the choice of score to output is mashed together here: disentangle. if(ofmt == fc_of_pfrmat_rr) { vector &psicov_res = res["l1norm"]; sort( psicov_res.begin(), psicov_res.end(), std::greater() ); for( size_t i = 0, e = psicov_res.size(); i < e; ++i ) fprintf( stdout, "%d %d 0 8 %f\n", resn(psicov_res[i].i), resn(psicov_res[i].j), psicov_res[i].score ); } else if(ofmt == fc_of_evfold) { vector &MI = res["MI"], &fro = res["fro"]; if(MI.size() != fro.size()){ cerr << "Error: unequal result set sizes MI and fro.\n"; exit(1); } for(size_t i = 0; i < MI.size(); ++i) fprintf(stdout, "%d %c %d %c %g %g\n", resn(MI[i].i), fc::mapaa[ali(0,MI[i].i)], resn(MI[i].j), fc::mapaa[ali(0,MI[i].j)], MI[i].score, fro[i].score ); } else if(ofmt == fc_of_uni) { fprintf(stdout, "# l1-norm | MI | Frobenius norm after zero-sum gauge (CN-score)\n"); sort( res["l1norm"].begin(), res["l1norm"].end(), std::greater() ); sort( res["MI"].begin(), res["MI"].end(), std::greater() ); sort( res["fro"].begin(), res["fro"].end(), std::greater() ); for( size_t i = 0, e = std::max( res["l1norm"].size(), res["fro"].size() ); i < e; ++i ) { fc::contact_t psicov_res, MI_res, evfold_res; if( i < res["l1norm"].size() ) psicov_res = res["l1norm"][i]; if( i < res["MI"].size() ) MI_res = res["MI"][i]; if( i < res["fro"].size() ) evfold_res = res["fro"][i]; fprintf( stdout, "%4d %4d %f | %4d %4d %f | %4d %4d %f\n", resn(psicov_res.i), resn(psicov_res.j), psicov_res.score, resn(MI_res.i), resn(MI_res.j), MI_res.score, resn(evfold_res.i), resn(evfold_res.j), evfold_res.score ); } } #ifdef HAVE_XSDCXX else if(ofmt == fc_of_bioxsd) _write_bioxsd(ali, resn, res, dbg); #endif else { cerr << "Error: unrecognized output format, there is a bug in %s around %s:%d.\n"; return 1; } return 0; } fc::ali_t _read_ali( std::istream &__is, resn_mapper_t &__resn, bool dbg ) { // We read the kind of alignment PSICOV can read. Since we are able to read from standard in, a reformatter can always be put into the pipeline. // We accept [A-Z-], anything else terminates alignment line; B is mapped to D, Z is mapped to E; [JOUX] are mapped outside the AA range. size_t line_n = 0; string line; uint16_t qs = 1; string q; do { do { getline(__is, line); ++line_n; } while(__is && line.empty()); if(line.length() > 3 && line[0] == '#') { int i; if(sscanf(line.c_str(), "# querystart=%d", &i) == 1) { qs = i; if(dbg) cerr << "matched header line '" << line << "'\n"; } char s[65536]; if(sscanf(line.c_str(), "# query=%65535[A-Za-z]", s) == 1) { q = s; if(dbg) cerr << "matched header line '" << line.substr(0,80) << "[...]'\n"; } } } while(__is && line.length() > 0 && line[0] == '#'); uint16_t alilen = 0; for( string::const_iterator l_i = line.begin(), l_e = line.end(); l_i != l_e; ++l_i ) if( isalpha(*l_i) || *l_i == '-' ) ++alilen; else break; fc::ali_t ali(alilen); do { vector al = fc::ali_t::read_a_seq( line ); if(al.size() != ali.alilen) cerr << "warning: alignment on input line " << line_n << " is skipped because it is of unexpected size " << al.size() << ", expected " << ali.alilen << "\n"; else ali.push(al); // if(q.empty()) q = line; // use first alignment line as 'query', unless one is given in the header do { getline(__is, line); ++line_n; } while(__is && line.empty()); } while(__is); __resn = resn_mapper_t(qs, q); return ali; } void _print_help( const po::options_description& __opts ) { cout << "Usage: freecontact [OPTION] < [ALIGNMENT]\n"; cout << "Predict protein contacts from multiple alignment.\n\n"; cout << __opts; // there's an implicit \n after that cout << "Parameter profiles (default=evfold):\n"; cout << "evfold:\n\t" << fc::ps_evfold << " --output-format="<< fc_of_evfold <<"\n"; cout << "psicov:\n\t" << fc::ps_psicov << " --output-format="<< fc_of_pfrmat_rr <<"\n"; cout << "psicov-sd (PSICOV sensible default):\n\t" << fc::ps_psicov_sd << " --output-format="<< fc_of_pfrmat_rr <<"\n"; } /* =pod =encoding utf8 =head1 NAME freecontact - fast protein contact predictor =head1 SYNOPSIS freecontact [OPTION] freecontact --parprof [evfold|psicov|psicov-sd] < alignment.aln > contacts.out __pkgdatadir__/a2m2aln --query '^RASH_HUMAN/(\d+)' < alignment.fa | freecontact --parprof evfold > contacts.out freecontact --ali=F --apply-gapth=I --clustpc=I --density=I --cov20=I --estimate-ivcov=I --gapth=I --icme-timeout=I --input-format=I<[flat|xml]> --mincontsep=I --output-format=I<[evfold|pfrmat_rr|bioxsd]> --pseudocnt=I --pscount-weight=I --rho=I --threads=I --veczw=I freecontact --help --debug --quiet --version =head1 DESCRIPTION FreeContact is a protein residue contact predictor optimized for speed. FreeContact can function as an accelerated drop-in for the published contact predictors EVfold-mfDCA of DS. Marks et al. (2011) [1], and PSICOV of D. Jones et al. (2011) [2]. FreeContact is accelerated by a combination of vector instructions, multiple threads, and faster implementation of key parts. Depending on the alignment, 8-fold or higher speedups are possible. A sufficiently large alignment is required for meaningful results. As a minimum, an alignment with an effective (after-weighting) sequence count bigger than the length of the query sequence should be used. Alignments with tens of thousands of (effective) sequences are considered good input. L or L can be used to generate the alignments, for example. [1] PLoS One. 2011;6(12):e28766. doi: 10.1371/journal.pone.0028766. Epub 2011 Dec 7. Protein 3D structure computed from evolutionary sequence variation. Marks DS, Colwell LJ, Sheridan R, Hopf TA, Pagnani A, Zecchina R, Sander C. [2] Bioinformatics. 2012 Jan 15;28(2):184-90. Epub 2011 Nov 17. PSICOV: precise structural contact prediction using sparse inverse covariance estimation on large multiple sequence alignments. Jones DT, Buchan DW, Cozzetto D, Pontil M. =head2 Input The following formats are supported: =over =item I The following simple input file format is used: # querystart=5 # query=QUERYwithinsertionSEQUENCEWITHNOGAPSORINSERTIONS QUERYSEQUENCEWITHNOGAPSORINSERTIONS -ALIGNED---SEQUENCE--WITH-GAPS----- ANOTHER-ALIGNED------------SEQUENCE The '#' header lines are optional. Header lines are used to calculate contact residue numbers and to look up respective query residues for certain output formats. If no query is defined, the first sequence in the alignment is used as the query sequence. The query sequence must not contain gaps in the alignment. All alignment rows must be the same length, and may contain only [ABCDEFGHIJKLMNOPQRSTUVWXYZ-]. [B] is mapped to [D], [Z] is mapped to [E], [JOUX] are mapped to [X]. [X] matches only itself for the entire program. A2M input alignments can be converted to the above format using F<__pkgdatadir__/a2m2aln>. a2m2aln can be used to pipe the alignment directly into freecontact. =item I XML document with one element, defined in the FreeContact schema [4] derived from BioXSD [5]. Example: F<__examplesdir__/PF00071_v25_1000.xml>. =back =head2 Output The original EVfold-mfDCA or PSICOV output format is used by default when the respective parameter profile is selected. =over =item I (EVfold-mfDCA) 5 K 6 L 0.332129 3.59798 | | | | | + corrected norm (CN) contact score | | | | + mutual information (MI) score | | | + contact amino acid residue code | | + contact residue number | + contact amino acid residue code + contact residue number Contacts are sorted on residue number. =item I (PSICOV) CASP residue-residue separation prediction (PFRMAT RR) format [3]: 55 67 0 8 10.840280 | | | | + contact score | | +-+ range [Å] of Cb-Cb distance predicted for the residue pair | | (C-alpha for glycines) | | These two fields are invariant in the output. | + contact residue number + contact residue number Contacts are sorted on score, descending. [3] L =item I XML document with one element, defined in the FreeContact schema [4] derived from BioXSD [5]. Example: F<__examplesdir__/PF00071_v25_1000.evfold.50.xml>. Note: as BioXSD is under active development in collaboration with FreeContact, the FreeContact schema may actually be derived from a version not yet available at [5]. [4] L [5] L =back The output may not list all possible contacts. =head1 REFERENCES =over Submitted. FreeContact: fast and free direct residue-residue contact prediction. Kaján L, Sustik MA, Marks DS, Hopf TA, Kalaš M, Rost B. =back =head1 OPTIONS =over =item -a [ --threads ] arg Threads to use [0-). 0 means as many as cores. =item --apply-gapth arg When true, exclude residue columns and rows with a weighted gap frequency > B<--gapth> from the covariance matrix [Boolean]. =item -c [ --clustpc ] arg BLOSUM clustering percentage [0-100]. =item --cov20 arg If true, leave one amino acid off the covariance matrix, making it non-overdetermined [Boolean]. =item -d [ --density ] arg Target precision matrix density [0-1]. Set I<0> to not control density. =item --debug Turn on debugging. =item --estimate-ivcov arg Use inverse covariance matrix estimation instead of matrix inversion [Boolean]. =item -f [ --ali ] arg (=-) Alignment file [path]. If '-', standard input. Default: '-'. =item -g [ --gapth ] arg Weighted gap frequency threshold (0-1]. =item -h [ --help ] Produce this help message. =item -i [ --input-format ] arg (=flat) Input format [flat|xml]. =item --icme-timeout arg (=1800) Inverse covariance matrix estimation timeout in seconds [0-). Applied to each iversion call independently. If a timeout occurs, the program exits with status I<2>. =item --mincontsep arg Minimum sequence-wise contacting residue pair separation given in amino acids as (j-i>=arg). I<1> for adjacent residues. [1-). =item -o [ --output-format ] arg Output format [evfold|pfrmat_rr|bioxsd]. =item --parprof arg (=default) Parameter profile (optional) [default|evfold|psicov]. The default profile is I. Command line arguments can be used to override profile values. =over =item evfold Triggers EVfold-mfDCA [1] compatibility mode. =item psicov Triggers PSICOV [2] compatibility mode. =item psicov-sd Triggers PSICOV [2] sensible default mode: fixed default rho, no density control. =back =item -w [ --pscount-weight ] arg Pseudocount weight [0-1]. =item -p [ --pseudocnt ] arg Pseudocount [0-). =item --pep Print effective parameters on standard error. Use this option to see what parameters freecontact(1) is run with in detail. This is especially useful when the B<--parprof> option is used in combination with other options. =item --rho arg Initial value of Glasso regularization parameter [0-). If negative, choose value automatically. =item --quiet arg (=0) Print nothing but error messages on standard error. Does not affect B<--debug>. =item --veczw arg Use vectorized sequence weighting when available [Boolean]. =item --version Print version. =back =head1 EXIT STATUS =over =item I<0> No error - success. =item I<1> Unspecified error. =item I<2> A timeout (see B<--icme-timeout>) occurred. =back =head1 EXAMPLES __pkgdatadir__/a2m2aln --query '^RASH_HUMAN/(\d+)' < '__examplesdir__/PF00071_v25_1000.fa' | \ freecontact --parprof evfold > PF00071_v25_1000.evfold freecontact --parprof evfold -i xml -o bioxsd < '__examplesdir__/PF00071_v25_1000.xml' > PF00071_v25_1000.evfold.xml freecontact --parprof psicov < __examplesdir__/demo_1000.aln > demo_1000.psicov =head1 NOTES For optimal performance, use the Automatically Tuned Linear Algebra Software (ATLAS) library I where freecontact is run. =head1 AUTHOR László Kaján =head1 SEE ALSO jackhmmer(1), hhblits(1), blastpgp(1) =cut */ // vim:et:ts=4:ai: