/* FreeContact - program to predict protein residue contacts from a sufficiently large multiple alignment * Copyright (C) 2012-2013 Laszlo Kajan Rost Lab, Technical University of Munich, Germany * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ #ifndef LIBFREECONTACT_H #define LIBFREECONTACT_H #include #include #include #include #include #include #include #ifdef __SSE2__ #include #endif // __SSE2__ #define LIBFREEC_API __attribute__ ((visibility ("default"))) #define LIBFREEC_LOCAL __attribute__ ((visibility ("hidden"))) namespace freecontact { // http://www.fao.org/docrep/004/Y2775E/y2775e0e.htm // Use 20 for gap. static const uint8_t aamap_gapidx = 20; static const uint8_t aamap[] = { // A B C D E F G H I J K L M N 21, 0, 3, 4, 3, 6, 13, 7, 8, 9, 21, 11, 10, 12, 2, // O P Q R S T U V W X Y Z 21, 14, 5, 1, 15, 16, 21, 19, 17, 21, 18, 6 }; static const uint8_t mapaa[] = { //0 1 2 3 4 5 6 7 8 9 'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', '-', 'X' }; /// Alignment length exception. class LIBFREEC_API alilen_error : public std::runtime_error { public: alilen_error(const std::string& __arg) : std::runtime_error(__arg){} }; /// Inverse covariance matrix estimation timeout exception. class LIBFREEC_API icme_timeout_error : public std::runtime_error { public: icme_timeout_error(const std::string& __arg) : std::runtime_error(__arg){} }; #ifndef __SSE2__ typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__)); #endif /// Protein sequence alignment. class LIBFREEC_API ali_t : public std::vector<__m128i> { private: #pragma GCC visibility push(hidden) #pragma GCC visibility pop public: uint32_t seqcnt; uint16_t alilen; // aligned sequence length, max 65535, reasonable uint16_t alilen16; // size in 16-bytes uint16_t alilenpad; // 16 * alilen16 ali_t( const uint16_t __alilen = 0 ) : std::vector<__m128i>(), seqcnt(0), alilen(__alilen), alilen16(( alilen-1 )/16 + 1), alilenpad( 16 * alilen16 ) { reserve(1024*alilen16); } virtual ~ali_t(){} inline uint8_t& operator()(uint32_t __k, uint16_t __ai) { return reinterpret_cast(this->_M_impl._M_start)[ __k*alilenpad + __ai ]; } inline const uint8_t& operator()(uint32_t __k, uint16_t __ai) const { return reinterpret_cast(this->_M_impl._M_start)[ __k*alilenpad + __ai ]; } /// Convert alignment string to amino acid codes with aamap. static std::vector read_a_seq( const std::string& __l ) { std::vector ret; ret.reserve(__l.length()); for( std::string::const_iterator l_b = __l.begin(), l_e = __l.end(); l_b != l_e; ++l_b ) { const std::string::value_type c = *l_b; if( isalpha(c) ) ret.push_back( aamap[c & 0x1F] ); else if( c == '-' ) ret.push_back( 20 ); else break; } return ret; } /// Push a sequence to the alignment. ali_t& push(const std::vector& __al) throw (alilen_error); /// Push a sequence to the alignment. inline ali_t& push(const std::string& __l) throw (alilen_error) { return push(read_a_seq(__l)); } }; /// Contact prediction. struct LIBFREEC_API contact_t { /// Residue number, 0-based(!). uint16_t i, j; /// Contact score. float score; contact_t( uint16_t __i = 0, uint16_t __j = 0, float __score = 0 ) : i(__i), j(__j), score(__score) {} inline bool operator<( const contact_t& __b ) const { return score < __b.score; } // to facilitate sorting by score inline bool operator>( const contact_t& __b ) const { return score > __b.score; } // to facilitate sorting by score }; /// Parameter set structure for predictor::run(). struct parset_t { double clustpc, density, gapth; uint16_t mincontsep; double pseudocnt, pscnt_weight; bool estimate_ivcov; double shrink_lambda; bool cov20, apply_gapth; double rho; parset_t( double __clustpc = 0, double __density = 0, double __gapth = 0, uint16_t __mincontsep = 0, double __pseudocnt = 0, double __pscnt_weight = 0, bool __estimate_ivcov = 0, double __shrink_lambda = 0, bool __cov20 = 0, bool __apply_gapth = 0, double __rho = 0 ) : clustpc(__clustpc), density(__density), gapth(__gapth), mincontsep(__mincontsep), pseudocnt(__pseudocnt), pscnt_weight(__pscnt_weight), estimate_ivcov(__estimate_ivcov), shrink_lambda(__shrink_lambda), cov20(__cov20), apply_gapth(__apply_gapth), rho(__rho) {} }; const parset_t // clust conts gapt minc pseud pscw estim shrin cov20 agapth rho //ps_evfold(0.70, 0.0, 1.0, 1, 0.0, 0.5, false, 0.0, true, false, -1 ), // original Matlab that expects gap pre-filtering ps_evfold( 0.70, 0.0, 0.9, 1, 0.0, 0.5, false, 0.0, true, true, -1 ), ps_psicov( 0.62, 0.03, 0.9, 5, 1.0, 0.0, true, 0.1, false, false, -1 ), // as published ps_psicov_sd( 0.62, 0.0, 0.9, 5, 1.0, 0.0, true, 0.1, false, false, 0.001 ); // PSICOV sensible default (sd) for most cases /// Protein residue contact predictor. /** A sufficiently large multiple alignment is required for meaningful results. */ class LIBFREEC_API predictor { public: typedef std::map > cont_res_t; typedef double freq_t; typedef float pairfreq_t; typedef std::vector freq_vec_t; typedef std::map time_res_t; public: bool dbg; predictor(bool __dbg = false) : dbg(__dbg) {} virtual ~predictor(){} /// Calculate alignment sequence weights. /** \param [out] __aliw Vector of alignment sequence weights. * \param [out] __wtot Total alignment weight. * \param [in] __ali Input alignment. * \param [in] __clustpc BLOSUM-style clustering similarity threshold [0-1]. * \param [in] __veczw Use vectorized sequence weighting when available. * \param [in] __num_threads Number of OpenMP threads, effective if non-zero. The default is as many threads as cores in host. Ineffective if library is not compiled with OMP support. */ void get_seq_weights( freq_vec_t &__aliw, double &__wtot, const ali_t& __ali, double __clustpc, bool __veczw = true, int __num_threads = 0 ) throw (alilen_error, icme_timeout_error, std::range_error, std::runtime_error); /// Predict residue contacts. /** \param [in] __ali Input alignment. * \param [in] __aliw Vector of alignment sequence weights. * Obtain this with get_seq_weights(). * \param [in] __wtot Total weight of alignment. * Obtain this with get_seq_weights(). * \param [in] __density Target precision matrix density [0-1]. Set to 0 to not control density. * \param [in] __gapth Threshold of weighted gap frequency for ignoring alignment columns with too many gaps [0-1]. Set to 1.00 to keep all columns. * This is implemented by using a very high regularization value for gap columns, and by excluding gap columns from the covariance * matrix, see __apply_gapth. * \param [in] __mincontsep Minimum sequence-wise contacting residue pair separation given in amino acids as (j-i). 1 for adjacent residues. [1-). * \param [in] __pseudocnt Number to initialize single and pair amino acid counts with [0-). * \param [in] __pscnt_weight Pseudocount weight to apply to single and pair amino acid frequencies [0-1]. * \param [in] __estimate_ivcov Estimate inverse covariance matrix instead of fully inverting matrix. This is currently done by GLASSOFAST. * \param [in] __shrink_lambda Covariance matrix shrinkage parameter, controlling rate of shrinkage [0-1]. * \param [in] __cov20 Leave one amino acid off the covariance matrix, making it non-overdetermined. * \param [in] __apply_gapth When true, exclude residue columns and rows with a weighted gap frequency > __gapth from the covariance matrix. * \param [in] __rho Initial value of Glasso regularization parameter. Negative values trigger an automatic choice for rho. * \param [in] __num_threads Number of OpenMP threads, effective if non-zero. The default is as many threads as cores in host. Ineffective if library is not compiled with OMP support. * \param [in] __icme_timeout Inverse covariance matrix estimation timeout in seconds. Default: 1800. Applied to each inversion call independently. * \param [out] __timing Pointer to map of timing results for certain components of this method. Useful for debugging. Keys are: [num_threads|seqw|pairfreq|shrink|inv|all]. * \return Map of vectors of predicted contacts. * Key is contact score calculation method name: 'l1norm', 'MI' (mutual information), 'fro' (Frobenius norm after zero-sum gauge). * Vectors may not contain all contact pairs. Contacts are ordered by residue index (i,j). */ cont_res_t run(const ali_t& __ali, const freq_vec_t &__aliw, const double __wtot, double __density, double __gapth, uint16_t __mincontsep, double __pseudocnt, double __pscnt_weight, bool __estimate_ivcov, double __shrink_lambda, bool __cov20, bool __apply_gapth, double __rho, int __num_threads = 0, time_t __icme_timeout = 1800, time_res_t *__timing = NULL ) throw (alilen_error, icme_timeout_error, std::range_error, std::runtime_error); /// Predict residue contacts. cont_res_t run(const ali_t& __ali, double __clustpc, double __density, double __gapth, uint16_t __mincontsep, double __pseudocnt, double __pscnt_weight, bool __estimate_ivcov, double __shrink_lambda, bool __cov20, bool __apply_gapth, double __rho, bool __veczw = true, int __num_threads = 0, time_t __icme_timeout = 1800, time_res_t *__timing = NULL ) throw (alilen_error, icme_timeout_error, std::range_error, std::runtime_error); /// Predict residue contacts. inline cont_res_t run(const ali_t& __ali, const parset_t& __parset, bool __veczw = true, int __num_threads = 0, time_t __icme_timeout = 1800, time_res_t *__timing = NULL ) throw (alilen_error, icme_timeout_error, std::range_error, std::runtime_error) { return run(__ali, __parset.clustpc, __parset.density, __parset.gapth, __parset.mincontsep, __parset.pseudocnt, __parset.pscnt_weight, __parset.estimate_ivcov, __parset.shrink_lambda, __parset.cov20, __parset.apply_gapth, __parset.rho, __veczw, __num_threads, __icme_timeout, __timing); } }; } // namespace freecontact #endif // LIBFREECONTACT_H // vim:et:ts=4:ai: