// Copyright 2009, Andreas Biegert #ifndef CS_CONTEXT_LIBRARY_H_ #define CS_CONTEXT_LIBRARY_H_ #include "abstract_state_matrix.h" #include "co_emission.h" #include "context_profile.h" #include "pseudocounts.h" namespace cs { // Forward declarations template class ContextLibrary; template class Emission; // Strategy class for initializing a context library template class LibraryInit { public: LibraryInit() {} virtual ~LibraryInit() {} virtual void operator() (ContextLibrary& lib) const = 0; }; // A container of K context profiles representing the most common // sequence motifs in a training database of proteins/DNA sequences. template class ContextLibrary { public: typedef ContextProfile* ProfileIter; typedef const ContextProfile* ConstProfileIter; // Constructs an empty profile library of given dimenions. ContextLibrary(size_t size, size_t wlen); // Constructs a profile library from serialized data read from input stream. explicit ContextLibrary(FILE* fin); // Constructs profile library with a specific init-strategy encapsulated by an // initializer. ContextLibrary(size_t size, size_t wlen, const LibraryInit& init); // Nothing to do here virtual ~ContextLibrary() {} // Returns the number of profiles in the fully assembled profile library size_t size() const { return profiles_.size(); } // Returns the number of columns in each context profile. size_t wlen() const { return wlen_; } // Returns index of central profile column. size_t center() const { return (wlen_ - 1) / 2; } // Accessor methods for state i, where i is from interval [0,size]. ContextProfile& operator[](size_t i) { return profiles_[i]; } const ContextProfile& operator[](size_t i) const { return profiles_[i]; } // Initializes profile at index 'k' with given profile. void SetProfile(size_t k, const ContextProfile& p); // Returns an iterator pointing to beginning of profiles. ProfileIter begin() { return &profiles_[0]; } // Returns an iterator pointing past the end of profiles. ProfileIter end() { return &profiles_[0] + profiles_.size(); } // Returns a const iterator pointing to beginning of profiles. ConstProfileIter begin() const { return &profiles_[0]; } // Returns a const iterator pointing past the end of profiles. ConstProfileIter end() const { return &profiles_[0] + profiles_.size(); } // Writes the profile library in serialization format to output stream. void Write(FILE* fout) const; // Sorts context states by relative entropy of central column and assigns // new state indices according to this new ordering void SortByEntropy(); private: // Initializes the library from serialized data read from stream. void Read(FILE* fin); size_t wlen_; // size of context window. Vector > profiles_; // context profiles ordered by index. }; // ContextLibrary // Prints the library in human-readable format for debugging. template std::ostream& operator<< (std::ostream& out, const ContextLibrary& lib) { out << "ContextLibrary" << std::endl; out << "size:\t" << lib.size() << std::endl; out << "wlen:\t" << lib.wlen() << std::endl; for (size_t k = 0; k < lib.size(); ++k) out << lib[k]; return out; } // Transforms probabilites in context profiles to log-space and sets 'is_log' flag. template void TransformToLog(ContextLibrary& lib); // Transforms probabilites in context profiles to lin-space and sets 'is_log' flag. template void TransformToLin(ContextLibrary& lib); // Calculates posterior probs for a context library and sequence window X_i // centered at index 'i' and writes them to array 'pp'. Caller is responsible for // making sure that 'pp' has sufficient length. Return value is log sum of all // individual emission terms. The third template parameter specifies the central // position of the context window. template double CalculatePosteriorProbs(const ContextLibrary& lib, const Emission& emission, const ContextInput& input, CenterPos i, double* pp); // Strategy for initializing library by sampling from training set of count // profiles, optionally adding pseudocounts. template class SamplingLibraryInit : public LibraryInit { public: typedef std::vector< CountProfile > TrainingSet; SamplingLibraryInit(const TrainingSet& trainset, const Pseudocounts& pc, const Admix& admix, unsigned int seed = 0) : trainset_(trainset), pc_(pc), admix_(admix), seed_(seed) {} virtual ~SamplingLibraryInit() {} virtual void operator() (ContextLibrary& lib) const; private: const TrainingSet& trainset_; const Pseudocounts& pc_; const Admix& admix_; const unsigned int seed_; }; // SamplingLibraryInit // Strategy that initializes profile probs by sammpling from gaussian distribution // with mean at background frequencies. template class GaussianLibraryInit : public LibraryInit { public: GaussianLibraryInit(double sigma, const SubstitutionMatrix& sm, unsigned int seed = 0) : sigma_(sigma), sm_(sm), seed_(seed) {} virtual ~GaussianLibraryInit() {} virtual void operator() (ContextLibrary& lib) const; protected: double sigma_; const SubstitutionMatrix& sm_; unsigned int seed_; }; // class GaussianLibraryInit // Translate a sequence or count profile into an abstract state sequence. template Sequence TranslateIntoStateSequence(const CountsInput& input, const ContextLibrary& lib, const Emission& emission); // Translate a sequence or count profile into an abstract state profile given // the learned context-to-state mutation probabilities in 'matrix'. template Profile TranslateIntoStateProfile(const CountsInput& input, const ContextLibrary& lib, const Emission& emission, const AbstractStateMatrix& matrix); // Learns a color-space SOM from a full-blown context-library. template void LearnContextMap(const ContextLibrary& lib, ContextLibrary& som, const CoEmission& co_emission, int nsteps, // number of learning steps double sigma = 5.0, // initial neighborhood gaussian sigma double alpha = 0.1, // initial learning rate double tau1 = 0.0, // timescale parameter for sigma double tau2 = 0.0, // timescale parameter for alpha unsigned int seed = 0); // Assigns each context profile in given context lib an RBG color based learned SOM template void AssignContextColors(ContextLibrary& lib, const ContextLibrary& som, const CoEmission& co_emission, double color_offset = 0.2); // Assigns each context profile a unique name based on its position in learned SOM template void AssignContextNames(ContextLibrary& lib, const ContextLibrary& som, const CoEmission& co_emission); } // namespace cs #endif // CS_CONTEXT_LIBRARY_H_