PEACE: ESTCodec.h Source File

00001 #ifndef EST_CODEC_H
00002 #define EST_CODEC_H
00003 
00004 //--------------------------------------------------------------------
00005 //
00006 // This file is part of PEACE.
00007 // 
00008 // PEACE is free software: you can redistribute it and/or modify it
00009 // under the terms of the GNU General Public License as published by
00010 // the Free Software Foundation, either version 3 of the License, or
00011 // (at your option) any later version.
00012 // 
00013 // PEACE is distributed in the hope that it will be useful, but
00014 // WITHOUT ANY WARRANTY; without even the implied warranty of
00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016 // General Public License for more details.
00017 // 
00018 // You should have received a copy of the GNU General Public License
00019 // along with PEACE.  If not, see <http://www.gnu.org/licenses/>.
00020 // 
00021 // Miami University makes no representations or warranties about the
00022 // suitability of the software, either express or implied, including
00023 // but not limited to the implied warranties of merchantability,
00024 // fitness for a particular purpose, or non-infringement.  Miami
00025 // University shall not be liable for any damages suffered by licensee
00026 // as a result of using, result of using, modifying or distributing
00027 // this software or its derivatives.
00028 //
00029 // By using or copying this Software, Licensee agrees to abide by the
00030 // intellectual property laws, and all other applicable laws of the
00031 // U.S., and the terms of GNU General Public License (version 3).
00032 //
00033 // Authors:   Dhananjai M. Rao          raodm@muohio.edu
00034 //
00035 //---------------------------------------------------------------------
00036 
00037 #include <functional>
00038 #include "HashMap.h"
00039 
00040 /** A helper class to serve as a EST enCOder-DECoder.
00041 
00042     <p>This class was introduced to try and centralize a bunch of EST
00043     encoding/decoding code that was spread out amonst multiple
00044     independent classes.  Most of the EST analysis algorithms try and
00045     encode the base pairs (\c A, \c T, \c C, and \c G) in ESTs into
00046     2-bits each (\c 00, \c 11, \c 10, and \c 01) to reduce memory
00047     footprint and to enable use of numerical operations.  The code to
00048     perform encoding and decoding was spread across multiple classes
00049     on a "as needed" basis.  However, more than three classes required
00050     pretty much the same code warranting the introduction of this
00051     class to minimize code-redundancy. <p>
00052 
00053     <p>There is one process-wide unique instance of this ESTCodec
00054     class.  A single instance is used to create commonly used tables
00055     to enable rapid CODEC operations.  These tables are created once,
00056     when the globally unique EST object is referenced.  The globally
00057     unique instance can be referenced via the getCodec() method. </p>
00058 */
00059 class ESTCodec {
00060 public:
00061     /** Obtain reference to process-wide unqiue instance of ESTCodec.
00062 
00063         This method must be used to obtain the process-wide unique
00064         instance of the ESTCodec object. The returned reference can be
00065         used to invoke other methods in this class.  Here is a typical
00066         usage:
00067 
00068         \code
00069 
00070         const ESTCodec& codec = ESTCodec::getCodec();
00071         int bitCodec = codec.encode('A');
00072 
00073         \endcode
00074     */
00075     inline static ESTCodec& getCodec() { return estCodec; }
00076 
00077     /** Obtain 2-bit code for a given base pair.
00078 
00079         This method can be used to obtain the 2-bit encoding for a
00080         given base pair (bp).  This method essentially translate base
00081         pairs (\c A, \c T, \c C, and \c G, both upper and lower case)
00082         into 2-bits codes (\c 00, \c 11, \c 10, and \c 01).
00083 
00084         \note In favor of speed, this method does not perform any
00085         special checks on the actual character in bp. It is the
00086         responsiblity of the caller to ensure that this method is
00087         invoked with appropriate parameter value.
00088         
00089         \param[in] bp The base pair character (both upper and lower
00090         cases are handled correctly) to be encoded.
00091     */
00092     inline static char encode(const char bp)
00093     { return charToInt[(int) bp]; }
00094 
00095     /** Obtain 2-bit \b complement code for a given base pair.
00096 
00097         This method can be used to obtain the \b complementary 2-bit
00098         encoding for a given base pair (bp).  This method essentially
00099         translate base pairs (\c A, \c T, \c C, and \c G, both upper
00100         and lower case) into 2-bits codes (\c 11, \c 00, \c 01, and \c
00101         10).
00102 
00103         \note In favor of speed, this method does not perform any
00104         special checks on the actual character in bp. It is the
00105         responsiblity of the caller to ensure that this method is
00106         invoked with appropriate parameter value.
00107         
00108         \param[in] bp The base pair character (both upper and lower
00109         cases are handled correctly) whose complementary encoding is
00110         required.
00111     */
00112     inline static char encode2rc(const char bp)
00113     { return charToIntComp[(int) bp]; }
00114 
00115 
00116     /** Obtain the reverse-complement for a given word.
00117 
00118         This method can be used to obtain the reverse-complement
00119         encoding for a given encoded word.  This method essentially
00120         translates a given encoded word to its reverse-complement
00121         representation.
00122 
00123         \note In favor of speed, this method does not perform any
00124         special checks on the word to be translated.  It is the
00125         responsiblity of the caller to ensure that this method is
00126         invoked with appropriate parameter value after the \c
00127         setRevCompTable method is invoked.
00128         
00129         \param[in] word The encoded word that must be translated to
00130         its corresponding reverse complement representation.
00131 
00132         \return The reverse-complement representation for a given
00133         word.
00134     */
00135     inline int encode2rc(const int word) const { return revCompTable[word]; }
00136     
00137     /** Set the reverse-complement translation table to be used whe
00138         next time encode2rc method is called.
00139 
00140         This method must be invoked to set the correct translation
00141         table to be used by the encode2rc(int) method.  If a
00142         translation table does not exist in the \c revCompTables, then
00143         a new reverse-complement table is created by the \c
00144         addRevCompTable method.
00145     
00146         \param[in] wordSize The number of base pairs in the word for
00147         which a reverse-complement translation table is to be created.
00148     */
00149     void setRevCompTable(const int wordSize);
00150 
00151     /** A functor to generate a encoded word (serves as a hash entry).
00152         
00153         This functor must be used to generate an encoded word from a
00154     "normal" (rather than reverse complement) fragment. This
00155     me thod handles 'n' entries in the EST in the following manner:
00156 
00157         <ul>
00158 
00159         <li>Whenever it encounters an 'n' base pair (that is created
00160         when ESTs are loaded in the class EST.cpp) it sets the
00161         ignoreMask to Mask (as Mask template parameter already tells
00162         us the number of bits we care about and that is used as an
00163         indicator of number hashes to ignore).</li>
00164 
00165         <li>The ignoreMask is shifted right dropping off the least
00166         significant bits each time this method is called. If all bits
00167         of ignoreMask are cleared then the ignore mask is zero and
00168         ignored.</li>
00169 
00170         <li>If the ignoreMask is non-zero, then the caller is expected
00171         not to use the hash returned from this method.  as this
00172         word/hash contains a 'n'. As bits get shifted the value for
00173         'n' drops off (when the ignoreMask is zero) and the hashes can
00174         actually be used by the caller.</li>
00175 
00176         </ul>
00177 
00178         \tparam Shift The number of bits by which the encoding for the
00179         given base pair must be shifted to the left. For example, when
00180         using a word of length 6 nt, this value would be 10.
00181 
00182         \tparam Mask The mask (with bits set to 1) that must be used
00183         to retain the signficiant values in the hash. For example,
00184         when using words of length 6, the Mask would be the binary
00185         <code>1111 1111 1111</code> or <code>0xfff</code>.
00186 
00187         \param[in] w The current hash value that has been computed
00188         thusfar.
00189 
00190         \param[in] bp The base pair ('A', 'T', 'C', 'G', or 'n') to be
00191         encoded by this method.
00192 
00193         \param[in,out] ignoreMask The ignore mask that is used to
00194         determine if the current hash/word has a 'n' entry in it and
00195         must be ignored.
00196 
00197         \return The hash for the current word being encoded.
00198 
00199         \note The caller must use the returned value for further
00200         operation only if the ignoreMask is zero.  Otherwise the
00201         encoder must be repeatedly called with subsequent bases until
00202         the ignoreMask is cleared by this method.
00203     */
00204     template <const int& Shift, const int& Mask>
00205     struct NormalEncoder : public std::binary_function<int, char, int> {
00206         inline int operator()(const int w, const char bp, int& ignoreMask) const {
00207             // Setup the ignore mask if base pair is 'n' otherwise
00208             // drop off the lowest 2 bits.
00209             ignoreMask = (bp != 'N') ? (ignoreMask >> 2) : Mask;
00210             // Compute the hash for the word. Encoding for 'n' is 0.
00211             return ((w >> 2) | (ESTCodec::encode(bp) << Shift)) & Mask;
00212         }
00213     };
00214 
00215     /** A functor to generate a encoded word (serves as a hash entry).
00216         
00217         This functor must be used to generate an encoded word for
00218         a reverse complement (rather than normal) fragment. This
00219         method handles 'n' entries in the EST in the following
00220         manner:
00221         
00222         <ul>
00223         
00224         <li>Whenever it encounters an 'n' base pair (that is created
00225         when ESTs are loaded in the class EST.cpp) it sets the
00226         ignoreMask to Mask (as Mask template parameter already tells
00227         us the number of bits we care about and that is used as an
00228         indicator of number hashes to ignore).</li>
00229         
00230         <li>The ignoreMask is shifted right dropping off the least
00231         significant bits each time this method is called. If all bits
00232         of ignoreMask are cleared then the ignore mask is zero and
00233         ignored.</li>
00234       
00235         <li>If the ignoreMask is non-zero, then the caller is expected
00236         not to use the hash returned from this method.  as this
00237         word/hash contains a 'n'. As bits get shifted the value for
00238         'n' drops off (when the ignoreMask is zero) and the hashes can
00239         actually be used by the caller.</li>
00240         
00241         </ul>
00242         
00243         \tparam Shift The number of bits by which the encoding for the
00244         given base pair must be shifted. Currently this parameter is
00245         not used but it is present to provide a symmetric interface
00246         with the NormalEncoder.
00247             
00248         \tparam Mask The mask (with bits set to 1) that must be used
00249         to retain the signficiant values in the hash. For example,
00250         when using words of length 6, the Mask would be the binary
00251         <code>1111 1111 1111</code> or <code>0xfff</code>.
00252         
00253         \param[in] w The current hash value that has been computed
00254         thusfar.
00255         
00256         \param[in] bp The base pair ('A', 'T', 'C', 'G', or 'n') to be
00257         encoded by this method.
00258         
00259         \param[in,out] ignoreMask The ignore mask that is used to
00260         determine if the current hash/word has a 'n' entry in it and
00261         must be ignored.
00262 
00263         \note The caller must use the returned value for further
00264         operation only if the ignoreMask is zero.  Otherwise the
00265         encoder must be repeatedly called with subsequent bases until
00266         the ignoreMask is cleared by this method.
00267     */
00268     template <const int& Shift, const int& Mask>
00269     struct RevCompEncoder : public std::binary_function<int, char, int> {
00270         inline int operator()(const int w, const char bp, int& ignoreMask) const {
00271             // Setup the ignore mask if base pair is 'n' otherwise
00272             // drop off the lowest 2 bits.
00273             ignoreMask = (bp != 'N') ? (ignoreMask >> 2) : Mask;
00274             // Compute the hash for the word. 'n' becomes 0.
00275             return ((w << 2) | ESTCodec::encode2rc(bp)) & Mask;
00276         }
00277     };
00278     
00279     /** The destructor.
00280         
00281         The destructor frees up memory allocated to hold translation
00282         tables etc.  The destructor is called only once, when the
00283         process-wide unique instance is destroyed when program
00284         terminates.
00285     */
00286     ~ESTCodec();
00287 
00288 protected:
00289     /** The constructor.
00290 
00291         The constructor is invoked only once when the process-wide
00292         unique static instance of the ESTCodec is created when the
00293         process starts.  The constructor initializes the CharToInt
00294         array that is used to translate base pairs (\c A, \c T, \c C,
00295         and \c G) into 2-bits codes (\c 00, \c 11, \c 10, and \c 01)
00296     */
00297     ESTCodec();
00298 
00299     /** Creates and adds a new reverse-complement translation table
00300         for the given word size.
00301 
00302         This method is a helper method that is invoked from the \c
00303         setRevCompTable whenever a new reverse-complement translation
00304         table is needed.  This method creates a reverse-complement
00305         table with 4<sup>wordSize</sup> entries.
00306 
00307         \param[in] wordSize The number of base pairs in the word for
00308         which a reverse-complement translation table is to be created.
00309 
00310         \return This method returns the newly created
00311         reverse-complement translation table.
00312     */
00313     int* addRevCompTable(const int wordSize);
00314     
00315 private:
00316     /** A simple array to map characters \c A, \c T, \c C, and \c G to
00317         \c 0, \c 3, \c 2, and \c 1 respectively.
00318         
00319         This is a simple array of 255 entries that are used to convert
00320         the base pair encoding characters \c A, \c T, \c C, and \c G
00321         to \c 0, \c 3, \c 2, and \c 1 respectively.  This encoding is
00322         typically used to compute the hash as defined by various EST
00323         analysis algorithms.  This array is statically allocated. It
00324         is initialized in the constructor and is never changed during
00325         the life time of this class.
00326 
00327         \note This array is statically allocated to enable ready
00328         access from NormalEncoder and RevCompEncoder functors defined
00329         in this class.  Hopefully with static arrays, the compilers
00330         more readily optimize and inline the method calls to enocde
00331         and encode2rc.
00332     */
00333     static char charToInt[];
00334 
00335     /** A simple array to map characters \c A, \c T, \c C, and \c G to
00336         complementary encodings \c 3, \c 0, \c 1, and \c 2
00337         respectively.
00338         
00339         This is a simple array of 255 entries that are used to convert
00340         the base pair encoding characters \c A, \c T, \c C, and \c G
00341         to \b complementary codes \c 3, \c 0, \c 1, and \c 2
00342         respectively.  This encoding is typically used to compute the
00343         hash as defined by various EST analysis algorithms.  This
00344         array is statically allocated. It is initialized in the
00345         constructor and is never changed during the life time of this
00346         class.
00347 
00348         \note This array is statically allocated to enable ready
00349         access from NormalEncoder and RevCompEncoder functors defined
00350         in this class.  Hopefully with static arrays, the compilers
00351         more readily optimize and inline the method calls to enocde
00352         and encode2rc.
00353     */
00354     static char charToIntComp[];
00355 
00356     /** A hash map that holds tables to aid in translating a given
00357         word to its reverse complement.
00358 
00359         <p>Converting a given encoded word (some fixed \em n number of
00360         base pairs, with each base pair encoded into 2-bits) to its
00361         reverse complement (that is, given the encoded sequence for \c
00362         attcggct it must be converted to the encoded sequence for \c
00363         agccgaat) needs to be computed as a part of EST analysis
00364         algorithms and heuristics. In order to enable rapid translation
00365         pre-populated tabes are used.</p>
00366 
00367         <p>However, the reverse-complement translation tables need to
00368         have entries corresponding to the size of words to be
00369         translated. Different algorithms use different word sizes
00370         (such as: 8 bps or 10 bps etc).  Accordingly, this hash_map is
00371         used to hold pre-computed reverse-complement translation
00372         tables. The key in the hash map is the word size. The
00373         translation tables contained in this hash map are used via the
00374         \c setRevCompTable method. If a reverse-complement entry does
00375         not exist, then a new entry is added by the addRevCompTable
00376         method. </p>
00377     */
00378     HashMap<int, int*> revCompTables;
00379 
00380     /** The reverse-complement translation table to be used by the
00381         encode2rc method.
00382 
00383         This array is set by the \c setRevCompTable method to refer to
00384         the reverse-complement translation table to translate words of
00385         given size to their corresponding reverse-complement
00386         encodings.
00387     */
00388     const int* revCompTable;
00389     
00390     /** The process-wide unique codec instance.
00391         
00392         This instance variable is a process-wide unique codec that is
00393         created when the process is started and is destroyed only when
00394         the process terminates.
00395     */
00396     static ESTCodec estCodec;
00397 };
00398 
00399 #endif