00001 #ifndef EST_CODEC_H 00002 #define EST_CODEC_H 00003 00004 //-------------------------------------------------------------------- 00005 // 00006 // This file is part of PEACE. 00007 // 00008 // PEACE is free software: you can redistribute it and/or modify it 00009 // under the terms of the GNU General Public License as published by 00010 // the Free Software Foundation, either version 3 of the License, or 00011 // (at your option) any later version. 00012 // 00013 // PEACE is distributed in the hope that it will be useful, but 00014 // WITHOUT ANY WARRANTY; without even the implied warranty of 00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 // General Public License for more details. 00017 // 00018 // You should have received a copy of the GNU General Public License 00019 // along with PEACE. If not, see <http://www.gnu.org/licenses/>. 00020 // 00021 // Miami University makes no representations or warranties about the 00022 // suitability of the software, either express or implied, including 00023 // but not limited to the implied warranties of merchantability, 00024 // fitness for a particular purpose, or non-infringement. Miami 00025 // University shall not be liable for any damages suffered by licensee 00026 // as a result of using, result of using, modifying or distributing 00027 // this software or its derivatives. 00028 // 00029 // By using or copying this Software, Licensee agrees to abide by the 00030 // intellectual property laws, and all other applicable laws of the 00031 // U.S., and the terms of GNU General Public License (version 3). 00032 // 00033 // Authors: Dhananjai M. Rao raodm@muohio.edu 00034 // 00035 //--------------------------------------------------------------------- 00036 00037 #include <functional> 00038 #include "HashMap.h" 00039 00040 /** A helper class to serve as a EST enCOder-DECoder. 00041 00042 <p>This class was introduced to try and centralize a bunch of EST 00043 encoding/decoding code that was spread out amonst multiple 00044 independent classes. Most of the EST analysis algorithms try and 00045 encode the base pairs (\c A, \c T, \c C, and \c G) in ESTs into 00046 2-bits each (\c 00, \c 11, \c 10, and \c 01) to reduce memory 00047 footprint and to enable use of numerical operations. The code to 00048 perform encoding and decoding was spread across multiple classes 00049 on a "as needed" basis. However, more than three classes required 00050 pretty much the same code warranting the introduction of this 00051 class to minimize code-redundancy. <p> 00052 00053 <p>There is one process-wide unique instance of this ESTCodec 00054 class. A single instance is used to create commonly used tables 00055 to enable rapid CODEC operations. These tables are created once, 00056 when the globally unique EST object is referenced. The globally 00057 unique instance can be referenced via the getCodec() method. </p> 00058 */ 00059 class ESTCodec { 00060 public: 00061 /** Obtain reference to process-wide unqiue instance of ESTCodec. 00062 00063 This method must be used to obtain the process-wide unique 00064 instance of the ESTCodec object. The returned reference can be 00065 used to invoke other methods in this class. Here is a typical 00066 usage: 00067 00068 \code 00069 00070 const ESTCodec& codec = ESTCodec::getCodec(); 00071 int bitCodec = codec.encode('A'); 00072 00073 \endcode 00074 */ 00075 inline static ESTCodec& getCodec() { return estCodec; } 00076 00077 /** Obtain 2-bit code for a given base pair. 00078 00079 This method can be used to obtain the 2-bit encoding for a 00080 given base pair (bp). This method essentially translate base 00081 pairs (\c A, \c T, \c C, and \c G, both upper and lower case) 00082 into 2-bits codes (\c 00, \c 11, \c 10, and \c 01). 00083 00084 \note In favor of speed, this method does not perform any 00085 special checks on the actual character in bp. It is the 00086 responsiblity of the caller to ensure that this method is 00087 invoked with appropriate parameter value. 00088 00089 \param[in] bp The base pair character (both upper and lower 00090 cases are handled correctly) to be encoded. 00091 */ 00092 inline static char encode(const char bp) 00093 { return charToInt[(int) bp]; } 00094 00095 /** Obtain 2-bit \b complement code for a given base pair. 00096 00097 This method can be used to obtain the \b complementary 2-bit 00098 encoding for a given base pair (bp). This method essentially 00099 translate base pairs (\c A, \c T, \c C, and \c G, both upper 00100 and lower case) into 2-bits codes (\c 11, \c 00, \c 01, and \c 00101 10). 00102 00103 \note In favor of speed, this method does not perform any 00104 special checks on the actual character in bp. It is the 00105 responsiblity of the caller to ensure that this method is 00106 invoked with appropriate parameter value. 00107 00108 \param[in] bp The base pair character (both upper and lower 00109 cases are handled correctly) whose complementary encoding is 00110 required. 00111 */ 00112 inline static char encode2rc(const char bp) 00113 { return charToIntComp[(int) bp]; } 00114 00115 00116 /** Obtain the reverse-complement for a given word. 00117 00118 This method can be used to obtain the reverse-complement 00119 encoding for a given encoded word. This method essentially 00120 translates a given encoded word to its reverse-complement 00121 representation. 00122 00123 \note In favor of speed, this method does not perform any 00124 special checks on the word to be translated. It is the 00125 responsiblity of the caller to ensure that this method is 00126 invoked with appropriate parameter value after the \c 00127 setRevCompTable method is invoked. 00128 00129 \param[in] word The encoded word that must be translated to 00130 its corresponding reverse complement representation. 00131 00132 \return The reverse-complement representation for a given 00133 word. 00134 */ 00135 inline int encode2rc(const int word) const { return revCompTable[word]; } 00136 00137 /** Set the reverse-complement translation table to be used whe 00138 next time encode2rc method is called. 00139 00140 This method must be invoked to set the correct translation 00141 table to be used by the encode2rc(int) method. If a 00142 translation table does not exist in the \c revCompTables, then 00143 a new reverse-complement table is created by the \c 00144 addRevCompTable method. 00145 00146 \param[in] wordSize The number of base pairs in the word for 00147 which a reverse-complement translation table is to be created. 00148 */ 00149 void setRevCompTable(const int wordSize); 00150 00151 /** A functor to generate a encoded word (serves as a hash entry). 00152 00153 This functor must be used to generate an encoded word from a 00154 "normal" (rather than reverse complement) fragment. This 00155 me thod handles 'n' entries in the EST in the following manner: 00156 00157 <ul> 00158 00159 <li>Whenever it encounters an 'n' base pair (that is created 00160 when ESTs are loaded in the class EST.cpp) it sets the 00161 ignoreMask to Mask (as Mask template parameter already tells 00162 us the number of bits we care about and that is used as an 00163 indicator of number hashes to ignore).</li> 00164 00165 <li>The ignoreMask is shifted right dropping off the least 00166 significant bits each time this method is called. If all bits 00167 of ignoreMask are cleared then the ignore mask is zero and 00168 ignored.</li> 00169 00170 <li>If the ignoreMask is non-zero, then the caller is expected 00171 not to use the hash returned from this method. as this 00172 word/hash contains a 'n'. As bits get shifted the value for 00173 'n' drops off (when the ignoreMask is zero) and the hashes can 00174 actually be used by the caller.</li> 00175 00176 </ul> 00177 00178 \tparam Shift The number of bits by which the encoding for the 00179 given base pair must be shifted to the left. For example, when 00180 using a word of length 6 nt, this value would be 10. 00181 00182 \tparam Mask The mask (with bits set to 1) that must be used 00183 to retain the signficiant values in the hash. For example, 00184 when using words of length 6, the Mask would be the binary 00185 <code>1111 1111 1111</code> or <code>0xfff</code>. 00186 00187 \param[in] w The current hash value that has been computed 00188 thusfar. 00189 00190 \param[in] bp The base pair ('A', 'T', 'C', 'G', or 'n') to be 00191 encoded by this method. 00192 00193 \param[in,out] ignoreMask The ignore mask that is used to 00194 determine if the current hash/word has a 'n' entry in it and 00195 must be ignored. 00196 00197 \return The hash for the current word being encoded. 00198 00199 \note The caller must use the returned value for further 00200 operation only if the ignoreMask is zero. Otherwise the 00201 encoder must be repeatedly called with subsequent bases until 00202 the ignoreMask is cleared by this method. 00203 */ 00204 template <const int& Shift, const int& Mask> 00205 struct NormalEncoder : public std::binary_function<int, char, int> { 00206 inline int operator()(const int w, const char bp, int& ignoreMask) const { 00207 // Setup the ignore mask if base pair is 'n' otherwise 00208 // drop off the lowest 2 bits. 00209 ignoreMask = (bp != 'N') ? (ignoreMask >> 2) : Mask; 00210 // Compute the hash for the word. Encoding for 'n' is 0. 00211 return ((w >> 2) | (ESTCodec::encode(bp) << Shift)) & Mask; 00212 } 00213 }; 00214 00215 /** A functor to generate a encoded word (serves as a hash entry). 00216 00217 This functor must be used to generate an encoded word for 00218 a reverse complement (rather than normal) fragment. This 00219 method handles 'n' entries in the EST in the following 00220 manner: 00221 00222 <ul> 00223 00224 <li>Whenever it encounters an 'n' base pair (that is created 00225 when ESTs are loaded in the class EST.cpp) it sets the 00226 ignoreMask to Mask (as Mask template parameter already tells 00227 us the number of bits we care about and that is used as an 00228 indicator of number hashes to ignore).</li> 00229 00230 <li>The ignoreMask is shifted right dropping off the least 00231 significant bits each time this method is called. If all bits 00232 of ignoreMask are cleared then the ignore mask is zero and 00233 ignored.</li> 00234 00235 <li>If the ignoreMask is non-zero, then the caller is expected 00236 not to use the hash returned from this method. as this 00237 word/hash contains a 'n'. As bits get shifted the value for 00238 'n' drops off (when the ignoreMask is zero) and the hashes can 00239 actually be used by the caller.</li> 00240 00241 </ul> 00242 00243 \tparam Shift The number of bits by which the encoding for the 00244 given base pair must be shifted. Currently this parameter is 00245 not used but it is present to provide a symmetric interface 00246 with the NormalEncoder. 00247 00248 \tparam Mask The mask (with bits set to 1) that must be used 00249 to retain the signficiant values in the hash. For example, 00250 when using words of length 6, the Mask would be the binary 00251 <code>1111 1111 1111</code> or <code>0xfff</code>. 00252 00253 \param[in] w The current hash value that has been computed 00254 thusfar. 00255 00256 \param[in] bp The base pair ('A', 'T', 'C', 'G', or 'n') to be 00257 encoded by this method. 00258 00259 \param[in,out] ignoreMask The ignore mask that is used to 00260 determine if the current hash/word has a 'n' entry in it and 00261 must be ignored. 00262 00263 \note The caller must use the returned value for further 00264 operation only if the ignoreMask is zero. Otherwise the 00265 encoder must be repeatedly called with subsequent bases until 00266 the ignoreMask is cleared by this method. 00267 */ 00268 template <const int& Shift, const int& Mask> 00269 struct RevCompEncoder : public std::binary_function<int, char, int> { 00270 inline int operator()(const int w, const char bp, int& ignoreMask) const { 00271 // Setup the ignore mask if base pair is 'n' otherwise 00272 // drop off the lowest 2 bits. 00273 ignoreMask = (bp != 'N') ? (ignoreMask >> 2) : Mask; 00274 // Compute the hash for the word. 'n' becomes 0. 00275 return ((w << 2) | ESTCodec::encode2rc(bp)) & Mask; 00276 } 00277 }; 00278 00279 /** The destructor. 00280 00281 The destructor frees up memory allocated to hold translation 00282 tables etc. The destructor is called only once, when the 00283 process-wide unique instance is destroyed when program 00284 terminates. 00285 */ 00286 ~ESTCodec(); 00287 00288 protected: 00289 /** The constructor. 00290 00291 The constructor is invoked only once when the process-wide 00292 unique static instance of the ESTCodec is created when the 00293 process starts. The constructor initializes the CharToInt 00294 array that is used to translate base pairs (\c A, \c T, \c C, 00295 and \c G) into 2-bits codes (\c 00, \c 11, \c 10, and \c 01) 00296 */ 00297 ESTCodec(); 00298 00299 /** Creates and adds a new reverse-complement translation table 00300 for the given word size. 00301 00302 This method is a helper method that is invoked from the \c 00303 setRevCompTable whenever a new reverse-complement translation 00304 table is needed. This method creates a reverse-complement 00305 table with 4<sup>wordSize</sup> entries. 00306 00307 \param[in] wordSize The number of base pairs in the word for 00308 which a reverse-complement translation table is to be created. 00309 00310 \return This method returns the newly created 00311 reverse-complement translation table. 00312 */ 00313 int* addRevCompTable(const int wordSize); 00314 00315 private: 00316 /** A simple array to map characters \c A, \c T, \c C, and \c G to 00317 \c 0, \c 3, \c 2, and \c 1 respectively. 00318 00319 This is a simple array of 255 entries that are used to convert 00320 the base pair encoding characters \c A, \c T, \c C, and \c G 00321 to \c 0, \c 3, \c 2, and \c 1 respectively. This encoding is 00322 typically used to compute the hash as defined by various EST 00323 analysis algorithms. This array is statically allocated. It 00324 is initialized in the constructor and is never changed during 00325 the life time of this class. 00326 00327 \note This array is statically allocated to enable ready 00328 access from NormalEncoder and RevCompEncoder functors defined 00329 in this class. Hopefully with static arrays, the compilers 00330 more readily optimize and inline the method calls to enocde 00331 and encode2rc. 00332 */ 00333 static char charToInt[]; 00334 00335 /** A simple array to map characters \c A, \c T, \c C, and \c G to 00336 complementary encodings \c 3, \c 0, \c 1, and \c 2 00337 respectively. 00338 00339 This is a simple array of 255 entries that are used to convert 00340 the base pair encoding characters \c A, \c T, \c C, and \c G 00341 to \b complementary codes \c 3, \c 0, \c 1, and \c 2 00342 respectively. This encoding is typically used to compute the 00343 hash as defined by various EST analysis algorithms. This 00344 array is statically allocated. It is initialized in the 00345 constructor and is never changed during the life time of this 00346 class. 00347 00348 \note This array is statically allocated to enable ready 00349 access from NormalEncoder and RevCompEncoder functors defined 00350 in this class. Hopefully with static arrays, the compilers 00351 more readily optimize and inline the method calls to enocde 00352 and encode2rc. 00353 */ 00354 static char charToIntComp[]; 00355 00356 /** A hash map that holds tables to aid in translating a given 00357 word to its reverse complement. 00358 00359 <p>Converting a given encoded word (some fixed \em n number of 00360 base pairs, with each base pair encoded into 2-bits) to its 00361 reverse complement (that is, given the encoded sequence for \c 00362 attcggct it must be converted to the encoded sequence for \c 00363 agccgaat) needs to be computed as a part of EST analysis 00364 algorithms and heuristics. In order to enable rapid translation 00365 pre-populated tabes are used.</p> 00366 00367 <p>However, the reverse-complement translation tables need to 00368 have entries corresponding to the size of words to be 00369 translated. Different algorithms use different word sizes 00370 (such as: 8 bps or 10 bps etc). Accordingly, this hash_map is 00371 used to hold pre-computed reverse-complement translation 00372 tables. The key in the hash map is the word size. The 00373 translation tables contained in this hash map are used via the 00374 \c setRevCompTable method. If a reverse-complement entry does 00375 not exist, then a new entry is added by the addRevCompTable 00376 method. </p> 00377 */ 00378 HashMap<int, int*> revCompTables; 00379 00380 /** The reverse-complement translation table to be used by the 00381 encode2rc method. 00382 00383 This array is set by the \c setRevCompTable method to refer to 00384 the reverse-complement translation table to translate words of 00385 given size to their corresponding reverse-complement 00386 encodings. 00387 */ 00388 const int* revCompTable; 00389 00390 /** The process-wide unique codec instance. 00391 00392 This instance variable is a process-wide unique codec that is 00393 created when the process is started and is destroyed only when 00394 the process terminates. 00395 */ 00396 static ESTCodec estCodec; 00397 }; 00398 00399 #endif