00001 #ifndef D2_H 00002 #define D2_H 00003 00004 //-------------------------------------------------------------------- 00005 // 00006 // This file is part of PEACE. 00007 // 00008 // PEACE is free software: you can redistribute it and/or modify it 00009 // under the terms of the GNU General Public License as published by 00010 // the Free Software Foundation, either version 3 of the License, or 00011 // (at your option) any later version. 00012 // 00013 // PEACE is distributed in the hope that it will be useful, but 00014 // WITHOUT ANY WARRANTY; without even the implied warranty of 00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 // General Public License for more details. 00017 // 00018 // You should have received a copy of the GNU General Public License 00019 // along with PEACE. If not, see <http://www.gnu.org/licenses/>. 00020 // 00021 // Miami University makes no representations or warranties about the 00022 // suitability of the software, either express or implied, including 00023 // but not limited to the implied warranties of merchantability, 00024 // fitness for a particular purpose, or non-infringement. Miami 00025 // University shall not be liable for any damages suffered by licensee 00026 // as a result of using, result of using, modifying or distributing 00027 // this software or its derivatives. 00028 // 00029 // By using or copying this Software, Licensee agrees to abide by the 00030 // intellectual property laws, and all other applicable laws of the 00031 // U.S., and the terms of GNU General Public License (version 3). 00032 // 00033 // Authors: Dhananjai M. Rao raodm@muohio.edu 00034 // 00035 //--------------------------------------------------------------------- 00036 00037 #include "FWAnalyzer.h" 00038 #include "EST.h" 00039 00040 #include <string> 00041 #include <vector> 00042 00043 // Forward declaration to keep compiler happy 00044 class EST; 00045 class ResultLog; 00046 00047 /** \brief EST Analyzer that uses the D2 algorithm to compute 00048 distances between two ESTs. 00049 00050 <p>This analyzer provides the mechanism to use D2 algorithm to 00051 compute the distance values between a pair of ESTs. The D2 00052 implementation has been adapted purely from the implementations of 00053 WCD, Zimmerman, and CLU.<p> 00054 00055 \note This D2 analyzer uses a word size of 8 base pairs. This is 00056 hard coded into the algorithm in the form of various assumptions 00057 (data types of variables etc.). A similar approach is used by 00058 other D2 implementations as well. This is a compromise between 00059 performance and flexibility of the implementation. 00060 */ 00061 class D2 : public FWAnalyzer { 00062 friend class ESTAnalyzerFactory; 00063 public: 00064 /** The destructor. 00065 00066 The destructor frees up all any dynamic memory allocated by 00067 this object for its operations. 00068 */ 00069 virtual ~D2(); 00070 00071 /** Display valid command line arguments for this analyzer. 00072 00073 This method must be used to display all valid command line 00074 options that are supported by this analyzer. Currently, this 00075 analyzer does not require any special command line parameters. 00076 00077 \note The ESTAnalyzer base class requires that derived EST 00078 analyzer classes <b>must</b> override this method to display 00079 help for their custom command line arguments. When this 00080 method is overridden don't forget to call the corresponding 00081 base class implementation to display common options. 00082 00083 \param[out] os The output stream to which the valid command 00084 line arguments must be written. 00085 */ 00086 virtual void showArguments(std::ostream& os); 00087 00088 /** Process command line arguments. 00089 00090 <p> This method is used to process command line arguments 00091 specific to this EST analyzer. This method is typically used 00092 from the \c main method just after the EST analyzer has been 00093 instantiated. This method consumes all valid command line 00094 arguments. If the command line arguments were valid and 00095 successfully processed, then this method returns \c true. </p> 00096 00097 <p>Currently, this EST analyzer accepts an optional \c 00098 frameShift value to control the stride used for D2 00099 algorithm. However, the default value of 1 for \c frameShift 00100 is the preferred value for this parameter.</p> 00101 00102 \note The \c ESTAnalyzer base class requires that derived EST 00103 analyzer classes <b>must</b> override this method to process 00104 any command line arguments that are custom to their operation. 00105 However, as per API requirement, this methodcalls the 00106 corresponding base class implementation to display additional 00107 options. 00108 00109 \param[in,out] argc The number of command line arguments to be 00110 processed. 00111 00112 \param[in,out] argv The array of command line arguments. 00113 00114 \return This method returns \c true if the command line 00115 arguments were successfully processed. Otherwise this method 00116 returns \c false. This method checks to ensure that a valid 00117 frame size and a valid word size have been specified. 00118 */ 00119 virtual bool parseArguments(int& argc, char **argv); 00120 00121 /** Method to begin EST analysis. 00122 00123 This method is invoked just before commencement of EST 00124 analysis. This method first invokes the base class's \c 00125 initialize method that initializes the heuristic chain, if a 00126 chain has been set for this analyzer. It then initializes 00127 memory for the word tables used to store hash values for the 00128 pairs of ESTs to be analyzed. 00129 00130 \return Currently, this method always returns 0 (zero) to 00131 indicate initialization was successfully completed. 00132 */ 00133 int initialize(); 00134 00135 /** Method to obtain human-readable name for this EST analyzer 00136 00137 This method provides a human-readable string identifying the 00138 EST analyzer. This string is typically used for 00139 display/debugging purposes (particularly via the PEACE 00140 Interactive Console). 00141 00142 \return This method returns the string "d2" identifiying this 00143 analyzer. 00144 */ 00145 virtual std::string getName() const { return "d2"; } 00146 00147 /** Set the reference EST id for analysis. 00148 00149 This method is invoked just before a batch of ESTs are 00150 analyzed via a call to the analyze(EST *) method. This method 00151 currently saves the index in the \c refESTidx instance 00152 variable for further look up. Next it creates a "word table" 00153 (via a call to the \c buildWordTable method) mapping integer 00154 indices to integer hashes of words, in effect translating the 00155 sequence from a sequence of characters to a sequence of 00156 n-words (where n = wordSize). This word table is kept until 00157 the reference EST is changed, which reduces overhead. 00158 00159 \note This method must be called only after the \c 00160 initialize() method is called. 00161 00162 \return This method returns \c true if the estIdx was within 00163 the given range of values. Otherwise this method returns a 00164 non-zero value as the error code. 00165 */ 00166 virtual int setReferenceEST(const int estIdx); 00167 00168 /** Method to perform exhaustive EST analysis. 00169 00170 This method is used to perform the core tasks of comparing all 00171 ESTs to one another for full analysis of ESTs. This is an 00172 additional feature of PEACE that is \em not used for clustering 00173 but just doing an offline analysis. Currently, this method 00174 merely calls the corresponding base class implementation that 00175 performs all the necessary operations. 00176 00177 \note This method was introduced to avoid the unnecessary 00178 warning about partial overloading of virtual methods (warning 00179 #654) in ICC. 00180 00181 \return This method returns zero if all the processing 00182 proceeded successfully. On errors this method returns a 00183 non-zero value. 00184 */ 00185 virtual int analyze() { return FWAnalyzer::analyze(); } 00186 00187 protected: 00188 /** Analyze and obtain a distance metric. 00189 00190 This method can be used to compare a given EST with the 00191 reference EST (set via the call to the setReferenceEST()) 00192 method. 00193 00194 \param[in] otherEST The index (zero based) of the EST with 00195 which the reference EST is to be compared. 00196 00197 \return This method returns the distance value reported by the 00198 D2 algorithm. 00199 */ 00200 virtual float getMetric(const int otherEST); 00201 00202 /** Method to compare two metrics generated by this class. 00203 00204 This method provides the interface for comparing metrics 00205 generated by this ESTAnalyzer when comparing two different 00206 ESTs. This method returns \c true if \c metric1 is 00207 comparatively better than or equal to \c metric2. 00208 00209 \note As per the ESTAnalyzer API requirements, EST analyzers 00210 that are based on distance measures (such as this D2 analyzer) 00211 \b must override this method. 00212 00213 \param[in] metric1 The first metric to be compared against. 00214 00215 \param[in] metric2 The second metric to be compared against. 00216 00217 \return This method returns \c true if \c metric1 is 00218 comparatively better than \c metric2. 00219 */ 00220 inline bool compareMetrics(const float metric1, const float metric2) const 00221 { return (metric1 < metric2); } 00222 00223 /** Obtain an invalid (or the worst) metric generated by this 00224 analyzer. 00225 00226 This method can be used to obtain an invalid metric value for 00227 this analyzer. This value can be used to initialize metric 00228 values. 00229 00230 \note Derived distance-based metric classes (such as this D2 00231 analyzer) \b must override this method to provide a suitable 00232 value. 00233 00234 \return This method returns an invalid (or the worst) metric 00235 of 1e7 for this EST analyzer. 00236 */ 00237 inline float getInvalidMetric() const { return 4.0f * frameSize; } 00238 00239 /** Get alignment data for the previous call to analyze method. 00240 00241 This method can be used to obtain alignment data that was 00242 obtained typically as an byproduct of the previous call to the 00243 analyze() method. This method essentially returns the 00244 difference between the two windows that provided the minimum 00245 d2 distance value. 00246 00247 \param[out] alignmentData The parameter is updated to the 00248 alignment information generated as a part of the the 00249 immediately preceding analyze(const int) method call is 00250 returned in the parameter. 00251 00252 \return This method always returns \c true to indicate that 00253 alignment data is computed by this ESTAnalyzer. 00254 */ 00255 virtual bool getAlignmentData(int &alignmentData); 00256 00257 /** Determine if this EST analyzer provides distance metrics or 00258 similarity metrics. 00259 00260 This method can be used to determine if this EST analyzer 00261 provides distance metrics or similarity metrics. If this 00262 method returns \c true, then this EST analyzer returns 00263 distance metrics (smaller is better). On the other hand, if 00264 this method returns \c false, then this EST analyzer returns 00265 similarity metrics (bigger is better). 00266 00267 \return This method returns \c true to indicate that this EST 00268 analyzer operates using distance metrics. 00269 */ 00270 bool isDistanceMetric() const { return true; } 00271 00272 /** Helper method to create a word table. 00273 00274 Creates a "word table" mapping integer indices to integer 00275 hashes of words, in effect translating the sequence from a 00276 sequence of characters to a sequence of n-words (where n = 00277 wordSize). 00278 00279 \param encoder The encoder class to be used for building the 00280 word table. 00281 00282 \param[out] wordTable The word table to be populated with with 00283 hash values. 00284 00285 \param[in] estSeq The sequence of base pairs associated with 00286 this EST that must be converted to hash values. 00287 */ 00288 template <typename Encoder> 00289 void buildWordTable(std::vector<int>& wordTable, const char* estSeq, 00290 Encoder encoder) { 00291 // First clear out old data in word table. 00292 wordTable.clear(); 00293 const size_t wordTableSize = EST::getMaxESTLen() + frameSize; 00294 // Reserve space to avoid repeated reallocations as entries 00295 // are added to the wordTable vector. 00296 wordTable.reserve(wordTableSize); 00297 // Compute the has for the first word using the encoder. The 00298 // encoder may do normal or reverse-complement encoding for us. 00299 int hash = 0; 00300 int ignoreHash = 0; 00301 for(register int i = 0; ((*estSeq != 0) && (i < wordSize - 1)); 00302 i++, estSeq++) { 00303 hash = encoder(hash, *estSeq, ignoreHash); 00304 } 00305 // Now compute the hash for each word 00306 for(int entry = 0; (*estSeq != 0); entry++, estSeq++) { 00307 hash = encoder(hash, *estSeq, ignoreHash); 00308 if (!ignoreHash) { 00309 // This hash does not have a 'n' in it. So use it. 00310 wordTable.push_back(hash); 00311 } 00312 } 00313 } 00314 00315 /** Instance variable that maps an index in the reference sequence 00316 (sequence s1) to the hash of a word. This hash can then be 00317 used as an index in the delta array to get the frequency 00318 differential for that word. 00319 00320 The word table is created in the initialize() method and 00321 filled in using the buildWordTable() method. For the reference 00322 EST, buildWordTable() is called in the setReferenceEST() method, 00323 meaning it does not need to be rebuilt every time we analyze 00324 a new comparison sequence. 00325 */ 00326 std::vector<int> s1WordTable; 00327 00328 /** Instance variable that maps an index in the comparison sequence 00329 (sequence s2) to the hash of a word. 00330 00331 This hash is used as an index in the delta array to get the 00332 frequency differential for that word. 00333 00334 The word table is created in the initialize() method and 00335 filled in using the buildWordTable() method. For the comparison 00336 EST, buildWordTable() must be called in the analyze() method because 00337 a new comparison sequence is given every time analyze() is called. 00338 */ 00339 std::vector<int> s2WordTable; 00340 00341 /** Array to track the delta values in the core D2 algorithm. 00342 00343 This array is initialized to point to an array of size 00344 4<sup>wordSize</sup>. This array is used to track the delta 00345 values generated in the core D2 algorithm. 00346 */ 00347 int *delta; 00348 00349 /** Parameter to define number of characters to shift the frame 00350 on the reference sequence, when computing D2. 00351 00352 This parameter is used to enable D2-asymmetric behavior. 00353 The default value is 1, which means D2 symmetric: all frames 00354 in both sequences will be compared. Higher values mean that 00355 the algorithm will shift by more than one character when 00356 shifting the frame on the reference sequence, resulting in 00357 fewer computations but a possible loss of accuracy from not 00358 comparing every frame in both sequences. 00359 00360 \note Currently this value is not used. 00361 */ 00362 static int frameShift; 00363 00364 /** Instance variable to store the number of bits to be shifted to 00365 create hash values. 00366 00367 <p>This instance variable is set to the value of 2 * (\em 00368 wordSize - 1) (in the \c initialize method) to reflect the 00369 number of bits that need to be shifted in order to build the 00370 hash values for common words (including the values stored in 00371 \c s1WordMap and \c s1RCWordMap).</p> 00372 00373 <p>This instance variable is actually passed on to the 00374 ESTCodec::NormalEncoder or ESTCodec::RevCompEncoder when 00375 computing hash values. Since this is value is passed in a 00376 template parameter, it is defined to be static (to ensure that 00377 it has external linkage as per the ISO/ANSI standards 00378 requirement).</p> 00379 */ 00380 static int bitShift; 00381 00382 /** The threshold score below which two ESTs are considered 00383 sufficiently similar to be clustered. 00384 00385 This instance variable tracks the threshold value to be used 00386 to break out of the core D2 loop. This value essentially 00387 represents the D2 score below which two ESTs are considered 00388 sufficiently similar. Currently, the default threshold value 00389 is set to 40. This value can be overridden by the user with 00390 the "--threshold" command line argument. 00391 */ 00392 static int threshold; 00393 00394 private: 00395 /** The set of arguments specific to the D2 algorithm 00396 00397 This instance variable contains a static list of arguments 00398 that are specific only to the D2 analyzer class. This 00399 argument list is statically defined and shared by all 00400 instances of this class. 00401 00402 \note Use of static arguments and parameters makes D2 class 00403 hierarchy not MT-safe. 00404 */ 00405 static arg_parser::arg_record argsList[]; 00406 00407 /* The default constructor for this class. 00408 00409 The default constructor for this class. The constructor is 00410 made private so that this class cannot be directly 00411 instantiated. However, since the ESTAnalyzerFactory is a 00412 friend of this class; therefore it can instantiate the D2 00413 analyzer. Accordingly, the ESTAnalyzerFactory::create() 00414 method must be used to instantiate this class. 00415 00416 \param[in] refESTidx The reference EST index value to be used 00417 when performing EST analysis. This parameter should be >= 0. 00418 This value is simply passed onto the base class. 00419 00420 \param[in] outputFile The name of the output file to which the 00421 EST analysis data is to be written. This parameter is ignored 00422 if this analyzer is used for clustering. If this parameter is 00423 the empty string then output is written to standard output. 00424 This value is simply passed onto the base class. 00425 */ 00426 D2(const int refESTidx, const std::string& outputFileName); 00427 00428 /** Instance variable to track alignment metric computed by the 00429 analyze() method. 00430 00431 This instance variable is used to hold the alignment metric 00432 that was computed in the previous \c analyze method call. By 00433 default this value is set to zero. The alignment metric is 00434 computed as the difference in the window positions (on the two 00435 ESTs being analyzed) with the minimum d2 distance. 00436 */ 00437 int alignmentMetric; 00438 00439 /** The bitmask to be used when build hashing values. 00440 00441 This bitmask is used to drop higher order bits when building 00442 hash values for \c wordSize (defined in base class) base pairs 00443 in a given EST. This instance variable is actually passed on 00444 to the ESTCodec::NormalEncoder or ESTCodec::RevCompEncoder 00445 when computing hash values. Since this is value is passed in 00446 a template parameter, it is defined to be static (to ensure 00447 that it has external linkage as per the ISO/ANSI standards 00448 requirement). 00449 */ 00450 static int BitMask; 00451 00452 /** Instance variable to track the number of words (of \c 00453 wordSize) that can fit into a window (of \c frameSize). 00454 00455 This instance variable is set in the initialize() method and 00456 its value is used by various methods in this class. Rather 00457 than computing this value repeatedly, it is computed once and 00458 used throughout. 00459 */ 00460 int numWordsInWindow; 00461 00462 /** Helper method to update the scores based on a sliding window. 00463 00464 This method is invoked from several different spots from the 00465 runD2() method to update the d2 scores as the window slides 00466 across the two sequences being analyzed. The hash values of 00467 the words moving into and out of the window are used to update 00468 the scores. 00469 00470 \param[in] wordIn The hash value of the word moving into the 00471 window. 00472 00473 \param[in] wordOut The hash value of the word moving out of 00474 the window. 00475 00476 \param[in,out] score The current running score for this 00477 window. This value is updated using the delta array. 00478 00479 \param[in,out] minScore The current minimum score. This value 00480 is updated after the score is updated to reflect the minimum 00481 of score and minScore. 00482 */ 00483 inline void updateWindow(const int wordIn, const int wordOut, 00484 int& score, int& minScore) { 00485 // Update score and delta for word moving in 00486 score -= (delta[wordIn] << 1) - 1; 00487 delta[wordIn]--; 00488 // Update score and delta for word moving out 00489 score += (delta[wordOut] << 1) + 1; 00490 delta[wordOut]++; 00491 // Track the minimum score. 00492 minScore = std::min(score, minScore); 00493 } 00494 00495 /** The core method that run's the D2 algorithm. 00496 00497 This method performs the core D2 analysis. This method is 00498 invoked from the getMetric() method to run the core D2 00499 algorithm. This method operates as follows: 00500 00501 <ol> 00502 00503 <li>If a heuristic chain has been set, then this method 00504 obtains a hint from the chain to decide if the normal or 00505 reverse complement analysis must be performed. The default is 00506 normal analysis.</li> 00507 00508 <li>Next, a hash table of words is built for the otherEST 00509 sequence via a call to the buildWordTable method.</li> 00510 00511 <li>Next, it computes the score using the first two windows.</li> 00512 00513 <li>Finally, for each window in the reference EST it iterates 00514 over the windows in the other EST (sliding windows in each 00515 inner iteration) to compute the minimum d2 score by calling 00516 the updateWindow() method.</li> 00517 00518 <li>It finally returns the minimum d2 score recorded.</li> 00519 00520 </ol> 00521 00522 \param[in] otherEST The index of the other EST to be analyzed 00523 by this method. 00524 00525 \return The d2 score between the reference EST (set via call 00526 to setReferenceEST()) and the otherEST (parameter). 00527 */ 00528 float runD2(const int otherEST); 00529 00530 /** The hint key that is used to add hint for normal or 00531 reverse-complement D2 computation. 00532 00533 This hint key is used to set a hint in the \c hints hash 00534 map. This string is defined as a constant to save compute time 00535 in the core \c runHeuristics method. 00536 */ 00537 const std::string hintKey; 00538 }; 00539 00540 #endif