00001 #ifndef TWOPASS_D2_H 00002 #define TWOPASS_D2_H 00003 00004 //-------------------------------------------------------------------- 00005 // 00006 // This file is part of PEACE. 00007 // 00008 // PEACE is free software: you can redistribute it and/or modify it 00009 // under the terms of the GNU General Public License as published by 00010 // the Free Software Foundation, either version 3 of the License, or 00011 // (at your option) any later version. 00012 // 00013 // PEACE is distributed in the hope that it will be useful, but 00014 // WITHOUT ANY WARRANTY; without even the implied warranty of 00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 // General Public License for more details. 00017 // 00018 // You should have received a copy of the GNU General Public License 00019 // along with PEACE. If not, see <http://www.gnu.org/licenses/>. 00020 // 00021 // Miami University makes no representations or warranties about the 00022 // suitability of the software, either express or implied, including 00023 // but not limited to the implied warranties of merchantability, 00024 // fitness for a particular purpose, or non-infringement. Miami 00025 // University shall not be liable for any damages suffered by licensee 00026 // as a result of using, result of using, modifying or distributing 00027 // this software or its derivatives. 00028 // 00029 // By using or copying this Software, Licensee agrees to abide by the 00030 // intellectual property laws, and all other applicable laws of the 00031 // U.S., and the terms of GNU General Public License (version 3). 00032 // 00033 // Authors: Dhananjai M. Rao raodm@muohio.edu 00034 // 00035 //--------------------------------------------------------------------- 00036 00037 #include "FWAnalyzer.h" 00038 #include <string> 00039 #include <vector> 00040 00041 // Forward declaration to keep compiler happy 00042 class EST; 00043 class ResultLog; 00044 00045 /** \brief EST Analyzer that uses the D2 algorithm in both its 00046 symmetric and asymmetric variations to compute distance between 00047 two ESTs. 00048 00049 <p>This analyzer provides the mechanism to use D2 algorithm to 00050 compute the distance values between a pair of ESTs. For improved 00051 performance, the analyzer uses D2 asymmetric and symmetric together. 00052 It runs a fast "first pass" using D2 asymmetric and finds the minimum 00053 D2 score, as well as the two windows between which the score was 00054 computed. The analyzer then computes bounds and runs a bounded 00055 version of D2 symmetric. Ideally this analyzer will provide 00056 faster runtime because of the asymmetric pass, and not sacrifice any 00057 precision because of the bounded symmetric pass.<p> 00058 00059 \note This D2 analyzer uses a word size of 8 base pairs. This is 00060 hard coded into the algorithm in the form of various assumptions 00061 (data types of variables etc.). A similar approach is used by 00062 other D2 implementations as well. This is a compromise between 00063 performance and flexibility of the implementation. 00064 */ 00065 class TwoPassD2 : public FWAnalyzer { 00066 friend class ESTAnalyzerFactory; 00067 public: 00068 /** The destructor. 00069 00070 The destructor frees up all any dynamic memory allocated by 00071 this object for its operations. 00072 */ 00073 virtual ~TwoPassD2(); 00074 00075 /** Display valid command line arguments for this analyzer. 00076 00077 This method must be used to display all valid command line 00078 options that are supported by this analyzer. Currently, this 00079 analyzer does not require any special command line parameters. 00080 00081 \note The ESTAnalyzer base class requires that derived EST 00082 analyzer classes <b>must</b> override this method to display 00083 help for their custom command line arguments. When this 00084 method is overridden don't forget to call the corresponding 00085 base class implementation to display common options. 00086 00087 \param[out] os The output stream to which the valid command 00088 line arguments must be written. 00089 */ 00090 virtual void showArguments(std::ostream& os); 00091 00092 /** Process command line arguments. 00093 00094 <p> This method is used to process command line arguments 00095 specific to this EST analyzer. This method is typically used 00096 from the \c main method just after the EST analyzer has been 00097 instantiated. This method consumes all valid command line 00098 arguments. If the command line arguments were valid and 00099 successfully processed, then this method returns \c true. </p> 00100 00101 <p>Currently, this EST analyzer accepts an optional \c 00102 frameShift value to control the stride used for D2 00103 algorithm. However, the default value of 1 for \c frameShift 00104 is the preferred value for this parameter.</p> 00105 00106 \note The \c ESTAnalyzer base class requires that derived EST 00107 analyzer classes <b>must</b> override this method to process 00108 any command line arguments that are custom to their operation. 00109 However, as per API requirement, this methodcalls the 00110 corresponding base class implementation to display additional 00111 options. 00112 00113 \param[in,out] argc The number of command line arguments to be 00114 processed. 00115 00116 \param[in,out] argv The array of command line arguments. 00117 00118 \return This method returns \c true if the command line 00119 arguments were successfully processed. Otherwise this method 00120 returns \c false. This method checks to ensure that a valid 00121 frame size and a valid word size have been specified. 00122 */ 00123 virtual bool parseArguments(int& argc, char **argv); 00124 00125 /** Method to begin EST analysis. 00126 00127 This method is invoked just before commencement of EST 00128 analysis. This method first invokes the base class's \c 00129 initialize method that initializes the heuristic chain, if a 00130 chain has been set for this analyzer. It then initializes 00131 memory for the word tables used to store hash values for the 00132 pairs of ESTs to be analyzed. 00133 00134 \return Currently, this method always returns 0 (zero) to 00135 indicate initialization was successfully completed. 00136 */ 00137 int initialize(); 00138 00139 /** Method to obtain human-readable name for this EST analyzer 00140 00141 This method provides a human-readable string identifying the 00142 EST analyzer. This string is typically used for 00143 display/debugging purposes (particularly via the PEACE 00144 Interactive Console). 00145 00146 \return This method returns the string "layeredD2" identifiying 00147 this analyzer. 00148 */ 00149 virtual std::string getName() const { return "layeredD2"; } 00150 00151 /** Set the reference EST id for analysis. 00152 00153 This method is invoked just before a batch of ESTs are 00154 analyzed via a call to the analyze(EST *) method. This method 00155 currently saves the index in the \c refESTidx instance 00156 variable for further look up. Next it creates a "word table" 00157 (via a call to the \c buildWordTable method) mapping integer 00158 indices to integer hashes of words, in effect translating the 00159 sequence from a sequence of characters to a sequence of 00160 n-words (where n = wordSize). This word table is kept until 00161 the reference EST is changed, which reduces overhead. 00162 00163 \note This method must be called only after the \c 00164 initialize() method is called. 00165 00166 \return This method returns \c true if the estIdx was within 00167 the given range of values. Otherwise this method returns a 00168 non-zero value as the error code. 00169 */ 00170 virtual int setReferenceEST(const int estIdx); 00171 00172 /** Method to perform exhaustive EST analysis. 00173 00174 This method is used to perform the core tasks of comparing all 00175 ESTs to one another for full analysis of ESTs. This is an 00176 additional feature of PEACE that is \em not used for clustering 00177 but just doing an offline analysis. Currently, this method 00178 merely calls the corresponding base class implementation that 00179 performs all the necessary operations. 00180 00181 \note This method was introduced to avoid the unnecessary 00182 warning about partial overloading of virtual methods (warning 00183 #654) in ICC. 00184 00185 \return This method returns zero if all the processing 00186 proceeded successfully. On errors this method returns a 00187 non-zero value. 00188 */ 00189 virtual int analyze() { return FWAnalyzer::analyze(); } 00190 00191 protected: 00192 /** Analyze and obtain a distance metric. 00193 00194 This method can be used to compare a given EST with the 00195 reference EST (set via the call to the setReferenceEST()) 00196 method. 00197 00198 \param[in] otherEST The index (zero based) of the EST with 00199 which the reference EST is to be compared. 00200 00201 \return This method returns the distance value reported by the 00202 D2 algorithm. 00203 */ 00204 virtual float getMetric(const int otherEST); 00205 00206 /** Method to compare two metrics generated by this class. 00207 00208 This method provides the interface for comparing metrics 00209 generated by this ESTAnalyzer when comparing two different 00210 ESTs. This method returns \c true if \c metric1 is 00211 comparatively better than or equal to \c metric2. 00212 00213 \note As per the ESTAnalyzer API requirements, EST analyzers 00214 that are based on distance measures (such as this D2 analyzer) 00215 \b must override this method. 00216 00217 \param[in] metric1 The first metric to be compared against. 00218 00219 \param[in] metric2 The second metric to be compared against. 00220 00221 \return This method returns \c true if \c metric1 is 00222 comparatively better than \c metric2. 00223 */ 00224 inline bool compareMetrics(const float metric1, const float metric2) const 00225 { return (metric1 < metric2); } 00226 00227 /** Obtain an invalid (or the worst) metric generated by this 00228 analyzer. 00229 00230 This method can be used to obtain an invalid metric value for 00231 this analyzer. This value can be used to initialize metric 00232 values. 00233 00234 \note Derived distance-based metric classes (such as this D2 00235 analyzer) \b must override this method to provide a suitable 00236 value. 00237 00238 \return This method returns an invalid (or the worst) metric 00239 of 1e7 for this EST analyzer. 00240 */ 00241 inline float getInvalidMetric() const { return 400.0f; } 00242 00243 /** Get alignment data for the previous call to analyze method. 00244 00245 This method can be used to obtain alignment data that was 00246 obtained typically as an byproduct of the previous call to the 00247 analyze() method. This method essentially returns the 00248 difference between the two windows that provided the minimum 00249 d2 distance value. 00250 00251 \param[out] alignmentData The parameter is updated to the 00252 alignment information generated as a part of the the 00253 immediately preceding analyze(const int) method call is 00254 returned in the parameter. 00255 00256 \return This method always returns \c true to indicate that 00257 alignment data is computed by this ESTAnalyzer. 00258 */ 00259 virtual bool getAlignmentData(int &alignmentData); 00260 00261 /** Determine if this EST analyzer provides distance metrics or 00262 similarity metrics. 00263 00264 This method can be used to determine if this EST analyzer 00265 provides distance metrics or similarity metrics. If this 00266 method returns \c true, then this EST analyzer returns 00267 distance metrics (smaller is better). On the other hand, if 00268 this method returns \c false, then this EST analyzer returns 00269 similarity metrics (bigger is better). 00270 00271 \return This method returns \c true to indicate that this EST 00272 analyzer operates using distance metrics. 00273 */ 00274 bool isDistanceMetric() const { return true; } 00275 00276 /** Helper method to create a word table. 00277 00278 Creates a "word table" mapping integer indices to integer 00279 hashes of words, in effect translating the sequence from a 00280 sequence of characters to a sequence of n-words (where n = 00281 wordSize). 00282 00283 \param[out] wordTable The word table to be populated with with 00284 hash values. 00285 00286 \param[in] estSeq The sequence of base pairs associated with 00287 this EST that must be converted to hash values. 00288 00289 \param encoder The encoder object to be used to generate 00290 encoding for the words added to the generated word table. 00291 */ 00292 template <typename Encoder> 00293 void buildWordTable(int* wordTable, const char* estSeq, Encoder encoder) { 00294 // Compute the has for the first word using the encoder. The 00295 // encoder may do normal or reverse-complement encoding for us. 00296 register int hash = 0; 00297 int ignoreMask = 0; 00298 for(register int i = 0; ((*estSeq != 0) && (i < wordSize - 1)); 00299 i++, estSeq++) { 00300 hash = encoder(hash, *estSeq, ignoreMask); 00301 } 00302 // Now compute the hash for each word 00303 for(int entry = 0; (*estSeq != 0); entry++, estSeq++) { 00304 hash = encoder(hash, *estSeq, ignoreMask); 00305 // If ignoreMask is zero that indicates that the hash 00306 // is not tainted due to 'n' bp in sequence. So use it. 00307 // Otherwise, use the special value for 'n' hashes. 00308 if (!ignoreMask) { 00309 wordTable[entry] = hash; 00310 } else { 00311 wordTable[entry] = NHash; 00312 } 00313 } 00314 } 00315 00316 /** Instance variable that maps an index in the reference sequence 00317 (sequence s1) to the hash of a word. This hash can then be used 00318 as an index in the fdHashMap to get the frequency differential 00319 for that word. 00320 00321 The word table is created in the initialize() method and 00322 filled in using the buildWordTable() method. For the reference 00323 EST, buildWordTable() is called in the setReferenceEST() method, 00324 meaning it does not need to be rebuilt every time we analyze 00325 a new comparison sequence. 00326 */ 00327 int* s1WordTable; 00328 00329 /** Instance variable that maps an index in the comparison sequence 00330 (sequence s2) to the hash of a word. 00331 00332 This hash is used as an index in the fdHashMap to get the 00333 frequency differential for that word. 00334 00335 The word table is created in the initialize() method and 00336 filled in using the buildWordTable() method. For the comparison 00337 EST, buildWordTable() must be called in the analyze() method because 00338 a new comparison sequence is given every time analyze() is called. 00339 */ 00340 int* s2WordTable; 00341 00342 /** Array to track the delta values in the core D2 algorithm. 00343 00344 This array is initialized to point to an array of size 00345 4<sup>wordSize</sup>. This array is used to track the delta 00346 values generated in the core D2 algorithm. 00347 */ 00348 int *delta; 00349 00350 /** Parameter to define number of characters to shift the frame 00351 on the reference sequence, when computing D2. 00352 00353 This parameter is used to enable D2-asymmetric behavior. 00354 The default value is 1, which means D2 symmetric: all frames 00355 in both sequences will be compared. Higher values mean that 00356 the algorithm will shift by more than one character when 00357 shifting the frame on the reference sequence, resulting in 00358 fewer computations but a possible loss of accuracy from not 00359 comparing every frame in both sequences. 00360 */ 00361 int frameShift; 00362 00363 /** Instance variable to store the number of bits to be shifted to 00364 create hash values. 00365 00366 <p>This instance variable is set to the value of 2 * (\em 00367 wordSize - 1) (in the \c initialize method) to reflect the 00368 number of bits that need to be shifted in order to build the 00369 hash values for common words (including the values stored in 00370 \c s1WordMap and \c s1RCWordMap).</p> 00371 00372 <p>This instance variable is actually passed on to the 00373 ESTCodec::NormalEncoder or ESTCodec::RevCompEncoder when 00374 computing hash values. Since this is value is passed in a 00375 template parameter, it is defined to be static (to ensure that 00376 it has external linkage as per the ISO/ANSI standards 00377 requirement).</p> 00378 */ 00379 static int bitShift; 00380 00381 /** The threshold score below which two ESTs are considered 00382 sufficiently similar to be clustered. 00383 00384 This instance variable tracks the threshold value to be used 00385 to break out of the core D2 loop. This value essentially 00386 represents the D2 score below which two ESTs are considered 00387 sufficiently similar. Currently, the default threshold value 00388 is set to 0. This value can be overridden by the user with 00389 the "--threshold" command line argument. 00390 */ 00391 static int minThreshold; 00392 00393 /** The threshold score above which two ESTs are considered 00394 sufficiently dissimilar that a more accurate symmetric 00395 D2 pass would still not cluster the ESTs. 00396 00397 This instance variable tracks the threshold value to be used 00398 to call the runD2Bounded method. If the calculated D2 score 00399 from the runD2Asymmetric method is above this threshold, 00400 bounded D2 will not be run. Currently, the default value is 00401 130. This value can be overridden by the user with the 00402 "--maxThreshold" command line argument. 00403 */ 00404 int maxThreshold; 00405 00406 /** The threshold score to be used for clustering. 00407 00408 note: need to disambiguate from the two scores above; 00409 and possibly minThreshold should be made non-static for consistency 00410 */ 00411 int threshold; 00412 00413 private: 00414 /** The set of arguments specific to the D2 algorithm 00415 00416 This instance variable contains a static list of arguments 00417 that are specific only to the D2 analyzer class. This 00418 argument list is statically defined and shared by all 00419 instances of this class. 00420 00421 \note Use of static arguments and parameters makes D2 class 00422 hierarchy not MT-safe. 00423 */ 00424 static arg_parser::arg_record argsList[]; 00425 00426 /* The default constructor for this class. 00427 00428 The default constructor for this class. The constructor is 00429 made private so that this class cannot be directly 00430 instantiated. However, since the ESTAnalyzerFactory is a 00431 friend of this class; therefore it can instantiate the D2 00432 analyzer. Accordingly, the ESTAnalyzerFactory::create() 00433 method must be used to instantiate this class. 00434 00435 \param[in] refESTidx The reference EST index value to be used 00436 when performing EST analysis. This parameter should be >= 0. 00437 This value is simply passed onto the base class. 00438 00439 \param[in] outputFile The name of the output file to which the 00440 EST analysis data is to be written. This parameter is ignored 00441 if this analyzer is used for clustering. If this parameter is 00442 the empty string then output is written to standard output. 00443 This value is simply passed onto the base class. 00444 */ 00445 TwoPassD2(const int refESTidx, const std::string& outputFileName); 00446 00447 /** Instance variable to track alignment metric computed by the 00448 analyze() method. 00449 00450 This instance variable is used to hold the alignment metric 00451 that was computed in the previous \c analyze method call. By 00452 default this value is set to zero. The alignment metric is 00453 computed as the difference in the window positions (on the two 00454 ESTs being analyzed) with the minimum d2 distance. 00455 */ 00456 int alignmentMetric; 00457 00458 /** The bitmask to be used when building hash values. 00459 00460 This bitmask is used to drop higher order bits when building 00461 hash values for \c wordSize (defined in base class) base pairs 00462 in a given EST. This instance variable is actually passed on 00463 to the ESTCodec::NormalEncoder or ESTCodec::RevCompEncoder 00464 when computing hash values. Since this is value is passed in 00465 a template parameter, it is defined to be static (to ensure 00466 that it has external linkage as per the ISO/ANSI standards 00467 requirement). 00468 */ 00469 static int BitMask; 00470 00471 /** The hash value to be used for a word containing 'N'. 00472 00473 Initialized to MapSize (the size of the delta table). 00474 */ 00475 static int NHash; 00476 00477 /** Boolean value indicating whether or not d2 scores should be 00478 normalized (converted to a 1.0-based value where 1.0 is equal 00479 to the d2 threshold). 00480 00481 Defaults to false (i.e. scores should be normalized). 00482 */ 00483 static bool noNormalize; 00484 00485 /** Instance variable to track the number of words (of \c 00486 wordSize) that can fit into a window (of \c frameSize). 00487 00488 This instance variable is set in the initialize() method and 00489 its value is used by various methods in this class. Rather 00490 than computing this value repeatedly, it is computed once and 00491 used throughout. 00492 */ 00493 int numWordsInWindow; 00494 00495 /** Length of sequence 1, or the reference sequence. Stored to 00496 avoid making multiple unnecessary strlen() calls. 00497 */ 00498 int sq1Len; 00499 00500 /** Length of sequence 2, or the comparison sequence. Stored to 00501 avoid making multiple unnecessary strlen() calls. 00502 */ 00503 int sq2Len; 00504 00505 /** Helper method to update the delta table as well as the 00506 minimum score (if it has changed) when the window is shifted. 00507 00508 For d2 asymmetric, the updateWindow method also has to keep 00509 track of the location on each sequence where the minimum 00510 score was found, which is the function of the variables 00511 s1MinScoreIdx and s2MinScoreIdx. The variables s1CurWindowIdx 00512 and s2CurWindowIdx mark the start of the current window on each 00513 sequence. All of these variables are numbers, corresponding to 00514 indices of the word tables. 00515 */ 00516 inline void updateWindowAsym(const int wordIn, const int wordOut, 00517 int& score, int& minScore, 00518 const int s1CurWindowIdx, 00519 const int s2CurWindowIdx, 00520 int* s1MinScoreIdx, int* s2MinScoreIdx) { 00521 // Update score and delta for word moving in 00522 if (wordIn != NHash) { 00523 score -= (delta[wordIn] << 1) - 1; 00524 delta[wordIn]--; 00525 } 00526 // Update score and delta for word moving out 00527 if (wordOut != NHash) { 00528 score += (delta[wordOut] << 1) + 1; 00529 delta[wordOut]++; 00530 } 00531 // Track the minimum score. 00532 if (score < minScore) { 00533 minScore = score; 00534 // Update the indices for the new min score 00535 *s1MinScoreIdx = s1CurWindowIdx; 00536 *s2MinScoreIdx = s2CurWindowIdx; 00537 } 00538 } 00539 00540 /** Helper method to update the delta table as well as the 00541 minimum score (if it has changed) when the window is shifted. 00542 This method is identical to the updateWindow method found in 00543 D2.h (Peace's base D2 implementation). 00544 */ 00545 inline void updateWindow(const int wordIn, const int wordOut, 00546 int& score, int& minScore, 00547 const int windowDistance) { 00548 // Update score and delta for word moving in 00549 if (wordIn != NHash) { 00550 score -= (delta[wordIn] << 1) - 1; 00551 delta[wordIn]--; 00552 } 00553 // Update score and delta for word moving out 00554 if (wordOut != NHash) { 00555 score += (delta[wordOut] << 1) + 1; 00556 delta[wordOut]++; 00557 } 00558 // Track the minimum score. 00559 if (score < minScore) { 00560 minScore = score; 00561 alignmentMetric = windowDistance; 00562 } 00563 } 00564 00565 /** Method to dynamically compute parameters such as frame size 00566 and threshold. Sets these parameters to their (hopefully) 00567 optimum values for better clustering of data sets with wide 00568 variation in length, such as data generated by 454 pyrosequencing. 00569 */ 00570 void updateParameters(); 00571 00572 /** Method to run the D2 algorithm, asymmetric version. 00573 This method runs D2 asymmetric on the two EST sequences 00574 and finds the minimum D2 score. Since this is D2 asymmetric, 00575 the score is not guaranteed to be the true minimum. Thus, 00576 this method also finds the indices of the windows on each 00577 sequence where the minimum D2 score was found. The analyzer 00578 will use these to compute appropriate bounds for the bounded 00579 symmetric D2 function. 00580 */ 00581 float runD2Asymmetric(int* s1MinScoreIdx, int* s2MinScoreIdx); 00582 00583 /** Method to run the D2 algorithm, symmetric version. 00584 This method is essentially identical to the runD2 method 00585 in the D2.cpp class (Peace's base D2 implementation) with 00586 the exception that this method places bounds on each sequence. 00587 It will not search beyond these bounds. This saves a great 00588 deal of computation time (though the asymptotic runtime 00589 is unchanged) by avoiding unnecessary computations and only 00590 searching in the area where the best match might be found. 00591 */ 00592 float runD2Bounded(int sq1Start, int sq1End, int sq2Start, int sq2End); 00593 00594 /** The hint key that is used to add hint for normal or 00595 reverse-complement D2 computation. 00596 00597 This hint key is used to set a hint in the \c hints hash 00598 map. This string is defined as a constant to save compute time 00599 in the core \c runHeuristics method. 00600 */ 00601 const std::string hintKey; 00602 00603 }; 00604 00605 #endif