00001 #ifndef D2_ZIM_H 00002 #define D2_ZIM_H 00003 00004 //-------------------------------------------------------------------- 00005 // 00006 // This file is part of PEACE. 00007 // 00008 // PEACE is free software: you can redistribute it and/or modify it 00009 // under the terms of the GNU General Public License as published by 00010 // the Free Software Foundation, either version 3 of the License, or 00011 // (at your option) any later version. 00012 // 00013 // PEACE is distributed in the hope that it will be useful, but 00014 // WITHOUT ANY WARRANTY; without even the implied warranty of 00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 // General Public License for more details. 00017 // 00018 // You should have received a copy of the GNU General Public License 00019 // along with PEACE. If not, see <http://www.gnu.org/licenses/>. 00020 // 00021 // Miami University makes no representations or warranties about the 00022 // suitability of the software, either express or implied, including 00023 // but not limited to the implied warranties of merchantability, 00024 // fitness for a particular purpose, or non-infringement. Miami 00025 // University shall not be liable for any damages suffered by licensee 00026 // as a result of using, result of using, modifying or distributing 00027 // this software or its derivatives. 00028 // 00029 // By using or copying this Software, Licensee agrees to abide by the 00030 // intellectual property laws, and all other applicable laws of the 00031 // U.S., and the terms of GNU General Public License (version 3). 00032 // 00033 // Authors: Dhananjai M. Rao raodm@muohio.edu 00034 // 00035 //--------------------------------------------------------------------- 00036 00037 #include "FWAnalyzer.h" 00038 #include <string> 00039 #include <vector> 00040 00041 // Forward declaration to keep compiler happy 00042 class EST; 00043 class ResultLog; 00044 00045 /** \brief EST Analyzer that uses the d2 algorithm to compute 00046 distances between two ESTs. 00047 00048 <p>This analyzer provides the mechanism to use vanilla D2 00049 algorithm to compute the distance values between a pair of 00050 ESTs. The D2 implementation has been adapted from the 00051 implementations of WCD and CLU.<p> 00052 00053 \note This D2 analyzer uses a word size of 8 base pairs. This is 00054 hard coded into the algorithm in the form of various assumptions 00055 (data types of variables etc.). A similar approach is used by 00056 other D2 implementations as well. This is a compromise between 00057 performance and flexibility of the implementation. 00058 */ 00059 class D2Zim : public FWAnalyzer { 00060 friend class ESTAnalyzerFactory; 00061 public: 00062 /** The destructor. 00063 00064 The destructor frees up all any dynamic memory allocated by 00065 this object for its operations. 00066 */ 00067 virtual ~D2Zim(); 00068 00069 /** Display valid command line arguments for this analyzer. 00070 00071 This method must be used to display all valid command line 00072 options that are supported by this analyzer. Currently, this 00073 analyzer does not require any special command line parameters. 00074 00075 \note The ESTAnalyzer base class requires that derived EST 00076 analyzer classes <b>must</b> override this method to display 00077 help for their custom command line arguments. When this 00078 method is overridden don't forget to call the corresponding 00079 base class implementation to display common options. 00080 00081 \param[out] os The output stream to which the valid command 00082 line arguments must be written. 00083 */ 00084 virtual void showArguments(std::ostream& os); 00085 00086 /** Process command line arguments. 00087 00088 <p> This method is used to process command line arguments 00089 specific to this EST analyzer. This method is typically used 00090 from the \c main method just after the EST analyzer has been 00091 instantiated. This method consumes all valid command line 00092 arguments. If the command line arguments were valid and 00093 successfully processed, then this method returns \c true. </p> 00094 00095 <p>Currently, this EST analyzer does not require any 00096 additional command line parameters. Consequently, it simply 00097 calls the corresponding method in the base class.</p> 00098 00099 \note The \c ESTAnalyzer base class requires that derived EST 00100 analyzer classes <b>must</b> override this method to process 00101 any command line arguments that are custom to their operation. 00102 When this method is overridden don't forget to call the 00103 corresponding base class implementation to display common 00104 options. 00105 00106 \param[in,out] argc The number of command line arguments to be 00107 processed. 00108 00109 \param[in,out] argv The array of command line arguments. 00110 00111 \return This method returns \c true if the command line 00112 arguments were successfully processed. Otherwise this method 00113 returns \c false. This method checks to ensure that a valid 00114 frame size and a valid word size have been specified. 00115 */ 00116 virtual bool parseArguments(int& argc, char **argv); 00117 00118 /** Method to begin EST analysis. 00119 00120 This method is invoked just before commencement of EST 00121 analysis. This method currently does not have any specific 00122 tasks to perform. It simply returns 0. 00123 00124 \return Currently, this method always returns 0 (zero) to 00125 indicate initialization was successfully completed. 00126 */ 00127 int initialize(); 00128 00129 /** Method to obtain human-readable name for this EST analyzer 00130 00131 This method provides a human-readable string identifying the 00132 EST analyzer. This string is typically used for 00133 display/debugging purposes (particularly via the PEACE 00134 Interactive Console). 00135 00136 \return This method returns the string "d2zim" identifiying 00137 this d2 analyzer was based on description in Zimmermann's 00138 paper. 00139 */ 00140 virtual std::string getName() const { return "d2zim"; } 00141 00142 /** Set the reference EST id for analysis. 00143 00144 This method is invoked just before a batch of ESTs are 00145 analyzed via a call to the analyze(EST *) method. This method 00146 currently saves the index in the instance variable for further 00147 look up. Next it creates a "word table" mapping integer indices 00148 to integer hashes of words, in effect translating the sequence 00149 from a sequence of characters to a sequence of n-words (where n = 00150 wordSize). This word table is kept until the reference EST is 00151 changed, which reduces overhead. 00152 00153 \note This method must be called only after the initialize() 00154 method is called. 00155 00156 \return This method returns \c true if the estIdx was within 00157 the given range of values. Otherwise this method returns a 00158 non-zero value as the error code. 00159 */ 00160 virtual int setReferenceEST(const int estIdx); 00161 00162 /** Method to perform exhaustive EST analysis. 00163 00164 This method is used to perform the core tasks of comparing all 00165 ESTs to one another for full analysis of ESTs. This is an 00166 additional feature of PEACE that is \em not used for 00167 clustering but just doing an offline analysis. Currently, 00168 this method merely calls the corresponding base class 00169 implementation that performs all the necessary operations. 00170 00171 \return This method returns zero if all the processing 00172 proceeded successfully. On errors this method returns a 00173 non-zero value. 00174 */ 00175 virtual int analyze() { return FWAnalyzer::analyze(); } 00176 00177 protected: 00178 /** Analyze and obtain a distance metric. 00179 00180 This method can be used to compare a given EST with the 00181 reference EST (set via the call to the setReferenceEST()) 00182 method. 00183 00184 \param[in] otherEST The index (zero based) of the EST with 00185 which the reference EST is to be compared. 00186 00187 \return This method returns the distance value reported by the 00188 D2 algorithm. 00189 */ 00190 virtual float getMetric(const int otherEST); 00191 00192 /** Method to compare two metrics generated by this class. 00193 00194 This method provides the interface for comparing metrics 00195 generated by this ESTAnalyzer when comparing two different 00196 ESTs. This method returns \c true if \c metric1 is 00197 comparatively better than or equal to \c metric2. 00198 00199 \note As per the ESTAnalyzer API requirements, EST analyzers 00200 that are based on distance measures (such as this D2 analyzer) 00201 \b must override this method. 00202 00203 \param[in] metric1 The first metric to be compared against. 00204 00205 \param[in] metric2 The second metric to be compared against. 00206 00207 \return This method returns \c true if metric1 is 00208 comparatively better than \c metric2. 00209 */ 00210 inline bool compareMetrics(const float metric1, const float metric2) const 00211 { return (metric1 < metric2); } 00212 00213 /** Obtain an invalid (or the worst) metric generated by this 00214 analyzer. 00215 00216 This method can be used to obtain an invalid metric value for 00217 this analyzer. This value can be used to initialize metric 00218 values. 00219 00220 \note Derived distance-based metric classes (such as this D2 00221 analyzer) \b must override this method to provide a suitable 00222 value. 00223 00224 \return This method returns an invalid (or the worst) metric 00225 of 1e7 for this EST analyzer. 00226 */ 00227 inline float getInvalidMetric() const { return 1e7; } 00228 00229 /** Get alignment data for the previous call to analyze method. 00230 00231 This method can be used to obtain alignment data that was 00232 obtained typically as an byproduct of the previous call to the 00233 analyze() method. This method essentially returns the 00234 difference between the two windows that provided the minimum 00235 d2 distance value. 00236 00237 \param[out] alignmentData The parameter is updated to the 00238 alignment information generated as a part of the the 00239 immediately preceding analyze(const int) method call is 00240 returned in the parameter. 00241 00242 \return This method always returns \c true to indicate that 00243 alignment data is computed by this ESTAnalyzer. 00244 */ 00245 virtual bool getAlignmentData(int &alignmentData); 00246 00247 /** Determine if this EST analyzer provides distance metrics or 00248 similarity metrics. 00249 00250 This method can be used to determine if this EST analyzer 00251 provides distance metrics or similarity metrics. If this 00252 method returns \c true, then this EST analyzer returns 00253 distance metrics (smaller is better). On the other hand, if 00254 this method returns \c false, then this EST analyzer returns 00255 similarity metrics (bigger is better). 00256 00257 \return This method returns \c true to indicate that this EST 00258 analyzer operates using distance metrics. 00259 */ 00260 bool isDistanceMetric() const { return true; } 00261 00262 /** 00263 Creates a "word table" mapping integer indices to integer hashes 00264 of words, in effect translating the sequence from a sequence of 00265 characters to a sequence of n-words (where n = wordSize). 00266 */ 00267 void buildWordTable(int* wordTable, const char* s); 00268 00269 /** Helper method to build frequency distribution hash map. 00270 00271 This is a helper method that is used to build the initial frequency 00272 differential hash map for the pair of ESTs currently being 00273 compared. This structure maps integer hashes of words to 00274 integers denoting the difference in word frequency from sequence 00275 1 to sequence 2. Additionally, it computes the initial squared 00276 Euclidean distance (d2's distance measure). 00277 */ 00278 void buildFdHashMaps(int* sed); 00279 00280 /** Helper method to update the frequency hash map and squared 00281 Euclidean distance after the frame is shifted by 1 character 00282 on the reference sequence. 00283 */ 00284 inline void refShiftUpdateFd(int* sed, const int framePos) { 00285 // update sed and fd from leftmost word falling out 00286 *sed += -2*(fdHashMap[s1WordTable[framePos-1]]--) + 1; 00287 // update sed and fd from new rightmost word 00288 *sed += 2*(fdHashMap[s1WordTable[framePos+NumWordsWin]]++) + 1; 00289 } 00290 00291 /** Helper method to update the frequency hash map and squared 00292 Euclidean distance after a 1 character right-shift on the 00293 comparison sequence. 00294 */ 00295 inline void rightShiftUpdateFd(int* sed, const int framePos) { 00296 // update sed and fd from leftmost word falling out 00297 *sed += 2*(fdHashMap[s2WordTable[framePos-1]]++) + 1; 00298 // update sed and fd from new rightmost word 00299 *sed += -2*(fdHashMap[s2WordTable[framePos+NumWordsWin]]--) + 1; 00300 } 00301 00302 00303 /** Helper method to update the frequency hash map and squared 00304 Euclidean distance after a 1 character left-shift on the 00305 comparison sequence. 00306 */ 00307 inline void leftShiftUpdateFd(int* sed, const int framePos) { 00308 // update sed and fd from rightmost word falling out 00309 *sed += 2*(fdHashMap[s2WordTable[framePos+NumWordsWin+1]]++) + 1; 00310 // update sed and fd from new leftmost word 00311 *sed += -2*(fdHashMap[s2WordTable[framePos]]--) + 1; 00312 } 00313 00314 /** Instance variable that keeps track of the frequency differentials 00315 for words found in the current windows on the reference and 00316 comparison sequences. These frequency differentials are used 00317 in the calculation of the D2 distance between two windows. 00318 00319 This variable is created in the initialize() method and contains 00320 4<sup>wordSize</sup> entries. 00321 */ 00322 int* fdHashMap; 00323 00324 /** Instance variable that maps an index in the reference sequence 00325 (sequence s1) to the hash of a word. This hash can then be used 00326 as an index in the fdHashMap to get the frequency differential 00327 for that word. 00328 00329 The word table is created in the initialize() method and 00330 filled in using the buildWordTable() method. For the reference 00331 EST, buildWordTable() is called in the setReferenceEST() method, 00332 meaning it does not need to be rebuilt every time we analyze 00333 a new comparison sequence. 00334 */ 00335 int* s1WordTable; 00336 00337 /** Instance variable that maps an index in the comparison sequence 00338 (sequence s2) to the hash of a word. This hash can then be used 00339 as an index in the fdHashMap to get the frequency differential 00340 for that word. 00341 00342 The word table is created in the initialize() method and 00343 filled in using the buildWordTable() method. For the comparison 00344 EST, buildWordTable() must be called in the analyze() method because 00345 a new comparison sequence is given every time analyze() is called. 00346 */ 00347 int* s2WordTable; 00348 00349 /** Parameter to define number of characters to shift the frame 00350 on the reference sequence, when computing D2. 00351 00352 This parameter is used to enable D2-asymmetric behavior. 00353 The default value is 1, which means D2 symmetric: all frames 00354 in both sequences will be compared. Higher values mean that 00355 the algorithm will shift by more than one character when 00356 shifting the frame on the reference sequence, resulting in 00357 fewer computations but a possible loss of accuracy from not 00358 comparing every frame in both sequences. 00359 */ 00360 static int frameShift; 00361 00362 static size_t wordTableSize; 00363 00364 static int BitMask; 00365 00366 private: 00367 00368 /** The set of arguments specific to the D2 algorithm 00369 00370 This instance variable contains a static list of arguments 00371 that are specific only to the D2 analyzer class. This 00372 argument list is statically defined and shared by all 00373 instances of this class. 00374 00375 \note Use of static arguments and parameters makes D2 class 00376 hierarchy not MT-safe. 00377 */ 00378 static arg_parser::arg_record argsList[]; 00379 00380 /* The default constructor for this class. 00381 00382 The default constructor for this class. The constructor is 00383 made private so that this class cannot be directly 00384 instantiated. However, since the ESTAnalyzerFactory is a 00385 friend of this class; therefore it can instantiate the D2 00386 analyzer. Accordingly, the ESTAnalyzerFactory::create() 00387 method must be used to instantiate this class. 00388 00389 \param[in] refESTidx The reference EST index value to be used 00390 when performing EST analysis. This parameter should be >= 0. 00391 This value is simply passed onto the base class. 00392 00393 \param[in] outputFile The name of the output file to which the 00394 EST analysis data is to be written. This parameter is ignored 00395 if this analyzer is used for clustering. If this parameter is 00396 the empty string then output is written to standard output. 00397 This value is simply passed onto the base class. 00398 */ 00399 D2Zim(const int refESTidx, const std::string& outputFileName); 00400 00401 /** Instance variable to track alignment metric computed by the 00402 analyze() method. 00403 00404 This instance variable is used to hold the alignment metric 00405 that was computed in the previous \c analyze method call. By 00406 default this value is set to zero. The alignment metric is 00407 computed as the difference in the window positions (on the two 00408 ESTs being analyzed) with the minimum d2 distance. 00409 */ 00410 int alignmentMetric; 00411 00412 int NumWordsWin; 00413 }; 00414 00415 #endif