00001 #ifndef CLU_H 00002 #define CLU_H 00003 00004 //-------------------------------------------------------------------- 00005 // 00006 // This file is part of PEACE. 00007 // 00008 // PEACE is free software: you can redistribute it and/or modify it 00009 // under the terms of the GNU General Public License as published by 00010 // the Free Software Foundation, either version 3 of the License, or 00011 // (at your option) any later version. 00012 // 00013 // PEACE is distributed in the hope that it will be useful, but 00014 // WITHOUT ANY WARRANTY; without even the implied warranty of 00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 // General Public License for more details. 00017 // 00018 // You should have received a copy of the GNU General Public License 00019 // along with PEACE. If not, see <http://www.gnu.org/licenses/>. 00020 // 00021 // Miami University makes no representations or warranties about the 00022 // suitability of the software, either express or implied, including 00023 // but not limited to the implied warranties of merchantability, 00024 // fitness for a particular purpose, or non-infringement. Miami 00025 // University shall not be liable for any damages suffered by licensee 00026 // as a result of using, result of using, modifying or distributing 00027 // this software or its derivatives. 00028 // 00029 // By using or copying this Software, Licensee agrees to abide by the 00030 // intellectual property laws, and all other applicable laws of the 00031 // U.S., and the terms of GNU General Public License (version 3). 00032 // 00033 // Authors: Dhananjai M. Rao raodm@muohio.edu 00034 // 00035 //--------------------------------------------------------------------- 00036 00037 #include "FWAnalyzer.h" 00038 #include "ESTCustomData.h" 00039 #include <vector> 00040 00041 // Forward declaration to keep compiler happy 00042 class EST; 00043 class ResultLog; 00044 00045 /** CLU: An implementation of the CLU's analysis algorithm, originally 00046 developed by Andrey Ptitsyn and Winston Hide. 00047 00048 <p>This analyzer implements the similarity metric generation part 00049 of CLU, a clustering algorithm developed by Andrey Ptitsyn and 00050 Winston Hide. The Reference to clu is: <br><br> 00051 00052 00053 "CLU: A new algorithm for EST clustering", A. Ptitsyn and W. Hide, 00054 BMC Bioinformatics, 6(2), 2005. doi: 00055 10.1186/1471-2105-6-S2-S3. <br><br> </p> 00056 00057 <p>The implementation in this class has been developed by suitably 00058 adapting parts of the source code for CLU available from 00059 http://lamar.colostate.edu/~ptitsyn/ </p> 00060 00061 <p>This class has been implemented by extending the FWAnalyzer 00062 base class. The FWAnalyzer base class provides most of the 00063 standard functionality involved in reading FASTA files and 00064 generating formatted output and processing some parameters. This 00065 class adds functionality to compare EST's using CLU's similarity 00066 comparison metrics.</p> 00067 00068 \note This class instantiated via the ESTAnalyzerFactory::create 00069 method. 00070 */ 00071 class CLU : public FWAnalyzer { 00072 friend class ESTAnalyzerFactory; 00073 public: 00074 /** The destructor. 00075 00076 The destructor frees up all any dynamic memory allocated by 00077 this object for its operations. 00078 */ 00079 virtual ~CLU(); 00080 00081 /** Display valid command line arguments for this analyzer. 00082 00083 This method must be used to display all valid command line 00084 options that are supported by this analyzer. Note that 00085 derived classes may override this method to display additional 00086 command line options that are applicable to it. This method 00087 is typically used in the main() method when displaying usage 00088 information. 00089 00090 \note Derived EST analyzer classes <b>must</b> override this 00091 method to display help for their custom command line 00092 arguments. When this method is overridden don't forget to 00093 call the corresponding base class implementation to display 00094 common options. 00095 00096 \param[out] os The output stream to which the valid command 00097 line arguments must be written. 00098 */ 00099 virtual void showArguments(std::ostream& os); 00100 00101 /** Process command line arguments. 00102 00103 This method is used to process command line arguments specific 00104 to this EST analyzer. This method is typically used from the 00105 main method just after the EST analyzer has been instantiated. 00106 This method consumes all valid command line arguments. If the 00107 command line arguments were valid and successfully processed, 00108 then this method returns \c true. 00109 00110 \note Derived EST analyzer classes <b>must</b> override this 00111 method to process any command line arguments that are custom 00112 to their operation. When this method is overridden don't 00113 forget to call the corresponding base class implementation to 00114 display common options. 00115 00116 \param[in,out] argc The number of command line arguments to be 00117 processed. 00118 00119 \param[in,out] argv The array of command line arguments. 00120 00121 \return This method returns \c true if the command line 00122 arguments were successfully processed. Otherwise this method 00123 returns \c false. This method checks to ensure that a valid 00124 frame size and a valid word size have been specified. 00125 */ 00126 virtual bool parseArguments(int& argc, char **argv); 00127 00128 /** Method to begin EST analysis. 00129 00130 This method is invoked just before commencement of EST 00131 analysis. This method first invokes the base class method 00132 that loads the list of ESTs from a given input multi-FASTA 00133 file and pouplates the list of ESTs. If the ESTs were 00134 successfully loaded, then this method initializes the custom 00135 data for each EST (with empty hash maps). 00136 00137 \return If the ESTs were successfully loaded from the FATA 00138 file then this method returns 0. Otherwise this method 00139 returns with a non-zero error code. 00140 */ 00141 virtual int initialize(); 00142 00143 /** Method to obtain human-readable name for this EST analyzer 00144 00145 This method provides a human-readable string identifying the 00146 EST analyzer. This string is typically used for 00147 display/debugging purposes (particularly via the PEACE 00148 Interactive Console). 00149 00150 \return This method returns the string "CLU" identifiying this 00151 analyzer. 00152 */ 00153 virtual std::string getName() const { return "CLU"; } 00154 00155 virtual float getValidMetric() const { return 1000; } 00156 00157 /** Set the reference EST id for analysis. 00158 00159 This method is invoked just before a batch of ESTs are 00160 analyzed via a call to the analyze(EST *) method. This method 00161 builds the hash table (used by CLU for searching/comparison) 00162 for the reference analysis. The reference EST is also called 00163 Sq in CLU literature. 00164 00165 \note This method must be called only after the initialize() 00166 method is called. This method overrides the implementation in 00167 the base class to perform its own custom operation. 00168 00169 \return If the reference estIdx is invalid then this method 00170 returns with 1. Otherwise it pouplates the referenceTable 00171 array with the necessary information and returns 0. 00172 */ 00173 virtual int setReferenceEST(const int estIdx); 00174 00175 protected: 00176 /** Analyze and obtain a similarity metric. 00177 00178 This method can be used to compare a given EST with the 00179 reference EST (set via the call to the setReferenceEST()) 00180 method. 00181 00182 \note This method overrides the default implementation in the 00183 base class to perform its own custom operations. 00184 00185 \param[in] otherEST The index (zero based) of the EST with 00186 which the reference EST is to be compared. This method 00187 essentially implements the search() method from CLU's 00188 implementation. 00189 00190 \return This method must returns a similarity metric by 00191 comparing the ESTs by calling the analyze() method. 00192 */ 00193 using FWAnalyzer::getMetric; 00194 virtual float getMetric(const int otherEST); 00195 00196 /** Dumps a given EST in 3-column format using a ResultLog. 00197 00198 This method is a helper method that dumps a given EST out to 00199 the log. This method overrides the default implementation in 00200 the base class to perform its own custom operation. 00201 00202 \param[out] log The log to which the EST is to be dumped. 00203 00204 \param[in] est The EST to be dumped. This parameter is never 00205 NULL. 00206 00207 \param[in] isReference If this flag is true, then this EST is 00208 the reference EST to be dumped out. 00209 */ 00210 virtual void dumpEST(ResultLog& log, const EST* est, 00211 const bool isReference = false); 00212 00213 /** Helper method to build reference and complement hash maps. 00214 00215 This is a helper method that is used to build the reference 00216 and complement hash maps for a given EST. If the reference 00217 (and complement) hash maps already exist for the given EST 00218 then this metod exits immediately without rebuilding hash 00219 maps. Otherwise it builds the reference and complement hash 00220 maps using the createCLUHashMap() method and pouplates the 00221 custom ESTCLUData object associated with the EST. 00222 00223 \param[in,out] est Pointer to the EST whose reference and 00224 complement hash maps are to be built. 00225 */ 00226 void buildHashMaps(EST *est); 00227 00228 /** Helper method to create the CLU hash/look up table. 00229 00230 This method is invoked from the setReferenceEST() method to 00231 create the referenceHashTable and complementHashTable required 00232 for comparing and processing other ESTs with the reference 00233 EST. This method operates as follows: 00234 00235 <ol> 00236 00237 <li>It initializes the table (if it is NULL) to hold 00238 4^wordSize hash entries.</li> 00239 00240 <li>It resets all the entries to 0 (zero) in the table.</li> 00241 00242 <li>For each wordSize base pairs in the sequence, it computes 00243 the hash value for the sequence and increments the 00244 corresponding entry (using hash value as the index) in the 00245 table.</li> 00246 00247 <li>It finally calls the filterHashMap() method to filter 00248 out low-complexity and abundant oligos.</li> 00249 00250 </ol> 00251 00252 \param[in,out] table The hash table to be populated by this 00253 method. 00254 00255 \param[in] sequence The EST sequence to be used for populating 00256 the hash table. 00257 */ 00258 void createCLUHashMap(int* &table, const char *sequence); 00259 00260 /** Helper method to filter out certain entries from the reference 00261 hash map. 00262 00263 This helper method is invoked from the setReferenceEST() 00264 method to filter out certain entries from the hash map as they 00265 are not significant when comparing ESTs. Specifically, this 00266 method filters out non-informative and low-complexity 00267 sequences in the following manner: 00268 00269 <ul> 00270 00271 <li> First, zero out all simple oligos from consideration. 00272 The simple oligos are sequences of the form: 00273 "AAAAAACCCCCCGGGGGGTTTTTT".</li> 00274 00275 <li>Next it removes abundant sequences (sequence that occur 00276 too frequently) from the table. Such abundant sequences are 00277 not informative in EST comparisons.</li> 00278 00279 <li>Finally, all non-zero entries are normalized to 1.</li> 00280 00281 </ul> 00282 00283 \param[in,out] table The table of hash values to be filtered 00284 and normalized by this method. This table must have been 00285 populated by a call to createCLUHashTable() method. 00286 00287 \param[in] sequenceLength The length of the EST sequence from 00288 which the referenceHashMap has been generated. 00289 */ 00290 void filterHashMap(int *table, const int sequenceLength); 00291 00292 /** Obtain similarity metric between reference sequence and a 00293 given sequence. 00294 00295 This method performs the core task of comparing a given EST 00296 sequence with the reference sequence given a CLU hash table. 00297 This method operates as follows: 00298 00299 <ol> 00300 00301 <li>It computes the hash for each word the first frame of the 00302 given sequence and obtains a categorized distribution (cd) 00303 index by add number of matching words found in the reference 00304 sequence.</li> 00305 00306 <li>For sequent frames in the sequence, it uses the values 00307 computed for the first sequence to compute the categorized 00308 distribution (cd) index value. </li> 00309 00310 <li>It adds one to the corresponding cd entry as indicated by 00311 the resulting cdIndex value. </li> 00312 00313 <li>Finally it determines the sum of multiplying the cd values 00314 with the thresholds (constant values obtained earlier by the 00315 original authors via Monte-Carlo type simulations) to obtain 00316 the similarity metric.</li> 00317 00318 </ol> 00319 00320 \param[in] hashTable The hash table from the reference 00321 sequence (either the referenceHashTable or 00322 complementHashTable) to be used for searching/comparison. 00323 00324 \param[in] sequence The other sequence with which the 00325 reference sequence is to be compared. 00326 00327 \return A similarity score for the given sequence against the 00328 reference sequence. 00329 */ 00330 int getSimilarity(const int* const hashTable, 00331 const char* const sequence) const; 00332 00333 /** Container for CLU's hash maps for ESTs 00334 00335 This class is used to encapsulate the reference and complement 00336 hash maps needed by CLU to compare ESTs. This class extends 00337 ESTCustomData so that the hash maps can be directly associated 00338 with the EST they refer to. New instances of this object are 00339 created in the initialize method. The place holder objects 00340 are automatically deleted once analysis is complete. 00341 */ 00342 class CLUESTData : public ESTCustomData { 00343 public: 00344 /** The default constructor. 00345 00346 The default constructor merely initializes the 00347 referenceHashMap and complementHashMap to NULL. 00348 */ 00349 CLUESTData() : referenceHashMap(NULL), complementHashMap(NULL) {} 00350 00351 /** The destructor. 00352 00353 The destructor deletes the reference and complement hash 00354 map in this class if they are not NULL. 00355 */ 00356 virtual ~CLUESTData(); 00357 00358 /** Hash table to accelerate the comparison of all words in a 00359 sequence. 00360 00361 <p> Quotation from CLU paper: <i>To accelerate the 00362 comparison all words found in sequence Sq are presented in 00363 a table (H). The table H is a linear array, where the 00364 offset itself is a hash value of certain 00365 oligonucleotide. Each element of this array contains a 00366 number, associated with a corresponding 00367 oligonucleotide</i>.</p> 00368 00369 This vector has a size 4^wordSize (for example, if wordSize is 00370 6, this table has 4096 entries). These entries are populated 00371 in the setReferenceEST() method. 00372 */ 00373 int *referenceHashMap; 00374 00375 /** Hash table to accelerate the comparison of all words in a 00376 complement (reverse) sequence. 00377 00378 <p> Quotation from CLU paper: <i>To accelerate the 00379 comparison all words found in sequence Sq are presented in 00380 a table (H). The table H is a linear array, where the 00381 offset itself is a hash value of certain 00382 oligonucleotide. Each element of this array contains a 00383 number, associated with a corresponding 00384 oligonucleotide.</i></p> 00385 00386 This vector has a size 4^wordSize (for example, if wordSize is 00387 6, this table has 4096 entries). These entries are populated 00388 in the setReferenceEST() method using the reverse of a given 00389 EST sequence. 00390 */ 00391 int *complementHashMap; 00392 00393 protected: 00394 // Currently the ESTCLUData class has no protected members. 00395 00396 private: 00397 // Currently the ESTCLUData class has no private members. 00398 }; 00399 00400 /** Parameter to define fraction value to compute abundance 00401 metric. 00402 00403 This instance variable is used to track the fraction of values 00404 (with respect to sequence length) after which a specific 00405 oligonucleotide (word) must be considered to be abundant. The 00406 default value is 10. This value can be changed by the user 00407 via a command line parameter. 00408 */ 00409 static int abundanceFraction; 00410 00411 private: 00412 /** The set of arguments specific to the CLU program 00413 00414 This instance variable contains a static list of arguments 00415 that are specific only to the CLU analyzer class. This 00416 argument list is statically defined and shared by all 00417 instances of this class. 00418 00419 \note Use of static arguments and parameters makes CLU class 00420 hierarchy not MT-safe. 00421 */ 00422 static arg_parser::arg_record argsList[]; 00423 00424 /** The default constructor. 00425 00426 The default constructor for this class. The constructor is 00427 made private so that this class cannot be directly 00428 instantiated. However, since the ESTAnalyzerFactory is a 00429 friend of this class, an object can be instantiated via the 00430 ESTAnalyzerFactory::create() method. 00431 00432 00433 \param[in] refESTidx The reference EST index value to be used 00434 when performing EST analysis. This parameter should be >= 0. 00435 This value is simply passed onto the base class. This 00436 parameter is not really used for this analyzer is used for 00437 clustering. 00438 00439 \param[in] outputFile The name of the output file to which the 00440 EST analysis data is to be written. This parameter is ignored 00441 if this analyzer is used for clustering. If this parameter is 00442 the empty string then output is written to standard output. 00443 This value is simply passed onto the base class. 00444 */ 00445 CLU(const int refESTidx, const std::string& outputFile); 00446 00447 /** A simple array to map characters A, T, C, and G to 0, 1, 2, 00448 and 3 respectively. 00449 00450 This is a simple array of 255 entries that are used to convert 00451 the base pair encoding characters A, T, C, and G to 0, 1, 2, 00452 and 3 respectively to compute the hash as defined by CLU. 00453 This array is initialized in the constructor and is never 00454 changed during the life time of this class. 00455 */ 00456 static char CharToInt[]; 00457 }; 00458 00459 #endif