00001 #ifndef EST_ANALYZER_H 00002 #define EST_ANALYZER_H 00003 00004 //-------------------------------------------------------------------- 00005 // 00006 // This file is part of PEACE. 00007 // 00008 // PEACE is free software: you can redistribute it and/or modify it 00009 // under the terms of the GNU General Public License as published by 00010 // the Free Software Foundation, either version 3 of the License, or 00011 // (at your option) any later version. 00012 // 00013 // PEACE is distributed in the hope that it will be useful, but 00014 // WITHOUT ANY WARRANTY; without even the implied warranty of 00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 // General Public License for more details. 00017 // 00018 // You should have received a copy of the GNU General Public License 00019 // along with PEACE. If not, see <http://www.gnu.org/licenses/>. 00020 // 00021 // Miami University makes no representations or warranties about the 00022 // suitability of the software, either express or implied, including 00023 // but not limited to the implied warranties of merchantability, 00024 // fitness for a particular purpose, or non-infringement. Miami 00025 // University shall not be liable for any damages suffered by licensee 00026 // as a result of using, result of using, modifying or distributing 00027 // this software or its derivatives. 00028 // 00029 // By using or copying this Software, Licensee agrees to abide by the 00030 // intellectual property laws, and all other applicable laws of the 00031 // U.S., and the terms of GNU General Public License (version 3). 00032 // 00033 // Authors: Dhananjai M. Rao raodm@muohio.edu 00034 // 00035 //--------------------------------------------------------------------- 00036 00037 #include "arg_parser.h" 00038 #include "HeuristicChain.h" 00039 #include "Utilities.h" 00040 00041 /** The base class of all EST analyzers. 00042 00043 This class must be the base class of all EST analyzers in the 00044 system. This class provides some default functionality that can be 00045 readily used by the EST analyzers. 00046 */ 00047 class ESTAnalyzer { 00048 public: 00049 /** Display valid command line arguments for this analyzer. 00050 00051 This method must be used to display all valid command line 00052 options that are supported by this analyzer. Note that 00053 derived classes may override this method to display additional 00054 command line options that are applicable to it. This method 00055 is typically used in the main() method when displaying usage 00056 information. 00057 00058 \note Derived EST analyzer classes <b>must</b> override this 00059 method to display help for their custom command line 00060 arguments. When this method is overridden don't forget to 00061 call the corresponding base class implementation to display 00062 common options. 00063 00064 \param[out] os The output stream to which the valid command 00065 line arguments must be written. 00066 */ 00067 virtual void showArguments(std::ostream& os); 00068 00069 /** Process command line arguments. 00070 00071 This method is used to process command line arguments specific 00072 to this EST analyzer. This method is typically used from the 00073 main method just after the EST analyzer has been instantiated. 00074 This method consumes all valid command line arguments. If the 00075 command line arguments were valid and successfully processed, 00076 then this method returns \c true. 00077 00078 \note Derived EST analyzer classes <b>must</b> override this 00079 method to process any command line arguments that are custom 00080 to their operation. When this method is overridden don't 00081 forget to call the corresponding base class implementation to 00082 display common options. 00083 00084 \param[in,out] argc The number of command line arguments to be 00085 processed. 00086 00087 \param[in,out] argv The array of command line arguments. 00088 00089 \return This method returns \c true if the command line 00090 arguments were successfully processed. Otherwise this method 00091 returns \c false. This method returns true if all arguments 00092 are consumed successfully and if a valid estID and estFileName 00093 have been specified. 00094 */ 00095 virtual bool parseArguments(int& argc, char **argv); 00096 00097 /** Method to begin EST analysis. 00098 00099 This method is invoked just before commencement of EST 00100 analysis. This method typically loads the list of ESTs from a 00101 given input file. In addition, it may perform any 00102 pre-processing as the case may be. Some EST analyzers may also 00103 add dummy entries to aid in various operations. 00104 00105 \note Derived classes must override this method. 00106 00107 \return If the initialization process was sucessful, then this 00108 method returns 0. Otherwise this method returns with a 00109 non-zero error code. 00110 */ 00111 virtual int initialize() = 0; 00112 00113 /** Method to obtain human-readable name for this EST analyzer 00114 00115 This method provides a human-readable string identifying the 00116 EST analyzer. This string is typically used for 00117 display/debugging purposes (particularly via the PEACE 00118 Interactive Console). 00119 00120 \note Derived classes must override this method. 00121 00122 \return A string containing a short, human-readable identifier 00123 for this analyzer. 00124 */ 00125 virtual std::string getName() const = 0; 00126 00127 /** Set the reference EST id for analysis. 00128 00129 This method is invoked just before a batch of ESTs are 00130 analyzed via a call to the analyze(EST *) method. Setting the 00131 reference EST provides analyzer's an opportunity to optimize 00132 certain operations, if possible. 00133 00134 \note This method must be called only after the initialize() 00135 method is called. 00136 00137 \return If the initialization process was sucessful, then this 00138 method returns 0. Otherwise this method returns an error code. 00139 */ 00140 virtual int setReferenceEST(const int estIdx) = 0; 00141 00142 /** Analyze and obtain a similarity metric using the attached 00143 heuristic chain (if one exists) followed by the appropriate 00144 heavy weight distance/similarity measure associated with 00145 this ESTAnalyzer. 00146 00147 This method can be used to compare a given EST with the 00148 reference EST (set via the call to the setReferenceEST()) 00149 method. 00150 00151 \note This method may return -1, if the otherEST is 00152 significantly different from the reference EST (possibly 00153 warranting no further analysis) that a meanigful metric cannot 00154 be generated. 00155 00156 \param[in] otherEST The index (zero based) of the EST with 00157 which the reference EST is to be compared. 00158 00159 \param[in] useHeuristics A directive instructing the ESTAnalyzer 00160 on whether or not to use its heuristis chain. Defaults to true. 00161 00162 \param[in] useHeavyWeight A directive instructing the ESTAnalyzer 00163 on whether or not to use the heavy weight metric. Defaults to true. 00164 00165 \return This method returns a similarity/distance metric by 00166 comparing the ESTs. This method may return -1, if the otherEST 00167 is significantly different from the reference EST (possibly 00168 warranting no further analysis) that a meanigful metric cannot 00169 be generated. 00170 */ 00171 float analyze(const int otherEST, const bool useHeuristics = true, 00172 const bool useHeavyWeight = true); 00173 00174 /** Method to perform EST analysis. 00175 00176 This method must be used to perform EST analysis. This method 00177 is a pure-virtual method. Therefore all EST analyzers must 00178 override this method to perform all the necessary operations. 00179 Typically, this method performs the following operations: 00180 00181 <ol> 00182 00183 <li>This method calls initialize.</li> 00184 00185 <li>Set's the reference EST via a call to the setReferenceEST() 00186 method.</li> 00187 00188 <li>Repeatedly uses the analyze(const int) method to compare 00189 ESTs.</li> 00190 00191 <li>Generates analysis reports at the end of analysis.</li> 00192 00193 </ol> 00194 */ 00195 virtual int analyze() = 0; 00196 00197 /** Get alignment data for the previous call to analyze method. 00198 00199 This method can be used to obtain alignment data (if any) that 00200 was obtained typically as an byproduct of the previous call 00201 tothe analyze() method. 00202 00203 \param[out] alignmentData The parameter is updated to the 00204 alignment information generated as a part of the the 00205 immediately preceding analyze(const int) method call is 00206 returned in the parameter. 00207 00208 \note Not all ESTAnalyzer classes may compute additional 00209 alignment data. In this case, this method will return \c 00210 false. Furthermore, if a previous analyze() method call was 00211 not made, then the value returned in alignmentData parameter 00212 is not defined. 00213 00214 \return This method returns \c true if the alignment data is 00215 actually computed by this ESTAnalyzer. The default 00216 implementation of this method always returns \c false. 00217 */ 00218 virtual bool getAlignmentData(int& UNREFERENCED_PARAMETER(alignmentData)) 00219 { return false; } 00220 00221 /** Method to load EST information from a FASTA file. 00222 00223 This method can be used to load information regarding ESTs 00224 from a FASTA file. The file name from where the data is to be 00225 loaded must be passed in as the parameter. 00226 00227 \param[in] fileName The file name of the FASTA file from where 00228 the EST information is to be uploaded. 00229 00230 \param[in] unpopulate If this parameter is true then the 00231 header and sequence information in each EST is discarded to 00232 minimize memory foot print. 00233 00234 \return This method returns true if all the ESTs were 00235 successfully loaded from the given file. 00236 */ 00237 bool loadFASTAFile(const char *fileName, const bool unpopulate = false); 00238 00239 /** Obtain the input file name. 00240 00241 This method returns the input file from where the EST data was 00242 read. 00243 00244 \return The input file from where the EST data was read. If 00245 an input file was not specified, then this method returns 00246 NULL. 00247 */ 00248 const char* getInputFileName() const { return estFileName; } 00249 00250 /** Determine if this EST analyzer provides distance metrics or 00251 similarity metrics. 00252 00253 This method can be used to determine if this EST analyzer 00254 provides distance metrics or similarity metrics. If this 00255 method returns \c true, then this EST analyzer returns 00256 distance metrics (smaller is better). On the other hand, if 00257 this method returns \c false, then this EST analyzer returns 00258 similarity metrics (bigger is better). 00259 00260 \note Derived classes that operate using distance metrics must 00261 overload this method to return \c true. 00262 00263 \return This method returns \c false (by default) to indicate 00264 that this EST analyzer operates using similarity metrics. If 00265 it operates using distance metrics then this method returns \c 00266 true. 00267 */ 00268 virtual bool isDistanceMetric() const { return false; } 00269 00270 /** Obtain an invalid (or the worst) metric generated by this 00271 analyzer. 00272 00273 This method can be used to obtain an invalid metric value for 00274 this analyzer. This value can be used to initialize metric 00275 values. By default this method returns -1, which should be 00276 ideal for similarity-based metrics. 00277 00278 \note Dervied distance-based metric classes must override this 00279 method to provide a suitable value. 00280 00281 \return This method returns an invalid (or the worst) metric 00282 for this EST analyzer. 00283 */ 00284 virtual float getInvalidMetric() const { return -1; } 00285 00286 /** Obtain a valid (or the best) metric generated by this 00287 analyzer. 00288 00289 This method can be used to obtain a valid metric value for 00290 this analyzer. This value can be used to initialize metric 00291 values. By default this method returns 0, which should be 00292 ideal for distance-based metrics. 00293 00294 \note Dervied similarity-based metric classes must override this 00295 method to provide a suitable value. 00296 00297 \return This method returns a valid (or the best) metric 00298 for this EST analyzer. 00299 */ 00300 virtual float getValidMetric() const { return 0; } 00301 00302 /** Determine preferred dummy EST lengths to be used with this 00303 analyzer. 00304 00305 <p>This method can be used to determine the preferred dummy 00306 EST lengths to be used with this EST analyzer. This method 00307 may be overridden in derived classes to provide a more 00308 appropriate dummy EST length.</p> 00309 00310 <p>Dummy ESTs are used for the following purpose: When 00311 clustering FASTA data that contains low complexity reads, the 00312 low complexity reads provide false relationships between ESTs 00313 giving raise to very large clusters. These large clusters are 00314 created because transitive relationships are established 00315 between ESTs due to low complexity reads.</p> 00316 00317 <p>In order to avoid super-clusters that get formed due to low 00318 complexity reads, PEACE adds two dummy ESTs, one with all \c 00319 "AAAAA...." and another with all \c "CCCCCC...". The length of 00320 the ESTs must be appropriately chosen based on the type of 00321 analyzer used. This method helps ClusterMaker hierarchy to 00322 determine the appropriate dummy EST length.</p> 00323 00324 \return The default implementation of this method always 00325 returns 128. 00326 */ 00327 virtual int getPreferredDummyESTLength() const { return 128; } 00328 00329 /** Method to compare two metrics generated by this class. 00330 00331 This method provides the interface for comparing metrics 00332 generated by this ESTAnalyzer when comparing two different 00333 ESTs. This method returns \c true if \c metric1 is 00334 comparatively better than or equal to \c metric2. 00335 00336 \note EST analyzers that are based on distance measures \b 00337 must override this method. 00338 00339 \param[in] metric1 The first metric to be compared against. 00340 00341 \param[in] metric2 The second metric to be compared against. 00342 00343 \return This method returns \c true if metric1 is 00344 comparatively better then or equal to \c metric2. 00345 */ 00346 virtual bool compareMetrics(const float metric1, const float metric2) const 00347 { return (metric1 > metric2); } 00348 00349 /** Method to attach a heuristic chain to this EST analyzer. 00350 00351 \param[in] chain The heuristic chain to be attached. 00352 00353 \return If the initialization process was sucessful, then this 00354 method returns 0. Otherwise this method returns an error code. 00355 */ 00356 virtual int setHeuristicChain(HeuristicChain* chain); 00357 00358 /** Method to obtain the heuristic chain set for this EST 00359 analyzer. 00360 00361 This method may be used to obtain a pointer to the heuristic 00362 chain set for use by this analyzer. If a heuristic chain has 00363 not been set, then this method returns NULL. 00364 00365 \note The caller must \c not modify or delete the returned 00366 heuristic pointer. 00367 00368 \return A pointer to the heuristic chain associated set for 00369 this analyzer. If a heuristic has not been set, then this 00370 method returns NULL. 00371 */ 00372 virtual HeuristicChain* getHeuristicChain() const { return chain; } 00373 00374 /** Method to display performance statistics. 00375 00376 This method can be used to display any statistics collated by 00377 this class (and its descendants) regarding their operation and 00378 performance. This method was primarily introduced to enable 00379 derived classes a mechanism to override statistics display and 00380 print additional information. 00381 00382 \note The default implementation in the base class does 00383 absolutely nothing. 00384 00385 \param[out] os The output stream to which the statistics must 00386 be written. 00387 */ 00388 virtual void displayStats(std::ostream& UNREFERENCED_PARAMETER(os)) {} 00389 00390 /** The destructor. 00391 00392 The destructor frees memory allocated for holding any EST data 00393 in the base class. 00394 */ 00395 virtual ~ESTAnalyzer(); 00396 00397 protected: 00398 /** The default constructor. 00399 00400 The constructor has been made protected to ensure that this 00401 class is never directly instantiated. Instead one of the 00402 derived ESTAnalyzer classes must be instantiated via the 00403 ESTAnalyzerFactor API methods. 00404 00405 \param[in] analyzerName The human readable name for this EST 00406 analyzer. This name is used when generating errors, warnings, 00407 and other output messages for this analyzer. 00408 00409 \param[in] refESTidx The reference EST's index in a given 00410 multi-FASTA file. Index values start with 0 (zero). The 00411 refESTidx is supplied as a global argument that is processed 00412 in the main() method. This value is simply copied to the 00413 refESTidx member in this class. 00414 00415 \param[in] outputFileName The file name to which output must 00416 be written. If a valid output file is not specified, then 00417 results are written to standard output. The outputFileName is 00418 simply copied to the outputFileName member object. 00419 */ 00420 ESTAnalyzer(const std::string& analyzerName, const int refESTidx, 00421 const std::string& outputFileName); 00422 00423 /** Analyze and compute a similarity or distance metric between 00424 a given EST and the reference EST using the heavy weight metric 00425 associated with this ESTAnalyzer. 00426 00427 This method can be used to compare a given EST with the 00428 reference EST (set via the call to the setReferenceEST()) 00429 method. 00430 00431 \note This method may return -1, if the otherEST is 00432 significantly different from the reference EST (possibly 00433 warranting no further analysis) that a meanigful metric cannot 00434 be generated. 00435 00436 \param[in] otherEST The index (zero based) of the EST with 00437 which the reference EST is to be compared. 00438 00439 \return This method returns a similarity/distance metric by 00440 comparing the ESTs. This method may return -1, if the otherEST 00441 is significantly different from the reference EST (possibly 00442 warranting no further analysis) that a meanigful metric cannot 00443 be generated. 00444 */ 00445 virtual float getMetric(const int otherEST) = 0; 00446 00447 /** Flag to indicate if a read ahead thread must be used. 00448 00449 This boolean value is by default set to false. However, the 00450 value is changed by the parseArguments method depending on 00451 wether the use whishes to use a read-ahead feature. 00452 */ 00453 static bool readAhead; 00454 00455 /** Flag to indicate if lower-case characters must be masked out of 00456 reads. 00457 00458 Typically lower-case characters ('a', 't', 'c', 'g') are used to 00459 indicate bases that must be masked out of reads. This notation 00460 is used by DUST (part of NCBI BLAST) utility that identifies 00461 and tags low complexity regions with lower-case letters. If this 00462 flag is \c false (default) then these lower-case characters are 00463 converted to 'N' causing them to ignored by PEACE. If this 00464 flag is \c true, then these bases are converted to upper-case 00465 equivalents. This flag is passed to EST::create which actually 00466 does the conversions. 00467 */ 00468 static bool noMaskBases; 00469 00470 /** The index of the reference EST in a given file. 00471 00472 This member object is used to hold the index of a reference 00473 EST in a given file. The index values begin from 0 (zero). 00474 This member is initialized in the constructor and is changed 00475 by the setReferenceEST() id. 00476 */ 00477 int refESTidx; 00478 00479 /** The heuristic chain associated with this EST analyzer. 00480 00481 The heuristic chain contains a sequence of heuristics that 00482 must be used to minimize the number of pairs of ESTs that must 00483 be actually analyzed (using heavy weight algorithms such as 00484 D2). The chain is created in the \c main method via a call to 00485 HeuristicChain::setupChain method and is set by \c main method 00486 via a call to setHeuristicChain method. 00487 */ 00488 HeuristicChain* chain; 00489 00490 /** The FASTA file from where EST data is to be read. 00491 00492 This member object is used to hold the file name from where 00493 all the EST data is to be loaded. This member is initialized 00494 in the constructor and is never changed during the life time 00495 of this class. 00496 */ 00497 static char* estFileName; 00498 00499 /** Flag to indicate if output results must be in HTML format. 00500 00501 This member is initialized to false. However, the value is 00502 changed by the parseArguments method depending on the actual 00503 value specified by the user. 00504 */ 00505 static bool htmlLog; 00506 00507 /** The file to which results must be written. 00508 00509 This member object is used to hold the file name to which all 00510 the analysis results are to be written. This member is 00511 initialized to NULL. However, the value is changed by the 00512 parseArguments method depending on the actual value specified 00513 by the user. 00514 */ 00515 const std::string outputFileName; 00516 00517 /** The name of this analyzer. 00518 00519 This instance variable contains the human recognizable name 00520 for this analyzer. This value is set when the analyzer is 00521 instantiated (in the constructor) and is never changed during 00522 the life time of this analyzer. This information is used when 00523 generating errors, warnings, and other output messages. 00524 */ 00525 const std::string analyzerName; 00526 00527 private: 00528 /** The set of common arguments for all EST analyzers. 00529 00530 This instance variable contains a static list of arguments 00531 that are common all the EST analyzers. The common argument 00532 list is statically defined and shared by all EST instances. 00533 00534 \note This makes ESTAnalyzer class hierarchy not MT-safe. 00535 */ 00536 static arg_parser::arg_record commonArgsList[]; 00537 00538 /** A dummy operator= 00539 00540 The operator=() is supressed for this class as it has constant members 00541 whose value is set when the object is created. These values cannot be 00542 changed during the lifetime of this object. 00543 00544 \param[in] src The source object from where data is to be copied. 00545 Currently this value is ignored. 00546 00547 \return Reference to this. 00548 */ 00549 ESTAnalyzer& operator=(const ESTAnalyzer& src); 00550 }; 00551 00552 #endif