00001 #ifndef MATRIX_FILE_ANALYZER_H 00002 #define MATRIX_FILE_ANALYZER_H 00003 00004 //-------------------------------------------------------------------- 00005 // 00006 // This file is part of PEACE. 00007 // 00008 // PEACE is free software: you can redistribute it and/or modify it 00009 // under the terms of the GNU General Public License as published by 00010 // the Free Software Foundation, either version 3 of the License, or 00011 // (at your option) any later version. 00012 // 00013 // PEACE is distributed in the hope that it will be useful, but 00014 // WITHOUT ANY WARRANTY; without even the implied warranty of 00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 // General Public License for more details. 00017 // 00018 // You should have received a copy of the GNU General Public License 00019 // along with PEACE. If not, see <http://www.gnu.org/licenses/>. 00020 // 00021 // Miami University makes no representations or warranties about the 00022 // suitability of the software, either express or implied, including 00023 // but not limited to the implied warranties of merchantability, 00024 // fitness for a particular purpose, or non-infringement. Miami 00025 // University shall not be liable for any damages suffered by licensee 00026 // as a result of using, result of using, modifying or distributing 00027 // this software or its derivatives. 00028 // 00029 // By using or copying this Software, Licensee agrees to abide by the 00030 // intellectual property laws, and all other applicable laws of the 00031 // U.S., and the terms of GNU General Public License (version 3). 00032 // 00033 // Authors: Dhananjai M. Rao raodm@muohio.edu 00034 // 00035 //--------------------------------------------------------------------- 00036 00037 #include "ESTAnalyzer.h" 00038 #include <string> 00039 #include <vector> 00040 00041 // Forward declaration to keep compiler happy 00042 class EST; 00043 class ResultLog; 00044 00045 /** MatrixFileAnalyzer: EST Analyzer that simply obtains distances 00046 from a matrix file for processing. 00047 00048 <p>This analyzer provides a simple interface for using precomputed 00049 distance/similarity values from a given data file. A matrix data 00050 file must have the following format: 00051 00052 \code 00053 00054 # Lines starting with '#' character are assumed to be comments and 00055 # they are ignored. The first non-comment line must be a number 00056 # indicating the number of ESTs for which data is present in the 00057 # file. For example here is a file for 3 ESTs 00058 3 00059 00060 # After the number of EST's there must be nxn matrix of values 00061 # where n is number of EST's. 00062 0.0 10 20 00063 10 0 15 00064 20 15.2 0 00065 00066 \endcode 00067 00068 */ 00069 class MatrixFileAnalyzer : public ESTAnalyzer { 00070 friend class ESTAnalyzerFactory; 00071 public: 00072 /** The destructor. 00073 00074 The destructor frees up all any dynamic memory allocated by 00075 this object for its operations. 00076 */ 00077 virtual ~MatrixFileAnalyzer(); 00078 00079 /** Display valid command line arguments for this analyzer. 00080 00081 This method must be used to display all valid command line 00082 options that are supported by this analyzer. 00083 00084 \note The ESTAnalyzer base class requires that derived EST 00085 analyzer classes <b>must</b> override this method to display 00086 help for their custom command line arguments. When this 00087 method is overridden don't forget to call the corresponding 00088 base class implementation to display common options. 00089 00090 \param[out] os The output stream to which the valid command 00091 line arguments must be written. 00092 */ 00093 virtual void showArguments(std::ostream& os); 00094 00095 /** Process command line arguments. 00096 00097 This method is used to process command line arguments specific 00098 to this EST analyzer. This method is typically used from the 00099 main method just after the EST analyzer has been instantiated. 00100 This method consumes all valid command line arguments. If the 00101 command line arguments were valid and successfully processed, 00102 then this method returns \c true. 00103 00104 \note The \c ESTAnalyzer base class requires that derived EST 00105 analyzer classes <b>must</b> override this method to process 00106 any command line arguments that are custom to their operation. 00107 When this method is overridden don't forget to call the 00108 corresponding base class implementation to display common 00109 options. 00110 00111 \param[in,out] argc The number of command line arguments to be 00112 processed. 00113 00114 \param[in,out] argv The array of command line arguments. 00115 00116 \return This method returns \c true if the command line 00117 arguments were successfully processed. Otherwise this method 00118 returns \c false. This method checks to ensure that a valid 00119 frame size and a valid word size have been specified. 00120 */ 00121 virtual bool parseArguments(int& argc, char **argv); 00122 00123 /** Method to begin EST analysis. 00124 00125 This method is invoked just before commencement of EST 00126 analysis. This method loads the list of distance values from 00127 the given input file and pouplates the matrix \c 00128 distanceValues for futher use in the \c analyze method. 00129 00130 \return If the ESTs were successfully loaded from the data 00131 file then this method returns 0. Otherwise this method 00132 returns with a non-zero error code. 00133 */ 00134 int initialize(); 00135 00136 /** Method to obtain human-readable name for this EST analyzer 00137 00138 This method provides a human-readable string identifying the 00139 EST analyzer. This string is typically used for 00140 display/debugging purposes (particularly via the PEACE 00141 Interactive Console). 00142 00143 \return This method returns the string "MatrixFile" 00144 identifiying this analyzer. 00145 */ 00146 virtual std::string getName() const { return "MatrixFile"; } 00147 00148 /** Set the reference EST id for analysis. 00149 00150 This method is invoked just before a batch of ESTs are 00151 analyzed via a call to the analyze(EST *) method. This method 00152 currently saves the index in the instance variable for further 00153 look up. 00154 00155 \note This method must be called only after the initialize() 00156 method is called. 00157 00158 \return This method returns \c true if the estIdx was within 00159 the range of values that were loaded from the data file. 00160 Otherwise this method returns 1 as the error code. 00161 */ 00162 virtual int setReferenceEST(const int estIdx); 00163 00164 /** Method to pefrom a batch of EST analysis. 00165 00166 The \c ESTAnalyzer base class requires this method to be 00167 overloaded in the dervied class(es). This method is used to 00168 perform the core tasks of EST analysis for teh 00169 MatrixFileAnalyzer. This method operates in the following 00170 manner: 00171 00172 <ol> 00173 00174 <li>First it loads the necessary distance information from the 00175 supplied data file file using the initialize() method. If the 00176 data is not successfully loaded then this method returns right 00177 away with 1.<li> 00178 00179 <li>Upon successfully loading the data, the reference EST is 00180 set via the setReferenceEST() method. If the reference EST is 00181 not correctly determined, then this method immediately returns 00182 with 2.</li> 00183 00184 <li>For each EST in the list of ESTs it performs the following 00185 tasks: 00186 00187 <li>It logs the similarity metric using suitable methods in 00188 the ESTAnalyzer base class.<li> 00189 00190 <li>If all the processing proceeds successfully, this method 00191 returns 0 (zero). 00192 00193 </ol> 00194 */ 00195 virtual int analyze(); 00196 00197 protected: 00198 /** Analyze and obtain a distance (or similarity) metric. 00199 00200 This method can be used to compare a given EST with the 00201 reference EST (set via the call to the setReferenceEST()) 00202 method. 00203 00204 \param[in] otherEST The index (zero based) of the EST with 00205 which the reference EST is to be compared. 00206 00207 \return This method returns the distance (or similarity as the 00208 case may be) value loaded by the data file (loading is 00209 performed by the initialize() method). 00210 */ 00211 virtual float getMetric(const int otherEST); 00212 00213 /** The data file from where the distance (or similarity) 00214 metrics must be loaded. 00215 00216 This instance variable is set to the dat file from where the 00217 necessary information is to be loaded. This value is set in 00218 the constructor and is never changed during the life time of 00219 this class. 00220 */ 00221 const std::string inputFileName; 00222 00223 /** The number of EST's for which we have data in \c distanceValues 00224 matrix. 00225 00226 This instance variable's value is set by the initialize method 00227 in this class. 00228 */ 00229 int estCount; 00230 00231 /** The array of distance values. 00232 00233 This matrix contains the distance values between a given pair 00234 of ESTs. The zero-based index of the EST is used to look up 00235 values in this matrix. For example, given a pair of ESTs 00236 <est1, est2> \c distanceValues[est1][est2] provides the 00237 distance from est1 to est2 while \c distanceValues[est2][est1] 00238 provides the distance from est2 to est1. Note that distances 00239 do not have to be symmetric. 00240 */ 00241 float **distanceValues; 00242 00243 /** Method to compare two metrics generated by this class. 00244 00245 This method provides the interface for comparing metrics 00246 generated by this ESTAnalyzer when comparing two different 00247 ESTs. This method returns \c true if \c metric1 is 00248 comparatively better than or equal to \c metric2. 00249 00250 \note EST analyzers that are based on distance measures \b 00251 must override this method. 00252 00253 \param[in] metric1 The first metric to be compared against. 00254 00255 \param[in] metric2 The second metric to be compared against. 00256 00257 \return This method returns \c true if metric1 is 00258 comparatively better than \c metric2. 00259 */ 00260 bool compareMetrics(const float metric1, const float metric2) const 00261 { return (metric1 < metric2); } 00262 00263 /** Obtain an invalid (or the worst) metric generated by this 00264 analyzer. 00265 00266 This method can be used to obtain an invalid metric value for 00267 this analyzer. This value can be used to initialize metric 00268 values. 00269 00270 \note Dervied distance-based metric classes must override this 00271 method to provide a suitable value. 00272 00273 \return This method returns an invalid (or the worst) metric 00274 of 1e7 for this EST analyzer. 00275 */ 00276 float getInvalidMetric() const { return 1e7; } 00277 00278 /** Determine if this EST analyzer provides distance metrics or 00279 similarity metrics. 00280 00281 This method can be used to determine if this EST analyzer 00282 provides distance metrics or similarity metrics. If this 00283 method returns \c true, then this EST analyzer returns 00284 distance metrics (smaller is better). On the other hand, if 00285 this method returns \c false, then this EST analyzer returns 00286 similarity metrics (bigger is better). 00287 00288 \return This method returns \c true to indicate that this EST 00289 analyzer operates using distance metrics. 00290 */ 00291 bool isDistanceMetric() const { return true; } 00292 00293 private: 00294 /* 00295 The default constructor for this class. The constructor is 00296 made private so that this class cannot be directly 00297 instantiated. However, since the ESTAnalyzerFactory is a 00298 friend of this class, an object can be instantiated via teh 00299 ESTAnalyzerFactory::create() method. 00300 00301 \param[in] refESTidx The reference EST index value to be used 00302 when performing EST analysis. This parameter should be >= 0. 00303 This value is simply passed onto the base class. 00304 00305 \param[in] inputFileName The input data file from where the 00306 data is to be read. Note that this analyzer currently ignores 00307 the FASTA file specified. However, the FASTA file will be used 00308 at later date to cross reference est index values to 00309 corresponding genomic sequences. 00310 00311 \param[in] outputFile The name of the output file to which the 00312 EST analysis data is to be written. This parameter is ignored 00313 if this analyzer is used for clustering. If this parameter is 00314 the empty string then output is written to standard output. 00315 This value is simply passed onto the base class. 00316 */ 00317 MatrixFileAnalyzer(const int refESTidx, 00318 const std::string& outputFileName); 00319 00320 /** Method to read a line from a given EST file. 00321 00322 This method is a helper method that is used to load a given 00323 line from the file. 00324 00325 \param[in,out] fp The file pointer from where the data is to be 00326 read. 00327 00328 \return The line read from the file. If EOF was reached then 00329 this method returns an empty line. 00330 */ 00331 std::string readLine(FILE *fp); 00332 00333 /** Utility method to read metrics from a given line into a given 00334 array from a given starting position. 00335 00336 This method is a utility method that is used to read parse in 00337 a line containing space separated set of values into the array 00338 values. Data is stored into the values array starting with 00339 startPos. 00340 00341 \param[in] line The line whose contents is to be updated. 00342 00343 \param[out] values The array into which the values must be 00344 stored. 00345 00346 \param[in] startPos The starting position in the array from 00347 where values must be stored. 00348 00349 \param[in] maxValues The maximum number of items that must be 00350 processed from line. 00351 00352 \return This method returns the number of values 00353 actually processed and stored into the values array. 00354 */ 00355 int parseMetrics(const char* line, float *values, 00356 const int startPos, const int maxValues); 00357 00358 /** Helper method to process EST count information from a line. 00359 00360 This method is a helper method that is used to read the number 00361 of EST's from the file and create the distanceValues matrix. 00362 00363 \param[in] line The line from where the EST count information 00364 is to be read. 00365 00366 \return This method returns \c true if the EST count was 00367 processed successfully. Otherwise this method returns \c 00368 false. 00369 */ 00370 bool parseESTCount(const char *line); 00371 00372 /** The set of arguments for the MatrixFileAnalyzer. 00373 00374 This instance variable contains a static list of arguments 00375 that are used by this analyzer. 00376 */ 00377 static arg_parser::arg_record argsList[]; 00378 00379 /** The matrix data file from where distance metrics are to be 00380 read. 00381 00382 This member object is used to hold the file name from where 00383 all the distance matrix data is to be loaded. The value is 00384 set to the value supplied by the user via a suitable command 00385 line argument by the parseArguments() method. 00386 */ 00387 static char* dataFileName; 00388 }; 00389 00390 #endif