00001 #ifndef EST_H 00002 #define EST_H 00003 00004 //-------------------------------------------------------------------- 00005 // 00006 // This file is part of PEACE. 00007 // 00008 // PEACE is free software: you can redistribute it and/or modify it 00009 // under the terms of the GNU General Public License as published by 00010 // the Free Software Foundation, either version 3 of the License, or 00011 // (at your option) any later version. 00012 // 00013 // PEACE is distributed in the hope that it will be useful, but 00014 // WITHOUT ANY WARRANTY; without even the implied warranty of 00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 // General Public License for more details. 00017 // 00018 // You should have received a copy of the GNU General Public License 00019 // along with PEACE. If not, see <http://www.gnu.org/licenses/>. 00020 // 00021 // Miami University makes no representations or warranties about the 00022 // suitability of the software, either express or implied, including 00023 // but not limited to the implied warranties of merchantability, 00024 // fitness for a particular purpose, or non-infringement. Miami 00025 // University shall not be liable for any damages suffered by licensee 00026 // as a result of using, result of using, modifying or distributing 00027 // this software or its derivatives. 00028 // 00029 // By using or copying this Software, Licensee agrees to abide by the 00030 // intellectual property laws, and all other applicable laws of the 00031 // U.S., and the terms of GNU General Public License (version 3). 00032 // 00033 // Authors: Dhananjai M. Rao raodm@muohio.edu 00034 // 00035 //--------------------------------------------------------------------- 00036 00037 #include <vector> 00038 #include <functional> 00039 #include <memory> 00040 #include "ESTCustomData.h" 00041 #include "Utilities.h" 00042 00043 /** A single EST. 00044 00045 This class is used to represent a single EST. An EST object 00046 instance consists of the following information: 00047 00048 <ul> 00049 00050 <li>\b id: A unique identifier (usually a number) for this 00051 EST.</li> 00052 00053 <li>\b info: The name and other information associated with the 00054 EST. This information is typically the first header line read 00055 from a FASTA file.</li> 00056 00057 <li>\b sequence: The actual sequence of base pairs associated with 00058 this EST.<li> 00059 00060 <li>\b offset: The offset of in the FASTA file from where this EST 00061 was read. This information can be used to conditionally and 00062 rapidly load ESTs from a file.</li> 00063 00064 </ul> 00065 00066 */ 00067 class EST { 00068 public: 00069 /** EST Constructor. 00070 00071 This constructor is used to instantiate an EST method. 00072 00073 \param[in] id The unqiue ID value to be set for this EST. 00074 00075 \param[in] info The name and other information associated with 00076 the EST. This information is typically the first header line 00077 read from a FASTA file. This information can be NULL. 00078 00079 \param[in] sequence The actual sequence of base pairs 00080 associated with this EST. The sequence information that must 00081 be used to create this EST. The sequence information can be 00082 NULL. 00083 00084 \param[in] offset The offset of in the FASTA file from where 00085 this EST was read. This information can be used to conditionally 00086 and rapidly load EST's from a file. 00087 */ 00088 EST(const int id, const char *info, 00089 const char* sequence = NULL, const int offset = -1); 00090 00091 /** Create a valid EST. 00092 00093 This method must be used to create a valid EST in the system. 00094 The information required to create the EST must be passed in 00095 as the parameter. The EST names are expected to be unique in 00096 a given file. 00097 00098 \note If the new EST is successfully instantiated, then this 00099 method adds the newly created EST to the end of the list of 00100 ESTs maintianed by this class. Consequenlty, the parameter \c 00101 id must be equal to estList.size(). 00102 00103 \param[in] id The unqiue ID value to be set for this EST. 00104 00105 \param[in] info The name and other information associated with 00106 the EST. This information is typically the first header line 00107 read from a FASTA file. This information can be NULL. 00108 00109 \param[in] sequence The actual sequence of base pairs 00110 associated with this EST. The sequence information that must 00111 be used to create this EST. The sequence information can be 00112 NULL. 00113 00114 \param[in] offset The offset of in the FASTA file from where 00115 this EST was read. This information can be used to conditionally 00116 and rapidly load EST's from a file. 00117 00118 \return If the id is valid and a duplicate EST with the same 00119 ID is not present, then this method creates a new EST and 00120 returns a pointer to that EST back to the caller. 00121 */ 00122 static EST* create(const int id, const char *info, 00123 const char* sequence = NULL, 00124 const long offset = -1); 00125 00126 /** Loads data from a FASTA file to create an EST. 00127 00128 This method provides a convenient interface for loading 00129 information regarding an EST from a given FASTA file and using 00130 the information to create either a fully populated or 00131 partially populated EST. 00132 00133 \param[in,out] fastaFile The FASTA file from where the EST data 00134 is to be currently loaded. If this pointer is NULL then this 00135 method perform no action and returns immediately with NULL. 00136 00137 \param[in,out] lineNum A line number counter to be updated to 00138 provide the user with a more meaningful error message. 00139 00140 \param[in] maskBases If this flag is true, then all lowercase 00141 bases are converted to 'N' rather than uppercase characters, 00142 causing them to be ignored by downstream processing. 00143 00144 \note At the end of this method the fastaFile's file pointer 00145 will point at the beginning of the next EST (if any) in the 00146 file. 00147 */ 00148 static EST* create(FILE* fastaFile, int& lineNum, 00149 const bool maskBases = true); 00150 00151 /** Obtain the list of ESTs. 00152 00153 This method may be used to obtain a reference to the list of 00154 ESTs currently defined. 00155 00156 \return The list of ESTs currently defined. 00157 */ 00158 static std::vector<EST*>& getESTList() { return estList; } 00159 00160 /** Obtain count of ESTs that have been flagged as being processed. 00161 00162 This method can be used to determine the number of ESTs that 00163 have been flagged as being processed. Subtracting this number 00164 from the total number of ESTs indicates the number of ESTs to 00165 be processed. 00166 00167 \note This method iterates over the list of ESTs to determine 00168 the current number of processed ESTs. So use this method 00169 sparingly. 00170 00171 \return The number of ESTs that have been flagged as having 00172 been processed. 00173 */ 00174 static int getProcessedESTCount(); 00175 00176 /** Obtain the number of ESTs in this list. 00177 00178 This method may be used to determine the number of ESTs that 00179 have been defined and added to this list. 00180 00181 \return The number of ESTs currently defined. 00182 */ 00183 static int getESTCount() { return (int) estList.size(); } 00184 00185 /** Helper method to determine the longest EST. 00186 00187 This method can be used to determine the length of the longest 00188 EST loaded thus far. This information is typically used to 00189 allocate buffers and other data structures for analysis. 00190 00191 \note This method computes the length of the longest EST the 00192 first time it is invoked. Consequently, it should be called 00193 only after all the ESTs have been loaded. 00194 00195 \return The length of the longest EST to be processed. 00196 */ 00197 static size_t getMaxESTLen(); 00198 00199 /** Obtain a given EST from the EST list. 00200 00201 This method is a convenience method that can be used to obtain 00202 a given EST from the list of ESTs. 00203 00204 \param[in] estIdx The zero-based index of the EST that is 00205 desired from the list of ESTs in this class. If this index is 00206 invalid then the behavior of this method is undefined. 00207 00208 \return A mutable pointer to the EST at the provided estIdx 00209 index position in the EST list. 00210 */ 00211 static EST* getEST(const int estIdx) { return estList[estIdx]; } 00212 00213 /** Dump currently loaded ESTs in FASTA format. 00214 00215 This method can be used to dump the currently loaded EST's in 00216 FASTA file format to a given output stream. 00217 00218 \param[out] os The output stream to which EST data is to be 00219 dumped. 00220 */ 00221 static void dumpESTList(std::ostream& os); 00222 00223 /** Dump currently loaded and (un)processed ESTs in FASTA format. 00224 00225 This method can be used to dump the currently loaded EST's in 00226 FASTA file format to a given output stream. 00227 00228 \param[out] os The output stream to which EST data is to be 00229 dumped. 00230 00231 \param[in] processed If this flag is \c true, then this method 00232 dumps only those ESTs that have been flagged as having been 00233 processed. If this flag is \c false, then this method dumps 00234 only un-processed ESTs. 00235 */ 00236 static void dumpESTList(std::ostream& os, const bool processed); 00237 00238 /** Dump this EST information in FASTA format. 00239 00240 This method can be used to dump the information associated 00241 with the EST in FASTA format to a given output stream. 00242 00243 \param[in] os The output stream to which the EST's information 00244 must be written in FASTA format. 00245 */ 00246 void dumpEST(std::ostream& os); 00247 00248 /** Delete and clear all ESTs. 00249 00250 This method can be used to delete and clear all the EST's from 00251 the internal list of EST's currently loaded. 00252 */ 00253 static void deleteAllESTs(); 00254 00255 /** Delete and clear out the last EST in the list. 00256 00257 This method can be used to delete the last EST in the 00258 list. This method rests the maximum EST length instance 00259 variable as needed. This method is typically used to remove 00260 dummy ESTs that are added to the end of the list by some 00261 filters. 00262 00263 \param[in] count The number of ESTs to be removed from the 00264 list. 00265 */ 00266 static void deleteLastESTs(const int count); 00267 00268 /** Obtain the ID of this EST. 00269 00270 \return The ID of the EST that was set when this EST was 00271 created. 00272 */ 00273 inline int getID() const { return id; } 00274 00275 /** Obtain the information associated with this EST. 00276 00277 The name and other information associated with the EST. This 00278 information is typically the first header line read from a 00279 FASTA file. This information can be NULL if the EST is only 00280 partially loaded. 00281 00282 \return Any information available for this EST. 00283 */ 00284 inline const char* getInfo() const { return info; } 00285 00286 /** Obtain the actual sequence of base pairs for this EST. 00287 00288 Note that sequence ifnoramtion for an EST can be null if itis 00289 only partially loaded from a file. Entries are parially 00290 loaded to reduce memory foot print when processing large data 00291 sets. 00292 00293 \return The actual sequence of base paris for this EST if 00294 available. Otherwise this method returns NULL. 00295 */ 00296 inline const char* getSequence() const { return sequence; } 00297 00298 /** Obtain the similarity metric for this EST. 00299 00300 The similarity metric is a quantitative representation of the 00301 similarity between two ESTs. The similarity metric is 00302 generated during analysis when one EST is compared with 00303 another. The similarity value is initialized to -1. 00304 00305 \return The similarity metric for this EST. 00306 */ 00307 inline float getSimilarity() const { return similarity; } 00308 00309 /** Set the similarity metric for an EST. 00310 00311 This method must be used to change the similarity metric for 00312 this EST. The similarity metric is a quantitative 00313 representation of the similarity between two ESTs. The 00314 similarity metric is generated during analysis when one EST is 00315 compared with another. 00316 00317 \param[in] sim The similarity metric value to which this EST's 00318 similarity much be changed. 00319 */ 00320 inline void setSimilarity(const float sim) { similarity = sim; } 00321 00322 /** Method to clear general information and sequence data. 00323 00324 This method can be used to unpopulate the FASTA header and 00325 actual sequence (base pairs) information from this EST. This 00326 frees up memory allocated to hold this data thereby minimizing 00327 the memory footprint for this EST. This enables holding a 00328 large number of skeleton EST's in memory. 00329 */ 00330 void unpopulate(); 00331 00332 /** Repopulate necessary information from a given fasta file. 00333 00334 This method can be used to request an EST to repopulate its 00335 FASTA header and actual sequence (base pair) information from 00336 a given FASTA file. This method uses the offset (saved when 00337 this EST was originally loaded) to load the information from 00338 the file. 00339 00340 \param[in,out] fastaFile The file from where the EST 00341 information is to be loaded. If the file changes during EST 00342 analysis the behavior of this method is undefined. 00343 00344 \return This method returns true if the repopulating the data 00345 was successfully completed. On errors this method returns 00346 false. 00347 */ 00348 bool repopulate(FILE *fastaFile); 00349 00350 /** Change the custom data associated with this EST. 00351 00352 This method can be used to change (or set) the custom data 00353 associated with this EST. Note that any earlier custom data 00354 associated with this EST is lost (and deleted if necessary by 00355 auto_ptr) before the new value is set. 00356 00357 \param[in,out] src The new custom data to be set for this EST. 00358 After this call, this EST owns the data referred by src. 00359 */ 00360 void setCustomData(std::auto_ptr<ESTCustomData>& src) { customData = src; } 00361 00362 /** Change the custom data associated with this EST. 00363 00364 This method can be used to change (or set) the custom data 00365 associated with this EST. Note that any earlier custom data 00366 associated with this EST is lost (and deleted if necessary by 00367 auto_ptr) before the new value is set. 00368 00369 \param[in,out] src The new custom data to be set for this EST. 00370 After this call, this EST owns the data referred by src. 00371 */ 00372 void setCustomData(ESTCustomData* src) 00373 { customData = std::auto_ptr<ESTCustomData>(src); } 00374 00375 00376 /** Obtain a mutable reference to custom data associated with this EST. 00377 00378 This method can be used to obtain a mutable reference to the 00379 custom data associated with this EST. This method essentially 00380 returns the custom value set by the last successful call to 00381 one of the polymorphic setCustomData() methods in this class. 00382 By default this method returns NULL. 00383 00384 \note The custom data set in this class is returned as a 00385 auto_ptr. 00386 00387 \return The custom data (if any) associated with this EST. 00388 */ 00389 inline std::auto_ptr<ESTCustomData>& getCustomData() { return customData; } 00390 00391 /** Obtain an immutable reference to custom data associated with 00392 this EST. 00393 00394 This method can be used to obtain an immutable reference to 00395 the custom data associated with this EST. This method 00396 essentially returns the custom value set by the last 00397 successful call to one of the polymorphic setCustomData() 00398 methods in this class. By default this method returns NULL. 00399 00400 \note The custom data set in this class is returned as a 00401 auto_ptr. 00402 00403 \return The custom data (if any) associated with this EST. 00404 */ 00405 inline const std::auto_ptr<ESTCustomData>& getCustomData() const 00406 { return customData; } 00407 00408 /** The destructor. 00409 00410 The destructor for the EST essentially releases the memory 00411 used to hold the information and sequence data for a given EST. 00412 */ 00413 ~EST(); 00414 00415 /** Functor for EST sorting. 00416 00417 This Functor is used when sorting ESTs based on similarity 00418 metric at the end of analysis just prior to generating the 00419 final report. 00420 */ 00421 struct LessEST : public std::binary_function<EST, EST, bool> { 00422 inline bool operator()(const EST* x, const EST* y) { 00423 return (x->similarity > y->similarity); 00424 } 00425 }; 00426 00427 /** Helper method to read a line from a given file. 00428 00429 This is a helper method that can be used to read a long line 00430 from a given file. 00431 00432 \param[in] fp The file from where the line is to be read. 00433 00434 \return The string read from the file. 00435 */ 00436 static std::string getLine(FILE *fp); 00437 00438 /** Determine if this EST has already been processed. 00439 00440 This method exposes a generic flag that is provided as a 00441 convenience for algorithms to mark if this EST has gone 00442 through their processing. 00443 00444 \return This method returns \c true if this EST has been 00445 flagged as having been processed. Otherwise this method 00446 returns \c false. 00447 */ 00448 inline bool hasBeenProcessed() const { return processed; } 00449 00450 /** Set if this EST has already been processed. 00451 00452 This method provides a generic flag as a convenience for 00453 algorithms to mark if this EST has gone through their 00454 processing. By default ESTs are marked has processed when they 00455 are instantiated. 00456 00457 \param[in] processedFlag If this flag is \c true then this EST 00458 is flagged as having been processed. If this flag is \c false 00459 then the EST is flagged as not-processed (and requiring 00460 processing). 00461 */ 00462 inline void setProcessed(const bool processedFlag) 00463 { processed = processedFlag; } 00464 00465 protected: 00466 /** The unique ID for this EST. 00467 00468 This member holds the unique ID for this EST. The ID is set 00469 when the EST is instantiated and is never changed during the 00470 life time of this EST. The id is used to access and extract 00471 EST information. 00472 */ 00473 const int id; 00474 00475 /** The name and other information associated with the EST. This 00476 information is typically the first header line read from a 00477 FASTA file. The information may be dynamically loaded on 00478 demand to reduce memory footprint when processing large data 00479 sets. 00480 */ 00481 char *info; 00482 00483 /** The actual sequence of base pairs associated with this EST. 00484 This information is typically read from a FASTA file. The 00485 information may be dynamically loaded on demand to reduce 00486 memory footprint when processing large data sets. 00487 */ 00488 char *sequence; 00489 00490 /** The offset in the FASTA file to load the data from. 00491 00492 The offset of in the FASTA file from where this EST was read. 00493 This information can be used to conditionally and rapidly load 00494 EST's from a file. This value is initialized when the EST is 00495 insantiated and is never changed during the life time of an 00496 object. 00497 */ 00498 const long offset; 00499 00500 /** A similarity value for this EST with respect to another EST. 00501 00502 This instance variable is used to hold a similarity metric for 00503 this EST. The similarity metric is generated during analysis 00504 when one EST is compared with another. The similarity value 00505 is initialized to -1. It is accessed via the getSimilarity() 00506 method and changed via the setSimilarity() method. 00507 */ 00508 float similarity; 00509 00510 /** Instance variable to track if EST has gone through some 00511 processing. 00512 00513 This is a generic flag that is provided as a convenience for 00514 algorithms to mark if this EST has gone through their 00515 processing. By default this instance variable is intialized 00516 to \c false. Once it has been processed, the setProcessed() 00517 method can be used to set/reset this flag. The 00518 hasBeenProcessed() method can be used to determine if this EST 00519 has already been processed. 00520 */ 00521 bool processed; 00522 00523 /** Size of the longest EST. 00524 00525 This static instance variable is used to track the size (in 00526 number of nucleotides) of the longest EST ever 00527 instantiated. The size of the longest EST can be used by 00528 algorithms to optimally allocate memory for processing ESTs. 00529 */ 00530 static size_t maxESTlen; 00531 00532 /** Place holder for some other custom data. 00533 00534 This pointer acts as a convenient place holder for other 00535 classes to associate some uninterpreted user data (or data 00536 structure). This member is initialized to NULL in the 00537 constructor. Note that this pointer is managed using an 00538 auto_ptr that automatically deletes the data when the auto_ptr 00539 loses ownership of the data object. 00540 */ 00541 std::auto_ptr<ESTCustomData> customData; 00542 00543 private: 00544 /** The default constructor. 00545 00546 The default constructor has been made private to ensure that 00547 EST's are never directly created. Instead, a valid EST must 00548 be created using other constructors. 00549 */ 00550 EST(); 00551 00552 /** A utility method to duplicate a c-string. 00553 00554 This msethod is a simple utililty method that can be used to 00555 duplicate a given C-string. This method uses the stsandard 00556 C++ new operator to duplicate the given C-string. 00557 00558 \return This method simply returns NULL if src is 00559 NULL. Otherwise this method returns a pointer to a duplicate 00560 version of the specified string. 00561 */ 00562 static char* duplicate(const char *src); 00563 00564 /** Helper method to normalize a given nucleotide sequence. 00565 00566 This method is used to normalize fragments read from a FASTA 00567 file. This method normalizes the sequences such that the 00568 resulting sequence is over the set {'A', 'T', 'C', 'G', 'N'} 00569 in the following manner: 00570 00571 <ul> 00572 00573 <li>If the maskBases flag is true, then all lowercase 00574 nucleotides are converted to 'N'. Otherwise they are converted 00575 to uppercase equivalents.</li> 00576 00577 <li>All nucleotides that are not in "ATCG" are converted to 00578 'N'.</li> 00579 00580 </ul> 00581 00582 \param[in,out] sequence The sequence of nucleotides to be 00583 normalized by this method. 00584 00585 \param[in] maskBases If this flag is \c true, then all 00586 lowercase "atcg" bases are converted to 'N'. Otherwise they 00587 are converted to uppercase letters. 00588 */ 00589 static void normalizeBases(std::string& sequence, 00590 const bool maskBases = true); 00591 00592 /** The list of EST's currently being used. 00593 00594 This list contains the complete set of ESTs that are currently 00595 defined. This list includes partially loaded ESTs as well. 00596 New entries are added to the list by the create method. 00597 */ 00598 static std::vector<EST*> estList; 00599 00600 /** A dummy operator= 00601 00602 The operator=() is supressed for this class as it has constant members 00603 whose value is set when the object is created. These values cannot be 00604 changed during the lifetime of this object. 00605 00606 \param[in] src The source object from where data is to be copied. 00607 Currently this value is ignored. 00608 00609 \return Reference to this. 00610 */ 00611 EST& operator=(const EST& src); 00612 }; 00613 00614 #endif