00001 #ifndef TV_HEURISTIC_H 00002 #define TV_HEURISTIC_H 00003 00004 //-------------------------------------------------------------------- 00005 // 00006 // This file is part of PEACE. 00007 // 00008 // PEACE is free software: you can redistribute it and/or modify it 00009 // under the terms of the GNU General Public License as published by 00010 // the Free Software Foundation, either version 3 of the License, or 00011 // (at your option) any later version. 00012 // 00013 // PEACE is distributed in the hope that it will be useful, but 00014 // WITHOUT ANY WARRANTY; without even the implied warranty of 00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 // General Public License for more details. 00017 // 00018 // You should have received a copy of the GNU General Public License 00019 // along with PEACE. If not, see <http://www.gnu.org/licenses/>. 00020 // 00021 // Miami University makes no representations or warranties about the 00022 // suitability of the software, either express or implied, including 00023 // but not limited to the implied warranties of merchantability, 00024 // fitness for a particular purpose, or non-infringement. Miami 00025 // University shall not be liable for any damages suffered by licensee 00026 // as a result of using, result of using, modifying or distributing 00027 // this software or its derivatives. 00028 // 00029 // By using or copying this Software, Licensee agrees to abide by the 00030 // intellectual property laws, and all other applicable laws of the 00031 // U.S., and the terms of GNU General Public License (version 3). 00032 // 00033 // Authors: Dhananjai M. Rao raodm@muohio.edu 00034 // 00035 //--------------------------------------------------------------------- 00036 00037 #include "NewUVHeuristic.h" 00038 #include "EST.h" 00039 #include <utility> 00040 00041 /** Heuristic based upon the T/V heuristic used in WCD, a type of 00042 common word heuristic. 00043 00044 The idea of the <i>t/v</i>-heuristic is to require the common 00045 words in a pair of ESTs to be found reasonably close to each other 00046 but not too close. The rule of this heuristic is as follows: 00047 00048 <ol> 00049 00050 <li>Assume we are given two ESTs to analyze, say e<sub>i</sub> and 00051 e<sub>j</sub> and a threshold \em t.</li> 00052 00053 <li>Consider all \em v words appearing in e<sub>i</sub>.</li> 00054 00055 <li>At least \em t of these \em v words must appear in \em j so 00056 that they do not overlap (their starting positions must be at 00057 least \em v base pairs different) and are at least 100 base pairs 00058 of each other.</li> 00059 00060 <li>If there are at least \em t \em v words the huristic 00061 passes. If not, the pair need not be considered further.</li> 00062 00063 </ol> 00064 */ 00065 class TVHeuristic : public NewUVHeuristic { 00066 friend class HeuristicFactory; 00067 public: 00068 /** Display valid command line arguments for this heuristic. 00069 00070 This method is used to display all valid command line options 00071 that are supported by this heuristic. Note that this method 00072 invokes the corresponding method in the base class to display 00073 any options supported by the base class. This method is 00074 typically used in the \c main() method when displaying usage 00075 information. 00076 00077 \param[out] os The output stream to which the valid command 00078 line arguments must be written. 00079 */ 00080 virtual void showArguments(std::ostream& os); 00081 00082 /** Process command line arguments. 00083 00084 This method is used to process command line arguments specific 00085 to this heuristic. This method is typically used from the \c 00086 main method just after the heuristic has been instantiated. 00087 This method consumes all valid command line arguments. If the 00088 command line arguments were valid and successfully processed, 00089 then this method returns \c true. 00090 00091 \note This method calls the corresponding base class 00092 implementation to display common options. 00093 00094 \param[in,out] argc The number of command line arguments to be 00095 processed. When arguments are processed/consumed this 00096 parameter is changed to reflect the number of arguments 00097 processed. 00098 00099 \param[in,out] argv The array of command line arguments to be 00100 processed. When arguments are processed/consumed, the consumed 00101 arguments are removed from this array. 00102 00103 \return This method returns \c true if the command line 00104 arguments were successfully processed. Otherwise this method 00105 returns \c false. 00106 */ 00107 virtual bool parseArguments(int& argc, char **argv); 00108 00109 /** Method to begin heuristic analysis (if any). 00110 00111 This method is invoked just before commencement of EST 00112 analysis. This method essentially passes control to the base 00113 class that merely creates the arrays for building hash maps. 00114 00115 \return If the initialization process was sucessful, then this 00116 method returns 0. Otherwise this method returns with a 00117 non-zero error code. 00118 */ 00119 virtual int initialize(); 00120 00121 /** Set the reference EST id for analysis. 00122 00123 This method is invoked just before a batch of ESTs are 00124 analyzed via a call to the analyze(EST *) method. Setting the 00125 reference EST provides this heuristic an opportunity to 00126 pre-compute the normal and reverse-complement hash tabes for 00127 words of varying sizes. The hash table enables rapid searching 00128 for words in the \c runHeuristic method. 00129 00130 \note This method must be called only after the initialize() 00131 method is called. 00132 00133 \return If the hash table creation process was sucessful, then 00134 this method returns 0. Otherwise this method returns an error 00135 code. 00136 */ 00137 virtual int setReferenceEST(const int estIdx); 00138 00139 /** The destructor. 00140 00141 The destructor frees memory allocated for holding any dynamic 00142 data. in the base class. 00143 */ 00144 virtual ~TVHeuristic(); 00145 00146 /** Obtain the window length used for <i>t/v</i> heuristic. 00147 00148 The window length defines the length of the window within 00149 which common words are tracked and reported by this heuristic. 00150 Typically, this window length must match the window length 00151 used for D2 analysis for the heuristic to be meanigful. The 00152 default value is 100. This value can be overridden by the 00153 user via suitable command line arguments. 00154 00155 \return The current window (or frame) size set for <i>t/v</i> 00156 heuristic. 00157 */ 00158 int getWindowLen() { return windowLen; } 00159 00160 protected: 00161 /** The default constructor. 00162 00163 The constructor has been made protected to ensure that this 00164 class is never directly instantiated. Instead it should be 00165 created via a suitable call to the HeuristicFactory API 00166 method(s). 00167 00168 \param[in] outputFileName The output file to which any 00169 heuristic data is to be written. Currently, this value is 00170 ignored. 00171 */ 00172 TVHeuristic(const std::string& outputFileName); 00173 00174 /** Determine whether the analyzer should analyze, according to 00175 this heuristic. 00176 00177 This method can be used to compare a given EST with the 00178 reference EST (set via the call to the setReferenceEST()) 00179 method. This method operates as follows: 00180 00181 <ol> 00182 00183 <li>It invokes the corresponding method in the base class to 00184 first run the UV-sample heuristic on the pair of ESTs. If the 00185 pair fails UV-sample heuristic this method returns immediately 00186 with \c false (indicating further analysis is not needed).</li> 00187 00188 <li>If the pair passes UV-sample heuristic then this method 00189 invokes the overloaded \c runHeuristic method with a suitable 00190 encoder (normal or reverse-complement encoder depending on the 00191 value of NewUVHeuristic::bestMatchIsRC flag) to analyze 00192 the pair of ESTs using the TV heuristic.</li> 00193 00194 </ol> 00195 00196 \param[in] otherEST The index (zero based) of the EST with 00197 which the reference EST is to be compared. 00198 00199 \return This method returns \c true if the heuristic says the 00200 EST pair should be analyzed, and \c false if it should not. 00201 */ 00202 virtual bool runHeuristic(const int otherEST); 00203 00204 /** Method to obtain and update the parameters for the heuristic 00205 based on the parameter set manager. 00206 00207 \return true if we should analyze these ESTs, false otherwise 00208 */ 00209 bool updateParameters(); 00210 00211 /** Templatized-method for counting common woards between two ESTs. 00212 00213 This method is a helper method that is invoked from the 00214 runHeuristic method to count the number of common words 00215 between the reference EST (set via call to setReferenceEST) 00216 and otherEST (parameter). This method operates as follows: 00217 00218 <ol> 00219 00220 <li>First the matchTable (instance variable) is cleared to all 00221 zeros.</li> 00222 00223 <li>Next the initial word of length NewUVHeuristic::v is 00224 constructed while ignoring bases marked as 'n' (this may 00225 require processing of more than the first NewUVHeuristic::v 00226 bases if one of them is a 'n'.</li> 00227 00228 </ol> 00229 00230 */ 00231 template <typename Encoder> 00232 int countCommonWords(const int otherEST, Encoder encoder, 00233 const char* refWordMap) { 00234 // First compute the hash for the first word using the supplied 00235 // encoder. 00236 const char *otherSeq = EST::getEST(otherEST)->getSequence(); 00237 register int hash = 0; 00238 int ignoreMask = 0; 00239 // Set first window length entries to zero. 00240 memset(matchTable, 0, sizeof(char) * (windowLen + NewUVHeuristic::v)); 00241 // Compute hash for initial word while skipping over bases 00242 // makred 'n'. This may require processing of more then v-1 00243 // bases 00244 for(int i = 0; (i < NewUVHeuristic::v - 1); i++) { 00245 hash = encoder(hash, otherSeq[i], ignoreMask); 00246 } 00247 // Skip first windowLen entries to simplify logic in loop below. 00248 char *matchTicker = matchTable + windowLen; 00249 // Now see how many common words exist in the two ESTs 00250 int numMatch = 0, maxMatch = 0; 00251 int oldWindowPos = -windowLen; 00252 for(int i = NewUVHeuristic::v - 1; (i < otherESTLen); i++) { 00253 hash = encoder(hash, otherSeq[i], ignoreMask); 00254 numMatch -= matchTicker[oldWindowPos++]; 00255 if (!ignoreMask) { 00256 // If ignoreMask is zero, it tells us that the hash is 00257 // not tainted due to 'n' bp and it is good to be 00258 // used for the operations below. 00259 matchTicker[i] = refWordMap[hash]; 00260 numMatch += refWordMap[hash]; 00261 maxMatch = std::max(maxMatch, numMatch); 00262 } 00263 } 00264 // Return number of matches encountered. 00265 return maxMatch; 00266 } 00267 00268 /** Method to display statistics regarding operation of this 00269 heuristic. 00270 00271 This method can be used to obtain a dump of the statistics 00272 gathered regarding the operation of this heuristic. This 00273 method calls the base class method first which prints some 00274 common statistics. It then displays the number of times the 00275 <i>u/v</i> sample heuristic (the base class) returned success 00276 causing the <i>t/v</i> heuristic to be run. 00277 00278 \param[out] os The output stream to which the statistics 00279 regarding the heuristic is to be dumped. 00280 */ 00281 virtual void printStats(std::ostream& os) const; 00282 00283 private: 00284 /** The set of arguments specific to the TV heuristic. 00285 00286 This instance variable contains a static list of arguments 00287 that are specific only to this analyzer class. This argument 00288 list is statically defined and shared by all instances of this 00289 class. 00290 00291 \note Use of static arguments and parameters renders this UV 00292 sample heuristic class not to be MT-safe. 00293 */ 00294 static arg_parser::arg_record argsList[]; 00295 00296 /** The number of minumum number of common words. 00297 00298 This instance variable contains the minimum number of words 00299 (that are close but not too close) that have matching values 00300 in pairs of ESTs. The default is 65. However, this value can 00301 be overridden by a command line argument. 00302 */ 00303 int t; 00304 00305 /** The window length to be used for <i>t/v</i> heuristic. 00306 00307 The window length defines the length of the window within 00308 which common words are tracked and reported by this heuristic. 00309 Typically, this window length must match the window length 00310 used for D2 analysis for the heuristic to be meanigful. The 00311 default value is 100. This value can be overridden by the 00312 user via suitable command line arguments. 00313 */ 00314 int windowLen; 00315 00316 /** A large table to track matches. 00317 00318 This instance variable contains a large table that tracks 00319 matches encountered as this heuristic tracks matching words. 00320 */ 00321 char *matchTable; 00322 00323 /** Instance variable to track the number of times UV sample 00324 heuristic passed. 00325 00326 This instance variable is used to track the number of times 00327 the UV sample heuristic passed. This value indicates the 00328 number of times the TV heuristic was actually run. This value 00329 is incremented in the runHeuristic method and is displayed by 00330 the printStats() method. 00331 */ 00332 int uvSuccessCount; 00333 }; 00334 00335 #endif