00001 #ifndef NEW_UV_HEURISTIC_H 00002 #define NEW_UV_HEURISTIC_H 00003 00004 //-------------------------------------------------------------------- 00005 // 00006 // This file is part of PEACE. 00007 // 00008 // PEACE is free software: you can redistribute it and/or modify it 00009 // under the terms of the GNU General Public License as published by 00010 // the Free Software Foundation, either version 3 of the License, or 00011 // (at your option) any later version. 00012 // 00013 // PEACE is distributed in the hope that it will be useful, but 00014 // WITHOUT ANY WARRANTY; without even the implied warranty of 00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 // General Public License for more details. 00017 // 00018 // You should have received a copy of the GNU General Public License 00019 // along with PEACE. If not, see <http://www.gnu.org/licenses/>. 00020 // 00021 // Miami University makes no representations or warranties about the 00022 // suitability of the software, either express or implied, including 00023 // but not limited to the implied warranties of merchantability, 00024 // fitness for a particular purpose, or non-infringement. Miami 00025 // University shall not be liable for any damages suffered by licensee 00026 // as a result of using, result of using, modifying or distributing 00027 // this software or its derivatives. 00028 // 00029 // By using or copying this Software, Licensee agrees to abide by the 00030 // intellectual property laws, and all other applicable laws of the 00031 // U.S., and the terms of GNU General Public License (version 3). 00032 // 00033 // Authors: Dhananjai M. Rao raodm@muohio.edu 00034 // 00035 //--------------------------------------------------------------------- 00036 00037 #include "arg_parser.h" 00038 #include "Heuristic.h" 00039 #include "HeuristicFactory.h" 00040 #include "Utilities.h" 00041 #include "HashMap.h" 00042 00043 #include <string> 00044 #include <vector> 00045 00046 /** \typedef HashMap<int, std::vector<short> > UVHashTable; 00047 00048 \brief A hash map to provide rapid access to the set of 00049 pre-computed hash values for a given EST. 00050 00051 This typedef provides a convenient short cut to refer to a hash 00052 map that is used to cache pre-computed hash values for a given 00053 EST. The zero-based index of the EST is used as the key into this 00054 hash map. Each entry in the hash map contains a std::vector that 00055 essentially contains the array of hash values. 00056 00057 \note Refer to the additional documentation with the instance 00058 variable uvCache for motivation for this hash map. 00059 */ 00060 typedef HashMap<int, std::vector<unsigned short> > UVHashTable; 00061 00062 /** Heuristic based upon the "u/v sample heuristic" used in WCD, 00063 a type of common word heuristic. Considers all words of length v 00064 in the first sequence and every 16th word of length v in the second 00065 sequence. Returns true if it finds at least u common words. 00066 */ 00067 class NewUVHeuristic : public Heuristic { 00068 friend class HeuristicFactory; 00069 public: 00070 /** Display valid command line arguments for this heuristic. 00071 00072 This method must be used to display all valid command line 00073 options that are supported by this heuristic. Note that 00074 derived classes may override this method to display additional 00075 command line options that are applicable to it. This method 00076 is typically used in the main() method when displaying usage 00077 information. 00078 00079 \note Derived heuristic classes <b>must</b> override this 00080 method to display help for their custom command line 00081 arguments. When this method is overridden don't forget to 00082 call the corresponding base class implementation to display 00083 common options. 00084 00085 \param[out] os The output stream to which the valid command 00086 line arguments must be written. 00087 */ 00088 virtual void showArguments(std::ostream& os); 00089 00090 /** Process command line arguments. 00091 00092 This method is used to process command line arguments specific 00093 to this heuristic. This method is typically used from the 00094 main method just after the heuristic has been instantiated. 00095 This method consumes all valid command line arguments. If the 00096 command line arguments were valid and successfully processed, 00097 then this method returns \c true. 00098 00099 \note Derived heuristic classes <b>must</b> override this 00100 method to process any command line arguments that are custom 00101 to their operation. When this method is overridden don't 00102 forget to call the corresponding base class implementation to 00103 display common options. 00104 00105 \param[in,out] argc The number of command line arguments to be 00106 processed. 00107 00108 \param[in,out] argv The array of command line arguments. 00109 00110 \return This method returns \c true if the command line 00111 arguments were successfully processed. Otherwise this method 00112 returns \c false. 00113 */ 00114 virtual bool parseArguments(int& argc, char **argv); 00115 00116 /** Method to begin heuristic analysis (if any). 00117 00118 This method is invoked just before commencement of EST 00119 analysis. This method typically loads additional information 00120 that may be necessary for a given heuristic from data files. 00121 In addition, it may perform any pre-processing as the case may 00122 be. 00123 00124 \note Derived classes must override this method. 00125 00126 \return If the initialization process was sucessful, then this 00127 method returns 0. Otherwise this method returns with a 00128 non-zero error code. 00129 */ 00130 virtual int initialize(); 00131 00132 /** Set the reference EST id for analysis. 00133 00134 This method is invoked just before a batch of ESTs are 00135 analyzed via a call to the analyze(EST *) method. Setting the 00136 reference EST provides heuristics an opportunity to optimize 00137 certain operations, if possible. 00138 00139 \note This method must be called only after the initialize() 00140 method is called. 00141 00142 \return If the initialization process was sucessful, then this 00143 method returns 0. Otherwise this method returns an error code. 00144 */ 00145 virtual int setReferenceEST(const int estIdx); 00146 00147 /** The destructor. 00148 00149 The destructor frees memory allocated for holding any dynamic 00150 data in the base class. 00151 */ 00152 virtual ~NewUVHeuristic(); 00153 00154 protected: 00155 /** The default constructor. 00156 00157 The constructor has been made protected to ensure that this 00158 class is never directly instantiated. Instead one of the 00159 derived Heuristic classes must be instantiated via the 00160 HeuristicFactory API methods. 00161 00162 \param[in] name The human readable name for this heuristic. 00163 This name is used when generating errors, warnings, and other 00164 output messages for this heuristic. 00165 00166 \param[in] outputFileName The output file to which any 00167 analysis information is to be written. Currently this 00168 parameter is unused. 00169 */ 00170 NewUVHeuristic(const std::string& name, 00171 const std::string& UNREFERENCED_PARAMETER(outputFileName)); 00172 00173 /** Determine whether the analyzer should analyze, according to 00174 this heuristic. 00175 00176 This method can be used to compare a given EST with the 00177 reference EST (set via the call to the setReferenceEST()) 00178 method. 00179 00180 \param[in] otherEST The index (zero based) of the EST with 00181 which the reference EST is to be compared. 00182 00183 \return This method returns true if the heuristic says the 00184 EST pair should be analyzed, and false if it should not. 00185 */ 00186 virtual bool runHeuristic(const int otherEST); 00187 00188 /** Method to compute the <i>u/v</i> hash values for a given EST. 00189 00190 This method is a utility method that was introduced to 00191 streamline the process of computing and caching <i>u/v</i> 00192 hash values for a given EST. This method uses the variables 00193 \c u, \c v, and \c wordShift (all of them user configurable) 00194 along with a ESTCodec::NormalEncoder object to compute hash 00195 values into a std::vector. The vector is added to the \c 00196 uvCache hash map for future reference. 00197 00198 \param[in] estIdx The zero-based index of the EST whose hash 00199 values is to be computed and cached. 00200 */ 00201 void computeHash(const int estIdx); 00202 00203 int refESTLen; 00204 00205 int otherESTLen; 00206 00207 int u; 00208 00209 int wordShift; 00210 00211 int passes; 00212 00213 /** Instance variable to track if a given word (of length \c v) 00214 appears in reference EST. 00215 00216 This instance variable is created in the initialize() method 00217 to point to an array of 4<sup>v</sup> 00218 00219 */ 00220 char* s1WordMap; 00221 00222 /** Instance variable to track if a given word (of length \c v) 00223 appears in reference EST. 00224 00225 This instance variable is created in the initialize() method 00226 to point to an array of 4<sup>v</sup> 00227 */ 00228 char* s1RCWordMap; 00229 00230 /** Flag to indicate if normal or reverse-complement version 00231 provided best match. 00232 00233 This flag is set at the end of the runHeuristic method in this 00234 class to indicate if the normal or the reverse-complement 00235 check yielded the best possible match. If this flag is \c 00236 false, then the normal check yielded the best match. If this 00237 value is \c true, then the reverse-complement check yielded 00238 the best match. 00239 */ 00240 bool bestMatchIsRC; 00241 00242 /** Instance variable to maintain the \em v parameter for the 00243 <i>u/v</i> heuristic. 00244 00245 */ 00246 static int v; 00247 00248 static int BitMask; 00249 00250 /** Instance variable to store the number of bits to be shifted to 00251 create hash values. 00252 00253 <p>This instance variable is set to the value of 2 * (\em v - 00254 1) (in the \c initialize method) to reflect the number of bits 00255 that need to be shifted in order to build the hash values for 00256 common words (including the values stored in \c s1WordMap and 00257 \c s1RCWordMap).</p> 00258 00259 <p>This instance variable is actually passed on to the 00260 ESTCodec::NormalEncoder or ESTCodec::RevCompEncoder when 00261 computing hash values. Since this is value is passed in a 00262 template parameter, it is defined to be static (to ensure that 00263 it has external linkage as per the ISO/ANSI standards 00264 requirement).</p> 00265 */ 00266 static int bitsToShift; 00267 00268 private: 00269 /** The set of arguments specific to the UV heuristic. 00270 00271 This instance variable contains a static list of arguments 00272 that are specific only to this analyzer class. This argument 00273 list is statically defined and shared by all instances of this 00274 class. 00275 00276 \note Use of static arguments and parameters renders this UV 00277 sample heuristic class not to be MT-safe. 00278 */ 00279 static arg_parser::arg_record argsList[]; 00280 00281 /** A hash map to cache hash values (<i>v</i> base pairs in 00282 length) to seedup <i>u/v</i> heuristic. 00283 00284 <p>The <i>u/v</i> heuristic \b used to generate hash values 00285 (in the \c runHeuristic method) by iterating over the base 00286 pairs in a given EST sequence. However, this approach turned 00287 out to be rather slow. Furthermore, it was observed that the 00288 same set of hash values were recomputed for various pair-wise 00289 EST comparisons.</p> 00290 00291 <p>Therefore, to improve the overall performance of the 00292 <i>u/v</i> heuristic, it was proposed that the hash values be 00293 cached to improve performance. Of course, this does increase 00294 the net amount of memory consumed. Consequently, to aid in 00295 caching only the required subset of EST hash values, this hash 00296 map was introduced to store the necessary sequences and 00297 rapidly access them when needed.</p> 00298 00299 <p>The entries in this hash map are computed by the \c 00300 computeHash method, which is invoked from the runHeuristic 00301 method.</p> 00302 */ 00303 UVHashTable uvCache; 00304 00305 /** The hint key that is used to add hint for normal or 00306 reverse-complement D2 computation. 00307 00308 This hint key is used to set a hint in the \c hints hash 00309 map. This string is defined as a constant to save compute time 00310 in the core \c runHeuristics method. 00311 */ 00312 const std::string hintKey; 00313 }; 00314 00315 #endif