00001 #ifndef UV_SAMPLE_HEURISTIC_H 00002 #define UV_SAMPLE_HEURISTIC_H 00003 00004 //-------------------------------------------------------------------- 00005 // 00006 // This file is part of PEACE. 00007 // 00008 // PEACE is free software: you can redistribute it and/or modify it 00009 // under the terms of the GNU General Public License as published by 00010 // the Free Software Foundation, either version 3 of the License, or 00011 // (at your option) any later version. 00012 // 00013 // PEACE is distributed in the hope that it will be useful, but 00014 // WITHOUT ANY WARRANTY; without even the implied warranty of 00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 // General Public License for more details. 00017 // 00018 // You should have received a copy of the GNU General Public License 00019 // along with PEACE. If not, see <http://www.gnu.org/licenses/>. 00020 // 00021 // Miami University makes no representations or warranties about the 00022 // suitability of the software, either express or implied, including 00023 // but not limited to the implied warranties of merchantability, 00024 // fitness for a particular purpose, or non-infringement. Miami 00025 // University shall not be liable for any damages suffered by licensee 00026 // as a result of using, result of using, modifying or distributing 00027 // this software or its derivatives. 00028 // 00029 // By using or copying this Software, Licensee agrees to abide by the 00030 // intellectual property laws, and all other applicable laws of the 00031 // U.S., and the terms of GNU General Public License (version 3). 00032 // 00033 // Authors: Dhananjai M. Rao raodm@muohio.edu 00034 // 00035 //--------------------------------------------------------------------- 00036 00037 #include "arg_parser.h" 00038 #include "Heuristic.h" 00039 #include "HeuristicFactory.h" 00040 #include "Utilities.h" 00041 #include "HashMap.h" 00042 00043 #include <string> 00044 #include <vector> 00045 00046 /** \typedef HashMap<int, std::vector<short> > UVHashTable; 00047 00048 \brief A hash map to provide rapid access to the set of 00049 pre-computed hash values for a given EST. 00050 00051 This typedef provides a convenient short cut to refer to a hash 00052 map that is used to cache pre-computed hash values for a given 00053 EST. The zero-based index of the EST is used as the key into this 00054 hash map. Each entry in the hash map contains a std::vector that 00055 essentially contains the array of hash values. 00056 00057 \note Refer to the additional documentation with the instance 00058 variable uvCache for motivation for this hash map. 00059 */ 00060 typedef HashMap<int, std::vector<unsigned short> > UVHashTable; 00061 00062 /** Heuristic based upon the "u/v sample heuristic" used in WCD, 00063 a type of common word heuristic. Considers all words of length v 00064 in the first sequence and every 16th word of length v in the second 00065 sequence. Returns true if it finds at least u common words. 00066 */ 00067 class UVSampleHeuristic : public Heuristic { 00068 friend class HeuristicFactory; 00069 public: 00070 /** Display valid command line arguments for this heuristic. 00071 00072 This method must be used to display all valid command line 00073 options that are supported by this heuristic. Note that 00074 derived classes may override this method to display additional 00075 command line options that are applicable to it. This method 00076 is typically used in the main() method when displaying usage 00077 information. 00078 00079 \note Derived heuristic classes <b>must</b> override this 00080 method to display help for their custom command line 00081 arguments. When this method is overridden don't forget to 00082 call the corresponding base class implementation to display 00083 common options. 00084 00085 \param[out] os The output stream to which the valid command 00086 line arguments must be written. 00087 */ 00088 virtual void showArguments(std::ostream& os); 00089 00090 /** Process command line arguments. 00091 00092 This method is used to process command line arguments specific 00093 to this heuristic. This method is typically used from the 00094 main method just after the heuristic has been instantiated. 00095 This method consumes all valid command line arguments. If the 00096 command line arguments were valid and successfully processed, 00097 then this method returns \c true. 00098 00099 \note Derived heuristic classes <b>must</b> override this 00100 method to process any command line arguments that are custom 00101 to their operation. When this method is overridden don't 00102 forget to call the corresponding base class implementation to 00103 display common options. 00104 00105 \param[in,out] argc The number of command line arguments to be 00106 processed. 00107 00108 \param[in,out] argv The array of command line arguments. 00109 00110 \return This method returns \c true if the command line 00111 arguments were successfully processed. Otherwise this method 00112 returns \c false. 00113 */ 00114 virtual bool parseArguments(int& argc, char **argv); 00115 00116 /** Method to begin heuristic analysis (if any). 00117 00118 This method is invoked just before commencement of EST 00119 analysis. This method typically loads additional information 00120 that may be necessary for a given heuristic from data files. 00121 In addition, it may perform any pre-processing as the case may 00122 be. 00123 00124 \note Derived classes must override this method. 00125 00126 \return If the initialization process was sucessful, then this 00127 method returns 0. Otherwise this method returns with a 00128 non-zero error code. 00129 */ 00130 virtual int initialize(); 00131 00132 /** Set the reference EST id for analysis. 00133 00134 This method is invoked just before a batch of ESTs are 00135 analyzed via a call to the analyze(EST *) method. Setting the 00136 reference EST provides heuristics an opportunity to optimize 00137 certain operations, if possible. 00138 00139 \note This method must be called only after the initialize() 00140 method is called. 00141 00142 \return If the initialization process was sucessful, then this 00143 method returns 0. Otherwise this method returns an error code. 00144 */ 00145 virtual int setReferenceEST(const int estIdx); 00146 00147 /** The destructor. 00148 00149 The destructor frees memory allocated for holding any dynamic 00150 data in the base class. 00151 */ 00152 virtual ~UVSampleHeuristic(); 00153 00154 protected: 00155 /** The default constructor. 00156 00157 The constructor has been made protected to ensure that this 00158 class is never directly instantiated. Instead one of the 00159 derived Heuristic classes must be instantiated via the 00160 HeuristicFactory API methods. 00161 00162 \param[in] name The human readable name for this heuristic. 00163 This name is used when generating errors, warnings, and other 00164 output messages for this heuristic. 00165 00166 \param[in] outputFileName The output file to which any 00167 analysis information is to be written. Currently this 00168 parameter is unused. 00169 */ 00170 UVSampleHeuristic(const std::string& name, 00171 const std::string& outputFileName); 00172 00173 /** Determine whether the analyzer should analyze, according to 00174 this heuristic. 00175 00176 This method can be used to compare a given EST with the 00177 reference EST (set via the call to the setReferenceEST()) 00178 method. 00179 00180 \param[in] otherEST The index (zero based) of the EST with 00181 which the reference EST is to be compared. 00182 00183 \return This method returns true if the heuristic says the 00184 EST pair should be analyzed, and false if it should not. 00185 */ 00186 virtual bool runHeuristic(const int otherEST); 00187 00188 /** Method to compute the <i>u/v</i> hash values for a given EST. 00189 00190 This method is a utility method that was introduced to 00191 streamline the process of computing and caching <i>u/v</i> 00192 hash values for a given EST. This method uses the variables 00193 \c u, \c v, and \c wordShift (all of them user configurable) 00194 along with a ESTCodec::NormalEncoder object to compute hash 00195 values into a std::vector. The vector is added to the \c 00196 uvCache hash map for future reference. 00197 00198 \param[in] estIdx The zero-based index of the EST whose hash 00199 values is to be computed and cached. 00200 */ 00201 void computeHash(const int estIdx); 00202 00203 /** Instance variable to track if a given word (of length \c v) 00204 appears in reference EST. 00205 00206 This instance variable is created in the initialize() method 00207 to point to an array of 4<sup>v</sup> 00208 00209 */ 00210 char* s1WordMap; 00211 00212 /** Instance variable to track if a given word (of length \c v) 00213 appears in reference EST. 00214 00215 This instance variable is created in the initialize() method 00216 to point to an array of 4<sup>v</sup> 00217 */ 00218 char* s1RCWordMap; 00219 00220 /** Flag to indicate if normal or reverse-complement version 00221 provided best match. 00222 00223 This flag is set at the end of the runHeuristic method in this 00224 class to indicate if the normal or the reverse-complement 00225 check yielded the best possible match. If this flag is \c 00226 false, then the normal check yielded the best match. If this 00227 value is \c true, then the reverse-complement check yielded 00228 the best match. 00229 */ 00230 bool bestMatchIsRC; 00231 00232 /** Instance variable to maintain the \em v parameter for the 00233 <i>u/v</i> heuristic. 00234 00235 */ 00236 static int v; 00237 00238 static int wordShift; 00239 00240 static int BitMask; 00241 00242 /** Instance variable to store the number of bits to be shifted to 00243 create hash values. 00244 00245 <p>This instance variable is set to the value of 2 * (\em v - 00246 1) (in the \c initialize method) to reflect the number of bits 00247 that need to be shifted in order to build the hash values for 00248 common words (including the values stored in \c s1WordMap and 00249 \c s1RCWordMap).</p> 00250 00251 <p>This instance variable is actually passed on to the 00252 ESTCodec::NormalEncoder or ESTCodec::RevCompEncoder when 00253 computing hash values. Since this is value is passed in a 00254 template parameter, it is defined to be static (to ensure that 00255 it has external linkage as per the ISO/ANSI standards 00256 requirement).</p> 00257 */ 00258 static int bitsToShift; 00259 00260 private: 00261 /** The set of arguments specific to the UV heuristic. 00262 00263 This instance variable contains a static list of arguments 00264 that are specific only to this analyzer class. This argument 00265 list is statically defined and shared by all instances of this 00266 class. 00267 00268 \note Use of static arguments and parameters renders this UV 00269 sample heuristic class not to be MT-safe. 00270 */ 00271 static arg_parser::arg_record argsList[]; 00272 00273 static int u; 00274 00275 /** A hash map to cache hash values (<i>v</i> base pairs in 00276 length) to seedup <i>u/v</i> heuristic. 00277 00278 <p>The <i>u/v</i> heuristic \b used to generate hash values 00279 (in the \c runHeuristic method) by iterating over the base 00280 pairs in a given EST sequence. However, this approach turned 00281 out to be rather slow. Furthermore, it was observed that the 00282 same set of hash values were recomputed for various pair-wise 00283 EST comparisons.</p> 00284 00285 <p>Therefore, to improve the overall performance of the 00286 <i>u/v</i> heuristic, it was proposed that the hash values be 00287 cached to improve performance. Of course, this does increase 00288 the net amount of memory consumed. Consequently, to aid in 00289 caching only the required subset of EST hash values, this hash 00290 map was introduced to store the necessary sequences and 00291 rapidly access them when needed.</p> 00292 00293 <p>The entries in this hash map are computed by the \c 00294 computeHash method, which is invoked from the runHeuristic 00295 method.</p> 00296 */ 00297 UVHashTable uvCache; 00298 00299 /** The hint key that is used to add hint for normal or 00300 reverse-complement D2 computation. 00301 00302 This hint key is used to set a hint in the \c hints hash 00303 map. This string is defined as a constant to save compute time 00304 in the core \c runHeuristics method. 00305 */ 00306 const std::string hintKey; 00307 }; 00308 00309 #endif