00001 #ifndef MST_CACHE_H 00002 #define MST_CACHE_H 00003 00004 //-------------------------------------------------------------------- 00005 // 00006 // This file is part of PEACE. 00007 // 00008 // PEACE is free software: you can redistribute it and/or modify it 00009 // under the terms of the GNU General Public License as published by 00010 // the Free Software Foundation, either version 3 of the License, or 00011 // (at your option) any later version. 00012 // 00013 // PEACE is distributed in the hope that it will be useful, but 00014 // WITHOUT ANY WARRANTY; without even the implied warranty of 00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 // General Public License for more details. 00017 // 00018 // You should have received a copy of the GNU General Public License 00019 // along with PEACE. If not, see <http://www.gnu.org/licenses/>. 00020 // 00021 // Miami University makes no representations or warranties about the 00022 // suitability of the software, either express or implied, including 00023 // but not limited to the implied warranties of merchantability, 00024 // fitness for a particular purpose, or non-infringement. Miami 00025 // University shall not be liable for any damages suffered by licensee 00026 // as a result of using, result of using, modifying or distributing 00027 // this software or its derivatives. 00028 // 00029 // By using or copying this Software, Licensee agrees to abide by the 00030 // intellectual property laws, and all other applicable laws of the 00031 // U.S., and the terms of GNU General Public License (version 3). 00032 // 00033 // Authors: Dhananjai M. Rao raodm@muohio.edu 00034 // 00035 //--------------------------------------------------------------------- 00036 00037 #include <vector> 00038 #include <utility> 00039 #include <functional> 00040 #include <iostream> 00041 00042 #include "ESTAnalyzer.h" 00043 #include "CachedESTInfoHelper.h" 00044 00045 /** \def SMList 00046 00047 \brief Typedef for std::vector<CachedESTInfo>. 00048 00049 This typedef is a convenience definition to handle a vector of 00050 CachedESTInfo objects associated with a given EST in the MST 00051 Cache. This data type is used by the MSTClusterMaker to create a 00052 temporary list of cache entries and then add a whole bunch of 00053 entries to the MSTCache. Adding a bunch of cache entries at a 00054 time facilitates distribution of data (to various workers) and 00055 streamlines management of cached information. 00056 */ 00057 typedef std::vector<CachedESTInfo> SMList; 00058 00059 /** A cache to maintain similarity/distance metrics to build a Minimum 00060 Spanning Tree (MST). 00061 00062 <p>This class is the base class of all distributed-cache 00063 maintainers used for rapdily building a Minimum Spanning Tree 00064 (MST) for clustering. This class defines the core API for all 00065 ESTCache classes. In addition, it includes a few helper methods 00066 that provides some convenience methods to manage the caches. 00067 Furthermore, this class streamlines the operation of the Master 00068 and Worker processes when building a MST in a distributed manner. 00069 This class (and its children) essentially cache 00070 similarity/distance metrics to minimize the number of times the 00071 expensive operation of computing similarity metrics is performed 00072 via an EST analyzer. </p> 00073 */ 00074 class MSTCache { 00075 public: 00076 /** The destructor. 00077 00078 The parent class (std::vector) manages all the memory 00079 operations for this class. Consequently, the destructor does 00080 not have any special tasks to perform. It here present merely 00081 to adhere to coding conventions. 00082 */ 00083 virtual ~MSTCache() {} 00084 00085 /** Determine if a EST (given its index) is already present in the 00086 MST. 00087 00088 This method can be used to determine if a given EST has been 00089 added to the MST. This method was introduces to make the code 00090 a bit more readable at certain spots. 00091 00092 \param[in] estIdx The index of the EST to be tested. 00093 00094 \return This method returns \c true if the EST (with the given 00095 index) is present in the MST. Otherwise it returns \c false. 00096 */ 00097 inline bool isESTinMST(const int estIdx) const 00098 { return nodesAdded[estIdx]; } 00099 00100 /** Suggest to clear caches that contain entries for a newly added 00101 EST. 00102 00103 This method can be used to suggest to the cache to prune 00104 entries corresponding to the \c estIdxAdded from the various 00105 data structures maintained by this cache. The pruning 00106 operation is useful for the following two purposes: 00107 00108 <ol> 00109 00110 <li>First it removes entries from various lists corresponding 00111 to estIdxAdded (because these entries are vestigial as the 00112 node as already been added to the MST) thereby reducing memory 00113 usage. </li> 00114 00115 <li>Second, if a list becomes empty then the cache needs to be 00116 repopulated with fresh entries for further MST 00117 computations.</li> 00118 00119 </ol> 00120 00121 \note Some dervied classes may choose not to prune caches or 00122 add entries to the \c repopulateList. Consequently, this 00123 method must be viewed as a suggestion to the \c MSTCache to 00124 give it a chance to prune caches as needed. 00125 00126 \param[in] estIdxAdded The index of the EST that has just been 00127 added to the MST. 00128 00129 \param[out] repopulateList The index values of ESTs whose 00130 cache needs to be repopulated. The list of EST indexes added 00131 to this list are only the ESTs that are owned by this process. 00132 If none of the caches need to be recomputed then this list is 00133 empty (all existing entries are removed). 00134 00135 \param[in] prefixListSize If this parameter is set to \c true, 00136 then the number of entries added to the repopulateList is 00137 inserted at the beginning of the vector. This eases 00138 transmission of lists via MPI to the master process. 00139 */ 00140 virtual void pruneCaches(const int estIdxAdded, 00141 std::vector<int>& repopulateList, 00142 const bool prefixListSize = true) = 0; 00143 00144 /** Give the MSTCache a chance to preprocess lists before 00145 distributing them over the network. 00146 00147 This method is a helper method that can be used by the 00148 MSTClusterMaker to have a local metric list to be preprocessed 00149 prior to dispatching it over the network. Preprocessing a 00150 local list possibly reduces the size and distributes some of 00151 the processing overheads (thereby accelerating overall 00152 processing). 00153 00154 \note The default method implementation in the base class does 00155 absolutely nothing. 00156 00157 \param[in,out] list The list of entries that must be 00158 preprocessed. 00159 */ 00160 virtual void preprocess(SMList& UNREFERENCED_PARAMETER(list)) {} 00161 00162 /** Add/merges a batch of new cache entries with the current 00163 entries in the cache for a given EST. 00164 00165 This method merges a given batch of new cache entries with 00166 existing cache entries for the given EST. Some dervied 00167 classes may ensure that only MaxCacheSize entries are retained 00168 in the cache (retained entries \b must be chosen so that they 00169 have the best metrics). Entries in the cache are removed to 00170 accommodate new entries. 00171 00172 \note Derived classes must override this method. However, 00173 they may choose to ignore the \c MaxCacheSize suggestion and 00174 retain more entries than necessary to reduce unnecessary 00175 computations. 00176 00177 \param[in] estIdx The zero-based index of the reference EST 00178 against which the batch of cache entries in \c list were 00179 analyzed and generated. 00180 00181 \param[in] list A SMList containing the batch of new cache 00182 entries to be merged with any existing entries in the cache 00183 for the given EST (whose zero-based index is \c estIdx). 00184 */ 00185 virtual void mergeList(const int estIdx, const SMList& list) = 0; 00186 00187 /** Obtains the top-most similar entry from the MSTCache. 00188 00189 This method searches the similarity metrics stored in this 00190 cache for various ESTs to locate the best entry with the 00191 highest similarity. It populates the parameters with the 00192 appropriate value. Note that the parameters are initialized 00193 to -1, -1, -1.0f, and -1 respectively. 00194 00195 \param[out] srcESTidx The source EST index from where the 00196 similarity metric is being measured. The srcESTidx is already 00197 present in the MST. 00198 00199 \param[out] destESTidx The destination EST index that is the 00200 best choice to be added to the MST (based on the local 00201 information). 00202 00203 \param[out] metric The similarity/distance metric between the 00204 srcESTidx and the destESTidx. 00205 00206 \param[out] alignmentData The alignment data associated with 00207 the srcESTidx and the destESTidx. 00208 00209 \param[out] directionData The direction data associated with 00210 the srcESTidx and the destESTidx. 00211 */ 00212 virtual void getBestEntry(int& srcESTidx, int& destESTidx, 00213 float& metric, int& alignmentData, 00214 int& directionData) const = 0; 00215 00216 /** Display cache usage statistics. 00217 00218 This method can be used to print the current cache usage 00219 statistics. Different derived classes track and report 00220 different statistics. Consequently, the actual statistics 00221 reported by this method will vary. 00222 00223 \note Derived classes must override this method. 00224 00225 \param[out] os The output stream to which statistics must be 00226 written. 00227 00228 \param[in] MyRank The MPI rank of the process to which this 00229 cache is assocaited. This information is used merely for 00230 displaying more informative statistics and does not influence 00231 the actual values displayed by this method. 00232 */ 00233 virtual void displayStats(std::ostream &os, const int MyRank) const = 0; 00234 00235 protected: 00236 /** The constructor. 00237 00238 The MST cache requires information about the total number of 00239 ESTs in the system in order to effectively track the ESTs 00240 already added to the MST. This value must be passed in as the 00241 only parameter. 00242 00243 \param[in] totalESTCount The total number of ESTs to be 00244 processed (that is, to be added to the MST). 00245 00246 \param[in] startOwnESTidx The starting index of the EST that 00247 is owned by the process that is using this cache. This value 00248 is used to internally normalize est index values to 0 for the 00249 first EST owned by this process. 00250 00251 \param[in] numOwnESTs The number of ESTs owned by the process 00252 that is using this cache. This information is used to reserve 00253 necessary space to hold SMLists for each EST owned by the 00254 manager/worker process. 00255 00256 \param[in] analyzer The analyzer to be used for EST 00257 comparisons, comparing metrics, and ordering entries in this 00258 cache depending on the type of EST analyzer (whether it is 00259 similarity based or distance metric based). 00260 00261 \param[in] repopulateCaches If this parameter is \c true then 00262 the MSTCache will request lists to be repopulated when 00263 needed. Repopulating lists guarantees that ultimately a MST 00264 will be developed. If repopulation is suppressed then the 00265 resulting spanning tree may not be a MST. 00266 00267 \param[in] maxCachePerEST This parameter \b suggests the 00268 maximum number of cached entries that must be maintained for a 00269 given EST. This parameter may not be really used by all 00270 dervied classes (this is only a suggestion). 00271 */ 00272 MSTCache(const int totalESTCount, const int startOwnESTidx, 00273 const int numOwnESTs, const ESTAnalyzer *analyzer, 00274 const bool repopulateCaches, const int maxCachePerEST); 00275 00276 /** Utility method to copy first n-entries from input to output SMList. 00277 00278 This is a rather straightforward method that can be used to 00279 copy the first \c cout entries from the \c input list and 00280 append them to the \c output list. 00281 00282 \param[in] input The list of entries that must be copied to 00283 the \c output list. 00284 00285 \param[in] count The number of entries to be copied. If this 00286 value is greater then input.size(), then only input.size() 00287 entries are copied. 00288 00289 \param[out] output The output list to which the entries are to 00290 be copied. 00291 */ 00292 static void copy_n(const SMList& input, const size_t count, 00293 SMList &output); 00294 00295 /** The analyzer to be used for any EST comparison. 00296 00297 This pointer is initialized in the constructor to point to 00298 the EST analyzer that is used for EST comparisons. This 00299 pointer is essentially used for comparing EST metrics for 00300 ordering information in the cache. 00301 */ 00302 const ESTAnalyzer* const analyzer; 00303 00304 /** Bit-vector to determine if a given node has been added to the 00305 MST. 00306 00307 This vector is used to track if a given EST node has been been 00308 added to the MST. This std::vector<bool> is a specialization 00309 that optimizes memory usage. So this member should not occupy 00310 much memory. The maximum number of nodes is set in the 00311 constructor and all entries are initialized to false. The 00312 pruneCache() method sets the suitable entry to \c true after 00313 the EST has been added to the MST. The isESTinMST() method 00314 uses this vector. 00315 */ 00316 std::vector<bool> nodesAdded; 00317 00318 /** The index of the first EST whose information is cached. 00319 00320 The starting index of the EST that is owned by the process 00321 that is using this cache. This value is used by some derived 00322 classes to internally normalize EST index values to zero for 00323 the first EST owned by this process. This value is 00324 initialized in the constructor and is never changed during the 00325 lifetime of this object. 00326 */ 00327 const int startOwnESTidx; 00328 00329 /** The number of ESTs that this cache logically owns and manages. 00330 00331 This instance variable maintains the number of ESTs that are 00332 logically owned and managed by this cache. This value is 00333 initialized in the constructor and is never changed during the 00334 lifetime of this object. 00335 */ 00336 const int numOwnESTs; 00337 00338 /** The \b suggested maximum number of cached entries per EST. 00339 00340 This instance variable is a \b suggested maximum number of 00341 cached entries that must be maintained for a given EST. This 00342 parameter may not be really used by all dervied classes (as 00343 this is only a suggestion). 00344 */ 00345 const int maxCachePerEST; 00346 00347 /** Flag to control if caches must be repopulated. 00348 00349 <p> If this parameter is \c true then the MSTCache will 00350 request lists to be repopulated when needed. Repopulating 00351 lists guarantees that ultimately a MST will be developed. If 00352 repopulation is suppressed then the resulting spanning tree 00353 may not be a MST. </p> 00354 00355 <p> This value is set in the constructor and is never changed 00356 during the life time of this object. </p> 00357 00358 */ 00359 const bool repopulateCaches; 00360 00361 /** A dummy operator= 00362 00363 The operator=() is supressed for this class as it has constant 00364 members whose value is set when the object is created. These 00365 values cannot be changed during the lifetime of this object. 00366 00367 \param[in] src The source object from where data is to be 00368 copied. Currently this value is ignored. 00369 00370 \return Reference to this. 00371 */ 00372 MSTCache& operator=(const MSTCache& src); 00373 }; 00374 00375 #endif