00001 #ifndef MST_CLUSTER_MAKER_H 00002 #define MST_CLUSTER_MAKER_H 00003 00004 //-------------------------------------------------------------------- 00005 // 00006 // This file is part of PEACE. 00007 // 00008 // PEACE is free software: you can redistribute it and/or modify it 00009 // under the terms of the GNU General Public License as published by 00010 // the Free Software Foundation, either version 3 of the License, or 00011 // (at your option) any later version. 00012 // 00013 // PEACE is distributed in the hope that it will be useful, but 00014 // WITHOUT ANY WARRANTY; without even the implied warranty of 00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 // General Public License for more details. 00017 // 00018 // You should have received a copy of the GNU General Public License 00019 // along with PEACE. If not, see <http://www.gnu.org/licenses/>. 00020 // 00021 // Miami University makes no representations or warranties about the 00022 // suitability of the software, either express or implied, including 00023 // but not limited to the implied warranties of merchantability, 00024 // fitness for a particular purpose, or non-infringement. Miami 00025 // University shall not be liable for any damages suffered by licensee 00026 // as a result of using, result of using, modifying or distributing 00027 // this software or its derivatives. 00028 // 00029 // By using or copying this Software, Licensee agrees to abide by the 00030 // intellectual property laws, and all other applicable laws of the 00031 // U.S., and the terms of GNU General Public License (version 3). 00032 // 00033 // Authors: Dhananjai M. Rao raodm@muohio.edu 00034 // 00035 //--------------------------------------------------------------------- 00036 00037 #include "ClusterMaker.h" 00038 #include "MSTCache.h" 00039 #include "MSTCluster.h" 00040 #include "MST.h" 00041 00042 #include <fstream> 00043 00044 /** A Minimum Spanning Tree (MST) based parallel cluster maker. 00045 00046 This class encapsulates the core functionality needed to construct 00047 a MST-based EST clusters in a parallel/distributed manner using 00048 the Message Passing Interface (MPI) library. This class includes 00049 functionality for both the Manager (MPI Rank == 0) and Worker (MPI 00050 Rank > 0) processes. Necessary functionality to distinguish and 00051 operate either as Manager or Worker is already built into the 00052 class. This class uses the MSTCache and MSTCluster classes to 00053 help in performing the various activities. Refer to the 00054 documentation on the various method for detailed description on 00055 their functionality and usage. 00056 */ 00057 class MSTClusterMaker : public ClusterMaker { 00058 friend class ClusterMakerFactory; 00059 public: 00060 /** The set of tags exchanged between various processes. 00061 00062 This enum provides meanigful names to the various tags 00063 (integers) exchanged between the master and worker processes 00064 participating in the construction of a MST in a 00065 parallel/distributed manner. 00066 */ 00067 enum MessageTags{REPOPULATE_REQUEST, COMPUTE_SIMILARITY_REQUEST, 00068 SIMILARITY_LIST, SIMILARITY_COMPUTATION_DONE, 00069 COMPUTE_MAX_SIMILARITY_REQUEST, MAX_SIMILARITY_RESPONSE, 00070 ADD_EST, TRANSITIVITY_LIST, COMPUTE_TOTAL_ANALYSIS_COUNT}; 00071 00072 /** The destructor. 00073 00074 The destructor frees up all any dynamic memory allocated by 00075 this object for its operations. 00076 */ 00077 virtual ~MSTClusterMaker(); 00078 00079 /** Display valid command line arguments for this cluster maker. 00080 00081 This method must be used to display all valid command line 00082 options that are supported by this cluster maker (and its base 00083 classes). 00084 00085 \note This method calls the base class's showArguments first. 00086 00087 \param[out] os The output stream to which the valid command 00088 line arguments must be written. 00089 */ 00090 virtual void showArguments(std::ostream& os); 00091 00092 /** Process command line arguments. 00093 00094 This method is used to process command line arguments specific 00095 to this cluster maker. This method is typically used from the 00096 main method just after the cluster maker has been 00097 instantiated. This method consumes all valid command line 00098 arguments. If the command line arguments were valid and 00099 successfully processed, then this method returns \c true. 00100 00101 \note This method consumes its custom command line arguments 00102 first and then call's the base class's parseArguments() method. 00103 00104 \param[in,out] argc The number of command line arguments to be 00105 processed. 00106 00107 \param[in,out] argv The array of command line arguments. 00108 00109 \return This method returns \c true if the command line 00110 arguments were successfully processed. Otherwise this method 00111 returns \c false. 00112 */ 00113 virtual bool parseArguments(int& argc, char **argv); 00114 00115 /** Method to begin clustering. 00116 00117 This method must be used to create clusters based on a given 00118 EST analysis method. This method performs the following 00119 tasks: 00120 00121 */ 00122 virtual int makeClusters(); 00123 00124 /** Method to display performance statistics. 00125 00126 This method overrides the empty implementation in the base 00127 class to display statistics on cache usage and MPI calls for 00128 tracking and reporting the performance and behavior of this 00129 class. 00130 00131 \param[out] os The output stream to which the statistics must 00132 be written. 00133 */ 00134 virtual void displayStats(std::ostream& os); 00135 00136 /** Add a dummy cluster to the cluster maker. 00137 00138 This method can be used to add a dummy cluster to the cluster 00139 maker. The dummy clusters are added as direct descendants of 00140 the \c root cluster with the given name. 00141 00142 \note This method is currently used by the Filter hierarchy to 00143 add ESTs that are logically filtered out and must not be part 00144 of the core clustering process. 00145 00146 \param[in] name A human readable name to be set for this 00147 cluster. No special checks are made on the contents of the 00148 string. 00149 00150 \return This method returns the ID value set for the newly 00151 added dummy index. 00152 */ 00153 virtual int addDummyCluster(const std::string name); 00154 00155 /** Add a EST directly to a given cluster. 00156 00157 This method can be used to add an EST directly to a 00158 cluster. This bypasses any traditional mechanism and directly 00159 adds the EST to the specified cluster. 00160 00161 \note The EST is added with an invalid metric value. ESTs 00162 added to a cluster are not included in the standard clustering 00163 process. Adding an EST that has already been added to the 00164 same/another cluster results in undefined behaviors. 00165 00166 \param[in] clusterID The unique ID of the cluster to which the 00167 EST is to be added. This value must have been obtained from an 00168 earlier (successful) call to the ClusterMaker::addDummyCluster 00169 method. 00170 00171 \param[in] estIdx The EST to be added to the given 00172 cluster. Once the EST has been added to this cluster it will 00173 not be included in the clustering process performed by this 00174 cluster maker. 00175 */ 00176 virtual void addEST(const int clusterID, const int estIdx); 00177 00178 protected: 00179 /** Variable to indicate per-EST similarity cache size. 00180 00181 This variable is used to indicate the number of similarity 00182 metrics that must be cached for a given EST. This value is 00183 initialized to 128. The value is changed by the 00184 parseArguments() method if the user has specified an option to 00185 override the default. 00186 */ 00187 static int cacheSize; 00188 00189 /** Variable to indicate if strict ordering of worker Ranks must 00190 be followed. 00191 00192 <p> If this member variable is \c true, then messages 00193 dispatched by workers and the manager are always read in a 00194 fixed order of increasing ranks. That is, messages from rank 00195 0 (zero) are processed first, then messages from process with 00196 rank 1, so on and so forth. On the other hand if this 00197 variable is \c false, then messages are processed in the order 00198 they are received. </p> 00199 00200 <p> The strictOrder approach guarantees consistent results for 00201 each run (involving the same number of processes) and the 00202 resulting MSTs are alll identical. However, a process may 00203 have to wait (idle wasting time) until a message from the 00204 appropriate process (with a given rank) is actually received. 00205 This may slow down the overall computational rate, 00206 particularly when the work load get's skewed toward the end of 00207 MST construction.</p> 00208 00209 <p> On the other hand, if strictOrder is relaxed (by setting 00210 strictOrder variable to \c false) then messsages are processed 00211 as soon as they are received, in the order in which messages 00212 arrive. This approach minimizes wait times. However, the MST 00213 constructed between multiple runs may not be identical as 00214 equidistant (or nodes with same similarity metrics) nodes may 00215 be processed in different order. Reordering of equidistant 00216 nodes occur because in this mode a total order is not enfored 00217 and only a partial order of nodes is performed.</p> 00218 00219 <p> By default strictOrder is enabled. However, the value can 00220 be changed by the user through command line arguments. The 00221 change of value occurs in the parseArguments() method if the 00222 user has specified an option to override the default. 00223 */ 00224 static bool strictOrder; 00225 00226 /** Command line option to avoid the clustering phase. 00227 00228 If this member variable is \c true, then this class only 00229 generates MST information and does not do clustering. By 00230 default this variable is initialized to \c false. However, 00231 the value can be changed by the user through command line 00232 arguments. The change of value occurs in the parseArguments() 00233 method if the user has specified an option to override the 00234 default. 00235 */ 00236 static bool dontCluster; 00237 00238 /** Command line option to print a pretty cluster tree. 00239 00240 If this member variable is \c true, then this class prints a 00241 pretty ASCII tree with the cluster information By default this 00242 variable is initialized to \c false. However, the value can 00243 be changed by the user through command line argument 00244 (--pretty-print). The change of value occurs in the 00245 parseArguments() method if the user has specified an option to 00246 override the default. 00247 */ 00248 static bool prettyPrint; 00249 00250 /** Command line option to print the cluster tree for PEACE GUI. 00251 00252 If this member variable is \c true, then this class prints the 00253 ClusterTree in a format that is processible by the GUI. By 00254 default this variable is initialized to \c false. However, 00255 the value can be changed by the user through command line 00256 argument (--gui-print). The change of value occurs in the 00257 parseArguments() method if the user has specified an option to 00258 override the default. 00259 */ 00260 static bool guiPrint; 00261 00262 /** Variable to indicate if MST information must be simply read 00263 from a given file. 00264 00265 This member variable is used to hold the name of the file 00266 (with full path) from where MST information must be read. 00267 This instance variable is initialized to NULL. However, if 00268 the input MST file is specified then MST building is skipped 00269 and MST data read from the file is used for further processing. 00270 */ 00271 static char* inputMSTFile; 00272 00273 /** Variable to indicate if MST information must be written to a 00274 given file. 00275 00276 This member variable is used to hold the name of the file 00277 (with full path) to which MST information must be written. 00278 This instance variable is initialized to NULL. However, if 00279 the output MST file is specified then MST data built by this 00280 program is written to the specified file. 00281 */ 00282 static char* outputMSTFile; 00283 00284 /** Command line option to suppress cache repopulation. 00285 00286 <p> If this member variable is \c true, then this class does 00287 not repopulate caches once a EST cache becomes empty. By 00288 default this variable is initialized to \c false. However, 00289 the value can be changed by the user through command line 00290 argument (--no-cache-repop). The change of value occurs in 00291 the parseArguments() method if the user has specified an 00292 option to override the default.</p> 00293 00294 <p> If this parameter is not specified then the MSTCache will 00295 request lists to be repopulated when needed. Repopulating 00296 lists guarantees that ultimately a MST will be developed. If 00297 repopulation is suppressed via this parameter then the 00298 resulting spanning tree may not be a MST; however computation 00299 time decreases. </p> 00300 */ 00301 static bool noCacheRepop; 00302 00303 /** Command line option to enable maximum use of precomputed 00304 scores for building MST. 00305 00306 <p>If this member variable is set to a value other than -1, 00307 then the MSTClusterMaker will try to use all the ESTs that 00308 have a metric better than the value specified for 00309 maxUse. Maximally using good metrics will ultimately reduce 00310 the total number of analysis that need to be performed, 00311 thereby reducing overall time for clustering.</p> 00312 */ 00313 static int maxUse; 00314 00315 /** Command line option to set the clustering threshold to be 00316 used in deriving clusters from the MST. 00317 00318 This member variable is used to set the clustering threshold. 00319 There are two "special" options here: 00320 00321 1.0 -- Corresponds to the TwoPassD2 analyzer which uses different 00322 window lengths, each with different thresholds. If TwoPassD2 00323 is used then clsThreshold must be set to 1.0. 00324 00325 -1 -- Corresponds to the mean/variance-based threshold. Intended 00326 to be used with the CLU analyzer. 00327 */ 00328 static float clsThreshold; 00329 00330 /** Command line option to set the type of cache to be used by 00331 PEACE. 00332 00333 This member variable is used to indicate the type of cache 00334 that must be used to store metrics to facilitate rapid 00335 construction of the MST. The default cache used in the 00336 MSTHashCache indicated by the cacheType set to \c "hash". The 00337 alternative cache in the MSTMultiListCache (indicated by 00338 cacheType value of \c "mlist"). The user may override the 00339 default using the command line parameter \c --cacheType. 00340 */ 00341 static char* cacheType; 00342 00343 /** Name of file to report progress in during MST construction. 00344 00345 This command line argument provides the name of the log file 00346 where progress information is to be written. The progress 00347 information is in the form: \#estsProcessed, \#ests. This 00348 value is specified via a command line argument. 00349 */ 00350 static char *progFileName; 00351 00352 /** Helper method to perform manager tasks. 00353 00354 This method has been introduced to streamline the operations 00355 of the MSTClusterMaker when it operates as the manager. The 00356 MPI process with Rank 0 (zero) acts as the manager and 00357 coordinates all the activities of the MSTClusterMaker. This 00358 method is invoked from the makeClusters() method. 00359 00360 \return This method returns 0 (zero) if clusters were created 00361 successfully. Otherwise this method returns a non-zero value 00362 indicating an error. 00363 */ 00364 virtual int manager(); 00365 00366 /** Helper method to perform worker tasks. 00367 00368 This method has been introduced to streamline the operations 00369 of the MSTClusterMaker when it operates as a worker. All the 00370 MPI processes with non-zero rank act as a worker and 00371 collaborate with the manager to assist in various activities 00372 of the MSTClusterMaker. This method is invoked from the 00373 makeClusters() method. 00374 00375 \return This method returns 0 (zero) if clusters were created 00376 successfully. Otherwise this method returns a non-zero value 00377 indicating an error. 00378 */ 00379 virtual int worker(); 00380 00381 /** A method to handle initialization tasks for the MSTClusterMaker. 00382 This method is called after the ESTs have been loaded into the 00383 ESTAnalyzer, in the makeClusters method. 00384 00385 This method does nothing as the MSTClusterMaker does not need 00386 to do any initialization. It is provided as a convenience 00387 method for inheriting subclasses to use for initialization. 00388 */ 00389 virtual int initialize(); 00390 00391 /** Helper method to generate (or compute) or load MST data from file. 00392 00393 This is a helper method that is invoked from the 00394 makeClusters() method. This method performs one of the 00395 following tasks: 00396 00397 <ul> 00398 00399 <li>If an #inputMSTFile has not been specified, then this 00400 method builds an MST (either on one process or many MPI 00401 processes). This method first creates a local cache local 00402 cache that contains information to build the MST. It then 00403 builds the MST calling the manager() or worker() method 00404 depending on the MPI-rank of this process.</li> 00405 00406 <li>If an #inputMSTFile has indeed been specified as a command 00407 line parameter, then this method loads the MST from the 00408 specified MST file.</li> 00409 00410 </ul> 00411 00412 \return This method returns zero on success indicating that a 00413 valid MST is available for further processing. Otherwise this 00414 method returns a non-zero code signalling error. 00415 */ 00416 virtual int populateMST(); 00417 00418 /** Utility method to do the final clustering step. 00419 00420 This is a refactored (primarily to keep the code clutter to a 00421 minimum) utility method that is used to perform the final step 00422 in clustering. This method essentially calls the 00423 #MSTCluster::makeClusters method that builds the clusters 00424 using the MST. Once the clusters are built, this method dumps 00425 the cluster information to the user-specified (via command 00426 line arguments) output stream. 00427 00428 \return This method returns zero on success. If errors occur, 00429 this method returns a non-zero error code. 00430 */ 00431 virtual int buildAndShowClusters(); 00432 00433 /** Helper method to call the actual heavy-weight analysis 00434 method(s). 00435 00436 This is a helper method that is invoked from the 00437 populateCache() method to obtain the relationship metric 00438 (either via CLU or d2) between the current parent EST and the 00439 given otherEST. This method was introduced to enable chlid 00440 classes (such as TransMSTClusterMaker) to conveniently 00441 intercept analyzer calls and potentially shortcircuit them 00442 using concepts of conditional-transitivity. 00443 00444 \param[in] otherEST The index of the other EST to which the 00445 metric is required. 00446 00447 \return This method returns a similarity/distance metric by 00448 comparing the ESTs. This method may return -1, if the otherEST 00449 is significantly different from the reference EST (possibly 00450 warranting no further analysis) that a meanigful metric cannot 00451 be generated. 00452 */ 00453 virtual float analyze(const int otherEST); 00454 00455 /** Computes sends/receives similarity list for a given EST. 00456 00457 This method is a shared method that is used by both the 00458 manager and workers. This method is used to compute the 00459 similarity metric and cache the highest set of similarity 00460 metrics. This method operates as follows: 00461 00462 <ol> 00463 00464 <li>Each process computes a subset of the EST similarity 00465 metric in the range \em k*Rank < otherEstIdx < (\em k+1)*Rank, 00466 where k=estList.size() / MPI::COMM_WORLD.Get_size(), and Rank 00467 is the MPI rank of this process. </li> 00468 00469 <li>If this process is the cache owner for the est, (that is, 00470 estIdx % Rank == 0), then it receives data from other 00471 processes and merges the information with its own list, 00472 retaining the top-most similarity metrics.</li> 00473 00474 <li>If this process is \b not the cache owner for the est, 00475 (that is, estIdx % Rank != 0), then it sends the computed 00476 similarity metrics to the owner process. <li> 00477 00478 </ol> 00479 00480 \param[in] estIdx The index of the EST that was just added to 00481 the MST and for which the adjacent neighbors need to be 00482 determined. 00483 00484 \param[out] metricList If this pointer is not NULL, then this 00485 vector is populated with the set of metrics that were computed 00486 for estIdx <b>only on the owner process</b>. This list 00487 contains the metrics collated from all the processes 00488 participating in the distributed computing process. Currently, 00489 this feature is used by TransMSTClusterMaker to obtain the 00490 list of metrics computed. 00491 */ 00492 virtual void populateCache(const int estIdx, SMList* metricList = NULL); 00493 00494 /** Method to generate progress logs (if requested by user). 00495 00496 This method is a helper method that is called from the core 00497 manager() method loop to generate progress logs as ESTs are 00498 analyzed and updated. This method cuts logs only if the 00499 progFileName comamnd line argument was specified and the 00500 progress file could be created. 00501 00502 \note The progress file is opened only once, first time this 00503 method is called. So this method should not hurt performance 00504 by much. 00505 00506 \param[in] estsAnalyzed The number of ESTs analyzed thus far. 00507 00508 \param[in] totalESTcount The total number of ESTs to be 00509 analyzed. 00510 */ 00511 void updateProgress(const int estsAnalyzed, const int totalESTcount); 00512 00513 /** Helper method in Manager process to update distributed caches. 00514 00515 This is a helper method that is used only in the Manager 00516 process to perform the following tasks using the newly added 00517 estIdx value: 00518 00519 <ol> 00520 00521 <li>First, this method broadcasts the newly added EST index 00522 (\c estIdx) to all the workers.</li> 00523 00524 <li>Next it prunes it local cache via the 00525 MSTCache::pruneCaches() method.</li> 00526 00527 <li>It then collects requests to repopulate specific caches 00528 from all the workers.</li> 00529 00530 <li>It then adds the newly created est to the list of caches 00531 to be repopulated and broadcasts request to repopulate caches 00532 to each worker and participates in cache repopulation task by 00533 calling the populateCache() method.</li> 00534 00535 </ol> 00536 00537 \param[in] estIdx The index of the newly added EST. 00538 00539 \param[in] refreshEST If this flag is \c true (the default 00540 value), then the neighbors for the newly added EST (specified 00541 by estIdx) are computed and the caches are updated. 00542 00543 \return This method returns 0 on success or an suitable error 00544 code on failure. 00545 */ 00546 int managerUpdateCaches(int estIdx, const bool refreshEST = true); 00547 00548 /** Helper method in \b Manager process to collaboratively compute 00549 the next EST to be added to the MST. 00550 00551 This is a helper method that is used only in the Manager 00552 process to perform the following tasks using the newly added 00553 estIdx value: 00554 00555 <ol> 00556 00557 <li>First, this method sends request to compute the best local 00558 choice to each of the worker processes.</li> 00559 00560 <li>Next it computes its own local (at the Manager's end) best 00561 choice for the next EST node to be added.</li> 00562 00563 <li>It then collects response for best local choice from each 00564 worker process and tracks the best reported value.</li> 00565 00566 </ol> 00567 00568 \param[out] parentESTidx The source EST index from where the 00569 similarity metric is being measured. The srcESTidx is already 00570 present in the MST. 00571 00572 \param[out] estToAdd The destination EST index that is the 00573 best choice to be added to the MST (based on the local 00574 information). 00575 00576 \param[out] similarity The similarity metric between the 00577 srcESTidx and the destESTidx. 00578 00579 \param[out] alignmentData The alignment information between 00580 the two ESTs represented by their index values in parentESTidx 00581 and estToAdd. 00582 00583 \param[out] directionData The direction information between 00584 the two ESTs represented by their index values in parentESTidx 00585 and estToAdd. 00586 */ 00587 void computeNextESTidx(int& parentESTidx, int& estToAdd, 00588 float &similarity, int& alignmentData, 00589 int& directionData) const; 00590 00591 /** Determine the owner process Rank for a given estIdx. 00592 00593 This method is a convenience method to determine the Rank of 00594 the process that logically owns a given EST. The owning 00595 process is responsible for maintaining the cache for a given 00596 EST. The owners are assigned in a simple fashion and ESTs are 00597 evenly divided up amongst all the processes. 00598 00599 \param[in] estIdx The index of the EST whose owner process's 00600 rank is requested. It is assumed that the estIdx is valid. 00601 If invalid EST index values are supplied then the operation of 00602 this method is undefined. 00603 00604 \note This method must be invoked only after MPI::Intialize() 00605 has beeen called and the ESTs to be processed have be loaded 00606 (so that EST::getESTList() returns a valid list of ESTs). 00607 00608 00609 \return The rank of the owner process for the given estIdx. 00610 */ 00611 int getOwnerProcess(const int estIdx) const; 00612 00613 /** Helper method to compute the start and ending indexes of the 00614 EST that this process owns. 00615 00616 This method was introduced to keep the math and logic clutter 00617 involved in computing the list of owned ESTs out of the 00618 methods that use the information. This method returns the 00619 range, such that: \c startIndex <= \em ownedESTidx < \c 00620 endIndex. 00621 00622 00623 \note This method must be invoked only after MPI::Intialize() 00624 has beeen called and the ESTs to be processed have be loaded 00625 (so that EST::getESTList() returns a valid list of ESTs). 00626 00627 \param[out] startIndex The starting (zero-based) index value 00628 of the contiguous range of ESTs that this process owns. 00629 00630 \param[out] endIndex The ending (zero-based) index value of 00631 the contiguous range ESTs that this process owns. The value 00632 returned in this parameter is \b not included in the range of 00633 values. 00634 */ 00635 void getOwnedESTidx(int& startIndex, int& endIndex); 00636 00637 /** Helper method for a worker process. 00638 00639 This method is invoked from the worker() method to receive and 00640 process various requests from the manager process. This 00641 method currently handles the following requests: 00642 00643 <ul> 00644 00645 <li>\c COMPUTE_SIMILARITY_REQUEST : Computes the subset of the 00646 similarity metric for the given EST index and returns the 00647 partial list back to the owner process.</li> 00648 00649 <li> \c COMPUTE_MAX_SIMILARITY_REQUEST : Computes the highest 00650 similarity value between all the ESTs on this cluster and 00651 returns the top entry back to the manager. Once this request 00652 has been processed this method returns control back. 00653 00654 </ul> 00655 */ 00656 void workerProcessRequests(); 00657 00658 /** Distribute data and tag to all the workers. 00659 00660 This method provides a convenient mechanism to broadcast a 00661 given integer data and tag to all the workers. 00662 00663 \param[in] data The integer to be sent to each and every 00664 worker. 00665 00666 \param[in] tag The message tag to be sent to each and every 00667 process. 00668 */ 00669 void sendToWorkers(int data, const int tag) const; 00670 00671 /** Method to detect if a given SMList has at least one, valid 00672 entry. 00673 00674 This method is used (in the populateCache()) to determine if a 00675 given SMList has at least one valid entry. This method is 00676 useful particularly when a empty SMList is received from a 00677 remote process and in this case there ine one entry in the 00678 SMList (-1, -1). 00679 00680 \param[in] list The list to check if it has a valid entry. 00681 */ 00682 bool hasValidSMEntry(const SMList& list) const; 00683 00684 /** Helper method to distribute index of newly added EST to all 00685 workers and gather cache repopulation requests. 00686 00687 This is a helper method that was added to streamline the code 00688 in managerUpdateCaches method. This method performs the 00689 following tasks: 00690 00691 <ol> 00692 00693 <li>First it uses the \c sendToWorkers() method to distribute 00694 the \c estIdx (parameter) value to all the workers. </li> 00695 00696 <li> Next it prunes the local caches on the manager.</li> 00697 00698 <li>It then obtains repopulation requests from each worker and 00699 places EST indexes to be repopulated in the repoulateList 00700 parameter.</li> 00701 00702 </ol> 00703 00704 \note This method must be inovked only on the manager. 00705 00706 \param[in] estIdx The index of the newly added EST that must 00707 be distributed to all the workers. 00708 00709 \param[out] repopulateList A vector that will contain the list 00710 of ESTs that need to be repopulated (based on requests 00711 received from various workers). 00712 */ 00713 void estAdded(const int estIdx, std::vector<int>& repopulateList); 00714 00715 /** Helper method in \b Manager process to add as many child nodes 00716 as possible for the given parent. 00717 00718 This is a helper method that is used only in the Manager 00719 process <b>only when the \c maxUse parameter is != -1</b>. 00720 This method tries to add more children rooted at the given 00721 parent to the MST as long as the metric is better than \c 00722 maxUse value. This method operates as follows: 00723 00724 <ol> 00725 00726 <li>First, this method sends request to compute the best local 00727 choice to each of the worker processes.</li> 00728 00729 <li>Next it computes its own local (at the Manager's end) best 00730 choice for the next EST node to be added.</li> 00731 00732 <li>It then collects response for best local choice from each 00733 worker process and tracks the best reported value.</li> 00734 00735 <li>If the next best entry is still rooted at this parent and 00736 the metric is better than \c maxUse then the EST is added to 00737 MST and the process is repeated from step 1. Otherwise, the 00738 parameters are updated to the last added EST and the method 00739 returns.</li> 00740 00741 </ol> 00742 00743 \param[in] parentESTidx The source EST index from where the 00744 similarity metric is being measured. The parentESTidx is 00745 already present in the MST. 00746 00747 \param[in,out] estToAdd The EST that has just been added to 00748 the MST. This method updates this value if additional ESTs 00749 are added to the MST by this method. 00750 00751 \param[in,out] metric The similarity/distance metric between 00752 the parentESTidx and the estToAdd. This method updates this 00753 value if additional ESTs are added to the MST by this method. 00754 00755 \param[in,out] alignmentData The alignment information between 00756 the two ESTs represented by their index values in parentESTidx 00757 and estToAdd. This method updates this value if additional 00758 ESTs are added to the MST by this method. 00759 00760 \param[in,out] directionData The direction information between 00761 the two ESTs represented by their index values in parentESTidx 00762 and estToAdd. This method updates this value if additional 00763 ESTs are added to the MST by this method. 00764 00765 \param[in,out] pendingESTs The number of pending ESTs that 00766 have not yet been added to the MST. This value is used and 00767 udpated by this method each time it adds a EST. 00768 */ 00769 void addMoreChildESTs(const int parentESTidx, int& estToAdd, 00770 float &metric, int& alignmentData, 00771 int& directionData, int& pendingESTs); 00772 00773 /** The default constructor. 00774 00775 The default constructor for this class. The constructor is 00776 made private so that this class cannot be directly 00777 instantiated. However, since the ClusterMakerFactory is a 00778 friend of this class, an object can be instantiated via the 00779 ClusterMakerFactory::create() method. 00780 00781 \param[in,out] analyzer The EST analyzer to be used for 00782 obtaining similarity metrics between two ESTs. This parameter 00783 is simply passed onto the base class. 00784 00785 \param[in] refESTidx The reference EST index value to be used 00786 to root the spanning tree created by this method. This 00787 parameter should be >= 0. This value is simply passed onto 00788 the base class. 00789 00790 \param[in] outputFile The name of the output file to which the 00791 raw MST cluster information is to be written. If this 00792 parameter is the empty string then output is written to 00793 standard output. This value is simply passed onto the base 00794 class. 00795 */ 00796 MSTClusterMaker(ESTAnalyzer *analyzer, const int refESTidx, 00797 const std::string& outputFile); 00798 00799 /** The set of common arguments for the MST cluster maker. 00800 00801 This instance variable contains a static list of arguments 00802 that are common all the MST cluster maker objects. 00803 */ 00804 static arg_parser::arg_record argsList[]; 00805 00806 /** The cache that holds similarity metrics for MST construction. 00807 00808 This object is used to cache the similarity metrics for all 00809 ESTs that are owned by this process (that is, estIdx % Rank == 00810 0, where Rank is the MPI rank of this process). The cache 00811 contains similarity metrics to facilitate rapid construction 00812 of the MST. Both the manager and worker processes have their 00813 own caches and manage them independently. This spreads out 00814 the memory requirement for the caches across multiple 00815 processes enabling large (in 10s of GB) caches. 00816 00817 The cache is created just before the clustering process 00818 commences and is deleted immediately after the clustering 00819 process (to minimize memory footprint). 00820 */ 00821 MSTCache *cache; 00822 00823 /** File stream to log progress information. 00824 00825 This output stream is created when the first progress information 00826 is logged and closed after the last progress information has been 00827 logged. The progress information is generated by the updateProgress 00828 method if progressFileName is not NULL. 00829 */ 00830 std::ofstream progressFile; 00831 00832 private: 00833 /** The Minimum Spanning Tree (MST) built by this class. 00834 00835 This instance variable holds a pointer to the MST created by 00836 this class when it operates as a manager process. This 00837 pointer is initialized to NULL and a MST is created in the 00838 manager() method. 00839 */ 00840 MST* mst; 00841 00842 /** The top-level root cluster that contains all other clusters. 00843 00844 This member represents the top-level root cluster that contain 00845 all other clusters created by this cluster maker. This cluster 00846 also contains dummy clusters that are created by Filter 00847 objects used in conjunction with clustering. 00848 */ 00849 MSTCluster root; 00850 }; 00851 00852 #endif