00001 #ifndef TRANS_MST_CLUSTER_MAKER_H 00002 #define TRANS_MST_CLUSTER_MAKER_H 00003 00004 //-------------------------------------------------------------------- 00005 // 00006 // This file is part of PEACE. 00007 // 00008 // PEACE is free software: you can redistribute it and/or modify it 00009 // under the terms of the GNU General Public License as published by 00010 // the Free Software Foundation, either version 3 of the License, or 00011 // (at your option) any later version. 00012 // 00013 // PEACE is distributed in the hope that it will be useful, but 00014 // WITHOUT ANY WARRANTY; without even the implied warranty of 00015 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00016 // General Public License for more details. 00017 // 00018 // You should have received a copy of the GNU General Public License 00019 // along with PEACE. If not, see <http://www.gnu.org/licenses/>. 00020 // 00021 // Miami University makes no representations or warranties about the 00022 // suitability of the software, either express or implied, including 00023 // but not limited to the implied warranties of merchantability, 00024 // fitness for a particular purpose, or non-infringement. Miami 00025 // University shall not be liable for any damages suffered by licensee 00026 // as a result of using, result of using, modifying or distributing 00027 // this software or its derivatives. 00028 // 00029 // By using or copying this Software, Licensee agrees to abide by the 00030 // intellectual property laws, and all other applicable laws of the 00031 // U.S., and the terms of GNU General Public License (version 3). 00032 // 00033 // Authors: Dhananjai M. Rao raodm@muohio.edu 00034 // 00035 //--------------------------------------------------------------------- 00036 00037 #include "MSTClusterMaker.h" 00038 #include "TransCacheEntry.h" 00039 00040 #define NO_ERROR 0 00041 #define ERROR_NO_HEURISTIC 1 00042 00043 /** \typedef std::vector<int, TransCacheEntry> MetricCacheMap 00044 00045 \brief A shortcut to refer to a vector of TransCacheEntry objects. 00046 00047 This typedef provides a convenient shortcut to refer to a vector 00048 containing the information to compute metrics using 00049 conditional-transitivity. The index for this vector is the index 00050 of the EST to which a CachedESTInfo corresponds to. This index is 00051 the value in CachedESTInfo::estIdx. 00052 */ 00053 typedef std::vector<TransCacheEntry*> MetricCacheMap; 00054 00055 /** A Minimum Spanning Tree (MST) based parallel cluster maker that 00056 uses conditional-transitivity relations to accelerate clustering. 00057 00058 This class aims to enhance the performance (without \em 00059 signficiantly impacting quality of clusters) of the standard 00060 MSTClusterMaker by minimizing the number of the heavy weight 00061 analysis (such as: d2 or clu) performed for building the MST. 00062 Reduction in the number of required heavy weight analyses is 00063 achieved by applying the concept of transitivity in a conditional 00064 manner as follows: 00065 00066 <ol> 00067 00068 <li>Given three ESTs, say e<sub>1</sub>, e<sub>2</sub>, and 00069 e<sub>2</sub>, assume heavy weight metrics m<sub>12</sub> (between 00070 e<sub>1</sub> & e<sub>2</sub> and m<sub>23</sub> (between 00071 e<sub>2</sub> & e<sub>3</sub>) have been computed.</li> 00072 00073 <li>When the heavy weight metric m<sub>13</sub> (between 00074 e<sub>1</sub> & e<sub>3</sub>) is required, this class obtains 00075 this information in the following manner: 00076 00077 <ol> 00078 00079 <li>First a heuristic (such as: the <i>t/v</i> heuiristic) is 00080 applied to determine if the pair e<sub>1</sub> & e<sub>3</sub> 00081 meet the stipulated relationship condition. If not the ESTs are 00082 unrelated and conditional-transitivity is not 00083 applicable. Therefore, an invalid large distance value is assumed.</li> 00084 00085 <li>However, if the pair of ESTs pass the heuristic test then they 00086 are related and conditional-transitivity is applicable. In this 00087 case the heavy weight metric is deduced using a binary function 00088 <i>f</i>(m<sub>12</sub>, m<sub>23</sub>). Currently, the default 00089 implementation for <i>f</i> is \c std::max() method that selects 00090 the maximum of m<sub>12</sub> and m<sub>23</sub></li> 00091 00092 </ol> 00093 00094 </li> 00095 00096 </ol> 00097 00098 In order to rapidly retrieve pertinent metrics (or identify the 00099 lack of existing metrics), this class maintains a distributed set 00100 of hash maps in the \c transCache instance variable. Each entry 00101 in the \c transCache hash map is a TransCacheEntry object and 00102 corresponds to a given EST (identified by its index). The 00103 TransCacheEntry in-turn has a hash map of CachedESTInfo objects 00104 that provide all the necessary information to determine metrics 00105 via transitivity. 00106 */ 00107 class TransMSTClusterMaker : public MSTClusterMaker { 00108 friend class ClusterMakerFactory; 00109 public: 00110 /** Method to display performance statistics. 00111 00112 This method overrides the empty implementation in the base 00113 class to display additional statistics on 00114 conditional-transitivity application. 00115 00116 \note This method calls the base class implementation 00117 first. Consequently, all the statistics displayed by the base 00118 class will continue to be displayed. 00119 00120 \param[out] os The output stream to which the statistics must 00121 be written. 00122 */ 00123 virtual void displayStats(std::ostream& os); 00124 00125 /** The destructor. 00126 00127 The destructor frees up all any dynamic memory allocated by 00128 this object for its operations. 00129 */ 00130 virtual ~TransMSTClusterMaker(); 00131 00132 protected: 00133 /** A method to handle initialization tasks for the TransMSTClusterMaker. 00134 00135 This method is called after the ESTs have been loaded into the 00136 ESTAnalyzer and is used to initialize the vector of TransCacheEntrys, 00137 as well as to check for the existence of a heuristic chain 00138 on the analyzer. Without a suitable heuristic, the 00139 TransMSTClusterMaker cannot function properly and will exit 00140 with an error code. 00141 */ 00142 virtual int initialize(); 00143 00144 /** Helper method to call the actual heavy-weight analysis 00145 method(s). 00146 00147 This method overrides the default implementation in the 00148 base-class before the actual call to the heavy weight analyzer 00149 is made. This method attempts to use conditional-transitivity 00150 to try and minimize the number of calls to the heavy weight 00151 analyzer. This method operates in the following manner: 00152 00153 \param[in] otherEST The index of the other EST to which the 00154 metric is required. 00155 00156 \return This method returns a similarity/distance metric by 00157 comparing the ESTs. This method may return -1, if the otherEST 00158 is significantly different from the reference EST (possibly 00159 warranting no further analysis) that a meanigful metric cannot 00160 be generated. 00161 */ 00162 virtual float analyze(const int otherEST); 00163 00164 /** Computes sends/receives similarity list for a given EST. 00165 00166 This method overrides the default implementation in the base 00167 class that performs the core tasks of: (i) computing 00168 similarity metrics, (ii) gathering metrics from various 00169 processes to obtain the highest set of similarity metrics, and 00170 (iii) indicating to the manager that similarity metric 00171 calculations have been completed. 00172 00173 \note This method is invoked on all the worker processes and 00174 the manager process. 00175 00176 This method overrides the default implementation and performs 00177 the following tasks: 00178 00179 <ol> 00180 00181 <li>It calls the base class method to perform all the tasks in 00182 the usual manner.</li> 00183 00184 <li>Next the process that owns \c estIdx broadcasts the cached 00185 metrics to all the other processes. Each process in-turn 00186 waits to receive the cached metrics from the owner 00187 process.</li> 00188 00189 <li>Then each process extracts and populates metrics for 00190 conditional-transitivity calculation by adding/updating 00191 appropriate entries to the transCache. 00192 00193 </ol> 00194 00195 \param[in] estIdx The index of the EST that was just added to 00196 the MST and for which the adjacent neighbors need to be 00197 determined. 00198 00199 \param[out] metricList If this pointer is not NULL, then this 00200 vector is populated with the set of metrics that were computed 00201 for estIdx <b>only on the owner process</b>. This list 00202 contains the metrics collated from all the processes 00203 participating in the distributed computing process. Currently, 00204 this feature is used by TransMSTClusterMaker to obtain the 00205 list of metrics computed. 00206 */ 00207 virtual void populateCache(const int estIdx, 00208 SMList* UNREFERENCED_PARAMETER(metricList) = NULL); 00209 00210 /** The default constructor. 00211 00212 The default constructor for this class. The constructor is 00213 made protected so that this class cannot be directly 00214 instantiated. However, since the ClusterMakerFactory is a 00215 friend of this class, an object can be instantiated via the 00216 ClusterMakerFactory::create() method. 00217 00218 \param[in,out] analyzer The EST analyzer to be used for 00219 obtaining similarity metrics between two ESTs. This parameter 00220 is simply passed onto the base class. 00221 00222 \param[in] refESTidx The reference EST index value to be used 00223 to root the spanning tree created by this method. This 00224 parameter should be >= 0. This value is simply passed onto 00225 the base class. 00226 00227 \param[in] outputFile The name of the output file to which the 00228 raw MST cluster information is to be written. If this 00229 parameter is the empty string then output is written to 00230 standard output. This value is simply passed onto the base 00231 class. 00232 */ 00233 TransMSTClusterMaker(ESTAnalyzer *analyzer, const int refESTidx, 00234 const std::string& outputFile); 00235 00236 /** Helper method to remove invalid metrics from a given list of 00237 values. 00238 00239 This is a helper method that is invoked from populateCache() 00240 method to remove invalid/unwanted entries from a given list of 00241 metrics. This method iterates over the entries in \c list and 00242 removes all entires that have their metric set to a value that 00243 is worse than badMetric value. 00244 00245 \param[in] list The list of metrics that need to be 00246 pruned. 00247 00248 \param[out] entries The list of entries that are better than 00249 badMetric. 00250 */ 00251 void pruneMetricEntries(const SMList& list, SMList& entries); 00252 00253 /** Helper method to process a list of similarity metrics for caching. 00254 This method is called in the populateCache method and will take 00255 care of a list of similarity metrics, either computed locally or 00256 received via remote communication. This method goes through the list 00257 of similarity metrics and updates TransCacheEntrys as appropriate. 00258 00259 \param[in] metricList The list of metrics received or computed. 00260 */ 00261 void processMetricList(SMList& metricList); 00262 00263 private: 00264 /** An hash map that acts as the cache to hold known metrics. 00265 00266 <p>This instance variable holds a cache of the previously 00267 computed metrics in a from that enables rapid application of 00268 conditional-transitivity to deduce metrics for related ESTs. 00269 The hash key for this hash map is the index of the reference 00270 EST for which metrics are to be computed. That is, given two 00271 ESTs e<sub>i</sub> and e<sub>j</sub>, 00272 metricCache[i].getMetric(j, metric) provides an estimate of 00273 metric assuming e<sub>i</sub> and e<sub>j</sub> pass 00274 <i>t/v</i> heuristic.<p> 00275 00276 <p>Entries in this cache are primarily added by the 00277 populateCache() method and entries are used by the analyze() 00278 method.</p> 00279 */ 00280 MetricCacheMap metricCache; 00281 00282 /** Instance variable to track total number of calls for analysis. 00283 00284 This instance variable tracks the number of times the 00285 analyze() method was invoked. This instance variable 00286 essentially tracks the number of times the heavy-weight 00287 analysis method would have been invoked if 00288 conditional-transitivity was not applied. This instance 00289 variable is set to zero in the constructor, incremented in the 00290 analyze() method, and displayed by the displayStats() method. 00291 */ 00292 int analyzeCount; 00293 00294 /** Instance variable to track number of times transitivity was 00295 successfully applied. 00296 00297 This instance variable tracks the number of times transitivity 00298 was successfully applied. This number indices how effective 00299 transitivity approach was for a given set of ESTs. This 00300 instance variable is set to zero in the constructor, 00301 incremented in the analyze() method, and displayed by the 00302 displayStats() method. 00303 */ 00304 int successCount; 00305 00306 /** The current reference EST for which analysis is underway. 00307 00308 This instance variable tracks the index of the reference EST 00309 for which we are trying to figure out the closest neighbor in 00310 the MST. This instance variable is set in the populateCache() 00311 method and is used in the analyze method. Note that the 00312 analyze method is actually called from within 00313 MSTClusterMaker::populateCache() (the base class 00314 implementation). 00315 */ 00316 int currRefESTidx; 00317 00318 static float badMetric; 00319 }; 00320 00321 #endif