00001 #ifndef NEW_UV_HEURISTIC_CPP
00002 #define NEW_UV_HEURISTIC_CPP
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037 #include "NewUVHeuristic.h"
00038 #include "HeuristicChain.h"
00039 #include "ESTCodec.h"
00040 #include "EST.h"
00041 #include "ParameterSetManager.h"
00042
00043
00044 int NewUVHeuristic::v = 8;
00045 int NewUVHeuristic::BitMask = 0;
00046
00047
00048
00049 int NewUVHeuristic::bitsToShift = 0;
00050
00051
00052 arg_parser::arg_record NewUVHeuristic::argsList[] = {
00053 {"--uv_v", "v (length of common words) (default=8)",
00054 &NewUVHeuristic::v, arg_parser::INTEGER},
00055 {NULL, NULL, NULL, arg_parser::BOOLEAN}
00056 };
00057
00058 NewUVHeuristic::NewUVHeuristic(const std::string& name,
00059 const std::string& UNREFERENCED_PARAMETER(outputFileName))
00060 : Heuristic(name), hintKey("D2_DoRC") {
00061
00062 s1WordMap = NULL;
00063 s1RCWordMap = NULL;
00064
00065
00066 u = 6;
00067 wordShift = 8;
00068 passes = 3;
00069 refESTLen = 0;
00070 otherESTLen = 0;
00071 }
00072
00073 NewUVHeuristic::~NewUVHeuristic() {
00074 if (s1WordMap != NULL) {
00075 delete [] s1WordMap;
00076 }
00077 if (s1RCWordMap != NULL) {
00078 delete [] s1RCWordMap;
00079 }
00080
00081
00082 uvCache.clear();
00083 }
00084
00085 void
00086 NewUVHeuristic::showArguments(std::ostream& os) {
00087
00088 arg_parser ap(NewUVHeuristic::argsList);
00089 os << ap;
00090 }
00091
00092 bool
00093 NewUVHeuristic::parseArguments(int& argc, char **argv) {
00094 arg_parser ap(NewUVHeuristic::argsList);
00095 ap.check_args(argc, argv, false);
00096
00097 if (wordShift < 1) {
00098 std::cerr << heuristicName
00099 << ": Word shift must be >= 1"
00100 << "(use --uv_wordShift option)\n";
00101 return false;
00102 } else if (u < 1 || v < 1) {
00103 std::cerr << heuristicName
00104 << ": u and v must be >= 1"
00105 << "(use --uv_u and --uv_v options)\n";
00106 return false;
00107 } else if (passes < 1) {
00108 std::cerr << heuristicName
00109 << ": passes must be >= 1"
00110 << "(use --uv_passes option)\n";
00111 return false;
00112 }
00113
00114 return true;
00115 }
00116
00117 int
00118 NewUVHeuristic::initialize() {
00119 const int MapSize = (1 << (v * 2));
00120
00121 s1WordMap = new char[MapSize];
00122
00123 s1RCWordMap = new char[MapSize];
00124
00125 bitsToShift = 2 * (v - 1);
00126
00127 return 0;
00128 }
00129
00130 int
00131 NewUVHeuristic::setReferenceEST(const int estIdx) {
00132
00133
00134 BitMask = (1 << (v * 2)) - 1;
00135
00136 if ((estIdx < 0) || (estIdx >= EST::getESTCount())) {
00137
00138 return 1;
00139 }
00140
00141 refESTidx = estIdx;
00142
00143 const int MapSize = (1 << (v * 2));
00144
00145
00146 memset(s1WordMap, 0, sizeof(char) * MapSize);
00147 memset(s1RCWordMap, 0, sizeof(char) * MapSize);
00148
00149 const EST *estS1 = EST::getEST(refESTidx);
00150 ASSERT ( estS1 != NULL );
00151 const char* s1 = estS1->getSequence();
00152 refESTLen = strlen(s1);
00153
00154
00155
00156 ESTCodec::NormalEncoder<bitsToShift, BitMask> encoder;
00157 ESTCodec& codec = ESTCodec::getCodec();
00158
00159 int hash = 0, ignoreMask = 0;
00160 for(int i = 0; (i < v - 1); i++) {
00161 hash = encoder(hash, s1[i], ignoreMask);
00162 }
00163
00164
00165 codec.setRevCompTable(v);
00166 const int End = strlen(s1);
00167 for (int i = v - 1; (i <= End); i++) {
00168 hash = encoder(hash, s1[i], ignoreMask);
00169 if (!ignoreMask) {
00170
00171
00172
00173 s1WordMap[hash] = 1;
00174
00175 s1RCWordMap[codec.encode2rc(hash)] = 1;
00176 }
00177 }
00178 return 0;
00179 }
00180
00181 void
00182 NewUVHeuristic::computeHash(const int estIdx) {
00183
00184
00185 const EST *estS2 = EST::getEST(estIdx);
00186 ASSERT ( estS2 != NULL );
00187 const char *sq2 = estS2->getSequence();
00188 ASSERT ( sq2 != NULL );
00189 const int End = otherESTLen - v;
00190 ASSERT ( End > 0 );
00191
00192
00193
00194 ESTCodec::NormalEncoder<bitsToShift, BitMask> encoder;
00195
00196
00197 std::vector<unsigned short>& hashList = uvCache[estIdx];
00198 hashList.reserve(End / wordShift + 1);
00199
00200
00201 for (register int start = 0; (start < End); start += wordShift) {
00202
00203 unsigned short hash = 0;
00204 int ignoreMask = 0;
00205 const int endBP = start + v;
00206 for(register int i = start; (i < endBP); i++) {
00207 hash = (unsigned short) encoder(hash, sq2[i], ignoreMask);
00208 }
00209
00210 if (!ignoreMask) {
00211
00212
00213 hashList.push_back(hash);
00214 }
00215 }
00216 }
00217
00218 bool
00219 NewUVHeuristic::runHeuristic(const int otherEST) {
00220
00221 HeuristicChain::getHeuristicChain()->setHint("MST_RC", 0);
00222
00223 VALIDATE({
00224 if (otherEST == refESTidx) {
00225 return true;
00226 }
00227 if ((otherEST < 0) || (otherEST >= EST::getESTCount())) {
00228
00229 return false;
00230 }
00231 });
00232
00233
00234
00235 UVHashTable::iterator cacheEntry = uvCache.find(otherEST);
00236 if (cacheEntry == uvCache.end()) {
00237
00238
00239 computeHash(otherEST);
00240
00241 cacheEntry = uvCache.find(otherEST);
00242 }
00243
00244 const std::vector<unsigned short>& otherHash = cacheEntry->second;
00245 const int hashSize = otherHash.size();
00246 if (hashSize == 0) {
00247
00248 return false;
00249 }
00250
00251
00252 register int numMatches = 0, numRCmatches = 0;
00253 register int factor = u / passes;
00254 for (register int pass = 0; (pass < passes); pass++) {
00255 for (register int word = pass; (word < hashSize); word+=passes) {
00256
00257
00258 numMatches += s1WordMap [otherHash[word]];
00259 numRCmatches += s1RCWordMap[otherHash[word]];
00260 }
00261 if (pass != passes-1) {
00262 register int numMatchesReq = factor*(pass+1);
00263 if ((numMatches < numMatchesReq) && (numRCmatches < numMatchesReq)) {
00264
00265 return false;
00266 }
00267 }
00268 }
00269
00270
00271
00272
00273
00274
00275
00276 if ((numMatches >= u) || (numRCmatches >= u)) {
00277
00278
00279 bestMatchIsRC = (numMatches < numRCmatches);
00280
00281 HeuristicChain::getHeuristicChain()->setHint(hintKey, bestMatchIsRC);
00282
00283 HeuristicChain::getHeuristicChain()->setHint("MST_RC",
00284 ((int)bestMatchIsRC) * -2
00285 + 1);
00286
00287 return true;
00288 }
00289
00290 return false;
00291 }
00292
00293 #endif