00001 #ifndef UV_SAMPLE_HEURISTIC_CPP
00002 #define UV_SAMPLE_HEURISTIC_CPP
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037 #include "UVSampleHeuristic.h"
00038 #include "HeuristicChain.h"
00039 #include "ESTCodec.h"
00040 #include "EST.h"
00041
00042
00043 int UVSampleHeuristic::u = 4;
00044 int UVSampleHeuristic::v = 8;
00045 int UVSampleHeuristic::wordShift = 16;
00046 int UVSampleHeuristic::BitMask = 0;
00047
00048
00049
00050 int UVSampleHeuristic::bitsToShift = 0;
00051
00052
00053 arg_parser::arg_record UVSampleHeuristic::argsList[] = {
00054 {"--uv_u", "u (number of v-word matches) (default=4)",
00055 &UVSampleHeuristic::u, arg_parser::INTEGER},
00056 {"--uv_v", "v (length of common words) (default=8)",
00057 &UVSampleHeuristic::v, arg_parser::INTEGER},
00058 {"--uv_wordShift", "Word Shift (default=16)",
00059 &UVSampleHeuristic::wordShift, arg_parser::INTEGER},
00060 {NULL, NULL, NULL, arg_parser::BOOLEAN}
00061 };
00062
00063 UVSampleHeuristic::UVSampleHeuristic(const std::string& name,
00064 const std::string& UNREFERENCED_PARAMETER(outputFileName))
00065 : Heuristic(name), hintKey("D2_DoRC") {
00066
00067 s1WordMap = NULL;
00068 s1RCWordMap = NULL;
00069 }
00070
00071 UVSampleHeuristic::~UVSampleHeuristic() {
00072 if (s1WordMap != NULL) {
00073 delete [] s1WordMap;
00074 }
00075 if (s1RCWordMap != NULL) {
00076 delete [] s1RCWordMap;
00077 }
00078
00079
00080 uvCache.clear();
00081 }
00082
00083 void
00084 UVSampleHeuristic::showArguments(std::ostream& os) {
00085
00086 arg_parser ap(UVSampleHeuristic::argsList);
00087 os << ap;
00088 }
00089
00090 bool
00091 UVSampleHeuristic::parseArguments(int& argc, char **argv) {
00092 arg_parser ap(UVSampleHeuristic::argsList);
00093 ap.check_args(argc, argv, false);
00094
00095 if (wordShift < 1) {
00096 std::cerr << heuristicName
00097 << ": Word shift must be >= 1"
00098 << "(use --uv_wordShift option)\n";
00099 return false;
00100 } else if (u < 1 || v < 1) {
00101 std::cerr << heuristicName
00102 << ": u and v must be >= 1"
00103 << "(use --uv_u and --uv_v options)\n";
00104 return false;
00105 }
00106
00107 return true;
00108 }
00109
00110 int
00111 UVSampleHeuristic::initialize() {
00112 const int MapSize = (1 << (v * 2));
00113
00114 s1WordMap = new char[MapSize];
00115
00116 s1RCWordMap = new char[MapSize];
00117
00118 bitsToShift = 2 * (v - 1);
00119
00120 return 0;
00121 }
00122
00123 int
00124 UVSampleHeuristic::setReferenceEST(const int estIdx) {
00125 if (refESTidx == estIdx) {
00126
00127
00128 return 0;
00129 }
00130
00131
00132
00133 BitMask = (1 << (v * 2)) - 1;
00134
00135 if ((estIdx < 0) || (estIdx >= EST::getESTCount())) {
00136
00137 return 1;
00138 }
00139
00140 refESTidx = estIdx;
00141
00142 const int MapSize = (1 << (v * 2));
00143
00144
00145 memset(s1WordMap, 0, sizeof(char) * MapSize);
00146 memset(s1RCWordMap, 0, sizeof(char) * MapSize);
00147
00148 const EST *estS1 = EST::getEST(refESTidx);
00149 ASSERT ( estS1 != NULL );
00150 const char* s1 = estS1->getSequence();
00151
00152
00153
00154 ESTCodec::NormalEncoder<bitsToShift, BitMask> encoder;
00155 ESTCodec& codec = ESTCodec::getCodec();
00156
00157 register int hash = 0;
00158 int ignoreMask = 0;
00159 for(int i = 0; (i < v - 1); i++) {
00160 hash = encoder(hash, s1[i], ignoreMask);
00161 }
00162
00163
00164 codec.setRevCompTable(v);
00165 const int End = (int) strlen(s1) - v;
00166 for (int i = v - 1; (i <= End); i++) {
00167 hash = encoder(hash, s1[i], ignoreMask);
00168 if (!ignoreMask) {
00169
00170
00171
00172 s1WordMap[hash] = 1;
00173
00174 s1RCWordMap[codec.encode2rc(hash)] = 1;
00175 }
00176 }
00177 return 0;
00178 }
00179
00180 void
00181 UVSampleHeuristic::computeHash(const int estIdx) {
00182
00183
00184 const EST *estS2 = EST::getEST(estIdx);
00185 ASSERT ( estS2 != NULL );
00186 const char *sq2 = estS2->getSequence();
00187 ASSERT ( sq2 != NULL );
00188 const int End = (int) strlen(sq2) - v;
00189 ASSERT ( End > 0 );
00190
00191
00192 ESTCodec::NormalEncoder<bitsToShift, BitMask> encoder;
00193
00194
00195 std::vector<unsigned short>& hashList = uvCache[estIdx];
00196 hashList.reserve(End / wordShift + 1);
00197
00198
00199 for (register int start = 0; (start < End); start += wordShift) {
00200
00201 unsigned short hash = 0;
00202 int ignoreMask = 0;
00203 const int endBP = start + v;
00204 for(register int i = start; (i < endBP); i++) {
00205 hash = (unsigned short) encoder(hash, sq2[i], ignoreMask);
00206 }
00207
00208 if (!ignoreMask) {
00209
00210 hashList.push_back(hash);
00211 }
00212 }
00213 }
00214
00215 bool
00216 UVSampleHeuristic::runHeuristic(const int otherEST) {
00217
00218 VALIDATE({
00219 if (otherEST == refESTidx) {
00220 return true;
00221 }
00222 if ((otherEST < 0) || (otherEST >= EST::getESTCount())) {
00223
00224 return false;
00225 }
00226 });
00227
00228
00229
00230 UVHashTable::iterator cacheEntry = uvCache.find(otherEST);
00231 if (cacheEntry == uvCache.end()) {
00232
00233
00234 computeHash(otherEST);
00235
00236 cacheEntry = uvCache.find(otherEST);
00237 }
00238
00239 const std::vector<unsigned short>& otherHash = cacheEntry->second;
00240 const int hashSize = (int) otherHash.size();
00241 if (hashSize == 0) {
00242
00243 return false;
00244 }
00245
00246
00247 register int numMatches = 0, numRCmatches = 0;
00248 for (register int word = 0; (word < hashSize); word++) {
00249
00250
00251 numMatches += s1WordMap [otherHash[word]];
00252 numRCmatches += s1RCWordMap[otherHash[word]];
00253 }
00254
00255
00256
00257
00258
00259
00260
00261 if ((numMatches >= u) || (numRCmatches >= u)) {
00262
00263
00264 bestMatchIsRC = (numMatches < numRCmatches);
00265
00266 HeuristicChain::getHeuristicChain()->setHint(hintKey, bestMatchIsRC);
00267
00268 return true;
00269 }
00270
00271 return false;
00272 }
00273
00274 #endif