00001 #ifndef CLU_CPP
00002 #define CLU_CPP
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037 #include "CLU.h"
00038 #include "ResultLog.h"
00039 #include "EST.h"
00040 #include <cmath>
00041 #include <queue>
00042 #include <algorithm>
00043
00044
00045 int CLU::abundanceFraction = 10;
00046
00047
00048 arg_parser::arg_record CLU::argsList[] = {
00049 {"--abdFrac", "Abundance Fraction (default=10)",
00050 &CLU::abundanceFraction, arg_parser::INTEGER},
00051 {NULL, NULL, NULL, arg_parser::BOOLEAN}
00052 };
00053
00054
00055 char CLU::CharToInt[256];
00056
00057 CLU::CLU(const int refESTidx, const std::string& outputFile)
00058 : FWAnalyzer("clu", refESTidx, outputFile) {
00059
00060
00061 memset(CharToInt, 0, sizeof(CharToInt));
00062
00063 CharToInt[(int) 'A'] = CharToInt[(int) 'a'] = 0;
00064 CharToInt[(int) 'G'] = CharToInt[(int) 'g'] = 1;
00065 CharToInt[(int) 'C'] = CharToInt[(int) 'c'] = 2;
00066 CharToInt[(int) 'T'] = CharToInt[(int) 't'] = 3;
00067 }
00068
00069 CLU::~CLU() {
00070
00071 EST::deleteAllESTs();
00072
00073
00074
00075 }
00076
00077 void
00078 CLU::showArguments(std::ostream& os) {
00079 FWAnalyzer::showArguments(os);
00080
00081 arg_parser ap(CLU::argsList);
00082 os << ap;
00083 }
00084
00085 bool
00086 CLU::parseArguments(int& argc, char **argv) {
00087 arg_parser ap(CLU::argsList);
00088 ap.check_args(argc, argv, false);
00089
00090 if ((abundanceFraction < 1) || (abundanceFraction > 100)) {
00091 std::cerr << analyzerName
00092 << ": Abundance fraction must be in the range 0 to 100"
00093 << "(use --abdFrac option)\n";
00094 return false;
00095 }
00096
00097 return FWAnalyzer::parseArguments(argc, argv);
00098 }
00099
00100 int
00101 CLU::initialize() {
00102 int result = FWAnalyzer::initialize();
00103 if (result != 0) {
00104
00105 return result;
00106 }
00107
00108
00109
00110 std::vector<EST*>& estList = EST::getESTList();
00111 for(int id = 0; (id < (int) estList.size()); id++) {
00112 estList[id]->setCustomData(new CLUESTData());
00113 }
00114
00115 return 0;
00116 }
00117
00118 void
00119 CLU::dumpEST(ResultLog& log, const EST* est, const bool isReference) {
00120 if (isReference) {
00121 const char* start=(htmlLog ? "<font face=\"courier\" color=red>" : "");
00122 const char* end =(htmlLog ? "</font>" : "");
00123 log.report(est->getInfo(), "%s%s%s", "n/a",
00124 start, est->getSequence(), end);
00125 } else {
00126 const char* start = (htmlLog ? "<font face=\"courier\">" : "");
00127 const char* end = (htmlLog ? "</font>" : "");
00128 log.report(est->getInfo(), "%s%s%s", "%f",
00129 start, est->getSequence(), end, est->getSimilarity());
00130 }
00131 }
00132
00133 int
00134 CLU::setReferenceEST(const int estIdx) {
00135
00136
00137 std::vector<EST*>& estList = EST::getESTList();
00138 if ((estIdx < 0) || (estIdx >= (int) estList.size())) {
00139
00140 return 1;
00141 }
00142 ASSERT ( estList[estIdx] != NULL );
00143
00144 refESTidx = estIdx;
00145
00146 return 0;
00147 }
00148
00149 void
00150 CLU::buildHashMaps(EST *est) {
00151 ASSERT ( est != NULL );
00152 ASSERT ( est->getCustomData().get() != NULL );
00153
00154 CLUESTData *customData =
00155 dynamic_cast<CLUESTData*>(est->getCustomData().get());
00156 ASSERT ( customData != NULL );
00157 if (customData->referenceHashMap != NULL) {
00158
00159
00160 return;
00161 }
00162
00163 std::string sequence = est->getSequence();
00164 ASSERT ( sequence.size() > 0 );
00165
00166
00167 createCLUHashMap(customData->referenceHashMap, sequence.c_str());
00168
00169 std::reverse(sequence.begin(), sequence.end());
00170 const char Complements[] = {'T', 'C', 'G', 'A'};
00171 const int SeqLen = (int) sequence.size();
00172 for(int i = 0; (i < SeqLen); i++) {
00173 sequence[i] = Complements[(int) CharToInt[(int) sequence[i]]];
00174 }
00175
00176 createCLUHashMap(customData->complementHashMap, sequence.c_str());
00177 }
00178
00179 void
00180 CLU::createCLUHashMap(int* &hashMap, const char *sequence) {
00181
00182 const int MapSize = (int) pow(4.0, wordSize);
00183 if (hashMap == NULL) {
00184 hashMap = new int[MapSize];
00185 }
00186
00187 memset(hashMap, 0, sizeof(int) * MapSize);
00188
00189 ASSERT ( sequence != NULL );
00190 int hash = 0;
00191 int i;
00192 for(i = 0; (i < wordSize); i++) {
00193 hash <<= 2;
00194 hash += CharToInt[(int) sequence[i]];
00195 }
00196
00197 hashMap[hash]++;
00198
00199
00200
00201
00202 const int SeqLen = (int) strlen(sequence) - wordSize;
00203
00204
00205
00206 const int BitMask = (1 << (wordSize * 2)) - 1;
00207 for (; (i < SeqLen); i++) {
00208 hash <<= 2;
00209 hash += CharToInt[(int) sequence[i]];
00210 hash &= BitMask;
00211 hashMap[hash]++;
00212 }
00213
00214
00215
00216 filterHashMap(hashMap, SeqLen);
00217 }
00218
00219 void
00220 CLU::filterHashMap(int *hashMap, const int sequenceLength) {
00221
00222
00223
00224
00225
00226
00227
00228
00229 char *LowCompSeq = reinterpret_cast<char*>(alloca(wordSize * 3));
00230 memset(LowCompSeq + wordSize * 0, 1, wordSize);
00231 memset(LowCompSeq + wordSize * 1, 2, wordSize);
00232 memset(LowCompSeq + wordSize * 2, 3, wordSize);
00233
00234
00235
00236 int hash = 0;
00237 hashMap[hash] = 0;
00238 const int SeqLen = wordSize * 3;
00239
00240
00241
00242 const int BitMask= (1 << (wordSize * 2)) - 1;
00243 for(int i = 0; (i < SeqLen); i++) {
00244 hash <<= 2;
00245 hash += LowCompSeq[i];
00246 hash &= BitMask;
00247
00248 hashMap[hash] = 0;
00249 }
00250
00251 const int MapSize = (int) pow(4.0, wordSize);
00252 const int AbundanceFactor = sequenceLength / abundanceFraction;
00253 for(int i = 0; (i < MapSize); i++) {
00254 if (!hashMap[i]) {
00255
00256 continue;
00257 }
00258 if (hashMap[i] > AbundanceFactor) {
00259 hashMap[i] = 0;
00260 } else {
00261 hashMap[i] = 1;
00262 }
00263 }
00264 }
00265
00266 int
00267 CLU::getSimilarity(const int* const hashMap,
00268 const char* const sequence) const {
00269
00270 int *cd = reinterpret_cast<int*>(alloca(sizeof(int) * frameSize));
00271
00272 memset(cd, 0, sizeof(int) * frameSize);
00273
00274
00275
00276 const int BitMask = (1 << (wordSize * 2)) - 1;
00277
00278 const char* bp = sequence;
00279
00280
00281 int hash = 0;
00282 for(int i = 0; (i < wordSize); i++) {
00283 hash <<= 2;
00284 hash += CharToInt[(int) *bp++];
00285 }
00286
00287
00288 std::queue<int> prevHashValues;
00289 prevHashValues.push(hash);
00290
00291
00292 unsigned int cdIndex = hashMap[hash];
00293 for(int i = 1; (i < frameSize); i++) {
00294 hash <<= 2;
00295 hash += CharToInt[(int) *bp++];
00296 hash &= BitMask;
00297 cdIndex += hashMap[hash];
00298
00299 prevHashValues.push(hash);
00300 }
00301 ASSERT ( (int) prevHashValues.size() == frameSize );
00302
00303
00304 const int SeqLen = (int) strlen(sequence) - frameSize;
00305 for(int i = frameSize; (i < SeqLen); i++) {
00306
00307 cdIndex = (cdIndex >= (unsigned) frameSize) ? (frameSize - 1) : cdIndex;
00308
00309 cd[cdIndex]++;
00310
00311
00312 hash <<= 2;
00313 hash += CharToInt[(int) *bp++];
00314 hash &= BitMask;
00315
00316 prevHashValues.push(hash);
00317
00318 cdIndex += hashMap[hash];
00319 cdIndex -= hashMap[prevHashValues.front()];
00320
00321 prevHashValues.pop();
00322
00323 ASSERT ( (int) prevHashValues.size() == frameSize );
00324 }
00325
00326
00327
00328
00329
00330 cdIndex = (cdIndex >= (unsigned) frameSize) ? (frameSize - 1) : cdIndex;
00331
00332 cd[cdIndex]++;
00333
00334
00335
00336
00337 const int Weight[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00338 0, 0, 1, 3, 7, 15, 27, 34, 52, 47};
00339 ASSERT ( sizeof(Weight) / sizeof(int) == frameSize);
00340 int similarity = 0;
00341 for(int i = 0; (i < frameSize); i++) {
00342 similarity += (cd[i] * Weight[i]);
00343 }
00344
00345 return similarity;
00346 }
00347
00348 float
00349 CLU::getMetric(const int estIdx) {
00350 if (estIdx == refESTidx) {
00351
00352
00353 return 0;
00354 }
00355
00356 EST *estSq = EST::getEST(refESTidx);
00357 EST *estSb = EST::getEST(estIdx);
00358
00359 if (strlen(estSb->getSequence()) < strlen(estSq->getSequence())) {
00360
00361 EST *temp = estSq;
00362 estSq = estSb;
00363 estSb = temp;
00364 }
00365
00366 buildHashMaps(estSq);
00367
00368
00369 CLUESTData *customData =
00370 dynamic_cast<CLUESTData*>(estSq->getCustomData().get());
00371 ASSERT ( customData != NULL );
00372 ASSERT ( customData->referenceHashMap != NULL );
00373 ASSERT ( customData->complementHashMap != NULL );
00374
00375 const char* otherSeq = estSb->getSequence();
00376 ASSERT ( otherSeq != NULL );
00377 const int refSim = getSimilarity(customData->referenceHashMap, otherSeq);
00378 const int compSim = getSimilarity(customData->complementHashMap, otherSeq);
00379
00380 return (float) ((refSim > compSim) ? refSim : compSim);
00381 }
00382
00383 CLU::CLUESTData::~CLUESTData() {
00384 if (referenceHashMap != NULL) {
00385 delete [] referenceHashMap;
00386 }
00387 if (complementHashMap != NULL) {
00388 delete [] complementHashMap;
00389 }
00390 }
00391
00392 #endif