00001 #ifndef CLU_CPP
00002 #define CLU_CPP
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 
00023 
00024 
00025 
00026 
00027 
00028 
00029 
00030 
00031 
00032 
00033 
00034 
00035 
00036 
00037 #include "CLU.h"
00038 #include "ResultLog.h"
00039 #include "EST.h"
00040 #include <cmath>
00041 #include <queue>
00042 #include <algorithm>
00043 
00044 
00045 int  CLU::abundanceFraction = 10; 
00046 
00047 
00048 arg_parser::arg_record CLU::argsList[] = {
00049     {"--abdFrac", "Abundance Fraction (default=10)",
00050      &CLU::abundanceFraction, arg_parser::INTEGER},
00051     {NULL, NULL, NULL, arg_parser::BOOLEAN}
00052 };
00053 
00054 
00055 char CLU::CharToInt[256];
00056 
00057 CLU::CLU(const int refESTidx, const std::string& outputFile)
00058     : FWAnalyzer("clu", refESTidx, outputFile) {
00059     
00060     
00061     memset(CharToInt, 0, sizeof(CharToInt));
00062     
00063     CharToInt[(int) 'A'] = CharToInt[(int) 'a'] = 0;
00064     CharToInt[(int) 'G'] = CharToInt[(int) 'g'] = 1;
00065     CharToInt[(int) 'C'] = CharToInt[(int) 'c'] = 2;
00066     CharToInt[(int) 'T'] = CharToInt[(int) 't'] = 3;
00067 }
00068 
00069 CLU::~CLU() {
00070     
00071     EST::deleteAllESTs();
00072     
00073     
00074     
00075 }
00076 
00077 void
00078 CLU::showArguments(std::ostream& os) {
00079     FWAnalyzer::showArguments(os);
00080     
00081     arg_parser ap(CLU::argsList);
00082     os << ap;
00083 }
00084 
00085 bool
00086 CLU::parseArguments(int& argc, char **argv) {
00087     arg_parser ap(CLU::argsList);
00088     ap.check_args(argc, argv, false);
00089     
00090     if ((abundanceFraction < 1) || (abundanceFraction > 100)) {
00091         std::cerr << analyzerName
00092                   << ": Abundance fraction must be in the range 0 to 100"
00093                   << "(use --abdFrac option)\n";
00094         return false;
00095     }
00096     
00097     return FWAnalyzer::parseArguments(argc, argv);
00098 }
00099 
00100 int
00101 CLU::initialize() {
00102     int result = FWAnalyzer::initialize();
00103     if (result != 0) {
00104         
00105         return result;
00106     }
00107     
00108     
00109     
00110     std::vector<EST*>& estList = EST::getESTList();
00111     for(int id = 0; (id < (int) estList.size()); id++) {
00112         estList[id]->setCustomData(new CLUESTData());
00113     }
00114     
00115     return 0;
00116 }
00117 
00118 void
00119 CLU::dumpEST(ResultLog& log, const EST* est, const bool isReference) {
00120     if (isReference) {
00121         const char* start=(htmlLog ? "<font face=\"courier\" color=red>" : "");
00122         const char* end  =(htmlLog ? "</font>" : "");
00123         log.report(est->getInfo(), "%s%s%s", "n/a",
00124                    start, est->getSequence(), end);
00125     } else {
00126         const char* start = (htmlLog ? "<font face=\"courier\">" : "");
00127         const char* end   = (htmlLog ? "</font>" : "");
00128         log.report(est->getInfo(), "%s%s%s", "%f",
00129                    start, est->getSequence(), end, est->getSimilarity());
00130     }
00131 }
00132 
00133 int
00134 CLU::setReferenceEST(const int estIdx) {
00135     
00136     
00137     std::vector<EST*>& estList = EST::getESTList();
00138     if ((estIdx < 0) || (estIdx >= (int) estList.size())) {
00139         
00140         return 1;
00141     }
00142     ASSERT ( estList[estIdx] != NULL );
00143     
00144     refESTidx = estIdx;
00145     
00146     return 0;
00147 }
00148 
00149 void
00150 CLU::buildHashMaps(EST *est) {
00151     ASSERT ( est != NULL );
00152     ASSERT ( est->getCustomData().get() != NULL );
00153     
00154     CLUESTData *customData =
00155         dynamic_cast<CLUESTData*>(est->getCustomData().get());
00156     ASSERT ( customData != NULL );
00157     if (customData->referenceHashMap != NULL) {
00158         
00159         
00160         return;
00161     }
00162     
00163     std::string sequence = est->getSequence();
00164     ASSERT ( sequence.size() > 0 );
00165     
00166     
00167     createCLUHashMap(customData->referenceHashMap, sequence.c_str());
00168     
00169     std::reverse(sequence.begin(), sequence.end());
00170     const char Complements[] = {'T', 'C', 'G', 'A'};
00171     const int SeqLen = (int) sequence.size();
00172     for(int i = 0; (i < SeqLen); i++) {
00173         sequence[i] = Complements[(int) CharToInt[(int) sequence[i]]];
00174     }
00175     
00176     createCLUHashMap(customData->complementHashMap, sequence.c_str());
00177 }
00178 
00179 void
00180 CLU::createCLUHashMap(int* &hashMap, const char *sequence) {
00181     
00182     const int MapSize = (int) pow(4.0, wordSize);
00183     if (hashMap == NULL) {
00184         hashMap = new int[MapSize];
00185     }
00186     
00187     memset(hashMap, 0, sizeof(int) * MapSize);
00188     
00189     ASSERT ( sequence != NULL );
00190     int hash = 0;
00191     int i;
00192     for(i = 0; (i < wordSize); i++) {
00193         hash <<= 2;
00194         hash  += CharToInt[(int) sequence[i]];
00195     }
00196     
00197     hashMap[hash]++;
00198     
00199     
00200     
00201     
00202     const int SeqLen  = (int) strlen(sequence) - wordSize;
00203     
00204     
00205     
00206     const int BitMask = (1 << (wordSize * 2)) - 1;
00207     for (; (i < SeqLen); i++) {
00208         hash <<= 2;
00209         hash  += CharToInt[(int) sequence[i]];
00210         hash  &= BitMask;
00211         hashMap[hash]++;
00212     }
00213     
00214     
00215     
00216     filterHashMap(hashMap, SeqLen);
00217 }
00218 
00219 void
00220 CLU::filterHashMap(int *hashMap, const int sequenceLength) {
00221     
00222     
00223     
00224     
00225     
00226     
00227     
00228     
00229     char *LowCompSeq = reinterpret_cast<char*>(alloca(wordSize * 3));
00230     memset(LowCompSeq + wordSize * 0, 1, wordSize);
00231     memset(LowCompSeq + wordSize * 1, 2, wordSize);
00232     memset(LowCompSeq + wordSize * 2, 3, wordSize);
00233 
00234     
00235     
00236     int hash         = 0;
00237     hashMap[hash]    = 0;
00238     const int SeqLen = wordSize * 3;
00239     
00240     
00241     
00242     const int BitMask= (1 << (wordSize * 2)) - 1;
00243     for(int i = 0; (i < SeqLen); i++) {
00244         hash <<= 2;
00245         hash  += LowCompSeq[i];
00246         hash  &= BitMask;
00247         
00248         hashMap[hash] = 0;
00249     }
00250     
00251     const int MapSize         = (int) pow(4.0, wordSize);
00252     const int AbundanceFactor = sequenceLength / abundanceFraction;
00253     for(int i = 0; (i < MapSize); i++) {
00254         if (!hashMap[i]) {
00255             
00256             continue;
00257         }
00258         if (hashMap[i] > AbundanceFactor) {
00259             hashMap[i] = 0;
00260         } else {
00261             hashMap[i] = 1;
00262         }
00263     }
00264 }
00265 
00266 int
00267 CLU::getSimilarity(const int* const hashMap,
00268                    const char* const sequence) const {
00269     
00270     int *cd = reinterpret_cast<int*>(alloca(sizeof(int) * frameSize));
00271     
00272     memset(cd, 0, sizeof(int) * frameSize);
00273     
00274     
00275     
00276     const int BitMask = (1 << (wordSize * 2)) - 1;
00277     
00278     const char* bp = sequence;
00279 
00280     
00281     int hash = 0;
00282     for(int i = 0; (i < wordSize); i++) {
00283         hash <<= 2;
00284         hash  += CharToInt[(int) *bp++];
00285     }
00286     
00287     
00288     std::queue<int> prevHashValues;
00289     prevHashValues.push(hash);
00290     
00291     
00292     unsigned int cdIndex = hashMap[hash];
00293     for(int i = 1; (i < frameSize); i++) {
00294         hash   <<= 2;
00295         hash    += CharToInt[(int) *bp++];
00296         hash    &= BitMask;
00297         cdIndex += hashMap[hash];
00298         
00299         prevHashValues.push(hash);
00300     }
00301     ASSERT ( (int) prevHashValues.size() == frameSize );
00302     
00303     
00304     const int SeqLen = (int) strlen(sequence) - frameSize;
00305     for(int i = frameSize; (i < SeqLen); i++) {
00306         
00307         cdIndex = (cdIndex >= (unsigned) frameSize) ? (frameSize - 1) : cdIndex;
00308         
00309         cd[cdIndex]++;
00310         
00311         
00312         hash <<= 2;
00313         hash  += CharToInt[(int) *bp++];
00314         hash  &= BitMask;
00315         
00316         prevHashValues.push(hash);
00317         
00318         cdIndex += hashMap[hash];
00319         cdIndex -= hashMap[prevHashValues.front()];
00320         
00321         prevHashValues.pop();
00322         
00323         ASSERT ( (int) prevHashValues.size() == frameSize );
00324     }
00325 
00326     
00327     
00328     
00329     
00330     cdIndex = (cdIndex >= (unsigned) frameSize) ? (frameSize - 1) : cdIndex;
00331     
00332     cd[cdIndex]++;
00333     
00334     
00335     
00336     
00337     const int Weight[] = { 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
00338                            0,  0,  1,  3,  7, 15, 27, 34, 52, 47};
00339     ASSERT ( sizeof(Weight) / sizeof(int) == frameSize);
00340     int similarity = 0;
00341     for(int i = 0; (i < frameSize); i++) {
00342         similarity += (cd[i] * Weight[i]);
00343     }
00344     
00345     return similarity;
00346 }
00347 
00348 float
00349 CLU::getMetric(const int estIdx) {
00350     if (estIdx == refESTidx) {
00351         
00352         
00353         return 0;
00354     }
00355     
00356     EST *estSq = EST::getEST(refESTidx);
00357     EST *estSb = EST::getEST(estIdx);
00358     
00359     if (strlen(estSb->getSequence()) < strlen(estSq->getSequence())) {
00360         
00361         EST *temp = estSq;
00362         estSq     = estSb;
00363         estSb     = temp;
00364     }
00365     
00366     buildHashMaps(estSq);
00367     
00368     
00369     CLUESTData *customData =
00370         dynamic_cast<CLUESTData*>(estSq->getCustomData().get());
00371     ASSERT ( customData != NULL );
00372     ASSERT ( customData->referenceHashMap  != NULL );
00373     ASSERT ( customData->complementHashMap != NULL );
00374     
00375     const char* otherSeq = estSb->getSequence();
00376     ASSERT ( otherSeq != NULL );
00377     const int refSim  = getSimilarity(customData->referenceHashMap, otherSeq);
00378     const int compSim = getSimilarity(customData->complementHashMap, otherSeq);
00379     
00380     return (float) ((refSim > compSim) ? refSim : compSim);
00381 }
00382 
00383 CLU::CLUESTData::~CLUESTData() {
00384     if (referenceHashMap != NULL) {
00385         delete [] referenceHashMap;
00386     }
00387     if (complementHashMap != NULL) {
00388         delete [] complementHashMap;
00389     }
00390 }
00391 
00392 #endif