00001 #ifndef EST_CPP
00002 #define EST_CPP
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 
00023 
00024 
00025 
00026 
00027 
00028 
00029 
00030 
00031 
00032 
00033 
00034 
00035 
00036 
00037 #include "EST.h"
00038 #include <string>
00039 #include <algorithm>
00040 #include <iostream>
00041 #include <cctype>
00042 #include <cstring>
00043 #include <cstdio>
00044 
00045 
00046 std::vector<EST*> EST::estList;
00047 
00048 
00049 size_t EST::maxESTlen = 0;
00050 
00051 EST::EST(const int idValue, const char *information, const char* seq,
00052          const int fileOffset) : id(idValue), offset(fileOffset),
00053                                  customData(NULL) {
00054     
00055     info       = EST::duplicate(information);
00056     sequence   = EST::duplicate(seq);
00057     similarity = 0;
00058     processed  = false;
00059 }
00060 
00061 EST::~EST() {
00062     unpopulate();
00063 }
00064 
00065 void
00066 EST::unpopulate() {
00067     if (info != NULL) {
00068         delete [] info;
00069     }
00070     if (sequence != NULL) {
00071         delete [] sequence;
00072     }
00073 }
00074 
00075 std::string
00076 EST::getLine(FILE *fastaFile) {
00077     char buffer[1024];
00078     std::string retVal;
00079     char *result = NULL;
00080     
00081     do {
00082         
00083         result = fgets(buffer, 1023, fastaFile);
00084         const int len = (int) strlen(buffer);
00085         if (buffer[len - 1] == '\n') {
00086             
00087             buffer[len - 1] = '\0';
00088             
00089             result = NULL;
00090         }
00091         
00092         retVal += buffer;
00093     } while (result != NULL);
00094 
00095     
00096     return retVal;
00097 }
00098 
00099 EST*
00100 EST::create(const int id, const char *info,
00101             const char* sequence, const long offset) {
00102     if (id != (int) estList.size()) {
00103         
00104         return NULL;
00105     }
00106     
00107     EST *newEST = new EST(id, info, sequence, offset);
00108     
00109     estList.push_back(newEST);
00110     
00111     return newEST;
00112 }
00113 
00114 EST*
00115 EST::create(FILE* fastaFile, int& lineNum, const bool maskBases) {
00116     
00117     if (feof(fastaFile) || ferror(fastaFile)) {
00118         
00119         return NULL;
00120     }
00121     
00122     
00123     const long offset = ftell(fastaFile);
00124     
00125     
00126     int headerChar;
00127     if ((headerChar = fgetc(fastaFile)) != '>') {
00128         
00129         ungetc(headerChar, fastaFile);
00130         return NULL;
00131     }
00132     
00133     
00134     
00135     
00136     std::string headerLine = getLine(fastaFile);
00137     lineNum++;
00138     
00139     
00140     
00141     std::string sequence;
00142     do {
00143         
00144         
00145         if ((headerChar = fgetc(fastaFile)) != EOF) {
00146             ungetc(headerChar, fastaFile);
00147             if (headerChar != '>') {
00148                 
00149                 sequence += getLine(fastaFile);
00150                 
00151                 lineNum++;
00152             }
00153         }
00154     } while (!feof(fastaFile) && !ferror(fastaFile) && (headerChar != '>'));
00155     
00156     if (!ferror(fastaFile)) {
00157         
00158         normalizeBases(sequence, maskBases);
00159         
00160         const char* const seqBP = sequence.c_str();
00161         
00162         maxESTlen = std::max(maxESTlen, strlen(seqBP));
00163         
00164         EST *est = new EST((int) estList.size(), headerLine.c_str(), seqBP,
00165                            offset);
00166         
00167         estList.push_back(est);
00168         
00169         return est;
00170     }
00171 
00172     
00173     return NULL;
00174 }
00175 
00176 void
00177 EST::deleteAllESTs() {
00178     const int ESTCount = (int) estList.size();
00179     for(int id = 0; (id < ESTCount); id++) {
00180         delete estList[id];
00181     }
00182     
00183     estList.clear();
00184 }
00185 
00186 void
00187 EST::deleteLastESTs(const int count) {
00188     for(int i = 0; ((estList.size() > 0) && (i < count)); i++) {
00189         
00190         delete estList.back();
00191         estList.pop_back();
00192     }
00193 }
00194 
00195 void
00196 EST::dumpESTList(std::ostream& os) {
00197     const int EstCount = (int) estList.size();
00198     for(int id = 0; (id < EstCount); id++) {
00199         
00200         estList[id]->dumpEST(os);
00201     }
00202 }
00203 
00204 void
00205 EST::dumpESTList(std::ostream& os, const bool processed) {
00206     const int EstCount = (int) estList.size();
00207     for(int id = 0; (id < EstCount); id++) {
00208         
00209         if (estList[id]->processed == processed) {
00210             
00211             estList[id]->dumpEST(os);
00212         }
00213     }
00214 }
00215 
00216 void
00217 EST::dumpEST(std::ostream& os) {
00218     const int LineSize = 100;
00219     os << ">";
00220     os << getInfo() << std::endl;
00221     
00222     
00223     const char *seq   = getSequence();
00224     const int  seqLen = (int) strlen(seq);
00225     for(int pos = 0; (pos < seqLen); pos++) {
00226         if ((pos > 0) && ((pos % LineSize) == 0)) {
00227             os << "\n";
00228         }
00229         os << seq[pos];
00230     }
00231     os << "\n";
00232 }
00233 
00234 char*
00235 EST::duplicate(const char *src) {
00236     char *copy = NULL;
00237     if (src != NULL) {
00238         const size_t len = strlen(src) + 1;
00239         copy = new char[len];
00240 #ifdef _WINDOWS
00241         strcpy_s(copy, len, src);
00242 #else
00243         strncpy(copy, src, len);
00244 #endif
00245     }
00246     return copy;
00247 }
00248 
00249 void
00250 EST::normalizeBases(std::string& sequence, const bool maskBases) {
00251     const size_t seqLen = sequence.size();
00252     const std::string LowCaseBases = "atcg";
00253     const std::string UpCaseBases  = "ATCG";
00254     
00255     for(size_t i = 0; (i < seqLen); i++) {
00256         size_t index = 0;
00257         
00258         char nt   = sequence[i];
00259         if ((index = LowCaseBases.find(nt)) != std::string::npos) {
00260             nt = (maskBases ? 'N' : UpCaseBases[index]);
00261         } else if (UpCaseBases.find(nt) == std::string::npos) {
00262             
00263             nt= 'N';
00264         }
00265         
00266         sequence[i] = nt;
00267     }
00268 }
00269 
00270 
00271 EST::EST() : id(-1), info(NULL), sequence(NULL), offset(-1), similarity(0) {}
00272 
00273 
00274 EST&
00275 EST::operator=(const EST&) {
00276     return *this;
00277 }
00278 
00279 int
00280 EST::getProcessedESTCount() {
00281     int count = 0;
00282     const int ESTCount = (int) estList.size();
00283     for(int id = 0; (id < ESTCount); id++) {
00284         if (estList[id]->processed) {
00285             count++;
00286         }
00287     }
00288     return count;
00289 }
00290 
00291 size_t
00292 EST::getMaxESTLen() {
00293     if (maxESTlen == 0) {
00294         
00295         
00296         const int ESTCount = (int) estList.size();
00297         for(int id = 0; (id < ESTCount); id++) {
00298             maxESTlen = std::max(maxESTlen, strlen(estList[id]->getSequence()));
00299         }
00300     }
00301     
00302     return maxESTlen;
00303 }
00304 
00305 #endif