00001 #ifndef EST_CPP
00002 #define EST_CPP
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037 #include "EST.h"
00038 #include <string>
00039 #include <algorithm>
00040 #include <iostream>
00041 #include <cctype>
00042 #include <cstring>
00043 #include <cstdio>
00044
00045
00046 std::vector<EST*> EST::estList;
00047
00048
00049 size_t EST::maxESTlen = 0;
00050
00051 EST::EST(const int idValue, const char *information, const char* seq,
00052 const int fileOffset) : id(idValue), offset(fileOffset),
00053 customData(NULL) {
00054
00055 info = EST::duplicate(information);
00056 sequence = EST::duplicate(seq);
00057 similarity = 0;
00058 processed = false;
00059 }
00060
00061 EST::~EST() {
00062 unpopulate();
00063 }
00064
00065 void
00066 EST::unpopulate() {
00067 if (info != NULL) {
00068 delete [] info;
00069 }
00070 if (sequence != NULL) {
00071 delete [] sequence;
00072 }
00073 }
00074
00075 std::string
00076 EST::getLine(FILE *fastaFile) {
00077 char buffer[1024];
00078 std::string retVal;
00079 char *result = NULL;
00080
00081 do {
00082
00083 result = fgets(buffer, 1023, fastaFile);
00084 const int len = (int) strlen(buffer);
00085 if (buffer[len - 1] == '\n') {
00086
00087 buffer[len - 1] = '\0';
00088
00089 result = NULL;
00090 }
00091
00092 retVal += buffer;
00093 } while (result != NULL);
00094
00095
00096 return retVal;
00097 }
00098
00099 EST*
00100 EST::create(const int id, const char *info,
00101 const char* sequence, const long offset) {
00102 if (id != (int) estList.size()) {
00103
00104 return NULL;
00105 }
00106
00107 EST *newEST = new EST(id, info, sequence, offset);
00108
00109 estList.push_back(newEST);
00110
00111 return newEST;
00112 }
00113
00114 EST*
00115 EST::create(FILE* fastaFile, int& lineNum, const bool maskBases) {
00116
00117 if (feof(fastaFile) || ferror(fastaFile)) {
00118
00119 return NULL;
00120 }
00121
00122
00123 const long offset = ftell(fastaFile);
00124
00125
00126 int headerChar;
00127 if ((headerChar = fgetc(fastaFile)) != '>') {
00128
00129 ungetc(headerChar, fastaFile);
00130 return NULL;
00131 }
00132
00133
00134
00135
00136 std::string headerLine = getLine(fastaFile);
00137 lineNum++;
00138
00139
00140
00141 std::string sequence;
00142 do {
00143
00144
00145 if ((headerChar = fgetc(fastaFile)) != EOF) {
00146 ungetc(headerChar, fastaFile);
00147 if (headerChar != '>') {
00148
00149 sequence += getLine(fastaFile);
00150
00151 lineNum++;
00152 }
00153 }
00154 } while (!feof(fastaFile) && !ferror(fastaFile) && (headerChar != '>'));
00155
00156 if (!ferror(fastaFile)) {
00157
00158 normalizeBases(sequence, maskBases);
00159
00160 const char* const seqBP = sequence.c_str();
00161
00162 maxESTlen = std::max(maxESTlen, strlen(seqBP));
00163
00164 EST *est = new EST((int) estList.size(), headerLine.c_str(), seqBP,
00165 offset);
00166
00167 estList.push_back(est);
00168
00169 return est;
00170 }
00171
00172
00173 return NULL;
00174 }
00175
00176 void
00177 EST::deleteAllESTs() {
00178 const int ESTCount = (int) estList.size();
00179 for(int id = 0; (id < ESTCount); id++) {
00180 delete estList[id];
00181 }
00182
00183 estList.clear();
00184 }
00185
00186 void
00187 EST::deleteLastESTs(const int count) {
00188 for(int i = 0; ((estList.size() > 0) && (i < count)); i++) {
00189
00190 delete estList.back();
00191 estList.pop_back();
00192 }
00193 }
00194
00195 void
00196 EST::dumpESTList(std::ostream& os) {
00197 const int EstCount = (int) estList.size();
00198 for(int id = 0; (id < EstCount); id++) {
00199
00200 estList[id]->dumpEST(os);
00201 }
00202 }
00203
00204 void
00205 EST::dumpESTList(std::ostream& os, const bool processed) {
00206 const int EstCount = (int) estList.size();
00207 for(int id = 0; (id < EstCount); id++) {
00208
00209 if (estList[id]->processed == processed) {
00210
00211 estList[id]->dumpEST(os);
00212 }
00213 }
00214 }
00215
00216 void
00217 EST::dumpEST(std::ostream& os) {
00218 const int LineSize = 100;
00219 os << ">";
00220 os << getInfo() << std::endl;
00221
00222
00223 const char *seq = getSequence();
00224 const int seqLen = (int) strlen(seq);
00225 for(int pos = 0; (pos < seqLen); pos++) {
00226 if ((pos > 0) && ((pos % LineSize) == 0)) {
00227 os << "\n";
00228 }
00229 os << seq[pos];
00230 }
00231 os << "\n";
00232 }
00233
00234 char*
00235 EST::duplicate(const char *src) {
00236 char *copy = NULL;
00237 if (src != NULL) {
00238 const size_t len = strlen(src) + 1;
00239 copy = new char[len];
00240 #ifdef _WINDOWS
00241 strcpy_s(copy, len, src);
00242 #else
00243 strncpy(copy, src, len);
00244 #endif
00245 }
00246 return copy;
00247 }
00248
00249 void
00250 EST::normalizeBases(std::string& sequence, const bool maskBases) {
00251 const size_t seqLen = sequence.size();
00252 const std::string LowCaseBases = "atcg";
00253 const std::string UpCaseBases = "ATCG";
00254
00255 for(size_t i = 0; (i < seqLen); i++) {
00256 size_t index = 0;
00257
00258 char nt = sequence[i];
00259 if ((index = LowCaseBases.find(nt)) != std::string::npos) {
00260 nt = (maskBases ? 'N' : UpCaseBases[index]);
00261 } else if (UpCaseBases.find(nt) == std::string::npos) {
00262
00263 nt= 'N';
00264 }
00265
00266 sequence[i] = nt;
00267 }
00268 }
00269
00270
00271 EST::EST() : id(-1), info(NULL), sequence(NULL), offset(-1), similarity(0) {}
00272
00273
00274 EST&
00275 EST::operator=(const EST&) {
00276 return *this;
00277 }
00278
00279 int
00280 EST::getProcessedESTCount() {
00281 int count = 0;
00282 const int ESTCount = (int) estList.size();
00283 for(int id = 0; (id < ESTCount); id++) {
00284 if (estList[id]->processed) {
00285 count++;
00286 }
00287 }
00288 return count;
00289 }
00290
00291 size_t
00292 EST::getMaxESTLen() {
00293 if (maxESTlen == 0) {
00294
00295
00296 const int ESTCount = (int) estList.size();
00297 for(int id = 0; (id < ESTCount); id++) {
00298 maxESTlen = std::max(maxESTlen, strlen(estList[id]->getSequence()));
00299 }
00300 }
00301
00302 return maxESTlen;
00303 }
00304
00305 #endif