00001 #include "mal_readonly.h"
00002 #include "generaldata.h"
00003 #include "readdata.h"
00004 #include "featuredata.h"
00005 #include <sstream>
00006 #include <cctype>
00007
00008 using namespace std;
00009
00010
00011 MAl_Readonly::MAl_Readonly( size_t bufsize, set<db_recno_t>& recnolist, TrapperDoc* pdoc ) :
00012 buff_size(bufsize + 1),
00013 num_seq( recnolist.size() ),
00014 doc(pdoc),
00015 selectedReads(recnolist)
00016 {
00017
00018
00019
00020 seqs.reserve( buff_size );
00021 quals.reserve( buff_size );
00022 DNPs.reserve( buff_size );
00023 tags.reserve( buff_size );
00024 time_course.reserve( buff_size );
00025
00026 names.reserve( buff_size );
00027 headers.reserve( buff_size );
00028 mates.reserve( buff_size );
00029 strands.reserve( buff_size );
00030
00031
00032
00033 seq_rows.reserve( buff_size );
00034 seq_begin_global.reserve( buff_size );
00035 seq_end_global.reserve( buff_size );
00036 seq_beginGood.reserve( buff_size );
00037 seq_endGood.reserve( buff_size );
00038 mate_lengths.reserve( buff_size );
00039
00040
00041 ID_to_dbID.reserve( num_seq );
00042 ID_to_buffID.reserve( num_seq );
00043 buffID_to_dbID.reserve( buff_size );
00044 buffID_to_ID.reserve( buff_size );
00045 put_in_db.reserve( buff_size );
00046
00047 for( set<db_recno_t>::iterator it = recnolist.begin(); it != recnolist.end(); ++it ) {
00048 ID_to_dbID.push_back( *it );
00049 ID_to_buffID.push_back( 0 );
00050 }
00051
00052
00053 for( size_t i = 0; i < buff_size; i++ ) {
00054
00055 buffID_to_dbID.push_back( 0 );
00056 buffID_to_ID.push_back( 0 );
00057 put_in_db.push_back(false);
00058
00059
00060
00061 seq_rows.push_back( 0 );
00062 seq_begin_global.push_back( 0 );
00063 seq_end_global.push_back( 0 );
00064
00065 seqs.push_back( vector<base_t>() );
00066 quals.push_back( vector<qual_t>() );
00067 time_course.push_back( vector<double>() );
00068 DNPs.push_back( vector<dnp_struct>() );
00069 tags.push_back( multimap<size_t, tag_struct>() );
00070 names.push_back( string() );
00071 mates.push_back( string() );
00072 strands.push_back( string() );
00073
00074
00075 seq_beginGood.push_back( 0 );
00076 seq_endGood.push_back( 0 );
00077 mate_lengths.push_back( 0 );
00078
00079 }
00080 cerr<<"buff_size: "<<buff_size<<endl;
00081 }
00082
00083 MAl_Readonly::~MAl_Readonly()
00084 {
00085
00086 }
00087
00088
00089
00090
00091 size_t MAl_Readonly::get_num_seq()
00092 {
00093 return num_seq;
00094 }
00095
00096 std::string MAl_Readonly::get_name( size_t ID )
00097 {
00098 ID = get_buffID( ID );
00099
00100 assert( ID < names.size() );
00101
00102 return names[ ID ];
00103
00104 }
00105
00106 std::string MAl_Readonly::get_header( size_t ID )
00107 {
00108 ID = get_buffID( ID );
00109
00110 assert( ID < headers.size() );
00111
00112 return headers[ ID ];
00113
00114 }
00115
00116 std::string MAl_Readonly::get_seq( size_t ID )
00117 {
00118 ID = get_buffID( ID );
00119
00120 assert( ID < seqs.size() );
00121
00122
00123 return string(seqs[ ID ].begin(), seqs[ ID ].end());
00124
00125 }
00126
00127 string MAl_Readonly::get_strand( size_t ID )
00128 {
00129 ID = get_buffID( ID );
00130
00131 assert( ID < strands.size() );
00132
00133 return strands[ ID ];
00134
00135 }
00136
00137 size_t MAl_Readonly::get_len( size_t ID)
00138 {
00139 ID = get_buffID( ID );
00140
00141 assert( ID < seqs.size() );
00142
00143 return seqs[ ID ].size();
00144
00145 }
00146
00147 void MAl_Readonly::select_read( size_t ID, bool status )
00148 {
00149
00150 assert( ID < ID_to_dbID.size() );
00151
00152 db_recno_t recno = ID_to_dbID[ ID ];
00153 if ( recno == 0 ) return;
00154
00155 set<db_recno_t>::iterator pos;
00156 if ( status == true )
00157 selectedReads.insert( recno );
00158 else if ( ( pos = selectedReads.find( recno )) != selectedReads.end() ) {
00159 selectedReads.erase( pos );
00160 }
00161
00162
00163 }
00164
00165
00166
00167
00168 size_t MAl_Readonly::get_seq_row( size_t ID )
00169 {
00170 ID = get_buffID( ID );
00171
00172 assert( ID < seq_rows.size() );
00173
00174 return seq_rows[ ID ];
00175
00176 }
00177
00178
00179 size_t MAl_Readonly::get_seq_begin_global( size_t ID )
00180
00181 {
00182 ID = get_buffID( ID );
00183
00184 assert( ID < seq_begin_global.size() );
00185
00186 return seq_begin_global[ ID ];
00187
00188 }
00189
00190
00191 size_t MAl_Readonly::get_seq_end( size_t ID )
00192 {
00193
00194 return get_seq_end_global( ID ) - get_seq_begin_global( ID );
00195 }
00196
00197 size_t MAl_Readonly::get_seq_end_global( size_t ID )
00198 {
00199 ID = get_buffID( ID );
00200
00201 assert( ID < seq_end_global.size() );
00202
00203 return seq_end_global[ ID ];
00204
00205 }
00206
00207
00208 size_t MAl_Readonly::get_beginGood( size_t ID )
00209 {
00210 ID = get_buffID( ID );
00211
00212 assert( ID < seq_beginGood.size() );
00213
00214 return seq_beginGood[ ID ];
00215 }
00216
00217 size_t MAl_Readonly::get_beginGood_global( size_t ID )
00218 {
00219 return get_beginGood( ID ) + get_seq_begin_global( ID );
00220
00221 }
00222
00223
00224 size_t MAl_Readonly::get_endGood( size_t ID )
00225 {
00226 ID = get_buffID( ID );
00227
00228 assert( ID < seq_endGood.size() );
00229
00230 return seq_endGood[ ID ];
00231
00232 }
00233
00234 size_t MAl_Readonly::get_endGood_global( size_t ID )
00235 {
00236 return get_endGood( ID ) + get_seq_begin_global( ID );
00237 }
00238
00239 base_t MAl_Readonly::get_base( size_t ID, size_t index )
00240 {
00241
00242 ID = get_buffID( ID );
00243
00244 assert( ID < seqs.size() );
00245
00246 if (index >= seqs[ ID ].size() ) {
00247 cout << flush;
00248 cerr << "index (local) = " << index << ", seqs[ ID ].size() = " << seqs[ ID ].size() << endl;
00249
00250 }
00251 assert( index < seqs[ ID ].size() );
00252
00253 base_t tmp = seqs[ ID ][ index ];
00254 if (tmp == '*') {
00255 tmp = '-';
00256 }
00257
00258 return tmp;
00259
00260 }
00261
00262 base_t MAl_Readonly::get_base_global( size_t ID, size_t index )
00263 {
00264 return get_base( ID, index - get_seq_begin_global( ID ) );
00265
00266 }
00267
00268 qual_t MAl_Readonly::get_qual( size_t ID, size_t index )
00269 {
00270 ID = get_buffID( ID );
00271
00272 assert( ID < quals.size() );
00273 assert( index < quals[ ID ].size() );
00274
00275 return quals[ ID ][ index ];
00276
00277 }
00278
00279 qual_t MAl_Readonly::get_qual_global( size_t ID, size_t index )
00280 {
00281
00282 return get_qual( ID, index - get_seq_begin_global( ID ) );
00283
00284 }
00285
00286
00287 bool MAl_Readonly::is_DNP(size_t ID, size_t index)
00288 {
00289 ID = get_buffID( ID );
00290
00291 if ( index >= DNPs[ ID ].size() ) {
00292
00293 }
00294
00295 assert( ID < DNPs.size() );
00296 assert( index < DNPs[ ID ].size() );
00297
00298 return DNPs[ ID ][ index ].isDNP;
00299 }
00300
00301 bool MAl_Readonly::is_DNP_global(size_t ID, size_t index)
00302 {
00303
00304 return is_DNP( ID, index - get_seq_begin_global( ID ) );
00305 }
00306
00307 int MAl_Readonly::get_DNP_ID(size_t ID, size_t index)
00308 {
00309 ID = get_buffID( ID );
00310
00311 assert( ID < DNPs.size() );
00312 assert( index < DNPs[ ID ].size() );
00313
00314 return DNPs[ ID ][ index ].ID;
00315 }
00316
00317 int MAl_Readonly::get_DNP_ID_global(size_t ID, size_t index)
00318 {
00319 return get_DNP_ID( ID, index - get_seq_begin_global( ID ) );
00320 }
00321
00322 int MAl_Readonly::get_DNP_type(size_t ID, size_t index)
00323 {
00324 ID = get_buffID( ID );
00325
00326 assert( ID < DNPs.size() );
00327 assert( index < DNPs[ ID ].size() );
00328
00329 return DNPs[ ID ][ index ].type;
00330 }
00331
00332 int MAl_Readonly::get_DNP_type_global(size_t ID, size_t index)
00333 {
00334 return get_DNP_type( ID, index - get_seq_begin_global( ID ) );
00335 }
00336
00337
00338
00339
00340 size_t MAl_Readonly::get_buffID(size_t ID)
00341 {
00342
00343
00344 assert( ID < ID_to_buffID.size() );
00345 size_t buffID = ID_to_buffID[ ID ];
00346
00347 if ( buffID == 0 ) {
00348
00349
00350 buffID = next_buffID();
00351
00352 if ( buffID >= put_in_db.size() ) {
00353 cerr<<"buffID: "<<buffID<<endl;
00354 cerr<<"put_in_db.size(): "<<put_in_db.size()<<endl;
00355 }
00356
00357 assert( buffID < put_in_db.size() );
00358
00359 if ( put_in_db[ buffID ] ) {
00360
00361 flush_buffer( buffID );
00362 }
00363
00364
00365
00366
00367 read_from_db( buffID, ID );
00368
00369
00370 }
00371 assert( buffID != 0 );
00372 return buffID;
00373
00374 }
00375
00376 size_t MAl_Readonly::next_buffID()
00377 {
00378
00379 static size_t curr_buffID(0);
00380 ++curr_buffID;
00381
00382 if ( curr_buffID == buff_size ) curr_buffID = 1;
00383
00384 return curr_buffID;
00385
00386 }
00387
00388 void MAl_Readonly::flush_buffer( size_t buffID )
00389 {
00390
00391
00392 }
00393
00394 void MAl_Readonly::read_from_db(size_t buffID, size_t ID)
00395 {
00396
00397 db_recno_t recno = ID_to_dbID[ ID ];
00398
00399 if ( recno != 0 ) {
00400
00401
00402
00403
00404 read_seq_from_db( recno, buffID );
00405 read_feat_from_db( recno, buffID, "ReadMetaData");
00406 read_feat_from_db( recno, buffID, "DnaStrData");
00407 read_feat_from_db( recno, buffID, "QualityData");
00408 read_feat_from_db( recno, buffID, "DnpData");
00409 read_feat_from_db( recno, buffID, "TagData");
00410
00411
00412
00413
00414
00415
00416
00417
00418
00419
00420
00421 }
00422
00423
00424 ID_to_buffID[ ID ] = buffID;
00425 buffID_to_dbID[ buffID ] = recno;
00426 buffID_to_ID[ buffID ] = ID;
00427 put_in_db[ buffID ] = true;
00428 }
00429
00430 void MAl_Readonly::read_seq_from_db( db_recno_t recno, size_t buffID )
00431 {
00432
00433 Database::PrimaryIterator<ReadData> read_it(doc, "ReadData");
00434
00435 int ret_read = read_it.setFromRecno(recno);
00436 ReadData* r_test = (ret_read != DB_NOTFOUND) ? read_it.answer() : 0;
00437 assert( r_test );
00438
00439 seq_begin_global[ buffID ] = r_test->startPos();
00440 seq_end_global[ buffID ] = r_test->endPos() + 1;
00441 seq_rows[ buffID ] = r_test->row();
00442
00443
00444
00445
00446
00447
00448
00449
00450
00451
00452
00453
00454
00455 }
00456
00457 void MAl_Readonly::read_feat_from_db( db_recno_t recno, size_t buffID, const string& data_type_name)
00458 {
00459 Database::SecondaryIterator<FeatureData> feat_it("readRecno", doc, data_type_name);
00460
00461 feat_it.key()->setReadRecno( recno );
00462
00463 if ( feat_it.set() != 0 ) {
00464
00465 return;
00466 }
00467
00468
00469 if ( DnaStrData* data = dynamic_cast<DnaStrData*>(feat_it.answer()) ) {
00470 seqs[buffID] = data->dnaVector.stlVector();
00471
00472 DNPs[buffID] = vector<dnp_struct>(seqs[buffID].size(), dnp_struct());
00473 }
00474 else if ( QualityData* data = dynamic_cast<QualityData*>(feat_it.answer()) ) {
00475 quals[buffID] = data->qualityVector.stlVector();
00476 }
00477 else if ( DnpData* data = dynamic_cast<DnpData*>(feat_it.answer()) ) {
00478
00479 do {
00480
00481 DNPs[buffID][ data->startPos() ].isDNP = true;
00482 DNPs[buffID][ data->startPos() ].recno = data->getRecno();
00483 DNPs[buffID][ data->startPos() ].ID = data->get_dnpID();
00484 DNPs[buffID][ data->startPos() ].type = data->get_dnp_type();
00485
00486 } while ( (feat_it.nextdup() == 0) && (data = dynamic_cast<DnpData*>(feat_it.answer())) );
00487 }
00488 else if ( TagData* data = dynamic_cast<TagData*>(feat_it.answer()) ) {
00489 do {
00490 ostringstream os;
00491 os << data->getScore() <<" "<< data->getInfo();
00492 tags[buffID].insert( pair<size_t, tag_struct>( data->endPos() + 1, tag_struct(data->startPos(), data->endPos() + 1, data_type_name, os.str(), data->getRecno()) ) );
00493
00494 } while ( (feat_it.nextdup() == 0) && (data = dynamic_cast<TagData*>(feat_it.answer())) );
00495 }
00496 else if (ReadMetaData* data = dynamic_cast<ReadMetaData*>(feat_it.answer()) ) {
00497
00498 names[ buffID ] = data->name();
00499 mates[ buffID ] = data->mate();
00500 strands[ buffID ] = data->strand();
00501 seq_beginGood[ buffID ] = data->beginGood();
00502 seq_endGood[ buffID ] = data->endGood() + 1;
00503 mate_lengths[ buffID ] = data->mateLength();
00504 time_course[ buffID ] = data->tc_vec.stlVector();
00505
00506 }
00507
00508 }
00509
00510
00511
00512
00513 base_t MAl_Readonly::comp_base(const char& base)
00514 {
00515 char tmp = toupper(base);
00516 if ( tmp == 'A') {
00517 return base + 19;
00518 }
00519 else if ( tmp == 'T' ) {
00520 return base - 19;
00521 }
00522 else if ( tmp == 'G' ) {
00523 return base - 4;
00524 }
00525 else if ( tmp == 'C' ) {
00526 return base + 4;
00527 }
00528 else {
00529 return base;
00530 }
00531 }
00532
00533 double MAl_Readonly::get_max_expression(size_t ID)
00534 {
00535 ID = get_buffID( ID );
00536
00537 assert( ID < time_course.size() );
00538
00539 double tc_max(0.000001);
00540 bool maxfound(false);
00541
00542
00543 for( size_t i = 0; i < time_course[ID].size(); i++ ) {
00544 if ( time_course[ID][i] > tc_max ) {
00545 tc_max = time_course[ID][i];
00546 maxfound = true;
00547 }
00548 }
00549 if ( maxfound ) {
00550
00551 return tc_max;
00552 }
00553
00554 return 0.0;
00555 }
00556
00557 size_t MAl_Readonly::get_num_expression_points(size_t ID)
00558 {
00559 ID = get_buffID( ID );
00560
00561 assert( ID < time_course.size() );
00562
00563 return time_course[ID].size();
00564 }
00565
00566 double MAl_Readonly::get_expression(size_t ID, size_t expression_index)
00567 {
00568 ID = get_buffID( ID );
00569
00570 assert( ID < time_course.size() );
00571 assert( expression_index < time_course[ID].size() );
00572
00573 return time_course[ID][expression_index];
00574 }
00575