mal_readonly.cc

Go to the documentation of this file.
00001 #include "mal_readonly.h"
00002 #include "generaldata.h"
00003 #include "readdata.h"
00004 #include "featuredata.h"
00005 #include <sstream>
00006 #include <cctype>
00007 
00008 using namespace std;
00009 
00010 
00011 MAl_Readonly::MAl_Readonly( size_t bufsize, set<db_recno_t>& recnolist, TrapperDoc* pdoc ) :
00012   buff_size(bufsize + 1),// +1 because buffID == 0 is unused
00013   num_seq( recnolist.size() ),
00014   doc(pdoc),
00015   selectedReads(recnolist)
00016 {
00017   //Init the bookkeeping arrays, the rest will be automatically filled as the reads are used
00018 
00019   //These guys are of the buffer size
00020   seqs.reserve( buff_size );
00021   quals.reserve( buff_size );
00022   DNPs.reserve( buff_size );//NOTA BENE, vector<bool> is special case...
00023   tags.reserve( buff_size );
00024   time_course.reserve( buff_size );
00025 
00026   names.reserve( buff_size );
00027   headers.reserve( buff_size );
00028   mates.reserve( buff_size );
00029   strands.reserve( buff_size );
00030   
00031 
00032   //New deal - NOT always kept in memory...
00033   seq_rows.reserve( buff_size );
00034   seq_begin_global.reserve( buff_size );
00035   seq_end_global.reserve( buff_size );//Unnecessary?? We have the sizes of seqs...
00036   seq_beginGood.reserve( buff_size );//NB, not global!
00037   seq_endGood.reserve( buff_size );//NB, not global!
00038   mate_lengths.reserve( buff_size );
00039 
00040   //Buffer stuff
00041   ID_to_dbID.reserve( num_seq );//Should be of actual data set size
00042   ID_to_buffID.reserve( num_seq );//Ditto
00043   buffID_to_dbID.reserve( buff_size );//buffer size
00044   buffID_to_ID.reserve( buff_size );//buffer size
00045   put_in_db.reserve( buff_size );//buffer size
00046   
00047   for( set<db_recno_t>::iterator it = recnolist.begin(); it != recnolist.end(); ++it ) {
00048     ID_to_dbID.push_back( *it );
00049     ID_to_buffID.push_back( 0 );
00050   }
00051 
00052   //Init all containers
00053   for( size_t i = 0; i < buff_size; i++ ) {
00054 
00055     buffID_to_dbID.push_back( 0 );
00056     buffID_to_ID.push_back( 0 );
00057     put_in_db.push_back(false);
00058 
00059     //NOTA BENE, ONLY START, END AND ROW so far...
00060     
00061     seq_rows.push_back( 0 );//Need better initval...
00062     seq_begin_global.push_back( 0 );//Ditto
00063     seq_end_global.push_back( 0 );//Ditto
00064 
00065     seqs.push_back( vector<base_t>() );
00066     quals.push_back( vector<qual_t>() );
00067     time_course.push_back( vector<double>() );
00068     DNPs.push_back( vector<dnp_struct>() );
00069     tags.push_back( multimap<size_t, tag_struct>() );
00070     names.push_back( string() );
00071     mates.push_back( string() );
00072     strands.push_back( string() );
00073     
00074 
00075     seq_beginGood.push_back( 0 );
00076     seq_endGood.push_back( 0 );
00077     mate_lengths.push_back( 0 );
00078     
00079   }
00080   cerr<<"buff_size: "<<buff_size<<endl;
00081 }
00082 
00083 MAl_Readonly::~MAl_Readonly()
00084 {
00085   //Nothing to be done here...
00086 }
00087 
00088 
00089 //Public methods - common
00090 
00091 size_t MAl_Readonly::get_num_seq()
00092 {
00093   return num_seq;
00094 }
00095 
00096 std::string MAl_Readonly::get_name( size_t ID )
00097 {
00098   ID = get_buffID( ID );
00099   
00100   assert( ID < names.size() );
00101   
00102   return names[ ID ];
00103   
00104 }
00105 
00106 std::string MAl_Readonly::get_header( size_t ID )
00107 {
00108   ID = get_buffID( ID );
00109   
00110   assert( ID < headers.size() );
00111   
00112   return headers[ ID ];
00113   
00114 }
00115 
00116 std::string MAl_Readonly::get_seq( size_t ID )
00117 {
00118   ID = get_buffID( ID );
00119   
00120   assert( ID < seqs.size() );
00121   
00122 //   return seqs[ ID ];
00123   return string(seqs[ ID ].begin(), seqs[ ID ].end());
00124   
00125 }
00126 
00127 string MAl_Readonly::get_strand( size_t ID )
00128 {
00129   ID = get_buffID( ID );
00130   
00131   assert( ID < strands.size() );
00132   
00133   return strands[ ID ];
00134   
00135 }
00136 
00137 size_t MAl_Readonly::get_len( size_t ID)
00138 {
00139   ID = get_buffID( ID );
00140   
00141   assert( ID < seqs.size() );
00142   
00143   return seqs[ ID ].size();
00144   
00145 }
00146 
00147 void MAl_Readonly::select_read( size_t ID, bool status )
00148 {
00149   
00150   assert( ID < ID_to_dbID.size() );
00151   
00152   db_recno_t recno = ID_to_dbID[ ID ];
00153   if ( recno == 0 ) return;
00154 
00155   set<db_recno_t>::iterator pos;
00156   if ( status == true )
00157     selectedReads.insert( recno );
00158   else if ( ( pos = selectedReads.find( recno )) != selectedReads.end() ) {
00159     selectedReads.erase( pos );
00160   }
00161   
00162   
00163 }
00164 
00165 
00166 //Public methods - separate interfaces
00167 
00168 size_t MAl_Readonly::get_seq_row( size_t ID )
00169 {
00170   ID = get_buffID( ID );
00171 
00172   assert( ID < seq_rows.size() );
00173   
00174   return seq_rows[ ID ];
00175   
00176 }
00177 
00178 
00179 size_t MAl_Readonly::get_seq_begin_global( size_t ID )
00180 // int MAl_Readonly::get_seq_begin_global( size_t ID )
00181 {
00182   ID = get_buffID( ID );
00183 
00184   assert( ID < seq_begin_global.size() );
00185   
00186   return seq_begin_global[ ID ];
00187                            
00188 }
00189 
00190 
00191 size_t MAl_Readonly::get_seq_end( size_t ID )
00192 {
00193   
00194   return get_seq_end_global( ID ) - get_seq_begin_global( ID );
00195 }
00196 
00197 size_t MAl_Readonly::get_seq_end_global( size_t ID )
00198 {
00199   ID = get_buffID( ID );
00200 
00201   assert( ID < seq_end_global.size() );
00202   
00203   return seq_end_global[ ID ];
00204   
00205 }
00206 
00207 
00208 size_t MAl_Readonly::get_beginGood( size_t ID )
00209 {
00210   ID = get_buffID( ID );
00211 
00212   assert( ID < seq_beginGood.size() );
00213   
00214   return seq_beginGood[ ID ];
00215 }
00216 
00217 size_t MAl_Readonly::get_beginGood_global( size_t ID )
00218 {
00219   return get_beginGood( ID ) + get_seq_begin_global( ID );
00220   
00221 }
00222 
00223 
00224 size_t MAl_Readonly::get_endGood( size_t ID )
00225 {
00226   ID = get_buffID( ID );
00227 
00228   assert( ID < seq_endGood.size() );
00229   
00230   return seq_endGood[ ID ];
00231   
00232 }
00233 
00234 size_t MAl_Readonly::get_endGood_global( size_t ID )
00235 {
00236   return get_endGood( ID ) + get_seq_begin_global( ID );
00237 }
00238 
00239 base_t MAl_Readonly::get_base( size_t ID, size_t index )
00240 {
00241   
00242   ID = get_buffID( ID );
00243   
00244   assert( ID < seqs.size() );
00245 
00246   if (index >= seqs[ ID ].size() ) {
00247     cout << flush;
00248     cerr << "index (local) = " << index << ", seqs[ ID ].size() = " << seqs[ ID ].size() << endl;
00249 
00250   }
00251   assert( index < seqs[ ID ].size() );
00252   
00253   base_t tmp = seqs[ ID ][ index ];
00254   if (tmp == '*') {
00255     tmp = '-';
00256   }
00257   
00258   return tmp;
00259 //  return seqs[ ID ][ index ];
00260 }
00261 
00262 base_t MAl_Readonly::get_base_global( size_t ID, size_t index )
00263 {
00264   return get_base( ID, index - get_seq_begin_global( ID ) );
00265   
00266 }
00267 
00268 qual_t MAl_Readonly::get_qual( size_t ID, size_t index )
00269 {
00270   ID = get_buffID( ID );
00271   
00272   assert( ID < quals.size() );
00273   assert( index < quals[ ID ].size() );
00274   
00275   return quals[ ID ][ index ];
00276   
00277 }
00278 
00279 qual_t MAl_Readonly::get_qual_global( size_t ID, size_t index )
00280 {
00281   
00282   return get_qual( ID, index - get_seq_begin_global( ID ) );
00283   
00284 }
00285 
00286 
00287 bool MAl_Readonly::is_DNP(size_t ID, size_t index)
00288 {
00289   ID = get_buffID( ID );
00290 
00291   if ( index >= DNPs[ ID ].size() ) {
00292     
00293   }
00294 
00295   assert( ID < DNPs.size() );
00296   assert( index < DNPs[ ID ].size() );
00297   
00298   return DNPs[ ID ][ index ].isDNP;
00299 }
00300 
00301 bool MAl_Readonly::is_DNP_global(size_t ID, size_t index)
00302 {
00303 
00304   return is_DNP( ID, index - get_seq_begin_global( ID ) );
00305 }
00306 
00307 int MAl_Readonly::get_DNP_ID(size_t ID, size_t index)
00308 {
00309   ID = get_buffID( ID );
00310 
00311   assert( ID < DNPs.size() );
00312   assert( index < DNPs[ ID ].size() );
00313   
00314   return DNPs[ ID ][ index ].ID;  
00315 }
00316 
00317 int MAl_Readonly::get_DNP_ID_global(size_t ID, size_t index)
00318 {
00319   return get_DNP_ID( ID, index - get_seq_begin_global( ID ) ); 
00320 }
00321 
00322 int MAl_Readonly::get_DNP_type(size_t ID, size_t index)
00323 {
00324   ID = get_buffID( ID );
00325 
00326   assert( ID < DNPs.size() );
00327   assert( index < DNPs[ ID ].size() );
00328   
00329   return DNPs[ ID ][ index ].type;
00330 }
00331 
00332 int MAl_Readonly::get_DNP_type_global(size_t ID, size_t index)
00333 {
00334   return get_DNP_type( ID, index - get_seq_begin_global( ID ) );  
00335 }
00336 
00337 
00338 //Private methods
00339 
00340 size_t MAl_Readonly::get_buffID(size_t ID)
00341 {
00342   
00343 
00344   assert( ID < ID_to_buffID.size() );
00345   size_t buffID = ID_to_buffID[ ID ];
00346 
00347   if ( buffID == 0 ) {
00348     //Seq is not present in buffer and thus has to be read into buffer,
00349     //OR a new seq is being created
00350     buffID = next_buffID();
00351 
00352     if ( buffID >= put_in_db.size() ) {
00353       cerr<<"buffID: "<<buffID<<endl;
00354       cerr<<"put_in_db.size(): "<<put_in_db.size()<<endl;
00355     }
00356     
00357     assert( buffID < put_in_db.size() );
00358     
00359     if ( put_in_db[ buffID ] ) {
00360       
00361       flush_buffer( buffID );
00362     }
00363     
00364     //If dbID is 0, a new seq is being created and nothing has to be read from DB
00365     //This is taken care of in read_from_db()
00366 
00367     read_from_db( buffID, ID );
00368 
00369     
00370   }
00371   assert( buffID != 0 );
00372   return buffID;
00373 
00374 }
00375 
00376 size_t MAl_Readonly::next_buffID()
00377 {
00378 //   static size_t curr_buffID(static_cast<size_t>(-1));//Have to check this, but should work although size_t is unsigned
00379   static size_t curr_buffID(0);//ID == 0 is unused...
00380   ++curr_buffID;
00381   
00382   if ( curr_buffID == buff_size ) curr_buffID = 1;
00383   
00384   return curr_buffID;
00385   
00386 }
00387 
00388 void MAl_Readonly::flush_buffer( size_t buffID )
00389 {
00390   //Not implemented in read only MAl
00391   
00392 }
00393 
00394 void MAl_Readonly::read_from_db(size_t buffID, size_t ID)
00395 {
00396 
00397   db_recno_t recno = ID_to_dbID[ ID ];
00398 
00399   if ( recno != 0 ) {
00400 
00401     //NB: EXTREMELY important that DNP features and other features
00402     //that appear more than once on reads are read AFTER DnaStrData!!!
00403 
00404     read_seq_from_db( recno, buffID );
00405     read_feat_from_db( recno, buffID, "ReadMetaData");//Metadata
00406     read_feat_from_db( recno, buffID, "DnaStrData");//The seq
00407     read_feat_from_db( recno, buffID, "QualityData");//The qual
00408     read_feat_from_db( recno, buffID, "DnpData");//The dnps
00409     read_feat_from_db( recno, buffID, "TagData");//General tags
00410     
00411     
00412     /*
00413       read_feat_from_db( recno, buffID, "beginGood");//Maybe this should be added as data in the seq class...
00414       read_feat_from_db( recno, buffID, "endGood");//Ditto
00415       read_feat_from_db( recno, buffID, "name");//Ditto
00416       read_feat_from_db( recno, buffID, "header");//Ditto
00417       read_feat_from_db( recno, buffID, "strand");//Ditto
00418       
00419     */
00420     
00421   }
00422   
00423 
00424   ID_to_buffID[ ID ] = buffID;
00425   buffID_to_dbID[ buffID ] = recno;
00426   buffID_to_ID[ buffID ] = ID;
00427   put_in_db[ buffID ] = true;
00428 }
00429 
00430 void MAl_Readonly::read_seq_from_db( db_recno_t recno, size_t buffID )
00431 {
00432 
00433   Database::PrimaryIterator<ReadData> read_it(doc, "ReadData");
00434 
00435   int ret_read = read_it.setFromRecno(recno);
00436   ReadData* r_test = (ret_read != DB_NOTFOUND) ? read_it.answer() : 0;
00437   assert( r_test );
00438 
00439   seq_begin_global[ buffID ] = r_test->startPos();
00440   seq_end_global[ buffID ] = r_test->endPos() + 1;//Index annoyance...
00441   seq_rows[ buffID ] = r_test->row();
00442 //   names[ buffID ] = r_test->name();
00443 //   mates[ buffID ] = r_test->mate();
00444 //   strands[ buffID ] = r_test->strand();
00445 //   seq_beginGood[ buffID ] = r_test->beginGood();
00446 //   seq_endGood[ buffID ] = r_test->endGood() + 1;
00447 //   mate_lengths[ buffID ] = r_test->mateLength();
00448 //   time_course[ buffID ] = r_test->tc_vec.stlVector();
00449   
00450 //   cerr<<"Reading seq: "<<endl;
00451 //   cerr<<"recno: "<<recno<<endl;
00452 //   cerr<<"r_test->startPos(): "<<r_test->startPos()<<endl;
00453 //   cerr<<"r_test->endPos(): "<<r_test->endPos()<<endl;
00454 
00455 }
00456 
00457 void MAl_Readonly::read_feat_from_db( db_recno_t recno, size_t buffID, const string& data_type_name)
00458 {
00459   Database::SecondaryIterator<FeatureData> feat_it("readRecno", doc, data_type_name);
00460   
00461   feat_it.key()->setReadRecno( recno );
00462 
00463   if ( feat_it.set() != 0 ) {
00464 
00465     return;
00466   }
00467 
00468   
00469   if ( DnaStrData* data = dynamic_cast<DnaStrData*>(feat_it.answer()) ) {
00470     seqs[buffID] = data->dnaVector.stlVector();
00471     //Have to init DNPs here to assert that DNP vec is of same size...
00472     DNPs[buffID] = vector<dnp_struct>(seqs[buffID].size(), dnp_struct());
00473   }
00474   else if ( QualityData* data = dynamic_cast<QualityData*>(feat_it.answer()) ) {
00475     quals[buffID] = data->qualityVector.stlVector();
00476   }
00477   else if ( DnpData* data = dynamic_cast<DnpData*>(feat_it.answer()) ) {
00478     
00479     do {
00480 
00481       DNPs[buffID][ data->startPos() ].isDNP = true;
00482       DNPs[buffID][ data->startPos() ].recno = data->getRecno();
00483       DNPs[buffID][ data->startPos() ].ID = data->get_dnpID();      
00484       DNPs[buffID][ data->startPos() ].type = data->get_dnp_type();      
00485 
00486     } while ( (feat_it.nextdup() == 0) && (data = dynamic_cast<DnpData*>(feat_it.answer())) );
00487   }
00488   else if ( TagData* data = dynamic_cast<TagData*>(feat_it.answer()) ) {
00489     do {
00490       ostringstream os;
00491       os << data->getScore() <<" "<< data->getInfo();
00492       tags[buffID].insert( pair<size_t, tag_struct>( data->endPos() + 1, tag_struct(data->startPos(), data->endPos() + 1, data_type_name, os.str(), data->getRecno()) ) );
00493       
00494     } while ( (feat_it.nextdup() == 0) && (data = dynamic_cast<TagData*>(feat_it.answer())) );
00495   }
00496   else if (ReadMetaData* data = dynamic_cast<ReadMetaData*>(feat_it.answer()) ) {
00497     
00498     names[ buffID ] = data->name();
00499     mates[ buffID ] = data->mate();
00500     strands[ buffID ] = data->strand();
00501     seq_beginGood[ buffID ] = data->beginGood();
00502     seq_endGood[ buffID ] = data->endGood() + 1;
00503     mate_lengths[ buffID ] = data->mateLength();
00504     time_course[ buffID ] = data->tc_vec.stlVector();
00505     
00506   }
00507   
00508 }
00509 
00510 
00511 
00512 
00513 base_t MAl_Readonly::comp_base(const char& base)
00514 {
00515   char tmp = toupper(base);
00516   if ( tmp == 'A') {
00517     return base + 19;
00518   }
00519   else if ( tmp == 'T' ) {
00520     return base - 19;    
00521   }
00522   else if ( tmp == 'G' ) {
00523     return base - 4;    
00524   }
00525   else if ( tmp == 'C' ) {
00526     return base + 4;    
00527   }
00528   else {
00529     return base;
00530   }
00531 }
00532 
00533 double MAl_Readonly::get_max_expression(size_t ID)
00534 {
00535   ID = get_buffID( ID );
00536 
00537   assert( ID < time_course.size() );
00538   
00539   double tc_max(0.000001);
00540   bool maxfound(false);
00541 //   cerr<<names[ ID ]<<endl;
00542   
00543   for( size_t i = 0; i < time_course[ID].size(); i++ ) {
00544     if ( time_course[ID][i] > tc_max ) {
00545       tc_max = time_course[ID][i];
00546       maxfound = true;
00547     }
00548   }
00549   if ( maxfound ) {
00550 //     cerr<<"yo, tc_max: "<<tc_max<<endl;
00551     return tc_max;
00552   }
00553 
00554   return 0.0;
00555 }
00556 
00557 size_t MAl_Readonly::get_num_expression_points(size_t ID)
00558 {
00559   ID = get_buffID( ID );
00560 
00561   assert( ID < time_course.size() );
00562 
00563   return time_course[ID].size();
00564 }
00565 
00566 double MAl_Readonly::get_expression(size_t ID, size_t expression_index)
00567 {
00568   ID = get_buffID( ID );
00569 
00570   assert( ID < time_course.size() );
00571   assert( expression_index < time_course[ID].size() );
00572 
00573   return time_course[ID][expression_index];
00574 }
00575 

Generated on Fri Jul 17 20:19:29 2009 for ngsview by  doxygen 1.5.1