macsimparser.cpp

Go to the documentation of this file.
00001 #include <qmessagebox.h>
00002 #include <qregexp.h>
00003 #include "macsimparser.h"
00004 #include "trapperdoc.h"
00005 #include "generaldata.h"
00006 #include "readdata.h"
00007 #include "featuredata.h"
00008 #include "qdir.h"
00009 #include <cassert>
00010 #include <iostream>
00011 #include <list>
00012 #include <string>
00013 #include <algorithm>
00014 
00015 
00016 using namespace std;
00017 
00018 MacsimParser::MacsimParser(DbEnv* dbenv, QString projectDir) : 
00019   env(dbenv), projDir(projectDir), current_doc(0), format_ok(false)
00020 {
00021   curr_row = 1;
00022   curr_al_num = 0;
00023   att_trans["sense"] = "strand";
00024   att_trans["seq-name"] = "name";
00025   att_trans["seq-data"] = "trappervector";
00026 
00027 //   cerr<<"MacsimParser: NB, due to long names of genomic seq in old files, they are currently being shortened"<<endl;
00028 }
00029 
00030 
00031 bool MacsimParser::startElement(const QString& namespaceURI,
00032                                 const QString& localName,
00033                                 const QString& qName,
00034                                 const QXmlAttributes& attributes)
00035 {
00036 
00037   if ( qName == "macsim" ) {
00038     format_ok = true;
00039     return true;
00040     
00041   }
00042   else if ( format_ok ) {
00043     
00044     return true;
00045   }
00046   return false;
00047   
00048 }
00049 
00050 bool MacsimParser::endElement( const QString & namespaceURI, const QString & localName, 
00051                                const QString & qName)
00052 {
00053   if ( qName == "alignment" ) {
00054     curr_al_num++;
00055     if ( curr_al_num % 100 == 0 ) {
00056       cerr<<curr_al_num<<" alignments parsed"<<endl;
00057     }
00058 
00059     curr_row = 1;
00060     //Delete old doc, if any
00061     delete current_doc;
00062     return true;
00063   }
00064   else if ( qName == "aln-name") {
00065     //Contig name
00066     
00067     //Create contig data
00068 
00069     QString contigName = curr_data;
00070 
00071 
00072     //Some bookkeeping to check if contig is already present
00073     QStringList contigNamesAlreadyPresent = contigNamesInProjectDir();
00074     if ( contigNamesAlreadyPresent.contains( contigName ) ) {
00075       QString message;
00076       message = QString("The contig name \"%1\" was specified in the import file, but that contig is already present in the project dir. Skipping this one").arg(contigName);
00077       QMessageBox::warning(0,"",message);
00078       return false;
00079     }
00080     //Create directory for contig
00081     QString fName = projDir + "/" + contigName;
00082                 
00083 
00084     //Create directories and databases for this contig
00085 
00086     current_doc = new TrapperDoc(env);
00087     current_doc->openDocument(fName);
00088 
00089     return true;
00090 
00091   }
00092   else if ( qName == "sequence") {
00093     //One ReadData parsed
00094 
00095     QXmlAttributes read_att;
00096 
00097     //Extract ReadData startPos, endPos etc, recalc corresponding feature coordinates
00098     //This also needs to be done for other features, when they are added
00099 
00100     //strand (group)
00101     if ( att_map["ReadData"]["strand"].toInt() > -1 ) {
00102       read_att.append("strand", namespaceURI, localName, "U");
00103     }
00104     else {
00105       read_att.append("strand", namespaceURI, localName, "C");
00106     }
00107     
00108     //name (seq-name)
00109     read_att.append("name", namespaceURI, localName, att_map["ReadData"]["name"]);
00110     
00111     //trappervector (seq-data)
00112     QString orig_str = att_map["DnaStrData"]["trappervector"].lower();
00113     
00114     //Parse out leading and trailing gaps, calculate start and endpos
00115     int indexleft = orig_str.find( QRegExp("[a-z]") );
00116     int indexright = orig_str.findRev( QRegExp("[a-z]") );
00117     assert( indexleft > -1 && indexright > -1 );
00118     /*
00119     -----aaatac---
00120     L = 14
00121     seqL = 6
00122     startPos = 5
00123     endPos = 10
00124     index = 5
00125     indexright = 10
00126 
00127     => startPos = indexleft, endPos = indexright
00128     Feature: endPos = seqL - 1 = indexright - indexleft
00129     */
00130     QString seq = orig_str.mid( indexleft, indexright - indexleft + 1 );
00131     seq.replace('u','t');
00132     seq.replace('-','*');
00133 
00134     read_att.append("startPos", namespaceURI, localName, QString().number(indexleft));
00135     read_att.append("endPos", namespaceURI, localName, QString().number(indexright));
00136     
00137     //Features, currently only DnaStrData
00138     QXmlAttributes dna_att;
00139     dna_att.append("startPos", namespaceURI, localName, "0");
00140     dna_att.append("endPos", namespaceURI, localName, QString().number(indexright - indexleft) );
00141     dna_att.append("trappervector", namespaceURI, localName, seq);
00142 
00143 
00144     //Create ReadData
00145 
00146     Database::Creator<ReadData> read_creator( current_doc, "ReadData" );
00147     read_creator.data()->readAttributes( read_att );
00148     read_creator.data()->setRow(curr_row);
00149     curr_row++;
00150     
00151     db_recno_t new_recno = read_creator.create( false );
00152 
00153     //Create FeatureData
00154 
00155     Database::Creator<DnaStrData> feat_creator( current_doc, "DnaStrData" );
00156     feat_creator.data()->readAttributes( dna_att );
00157     feat_creator.data()->setReadRecno( new_recno );
00158     feat_creator.create(false);
00159 
00160     //Hacking galore: create dummy vals for qual and chromat also
00161     QXmlAttributes chr_att;
00162     chr_att.append("startPos", namespaceURI, localName, "0");
00163     chr_att.append("endPos", namespaceURI, localName, QString().number(indexright - indexleft) );
00164 
00165     Database::Creator<ChromatData> chr_creator( current_doc, "ChromatData" );
00166     chr_creator.data()->readAttributes( chr_att );
00167     chr_creator.data()->setReadRecno( new_recno );
00168     chr_creator.create(false);
00169 
00170     QXmlAttributes qual_att;
00171     qual_att.append("startPos", namespaceURI, localName, "0");
00172     qual_att.append("endPos", namespaceURI, localName, QString().number(indexright - indexleft) );
00173     QString dummyqual;
00174     for( size_t i = 0; i < static_cast<size_t>(indexright - indexleft + 1); i++ ) {
00175       dummyqual.append(" 20");
00176     }
00177     qual_att.append("trappervector", namespaceURI, localName, dummyqual);
00178 
00179     Database::Creator<QualityData> qual_creator( current_doc, "QualityData" );
00180     qual_creator.data()->readAttributes( qual_att );
00181     qual_creator.data()->setReadRecno( new_recno );
00182     qual_creator.create(false);
00183 
00184     //Tag data
00185     
00186     for( size_t i = 0; i < tags.size(); i++ ) {
00187       assert( tags[i].size() == 5 );
00188       list<string> type_list = GeneralMaker::listRegistered();
00189       string type = tags[i][0].ascii();
00190 
00191       if ( find( type_list.begin(), type_list.end(), type ) == type_list.end() ) {
00192         tags[i][4] = type + " " + tags[i][4];
00193         type = "TagData";
00194         
00195       }
00196 //       cerr<<"type: "<<type<<endl;
00197       
00198       
00199       Database::Creator<TagData> tag_creator( current_doc, type );
00200       
00201       QXmlAttributes tag_att;
00202 //       tag_att.append( "startPos", namespaceURI, localName, QString().number( tags[i][1].toInt() - indexleft) );
00203 //       tag_att.append( "endPos", namespaceURI, localName, QString().number( tags[i][2].toInt() - indexleft) );
00204       tag_att.append( "startPos", namespaceURI, localName, QString().number( tags[i][1].toInt() - 1) );
00205       tag_att.append( "endPos", namespaceURI, localName, QString().number( tags[i][2].toInt() - 1) );
00206       tag_att.append( "score", namespaceURI, localName, tags[i][3] );
00207       tag_att.append( "info", namespaceURI, localName, tags[i][4] );
00208 
00209       tag_creator.data()->readAttributes( tag_att );
00210 
00211       tag_creator.data()->setReadRecno( new_recno );
00212       tag_creator.create(false);
00213       
00214     }
00215     
00216 
00217     //Clear old data
00218     att_map.clear();
00219     tags.clear();
00220     return true;
00221     
00222   }
00223   else if ( qName == "seq-name" || qName == "group" || qName == "sense" )  {
00224     //Fill attributes with name, strand, and sequence
00225 //     if ( qName == "seq-name" && curr_data.find('|') != -1 ) {
00226 //       curr_data = curr_data.section('|', 1, 1).section(' ', 0, 0);
00227 //     }
00228     
00229     
00230     att_map["ReadData"][ att_trans[qName] ] = curr_data;
00231 
00232     return true;
00233   }
00234   else if ( qName == "seq-data" ) {
00235     att_map["DnaStrData"][ att_trans[qName] ] = curr_data;
00236     return true;
00237   }
00238   else if ( qName == "ftype" ) {
00239     vector<QString> tmp;
00240     tags.push_back(tmp);
00241     tags[ tags.size() - 1 ].push_back(curr_data);
00242     
00243     return true;
00244   }
00245   else if ( qName == "fstart" || qName == "fstop" || qName == "fscore" || qName == "fnote" ) {
00246     tags[ tags.size() - 1 ].push_back(curr_data);
00247     return true;
00248   }
00249   
00250   return true;
00251 }
00252 
00253 bool MacsimParser::characters ( const QString & ch ) 
00254 {
00255   curr_data = ch;
00256   return true;
00257 }
00258 
00259 QStringList MacsimParser::contigNamesInProjectDir()
00260 {
00261     QString contigName;
00262     QDir d( projDir );
00263     Q_ASSERT( d.exists() );
00264     d.setFilter( QDir::Dirs | QDir::NoSymLinks );
00265     d.setSorting( QDir::Name );
00266     QStringList slist = d.entryList();
00267     return slist;
00268 }

Generated on Fri Jul 17 20:19:29 2009 for ngsview by  doxygen 1.5.1