00001 #include <qmessagebox.h>
00002 #include <qregexp.h>
00003 #include "macsimparser.h"
00004 #include "trapperdoc.h"
00005 #include "generaldata.h"
00006 #include "readdata.h"
00007 #include "featuredata.h"
00008 #include "qdir.h"
00009 #include <cassert>
00010 #include <iostream>
00011 #include <list>
00012 #include <string>
00013 #include <algorithm>
00014
00015
00016 using namespace std;
00017
00018 MacsimParser::MacsimParser(DbEnv* dbenv, QString projectDir) :
00019 env(dbenv), projDir(projectDir), current_doc(0), format_ok(false)
00020 {
00021 curr_row = 1;
00022 curr_al_num = 0;
00023 att_trans["sense"] = "strand";
00024 att_trans["seq-name"] = "name";
00025 att_trans["seq-data"] = "trappervector";
00026
00027
00028 }
00029
00030
00031 bool MacsimParser::startElement(const QString& namespaceURI,
00032 const QString& localName,
00033 const QString& qName,
00034 const QXmlAttributes& attributes)
00035 {
00036
00037 if ( qName == "macsim" ) {
00038 format_ok = true;
00039 return true;
00040
00041 }
00042 else if ( format_ok ) {
00043
00044 return true;
00045 }
00046 return false;
00047
00048 }
00049
00050 bool MacsimParser::endElement( const QString & namespaceURI, const QString & localName,
00051 const QString & qName)
00052 {
00053 if ( qName == "alignment" ) {
00054 curr_al_num++;
00055 if ( curr_al_num % 100 == 0 ) {
00056 cerr<<curr_al_num<<" alignments parsed"<<endl;
00057 }
00058
00059 curr_row = 1;
00060
00061 delete current_doc;
00062 return true;
00063 }
00064 else if ( qName == "aln-name") {
00065
00066
00067
00068
00069 QString contigName = curr_data;
00070
00071
00072
00073 QStringList contigNamesAlreadyPresent = contigNamesInProjectDir();
00074 if ( contigNamesAlreadyPresent.contains( contigName ) ) {
00075 QString message;
00076 message = QString("The contig name \"%1\" was specified in the import file, but that contig is already present in the project dir. Skipping this one").arg(contigName);
00077 QMessageBox::warning(0,"",message);
00078 return false;
00079 }
00080
00081 QString fName = projDir + "/" + contigName;
00082
00083
00084
00085
00086 current_doc = new TrapperDoc(env);
00087 current_doc->openDocument(fName);
00088
00089 return true;
00090
00091 }
00092 else if ( qName == "sequence") {
00093
00094
00095 QXmlAttributes read_att;
00096
00097
00098
00099
00100
00101 if ( att_map["ReadData"]["strand"].toInt() > -1 ) {
00102 read_att.append("strand", namespaceURI, localName, "U");
00103 }
00104 else {
00105 read_att.append("strand", namespaceURI, localName, "C");
00106 }
00107
00108
00109 read_att.append("name", namespaceURI, localName, att_map["ReadData"]["name"]);
00110
00111
00112 QString orig_str = att_map["DnaStrData"]["trappervector"].lower();
00113
00114
00115 int indexleft = orig_str.find( QRegExp("[a-z]") );
00116 int indexright = orig_str.findRev( QRegExp("[a-z]") );
00117 assert( indexleft > -1 && indexright > -1 );
00118
00119
00120
00121
00122
00123
00124
00125
00126
00127
00128
00129
00130 QString seq = orig_str.mid( indexleft, indexright - indexleft + 1 );
00131 seq.replace('u','t');
00132 seq.replace('-','*');
00133
00134 read_att.append("startPos", namespaceURI, localName, QString().number(indexleft));
00135 read_att.append("endPos", namespaceURI, localName, QString().number(indexright));
00136
00137
00138 QXmlAttributes dna_att;
00139 dna_att.append("startPos", namespaceURI, localName, "0");
00140 dna_att.append("endPos", namespaceURI, localName, QString().number(indexright - indexleft) );
00141 dna_att.append("trappervector", namespaceURI, localName, seq);
00142
00143
00144
00145
00146 Database::Creator<ReadData> read_creator( current_doc, "ReadData" );
00147 read_creator.data()->readAttributes( read_att );
00148 read_creator.data()->setRow(curr_row);
00149 curr_row++;
00150
00151 db_recno_t new_recno = read_creator.create( false );
00152
00153
00154
00155 Database::Creator<DnaStrData> feat_creator( current_doc, "DnaStrData" );
00156 feat_creator.data()->readAttributes( dna_att );
00157 feat_creator.data()->setReadRecno( new_recno );
00158 feat_creator.create(false);
00159
00160
00161 QXmlAttributes chr_att;
00162 chr_att.append("startPos", namespaceURI, localName, "0");
00163 chr_att.append("endPos", namespaceURI, localName, QString().number(indexright - indexleft) );
00164
00165 Database::Creator<ChromatData> chr_creator( current_doc, "ChromatData" );
00166 chr_creator.data()->readAttributes( chr_att );
00167 chr_creator.data()->setReadRecno( new_recno );
00168 chr_creator.create(false);
00169
00170 QXmlAttributes qual_att;
00171 qual_att.append("startPos", namespaceURI, localName, "0");
00172 qual_att.append("endPos", namespaceURI, localName, QString().number(indexright - indexleft) );
00173 QString dummyqual;
00174 for( size_t i = 0; i < static_cast<size_t>(indexright - indexleft + 1); i++ ) {
00175 dummyqual.append(" 20");
00176 }
00177 qual_att.append("trappervector", namespaceURI, localName, dummyqual);
00178
00179 Database::Creator<QualityData> qual_creator( current_doc, "QualityData" );
00180 qual_creator.data()->readAttributes( qual_att );
00181 qual_creator.data()->setReadRecno( new_recno );
00182 qual_creator.create(false);
00183
00184
00185
00186 for( size_t i = 0; i < tags.size(); i++ ) {
00187 assert( tags[i].size() == 5 );
00188 list<string> type_list = GeneralMaker::listRegistered();
00189 string type = tags[i][0].ascii();
00190
00191 if ( find( type_list.begin(), type_list.end(), type ) == type_list.end() ) {
00192 tags[i][4] = type + " " + tags[i][4];
00193 type = "TagData";
00194
00195 }
00196
00197
00198
00199 Database::Creator<TagData> tag_creator( current_doc, type );
00200
00201 QXmlAttributes tag_att;
00202
00203
00204 tag_att.append( "startPos", namespaceURI, localName, QString().number( tags[i][1].toInt() - 1) );
00205 tag_att.append( "endPos", namespaceURI, localName, QString().number( tags[i][2].toInt() - 1) );
00206 tag_att.append( "score", namespaceURI, localName, tags[i][3] );
00207 tag_att.append( "info", namespaceURI, localName, tags[i][4] );
00208
00209 tag_creator.data()->readAttributes( tag_att );
00210
00211 tag_creator.data()->setReadRecno( new_recno );
00212 tag_creator.create(false);
00213
00214 }
00215
00216
00217
00218 att_map.clear();
00219 tags.clear();
00220 return true;
00221
00222 }
00223 else if ( qName == "seq-name" || qName == "group" || qName == "sense" ) {
00224
00225
00226
00227
00228
00229
00230 att_map["ReadData"][ att_trans[qName] ] = curr_data;
00231
00232 return true;
00233 }
00234 else if ( qName == "seq-data" ) {
00235 att_map["DnaStrData"][ att_trans[qName] ] = curr_data;
00236 return true;
00237 }
00238 else if ( qName == "ftype" ) {
00239 vector<QString> tmp;
00240 tags.push_back(tmp);
00241 tags[ tags.size() - 1 ].push_back(curr_data);
00242
00243 return true;
00244 }
00245 else if ( qName == "fstart" || qName == "fstop" || qName == "fscore" || qName == "fnote" ) {
00246 tags[ tags.size() - 1 ].push_back(curr_data);
00247 return true;
00248 }
00249
00250 return true;
00251 }
00252
00253 bool MacsimParser::characters ( const QString & ch )
00254 {
00255 curr_data = ch;
00256 return true;
00257 }
00258
00259 QStringList MacsimParser::contigNamesInProjectDir()
00260 {
00261 QString contigName;
00262 QDir d( projDir );
00263 Q_ASSERT( d.exists() );
00264 d.setFilter( QDir::Dirs | QDir::NoSymLinks );
00265 d.setSorting( QDir::Name );
00266 QStringList slist = d.entryList();
00267 return slist;
00268 }