mirror of
http://43.156.76.180:8026/YuuMJ/EukPhylo.git
synced 2025-12-28 05:00:24 +08:00
153 lines
5.2 KiB
C++
153 lines
5.2 KiB
C++
// $Id: nexusFormat.cpp 5987 2009-03-18 18:13:53Z itaymay $
|
|
|
|
#include "nexusFormat.h"
|
|
#include "someUtil.h"
|
|
#include "errorMsg.h"
|
|
#include <map>
|
|
|
|
sequenceContainer nexusFormat::read(istream &infile, const alphabet* pAlph) {
|
|
sequenceContainer mySeqData = readUnAligned(infile, pAlph);
|
|
mySeqData.makeSureAllSeqAreSameLengthAndGetLen();
|
|
return mySeqData;
|
|
}
|
|
|
|
sequenceContainer nexusFormat::readUnAligned(istream &infile, const alphabet* pAlph) {
|
|
if (!infile) {
|
|
errorMsg::reportError("unable to read mase format, could not open file");
|
|
}
|
|
sequenceContainer mySeqData;;
|
|
|
|
vector<string> seqFileData;
|
|
putFileIntoVectorStringArray(infile,seqFileData);
|
|
|
|
vector<string>::const_iterator it1 = seqFileData.begin();
|
|
// make sure that the first 6 chars in the first line is #NEXUS
|
|
if (it1->size()<6) errorMsg::reportError("first word in a nexus sequence file format must be #NEXUS",1);
|
|
if ( ((*it1)[0] != '#')
|
|
|| (((*it1)[1] != 'N') && ((*it1)[1] != 'n'))
|
|
|| (((*it1)[2] != 'E') && ((*it1)[2] != 'e'))
|
|
|| (((*it1)[3] != 'X') && ((*it1)[3] != 'x'))
|
|
|| (((*it1)[4] != 'U') && ((*it1)[4] != 'u'))
|
|
|| (((*it1)[5] != 'S') && ((*it1)[5] != 's')) ) {
|
|
errorMsg::reportError("first word in a nexus sequence file format must be #NEXUS",1);
|
|
}
|
|
it1++;
|
|
|
|
while ( ( (*it1).find("matrix") == -1) && ( (*it1).find("MATRIX") == -1) && (it1!= seqFileData.end()))
|
|
{ //check for the word matrix
|
|
++it1;
|
|
}
|
|
|
|
int localid=0;
|
|
//int x1 = ((*it1).find("matrix") != -1);
|
|
//int x2 = ((*it1).find("MATRIX") != -1);
|
|
if (((*it1).find("matrix") != -1) || ((*it1).find("MATRIX") != -1))
|
|
{
|
|
//taken from clustalFormat:
|
|
//In case of codon alpahabet we cannot add a seqeunce that is not dividable by 3.
|
|
//In this case the last nucleotides in each line (zero, one or two)
|
|
//should be saved. The next time the same sequence name appears -
|
|
//these saveed nucleotidea and are added to the begining of the line.
|
|
map<string ,string> stringsToAdd;
|
|
|
|
|
|
for (++it1; it1 != seqFileData.end() ; ++it1)
|
|
{
|
|
if (((*it1).find("end;") != -1) || ((*it1).find("END;") != -1))
|
|
break;
|
|
if (it1->empty() || ((*it1).find(';') != -1))
|
|
{ // empty line constinue
|
|
continue;
|
|
}
|
|
sequence seq(pAlph);
|
|
|
|
string taxonName;
|
|
string remark;
|
|
string stringSeq;
|
|
bool beforeName = true;
|
|
string::const_iterator stringIt = (it1)->begin();
|
|
for (; stringIt != (it1)->end(); ++stringIt)
|
|
{ //first loop finds the taxon name
|
|
if ( ((*stringIt) == ' ') || ((*stringIt) == '\t'))
|
|
if (beforeName == true)
|
|
continue; //spaces before taxon name are legal
|
|
else
|
|
break; //A space marks the end of the taxon name
|
|
else
|
|
{
|
|
taxonName += (*stringIt);
|
|
beforeName = false;
|
|
}
|
|
}
|
|
|
|
//check if a new sequence.
|
|
//if the name already exists then init stringSeq with the nucleotide from the previous line of the same sequence
|
|
if (stringsToAdd.find(taxonName)!=stringsToAdd.end())
|
|
stringSeq = stringsToAdd[taxonName];
|
|
|
|
for (; stringIt != (it1)->end(); ++stringIt)
|
|
{//second loop finds the sequecne
|
|
if ( ((*stringIt)==' ') || ((*stringIt) == '\t'))
|
|
continue;
|
|
else stringSeq += (*stringIt);
|
|
}
|
|
|
|
//when alphabet is codon stringSeq must be dividable by 3.
|
|
// 1. save the reminder (0,1 or 2 last nucleotides) in stringToAdd
|
|
// 2. substr the reminder from the sequence line.
|
|
// 3. keep stringToAdd in map (according the name) to be added later.
|
|
string stringToAdd="";
|
|
if (pAlph->size()>=60){ // codon?
|
|
if ((stringSeq.size()%3)==1){ //add the last nucleotide to the next line
|
|
stringToAdd += stringSeq[stringSeq.size()-1];
|
|
stringSeq = stringSeq.substr(0,stringSeq.size()-1);
|
|
}
|
|
if ((stringSeq.size() % 3) == 2){ //add the 2 last nucleotide to the next line
|
|
stringToAdd+=stringSeq[stringSeq.size()-2];
|
|
stringToAdd+=stringSeq[stringSeq.size()-1];
|
|
stringSeq = stringSeq.substr(0, stringSeq.size() - 2);
|
|
}
|
|
}
|
|
stringsToAdd[taxonName] = stringToAdd; //update the map with the new stringToAdd
|
|
//add sequence to container
|
|
int id = mySeqData.getId(taxonName, false);
|
|
if (id==-1) { // new sequence.
|
|
mySeqData.add(sequence(stringSeq, taxonName,remark,localid, pAlph));
|
|
localid++;
|
|
}
|
|
else {// the sequence is already there...
|
|
sequence tmp(stringSeq,taxonName, remark, id, pAlph);
|
|
mySeqData[id].operator += (tmp);
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
errorMsg::reportError("no sequence data in nexus file - no matrix keyword found");
|
|
}
|
|
|
|
return mySeqData;
|
|
}
|
|
|
|
void nexusFormat::write(ostream &out, const sequenceContainer& sc) {
|
|
//vector<string> gfr = sd.getGeneralFileRemarks();
|
|
//if (gfr.empty()) out<<";;\n;;\n";
|
|
//for (vector<string>::const_iterator k=gfr.begin() ; k != gfr.end() ; ++k )
|
|
// out<<(*k)<<endl;
|
|
out<<"#NEXUS"<<endl;
|
|
out<<"begin data;"<<endl;
|
|
out<<"dimensions ntax="<<sc.numberOfSeqs()<<" nchar="<<sc.seqLen() <<";"<<endl;
|
|
if (sc.alphabetSize() == 4)
|
|
out<<"format datatype=dna gap=-;"<<endl;
|
|
else
|
|
out<<"format datatype=protein gap=-;"<<endl;
|
|
out<<"matrix"<<endl;
|
|
|
|
for (sequenceContainer::constTaxaIterator itSeq=sc.constTaxaBegin();itSeq!=sc.constTaxaEnd();++itSeq) {
|
|
out<<"\t"<<itSeq->name()<<"\t"<<itSeq->toString()<<endl;
|
|
}
|
|
out<<";"<<endl;
|
|
out<<"end;"<<endl;
|
|
}
|
|
|