// NOTE(review): this text looks damaged. The original line breaks appear to
// have been collapsed, and in several places the characters between a '<' and
// a later '>' appear to be missing (e.g. "vector posToRemove",
// "for(int i= 0; i posToRemove(...)", "LOG(5,<<\"seqName = \"< res;").
// As the text stands, the "//" that opens the next line turns that whole line,
// including the #include directives and every definition after them, into a
// comment. Restore the file from version control before building; the notes
// added here only describe what is still readable.
//
// Readable on the next line, in order:
//  - sequenceContainer(const sequenceContainer& other, const alphabet* inAlph):
//    copies _generalRemarks and _id2place from `other`, then pushes
//    sequence(other._seqDataVec[i], inAlph) for every sequence of `other`.
//  - makeSureAllSeqAreSameLengthAndGetLen(): returns 0 for an empty container;
//    otherwise compares each sequence length with that of sequence 0 and, when
//    bAugumentShorterSeqs is true, appends getAlphabet()->gap() to a shorter
//    sequence until it reaches that length. The rest of the function (and the
//    start of a commented-out function that follows it) is missing.
//  - changeGaps2MissingData(): overwrites every cell equal to -1 with
//    getAlphabet()->unknown().
//  - getId(): linear search over _seqDataVec by name, returning the matching
//    sequence's id(); the not-found branch is truncated and runs into the tail
//    of a function that collects every sequence name into `res`.
//  - sequenceContainer(): resizes _id2place to 100 entries filled with -1.
//  - ~sequenceContainer(): empty body.
//  - add(): opens at the end of this line and continues on the next one.
// $Id: sequenceContainer.cpp 11751 2013-09-12 21:52:03Z cohenofi $ #include "sequenceContainer.h" #include "logFile.h" #include "someUtil.h" #include "fastaFormat.h" sequenceContainer::sequenceContainer(const sequenceContainer& other,const alphabet *inAlph) : _generalRemarks(other._generalRemarks), _id2place(other._id2place) { for (int i=0; i < other._seqDataVec.size(); ++i) _seqDataVec.push_back(sequence(other._seqDataVec[i],inAlph)); } //if bAugumentShorterSeqs=true then add gap characters at the end of short seqeunces const int sequenceContainer::makeSureAllSeqAreSameLengthAndGetLen(bool bAugumentShorterSeqs) { if (_seqDataVec.size() == 0) return 0; const int len = _seqDataVec[0].seqLen(); for (int i=1; i < _seqDataVec.size(); ++i) { if (_seqDataVec[i].seqLen()!=len) { if (bAugumentShorterSeqs) { for (int pos = _seqDataVec[i].seqLen(); pos < len; ++pos) _seqDataVec[i].push_back(getAlphabet()->gap()); } else { cerr<<_seqDataVec[i].name()<<" length = "<<_seqDataVec[i].seqLen()<<" "<<_seqDataVec[0].name()<<" length = "" "<name() == _seqDataVec[i].name()) { // _seqDataVec[i]+=(*tit); // break; // } // } // ++tit; // } // } //} void sequenceContainer::changeGaps2MissingData() { for (int i = 0; i < seqLen();++i) {//going over al positions for (int j = 0; j < _seqDataVec.size();++j) { if (_seqDataVec[j][i] == -1){ _seqDataVec[j][i]=getAlphabet()->unknown(); // missing data } } } } const int sequenceContainer::getId(const string &seqName, bool issueWarningIfNotFound) const { int k; for (k=0 ; k < _seqDataVec.size() ; ++k) { if (_seqDataVec[k].name() == seqName) return (_seqDataVec[k].id()); } if (k == _seqDataVec.size() && issueWarningIfNotFound) { // debuggin LOG(5,<<"seqName = "< res; for (int i=0; i < _seqDataVec.size(); ++i) { res.push_back(_seqDataVec[i].name()); } return res; } sequenceContainer::sequenceContainer() { _id2place.resize(100,-1); } sequenceContainer::~sequenceContainer(){} void sequenceContainer::add(const sequence& inSeq) { 
// Continuation of add(): appends inSeq to _seqDataVec, grows _id2place to
// inSeq.id()+100 entries (new entries are -1) when the id does not fit, calls
// errorMsg::reportError if the slot for this id is not -1, and finally stores
// the new sequence's index (_seqDataVec.size()-1) in _id2place[inSeq.id()].
//
// remove(idSeq): checks idSeq against _id2place, looks up place =
// _id2place[idSeq], erases _seqDataVec[place], resets _id2place[idSeq] to -1
// and decrements the stored place of every sequence that came after it.
// NOTE(review): `idSeq > _id2place.size()-1` mixes a signed int with size();
// if _id2place is a std::vector this wraps when it is empty - confirm.
//
// removeAll(): allocates ids2remove with numberOfSeqs() entries; the rest of
// its body and the header of the next function are missing.
// Next comes a function whose header is lost: it marks a column for removal
// when any sequence holds -1 there. removeGapPositionsAllSeqs() marks a column
// only when every sequence holds -1 there. Both pass the mask to
// removePositions().
// NOTE(review): `gapCol` is assigned in both loops but never read.
//
// removeGapPositionsAccordingToAReferenceSeq() opens here and continues on the
// next line.
// NOTE(review): it indexes _seqDataVec with the value returned by getId(),
// which is a sequence id(); add()/remove() translate ids to vector positions
// through _id2place - confirm that id and position always coincide.
_seqDataVec.push_back(inSeq); if (_id2place.size() < inSeq.id()+1) { _id2place.resize(inSeq.id()+100,-1); } if (_id2place[inSeq.id()] != -1) { string err = "Two sequences with the same id - error in function sequenceContainer::add"; err+= "\nThe id of the sequence you are trying to add = "; err += int2string(inSeq.id()); errorMsg::reportError(err); } _id2place[inSeq.id()] = _seqDataVec.size()-1; } //given a sequence id the sequence is removed from the sequence container //and the vector _id2place is updated. void sequenceContainer::remove(const int idSeq) { if (idSeq > _id2place.size()-1 || idSeq<0) errorMsg::reportError("the id of sequence is not mapped by id2place in function sequenceContainer::remove"); int place = _id2place[idSeq]; if (place < 0) errorMsg::reportError("cannot find place of the id in the sequence container in function sequenceContainer::remove"); _seqDataVec.erase(_seqDataVec.begin()+place); _id2place[idSeq] = -1; for (int i=place;i<_seqDataVec.size();i++) { int id = _seqDataVec[i].id(); _id2place[id]--; } } // remove all sequences from the sequence container void sequenceContainer::removeAll(){ Vint ids2remove(numberOfSeqs()); for(int i= 0; i posToRemove(seqLen(),0); bool gapCol; int i,j; for (i = 0; i < seqLen();++i) {//going over al positions gapCol = false; for (j = 0; j < _seqDataVec.size();++j) { if (_seqDataVec[j][i] == -1) posToRemove[i] = 1; } } removePositions(posToRemove); } void sequenceContainer::removeGapPositionsAllSeqs(){ vector posToRemove(seqLen(),1); bool gapCol; int i,j; for (i = 0; i < seqLen();++i) {//going over al positions gapCol = false; for (j = 0; j < _seqDataVec.size();++j) { if (_seqDataVec[j][i] != -1) posToRemove[i] = 0; } } removePositions(posToRemove); } void sequenceContainer::removeGapPositionsAccordingToAReferenceSeq(const string & seqName){ int idOfRefSeq = getId(seqName,true); vector posToRemove(seqLen(),0); int i; for (i = 0; i < seqLen();++i) {//going over al positions if (_seqDataVec[idOfRefSeq][i] == -1) 
// Continuation of removeGapPositionsAccordingToAReferenceSeq(): marks every
// column where the reference sequence holds -1 and passes the mask to
// removePositions().
// removeUnknownPositionsAccordingToAReferenceSeq(): same shape, but marks the
// columns where the reference sequence equals getAlphabet()->unknown().
// removePositions(mask): forwards the mask to removePositions() of every
// sequence; per the comment in the code, 1 marks a column to remove.
// getSubSeq(startPos, endPos): copies the container, builds a mask that keeps
// only columns startPos..endPos (both inclusive) and removes the rest from
// the copy.
// NOTE(review): no range check on startPos/endPos is visible before the mask
// is indexed.
// changeDotsToGoodCharacters(): reads the first sequence's value per column
// and tests it against -3; everything after the LOG call is missing and the
// text runs into the tail of a function that decrements a counter for each
// sequence equal to `unknown` at `pos`.
// isInvariable(pos): ignores values below 0; returns false as soon as two
// sequences hold different values >= 0 at pos, true otherwise.
// getInvariablePosNum(): counts the positions for which isInvariable() holds.
// startZeroSequenceContainerGL(): opens at the end of this line.
posToRemove[i] = 1; } removePositions(posToRemove); } void sequenceContainer::removeUnknownPositionsAccordingToAReferenceSeq(const string & seqName){ int idOfRefSeq = getId(seqName,true); vector posToRemove(seqLen(),0); int i; for (i = 0; i < seqLen();++i) {//going over al positions if (_seqDataVec[idOfRefSeq][i] == getAlphabet()->unknown()) posToRemove[i] = 1; } removePositions(posToRemove); } //removePositions: the positions to be removed are marked as '1' in posToRemoveVec //all othehr positions are '0' void sequenceContainer::removePositions(const Vint & posToRemoveVec) { for (int z = 0; z < _seqDataVec.size();++z) { _seqDataVec[z].removePositions(posToRemoveVec); } } sequenceContainer sequenceContainer::getSubSeq(const int startPos, const int endPos) { sequenceContainer subSeq(*this); vector posToRemove(seqLen(),true); for (int i = startPos; i <= endPos;++i) {//going over al positions posToRemove[i] = false; } subSeq.removePositions(posToRemove); return subSeq; } void sequenceContainer::changeDotsToGoodCharacters() { for (int i = 0; i < seqLen();++i) {//going over al positions int charInFirstSeq = _seqDataVec[0][i]; if (charInFirstSeq == -3) { LOG(5,<<" position is "<unknown(); for (int i=0; i < numberOfSeqs(); ++i) { if ((*this)[i][pos] == unknown ) --numOfNonCharPos; } return numOfNonCharPos; } bool sequenceContainer::isInvariable(const int pos) const { int charFound = getAlphabet()->unknown(); for (int i=0; i < numberOfSeqs(); ++i) { if ((*this)[i][pos] >= 0) { if (charFound == getAlphabet()->unknown()) charFound = (*this)[i][pos]; else if (charFound != (*this)[i][pos]) return false; } } return true; } int sequenceContainer::getInvariablePosNum() const { int sum = 0; for (int pos = 0; pos < seqLen(); ++pos) { if (isInvariable(pos)) ++sum; } return sum; } // new func for gainLoss project void sequenceContainer::startZeroSequenceContainerGL(const sequenceContainer &sc, const gainLossAlphabet& alph, const int minNumOfOnes, const int minNumOfZeros) { 
// Continuation of startZeroSequenceContainerGL(): sizes strV to
// sc.numberOfSeqs() and switches on minNumOfOnes; the switch body is missing.
// What remains adds one sequence per strV entry, built from strV[i],
// sc.name(i), an empty remark, id i and &alph.
// concatenate(other): reports an error and returns when the two containers
// hold a different number of sequences. Otherwise it walks the taxa of *this;
// the inner loop header over `other` is missing.
// NOTE(review): the comment in the code says "returns false", but the
// function is declared void.
//if(minNumOfOnes==0 && minNumOfZeros==0) // return; string str0 = "0"; string str1 = "1"; vector strV; strV.resize(sc.numberOfSeqs()); string remark =""; switch (minNumOfOnes) { case (1) : for(int i=0; iadd(sequence(strV[i],sc.name(i),remark,i,&alph)); } } //concatenate two sequecneContainers. //The sequence names must be identical in the two containers. //returns false if: (1) A sequence_name in one of the containers does not match any sequence_name in the other container. void sequenceContainer::concatenate(sequenceContainer& other) { if (other.numberOfSeqs() != numberOfSeqs()){ string msg = "Not the same number of taxa, can't concatenate: other="+ int2string(other.numberOfSeqs()) + " this=" + int2string( numberOfSeqs()) +"\n"; errorMsg::reportError(msg); return; } for (sequenceContainer::taxaIterator itThis=(*this).taxaBegin();itThis!=(*this).taxaEnd();++itThis) { //for(int i = 0; i < numberOfSeqs(); ++i) { bool bFound = false; //out << (*this)[i].name()<name().compare(itOther->name()) == 0) { //(*this)[i] += other[j]; // was i ????? 
// Continuation of concatenate(): on a name match the other sequence is
// appended with operator+= and the inner loop stops; when no match was found
// errorMsg::reportError is called with the unmatched name.
// operator==: returns false when the sequence counts differ, when a sequence
// and its same-named counterpart in `sq` differ in seqLen(), or when any of
// the first `len` positions differ; `len` is the length of sequence 0.
// NOTE(review): _seqDataVec[0] is read without an empty-container check.
// NOTE(review): here and in the two functions below, the value returned by
// getId() is used as an index into _seqDataVec - confirm ids equal positions.
// getNumOfOccurancesPerPos(pos, charId): counts sequences whose value at pos
// equals charId.
// getSeqNamesThatMatchPos(pos, charId): collects the names of those sequences.
// NOTE(review): `len` is computed in both of these but never used.
// getAlphabetDistribution(bool) opens at the end of this line.
*(itThis) += *(itOther); bFound = true; break; } } if (bFound == false) { string msg = "Can't find sequence name in the second MSA: " +itThis->name(); errorMsg::reportError(msg); } } } ////////////////////////////////////////////////////////////////////////// const bool sequenceContainer::operator==(const sequenceContainer& sq) const { if (_seqDataVec.size() != sq._seqDataVec.size()) // not the same number of sequences in sequenceContainer return false; const int numberOfSeqs = _seqDataVec.size(); const int len = _seqDataVec[0].seqLen(); for (int i=0; i < numberOfSeqs; ++i) { string nameI = name(i); int idI = getId(nameI); int idSq = sq.getId(nameI); if (_seqDataVec[idI].seqLen()!=sq._seqDataVec[idSq].seqLen()) return false; for (int pos = 0; pos < len; ++pos) { if (_seqDataVec[idI][pos]!=sq._seqDataVec[idSq][pos]) return false; } } return true; } ////////////////////////////////////////////////////////////////////////// int sequenceContainer::getNumOfOccurancesPerPos(const int pos, const char charId){ int numOfOccurancesPerPos = 0; const int numberOfSeqs = _seqDataVec.size(); const int len = _seqDataVec[0].seqLen(); for (int i=0; i < numberOfSeqs; ++i) { string nameI = name(i); int idI = getId(nameI); if (_seqDataVec[idI][pos]==charId) numOfOccurancesPerPos++; } return numOfOccurancesPerPos; } ////////////////////////////////////////////////////////////////////////// vector sequenceContainer::getSeqNamesThatMatchPos(const int pos, const char charId){ vector SeqNamesThatMatchPos; const int numberOfSeqs = _seqDataVec.size(); const int len = _seqDataVec[0].seqLen(); for (int i=0; i < numberOfSeqs; ++i) { string nameI = name(i); int idI = getId(nameI); if (_seqDataVec[idI][pos]==charId) SeqNamesThatMatchPos.push_back(nameI); } return SeqNamesThatMatchPos; } ////////////////////////////////////////////////////////////////////////// // added counts for unKnown data const vector sequenceContainer::getAlphabetDistribution(bool isCountUnknown) const { vector alphabetVec; 
// Continuation of getAlphabetDistribution(bool): alphabetVec gets
// alphabetSize()+1 slots (the code's own comment labels the extra slot
// "unKnown") and the loops visit every position of every sequence, using the
// length of sequence 0; the innermost loop body is missing.
// A second overload taking `pos` follows: it sizes alphabetVec to
// alphabetSize() and loops over the sequences; its body is also cut off.
int alphSize = alphabetSize()+1; //unKnown int UnknownVal = getAlphabet()->unknown(); alphabetVec.resize( alphSize); const int numberOfSeqs = _seqDataVec.size(); const int len = _seqDataVec[0].seqLen(); for (int i=0; i < numberOfSeqs; ++i) { for (int pos = 0; pos < len; ++pos) { for(int alph = 0 ; alph sequenceContainer::getAlphabetDistribution(int pos,bool isCountUnknown) const { vector alphabetVec; alphabetVec.resize( alphabetSize()); const int numberOfSeqs = _seqDataVec.size(); for (int i=0; i < numberOfSeqs; ++i) { for(int alph = 0 ; alph