mirror of
http://43.156.76.180:8026/YuuMJ/EukPhylo.git
synced 2025-12-28 01:00:25 +08:00
516 lines
15 KiB
C++
516 lines
15 KiB
C++
// $Id: sequenceContainer.cpp 11751 2013-09-12 21:52:03Z cohenofi $
|
|
#include "sequenceContainer.h"
|
|
#include "logFile.h"
|
|
#include "someUtil.h"
|
|
#include "fastaFormat.h"
|
|
|
|
sequenceContainer::sequenceContainer(const sequenceContainer& other,const alphabet *inAlph) :
|
|
_generalRemarks(other._generalRemarks),
|
|
_id2place(other._id2place)
|
|
{
|
|
for (int i=0; i < other._seqDataVec.size(); ++i)
|
|
_seqDataVec.push_back(sequence(other._seqDataVec[i],inAlph));
|
|
}
|
|
|
|
|
|
//if bAugumentShorterSeqs=true then add gap characters at the end of short seqeunces
|
|
const int sequenceContainer::makeSureAllSeqAreSameLengthAndGetLen(bool bAugumentShorterSeqs) {
|
|
if (_seqDataVec.size() == 0) return 0;
|
|
const int len = _seqDataVec[0].seqLen();
|
|
for (int i=1; i < _seqDataVec.size(); ++i) {
|
|
if (_seqDataVec[i].seqLen()!=len) {
|
|
if (bAugumentShorterSeqs) {
|
|
for (int pos = _seqDataVec[i].seqLen(); pos < len; ++pos)
|
|
_seqDataVec[i].push_back(getAlphabet()->gap());
|
|
}
|
|
else {
|
|
cerr<<_seqDataVec[i].name()<<" length = "<<_seqDataVec[i].seqLen()<<" "<<_seqDataVec[0].name()<<" length = "" "<<len<<endl;
|
|
errorMsg::reportError("not all sequences are of the same lengths");
|
|
}
|
|
}
|
|
}
|
|
|
|
return len;
|
|
}
|
|
|
|
//void sequenceContainer::addFromsequenceContainer(sequenceContainer& seqToAdd){
|
|
// if (_seqDataVec.empty()) { // first sequence to add
|
|
// sequenceContainer::taxaIterator tit;
|
|
// sequenceContainer::taxaIterator titEND;
|
|
// tit.begin(seqToAdd);
|
|
// titEND.end(seqToAdd);
|
|
// while (tit!=titEND) {
|
|
// _seqDataVec.push_back(*tit);
|
|
//
|
|
// }
|
|
// }
|
|
// else {// now we are adding sequences to sequences that are already there.
|
|
// sequenceContainer::taxaIterator tit;
|
|
// sequenceContainer::taxaIterator titEND;
|
|
// tit.begin(seqToAdd);
|
|
// titEND.end(seqToAdd);
|
|
// while (tit!=titEND) {
|
|
// for (int i=0; i < _seqDataVec.size(); ++i) {
|
|
// if (tit->name() == _seqDataVec[i].name()) {
|
|
// _seqDataVec[i]+=(*tit);
|
|
// break;
|
|
// }
|
|
// }
|
|
// ++tit;
|
|
// }
|
|
// }
|
|
//}
|
|
|
|
void sequenceContainer::changeGaps2MissingData() {
|
|
|
|
for (int i = 0; i < seqLen();++i) {//going over al positions
|
|
for (int j = 0; j < _seqDataVec.size();++j) {
|
|
if (_seqDataVec[j][i] == -1){
|
|
_seqDataVec[j][i]=getAlphabet()->unknown(); // missing data
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
const int sequenceContainer::getId(const string &seqName, bool issueWarningIfNotFound) const {
|
|
int k;
|
|
for (k=0 ; k < _seqDataVec.size() ; ++k) {
|
|
if (_seqDataVec[k].name() == seqName) return (_seqDataVec[k].id());
|
|
}
|
|
if (k == _seqDataVec.size() && issueWarningIfNotFound) {
|
|
// debuggin
|
|
LOG(5,<<"seqName = "<<seqName<<endl);
|
|
for (k=0 ; k < _seqDataVec.size() ; ++k) {
|
|
LOG(5,<<"_seqDataVec["<<k<<"].name() ="<<_seqDataVec[k].name()<<endl);
|
|
}
|
|
//end dubug
|
|
LOG(0,<<seqName<<endl);
|
|
vector<string> err;
|
|
err.push_back("Could not find a sequence that matches the sequence name ");
|
|
err.push_back(seqName);
|
|
err.push_back("in function sequenceContainer::getSeqPtr ");
|
|
err.push_back(" make sure that names in tree file match name in sequence file ");
|
|
errorMsg::reportError(err); // also quit the program
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
const Vstring sequenceContainer::names() const {
|
|
vector<string> res;
|
|
for (int i=0; i < _seqDataVec.size(); ++i) {
|
|
res.push_back(_seqDataVec[i].name());
|
|
}
|
|
return res;
|
|
}
|
|
|
|
sequenceContainer::sequenceContainer() {
|
|
_id2place.resize(100,-1);
|
|
}
|
|
|
|
sequenceContainer::~sequenceContainer(){}
|
|
|
|
void sequenceContainer::add(const sequence& inSeq) {
|
|
_seqDataVec.push_back(inSeq);
|
|
if (_id2place.size() < inSeq.id()+1) {
|
|
_id2place.resize(inSeq.id()+100,-1);
|
|
}
|
|
if (_id2place[inSeq.id()] != -1) {
|
|
string err = "Two sequences with the same id - error in function sequenceContainer::add";
|
|
err+= "\nThe id of the sequence you are trying to add = ";
|
|
err += int2string(inSeq.id());
|
|
errorMsg::reportError(err);
|
|
}
|
|
_id2place[inSeq.id()] = _seqDataVec.size()-1;
|
|
}
|
|
|
|
|
|
//given a sequence id the sequence is removed from the sequence container
|
|
//and the vector _id2place is updated.
|
|
void sequenceContainer::remove(const int idSeq) {
|
|
if (idSeq > _id2place.size()-1 || idSeq<0)
|
|
errorMsg::reportError("the id of sequence is not mapped by id2place in function sequenceContainer::remove");
|
|
int place = _id2place[idSeq];
|
|
|
|
if (place < 0)
|
|
errorMsg::reportError("cannot find place of the id in the sequence container in function sequenceContainer::remove");
|
|
_seqDataVec.erase(_seqDataVec.begin()+place);
|
|
|
|
_id2place[idSeq] = -1;
|
|
for (int i=place;i<_seqDataVec.size();i++) {
|
|
int id = _seqDataVec[i].id();
|
|
_id2place[id]--;
|
|
}
|
|
}
|
|
// remove all sequences from the sequence container
|
|
void sequenceContainer::removeAll(){
|
|
Vint ids2remove(numberOfSeqs());
|
|
for(int i= 0; i<numberOfSeqs() ;i++){
|
|
ids2remove[i] =placeToId(i);
|
|
}
|
|
for(int i= 0; i<ids2remove.size() ;i++){
|
|
remove(ids2remove[i]);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
//removes identical sequences in the sequence container.
|
|
void sequenceContainer::removeIdenticalSequences(){
|
|
bool exist;
|
|
for (int i=1;i<_seqDataVec.size();i++){
|
|
sequence sq1 = _seqDataVec[i];
|
|
for (int j=0;j<i;j++){
|
|
sequence sq2 = _seqDataVec[j];
|
|
exist = true;
|
|
if (sq1.seqLen() != sq2.seqLen()) continue;
|
|
for (int pos=0;pos<sq1.seqLen();pos++){
|
|
if (sq1[pos] != sq2[pos]){
|
|
exist = false;
|
|
break;
|
|
}
|
|
}
|
|
if (exist) {
|
|
remove(sq1.id());
|
|
i--;
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
void sequenceContainer::removeGapPositions(){
|
|
vector<int> posToRemove(seqLen(),0);
|
|
bool gapCol;
|
|
int i,j;
|
|
for (i = 0; i < seqLen();++i) {//going over al positions
|
|
gapCol = false;
|
|
for (j = 0; j < _seqDataVec.size();++j) {
|
|
if (_seqDataVec[j][i] == -1) posToRemove[i] = 1;
|
|
}
|
|
}
|
|
removePositions(posToRemove);
|
|
}
|
|
void sequenceContainer::removeGapPositionsAllSeqs(){
|
|
vector<int> posToRemove(seqLen(),1);
|
|
bool gapCol;
|
|
int i,j;
|
|
for (i = 0; i < seqLen();++i) {//going over al positions
|
|
gapCol = false;
|
|
for (j = 0; j < _seqDataVec.size();++j) {
|
|
if (_seqDataVec[j][i] != -1) posToRemove[i] = 0;
|
|
}
|
|
}
|
|
removePositions(posToRemove);
|
|
}
|
|
void sequenceContainer::removeGapPositionsAccordingToAReferenceSeq(const string & seqName){
|
|
int idOfRefSeq = getId(seqName,true);
|
|
vector<int> posToRemove(seqLen(),0);
|
|
int i;
|
|
for (i = 0; i < seqLen();++i) {//going over al positions
|
|
if (_seqDataVec[idOfRefSeq][i] == -1) posToRemove[i] = 1;
|
|
}
|
|
removePositions(posToRemove);
|
|
}
|
|
|
|
void sequenceContainer::removeUnknownPositionsAccordingToAReferenceSeq(const string & seqName){
|
|
int idOfRefSeq = getId(seqName,true);
|
|
vector<int> posToRemove(seqLen(),0);
|
|
int i;
|
|
for (i = 0; i < seqLen();++i) {//going over al positions
|
|
if (_seqDataVec[idOfRefSeq][i] == getAlphabet()->unknown()) posToRemove[i] = 1;
|
|
}
|
|
removePositions(posToRemove);
|
|
}
|
|
|
|
//removePositions: the positions to be removed are marked as '1' in posToRemoveVec
|
|
//all othehr positions are '0'
|
|
void sequenceContainer::removePositions(const Vint & posToRemoveVec) {
|
|
for (int z = 0; z < _seqDataVec.size();++z) {
|
|
_seqDataVec[z].removePositions(posToRemoveVec);
|
|
}
|
|
}
|
|
|
|
|
|
sequenceContainer sequenceContainer::getSubSeq(const int startPos, const int endPos) {
|
|
sequenceContainer subSeq(*this);
|
|
|
|
vector<int> posToRemove(seqLen(),true);
|
|
for (int i = startPos; i <= endPos;++i) {//going over al positions
|
|
posToRemove[i] = false;
|
|
}
|
|
subSeq.removePositions(posToRemove);
|
|
|
|
return subSeq;
|
|
}
|
|
|
|
|
|
void sequenceContainer::changeDotsToGoodCharacters() {
|
|
for (int i = 0; i < seqLen();++i) {//going over al positions
|
|
int charInFirstSeq = _seqDataVec[0][i];
|
|
if (charInFirstSeq == -3) {
|
|
LOG(5,<<" position is "<<i<<endl);
|
|
errorMsg::reportError(" the first line contains dots ");
|
|
}
|
|
for (int j = 1; j < _seqDataVec.size();++j) {
|
|
if ((_seqDataVec[j][i] == -3)) {
|
|
_seqDataVec[j][i] = charInFirstSeq; // missing data
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
int sequenceContainer::numberOfSequencesWithoutGaps (const int pos) const {
|
|
int numOfNonCharPos = numberOfSeqs();
|
|
for (int i=0; i < numberOfSeqs(); ++i) {
|
|
if ((*this)[i][pos] <0) --numOfNonCharPos;
|
|
}
|
|
return numOfNonCharPos;
|
|
}
|
|
|
|
int sequenceContainer::numberOfSequencesWithoutUnknowns (const int pos) const {
|
|
int numOfNonCharPos = numberOfSeqs();
|
|
int unknown = getAlphabet()->unknown();
|
|
for (int i=0; i < numberOfSeqs(); ++i) {
|
|
if ((*this)[i][pos] == unknown )
|
|
--numOfNonCharPos;
|
|
}
|
|
return numOfNonCharPos;
|
|
}
|
|
|
|
bool sequenceContainer::isInvariable(const int pos) const {
|
|
int charFound = getAlphabet()->unknown();
|
|
for (int i=0; i < numberOfSeqs(); ++i) {
|
|
if ((*this)[i][pos] >= 0) {
|
|
if (charFound == getAlphabet()->unknown())
|
|
charFound = (*this)[i][pos];
|
|
else if (charFound != (*this)[i][pos])
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
int sequenceContainer::getInvariablePosNum() const {
|
|
int sum = 0;
|
|
for (int pos = 0; pos < seqLen(); ++pos) {
|
|
if (isInvariable(pos))
|
|
++sum;
|
|
}
|
|
return sum;
|
|
}
|
|
|
|
// new func for gainLoss project
|
|
void sequenceContainer::startZeroSequenceContainerGL(const sequenceContainer &sc, const gainLossAlphabet& alph, const int minNumOfOnes, const int minNumOfZeros)
|
|
{
|
|
//if(minNumOfOnes==0 && minNumOfZeros==0)
|
|
// return;
|
|
|
|
string str0 = "0";
|
|
string str1 = "1";
|
|
vector<string> strV;
|
|
strV.resize(sc.numberOfSeqs());
|
|
string remark ="";
|
|
switch (minNumOfOnes) {
|
|
case (1) :
|
|
for(int i=0; i<sc.numberOfSeqs();i++){
|
|
// add patterns of 0 ones
|
|
strV[i] = str0;
|
|
}
|
|
break;
|
|
case (2) :
|
|
for(int i=0; i<sc.numberOfSeqs();i++){
|
|
// add patterns of 0 ones
|
|
strV[i] = str0;
|
|
}
|
|
for(int i=0; i<sc.numberOfSeqs();i++){
|
|
// add patterns of only 1 ones
|
|
for(int j=0; j<sc.numberOfSeqs(); j++){
|
|
if(j==i){
|
|
strV[i]+=str1;
|
|
}
|
|
else{
|
|
strV[i]+=str0;
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
case (3) :
|
|
for(int i=0; i<sc.numberOfSeqs();i++){
|
|
// add patterns of 0 ones
|
|
strV[i] = str0;
|
|
}
|
|
for(int i=0; i<sc.numberOfSeqs();i++){
|
|
// add patterns of only 1 ones
|
|
for(int j=0; j<sc.numberOfSeqs(); j++){
|
|
if(j==i){
|
|
strV[i]+=str1;
|
|
}
|
|
else{
|
|
strV[i]+=str0;
|
|
}
|
|
}
|
|
}
|
|
// add patterns of only 2 ones
|
|
for(int onePosition1=0; onePosition1<sc.numberOfSeqs(); onePosition1++){
|
|
for(int onePosition2=0; onePosition2<sc.numberOfSeqs(); onePosition2++){
|
|
if(onePosition2<=onePosition1)
|
|
continue;
|
|
for(int i=0; i<sc.numberOfSeqs();i++){
|
|
if(i==onePosition1 || i==onePosition2){
|
|
strV[i]+=str1;
|
|
}
|
|
else{
|
|
strV[i]+=str0;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
switch (minNumOfZeros) {
|
|
case (0) :
|
|
break;
|
|
case (1) :
|
|
for(int i=0; i<sc.numberOfSeqs();i++){
|
|
// add patterns of 0 zeroes (only '1')
|
|
strV[i] += str1;
|
|
}
|
|
break;
|
|
}
|
|
//////////////////////////////////////////////////////////////////////////
|
|
|
|
for(int i=0; i<sc.numberOfSeqs();i++){
|
|
//cout<<strV[i]<<endl;
|
|
this->add(sequence(strV[i],sc.name(i),remark,i,&alph));
|
|
}
|
|
}
|
|
|
|
|
|
//concatenate two sequecneContainers.
|
|
//The sequence names must be identical in the two containers.
|
|
//returns false if: (1) A sequence_name in one of the containers does not match any sequence_name in the other container.
|
|
void sequenceContainer::concatenate(sequenceContainer& other) {
|
|
if (other.numberOfSeqs() != numberOfSeqs()){
|
|
string msg = "Not the same number of taxa, can't concatenate: other="+ int2string(other.numberOfSeqs()) + " this=" + int2string( numberOfSeqs()) +"\n";
|
|
errorMsg::reportError(msg);
|
|
return;
|
|
}
|
|
for (sequenceContainer::taxaIterator itThis=(*this).taxaBegin();itThis!=(*this).taxaEnd();++itThis) {
|
|
//for(int i = 0; i < numberOfSeqs(); ++i) {
|
|
bool bFound = false;
|
|
//out << (*this)[i].name()<<endl;
|
|
|
|
for (sequenceContainer::taxaIterator itOther=other.taxaBegin();itOther!=other.taxaEnd();++itOther) {
|
|
//for (int j = 0; j < other.numberOfSeqs(); ++j) {
|
|
//if((*this)[i].name().compare(other[j].name()) == 0)
|
|
if(itThis->name().compare(itOther->name()) == 0)
|
|
{
|
|
//(*this)[i] += other[j]; // was i ?????
|
|
*(itThis) += *(itOther);
|
|
bFound = true;
|
|
break;
|
|
}
|
|
}
|
|
if (bFound == false)
|
|
{
|
|
string msg = "Can't find sequence name in the second MSA: " +itThis->name();
|
|
errorMsg::reportError(msg);
|
|
}
|
|
}
|
|
}
|
|
//////////////////////////////////////////////////////////////////////////
|
|
const bool sequenceContainer::operator==(const sequenceContainer& sq) const {
|
|
if (_seqDataVec.size() != sq._seqDataVec.size()) // not the same number of sequences in sequenceContainer
|
|
return false;
|
|
const int numberOfSeqs = _seqDataVec.size();
|
|
const int len = _seqDataVec[0].seqLen();
|
|
for (int i=0; i < numberOfSeqs; ++i) {
|
|
string nameI = name(i);
|
|
int idI = getId(nameI);
|
|
int idSq = sq.getId(nameI);
|
|
if (_seqDataVec[idI].seqLen()!=sq._seqDataVec[idSq].seqLen())
|
|
return false;
|
|
for (int pos = 0; pos < len; ++pos)
|
|
{
|
|
if (_seqDataVec[idI][pos]!=sq._seqDataVec[idSq][pos])
|
|
return false;
|
|
}
|
|
}
|
|
return true;
|
|
}
|
|
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
int sequenceContainer::getNumOfOccurancesPerPos(const int pos, const char charId){
|
|
int numOfOccurancesPerPos = 0;
|
|
const int numberOfSeqs = _seqDataVec.size();
|
|
const int len = _seqDataVec[0].seqLen();
|
|
|
|
for (int i=0; i < numberOfSeqs; ++i) {
|
|
string nameI = name(i);
|
|
int idI = getId(nameI);
|
|
if (_seqDataVec[idI][pos]==charId)
|
|
numOfOccurancesPerPos++;
|
|
}
|
|
return numOfOccurancesPerPos;
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
vector<string> sequenceContainer::getSeqNamesThatMatchPos(const int pos, const char charId){
|
|
vector<string> SeqNamesThatMatchPos;
|
|
const int numberOfSeqs = _seqDataVec.size();
|
|
const int len = _seqDataVec[0].seqLen();
|
|
|
|
for (int i=0; i < numberOfSeqs; ++i) {
|
|
string nameI = name(i);
|
|
int idI = getId(nameI);
|
|
if (_seqDataVec[idI][pos]==charId)
|
|
SeqNamesThatMatchPos.push_back(nameI);
|
|
}
|
|
return SeqNamesThatMatchPos;
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
// added counts for unKnown data
|
|
const vector<int> sequenceContainer::getAlphabetDistribution(bool isCountUnknown) const {
|
|
vector<int> alphabetVec;
|
|
int alphSize = alphabetSize()+1; //unKnown
|
|
int UnknownVal = getAlphabet()->unknown();
|
|
alphabetVec.resize( alphSize);
|
|
const int numberOfSeqs = _seqDataVec.size();
|
|
const int len = _seqDataVec[0].seqLen();
|
|
for (int i=0; i < numberOfSeqs; ++i) {
|
|
for (int pos = 0; pos < len; ++pos) {
|
|
for(int alph = 0 ; alph<alphSize ;++alph){
|
|
if ( _seqDataVec[i][pos] ==alph)
|
|
++alphabetVec[alph];
|
|
else if( _seqDataVec[i][pos] ==UnknownVal)
|
|
++alphabetVec[alph];
|
|
}
|
|
}
|
|
}
|
|
return alphabetVec;
|
|
}
|
|
|
|
//////////////////////////////////////////////////////////////////////////
|
|
const vector<int> sequenceContainer::getAlphabetDistribution(int pos,bool isCountUnknown) const {
|
|
vector<int> alphabetVec;
|
|
alphabetVec.resize( alphabetSize());
|
|
const int numberOfSeqs = _seqDataVec.size();
|
|
for (int i=0; i < numberOfSeqs; ++i) {
|
|
for(int alph = 0 ; alph<alphabetSize() ;++alph){
|
|
if ( _seqDataVec[i][pos] ==alph)
|
|
++alphabetVec[alph];
|
|
}
|
|
}
|
|
return alphabetVec;
|
|
}
|
|
|
|
|
|
|