EukPhylo/PTL2/Scripts-GRID/guidance.v2.02/libs/phylogeny/sequence.cpp

// $Id: sequence.cpp 7627 2010-03-06 21:56:30Z cohenofi $

#include "sequence.h"

#include <algorithm>
using namespace std;


sequence::sequence(const string& str,
				   const string& name,
				   const string& remark,
				   const int id,
				   const alphabet* inAlph)
: _alphabet(inAlph->clone()), _remark(remark), _name(name),_id(id)
{
	for (int k=0; k < str.size() ;k += _alphabet->stringSize()) {
		int charId = inAlph->fromChar(str, k);
		if (charId == -99) {
			string textToPrint = "unable to read sequence: " + name;
			errorMsg::reportError(textToPrint);
		}

		_vec.push_back(charId);
	}
}


sequence::sequence(const sequence& other)
: _vec(other._vec), _alphabet(other._alphabet->clone()),
  _remark(other._remark), _name(other._name),_id(other._id)
{

}
// convert the other sequence to the alphabet inAlph.
sequence::sequence(const sequence& other,const alphabet* inAlph)
: _alphabet(inAlph->clone()), _remark(other._remark), _name(other._name), _id(other._id)
{
	const mulAlphabet* pMulAlphabet;
	// if the other.alphabet is amino or nucleotide and the inAlph is indel

	if ( (other._alphabet->size() == 20 && inAlph->size() == 2)
		|| (other._alphabet->size() == 4 && inAlph->size() == 2) )
	{
		for (int k=0; k < other.seqLen() ;k += other._alphabet->stringSize())
		{
			int charId = other._vec[k];

			if (charId == other._alphabet->gap())
					_vec.push_back(inAlph->fromChar("-",0));
			else
				_vec.push_back(inAlph->fromChar("X",0)); //also converts "." (charId==-3) to "X"
				//	unknown amino/nucleotide is converted to "X" and not to "?"
		}
	}

	// if the other.alphabet is amino or nucleotide and the inAlph is mulAlphabet
	else if ( (other._alphabet->size() == 20 && inAlph->size()%20 == 0)
		|| (other._alphabet->size() == 4 && inAlph->size()%4 == 0) )
	{
		for (int k=0; k < other.seqLen() ;++k)
		{
			int charId = other._vec[k];
			string ch = other._alphabet->fromInt(charId);
			int mulCharId = _alphabet->fromChar(ch,0);
			_vec.push_back(mulCharId);
		}
	//	 debug OZ
			//cout << "other sequence: " << other << endl;
			//cout << "mul sequence " << (*this) << endl;
	//	 end of debug
	}
	// if the other.alphabet is mulAlphabet and the inAlph is it's baseAlphabet
	// (for example, if other.alphabet is a multiplied-amino and inAlph is amino, then the converted sequence
	// will have alphabet amino)
	else if ( ((inAlph->size() == 20) && (other._alphabet->size()%20 == 0))
		|| (inAlph->size() == 4) && (other._alphabet->size()%4 == 0))
	{
		pMulAlphabet=(mulAlphabet*)(other._alphabet);
		for (int k=0; k < other.seqLen() ;++k)
			{
				int mulCharId = other._vec[k];
				int baseId = pMulAlphabet->convertToBasedAlphaInt(mulCharId);
				_vec.push_back(baseId);
			}
	}

	// for gainLoss project - {0,1} in both, hence no conversion needed.
	// it should be the same for all cases with same alphabet
	else if ( inAlph->size() == other._alphabet->size() )
	{
		pMulAlphabet=(mulAlphabet*)(other._alphabet);
		for (int k=0; k < other.seqLen() ;++k)
		{
			int mulCharId = other._vec[k];
			//int baseId = pMulAlphabet->convertToBasedAlphaInt(mulCharId);
			_vec.push_back(mulCharId);
		}
	}
	// I tried to implement it using dynamic_cast but it doesn't work...
	/*else if
			(
				(pMulAlphabet = dynamic_cast<const mulAlphabet*>(other._alphabet)) != NULL
				)
		{
			if			(pMulAlphabet->getBaseAlphabet()->size() == inAlph->size())
		 {
			for (int k=0; k < other.seqLen() ;++k)
			{
				int mulCharId = other._vec[k];
				int baseId = pMulAlphabet->convertToBasedAlphaInt(mulCharId);
				_vec.push_back(baseId);
			}
		}
		}*/

	// (currently, there is no implimentions for other converts)
	else
	{
		string error = "unable to convert this kind of alphabet";
		errorMsg::reportError(error);
	}
}

sequence::~sequence()
{
	if (_alphabet)
		delete _alphabet;
}

void sequence::resize(const int k, const int* val) {
	if (val == NULL) {
		_vec.resize(k,_alphabet->unknown());
	}
	else {
		_vec.resize(k,*val);
	}
}

string sequence::toString() const{
	string tmp;
	for (int k=0; k < _vec.size() ; ++k ){
		tmp+= _alphabet->fromInt(_vec[k]);
	}
	return tmp;
}

string sequence::toString(const int pos) const{
	return _alphabet->fromInt(_vec[pos]);
}

void sequence::addFromString(const string& str) {
	for (int k=0; k < str.size() ; k+=_alphabet->stringSize()) {
		_vec.push_back(_alphabet->fromChar(str,k));
	}
}

class particip {
public:
	explicit particip()  {}
	bool operator()(int i) {
		return (i==-1000);
	}
};

//removePositions: the poitions to be removed are marked as '1' in posToRemoveVec
//all othehr positions are '0'
void sequence::removePositions(const vector<int> & posToRemoveVec)
{
	if(posToRemoveVec.size() != seqLen())
		errorMsg::reportError("the input vector must be same size as sequence length. in sequence::removePositions");
	for (int k=0; k < posToRemoveVec.size(); ++k) {
		if (posToRemoveVec[k] == 1)
			_vec[k] = -1000;
	}
	vector<int>::iterator vec_iter;
	vec_iter =  remove_if(_vec.begin(),_vec.end(),particip());
	_vec.erase(vec_iter,_vec.end()); // pg 1170, primer.
}

//return the number of sites that are specific = not unknown, nor ambiguity, nor gap (for example, for nucleotides it will true for A,C,G, or T).
int sequence::seqLenSpecific() const
{
	int res = 0;
	for (int pos = 0; pos < seqLen(); ++pos)
	{
		if (isSpecific(pos))
			++res;
	}
	return res;
}