mirror of
http://43.156.76.180:8026/YuuMJ/EukPhylo.git
synced 2025-12-28 05:40:26 +08:00
193 lines
5.2 KiB
C++
193 lines
5.2 KiB
C++
// $Id: sequence.cpp 7627 2010-03-06 21:56:30Z cohenofi $
|
|
|
|
#include "sequence.h"
|
|
|
|
#include <algorithm>
|
|
using namespace std;
|
|
|
|
|
|
// Builds a sequence by decoding the text `str` with the alphabet `inAlph`.
//
// The sequence stores its own clone() of inAlph. `str` is walked in steps of
// _alphabet->stringSize() characters; each step is translated with
// inAlph->fromChar and the resulting id is appended to _vec.
//
//   str    - textual form of the sequence
//   name   - sequence name (also used in the error message below)
//   remark - free-text remark stored with the sequence
//   id     - numeric id stored with the sequence
//   inAlph - alphabet used for decoding; it is dereferenced unconditionally,
//            so it must not be null
sequence::sequence(const string& str,
				   const string& name,
				   const string& remark,
				   const int id,
				   const alphabet* inAlph)
: _alphabet(inAlph->clone()), _remark(remark), _name(name),_id(id)
{
	// The position is handed to fromChar as an int, so bring the length into
	// the int domain once instead of comparing a signed index with size_t on
	// every iteration.
	const int strLen = static_cast<int>(str.size());
	for (int k = 0; k < strLen; k += _alphabet->stringSize()) {
		const int charId = inAlph->fromChar(str, k);
		// NOTE(review): -99 looks like fromChar's "cannot read" result - confirm
		// against the alphabet implementations.
		if (charId == -99) {
			string textToPrint = "unable to read sequence: " + name;
			errorMsg::reportError(textToPrint);
		}

		_vec.push_back(charId);
	}
}
|
|
|
|
|
|
// Copy constructor: duplicates the character vector, remark, name and id of
// `other`, and stores a fresh clone() of other's alphabet rather than sharing
// the pointer.
sequence::sequence(const sequence& other)
	: _vec(other._vec),
	  _alphabet(other._alphabet->clone()),
	  _remark(other._remark),
	  _name(other._name),
	  _id(other._id)
{}
|
|
// convert the other sequence to the alphabet inAlph.
|
|
sequence::sequence(const sequence& other,const alphabet* inAlph)
|
|
: _alphabet(inAlph->clone()), _remark(other._remark), _name(other._name), _id(other._id)
|
|
{
|
|
const mulAlphabet* pMulAlphabet;
|
|
// if the other.alphabet is amino or nucleotide and the inAlph is indel
|
|
|
|
if ( (other._alphabet->size() == 20 && inAlph->size() == 2)
|
|
|| (other._alphabet->size() == 4 && inAlph->size() == 2) )
|
|
{
|
|
for (int k=0; k < other.seqLen() ;k += other._alphabet->stringSize())
|
|
{
|
|
int charId = other._vec[k];
|
|
|
|
if (charId == other._alphabet->gap())
|
|
_vec.push_back(inAlph->fromChar("-",0));
|
|
else
|
|
_vec.push_back(inAlph->fromChar("X",0)); //also converts "." (charId==-3) to "X"
|
|
// unknown amino/nucleotide is converted to "X" and not to "?"
|
|
}
|
|
}
|
|
|
|
// if the other.alphabet is amino or nucleotide and the inAlph is mulAlphabet
|
|
else if ( (other._alphabet->size() == 20 && inAlph->size()%20 == 0)
|
|
|| (other._alphabet->size() == 4 && inAlph->size()%4 == 0) )
|
|
{
|
|
for (int k=0; k < other.seqLen() ;++k)
|
|
{
|
|
int charId = other._vec[k];
|
|
string ch = other._alphabet->fromInt(charId);
|
|
int mulCharId = _alphabet->fromChar(ch,0);
|
|
_vec.push_back(mulCharId);
|
|
}
|
|
// debug OZ
|
|
//cout << "other sequence: " << other << endl;
|
|
//cout << "mul sequence " << (*this) << endl;
|
|
// end of debug
|
|
}
|
|
// if the other.alphabet is mulAlphabet and the inAlph is it's baseAlphabet
|
|
// (for example, if other.alphabet is a multiplied-amino and inAlph is amino, then the converted sequence
|
|
// will have alphabet amino)
|
|
else if ( ((inAlph->size() == 20) && (other._alphabet->size()%20 == 0))
|
|
|| (inAlph->size() == 4) && (other._alphabet->size()%4 == 0))
|
|
{
|
|
pMulAlphabet=(mulAlphabet*)(other._alphabet);
|
|
for (int k=0; k < other.seqLen() ;++k)
|
|
{
|
|
int mulCharId = other._vec[k];
|
|
int baseId = pMulAlphabet->convertToBasedAlphaInt(mulCharId);
|
|
_vec.push_back(baseId);
|
|
}
|
|
}
|
|
|
|
// for gainLoss project - {0,1} in both, hence no conversion needed.
|
|
// it should be the same for all cases with same alphabet
|
|
else if ( inAlph->size() == other._alphabet->size() )
|
|
{
|
|
pMulAlphabet=(mulAlphabet*)(other._alphabet);
|
|
for (int k=0; k < other.seqLen() ;++k)
|
|
{
|
|
int mulCharId = other._vec[k];
|
|
//int baseId = pMulAlphabet->convertToBasedAlphaInt(mulCharId);
|
|
_vec.push_back(mulCharId);
|
|
}
|
|
}
|
|
// I tried to implement it using dynamic_cast but it doesn't work...
|
|
/*else if
|
|
(
|
|
(pMulAlphabet = dynamic_cast<const mulAlphabet*>(other._alphabet)) != NULL
|
|
)
|
|
{
|
|
if (pMulAlphabet->getBaseAlphabet()->size() == inAlph->size())
|
|
{
|
|
for (int k=0; k < other.seqLen() ;++k)
|
|
{
|
|
int mulCharId = other._vec[k];
|
|
int baseId = pMulAlphabet->convertToBasedAlphaInt(mulCharId);
|
|
_vec.push_back(baseId);
|
|
}
|
|
}
|
|
}*/
|
|
|
|
// (currently, there is no implimentions for other converts)
|
|
else
|
|
{
|
|
string error = "unable to convert this kind of alphabet";
|
|
errorMsg::reportError(error);
|
|
}
|
|
}
|
|
|
|
// Destructor: releases the alphabet clone created by the constructors.
sequence::~sequence()
{
	// deleting a null pointer is a no-op, so no explicit guard is needed
	delete _alphabet;
}
|
|
|
|
void sequence::resize(const int k, const int* val) {
|
|
if (val == NULL) {
|
|
_vec.resize(k,_alphabet->unknown());
|
|
}
|
|
else {
|
|
_vec.resize(k,*val);
|
|
}
|
|
}
|
|
|
|
// Returns the textual form of the whole sequence: the concatenation of
// _alphabet->fromInt() over every stored character, in order.
string sequence::toString() const{
	string text;
	// index with the container's own unsigned size domain instead of comparing
	// a signed int against _vec.size()
	for (size_t k = 0; k < _vec.size(); ++k) {
		text += _alphabet->fromInt(_vec[k]);
	}
	return text;
}
|
|
|
|
// Returns the textual form of the single character stored at position pos.
// pos is used as an index into _vec without a range check.
string sequence::toString(const int pos) const{
	const int charId = _vec[pos];
	string text = _alphabet->fromInt(charId);
	return text;
}
|
|
|
|
void sequence::addFromString(const string& str) {
|
|
for (int k=0; k < str.size() ; k+=_alphabet->stringSize()) {
|
|
_vec.push_back(_alphabet->fromChar(str,k));
|
|
}
|
|
}
|
|
|
|
// Predicate for remove_if: returns true for entries equal to -1000, the value
// that sequence::removePositions writes into the positions it wants dropped.
class particip {
public:
	explicit particip() {}
	// const: the predicate has no state, and a const call operator lets it be
	// invoked through const objects and const references as well
	bool operator()(int i) const {
		return (i==-1000);
	}
};
|
|
|
|
//removePositions: the poitions to be removed are marked as '1' in posToRemoveVec
|
|
//all othehr positions are '0'
|
|
void sequence::removePositions(const vector<int> & posToRemoveVec)
|
|
{
|
|
if(posToRemoveVec.size() != seqLen())
|
|
errorMsg::reportError("the input vector must be same size as sequence length. in sequence::removePositions");
|
|
for (int k=0; k < posToRemoveVec.size(); ++k) {
|
|
if (posToRemoveVec[k] == 1)
|
|
_vec[k] = -1000;
|
|
}
|
|
vector<int>::iterator vec_iter;
|
|
vec_iter = remove_if(_vec.begin(),_vec.end(),particip());
|
|
_vec.erase(vec_iter,_vec.end()); // pg 1170, primer.
|
|
}
|
|
|
|
//return the number of sites that are specific = not unknown, nor ambiguity, nor gap (for example, for nucleotides it will true for A,C,G, or T).
|
|
int sequence::seqLenSpecific() const
|
|
{
|
|
int res = 0;
|
|
for (int pos = 0; pos < seqLen(); ++pos)
|
|
{
|
|
if (isSpecific(pos))
|
|
++res;
|
|
}
|
|
return res;
|
|
}
|
|
|
|
|