// Mirror of http://43.156.76.180:8026/YuuMJ/EukPhylo.git
// (synced 2025-12-28 06:50:25 +08:00; 421 lines, 14 KiB, C++)
// $Id: posteriorDistance.cpp 5883 2009-02-06 10:42:11Z privmane $
|
|
|
|
#include "posteriorDistance.h"
|
|
#include "numRec.h"
|
|
#include "countTableComponent.h"
|
|
#include "likeDist.h"
|
|
#include "uniDistribution.h"
|
|
#include "someUtil.h"
|
|
#include "jcDistance.h"
|
|
#include <cmath>
|
|
|
|
|
|
class C_eval_gammaMLDistancesPosterior_d{
|
|
private:
|
|
const stochasticProcess& _sp;
|
|
const sequence& _s1;
|
|
const sequence& _s2;
|
|
const Vdouble* _weights;
|
|
const VVdoubleRep& _posteriorProb; // pos, rate
|
|
public:
|
|
C_eval_gammaMLDistancesPosterior_d(const stochasticProcess& sp,
|
|
const sequence& s1,
|
|
const sequence& s2,
|
|
const VVdoubleRep& posteriorProb,
|
|
const Vdouble * weights)
|
|
: _sp(sp),
|
|
_s1(s1),
|
|
_s2(s2),
|
|
_weights(weights),
|
|
_posteriorProb(posteriorProb)
|
|
{};
|
|
|
|
|
|
MDOUBLE operator() (MDOUBLE dist) {
|
|
MDOUBLE sumL=0.0;
|
|
doubleRep posLikelihood = 0.0;
|
|
MDOUBLE posLikelihood_d = 0.0;
|
|
for (int pos=0; pos < _s1.seqLen(); ++pos){
|
|
if (_s1.isUnknown(pos) && _s2.isUnknown(pos)) continue; // the case of two unknowns
|
|
posLikelihood = 0.0;
|
|
posLikelihood_d = 0.0;
|
|
if (_s1.isUnknown(pos) && _s2.isSpecific(pos)) {
|
|
// this is the more complicated case, where s1 = ?, s2 = specific
|
|
posLikelihood = _sp.freq(_s2[pos]);
|
|
posLikelihood_d =0.0;
|
|
}
|
|
else if (_s2.isUnknown(pos) && _s1.isSpecific(pos)) {
|
|
posLikelihood = _sp.freq(_s1[pos]);
|
|
posLikelihood_d =0.0;
|
|
} else {
|
|
for (int rateCategor = 0; rateCategor<_sp.categories(); ++rateCategor) {
|
|
MDOUBLE rate = _sp.rates(rateCategor);
|
|
MDOUBLE pij= 0.0;
|
|
MDOUBLE dpij=0.0;
|
|
if (_s1.isSpecific(pos) && _s2.isSpecific(pos)) {//simple case, where AA i is changing to AA j
|
|
pij= _sp.Pij_t(_s1[pos],_s2[pos],dist*rate);
|
|
dpij= _sp.dPij_dt(_s1[pos],_s2[pos],dist*rate)*rate;
|
|
doubleRep tmp = _sp.freq(_s1[pos])*_posteriorProb[pos][rateCategor];
|
|
posLikelihood += pij *tmp;
|
|
posLikelihood_d += dpij*convert(tmp);
|
|
}
|
|
else {// this is the most complicated case, when you have combinations of letters,
|
|
// for example B in one sequence and ? in the other.
|
|
for (int iS1 =0; iS1< _sp.alphabetSize(); ++iS1) {
|
|
for (int iS2 =0; iS2< _sp.alphabetSize(); ++iS2) {
|
|
if ((_s1.getAlphabet()->relations(_s1[pos],iS1)) &&
|
|
(_s2.getAlphabet()->relations(_s2[pos],iS2))) {
|
|
doubleRep exp = _sp.freq(iS1)*_posteriorProb[pos][rateCategor];;
|
|
posLikelihood += exp* _sp.Pij_t(iS1,iS2,dist*rate);
|
|
posLikelihood_d += convert(exp) * _sp.dPij_dt(iS1,iS2,dist*rate)*rate;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}// end of for rate categories
|
|
}
|
|
assert(posLikelihood!=0.0);
|
|
sumL += posLikelihood_d/convert(posLikelihood)*(_weights ? (*_weights)[pos]:1.0);
|
|
}
|
|
return -sumL;
|
|
};
|
|
};
|
|
|
|
class C_eval_gammaMLDistancesPosterior{
|
|
private:
|
|
const stochasticProcess& _sp;
|
|
const sequence& _s1;
|
|
const sequence& _s2;
|
|
const Vdouble* _weights;
|
|
const VVdoubleRep& _posteriorProb; // pos, rate
|
|
public:
|
|
C_eval_gammaMLDistancesPosterior(const stochasticProcess& sp,
|
|
const sequence& s1,
|
|
const sequence& s2,
|
|
const VVdoubleRep& posteriorProb,
|
|
const Vdouble * weights): _sp(sp),
|
|
_s1(s1),
|
|
_s2(s2),
|
|
_weights(weights),
|
|
_posteriorProb(posteriorProb)
|
|
{};
|
|
|
|
|
|
MDOUBLE operator() (MDOUBLE dist) {
|
|
/*DEBUG LOG(9,<<"C_eval_gammaMLDistancesPosterior::operator():"); LOGDO(9,printTime(myLog::LogFile())); LOG(9,<<": dist = "<<dist<<endl); DEBUG*/
|
|
MDOUBLE sumL=0.0;
|
|
doubleRep posLikelihood = 0.0;
|
|
|
|
for (int pos=0; pos < _s1.seqLen(); ++pos){
|
|
/*DEBUG LOG(9,<<"C_eval_gammaMLDistancesPosterior::operator():"); LOGDO(9,printTime(myLog::LogFile())); LOG(9,<<": pos = "<<pos<<endl); DEBUG*/
|
|
if (_s1.isUnknown(pos) && _s2.isUnknown(pos)) continue; // the case of two unknowns
|
|
/*DEBUG LOG(9,<<"_posteriorProb ="<<_posteriorProb[pos]<<endl); DEBUG*/
|
|
posLikelihood = 0.0;
|
|
/*DEBUG LOG(9,<<"posLikelihood = "<<posLikelihood<<endl); DEBUG*/
|
|
if (_s1.isUnknown(pos) && _s2.isSpecific(pos)) {
|
|
// this is the more complicated case, where s1 = ?, s2 = specific
|
|
posLikelihood = _sp.freq(_s2[pos]);
|
|
}
|
|
else if (_s2.isUnknown(pos) && _s1.isSpecific(pos)) {
|
|
posLikelihood = _sp.freq(_s1[pos]);
|
|
} else {
|
|
for (int rateCategor = 0; rateCategor<_sp.categories(); ++rateCategor) {
|
|
MDOUBLE rate = _sp.rates(rateCategor);
|
|
/*DEBUG LOG(9,<<"rate = "<<rate<<endl); DEBUG*/
|
|
MDOUBLE pij= 0.0;
|
|
if (_s1.isSpecific(pos) && _s2.isSpecific(pos)) {//simple case, where AA i is changing to AA j
|
|
/*DEBUG LOG(9,<<"Both are specific"<<endl); DEBUG*/
|
|
pij= _sp.Pij_t(_s1[pos],_s2[pos],dist*rate);
|
|
doubleRep exp = _sp.freq(_s1[pos])*_posteriorProb[pos][rateCategor];
|
|
/*DEBUG LOG(9,<<"exp = "<<exp<<endl); DEBUG*/
|
|
posLikelihood += pij *exp;
|
|
/*DEBUG LOG(9,<<"posLikelihood = "<<posLikelihood<<endl); DEBUG*/
|
|
}
|
|
else {// this is the most complicated case, when you have combinations of letters,
|
|
// for example B in one sequence and ? in the other.
|
|
/*DEBUG LOG(9,<<"One or both are non-specific"<<endl); DEBUG*/
|
|
for (int iS1 =0; iS1< _sp.alphabetSize(); ++iS1) {
|
|
for (int iS2 =0; iS2< _sp.alphabetSize(); ++iS2) {
|
|
if ((_s1.getAlphabet()->relations(_s1[pos],iS1)) &&
|
|
(_s2.getAlphabet()->relations(_s2[pos],iS2))) {
|
|
doubleRep exp = _sp.freq(iS1)*_posteriorProb[pos][rateCategor];
|
|
posLikelihood += exp* _sp.Pij_t(iS1,iS2,dist*rate);
|
|
}
|
|
}
|
|
}
|
|
/*DEBUG LOG(9,<<"posLikelihood = "<<posLikelihood<<endl); DEBUG*/
|
|
}
|
|
}// end of for rate categories
|
|
}
|
|
assert(posLikelihood!=0.0);
|
|
sumL += log(posLikelihood)*(_weights ? (*_weights)[pos]:1);
|
|
}
|
|
/*DEBUG LOG(9,<<"C_eval_gammaMLDistancesPosterior::operator():"); LOGDO(9,printTime(myLog::LogFile())); LOG(9,<<": returning "<<(-sumL)<<endl); DEBUG*/
|
|
return -sumL;
|
|
};
|
|
};
|
|
|
|
// Constructs from a const stochastic process and a table of posterior
// probabilities. sp, toll and maxPairwiseDistance are forwarded unchanged to
// the likeDist base; _posteriorProb is initialized from posteriorProb.
posteriorDistance::posteriorDistance(const stochasticProcess & sp,
                                     const VVdoubleRep & posteriorProb,
                                     const MDOUBLE toll,
                                     const MDOUBLE maxPairwiseDistance)
    : likeDist(sp,toll,maxPairwiseDistance),
      _posteriorProb(posteriorProb)
{}
|
|
|
|
// Same as the const-process constructor, but takes a non-const stochastic
// process and therefore selects the likeDist constructor overload that
// accepts one. _posteriorProb is initialized from posteriorProb.
posteriorDistance::posteriorDistance(stochasticProcess & sp,
                                     const VVdoubleRep & posteriorProb,
                                     const MDOUBLE toll,
                                     const MDOUBLE maxPairwiseDistance)
    : likeDist(sp,toll,maxPairwiseDistance),
      _posteriorProb(posteriorProb)
{}
|
|
|
|
// Constructs from a const stochastic process without posterior
// probabilities: _posteriorProb is initialized with 0.
// NOTE(review): giveDistance indexes _posteriorProb[pos][category], so
// presumably the posteriors must be supplied some other way before it is
// called on an object built here - confirm against the header.
posteriorDistance::posteriorDistance(const stochasticProcess & sp,
                                     const MDOUBLE toll,
                                     const MDOUBLE maxPairwiseDistance)
    : likeDist(sp,toll,maxPairwiseDistance),
      _posteriorProb(0)
{}
|
|
|
|
|
|
// Constructs from a non-const stochastic process without posterior
// probabilities: _posteriorProb is initialized with 0.
// NOTE(review): giveDistance indexes _posteriorProb[pos][category], so
// presumably the posteriors must be supplied some other way before it is
// called on an object built here - confirm against the header.
posteriorDistance::posteriorDistance(stochasticProcess & sp,
                                     const MDOUBLE toll,
                                     const MDOUBLE maxPairwiseDistance)
    : likeDist(sp,toll,maxPairwiseDistance),
      _posteriorProb(0)
{}
|
|
|
|
// Copy constructor: copies the likeDist base part and the posterior table of
// `other`.
// The base is initialized through a reference cast, so the likeDist copy
// constructor reads `other` directly; casting to the value type likeDist
// would first build a sliced temporary and then copy that temporary again.
posteriorDistance::posteriorDistance(const posteriorDistance& other)
    : likeDist(static_cast<const likeDist&>(other)),
      _posteriorProb(other._posteriorProb)
{}
|
|
|
|
|
|
|
|
// distance is computed based on the posterior probability
|
|
const MDOUBLE posteriorDistance::giveDistance(const sequence& s1,
|
|
const sequence& s2,
|
|
const Vdouble * weights,
|
|
MDOUBLE* score) const
|
|
{
|
|
/*DEBUG LOG(9,<<"posteriorDistance::giveDistance - start"<<endl); LOGDO(9,printTime(myLog::LogFile())); DEBUG*/
|
|
const MDOUBLE ax=0, cx=_maxPairwiseDistance;
|
|
MDOUBLE bx=_jcDist.giveDistance(s1,s2,weights,score)/*=1.0*/;
|
|
if (!(bx==bx)) bx = 1.0;
|
|
if (!(bx>0.0)) bx = 0.000001;
|
|
MDOUBLE dist=-1.0;
|
|
MDOUBLE resL = -dbrent(ax,bx,cx,
|
|
C_eval_gammaMLDistancesPosterior(_sp,s1,s2,_posteriorProb,weights),
|
|
C_eval_gammaMLDistancesPosterior_d(_sp,s1,s2,_posteriorProb,weights),
|
|
_toll,
|
|
&dist);
|
|
if (score) *score = resL;
|
|
return dist;
|
|
}
|
|
|
|
// =============================
|
|
// OBSOLETE: this function was moved to pairwiseGammaDistance.cpp
|
|
class C_evalAlphaForPairOfSeq{
|
|
private:
|
|
const countTableComponentGam& _ctc;
|
|
stochasticProcess& _sp;
|
|
const MDOUBLE _branchL;
|
|
public:
|
|
C_evalAlphaForPairOfSeq(const countTableComponentGam& ctc,
|
|
const MDOUBLE branchL,
|
|
stochasticProcess& sp):_ctc(ctc), _sp(sp), _branchL(branchL) {};
|
|
|
|
MDOUBLE operator() (MDOUBLE alpha) {
|
|
(static_cast<gammaDistribution*>(_sp.distr()))->setAlpha(alpha);
|
|
C_evalLikeDist cev(_ctc,_sp);
|
|
MDOUBLE L=cev(_branchL);
|
|
LOG(10,<<"check alpha="<<alpha<<", bl="<<_branchL<<" gives "<<L<<endl);
|
|
return L;
|
|
};
|
|
};
|
|
|
|
// OBSOLETE: this function was moved to pairwiseGammaDistance.cpp
|
|
// returns the best alpha.
|
|
MDOUBLE optimizeAlphaFixedDist(stochasticProcess & sp,
|
|
const countTableComponentGam & ctc,
|
|
const MDOUBLE branchL,
|
|
const vector<MDOUBLE> * weights,
|
|
MDOUBLE* score=NULL){ // changes sp.
|
|
MDOUBLE bestA=0.0;
|
|
MDOUBLE bestQ=0.0;
|
|
const MDOUBLE upperBoundOnAlpha = 15.0;
|
|
const MDOUBLE epsilonAlphaOptimization = 0.01;
|
|
const MDOUBLE cx=upperBoundOnAlpha;// left, midle, right limit on alpha
|
|
const MDOUBLE bx=cx*0.3;
|
|
const MDOUBLE ax=0.0;
|
|
|
|
|
|
bestQ = -brent(ax,bx,cx,
|
|
C_evalAlphaForPairOfSeq(ctc,branchL,sp),
|
|
epsilonAlphaOptimization,
|
|
&bestA);
|
|
(static_cast<gammaDistribution*>(sp.distr()))->setAlpha(bestA);
|
|
if (score) *score = bestQ;
|
|
return bestA;
|
|
}
|
|
|
|
|
|
|
|
// OBSOLETE: this function was moved to pairwiseGammaDistance.cpp
|
|
class C_eval_gammaMLAlpha{
|
|
private:
|
|
const stochasticProcess& _sp;
|
|
const sequence& _s1;
|
|
const sequence& _s2;
|
|
const MDOUBLE _distance;
|
|
const Vdouble* _weights;
|
|
// const VVdoubleRep& _posteriorProb; // pos, rate
|
|
public:
|
|
C_eval_gammaMLAlpha(const stochasticProcess& sp,
|
|
const sequence& s1,
|
|
const sequence& s2,
|
|
const MDOUBLE distance,
|
|
// const VVdoubleRep& posteriorProb,
|
|
const Vdouble * weights): _sp(sp),
|
|
_s1(s1),
|
|
_s2(s2),
|
|
_distance(distance),
|
|
_weights(weights)
|
|
// _posteriorProb(posteriorProb)
|
|
{};
|
|
|
|
// this cast is required as the distribution within the
|
|
// stochasticProcess is kept as the parent "distribution" class that
|
|
// knows nothing of Alpha
|
|
void setAlpha(MDOUBLE alpha) {
|
|
(static_cast<gammaDistribution*>(_sp.distr()))->setAlpha(alpha);
|
|
}
|
|
|
|
|
|
MDOUBLE operator() (MDOUBLE alpha) {
|
|
setAlpha(alpha);
|
|
MDOUBLE likelihood = likeDist::evalLikelihoodForDistance(_sp,_s1,_s2,_distance,_weights);
|
|
LOG(11,<<"check alpha="<<alpha<<", bl="<<_distance<<" gives "<<likelihood<<endl);
|
|
return -likelihood;
|
|
};
|
|
} ;
|
|
|
|
|
|
// OBSOLETE: this function was moved to pairwiseGammaDistance.cpp
|
|
// returns the best alpha.
|
|
MDOUBLE optimizeAlphaFixedDist( const sequence& s1,
|
|
const sequence& s2,
|
|
stochasticProcess & sp,
|
|
const MDOUBLE branchL,
|
|
const vector<MDOUBLE> * weights,
|
|
MDOUBLE* score=NULL){ // changes sp.
|
|
MDOUBLE bestA=0.0;
|
|
MDOUBLE bestQ=0.0;
|
|
const MDOUBLE upperBoundOnAlpha = 15.0;
|
|
const MDOUBLE epsilonAlphaOptimization = 0.01;
|
|
const MDOUBLE cx=upperBoundOnAlpha;// left, midle, right limit on alpha
|
|
const MDOUBLE bx=cx*0.3;
|
|
const MDOUBLE ax=0.0;
|
|
|
|
|
|
bestQ = -brent(ax,bx,cx,
|
|
C_eval_gammaMLAlpha(sp,s1,s2,branchL,weights),
|
|
epsilonAlphaOptimization,
|
|
&bestA);
|
|
(static_cast<gammaDistribution*>(sp.distr()))->setAlpha(bestA);
|
|
if (score) *score = bestQ;
|
|
return bestA;
|
|
}
|
|
|
|
|
|
|
|
// Produces a starting distance for the iterative alpha/distance searches.
// A temporary stochastic process is built from a uniDistribution and the
// Pij accelerator of _sp, and the distance returned by a likeDist over that
// process is used.
//   s1, s2   the sequence pair
//   weights  optional per-position weights, forwarded to likeDist
//   score    optional output, forwarded to likeDist::giveDistance
MDOUBLE posteriorDistance::giveInitialGuessOfDistance(
    const sequence& s1,
    const sequence& s2,
    const vector<MDOUBLE> * weights,
    MDOUBLE* score) const {
    uniDistribution uniformRates;
    stochasticProcess uniformSp(&uniformRates,_sp.getPijAccelerator());
    likeDist uniformDist(uniformSp);
    const MDOUBLE initialGuess = uniformDist.giveDistance(s1,s2,weights,score);
    return initialGuess;
}
|
|
|
|
// OBSOLETE? What's the difference between this function and giveDistanceOptAlphaForPairOfSequences???
|
|
MDOUBLE posteriorDistance::giveDistanceOptAlphaForEachPairOfSequences( const sequence& s1,
|
|
const sequence& s2,
|
|
const vector<MDOUBLE> * weights,
|
|
MDOUBLE* score,
|
|
MDOUBLE* alpha) const {
|
|
|
|
MDOUBLE toll = 0.0001;
|
|
|
|
MDOUBLE resL = 0.0;
|
|
MDOUBLE resQ = 0.0;
|
|
MDOUBLE currentDistance = giveInitialGuessOfDistance(s1,s2,weights,&resL);
|
|
|
|
countTableComponentGam ctc; // from technical reasons.
|
|
ctc.countTableComponentAllocatePlace(_sp.alphabetSize(),_sp.categories());
|
|
|
|
stochasticProcess tmpSp(_sp);
|
|
for (int z=0; z<s1.seqLen(); ++z) {
|
|
for (int j=0; j < tmpSp.categories(); ++j) {
|
|
ctc.addToCounts(s1[z],s2[z],j,weights?(*weights)[z]:tmpSp.ratesProb(j));
|
|
}
|
|
}
|
|
const int maxIter = 30;
|
|
MDOUBLE newDist = 0.0;
|
|
MDOUBLE lastBestAlpha = 0.0;
|
|
for (int i=0; i < maxIter; ++i) {
|
|
lastBestAlpha = optimizeAlphaFixedDist(tmpSp,ctc,currentDistance,weights,&resL); // changes sp.
|
|
(static_cast<gammaDistribution*>(tmpSp.distr()))->setAlpha(lastBestAlpha);
|
|
LOG(8,<<"lastBestAlpha="<<lastBestAlpha<<"("<<(static_cast<gammaDistribution*>(tmpSp.distr()))->getAlpha()<<")"<<"\t L="<<resL<<"\t");
|
|
likeDist tmpld(tmpSp); // we must create a new ld, that will include the stochastic process with the new alpha
|
|
newDist = tmpld.giveDistance(ctc,resQ);
|
|
LOG(8,<<"dist="<<newDist<<endl);
|
|
if (fabs(newDist-currentDistance)<toll) break;
|
|
currentDistance = newDist;
|
|
}
|
|
if (score) *score = resL;
|
|
if (alpha) *alpha = lastBestAlpha;
|
|
assert (newDist >=0);
|
|
return newDist;
|
|
|
|
}
|
|
|
|
|
|
|
|
// OBSOLETE: this function was moved to pairwiseGammaDistance.cpp
|
|
MDOUBLE posteriorDistance::giveDistanceOptAlphaForPairOfSequences( const sequence& s1,
|
|
const sequence& s2,
|
|
const vector<MDOUBLE> * weights,
|
|
MDOUBLE* score,
|
|
MDOUBLE* alpha) const {
|
|
|
|
MDOUBLE toll = 0.0001;
|
|
|
|
MDOUBLE resL = 0.0;
|
|
MDOUBLE currentDistance = giveInitialGuessOfDistance(s1,s2,weights,&resL);
|
|
|
|
countTableComponentGam ctc; // from technical reasons.
|
|
|
|
stochasticProcess tmpSp(_sp);
|
|
|
|
const int maxIter = 30;
|
|
MDOUBLE newDist = 0.0;
|
|
MDOUBLE lastBestAlpha = 0.0;
|
|
for (int i=0; i < maxIter; ++i) {
|
|
lastBestAlpha = optimizeAlphaFixedDist(s1, s2, tmpSp, currentDistance, weights, &resL); // changes sp.
|
|
LOG(8,<<"lastBestAlpha="<<lastBestAlpha<<"("<<"\t L="<<resL<<"\t");
|
|
likeDist tmpld(tmpSp); // we must create a new ld, that will include the stochastic process with the new alpha
|
|
newDist = tmpld.giveDistance(s1, s2, weights, &resL);
|
|
LOG(8,<<"dist="<<newDist<<"(L="<<resL<<")"<<endl);
|
|
if (fabs(newDist-currentDistance)<toll) break;
|
|
currentDistance = newDist;
|
|
}
|
|
if (score) *score = resL;
|
|
if (alpha) *alpha = lastBestAlpha;
|
|
assert (newDist >=0);
|
|
return newDist;
|
|
|
|
}
|