// NOTE(review): the line breaks of this file appear to have been lost. The physical line below
// holds several definitions back to back, so every `//` comment in it now runs to the end of the
// physical line. Angle-bracketed text also looks stripped (e.g. `const vector * _weights`, and the
// rate loop header of evalLikelihoodForDistance reads `rateCategorrelations(...)`). Restore the
// file from version control before building; the summary below covers only what is still visible.
//
// Contents of the next physical line, in order:
//  - #include directives for "likeDist.h", "numRec.h" and "someUtil.h".
//  - likeDist::getNonConstStochasticProcess(): calls errorMsg::reportError when _nonConstSpPtr is
//    null; its return statement dereferences _nonConstSpPtr.
//  - class C_evalLikeDistDirect: functor storing references to a stochasticProcess and two
//    sequences plus a weights pointer. operator()(dist) returns the NEGATED value of
//    likeDist::evalLikelihoodForDistance(_sp,_s1,_s2,dist,_weights)
//    (presumably so that a minimizer can be used to maximize the likelihood -- TODO confirm).
//  - likeDist::evalLikelihoodForDistance(): loops over pos in [0, s1.seqLen()) and accumulates
//    log(posLikelihood) multiplied by (*weights)[pos] when weights is non-null, 1.0 otherwise.
//    Positions where both sequences are unknown are skipped. When one sequence is unknown and the
//    other is specific, posLikelihood is sp.freq() of the specific character. The remaining branch
//    (rate categories / ambiguous characters) is damaged; what survives adds
//    sp.freq(iS1)*sp.Pij_t(iS1,iS2,dist*rate)*sp.ratesProb(rateCategor) for related pairs.
//    Asserts posLikelihood != 0.0 before taking the log.
//  - The opening of class C_evalLikeDistDirect_d ("derivative"), which continues on later lines.
// $Id: likeDist.cpp 9582 2011-06-21 11:31:21Z cohenofi $ #include "likeDist.h" #include "numRec.h" #include "someUtil.h" stochasticProcess& likeDist::getNonConstStochasticProcess() { if (!_nonConstSpPtr) { errorMsg::reportError("likeDist::getNonConstStochasticProcess: Can't give non-const stochasticProcess because the stochasticProcess that was given to the constructor of this likeDist object was const"); } return *_nonConstSpPtr; } // ======================= functors needed for the computations ============= class C_evalLikeDistDirect{ private: const stochasticProcess& _sp; const sequence& _s1; const sequence& _s2; const vector * _weights; public: C_evalLikeDistDirect(const stochasticProcess& inS1, const sequence& s1, const sequence& s2, const vector * weights): _sp(inS1),_s1(s1),_s2(s2),_weights(weights) {}; MDOUBLE operator() (MDOUBLE dist) const { return -likeDist::evalLikelihoodForDistance(_sp,_s1,_s2,dist,_weights); } }; MDOUBLE likeDist::evalLikelihoodForDistance(const stochasticProcess& sp, const sequence& s1, const sequence& s2, const MDOUBLE dist, const vector * weights) { MDOUBLE sumL=0.0; // sum of log likelihoods MDOUBLE posLikelihood = 0.0; // likelihood of a specific position for (int pos=0; pos < s1.seqLen(); ++pos){ if (s1.isUnknown(pos) && s2.isUnknown(pos)) continue; // the case of two unknowns posLikelihood = 0.0; if (s1.isUnknown(pos) && s2.isSpecific(pos)) { // this is the more complicated case, where s1 = ?, s2 = specific posLikelihood = sp.freq(s2[pos]); } else if (s2.isUnknown(pos) && s1.isSpecific(pos)) { posLikelihood = sp.freq(s1[pos]); } else { for (int rateCategor = 0; rateCategorrelations(s1[pos],iS1)) && (s2.getAlphabet()->relations(s2[pos],iS2))) { posLikelihood += sp.freq(iS1)*sp.Pij_t(iS1,iS2,dist*rate)*sp.ratesProb(rateCategor); } } } } } // end of for on the rates } assert(posLikelihood!=0.0); sumL += log(posLikelihood)*(weights ? (*weights)[pos]:1.0); } return sumL; }; class C_evalLikeDistDirect_d{ // derivative. 
// NOTE(review): this physical line is the middle of class C_evalLikeDistDirect_d; the file's line
// breaks appear to be lost, so the first `//` below comments out the rest of the physical line.
//
// Visible content: the private members (_sp, _s1, _s2, _weights), the constructor that binds them,
// and the first part of operator()(dist). For each position the operator builds
//   posLikelihood   += Pij_t(a,b,dist*rate)            * freq(a) * ratesProb(rateCategor)
//   posLikelihood_d += dPij_dt(a,b,dist*rate) * rate   * freq(a) * ratesProb(rateCategor)
// over all rate categories. The extra `* rate` is presumably the chain-rule factor for
// differentiating with respect to dist, assuming dPij_dt is the derivative of Pij_t in its time
// argument -- TODO confirm.
// Positions where both sequences are unknown are skipped. Positions with one unknown and one
// specific character use freq() of the specific character and a derivative of 0.0.
private: const stochasticProcess& _sp; const sequence& _s1; const sequence& _s2; const vector * _weights; public: C_evalLikeDistDirect_d(const stochasticProcess& sp, const sequence& s1, const sequence& s2, const vector * weights): _sp(sp),_s1(s1),_s2(s2),_weights(weights) {}; MDOUBLE operator() (MDOUBLE dist) const { MDOUBLE sumL=0.0; // sum of log likelihoods MDOUBLE posLikelihood = 0.0; // likelihood of a specific position MDOUBLE posLikelihood_d = 0.0; // derivative of the likelihood at a specific position for (int pos=0; pos < _s1.seqLen(); ++pos){ if (_s1.isUnknown(pos) && _s2.isUnknown(pos)) continue; // the case of two unknowns posLikelihood = 0.0; posLikelihood_d = 0.0; if (_s1.isUnknown(pos) && _s2.isSpecific(pos)) { // this is the more complicated case, where s1 = ?, s2 = specific posLikelihood = _sp.freq(_s2[pos]); posLikelihood_d =0.0; } else if (_s2.isUnknown(pos) && _s1.isSpecific(pos)) { posLikelihood = _sp.freq(_s1[pos]); posLikelihood_d =0.0; } else { for (int rateCategor = 0; rateCategor<_sp.categories(); ++rateCategor) { MDOUBLE rate = _sp.rates(rateCategor); MDOUBLE pij= 0.0; MDOUBLE dpij=0.0; if (_s1.isSpecific(pos) && _s2.isSpecific(pos)) { //simple case, where AA i is changing to AA j pij= _sp.Pij_t(_s1[pos],_s2[pos],dist*rate); dpij= _sp.dPij_dt(_s1[pos],_s2[pos],dist*rate)*rate; MDOUBLE tmp = _sp.freq(_s1[pos])*_sp.ratesProb(rateCategor); posLikelihood += pij *tmp; posLikelihood_d += dpij*tmp; } else {// this is the most complicated case, when you have combinations of letters, // for example B in one sequence and ? in the other. 
// NOTE(review): the file's line breaks appear to be lost; this physical line holds the tail of one
// class and parts of three functions, and the first `//` in it comments out everything after it.
// Angle-bracketed text also looks stripped in two places (see below).
//
// Contents of the next physical line, in order:
//  - Tail of C_evalLikeDistDirect_d::operator(): for ambiguous characters it loops over every
//    alphabet pair (iS1,iS2) accepted by relations(...) on both sequences and adds the likelihood
//    and derivative terms. The local named `exp` is a plain MDOUBLE (freq * ratesProb), not the
//    math function. Asserts posLikelihood > 0.0, adds (posLikelihood_d/posLikelihood) times the
//    position weight (1.0 when _weights is null), and returns the NEGATED sum.
//  - likeDist::evalLogLikelihoodGivenDistance(s1,s2,dis2evaluate): builds a C_evalLikeDistDirect
//    with NULL weights and returns the negation of its operator() result.
//  - likeDist::giveDistanceThroughCTC(s1,s2,weights,score): reports an error through
//    errorMsg::reportError unless _sp.categories() == 1, then allocates a countTableComponentGam
//    sized by s1's alphabet. The rest of the body is missing (text breaks off at `for (int i=0; i`).
//  - Beginning of myNRmethod(low,current,high,f,df,tol,max_it,zeroFound), whose own comment says it
//    finds a zero of a function. It sets zeroFound = 1 on entry, evaluates f(current), and sets
//    zeroFound = 0 and returns 0 when lowF and highF have the same strict sign. Otherwise it swaps
//    (low,lowF) with (high,highF) when lowF > 0, then replaces `high` by `current` if
//    currentF > 0, or `low` by `current` otherwise. The template header, the early-exit test on
//    fabs(currentF) and the definitions of lowF/highF are damaged
//    (presumably lowF = f(low), highF = f(high) -- TODO confirm against the original file).
for (int iS1 =0; iS1< _sp.alphabetSize(); ++iS1) { for (int iS2 =0; iS2< _sp.alphabetSize(); ++iS2) { if ((_s1.getAlphabet()->relations(_s1[pos],iS1)) && (_s2.getAlphabet()->relations(_s2[pos],iS2))) { MDOUBLE exp = _sp.freq(iS1)*_sp.ratesProb(rateCategor); posLikelihood += exp* _sp.Pij_t(iS1,iS2,dist*rate); posLikelihood_d += exp * _sp.dPij_dt(iS1,iS2,dist*rate)*rate; } } } } }// end of for rate categories } assert(posLikelihood>0.0); sumL += (posLikelihood_d/posLikelihood)*(_weights ? (*_weights)[pos]:1.0); } return -sumL; }; }; // THIS FUNCTION EVALUATES THE LIKELIHOOD GIVEN THE DISTANCE MDOUBLE likeDist::evalLogLikelihoodGivenDistance(const sequence& s1, const sequence& s2, const MDOUBLE dis2evaluate) { C_evalLikeDistDirect Cev(_sp,s1,s2,NULL); return -Cev.operator ()(dis2evaluate); } MDOUBLE likeDist::giveDistanceThroughCTC( const sequence& s1, const sequence& s2, const vector * weights, MDOUBLE* score) const { // only in the case of homogenous model - work through pairwise EM like countTableComponentGam ctc; if (_sp.categories() != 1) { errorMsg::reportError("this function only work for homogenous model."); } ctc.countTableComponentAllocatePlace(s1.getAlphabet()->size(),1); for (int i=0; i MDOUBLE myNRmethod(MDOUBLE low, MDOUBLE current, MDOUBLE high, regF f, dF df, const MDOUBLE tol, const int max_it, int & zeroFound) { // finding zero of a function. zeroFound = 1; MDOUBLE currentF = f(current); if (fabs(currentF)0) && (highF>0)) || ((lowF<0) && (highF<0))) {// unable to find a zero zeroFound = 0; return 0; } if (lowF>0) {// fixing things to be in the right order. MDOUBLE tmp = low; low = high; high = tmp; tmp = lowF; lowF = highF; highF = tmp; } if (currentF>0) { high = current; highF = currentF; } else { low = current; lowF = currentF; } // now the zero is between current and either low or high. 
// NOTE(review): this physical line is the iteration loop of myNRmethod; the file's line breaks
// appear to be lost, and the text is cut off at the end of the line.
//
// Visible behaviour: records the current bracket width |low-high|, then for up to max_it passes
// evaluates currentDF = df(current) and forms the Newton-Raphson candidate
// newGuess = current - currentF/currentDF. When the acceptance test passes, `current` moves to
// newGuess, f is re-evaluated, the bracket end with the matching sign (high if currentF > 0, else
// low) is replaced, and the function returns `current` once the bracket is narrower than tol.
// NOTE(review): the acceptance condition is damaged (`(newGuess high) || ...`), and currentDF is
// not checked for zero before the division.
MDOUBLE currentIntervalSize = fabs(low-high); MDOUBLE oldIntervalSize = currentIntervalSize; // we have to decide if we do NR or divide the interval by two: // we want to check if the next NR step is within our interval // recall that the next NR guess is Xn+1 = Xn - f(Xn) / f'(Xn) // So we want (current - currentF/currentDF) to be between low and high for (int i=0 ; i < max_it; ++i) { MDOUBLE currentDF = df(current); MDOUBLE newGuess = current - currentF/currentDF; if ((newGuess high) || (newGuess>low && newGuess< high)) { // in this case we should do a NR step. current = newGuess; currentF = f(current); if (currentF > 0){ high = current; highF = currentF; } else { low = current; lowF = currentF; } oldIntervalSize = currentIntervalSize; currentIntervalSize =fabs (high-low); if (currentIntervalSize < tol) { return current; } //LOG(5,<<"NR: low= "<