// $Id: mainSemphy.cpp 6002 2009-03-20 19:39:03Z privmane $ // DO: (1) ADD NUMBER OF DESCRETE CATEGORIES TO THE OPTIONS. // DO: (2) ADD NUMBER OF RANDOM STARTS TO THE OPTIONS. // DO: (3) CHENYSHEV PARAMETERS TO THE OPTIONS. // DO: (4) ADD PARAMETER CONCERNING THE STARTING NJ TREE. // DO: (5) ADD PARAMETER CONCERNING THE GAMMA PARAM OPTIMIZATION. #include "definitions.h" #include "mainSemphy.h" #include "jcDistance.h" #include "distanceTable.h" #include "nj.h" #include "constraints.h" #include "bblEM.h" #include "semphySearchBestTree.h" #include "logFile.h" #include "readDatMatrix.h" #include "datMatrixHolder.h" #include "likelihoodComputation.h" #include "likeDist.h" #include "bestAlpha.h" #include "bestParamUSSRV.h" #include "bestTamura92param.h" #include "bestGtrModelParams.h" #include "findRateOfGene.h" #include "talRandom.h" #include "someUtil.h" #include "getRandomWeights.h" #include "codon.h" #include "recognizeFormat.h" #include "generalGammaDistributionLaguerre.h" #include "correctToCanonialForm.h" #include "ssrvDistanceSeqs2Tree.h" #include #include #include #include #include using namespace std; //******************************************************************************* // constructors //******************************************************************************* mainSemphy::mainSemphy(const gengetopt_args_info& gn): _args_info(gn),_evolObj(_args_info),_weights(NULL), _s2tPtr(NULL), _numberOfRandomStart(1){ initializeFromArgsInfo(); } mainSemphy::mainSemphy(int argc, char* argv[]):_weights(NULL), _s2tPtr(NULL) , _numberOfRandomStart(1){ readCommandLineInformation(argc,argv); initializeFromArgsInfo(); myLog::printArgv(1, argc, argv); } void mainSemphy::readCommandLineInformation(int argc, char* argv[]) { if (argc < 2) errorMsg::reportError("The program must get some parameters in the command line, use -h for help"); if (cmdline_parser(argc, argv, &_args_info) != 0) { errorMsg::reportError("error reading command line",1); } cmdline2EvolObjs _evolObj(_args_info); } void mainSemphy::initializeFromArgsInfo(){ // AND WE CHECK TO SEE IF WE ARE ASKED FOR SOMETING WE CAN NOT DELIVER // if (_args_info.min_improv_given) errorMsg::reportError("minimum-improvement not yet implimented"); // if (_args_info.maxDistance_given) errorMsg::reportError("max distance not yet implimented"); // if (_args_info.exact_given) errorMsg::reportError("exact counts not yet implimented"); argsConsistencyCheck(); _evolObj.initializeRandomSeed(); _evolObj.initializeLogFile(); // initializeAlphabet(); _alphP=_evolObj.cmdline2Alphabet(); // initializeSequenceContainer(); _sc = _evolObj.cmdline2SequenceContainer(_alphP); // takeCareOfGaps(); _evolObj.takeCareOfGaps(_sc); // readTreeFile(); _etPtr=_evolObj.cmdline2Tree(); // make sure tree is in canonical form semphyCorrectToCanonialForm(); // readConstraintTreeFile(); _constraintTreePtr = _evolObj.cmdline2ConstraintTree(); // initializeStochaticProcess(); _spP=_evolObj.cmdline2StochasticProcessSafe(); // initializeOutputStream(); //out(). _outPtr = _evolObj.cmdline2OutputStream(); printSemphyTitle(out()); constraintTreeConsistencyCheck(); // check that the tree is compatible with the constraint tree if given. if (_args_info.posteriorRates_given) _posteriorRates = _evolObj.cmdline2PosteriorRates(); } void mainSemphy::printSemphyTitle(ostream & out) { if (_args_info.verbose_arg<0) return; // just don't print if using -v-1 out<<"#################################################################"<getNodesNum(); vector isRealTaxa(nNodes,0); vector all; _etPtr->getAllNodes(all,_etPtr->getRoot()); for (vector::iterator i=all.begin();i!=all.end();++i) isRealTaxa[(*i)->id()]=(*i)->isLeaf(); VVdouble dummyDistanceTable(nNodes); for (int i=0;i(_spP->getPijAccelerator()->getReplacementModel())) { // optimizing params of the tamura92 model bestTamura92ParamAlphaAndBBL tmpbestAlpha(*_etPtr,_sc,*_spP, _weights, 5, 0.05, _args_info.epsilonLikelihoodImprovement4alphaOptimiz_arg, _args_info.epsilonLikelihoodImprovement4alphaOptimiz_arg, _args_info.epsilonLikelihoodImprovement4alphaOptimiz_arg, _args_info.epsilonLikelihoodImprovement4BBL_arg, 5.0, _args_info.maxNumOfBBLIter_arg, 1.5 ); bestAlpha = tmpbestAlpha.getBestAlpha(); _treeLogLikelihood = tmpbestAlpha.getBestL(); } else if (dynamic_cast(_spP->getPijAccelerator()->getReplacementModel())) { // optimizing params of the gtr model bestGtrModel optimizer(*_etPtr, _sc, *_spP, _weights, 5, _args_info.epsilonLikelihoodImprovement4alphaOptimiz_arg, _args_info.epsilonLikelihoodImprovement4alphaOptimiz_arg, true, true); bestAlpha=optimizer.getBestAlpha(); _treeLogLikelihood = optimizer.getBestL(); } else { bestAlphaAndBBL tmpbestAlpha(*_etPtr,_sc,*_spP, _weights, 1.5, 5.0, _args_info.epsilonLikelihoodImprovement4alphaOptimiz_arg, _args_info.epsilonLikelihoodImprovement4BBL_arg, _args_info.maxNumOfBBLIter_arg); bestAlpha = tmpbestAlpha.getBestAlpha(); _treeLogLikelihood = tmpbestAlpha.getBestL(); } out()<<"# Best alpha after branch length optimiziation"<output(out()); LOG(printMsg,<<"# The tree"<output(myLog::LogFile())); } void mainSemphy::printTreeToTreeFile() const { if (_args_info.treeoutputfile_given) { ofstream treeO(_args_info.treeoutputfile_arg); if (! treeO.is_open()) { errorMsg::reportError("can not open tree output file"); } _etPtr->output(treeO); treeO.close(); } } void mainSemphy::nullifyTree() { if (_etPtr) delete (_etPtr); _etPtr = NULL; } // This function is used, so that for example in bp, a new tree will be computed in NJ. // The computeNJ for example will not compute the NJ tree if a tree is given. void mainSemphy::setTree(const tree& inEtPtr) { if (_etPtr) delete (_etPtr); _etPtr = new tree(inEtPtr); } // void mainSemphy::getDistanceTableAndNames(VVdouble& disTable, // vector & vNames, // const distanceMethod* cd) const { // giveDistanceTable(cd,_sc,disTable,vNames,_weights); // } // void mainSemphy::computeNJtreeFromDisTableAndNames(const VVdouble& disTable, // const vector & vNames) { // NJalg nj1; // if (_args_info.constraint_given) { // did we get a constraint tree // setTree(nj1.computeTree(disTable,vNames,_constraintTreePtr)); // } else { // setTree(nj1.computeTree(disTable,vNames)); // } // } void mainSemphy::computeNJtree(bool doBootstrapOneIteration) { distanceBasedMethod_t dtme = homogeneousRatesDTME; // default, vanila NJ if (_args_info.homogeneousRatesDTME_given || _args_info.NJ_given) dtme = homogeneousRatesDTME; else if (_args_info.pairwiseGammaDTME_given) dtme = pairwiseGammaDTME; else if (_args_info.commonAlphaDTME_given) dtme = commonAlphaDTME; else if (_args_info.rate4siteDTME_given) dtme = rate4siteDTME; else if (_args_info.posteriorDTME_given) dtme = posteriorDTME; else if (_args_info.SEMPHY_given) dtme = homogeneousRatesDTME; else errorMsg::reportError("mainSemphy::computeNJtree: An unsuppored DTME was specified"); bool useJcDistance = (_args_info.nucjc_given || _args_info.aaJC_given); if (!_s2tPtr) _s2tPtr = distanceBasedSeqs2TreeFactory(dtme, *_spP, useJcDistance, _args_info.optimizeAlpha_flag, _args_info.ssrv_flag, _args_info.epsilonLikelihoodImprovement4iterNJ_arg, _args_info.epsilonLikelihoodImprovement4pairwiseDistance_arg, _args_info.epsilonLikelihoodImprovement4alphaOptimiz_arg, _args_info.epsilonLikelihoodImprovement4BBL_arg, _args_info.maxNumOfBBLIter_arg); if (!doBootstrapOneIteration) { // No given initial tree if (_etPtr == NULL) { if (_args_info.commonAlphaDTME_given) { if (!_args_info.ssrv_flag) { commonAlphaDistanceSeqs2Tree *caS2tPtr = static_cast(_s2tPtr); if (_args_info.alpha_given) { // use the given alpha setTree(caS2tPtr->seqs2TreeIterative(_sc, _args_info.alpha_arg, _weights, _constraintTreePtr)); } else { // homogeneous rates in first iteration setTree(caS2tPtr->seqs2TreeIterative(_sc, _weights, _constraintTreePtr)); } } else { // Using an SSRV model - run with alpha & nu parameters ssrvDistanceSeqs2Tree *ssrvS2tPtr = static_cast(_s2tPtr); if (_args_info.alpha_given) { // use the given alpha & nu setTree(ssrvS2tPtr->seqs2TreeIterative(_sc, _args_info.alpha_arg, _args_info.nu_arg, _weights, _constraintTreePtr)); } else { // homogeneous rates in first iteration setTree(ssrvS2tPtr->seqs2TreeIterative(_sc, _weights, _constraintTreePtr)); } } } else if (_args_info.posteriorRates_given) { // posteriorDTME with given initial posteriorRates (from input file) posteriorDistanceSeqs2Tree *posteriorS2tPtr = static_cast(_s2tPtr); setTree(posteriorS2tPtr->seqs2TreeIterative(_sc, _args_info.alpha_arg, _posteriorRates, _weights, _constraintTreePtr)); } else { // all other methods setTree(_s2tPtr->seqs2Tree(_sc, _weights, _constraintTreePtr)); } // An initial tree (--tree) was given so pass it to the iterative seqs2Tree method // NOTE: argsConsistencyCheck makes sure that non-interative NJ can't be run with --tree } else { if (!_args_info.ssrv_flag) { iterativeDistanceSeqs2Tree *itS2tPtr = static_cast(_s2tPtr); if (_args_info.alpha_given) { // use the given alpha if (! _args_info.posteriorRates_given) { setTree(itS2tPtr->seqs2TreeIterative(_sc, *_etPtr, _args_info.alpha_arg, _weights, _constraintTreePtr)); } else { // posteriorDTME with given initial posteriorRates (from input file) posteriorDistanceSeqs2Tree *posteriorS2tPtr = static_cast(_s2tPtr); setTree(posteriorS2tPtr->seqs2TreeIterative(_sc, *_etPtr, _args_info.alpha_arg, _posteriorRates, _weights, _constraintTreePtr)); } } else { setTree(itS2tPtr->seqs2TreeIterative(_sc, *_etPtr, _weights, _constraintTreePtr)); } } else { // Using an SSRV model - run with alpha & nu parameters ssrvDistanceSeqs2Tree *ssrvS2tPtr = static_cast(_s2tPtr); if (_args_info.alpha_given) { // use the given alpha & nu if (_args_info.nu_given) setTree(ssrvS2tPtr->seqs2TreeIterative(_sc, *_etPtr, _args_info.alpha_arg, _args_info.nu_arg, _weights, _constraintTreePtr)); else setTree(ssrvS2tPtr->seqs2TreeIterative(_sc, *_etPtr, _args_info.alpha_arg, _weights, _constraintTreePtr)); } else { setTree(ssrvS2tPtr->seqs2TreeIterative(_sc, *_etPtr, _weights, _constraintTreePtr)); } } } // Do one bootstrap iteration // If initial alpha or nu (for using an SSRV model) were given as // commandline input then they were already given to the _s2tPtr (or // internal objects) in its construction } else { if (!_args_info.BPonUserTree_given) { // Running bootstrap for the tree constructed during this run setTree(_s2tPtr->seqs2TreeBootstrap(_sc, _weights, _constraintTreePtr)); } else { // Running bootstrap on a given user tree: // If we use an iterative distance method (commonAlpha or posterior) // then side info must have been given as input too // so we need to pass it as argument if (dtme == commonAlphaDTME) { commonAlphaDistanceSeqs2Tree *caS2tPtr = static_cast(_s2tPtr); setTree(caS2tPtr->seqs2TreeBootstrap(_sc, _args_info.alpha_arg, _weights, _constraintTreePtr)); } else if (posteriorDTME) { posteriorDistanceSeqs2Tree *posteriorS2tPtr = static_cast(_s2tPtr); setTree(posteriorS2tPtr->seqs2TreeBootstrap(_sc, _posteriorRates, _weights, _constraintTreePtr)); } else { setTree(_s2tPtr->seqs2TreeBootstrap(_sc, _weights, _constraintTreePtr)); } } } } void mainSemphy::computeSemphyTree() { if (_etPtr == NULL) computeNJtree(); semphySearchBestTree(_sc,*_etPtr,_constraintTreePtr,*_spP,out(),_numberOfRandomStart, _args_info.optimizeAlpha_flag, _args_info.epsilonLikelihoodImprovement4alphaOptimiz_arg, _args_info.epsilonLikelihoodImprovement4BBL_arg, _args_info.maxNumOfBBLIter_arg); } void mainSemphy::optimizeGlobalRate() { out()<<"we are in void mainSemphy::optimizeGlobalRate()"<(_spP)),_weights, 15,15,0.5,_args_info.epsilonLikelihoodImprovement4alphaOptimiz_arg,_args_info.epsilonLikelihoodImprovement4iterNJ_arg, _args_info.epsilonLikelihoodImprovement4BBL_arg, _args_info.maxNumOfBBLIter_arg); out()<<"# Best alpha (for fixed branch lengths)"<getBestAlpha() <getBestAlpha() <getBestNu() <getBestNu() <getBestL(); delete optimizer; } out()<<"# The likelihood of the tree"<categories()); // getPosteriorOfRatesAndLLPP(*_etPtr, _sc, *_spP, cup, computePijGam cpij; cpij.fillPij(*_etPtr,*_spP); for (int pos=0; pos < _sc.seqLen() ;++pos) { MDOUBLE tmpLL = log(likelihoodComputation::getLofPosAndPosteriorOfRates(pos,*_etPtr,_sc,cpij,*_spP, posPost)); _treeLogLikelihood += tmpLL; _llpp.push_back(tmpLL); _posterior.push_back(posPost); } } void mainSemphy::printLikelihoodAndLikelihoodPerPosition() const { out()<<"# The log likelihood of the tree is:"<categories();++i) LOG(8,<<" "<< _spP->rates(i)); LOG(8,<< endl); LOG(3,<<"# The posterior of the rates is:"<