// $Id: someUtil.cpp 15479 2016-10-10 16:25:21Z elilevy $ #include "someUtil.h" #include "errorMsg.h" #include "talRandom.h" #include #include #include #include #include #include #include using namespace std; // for the _mkdir call #if defined(WIN32) || defined(SunOS) || defined(solaris) #include #else #include #include // #include #endif //swap between the 4 variables such that the first becomes the second, second becomes the third and third becomes the fourth. //used in functoin mnbrack below. void shift3(MDOUBLE &a, MDOUBLE &b, MDOUBLE &c, const MDOUBLE d) { a=b; b=c; c=d; } MDOUBLE computeAverage(const vector& vec) { MDOUBLE sum=0.0; for (int i=0; i < vec.size(); ++i) { sum+=static_cast(vec[i]); } return sum/static_cast(vec.size()); } // X ~ Poisson(lamda) --> P(X=k) = ((lamda^k)/k!) * e^(-lamda) // It isn't smart to first calculate factorial(k) because the size of long int limits this calculation to k<=13 MDOUBLE copmutePoissonProbability(const int& k, const long double& lamda) { assert(k>=0); long double tmp = pow(lamda,k); // tmp = (lamda^k)/k! for (int i=2; i<=k; ++i) tmp/=i; return (tmp * exp(-lamda)); } MDOUBLE computeAverage(const vector& vec, const Vdouble* weightsV) { MDOUBLE sum=0.0; if(weightsV && !(weightsV->size() == vec.size() )) errorMsg::reportError("Using computeAverage with weights, where the number of weights not equal values"); for (int i=0; i < vec.size(); ++i){ if(weightsV) sum+=vec[i]* (*weightsV)[i]; else sum+=vec[i]; } return sum/static_cast(vec.size()); } MDOUBLE computeAverageOfAbs(const vector& vec, const Vdouble* weightsV) { MDOUBLE sum=0.0; if(weightsV && !(weightsV->size() == vec.size() )) errorMsg::reportError("Using computeAverage with weights, where the number of weights not equal values"); for (int i=0; i < vec.size(); ++i){ if(weightsV) sum+=abs(vec[i]* (*weightsV)[i]); else sum+=abs(vec[i]); } return sum/static_cast(vec.size()); } MDOUBLE computeMedian(const vector& vec) { int vecSize = vec.size(); if (vecSize<1) return 0; vector< vecElem > sortVec(vecSize); for (int x =0; x < vecSize ; ++x) { sortVec[x].setValue(vec[x]); sortVec[x].setPlace(x); } sort(sortVec.begin(), sortVec.end()); sort(sortVec.begin(), sortVec.end()); int highMedianIndex; if(vecSize>1) highMedianIndex = int((vecSize+1)/2); else highMedianIndex = int((vecSize)/2); // thus, highMedianIndex==0 MDOUBLE median = sortVec[highMedianIndex].getValue(); return median; } //// if quantile=0.5, the median is returned, if quantile=0.1, the low-ton-percentile is returned, quantile=0.9, the top-90-percentile is returned MDOUBLE computeQuantileFrac(const vector& vec, MDOUBLE quantile) { int vecSize = vec.size(); vector< vecElem > sortVec(vecSize); for (int x =0; x < vecSize ; ++x) { sortVec[x].setValue(vec[x]); sortVec[x].setPlace(x); } sort(sortVec.begin(), sortVec.end()); sort(sortVec.begin(), sortVec.end()); int qIndex = int((vecSize+1)*quantile); MDOUBLE quantileVal = sortVec[qIndex].getValue(); return quantileVal; } //// if quantile=2, the median is returned, if quantile=10, the ten-percentile is returned MDOUBLE computeQuantile(const vector& vec, MDOUBLE quantile) { MDOUBLE dividerForRank = 1+ 1.0/(quantile-1); int vecSize = vec.size(); vector< vecElem > sortVec(vecSize); for (int x =0; x < vecSize ; ++x) { sortVec[x].setValue(vec[x]); sortVec[x].setPlace(x); } sort(sortVec.begin(), sortVec.end()); sort(sortVec.begin(), sortVec.end()); int qIndex = int((vecSize+1)/dividerForRank); MDOUBLE quantileVal = sortVec[qIndex].getValue(); return quantileVal; } MDOUBLE computeStd(const vector& vec) {// page 60, Sokal and Rohlf MDOUBLE sum=0.0; MDOUBLE sumSqr=0.0; MDOUBLE vecSize = static_cast(vec.size()); for (int i=0; i < vec.size(); ++i) { sum+=static_cast(vec[i]); sumSqr+=(static_cast(vec[i])*static_cast(vec[i])); } MDOUBLE res= sumSqr-(sum*sum/vecSize); res /= (vecSize-1.0); res = sqrt(res); return res; } MDOUBLE computeStd(const vector& vec) {// page 60, Sokal and Rohlf MDOUBLE sum=0.0; MDOUBLE sumSqr=0.0; MDOUBLE vecSize = static_cast(vec.size()); for (int i=0; i < vec.size(); ++i) { sum+=vec[i]; sumSqr+=(vec[i]*vec[i]); } MDOUBLE res= sumSqr-(sum*sum/vecSize); res /= (vecSize-1.0); res = sqrt(res); return res; } void computeRelativeFreqsFollowingOneChanged(MDOUBLE newValFreq, int indexNewFreq,Vdouble &freqs){ MDOUBLE proportionAfterOptimization = 1.0 - newValFreq; MDOUBLE proportionBeforeOptimization = 1.0 - freqs[indexNewFreq]; MDOUBLE sum = 0.0; for (int i=0; i(x_abs); double theRemainingPart = fabs(x_abs-theIntegerPart); int integerRepresentingTheRemainingPart = static_cast(theRemainingPart*pow(10.0,lenght)); if (round) { integerRepresentingTheRemainingPart = static_cast(theRemainingPart*pow(10.0,lenght)+0.5); if (integerRepresentingTheRemainingPart == pow(10.0,lenght)) { integerRepresentingTheRemainingPart = 0; theIntegerPart++; } } string part1 = int2string(theIntegerPart); string part2 = int2string(integerRepresentingTheRemainingPart); while (part2.length()0 && result[i]=='0'){ result.erase(i); i--; } // removing "." if this is the last character in the string. if (result[result.length()-1]=='.') result.erase(result.length()-1); return result; } string int2string(const int num) { // the input to this program is say 56 // the output is the string "56" // this version of int2string is more portable // than sprintf like functions from c; // or sstream of stl. if (num == 0) return "0"; string res; int i = abs(num); int leftover; char k; while (i) { leftover = i%10; k = '0'+leftover; res = k+res; i/=10; } if (num<0) res = "-" + res; return res; }; void printTime(ostream& out) { time_t ltime; time( <ime ); out<<"# the date is "<< ctime( <ime )< &inseqFile){ inseqFile.clear(); string tmp1; while (getline(infile,tmp1, '\n' ) ) { if (tmp1.empty()) continue; if (tmp1.size() > 100000) { // was 15000 vector err; err.push_back("Unable to read file. It is required that each line is no longer than"); err.push_back("15000 characters. "); errorMsg::reportError(err,1); } if (tmp1[tmp1.size()-1]=='\r') {// in case we are reading a dos file tmp1.erase(tmp1.size()-1); }// remove the traling carrige-return inseqFile.push_back(tmp1); } } bool fromStringIterToInt(string::const_iterator & it, // ref must be here const string::const_iterator endOfString, int& res) {// the ref is so that we can use the it after the func. while (it != endOfString) { if ((*it == ' ') || (*it == '\t')) ++it;else break; // skeeping white spaces. } if (it != endOfString) { if (isdigit(*it) || (*it == '-')){ int k = atoi(&*it); if (*it == '-') ++it; for (int numDig = abs(k); numDig>0; numDig/=10) ++it; res = k; return true; } else return false; //unable to read int From String } return false; //unable to read int From String } string* searchStringInFile(const string& string2find, const int index, const string& inFileName) { ifstream f; f.open(inFileName.c_str()); if (!f.good()) { string tmp = "Unable to open file name: "+inFileName+" in function searchStringInFile"; errorMsg::reportError(tmp); } string numm = int2string(index); string realString2find = string2find+numm; istream_iterator is_string(f); istream_iterator end_of_stream; is_string = find(is_string,end_of_stream,realString2find); if(is_string == end_of_stream) {f.close();return NULL;} else { is_string++; if(is_string == end_of_stream) {f.close();return NULL;}; string* s = new string(*is_string); f.close(); return s; } f.close(); return NULL; } string* searchStringInFile(const string& string2find, const string& inFileName) {// return the string that is AFTER the string to search. ifstream f; f.open(inFileName.c_str()); if (!f.good()) { string tmp = "Unable to open file name: "+inFileName+" in function searchStringInFile"; errorMsg::reportError(tmp); } string realString2find = string2find; istream_iterator is_string(f); istream_iterator end_of_stream; is_string = find(is_string,end_of_stream,realString2find); if(is_string == end_of_stream) {f.close();return NULL;} else { is_string++; if(is_string == end_of_stream) {f.close();return NULL;}; string* s = new string(*is_string); f.close(); return s; } f.close(); return NULL; } bool doesWordExistInFile(const string& string2find,const string& inFileName) { ifstream f; f.open(inFileName.c_str()); if (!f.good()) { string tmp = "Unable to open file name: "+inFileName+" in function searchStringInFile"; errorMsg::reportError(tmp); } istream_iterator is_string(f); istream_iterator end_of_stream; is_string = find(is_string,end_of_stream,string2find); if(is_string == end_of_stream) return false; else return true; } string takeCharOutOfString(const string& charsToTakeOut, const string& fromString) { string finalString; for (int i=0; i x2) || DEQUAL(x1, x2,epsilon)); } bool DSMALL_EQUAL(const MDOUBLE x1, const MDOUBLE x2, MDOUBLE epsilon/*1.192092896e-07F*/){ return ((x1 < x2) || DEQUAL(x1, x2,epsilon)); } void createDir(const string & curDir, const string & dirName){// COPYRIGHT OF ITAY MAYROSE. string newDir; if (curDir == "") newDir = dirName; else newDir = curDir + string("/") + dirName; #ifdef WIN32 if( _mkdir(newDir.c_str()) == 0 ){ LOG(5, << "Directory " < 0.001) errorMsg::reportError(" problem - scalled average is not avgIn after scalling!!!"); return scaleFactor; } //calculates the mean square error distance between 2 vectors: MDOUBLE calcMSEDistBetweenVectors(const Vdouble& oneRatesVec, const Vdouble& otherRatesVec) { MDOUBLE res = 0.0; if (oneRatesVec.size() != otherRatesVec.size()) errorMsg::reportError("the two vectors to be compared are not the same size in function SimulateRates::calcDistBetweenRatesVectors()"); for (int i=0; i 0: if trueValues[i] < threshhold then do not add the rse for this psition to the result MDOUBLE calcRelativeMSEDistBetweenVectors(const Vdouble& trueValues, const Vdouble& inferredValues, const MDOUBLE threshhold/*0.0*/ ) { MDOUBLE res = 0.0; if (inferredValues.size() != trueValues.size()) errorMsg::reportError("the two vectors to be compared are not the same size in function SimulateRates::calcDistBetweenRatesVectors()"); int counter = 0; for (int i=0; i(seqLength), 2.0) -1)); //n^3 -n MDOUBLE numerator = 1.0 - ((6/en3n) * (sum_diff_sqr + (s_one + s_two)/12.0)); MDOUBLE denum = sqrt((1.0 - s_one/en3n) * (1.0 - s_two/en3n)); res = numerator/ denum; return res; } /******************************************************************************************** calculates the spearman rank correlation value, Ofir implementation *********************************************************************************************/ MDOUBLE calcRankCorrelation2(const Vdouble& oneRatesVec, const Vdouble& otherRatesVec) { int vecLen = oneRatesVec.size(); if(vecLen != otherRatesVec.size()) errorMsg::reportError("calcRankCorrelation2. Vectors length differ"); Vdouble orderVec1, orderVec2; orderRankNoTies(oneRatesVec, orderVec1); orderRankNoTies(otherRatesVec, orderVec2); MDOUBLE n = (double)vecLen; MDOUBLE dif,r,sum_dif=0; for (int i=0; i > sortVec(vecSize); // for (int x =0; x < vecSize ; ++x) // { // sortVec[x].setValue(vecIn[x]); // sortVec[x].setPlace(x); // } // sort(sortVec.begin(), sortVec.end()); // // //check for ties and correct their rank // Vdouble rankVec(vecSize); // MDOUBLE rank; // for (int i=0; i < vecSize; ) // { // if (sortVec[i].getValue() != sortVec[i+1].getValue()) // {//no tie // rankVec[i] = i; // ++i; // } // else // {//tie // int to =0; // for (to = i+1; (to<=vecSize) && (sortVec[i].getValue() == sortVec[to].getValue());++to) // ;//check how far the tie goes // to--; // rank = 0.5*(to + i); // for (int ji = i; ji<= to; ji++) // { // rankVec[ji] = rank; // } // // i = to+1; // } // } // for (int j =0; j < vecSize; ++j) { // assert ((rankVec[j] >= 0) && (rankVec[j] < vecSize)); // orderVecOut[sortVec[j].getPlace()] = rankVec[j]; // } // return orderVecOut; //} //orderVec - determine the relative order of vecIn //orderVecOut[i] is the rank of vecIn[i] //note that in case of ties the rank will be the midrank of the tied group //return sum of n^3 - n where n is the number of elements in each tied group - see spearman rank correlation MDOUBLE orderVec(const vector& vecIn, vector& orderVecOut) { int vecSize = vecIn.size(); orderVecOut.resize(vecSize); vector< vecElem > sortVec(vecSize); for (int x =0; x < vecSize ; ++x) { sortVec[x].setValue(vecIn[x]); sortVec[x].setPlace(x); } sort(sortVec.begin(), sortVec.end()); //check for ties and correct their rank Vdouble rankVec(vecSize); MDOUBLE sumRankDif = 0; //sum(Fk^3 - Fk) MDOUBLE rank; for (int i=0; i < vecSize-1; ) // loop was till vecSize, out of range with sortVec[i+1]. Fixed (?) { if (sortVec[i].getValue() != sortVec[i+1].getValue()) {//no tie rankVec[i] = i; ++i; } else {//tie int to =0; for (to = i+1; (to<=vecSize) && (sortVec[i].getValue() == sortVec[to].getValue());++to) ;//check how far the tie goes to--; rank = 0.5*(to + i); for (int ji = i; ji<= to; ji++) { rankVec[ji] = rank; } int numTies = to - i +1; //number o fties in this group sumRankDif += numTies*numTies*numTies - numTies; i = to+1; } } for (int j =0; j < vecSize; ++j) { assert ((rankVec[j] >= 0) && (rankVec[j] < vecSize)); orderVecOut[sortVec[j].getPlace()] = rankVec[j]; } return sumRankDif; } void orderRankNoTies(const vector& vecIn, vector& orderVecOut) { int vecSize = vecIn.size(); orderVecOut.resize(vecSize); vector< vecElem > sortVec(vecSize); for (int x =0; x < vecSize ; ++x) { sortVec[x].setValue(vecIn[x]); sortVec[x].setPlace(x); } sort(sortVec.begin(), sortVec.end()); for (int j =0; j < vecSize; ++j) { orderVecOut[sortVec[j].getPlace()] = j; } } void orderVec(const Vdouble& vecIn, vector< vecElem >& orderVecOut) { int vecSize = vecIn.size(); orderVecOut.resize(vecSize); for (int x =0; x < vecSize ; ++x) { orderVecOut[x].setValue(vecIn[x]); orderVecOut[x].setPlace(x); } sort(orderVecOut.begin(), orderVecOut.end()); } void splitString2(string str, string seperater, string &first, string &second) { int i = (int)str.find(seperater); //find seperator if(i != -1) { int y = 0; if(!str.empty()) { while(y != i) { first += str[y++]; //creating first string } y = y+(int)seperater.length(); //jumping forward seperater length while(y != str.length()) { second += str[y++]; //creating second string } } } else { first = str; second = "NULL"; //if seperator is not there then second string == null } } void splitString(const string& str,vector& subStrs,const string& delimiter) { // Skip delimiter at beginning. string::size_type lastPos = str.find_first_not_of(delimiter,0); // Find first "non-delimiter". string::size_type pos = str.find_first_of(delimiter,lastPos); while (string::npos != pos || string::npos != lastPos) { // Found a subStr, add it to the vector. subStrs.push_back(str.substr(lastPos,pos - lastPos)); // Skip delimiter. Note the "not_of" lastPos = str.find_first_not_of(delimiter,pos); // Find next "non-delimiter" pos = str.find_first_of(delimiter,lastPos); } } Vint getVintFromStr(const string& inStr) { Vint res; vector outStr; splitString(inStr, outStr, ","); for (int i = 0; i < outStr.size(); ++i) { int x = atoi(outStr[i].c_str()); res.push_back(x); } return res; } string getStrFromVint(const Vint& inVec) { string res(""); for (int i = 0; i < inVec.size(); ++i) { if (i > 0) res += ","; res += int2string(inVec[i]); } return res; } /******************************************************************************************** gainLoss project *********************************************************************************************/ int fromIndex2gainIndex(const int i, const int gainCategories, const int lossCategories){ int gainIndex; if(lossCategories<=gainCategories){ gainIndex = (int)floor((double)i/(lossCategories) ); } else{ gainIndex = i%(gainCategories); } return gainIndex; } int fromIndex2lossIndex(const int i, const int gainCategories, const int lossCategories){ int lossIndex; if(lossCategories<=gainCategories){ lossIndex = i%(lossCategories); } else{ lossIndex = (int)floor((double)i/(gainCategories) ); } return lossIndex; } int giveRandomState(const int alphabetSize, const int beginningState, const VVdouble &changeProbabilities) { for (int loop = 0 ; loop < 100000 ; ++loop) { MDOUBLE theRandNum = talRandom::giveRandomNumberBetweenZeroAndEntry(1.0); MDOUBLE sum = 0.0; for (int state = 0; state < alphabetSize; ++state) { sum += changeProbabilities[beginningState][state]; if (theRandNum < sum) { return state; } } } errorMsg::reportError("giveRandomState: could not give random character. The reason is unknown."); return 1; } int giveRandomState(const int alphabetSize, const Vdouble &frequencies) { for (int loop =0 ;loop<100000 ;loop++) { MDOUBLE theRandNum = talRandom::giveRandomNumberBetweenZeroAndEntry(0.999); MDOUBLE sum = 0.0; for (int j=0; j < alphabetSize;++j) { sum+=frequencies[j]; if (theRandNum0) return 1; else return -1; } MDOUBLE factorial(int x) { MDOUBLE fac = 1; for (int i=2; i<=x; i++) fac *= i; return fac; } MDOUBLE BinomialCoeff(int a, int b) { return factorial(a)/(factorial(b)*factorial(a-b)); } MDOUBLE exponentResolver(Vdouble& valuesVec){ //First find largest element in valuesVec MDOUBLE largest = VERYSMALL; int largestIndex = -1; for(int i = 0;i < valuesVec.size();++i){ if(valuesVec[i] > largest){ largest = valuesVec[i]; largestIndex = i; } } if(largestIndex == -1){ errorMsg::reportError("exponentResolver: Could not find the largest element in the input vector"); return 1; } //Now sum over all elements that are greater than -50. Note that exp(-50) is way smaller than the famous EPSILON so we are pretty safe from neglecting anything significant MDOUBLE sum = 1.0; MDOUBLE cutoff = -50; for(int i = 0;i < valuesVec.size();++i){ if(i == largestIndex) continue; if((valuesVec[i]-largest) < cutoff) continue; sum += exp(valuesVec[i]-largest); } MDOUBLE result = largest+log(sum); return(result); } MDOUBLE sumVdouble(const Vdouble & vec){ MDOUBLE sum = 0.0; for(int i = 0;i < vec.size();++i){ sum += vec[i]; } return(sum); }