// $Id: codon.cpp 5981 2009-03-17 14:39:39Z rubi $ #include "codon.h" #include "nucleotide.h" #include "amino.h" #include "logFile.h" #include "definitions.h" #include "someUtil.h" #include "matrixUtils.h" #include "sequenceContainer.h" #include #include #define INITIATION_CODON "i" vector > codonUtility::_trtvDiff; vector > codonUtility::_synNonsynDiff; vector > codonUtility::_nucDiffPlace; vector > codonUtility::_nucsDiff; codon::codon(){ geneticCodeString gcs=geneticCodeHolder::nuclearStandard; init(gcs); } codon::codon(const geneticCodeString& matrixFileString){ init(matrixFileString); } void codon::init(const geneticCodeString& matrixFileString) { readMatrixFromFile(matrixFileString.Val); } void codon::readMatrixFromFile(const string& matrixFileName){ //default value: "nuclearCode.txt" // cout<<"in codon constructor"<>val; if (val.size()==1) { //amino acid if(val == INITIATION_CODON) isInitCodon = true; else{ aa++; strAmino=val; if (strAmino=="*") { _alphabetSize=noOfCodons;} isInitCodon = false; } } else if (val.size()==3 && val[0]!='#'){ //codon, # symbolizes a comment if(isInitCodon){ map ::const_iterator iniItr =_codon2Int.find(val); if(iniItr == _codon2Int.end()) errorMsg::reportError("Initiation codon with undefined index at codon::readMatrixFromFile"); else _initiationIndex2codon[iniItr->second] = val; } else{ _geneticCode[val]=strAmino; _codon2Int[val]=noOfCodons; noOfCodons++; } } else { if (noOfCodons!=64){ string err="in codon::readMatrixFromFile: total number of codons = "+int2string(noOfCodons); errorMsg::reportError(err); } return; } } } codon& codon::operator=(const codon& other) { _geneticCode = other._geneticCode; //key - codon, value - amino acid _codon2Int = other._codon2Int;//key string of codon int= integer value of codon _alphabetSize = other._alphabetSize; _initiationIndex2codon = other._initiationIndex2codon; return *this; } // codon::codon(const codon& other): // _geneticCode(other._geneticCode), //key - codon, value - amino acid // _codon2Int(other._codon2Int),//key string of codon int= integer value of codon // _alphabetSize(other._alphabetSize){} //return -99 if not succeeds. int codon::fromChar(const string& s, const int pos) const { if (s.size() <= pos+2) { //errorMsg::reportError("Trying to read a codon pass the end of the string. The number of nucleotide may not be divisible by three"); string textToPrint("Trying to read a codon pass the end of the string. The number of nucleotide may not be divisible by three"); LOG(1,<4) || (p2 >4) || (p3 >4)) return unknown(); //unknown. string strCodon=""; //change U --> T if (p1==4) strCodon+="T"; else strCodon+=toupper(s[pos]); if (p2==4) strCodon+="T"; else strCodon+=toupper(s[pos+1]); if (p3==4) strCodon+="T"; else strCodon+=toupper(s[pos+2]); //const string strCodon = s.substr(pos,3); map tmpMap=_codon2Int; map ::iterator it1; it1=tmpMap.find(strCodon); if (it1==tmpMap.end()){ string err="error in codon::fromChar cannot find codon "+strCodon; errorMsg::reportError(err); } return tmpMap[strCodon]; } vector codon::fromString(const string &str) const { vector vec; if (str.size()%3!=0) { errorMsg::reportError("error in function codon::fromString. String length should be a multiplication of 3"); } for (int i=0;i tmpMap = _codon2Int; map ::iterator it=tmpMap.begin(); while (it!=tmpMap.end()){ if ((*it).second==in_id){ return (*it).first; } it++; } string err="error in function codon::fromInt: no codon found for the integer"; errorMsg::reportError(err); return (string("we should never get here - the reportError above will exit")); } codonUtility::replacementType codonUtility::codonReplacement(const int c1, const int c2, const codon &cod){ if (c1 == c2) return codonUtility::sameCodon; else if (codonUtility::aaOf(c1,cod) == codonUtility::aaOf(c2,cod)) return codonUtility::synonymous; return codonUtility::non_synonymous; } int codonUtility::aaOf(const int c1, const codon &cod){ amino a; if (c1==cod.gap()) return a.gap(); if (c1==cod.unknown()) return a.unknown(); string strCodon=cod.fromInt(c1); map geneticCode=cod.geneticCode(); map ::iterator pos; if ((pos=geneticCode.find(strCodon)) == geneticCode.end()){ string err="error in codonUtility::aaOf: cannot find codon "+strCodon; errorMsg::reportError(err); } if (pos->second.size() > 1){ errorMsg::reportError("error in codonUtility::aaOf: amino acid 1 letter code > 1"); } return a.fromChar(*pos->second.c_str()); } codonUtility::diffType codonUtility::codonDiff(const int c1, const int c2, const codon &cod){ if (c1==c2) return codonUtility::equal; nucleotide n; string s1 = cod.fromInt(c1); string s2 = cod.fromInt(c2); int pos1 = n.fromChar(s1[0])+n.fromChar(s2[0]); int pos2 = n.fromChar(s1[1])+n.fromChar(s2[1]); int pos3 = n.fromChar(s1[2])+n.fromChar(s2[2]); if (s1[0]!=s2[0] && s1[1]!=s2[1] && s1[2]!=s2[2]) return codonUtility::threesub; if (s1[0]==s2[0] && s1[1]==s2[1] && s1[2]!=s2[2]) { if (pos3%2==0) return codonUtility::tr; else return codonUtility::tv; } if (s1[1]==s2[1] && s1[2]==s2[2] && s1[0]!=s2[0]) { if (pos1%2==0) return codonUtility::tr; else return codonUtility::tv; } if (s1[0]==s2[0] && s1[2]==s2[2] && s1[1]!=s2[1]) { if (pos2%2==0) return codonUtility::tr; else return codonUtility::tv; } if (s1[0]==s2[0] && pos2%2==0 && pos3%2==0) return codonUtility::twoTrs; if (s1[1]==s2[1] && pos1%2==0 && pos3%2==0) return codonUtility::twoTrs; if (s1[2]==s2[2] && pos1%2==0 && pos2%2==0) return codonUtility::twoTrs; if (s1[0]==s2[0] && pos2%2!=0 && pos3%2!=0) return codonUtility::twoTvs; if (s1[1]==s2[1] && pos1%2!=0 && pos3%2!=0) return codonUtility::twoTvs; if (s1[2]==s2[2] && pos1%2!=0 && pos2%2!=0) return codonUtility::twoTvs; return codonUtility::trtv; } //return the place (0, 1, or 2) that the two codons are different //and the identity of the different nucleotide in the target codon. //For example, nucDiffPlace(ATG, ACG) retruns C2 codonUtility::nucDiffPlaceType codonUtility::nucDiffPlace(const int fromCodon, const int targetCodon, const codon &cod){ if (fromCodon == targetCodon) return codonUtility::EQUAL; codonUtility::nucDiffPlaceType res = A1; nucleotide nuc; string s1 = cod.fromInt(fromCodon); string s2 = cod.fromInt(targetCodon); int diffNum = 0; if (s1[0] != s2[0]){ ++diffNum; switch (s2[0]) { case 'A': res = A1; break; case 'C': res = C1; break; case 'G': res = G1; break; case 'T': res = T1; break; default: errorMsg::reportError("error in codonUtility::nucDiffPlace."); break; } } if (s1[1] != s2[1]){ ++diffNum; switch (s2[1]) { case 'A': res = A2; break; case 'C': res = C2; break; case 'G': res = G2; break; case 'T': res = T2; break; default: errorMsg::reportError("error in codonUtility::nucDiffPlace."); break; } } if (s1[2] != s2[2]){ ++diffNum; switch (s2[2]) { case 'A': res = A3; break; case 'C': res = C3; break; case 'G': res = G3; break; case 'T': res = T3; break; default: errorMsg::reportError("error in codonUtility::nucDiffPlace."); break; } } if (diffNum == 0) errorMsg::reportError("error in codonUtility::nucDiffPlace. Can't find different nucleotide"); if (diffNum > 1) res = MUL_SUB; return res; } //return the different nucleotides between the fron and target codons. //For example, nucsPlace(ATG, ACG) retruns TC codonUtility::nucsDiffType codonUtility::nucsDiff(const int fromCodon, const int targetCodon, const codon &cod){ if (fromCodon == targetCodon) return codonUtility::SAME; codonUtility::nucsDiffType res = AC; nucleotide nuc; string s1 = cod.fromInt(fromCodon); string s2 = cod.fromInt(targetCodon); int diffNum = 0; int from = 0; int to = 0; if (s1[0] != s2[0]) { ++diffNum; from = s1[0]; to = s2[0]; } if (s1[1] != s2[1]) { ++diffNum; from = s1[1]; to = s2[1]; } if (s1[2] != s2[2]) { ++diffNum; from = s1[2]; to = s2[2]; } switch(from) { case 'A': switch(to) { case 'G':res = AG;break; case 'T':res = AT;break; case 'C':res = AC;break; default: errorMsg::reportError("error in codonUtility::nucsDiff."); break; } break; case 'G': switch(to) { case 'A':res = AG;break; case 'T':res = GT;break; case 'C':res = CG;break; default: errorMsg::reportError("error in codonUtility::nucsDiff."); break; } break; case 'C': switch(to) { case 'G':res = CG;break; case 'T':res = CT;break; case 'A':res = AC;break; default: errorMsg::reportError("error in codonUtility::nucsDiff."); break; } break; case 'T': switch(to) { case 'G':res = GT;break; case 'A':res = AT;break; case 'C':res = CT;break; default: errorMsg::reportError("error in codonUtility::nucsDiff."); break; } break; default: errorMsg::reportError("error in codonUtility::nucsDiff."); break; } if (diffNum == 0) errorMsg::reportError("error in codonUtility::nucsDiff. Can't find different nucleotide"); if (diffNum > 1) res = DIFF; return res; } void codonUtility::initSubMatrices(const codon& cod){ if ((_trtvDiff.size() == cod.size()) && (_synNonsynDiff.size() == cod.size()) && (_nucDiffPlace.size() == cod.size()) && (_nucsDiff.size() == cod.size())) return; _trtvDiff.resize(cod.size()); _synNonsynDiff.resize(cod.size()); _nucDiffPlace.resize(cod.size()); _nucsDiff.resize(cod.size()); for (int i = 0; i < _trtvDiff.size(); ++i) { _trtvDiff[i].resize(cod.size()); _synNonsynDiff[i].resize(cod.size()); _nucDiffPlace[i].resize(cod.size()); _nucsDiff[i].resize(cod.size()); } //resizeMatrix(_trtvDiff, cod.size(), cod.size()); //resizeMatrix(_synNonsynDiff, cod.size(), cod.size()); //resizeMatrix(_nucDiffPlace, cod.size(), cod.size()); for (int i = 0; i < cod.size(); ++i){ for (int j =0; j <= i; ++j){ _trtvDiff[i][j] = _trtvDiff[j][i] = codonDiff(i, j, cod); _synNonsynDiff[i][j] = _synNonsynDiff[j][i] = codonReplacement(i, j, cod); _nucDiffPlace[i][j] = nucDiffPlace(i, j, cod); _nucDiffPlace[j][i] = nucDiffPlace(j, i, cod); _nucsDiff[i][j] = nucsDiff(i,j,cod); _nucsDiff[j][i] = nucsDiff(j,i,cod); } } } //returns the number (codonCounter) and frequency (codonUsage) of each codon in the sequnece container void codonUtility::getCodonUsage(const sequenceContainer& sc, Vint& codonCounter, Vdouble& codonUsage) { if (sc.getAlphabet()->size() != 61) errorMsg::reportError("cannot calculate codon usage when alphabet is not codon"); codonCounter.resize(61, 0); codonUsage.resize(61, 0.0); codon alph; int sum = 0; for (int s = 0; s < sc.numberOfSeqs();++s) { int id = sc.placeToId(s); for (int pos = 0; pos < sc.seqLen(); ++pos) { int cod = sc[id][pos]; if (alph.isSpecific(cod)) { ++sum; ++codonCounter[cod]; } } } for (int c = 0; c < codonCounter.size(); ++c) codonUsage[c] = static_cast(codonCounter[c]) / sum; } //in codonUsageFile: only 3-letter-codon and frequency seperated by "\t" void codonUtility::readCodonUsage(const string& codonUsageFileName, Vdouble& codonUsage,const codon &alph) { codonUsage.resize(alph.size(), 0.0); ifstream inFile(codonUsageFileName.c_str()); vector inFileData; putFileIntoVectorStringArray(inFile, inFileData); inFile.close(); if (inFileData.empty()){ errorMsg::reportError("unable to open file, or file is empty in codonUtility::readCodonUsage"); } vector::const_iterator it = inFileData.begin(); for (; it!= inFileData.end(); ++it) { if (it->empty()) //empty line continue; int endCodon = it->find_first_of("\t", 0); int startFreq = it->find_first_not_of("\t ", endCodon); if (startFreq>0) { string codonStr = it->substr(0, endCodon); string freqStr = it->substr(startFreq); MDOUBLE freq = string2double(freqStr); if(freq == 0.0) freq = EPSILON; codonUsage[alph.fromChar(codonStr, 0)] = freq; } } } //calculates the CAI for the whole MSA and for each position. //The calculation is based on a pre-calculated codonUsage vector. //The calculation is based on Sharp & Li (1987) NAR, 15:1281-1295 MDOUBLE codonUtility::calcCodonAdaptationIndex(const sequenceContainer& sc, const Vdouble& codonUsage, Vdouble& cai4site) { //the returned value: calculated as the average CAI for the MSA, rather than the geometrical mean as in Sharp & Li MDOUBLE wholeAlignmentCai = 0.0; codon alph; amino am; //1. calculate Wk = the frequency of codon k relative to the frequency of the optimal codon for that amino acid. Vdouble Wk(codonUsage.size(), 0.0); int aaId; for (aaId = 0; aaId < am.size(); ++aaId) { Vint codonsOfAa = aminoUtility::codonOf(aaId, alph); //finding the most frequent codon for this aa MDOUBLE mostFrequent = 0.0; Vint::const_iterator iter; for (iter = codonsOfAa.begin(); iter != codonsOfAa.end(); ++iter) { if (codonUsage[*iter] > mostFrequent) mostFrequent = codonUsage[*iter]; } //calculating Wk for (iter = codonsOfAa.begin(); iter != codonsOfAa.end(); ++iter) Wk[*iter] = codonUsage[*iter] / mostFrequent; } //2. calculate CAI cai4site.resize(sc.seqLen(), 0.0); int pos; for (pos = 0; pos < sc.seqLen(); ++pos) { MDOUBLE cai = 0.0; int informativeCodons = 0; for (int s = 0; s < sc.numberOfSeqs();++s) { int id = sc.placeToId(s); int cod = sc[id][pos]; if(!alph.isSpecific(cod)) continue; cai += Wk[cod]; ++informativeCodons; } cai /= static_cast(informativeCodons); cai4site[pos] = cai; wholeAlignmentCai += cai; } return wholeAlignmentCai; } bool codon::isStopCodon(const int in_id) const { if (in_id == unknown()) return false; if (in_id == gap()) return false; if ((in_id >= 0 ) && (in_id < _alphabetSize)) return false; return true; } bool codon::isInitiationCodon(const int in_id) const { bool result = true; map ::const_iterator itr = _initiationIndex2codon.find(in_id); if(itr == _initiationIndex2codon.end()){ result = false; } return result; }