mirror of
http://43.156.76.180:8026/YuuMJ/EukPhylo.git
synced 2025-12-28 10:10:25 +08:00
3379 lines
124 KiB
C++
3379 lines
124 KiB
C++
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** *****
|
|
***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** *****
|
|
|
|
trimAl v1.4: a tool for automated alignment trimming in large-scale
|
|
phylogenetics analyses.
|
|
|
|
readAl v1.4: a tool for automated alignment conversion among different
|
|
formats.
|
|
|
|
statAl v1.4: a tool for getting stats about multiple sequence alignments.
|
|
|
|
|
|
2009-2015 Capella-Gutierrez S. and Gabaldon, T.
|
|
[scapella, tgabaldon]@crg.es
|
|
|
|
This file is part of trimAl/readAl.
|
|
|
|
trimAl/readAl are free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation, the last available version.
|
|
|
|
trimAl/readAl are distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with trimAl/readAl. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** *****
|
|
***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
using namespace std;
|
|
|
|
#include <float.h>
|
|
#include "alignment.h"
|
|
#include "rwAlignment.cpp"
|
|
#include "autAlignment.cpp"
|
|
|
|
#include <deque>
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Class constructor */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
alignment::alignment(void) {
|
|
|
|
/* Alignment parameter */
|
|
sequenNumber = 0;
|
|
residNumber = 0;
|
|
|
|
/* Are the input sequences aligned? */
|
|
isAligned = false;
|
|
|
|
/* Should the output file be reversed? */
|
|
reverse = false;
|
|
|
|
/* Should be trimmed only terminal gaps? - set automated and manual boundaries
|
|
* values */
|
|
terminalGapOnly = false;
|
|
left_boundary = -1;
|
|
right_boundary = -1;
|
|
|
|
/* Input and output formats */
|
|
iformat = 0;
|
|
oformat = 0;
|
|
shortNames = false;
|
|
|
|
forceCaps = false;
|
|
upperCase = false;
|
|
lowerCase = false;
|
|
|
|
/* Indicate whether sequences composed only by gaps should be kept or not */
|
|
keepSequences = false;
|
|
|
|
/* Indicate whether original header, they may include non-alphanumerical
|
|
* characters, should be dumped into output stream without any preprocessing
|
|
* step */
|
|
keepHeader = false;
|
|
|
|
gapSymbol = "-";
|
|
|
|
/* Sequence datatype: DNA, RNA or Protein */
|
|
dataType = 0;
|
|
|
|
/* Window sizes to trim the input alignment */
|
|
ghWindow = 0;
|
|
shWindow = 0;
|
|
|
|
/* Minimum block size in the new alignment */
|
|
blockSize = 0;
|
|
|
|
/* Is this alignmnet new? */
|
|
oldAlignment = false;
|
|
|
|
/* Sequence residues number */
|
|
residuesNumber = NULL;
|
|
|
|
/* Columns and sequences that have been previously selected */
|
|
saveResidues = NULL;
|
|
saveSequences = NULL;
|
|
|
|
/* Input sequences as well other information such as sequences name, etc */
|
|
sequences = NULL;
|
|
seqsName = NULL;
|
|
seqsInfo = NULL;
|
|
|
|
/* Information about input alignment */
|
|
filename = "";
|
|
aligInfo = "";
|
|
|
|
/* Information computed from alignment */
|
|
sgaps = NULL;
|
|
scons = NULL;
|
|
seqMatrix = NULL;
|
|
|
|
identities = NULL;
|
|
overlaps = NULL;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Class constructor */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
alignment::alignment(string o_filename, string o_aligInfo, string *o_sequences, string *o_seqsName,
|
|
string *o_seqsInfo, int o_sequenNumber, int o_residNumber, int o_iformat, int o_oformat,
|
|
bool o_shortNames, int o_dataType, int o_isAligned, bool o_reverse, bool o_terminalGapOnly,
|
|
int o_left_boundary, int o_right_boundary,
|
|
bool o_keepSeqs, bool o_keepHeader, int OldSequences, int OldResidues, int *o_residuesNumber,
|
|
int *o_saveResidues, int *o_saveSequences, int o_ghWindow, int o_shWindow, int o_blockSize,
|
|
float **o_identities, float **o_overlaps) {
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
int i, j, k, ll;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
oldAlignment = true;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Assign the parameter values to the variables */
|
|
sequenNumber = o_sequenNumber;
|
|
residNumber = o_residNumber;
|
|
|
|
iformat = o_iformat;
|
|
oformat = o_oformat;
|
|
shortNames = o_shortNames;
|
|
|
|
dataType = o_dataType;
|
|
|
|
ghWindow = o_ghWindow;
|
|
shWindow = o_shWindow;
|
|
|
|
blockSize = o_blockSize;
|
|
|
|
isAligned = o_isAligned;
|
|
reverse = o_reverse;
|
|
|
|
terminalGapOnly = o_terminalGapOnly;
|
|
right_boundary = o_right_boundary;
|
|
left_boundary = o_left_boundary;
|
|
|
|
filename = o_filename;
|
|
aligInfo = o_aligInfo;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
keepSequences = o_keepSeqs;
|
|
keepHeader = o_keepHeader;
|
|
|
|
forceCaps = false;
|
|
upperCase = false;
|
|
lowerCase = false;
|
|
|
|
gapSymbol = "-";
|
|
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Basic information for the new alignment */
|
|
sequences = new string[sequenNumber];
|
|
for(i = 0; i < sequenNumber; i++)
|
|
sequences[i] = o_sequences[i];
|
|
|
|
seqsName = new string[sequenNumber];
|
|
for(i = 0; i < sequenNumber; i++)
|
|
seqsName[i] = o_seqsName[i];
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
residuesNumber = new int[sequenNumber];
|
|
if((isAligned) || (o_residuesNumber != NULL)) {
|
|
for(i = 0; i < sequenNumber; i++)
|
|
residuesNumber[i] = residNumber;
|
|
} else {
|
|
for(i = 0; i < sequenNumber; i++)
|
|
residuesNumber[i] = o_residuesNumber[i];
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
if(o_seqsInfo != NULL) {
|
|
seqsInfo = new string[sequenNumber];
|
|
for(i = 0; i < sequenNumber; i++)
|
|
seqsInfo[i] = o_seqsInfo[i];
|
|
} else seqsInfo = NULL;
|
|
|
|
saveResidues = NULL;
|
|
if(o_saveResidues != NULL) {
|
|
saveResidues = new int[residNumber];
|
|
for(i = 0, j = 0; i < OldResidues; i++)
|
|
if(o_saveResidues[i] != -1) {
|
|
saveResidues[j] = o_saveResidues[i];
|
|
j++;
|
|
}
|
|
}
|
|
|
|
saveSequences = NULL;
|
|
if(o_saveSequences != NULL) {
|
|
saveSequences = new int[sequenNumber];
|
|
for(i = 0, j = 0; i < OldSequences; i++)
|
|
if(o_saveSequences[i] != -1) {
|
|
saveSequences[j] = o_saveSequences[i];
|
|
j++;
|
|
}
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
identities = NULL;
|
|
if(o_identities != NULL) {
|
|
identities = new float*[sequenNumber];
|
|
for(i = 0, j = 0; i < OldSequences; i++) {
|
|
if(o_saveSequences[i] != -1) {
|
|
identities[j] = new float[sequenNumber];
|
|
for(k = 0, ll = 0; k < OldSequences; k++) {
|
|
if(o_saveSequences[k] != -1) {
|
|
identities[j][ll] = o_identities[i][k];
|
|
ll++;
|
|
}
|
|
}
|
|
j++;
|
|
}
|
|
}
|
|
}
|
|
|
|
overlaps = NULL;
|
|
if(o_overlaps != NULL) {
|
|
overlaps = new float*[sequenNumber];
|
|
for(i = 0, j = 0; i < OldSequences; i++) {
|
|
if(o_saveSequences[i] != -1) {
|
|
overlaps[j] = new float[sequenNumber];
|
|
for(k = 0, ll = 0; k < OldSequences; k++) {
|
|
if(o_saveSequences[k] != -1) {
|
|
overlaps[j][ll] = o_overlaps[i][k];
|
|
ll++;
|
|
}
|
|
}
|
|
j++;
|
|
}
|
|
}
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Any structure associated to the new alignment is
|
|
* initialize to NULL. In this way, these structure,
|
|
* if it will be necessary, has to be computed */
|
|
sgaps = NULL;
|
|
scons = NULL;
|
|
seqMatrix = NULL;
|
|
identities = NULL;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Overlapping operator = to use it as a kind of class constructor */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
alignment &alignment::operator=(const alignment &old) {
|
|
int i, j;
|
|
|
|
if(this != &old) {
|
|
|
|
oldAlignment = true;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Assign the parameter values to the variables */
|
|
sequenNumber = old.sequenNumber;
|
|
residNumber = old.residNumber;
|
|
|
|
isAligned = old.isAligned;
|
|
reverse = old.reverse;
|
|
|
|
terminalGapOnly = old.terminalGapOnly;
|
|
right_boundary = old.right_boundary;
|
|
left_boundary = old.left_boundary;
|
|
|
|
iformat = old.iformat;
|
|
oformat = old.oformat;
|
|
shortNames = old.shortNames;
|
|
|
|
dataType = old.dataType;
|
|
|
|
ghWindow = old.ghWindow;
|
|
shWindow = old.shWindow;
|
|
|
|
blockSize = old.blockSize;
|
|
|
|
filename = old.filename;
|
|
aligInfo = old.aligInfo;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
keepSequences = old.keepSequences;
|
|
keepHeader = old.keepHeader;
|
|
|
|
forceCaps = old.forceCaps;
|
|
upperCase = old.upperCase;
|
|
lowerCase = old.lowerCase;
|
|
|
|
gapSymbol = old.gapSymbol;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
sequences = new string[sequenNumber];
|
|
for(i = 0; i < sequenNumber; i++)
|
|
sequences[i] = old.sequences[i];
|
|
|
|
seqsName = new string[sequenNumber];
|
|
for(i = 0; i < sequenNumber; i++)
|
|
seqsName[i] = old.seqsName[i];
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
delete [] residuesNumber;
|
|
if(old.residuesNumber) {
|
|
residuesNumber = new int[sequenNumber];
|
|
for(i = 0; i < sequenNumber; i++)
|
|
residuesNumber[i] = old.residuesNumber[i];
|
|
} residuesNumber = NULL;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
delete [] seqsInfo;
|
|
if(old.seqsInfo) {
|
|
seqsInfo = new string[sequenNumber];
|
|
for(i = 0; i < sequenNumber; i++)
|
|
seqsInfo[i] = old.seqsInfo[i];
|
|
} else seqsInfo = NULL;
|
|
|
|
delete [] saveResidues;
|
|
if(old.saveResidues) {
|
|
saveResidues = new int[residNumber];
|
|
for(i = 0; i < residNumber; i++)
|
|
saveResidues[i] = old.saveResidues[i];
|
|
} else saveResidues = NULL;
|
|
|
|
delete [] saveSequences;
|
|
if(old.saveSequences) {
|
|
saveSequences = new int[sequenNumber];
|
|
for(i = 0; i < sequenNumber; i++)
|
|
saveSequences[i] = old.saveSequences[i];
|
|
} else saveSequences = NULL;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
delete [] identities;
|
|
if(old.identities) {
|
|
identities = new float*[sequenNumber];
|
|
for(i = 0; i < sequenNumber; i++) {
|
|
identities[i] = new float[sequenNumber];
|
|
for(j = 0; j < sequenNumber; j++)
|
|
identities[i][j] = old.identities[i][j];
|
|
}
|
|
} else identities = NULL;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
delete [] overlaps;
|
|
if(old.overlaps) {
|
|
overlaps = new float*[sequenNumber];
|
|
for(i = 0; i < sequenNumber; i++) {
|
|
overlaps[i] = new float[sequenNumber];
|
|
for(j = 0; j < sequenNumber; j++)
|
|
overlaps[i][j] = old.overlaps[i][j];
|
|
}
|
|
} else overlaps = NULL;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
delete sgaps;
|
|
sgaps = NULL;
|
|
|
|
delete scons;
|
|
scons = NULL;
|
|
|
|
delete seqMatrix;
|
|
seqMatrix = old.seqMatrix;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
return *this;
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Class destructor */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
alignment::~alignment(void) {
|
|
int i;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
if(sequences != NULL)
|
|
delete [] sequences;
|
|
sequences = NULL;
|
|
|
|
if(seqsName != NULL)
|
|
delete [] seqsName;
|
|
seqsName = NULL;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
if(residuesNumber != NULL)
|
|
delete [] residuesNumber;
|
|
residuesNumber = NULL;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
if(seqsInfo != NULL)
|
|
delete [] seqsInfo;
|
|
seqsInfo = NULL;
|
|
|
|
if(saveResidues != NULL)
|
|
delete[] saveResidues;
|
|
saveResidues = NULL;
|
|
|
|
if(saveSequences != NULL)
|
|
delete[] saveSequences;
|
|
saveSequences = NULL;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
if(identities != NULL) {
|
|
for(i = 0; i < sequenNumber; i++)
|
|
delete [] identities[i];
|
|
delete [] identities;
|
|
}
|
|
identities = NULL;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
if(overlaps != NULL) {
|
|
for(i = 0; i < sequenNumber; i++)
|
|
delete [] overlaps[i];
|
|
delete [] overlaps;
|
|
}
|
|
overlaps = NULL;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
if(sgaps != NULL)
|
|
delete sgaps;
|
|
sgaps = NULL;
|
|
|
|
if(scons != NULL)
|
|
delete scons;
|
|
scons = NULL;
|
|
|
|
if(seqMatrix != NULL)
|
|
delete seqMatrix;
|
|
seqMatrix = NULL;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
oldAlignment = false;
|
|
|
|
sequenNumber = 0;
|
|
residNumber = 0;
|
|
|
|
isAligned = false;
|
|
reverse = false;
|
|
|
|
terminalGapOnly = false;
|
|
right_boundary = -1;
|
|
left_boundary = -1;
|
|
|
|
iformat = 0;
|
|
oformat = 0;
|
|
shortNames = false;
|
|
|
|
dataType = 0;
|
|
|
|
ghWindow = 0;
|
|
shWindow = 0;
|
|
|
|
blockSize = 0;
|
|
|
|
filename.clear();
|
|
aligInfo.clear();
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
// Try to load current alignment and inform otherwise
|
|
bool alignment::loadAlignment(char *alignmentFile) {
|
|
|
|
// Detect input alignment format - it is an strict detection procedure
|
|
iformat = formatInputAlignment(alignmentFile);
|
|
// Unless it is indicated somewhere else, output alignment format will be
|
|
// the same as the input one
|
|
oformat = iformat;
|
|
|
|
// Use the appropiate function to read input alignment
|
|
switch(iformat) {
|
|
case 1:
|
|
return loadClustalAlignment(alignmentFile);
|
|
case 3:
|
|
return loadNBRF_PirAlignment(alignmentFile);
|
|
case 8:
|
|
return loadFastaAlignment(alignmentFile);
|
|
case 11:
|
|
return loadPhylip3_2Alignment(alignmentFile);
|
|
case 12:
|
|
return loadPhylipAlignment(alignmentFile);
|
|
case 17:
|
|
return loadNexusAlignment(alignmentFile);
|
|
case 21:
|
|
return loadMegaInterleavedAlignment(alignmentFile);
|
|
case 22:
|
|
return loadMegaNonInterleavedAlignment(alignmentFile);
|
|
// Return a FALSE value - meaning the input alignment was not loaded
|
|
default:
|
|
return false;
|
|
}
|
|
return false;
|
|
}
|
|
|
|
// Return input alignment format - it is useful to set-up output alignment one
|
|
int alignment::getInputFormat(void) {
|
|
return iformat;
|
|
}
|
|
|
|
// Return output alignment format
|
|
int alignment::getOutputFormat(void) {
|
|
return oformat;
|
|
}
|
|
|
|
// Return whether there is one or more alignments at the input file.
|
|
// Currently trimAl family only support single alignments per file.
|
|
int alignment::typeInputFile(void) {
|
|
return SINGLE;
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This function prints the alignmnet to the standard output using an
|
|
* appropiate function depending on the output format */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
bool alignment::printAlignment(void){
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
if(sequences == NULL)
|
|
return false;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
if((residNumber == 0) || (sequenNumber == 0)) {
|
|
cerr << endl << "WARNING: Output alignment has not been generated. "
|
|
<< "It is empty" << endl << endl;
|
|
return true;
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
switch(oformat) {
|
|
case 1:
|
|
alignmentClustalToFile(cout);
|
|
break;
|
|
case 3:
|
|
alignmentNBRF_PirToFile(cout);
|
|
break;
|
|
case 8:
|
|
alignmentFastaToFile(cout);
|
|
break;
|
|
case 11:
|
|
alignmentPhylip3_2ToFile(cout);
|
|
break;
|
|
case 12:
|
|
alignmentPhylipToFile(cout);
|
|
break;
|
|
case 13:
|
|
alignmentPhylip_PamlToFile(cout);
|
|
break;
|
|
case 17:
|
|
alignmentNexusToFile(cout);
|
|
break;
|
|
case 21: case 22:
|
|
alignmentMegaToFile(cout);
|
|
break;
|
|
case 99:
|
|
getSequences(cout);
|
|
break;
|
|
case 100:
|
|
alignmentColourHTML(cout);
|
|
break;
|
|
default:
|
|
return false;
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
return true;
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This function puts the alignment in a given file */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
bool alignment::saveAlignment(char *destFile) {
|
|
|
|
ofstream file;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
if(sequences == NULL)
|
|
return false;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
if((residNumber == 0) || (sequenNumber == 0)) {
|
|
cerr << endl << "WARNING: Output alignment has not been generated. "
|
|
<< "It is empty." << endl << endl;
|
|
return true;
|
|
}
|
|
|
|
/* Check whether the input sequences file is aligned or not before creating
|
|
* a given output format. */
|
|
switch(oformat) {
|
|
case 1: case 11: case 12: case 13: case 17: case 21: case 22:
|
|
/* Check whether sequences in the alignment are aligned or not.
|
|
* Warn about it if there are not aligned. */
|
|
if (!isAligned) {
|
|
cerr << endl << "ERROR: Sequences are not aligned. Output format is "
|
|
<< "not compatible with unaligned sequences.";
|
|
return false;
|
|
}
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* File open and correct open check */
|
|
file.open(destFile);
|
|
if(!file) return false;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Depending on the output format, we call to the
|
|
* appropiate function */
|
|
switch(oformat) {
|
|
case 1:
|
|
alignmentClustalToFile(file);
|
|
break;
|
|
case 3:
|
|
alignmentNBRF_PirToFile(file);
|
|
break;
|
|
case 8:
|
|
alignmentFastaToFile(file);
|
|
break;
|
|
case 11:
|
|
alignmentPhylip3_2ToFile(file);
|
|
break;
|
|
case 12:
|
|
alignmentPhylipToFile(file);
|
|
break;
|
|
case 13:
|
|
alignmentPhylip_PamlToFile(file);
|
|
break;
|
|
case 17:
|
|
alignmentNexusToFile(file);
|
|
break;
|
|
case 21: case 22:
|
|
alignmentMegaToFile(file);
|
|
break;
|
|
case 99:
|
|
getSequences(file);
|
|
break;
|
|
case 100:
|
|
alignmentColourHTML(file);
|
|
break;
|
|
default:
|
|
return false;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Close the output file */
|
|
file.close();
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* All is OK, return true */
|
|
return true;
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This function trimms a given alignment based on the gap distribution
|
|
* values. To trim the alignment, this function uses two parameters:
|
|
*
|
|
* baseLine: Minimum percentage of columns that should
|
|
* be conserved in the new alignment.
|
|
* gapsPct: Maximum percentage of gaps per column that
|
|
* should be in the new alignment.
|
|
*
|
|
* The function selects that combination of parameters that maximize the
|
|
* final number of columns in the new alignment. There is an extra parameter
|
|
* to ask for the complementary alignment. A complementary alignment consits
|
|
* of those columns that would delete applying the standard approach. */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
alignment *alignment::cleanGaps(float baseLine, float gapsPct, bool complementary) {
|
|
|
|
alignment *ret;
|
|
double cut;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* If gaps statistics are not calculated, we
|
|
* calculate them */
|
|
if(calculateGapStats() != true)
|
|
return NULL;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Obtain the cut point using the given parameters */
|
|
cut = sgaps -> calcCutPoint(baseLine, gapsPct);
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Once we have the cut value proposed, we call the
|
|
* appropiate method to clean the alignment and, then,
|
|
* generate the new alignment. */
|
|
ret = cleanByCutValue(cut, baseLine, sgaps -> getGapsWindow(), complementary);
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Return a reference of the new alignment */
|
|
return ret;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This function trimms a given alignment based on the similarity distribution
|
|
* values. To trim the alignment, this function uses two parameters:
|
|
*
|
|
* baseLine: Minimum percentage of columns that should
|
|
* be conserved in the new alignment.
|
|
* conservat: Minimum value of similarity per column that
|
|
* should be in the new alignment.
|
|
*
|
|
* The function selects that combination of parameters that maximize the
|
|
* final number of columns in the new alignment. There is an extra parameter
|
|
* to ask for the complementary alignment. A complementary alignment consits
|
|
* of those columns that would delete applying the standard approach. */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
alignment *alignment::cleanConservation(float baseLine, float conservationPct, bool complementary) {
|
|
|
|
alignment *ret;
|
|
float cut;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* If conservation's statistics are not calculated,
|
|
* we calculate them */
|
|
if(calculateConservationStats() != true)
|
|
return NULL;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Calculate the cut point using the given parameters */
|
|
cut = (float) scons -> calcCutPoint(baseLine, conservationPct);
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Once we have the cut value, we call the appropiate
|
|
* method to clean the alignment and, then, generate
|
|
the new alignment */
|
|
ret = cleanByCutValue(cut, baseLine, scons -> getMdkwVector(), complementary);
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Return a reference of the new alignment */
|
|
return ret;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This function trimms a given alignment based on the similarity and gaps
|
|
* distribution values. To trim the alignment, this function uses three
|
|
* parameters:
|
|
*
|
|
* baseLine: Minimum percentage of columns that should
|
|
* be conserved in the new alignment.
|
|
* gapsPct: Maximum percentage of gaps per column that
|
|
* should be in the new alignment.
|
|
* conservat: Minimum value of similarity per column that
|
|
* should be in the new alignment.
|
|
*
|
|
* The function selects that combination of parameters that maximize the
|
|
* final number of columns in the new alignment. If the baseLine parameter
|
|
* is the most strict, the other two ones would be relaxed to keep columns
|
|
* that would not be selected. There is an extra parameter to ask for the
|
|
* complementary alignment. A complementary alignment consits of those
|
|
* columns that would delete applying the standard approach. */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
alignment *alignment::clean(float baseLine, float GapsPct, float conservationPct, bool complementary) {
|
|
|
|
alignment *ret;
|
|
float cutCons;
|
|
double cutGaps;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* If gaps statistics are not calculated, we calculate
|
|
* them */
|
|
if(calculateGapStats() != true)
|
|
return NULL;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* If conservation's statistics are not calculated,
|
|
* we calculate them */
|
|
if(calculateConservationStats() != true)
|
|
return NULL;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Calculate the two cut points using the parameters */
|
|
cutGaps = sgaps->calcCutPoint(baseLine, GapsPct);
|
|
cutCons = scons->calcCutPoint(baseLine, conservationPct);
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Clean the alingment using the two cut values, the
|
|
* gapsWindow and MDK_Windows vectors and the baseline
|
|
* value */
|
|
ret = cleanByCutValue(cutGaps, sgaps -> getGapsWindow(), baseLine, cutCons, scons -> getMdkwVector(), complementary);
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Return a reference of the clean alignment object */
|
|
return ret;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This function cleans a given alignment based on the consistency values
|
|
* asses from a dataset of alignments. The method takes three parameters:
|
|
*
|
|
* cutpoint: Lower limit (0-1) of comparefile value admits in
|
|
* the new alignment.
|
|
* baseline: Minimum percentage of columns to have in the new alignment
|
|
* vectValues: A vector with alignment's consistency values.
|
|
*
|
|
* The function computes the optimal parameter combination value to trim
|
|
* an alignment based on the consistency value from the comparison among
|
|
* a dataset of alignments with the same sequences. */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
alignment *alignment::cleanCompareFile(float cutpoint, float baseLine, float *vectValues, bool complementary) {
|
|
|
|
alignment *ret;
|
|
float cut, *vectAux;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Allocate memory */
|
|
vectAux = new float[residNumber];
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Sort a copy of the vectValues vector, and take the
|
|
* value at 100% - baseline position. */
|
|
utils::copyVect((float *) vectValues, vectAux, residNumber);
|
|
utils::quicksort(vectAux, 0, residNumber-1);
|
|
cut = vectAux[(int) ((float)(residNumber - 1) * (100.0 - baseLine)/100.0)];
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* We have to decide which is the smallest value
|
|
* between the cutpoint value and the value from
|
|
* the minimum percentage threshold */
|
|
cut = cutpoint < cut ? cutpoint : cut;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Deallocate memory */
|
|
delete [] vectAux;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Clean the selected alignment using the input parameters. */
|
|
ret = cleanByCutValue(cut, baseLine, vectValues, complementary);
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Return a refernce of the new alignment */
|
|
return ret;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This function removes those sequences that are misaligned with the rest
|
|
* of sequences in the alignment at given sequence and residue overlaps.
|
|
*
|
|
* overlapColumn: Set the minimum similarity fraction value that has
|
|
* to have a position from a given sequence to be
|
|
* considered as an hit.
|
|
* minimumOverlap: Set the minimum proportion of hits that has to be
|
|
* a sequence to keep it in the new alignment.
|
|
*
|
|
* At the same time, it's possible to get an alignment with those columns
|
|
* that this method should remove setting for that purpose the complementary
|
|
* parameter to True */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
alignment *alignment::cleanSpuriousSeq(float overlapColumn, float minimumOverlap, bool complementary) {
|
|
|
|
float *overlapVector;
|
|
alignment *newAlig;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Allocate local memory */
|
|
overlapVector = new float[sequenNumber];
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Compute the overlap's vector using the overlap
|
|
* column's value */
|
|
if(!calculateSpuriousVector(overlapColumn, overlapVector))
|
|
return NULL;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Select and remove the sequences with a overlap less
|
|
* than threshold's overlap and create a new alignemnt */
|
|
newAlig = cleanOverlapSeq(minimumOverlap, overlapVector, complementary);
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Deallocate local memory */
|
|
delete [] overlapVector;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Return a reference of the clean alignment object */
|
|
return newAlig;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This function implements the gappyout approach. To trim the alignment, this
|
|
* method computes the slope from the gaps distribution from the input alig.
|
|
* Using this information, the method asses the most abrupt change between
|
|
* three consecutive points from that distribution and selects the first of
|
|
* this point as aan cut-off point.
|
|
*
|
|
* The method also can return the complementary alignment that consists of
|
|
* those columns that would be delete applying this algorithm. */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
alignment *alignment::clean2ndSlope(bool complementarity) {
|
|
|
|
alignment *ret;
|
|
int cut;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* If gaps statistics are not calculated, we calculate
|
|
* them */
|
|
if(calculateGapStats() != true)
|
|
return NULL;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* We get the cut point using a automatic method for
|
|
* this purpose. */
|
|
cut = sgaps -> calcCutPoint2ndSlope();
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Using the cut point calculates in last steps, we
|
|
* clean the alignment and generate a new alignment */
|
|
ret = cleanByCutValue(cut, 0, sgaps->getGapsWindow(), complementarity);
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Returns the new alignment. */
|
|
return ret;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This method implements the strict and strictplus approaches, to apply
|
|
* these algorithms, the program computes the gaps and similarity distribution
|
|
* values to decide which ones are the optimal cutoff points. To compute these
|
|
* cutoff points, the method applies those steps described in the manual. */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
alignment *alignment::cleanCombMethods(bool complementarity, bool variable) {
|
|
|
|
float simCut, first20Point, last80Point, *simil, *vectAux;
|
|
int i, j, acm, gapCut, *positions, *gaps;
|
|
double inic, fin, vlr;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* If gaps statistics are not calculated, we calculate
|
|
* them */
|
|
if(calculateGapStats() != true)
|
|
return NULL;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Computes the gap cut point using a automatic method
|
|
* and at the same time, we get the gaps values from
|
|
* the alignment. */
|
|
gapCut = sgaps -> calcCutPoint2ndSlope();
|
|
gaps = sgaps -> getGapsWindow();
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* If conservation's statistics are not calculated,
|
|
* we calculate them */
|
|
if(calculateConservationStats() != true)
|
|
return NULL;
|
|
|
|
// Once the conservation stats have been calculated, generate the vector
|
|
// containing the similarity value for each column considering parameters
|
|
// such as windows size
|
|
simil = scons -> getMdkwVector();
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Allocate local memory and initializate it to -1 */
|
|
positions = new int[residNumber];
|
|
utils::initlVect(positions, residNumber, -1);
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* The method only selects columns with gaps number
|
|
* less or equal than the gap's cut point. Counts the
|
|
* number of columns that have been selected */
|
|
for(i = 0, acm = 0; i < residNumber; i++) {
|
|
if(gaps[i] <= gapCut) {
|
|
positions[i] = i;
|
|
acm++;
|
|
}
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Allocate local memory and save the similaritys
|
|
* values for the columns that have been selected */
|
|
vectAux = new float[acm];
|
|
for(i = 0, j = 0; i < residNumber; i++)
|
|
if(positions[i] != -1)
|
|
vectAux[j++] = simil[i];
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Sort the conservation's value vector. */
|
|
utils::quicksort(vectAux, 0, acm-1);
|
|
|
|
/* ...and search for the vector points at the 20 and
|
|
* 80% of length. */
|
|
first20Point = 0;
|
|
last80Point = 0;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
for(i = acm - 1, j = 1; i >= 0; i--, j++) {
|
|
if((((float) j/acm) * 100.0) <= 20.0)
|
|
first20Point = vectAux[i];
|
|
if((((float) j/acm) * 100.0) <= 80.0)
|
|
last80Point = vectAux[i];
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Computes the logaritmic's values for those points.
|
|
* Finally the method computes the similarity cut
|
|
* point using these values. */
|
|
inic = log10(first20Point);
|
|
fin = log10(last80Point);
|
|
vlr = ((inic - fin) / 10) + fin;
|
|
simCut = (float) pow(10, vlr);
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Clean the alignment and generate a new alignment
|
|
* object using the gaps cut and the similaritys cut
|
|
* values */
|
|
alignment *ret = cleanStrict(gapCut, sgaps -> getGapsWindow(),
|
|
simCut, scons -> getMdkwVector(), complementarity, variable);
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Deallocate local memory */
|
|
delete [] vectAux;
|
|
delete [] positions;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Return a reference of the new alignment */
|
|
return ret;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This function allows to delete those columns consistents of only gaps.
|
|
* This function is specially useful when we remove misaligned sequences from
|
|
* a given alignmnet. As the rest of functions, this function can return the
|
|
* complementary alignment but this alignment would have only columns with
|
|
* all gaps. */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
alignment *alignment::cleanNoAllGaps(bool complementarity) {
|
|
|
|
alignment *ret;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* If gaps statistics are not calculated, we calculate
|
|
* them */
|
|
if(calculateGapStats() != true)
|
|
return NULL;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* We want to conserve the columns with gaps' number
|
|
* less or equal than sequences' number - 1 */
|
|
ret = cleanByCutValue((sequenNumber - 1), 0, sgaps->getGapsWindow(), complementarity);
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Returns the new alignment. */
|
|
return ret;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This method computes the overlap for each sequence given a overlap value.
|
|
* This overlap set the minimum fraction that has to be a position for the
|
|
* selected sequence to count as an hit. This proportion measures how much
|
|
* is similar, (in terms of residues, indetermination and gaps), a given
|
|
* position respect to the element on its column. */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
bool alignment::calculateSpuriousVector(float overlap, float *spuriousVector) {
|
|
|
|
int i, j, k, seqValue, ovrlap, hit;
|
|
float floatOverlap;
|
|
char indet;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Compute the overlap */
|
|
floatOverlap = overlap * float(sequenNumber-1);
|
|
ovrlap = int(overlap * (sequenNumber-1));
|
|
|
|
if(floatOverlap > float(ovrlap))
|
|
ovrlap++;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* If the spurious vectos is NULL, returns false. */
|
|
if(spuriousVector == NULL)
|
|
return false;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Depending on the kind of alignment, we have
|
|
* different indetermination symbol */
|
|
if(getTypeAlignment() == AAType)
|
|
indet = 'X';
|
|
else
|
|
indet = 'N';
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* For each alignment's sequence, computes its overlap */
|
|
for(i = 0, seqValue = 0; i < sequenNumber; i++, seqValue = 0) {
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* For each alignment's column, computes the overlap
|
|
* between the selected sequence and the other ones */
|
|
for(j = 0; j < residNumber; j++) {
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* For sequences are before the sequence selected */
|
|
for(k = 0, hit = 0; k < i; k++) {
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* If the element of sequence selected is the same
|
|
* that the element of sequence considered, computes
|
|
* a hit */
|
|
if(sequences[i][j] == sequences[k][j])
|
|
hit++;
|
|
|
|
/* If the element of sequence selected isn't a 'X' nor
|
|
* 'N' (indetermination) or a '-' (gap) and the element
|
|
* of sequence considered isn't a a 'X' nor 'N'
|
|
* (indetermination) or a '-' (gap), computes a hit */
|
|
else if((sequences[i][j] != indet) && (sequences[i][j] != '-')
|
|
&& (sequences[k][j] != indet) && (sequences[k][j] != '-'))
|
|
hit++;
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* For sequences are after the sequence selected */
|
|
for(k = (i + 1); k < sequenNumber; k++) {
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* If the element of sequence selected is the same
|
|
* that the element of sequence considered, computes
|
|
* a hit */
|
|
if(sequences[i][j] == sequences[k][j])
|
|
hit++;
|
|
|
|
/* If the element of sequence selected isn't a 'X' nor
|
|
* 'N' (indetermination) or a '-' (gap) and the element
|
|
* of sequence considered isn't a a 'X' nor 'N'
|
|
* (indetermination) or a '-' (gap), computes a hit */
|
|
else if((sequences[i][j] != indet) && (sequences[i][j] != '-')
|
|
&& (sequences[k][j] != indet) && (sequences[k][j] != '-'))
|
|
hit++;
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Finally, if the hit's number divided by number of
|
|
* sequences minus one is greater or equal than
|
|
* overlap's value, computes a column's hit. */
|
|
if(hit >= ovrlap)
|
|
seqValue++;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* For each alignment's sequence, computes its spurious's
|
|
* or overlap's value as the column's hits -for that
|
|
sequence- divided by column's number. */
|
|
spuriousVector[i] = ((float) seqValue / residNumber);
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* If there is not problem in the method, return true */
|
|
return true;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This method compute the cluster belongs to a given alignment at certain
|
|
* identity threshold. To know the clusters from the alignment, those
|
|
* clusters are calculated as follow: 1) Select the longest sequences. 2)
|
|
* Using the previous computed identity values, compare if the second
|
|
* longest sequence belongs, because the identity value with the cluster
|
|
* representative is equal or greater than a given threshold, to this cluster
|
|
* or not. 3) If the sequence belongs to this cluster, we add it, if not
|
|
* belongs, we create a new cluster and fix this sequence as its representative
|
|
* 4) Continue with the rest of sequences. In the case that a given sequence
|
|
* can belong to more than one cluster, we choose the cluster which one
|
|
* maximize the identity value respect to its representative sequence */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
int *alignment::calculateRepresentativeSeq(float maximumIdent) {
|
|
|
|
int i, j, pos, clusterNum, **seqs;
|
|
int *cluster;
|
|
static int *repres;
|
|
float max;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Ask for the sequence identities assesment */
|
|
if(identities == NULL)
|
|
calculateSeqIdentity();
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
seqs = new int*[sequenNumber];
|
|
for(i = 0; i < sequenNumber; i++) {
|
|
seqs[i] = new int[2];
|
|
seqs[i][0] = utils::removeCharacter('-', sequences[i]).size();
|
|
seqs[i][1] = i;
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
utils::quicksort(seqs, 0, sequenNumber-1);
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
cluster = new int[sequenNumber];
|
|
cluster[0] = seqs[sequenNumber - 1][1];
|
|
clusterNum = 1;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
for(i = sequenNumber - 2; i >= 0; i--) {
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
for(j = 0, max = 0, pos = -1; j < clusterNum; j++) {
|
|
if(identities[seqs[i][1]][cluster[j]] > maximumIdent) {
|
|
if(identities[seqs[i][1]][cluster[j]] > max) {
|
|
max = identities[seqs[i][1]][cluster[j]];
|
|
pos = j;
|
|
}
|
|
}
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
if(pos == -1) {
|
|
cluster[j] = seqs[i][1];
|
|
clusterNum++;
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
repres = new int[clusterNum + 1];
|
|
repres[0] = clusterNum;
|
|
for(i = 0; i < clusterNum; i++)
|
|
repres[i+1] = cluster[i];
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Deallocate dinamic memory */
|
|
for(i = 0; i < sequenNumber; i++)
|
|
delete [] seqs[i];
|
|
|
|
delete [] cluster;
|
|
delete [] seqs;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
return repres;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This method looks for the optimal cut point for a given clusters number.
|
|
* The idea is to find a identity cut-off point that can be used to get a
|
|
* number of representative sequences similar to the input parameter */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
float alignment::getCutPointClusters(int clusterNumber) {
|
|
|
|
float max, min, avg, gMax, gMin, startingPoint, prevValue = 0, iter = 0;
|
|
int i, j, clusterNum, *cluster, **seqs;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* If the user wants only one cluster means that all
|
|
* of sequences have to be in the same clauster.
|
|
* Otherwise, if the users wants the maximum number of
|
|
* clusters means that each sequence have to be in their
|
|
* own cluster */
|
|
if(clusterNumber == sequenNumber) return 1;
|
|
else if(clusterNumber == 1) return 0;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Ask for the sequence identities assesment */
|
|
if(identities == NULL)
|
|
calculateSeqIdentity();
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Compute the maximum, the minimum and the average
|
|
* identity values from the sequences */
|
|
for(i = 0,gMax = 0, gMin = 1, startingPoint = 0; i < sequenNumber; i++) {
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
for(j = 0, max = 0, avg = 0, min = 1; j < i; j++) {
|
|
if(max < identities[i][j])
|
|
max = identities[i][j];
|
|
if(min > identities[i][j])
|
|
min = identities[i][j];
|
|
avg += identities[i][j];
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
for(j = i + 1; j < sequenNumber; j++) {
|
|
if(max < identities[i][j])
|
|
max = identities[i][j];
|
|
if(min > identities[i][j])
|
|
min = identities[i][j];
|
|
avg += identities[i][j];
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
startingPoint += avg / (sequenNumber - 1);
|
|
if(max > gMax) gMax = max;
|
|
if(min < gMin) gMin = min;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Take the starting point as the average value */
|
|
startingPoint /= sequenNumber;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Compute and sort the sequence length */
|
|
seqs = new int*[sequenNumber];
|
|
for(i = 0; i < sequenNumber; i++) {
|
|
seqs[i] = new int[2];
|
|
seqs[i][0] = utils::removeCharacter('-', sequences[i]).size();
|
|
seqs[i][1] = i;
|
|
}
|
|
utils::quicksort(seqs, 0, sequenNumber-1);
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Create the data structure to store the different
|
|
* clusters for a given thresholds */
|
|
cluster = new int[sequenNumber];
|
|
cluster[0] = seqs[sequenNumber - 1][1];
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/**** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Look for the optimal identity value to get the
|
|
* number of cluster set by the user. To do that, the
|
|
* method starts in the average identity value and moves
|
|
* to the right or lefe side of this value depending on
|
|
* if this value is so strict or not. We set an flag to
|
|
* avoid enter in an infinite loop, if we get the same
|
|
* value for more than 10 consecutive point without get
|
|
* the given cluster number means that it's impossible
|
|
* to get this cut-off and we need to stop the search */
|
|
do {
|
|
clusterNum = 1;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Start the search */
|
|
for(i = sequenNumber - 2; i >= 0; i--) {
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
for(j = 0; j < clusterNum; j++)
|
|
if(identities[seqs[i][1]][cluster[j]] > startingPoint)
|
|
break;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
if(j == clusterNum) {
|
|
cluster[j] = seqs[i][1];
|
|
clusterNum++;
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Given the cutoff point, if we get the same cluster
|
|
* number or we have been getting for more than 10
|
|
* consecutive times the same cutoff point, we stop */
|
|
if((clusterNum == clusterNumber) || (iter > 10))
|
|
break;
|
|
else {
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* We have to move to the left side from the average
|
|
* to decrease the cutoff value and get a smaller number
|
|
* of clusters */
|
|
if(clusterNum > clusterNumber) {
|
|
gMax = startingPoint;
|
|
startingPoint = (gMax + gMin)/2;
|
|
} else {
|
|
/* In the opposite side, we have to move to the right
|
|
* side from the cutting point to be more strict and get
|
|
* a bigger number of clusters */
|
|
gMin = startingPoint;
|
|
startingPoint = (gMax + gMin)/2;
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* If the clusters number from the previous iteration
|
|
* is different from the current number, we put the
|
|
* iteration number to 0 and store this new value */
|
|
if(prevValue != clusterNum) {
|
|
iter = 0;
|
|
prevValue = clusterNum;
|
|
/* Otherwise, we increase the iteration number to
|
|
* avoid enter in an infinitve loop */
|
|
} else iter++;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
} while(true);
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Deallocate dinamic memory */
|
|
for(i = 0; i < sequenNumber; i++) delete [] seqs[i];
|
|
delete [] seqs;
|
|
delete [] cluster;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
return startingPoint;
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This method select one representative sequence (the longest one) per each
|
|
* cluster from the input alignment and generate a new alignment */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
alignment *alignment::getClustering(float identityThreshold) {
|
|
|
|
string *matrixAux, *newSeqsName;
|
|
int i, j, *clustering;
|
|
alignment *newAlig;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Get the representative member for each cluster
|
|
* given a maximum identity threshold */
|
|
clustering = calculateRepresentativeSeq(identityThreshold);
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Put all sequences to be deleted and get back those
|
|
* sequences that are representative for each cluster
|
|
* */
|
|
for(i = 0; i < sequenNumber; i ++)
|
|
saveSequences[i] = -1;
|
|
for(i = 1; i <= clustering[0]; i ++)
|
|
saveSequences[clustering[i]] = clustering[i];
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* We allocate memory to save the sequences selected */
|
|
matrixAux = new string[clustering[0]];
|
|
newSeqsName = new string[clustering[0]];
|
|
|
|
/* Copy to new structures the information that have
|
|
* been selected previously. */
|
|
for(i = 0, j = 0; i < sequenNumber; i++)
|
|
if(saveSequences[i] != -1) {
|
|
newSeqsName[j] = seqsName[i];
|
|
matrixAux[j] = sequences[i];
|
|
j++;
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* When we have all parameters, we create the new
|
|
* alignment */
|
|
newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo,
|
|
clustering[0], residNumber, iformat, oformat, shortNames, dataType, isAligned,
|
|
reverse, terminalGapOnly, left_boundary, right_boundary,
|
|
keepSequences, keepHeader, sequenNumber, residNumber, residuesNumber,
|
|
saveResidues, saveSequences, ghWindow, shWindow, blockSize, identities,
|
|
overlaps);
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Deallocated auxiliar memory */
|
|
delete [] matrixAux;
|
|
delete [] newSeqsName;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Return the new alignment reference */
|
|
return newAlig;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This function returns the backtranslation for a given protein processed
|
|
* alignment into its CDS alignment. To do this convertion, the function needs
|
|
* the Coding sequences as well the original composition of each protein
|
|
* sequence. Also, the function needs to know which columns/sequences will be
|
|
* in the final alignment to carray out the conversion. */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
alignment *alignment::getTranslationCDS(int newResidues, int newSequences, int *ColumnsToKeep, string *oldSeqsName, sequencesMatrix *seqMatrix, alignment *ProtAlig) {
|
|
|
|
string *matrixAux;
|
|
alignment *newAlig;
|
|
int i, j, k, l, oldResidues;
|
|
int *mappedSeqs, *tmpSequence, *selectedRes;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Map the selected protein sequences to the input
|
|
* coding sequences */
|
|
mappedSeqs = new int[newSequences];
|
|
for(i = 0; i < sequenNumber; i++)
|
|
for(j = 0; j < newSequences; j++)
|
|
if(!seqsName[i].compare(oldSeqsName[j])) {
|
|
mappedSeqs[j] = i;
|
|
break;
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Get the original alignment size as well the original
|
|
* selected/non-selected columns */
|
|
oldResidues = seqMatrix -> getResidNumber();
|
|
selectedRes = new int[oldResidues];
|
|
|
|
for(i = 0; i < oldResidues; i++)
|
|
selectedRes[i] = i;
|
|
|
|
for(j = 0; j < ColumnsToKeep[0]; j++)
|
|
selectedRes[j] = -1;
|
|
|
|
for(i = 0; i < newResidues - 1; i++)
|
|
for(j = ColumnsToKeep[i] + 1; j < ColumnsToKeep[i+1]; j++)
|
|
selectedRes[j] = -1;
|
|
|
|
for(j = ColumnsToKeep[newResidues - 1] + 1; j < oldResidues; j++)
|
|
selectedRes[j] = -1;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* We allocate memory to save the columns selected */
|
|
matrixAux = new string[newSequences];
|
|
tmpSequence = new int[oldResidues];
|
|
|
|
/* Using the information about which residues for each
|
|
* sequence was selected by others function, we process
|
|
* these residues to recover the corresponding codons */
|
|
for(i = 0; i < newSequences; i++)
|
|
if(seqMatrix -> getSequence(oldSeqsName[i], tmpSequence)) {
|
|
for(j = 0; j < oldResidues; j++) {
|
|
if((selectedRes[j] != -1) && (tmpSequence[j] != 0)) {
|
|
|
|
for(k = 3 * (tmpSequence[j] - 1), l = 0; l < 3; k++, l++) {
|
|
/* Check whether the nucleotide sequences end has been reached or not.
|
|
* If it has been reached, complete backtranslation using indetermination
|
|
* symbols 'N' */
|
|
if((int) sequences[mappedSeqs[i]].length() > k)
|
|
matrixAux[i].resize(matrixAux[i].size() + 1, sequences[mappedSeqs[i]][k]);
|
|
else
|
|
matrixAux[i].resize(matrixAux[i].size() + 1, 'N');
|
|
}
|
|
} else if(selectedRes[j] != -1) {
|
|
matrixAux[i].resize(matrixAux[i].size() + 3, '-');
|
|
}
|
|
}
|
|
/* If there is any problems with a sequence then
|
|
* the function returns an error */
|
|
} else {
|
|
return NULL;
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
|
|
/* Remove columns/sequences composed only by gaps before making the retrotranslation */
|
|
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* When we have all parameters, we create the new
|
|
* alignment */
|
|
newAlig = new alignment(filename, "", matrixAux, oldSeqsName, NULL, newSequences,
|
|
newResidues * 3, ProtAlig -> getInputFormat(), ProtAlig -> getOutputFormat(),
|
|
ProtAlig -> getShortNames(), DNAType, true, ProtAlig -> getReverse(),
|
|
terminalGapOnly, left_boundary, right_boundary,
|
|
keepSequences, keepHeader, sequenNumber, oldResidues * 3, NULL, NULL,
|
|
NULL, 0, 0, ProtAlig -> getBlockSize(), NULL, NULL);
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Deallocated auxiliar memory */
|
|
delete [] matrixAux;
|
|
delete mappedSeqs;
|
|
delete tmpSequence;
|
|
delete selectedRes;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Return the new alignment reference */
|
|
return newAlig;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This function computes the gaps statistics for the input alignment. */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
bool alignment::calculateGapStats(void) {
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* If alignment matrix is not created, return false */
|
|
if(sequences == NULL)
|
|
return false;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* If sgaps object is not created, we create them
|
|
and calculate the statistics */
|
|
if(sgaps == NULL) {
|
|
sgaps = new statisticsGaps(sequences, sequenNumber, residNumber, dataType);
|
|
sgaps -> applyWindow(ghWindow);
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
return true;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Print the gaps value for each column in the alignment */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
void alignment::printStatisticsGapsColumns(void) {
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Check if there is computed the gaps statistics */
|
|
if(calculateGapStats())
|
|
/* then prints the information */
|
|
sgaps -> printGapsColumns();
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Print the acumulative gaps distribution value from the input alignment */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
void alignment::printStatisticsGapsTotal(void) {
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Check it there is computed the gaps statistics */
|
|
if(calculateGapStats())
|
|
/* then prints the information */
|
|
sgaps -> printGapsAcl();
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Set the similarity matrix. This matrix is necessary for some methods in
|
|
* the program */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
bool alignment::setSimilarityMatrix(similarityMatrix *sm) {
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* If scons object is not created, we create them */
|
|
if(scons == NULL)
|
|
scons = new statisticsConservation(sequences, sequenNumber, residNumber, dataType);
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Associate the matrix to the similarity statistics
|
|
* object */
|
|
if(!scons -> setSimilarityMatrix(sm))
|
|
return false;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* If it's OK, we return true */
|
|
return true;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This method compute the similarity values from the input alignment if it
|
|
* has not been computed before */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
bool alignment::calculateConservationStats(void) {
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* It the gaps statistics object has not been created
|
|
* we create it */
|
|
if(calculateGapStats() != true)
|
|
return false;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* It the similarity statistics object has not been
|
|
* created we create it */
|
|
if(scons == NULL)
|
|
return false;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Ask for the similarity matrix */
|
|
if(scons -> isSimMatrixDef() != true)
|
|
return false;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Compute the similarity statistics from the input
|
|
* alignment */
|
|
if(!scons -> calculateVectors(sequences, sgaps->getGapsWindow()))
|
|
return false;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Ask to know if it is necessary to apply any window
|
|
* method. If it's necessary, we apply it */
|
|
if(scons->isDefinedWindow())
|
|
return true;
|
|
else
|
|
return scons->applyWindow(shWindow);
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Print the similarity value for each column from the alignment */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
void alignment::printStatisticsConservationColumns(void) {
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Check if the similarity statistics object has been
|
|
* created */
|
|
if(calculateConservationStats())
|
|
/* then prints the information */
|
|
scons -> printConservationColumns();
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Print the accumulative similarity distribution values from the alignment */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
void alignment::printStatisticsConservationTotal(void) {
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Check if the similarity statistics object has been
|
|
* created */
|
|
if(calculateConservationStats())
|
|
/* then prints the information */
|
|
scons -> printConservationAcl();
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This method prints the correspondece between the columns in the original
|
|
* and in the trimmed alignment */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
void alignment::printCorrespondence(void) {
|
|
int i;
|
|
|
|
cout << "#ColumnsMap\t";
|
|
/* Print the saveResidues relathionship */
|
|
for(i = 0; i < residNumber - 1; i++)
|
|
cout << saveResidues[i] << ", ";
|
|
cout << saveResidues[i] << endl;
|
|
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Return the number of the sequences in the alignment */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
int alignment::getNumSpecies(void) {
|
|
return sequenNumber;
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Return the number of residues in the alignment */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
int alignment::getNumAminos(void) {
|
|
return residNumber;
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Sets the condition to remove only terminal gaps after applying any
|
|
* trimming method or not. */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
void alignment::trimTerminalGaps(bool terminalOnly_, int *boundaries_) {
|
|
terminalGapOnly = terminalOnly_;
|
|
|
|
/* We are only interested in the first and last number of the vector - this
|
|
* vector is generated by other function where it is important to store all
|
|
* intervals */
|
|
if (boundaries_ != NULL) {
|
|
left_boundary = boundaries_[0];
|
|
right_boundary = boundaries_[1];
|
|
}
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This method stores the diferent windows values in the alignment object */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
void alignment::setWindowsSize(int ghWindow_, int shWindow_) {
|
|
ghWindow = ghWindow_;
|
|
shWindow = shWindow_;
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This method lets to change the output format */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
void alignment::setOutputFormat(int format_, bool shortNames_) {
|
|
oformat = format_;
|
|
shortNames = shortNames_;
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This method sets a new block size value */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
void alignment::setBlockSize(int blockSize_) {
|
|
blockSize = blockSize_;
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Return true if the shortNames flag has been setted */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
int alignment::getShortNames(void) {
|
|
return shortNames;
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Return true if the reverse flag has been setted. */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
int alignment::getReverse(void) {
|
|
return reverse;
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This method lets to change the output alignment orientation */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
void alignment::setReverse(void) {
|
|
reverse = true;
|
|
}
|
|
|
|
|
|
/* Set appropiate flag to decide whether sequences composed only by gaps should
|
|
* be kept or not */
|
|
void alignment::setKeepSequencesFlag(bool flag) {
|
|
keepSequences = flag;
|
|
}
|
|
|
|
/* Set appropiate flag to decide whether original sequences header should be
|
|
* dumped into the output stream with or without any preprocessing step */
|
|
void alignment::setKeepSeqsHeaderFlag(bool flag) {
|
|
keepHeader = flag;
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Return the block size value */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
int alignment::getBlockSize(void) {
|
|
return blockSize;
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This method returns the sequences name */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
void alignment::getSequences(string *Names) {
|
|
for(int i = 0; i < sequenNumber; i++)
|
|
Names[i] = seqsName[i];
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
void alignment::getSequences(string *Names, int *lengths) {
|
|
for(int i = 0; i < sequenNumber; i++) {
|
|
lengths[i] = (int) utils::removeCharacter('-', sequences[i]).length();
|
|
Names[i] = seqsName[i];
|
|
}
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
void alignment::getSequences(string *Names, string *Sequences, int *Lengths) {
|
|
for(int i = 0; i < sequenNumber; i++) {
|
|
Names[i] = seqsName[i];
|
|
Sequences[i] = utils::removeCharacter('-', sequences[i]);
|
|
Lengths[i] = (int) Sequences[i].length();
|
|
}
|
|
}
|
|
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* For a given set of sequences name, check for its correspondence with
|
|
* the current sequences name. If there is not a complete correspondence
|
|
* between both sets, the method return false, otherwise, return true */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
bool alignment::getSeqNameOrder(string *names, int *orderVector) {
|
|
int i, j, numNames;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* For each name in the current alignment, we look
|
|
* for its correspondence in the input set */
|
|
for(i = 0, numNames = 0; i < sequenNumber; i++) {
|
|
for(j = 0; j < sequenNumber; j++) {
|
|
if(seqsName[i] == names[j]) {
|
|
orderVector[i] = j;
|
|
numNames++;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Depending on if we get the complete correspondence
|
|
* between both sets of names, we return true or not */
|
|
if(numNames == sequenNumber) return true;
|
|
else return false;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This method lets to build a sequence matrix. A sequence matrix contains
|
|
* the residue position for each sequence without taking into account the
|
|
* gaps in the sequence. This means that at position 10 we have the residue
|
|
* 1 for that sequence */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
void alignment::sequenMatrix(void) {
|
|
if(seqMatrix == NULL)
|
|
seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber);
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Use this method to destroy a given sequence matrix */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
void alignment::destroySequenMatrix(void) {
|
|
if(seqMatrix != NULL)
|
|
delete seqMatrix;
|
|
seqMatrix = NULL;
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Print the alignment's sequence matrix */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
void alignment::printSequenMatrix(void) {
|
|
if(seqMatrix == NULL)
|
|
seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber);
|
|
seqMatrix -> printMatrix();
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Given an index, this method returns the sequence matrix column for that
|
|
* index */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
void alignment::getColumnSeqMatrix(int column, int *columnSeqMatrix) {
|
|
if(seqMatrix == NULL)
|
|
seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber);
|
|
seqMatrix -> getColumn(column, columnSeqMatrix);
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This function looks if a given value belongs a given row. In the affirmative
|
|
* case, returns the columns value for that row and value combination */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
void alignment::getColumnSeqMatrix(int value, int row, int *columnSeqMatrix) {
|
|
if(seqMatrix == NULL)
|
|
seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber);
|
|
seqMatrix -> getColumn(value, row, columnSeqMatrix);
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This function change the rows order in the sequence matrix */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
void alignment::setSeqMatrixOrder(int *order) {
|
|
if(seqMatrix == NULL)
|
|
seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber);
|
|
seqMatrix -> setOrder(order);
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* This function change the rows order in the sequence matrix */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
sequencesMatrix *alignment::getSeqMatrix(void) {
|
|
if(seqMatrix == NULL)
|
|
seqMatrix = new sequencesMatrix(sequences, seqsName, sequenNumber, residNumber);
|
|
return seqMatrix;
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Computes, if it's necessary, and return the alignment's type */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
int alignment::getTypeAlignment(void) {
|
|
if(dataType == 0)
|
|
dataType = utils::checkTypeAlignment(sequenNumber, residNumber, sequences);
|
|
return dataType;
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Returns the correspondence between the columns in the original and in the
|
|
* trimmed alignment */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
int *alignment::getCorrespResidues(void) {
|
|
return saveResidues;
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Returns the correspondence between the sequences in the original and in
|
|
* the trimmed alignment */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
int *alignment::getCorrespSequences(void) {
|
|
return saveSequences;
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Returns if the alignment is aligned or not */
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
bool alignment::isFileAligned(void) {
|
|
return isAligned;
|
|
}
|
|
|
|
/* *****************************************************************************
|
|
* *****************************************************************************
|
|
* *****************************************************************************
|
|
* ************************************************************************** */
|
|
|
|
/* This method removes those columns that exceed a given threshold. If the
|
|
* number of columns in the trimmed alignment is lower than a given percentage
|
|
* of the original alignment. The program relaxes the threshold until to add
|
|
* enough columns to achieve this percentage, to add those columns the program
|
|
* start in the middle of the original alignment and make, alternatively,
|
|
* movements to the left and right side from that point. This method also can
|
|
* return the complementary alignment. */
|
|
alignment *alignment::cleanByCutValue(double cut, float baseLine,
|
|
const int *gInCol, bool complementary) {
|
|
|
|
int i, j, k, jn, oth, pos, block, *vectAux;
|
|
string *matrixAux, *newSeqsName;
|
|
alignment *newAlig;
|
|
newValues counter;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Select the columns with a gaps value less or equal
|
|
* than the cut point. */
|
|
for(i = 0, counter.residues = 0; i < residNumber; i++)
|
|
if(gInCol[i] <= cut) counter.residues++;
|
|
else saveResidues[i] = -1;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Compute, if it's necessary, the number of columns
|
|
* necessary to achieve the minimum number of columns
|
|
* fixed by coverage parameter. */
|
|
oth = utils::roundInt((((baseLine/100.0) - (float) counter.residues/residNumber)) * residNumber);
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* if it's necessary to recover some columns, we
|
|
* applied this instructions to recover it */
|
|
if(oth > 0) {
|
|
counter.residues += oth;
|
|
|
|
/* Allocate memory */
|
|
vectAux = new int[residNumber];
|
|
|
|
/* Sort a copy of the gInCol vector, and take the value of the column that marks the % baseline */
|
|
utils::copyVect((int *) gInCol, vectAux, residNumber);
|
|
utils::quicksort(vectAux, 0, residNumber-1);
|
|
cut = vectAux[(int) ((float)(residNumber - 1) * (baseLine)/100.0)];
|
|
|
|
/* Deallocate memory */
|
|
delete [] vectAux;
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Fixed the initial size of blocks as 0.5% of
|
|
* alignment's length */
|
|
for(k = utils::roundInt(0.005 * residNumber); (k >= 0) && (oth > 0); k--) {
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* We start in the alignment middle then we move on
|
|
* right and left side at the same time. */
|
|
for(i = (residNumber/2), j = (i + 1); (((i > 0) || (j < (residNumber - 1))) && (oth > 0)); i--, j++) {
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Left side. Here, we compute the block's size. */
|
|
for(jn = i; ((saveResidues[jn] != -1) && (jn >= 0) && (oth > 0)); jn--) ;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* if block's size is greater or equal than the fixed
|
|
* size then we save all columns that have not been
|
|
* saved previously. */
|
|
if((i - jn) >= k) {
|
|
for( ; ((saveResidues[jn] == -1) && (jn >= 0) && (oth > 0)); jn--) {
|
|
if(gInCol[jn] <= cut) {
|
|
saveResidues[jn] = jn;
|
|
oth--;
|
|
} else
|
|
break;
|
|
}
|
|
}
|
|
i = jn;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Right side. Here, we compute the block's size. */
|
|
for(jn = j; ((saveResidues[jn] != -1) && (jn < residNumber) && (oth > 0)); jn++) ;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* if block's size is greater or equal than the fixed
|
|
* size then we save all columns that have not been
|
|
* saved previously. */
|
|
if((jn - j) >= k) {
|
|
for( ; ((saveResidues[jn] == -1) && (jn < residNumber) && (oth > 0)); jn++) {
|
|
if(gInCol[jn] <= cut) {
|
|
saveResidues[jn] = jn;
|
|
oth--;
|
|
} else
|
|
break;
|
|
}
|
|
}
|
|
j = jn;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* Keep only columns blocks bigger than an input columns block size */
|
|
if(blockSize != 0) {
|
|
|
|
/* Traverse all alignment looking for columns blocks greater than LONGBLOCK,
|
|
* everytime than a column is not selected by the trimming method, check
|
|
* whether the current block size until that point is big enough to be kept
|
|
* or it should be removed from the final alignment */
|
|
for(i = 0, pos = 0, block = 0; i < residNumber; i++) {
|
|
if(saveResidues[i] != -1)
|
|
block++;
|
|
else {
|
|
/* Remove columns from blocks smaller than input blocks size */
|
|
if(block < blockSize)
|
|
for(j = pos; j <= i; j++)
|
|
saveResidues[j] = -1;
|
|
pos = i + 1;
|
|
block = 0;
|
|
}
|
|
}
|
|
/* Check final block separately since it could happen than last block is not
|
|
* big enough but because the loop end could provoke to ignore it */
|
|
if(block < blockSize)
|
|
for(j = pos; j < i; j++)
|
|
saveResidues[j] = -1;
|
|
}
|
|
|
|
/* If the flag -terminalony is set, apply a method to look for internal
|
|
* boundaries and get back columns inbetween them, if they exist */
|
|
if(terminalGapOnly == true)
|
|
if(!removeOnlyTerminal())
|
|
return NULL;
|
|
|
|
/* Once the columns/sequences selection is done, turn it around
|
|
* if complementary flag is active */
|
|
if(complementary == true)
|
|
computeComplementaryAlig(true, false);
|
|
|
|
/* Check for any additional column/sequence to be removed */
|
|
/* Compute new sequences and columns numbers */
|
|
counter = removeCols_SeqsAllGaps();
|
|
|
|
/* Allocate memory for selected sequences/columns */
|
|
matrixAux = new string[counter.sequences];
|
|
newSeqsName = new string[counter.sequences];
|
|
|
|
/* Fill local allocated memory with previously selected data */
|
|
fillNewDataStructure(matrixAux, newSeqsName);
|
|
|
|
/* When we have all parameters, we create the new alignment */
|
|
newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo,
|
|
counter.sequences, counter.residues, iformat, oformat, shortNames, dataType,
|
|
isAligned, reverse, terminalGapOnly, left_boundary, right_boundary,
|
|
keepSequences, keepHeader, sequenNumber, residNumber,
|
|
residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize,
|
|
identities, overlaps);
|
|
|
|
/* Deallocate local memory */
|
|
delete[] matrixAux;
|
|
delete[] newSeqsName;
|
|
|
|
/* Return the new alignment reference */
|
|
return newAlig;
|
|
}
|
|
|
|
/* This method removes those columns that not achieve a given threshold. If the
|
|
* number of columns in the trimmed alignment is lower than a given percentage
|
|
* of the original alignment. The program relaxes the threshold until to add
|
|
* enough columns to achieve this percentage, to add those columns the program
|
|
* start in the middle of the original alignment and make, alternatively,
|
|
* movements to the left and right side from that point. This method also can
|
|
* return the complementary alignment. */
|
|
alignment *alignment::cleanByCutValue(float cut, float baseLine,
|
|
const float *ValueVect, bool complementary) {
|
|
|
|
int i, j, k, jn, oth, pos, block;
|
|
string *matrixAux, *newSeqsName;
|
|
alignment *newAlig;
|
|
newValues counter;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Select the columns with a conservation's value
|
|
* greater than the cut point. */
|
|
for(i = 0, counter.residues = 0; i < residNumber; i++)
|
|
if(ValueVect[i] > cut) counter.residues++;
|
|
else saveResidues[i] = -1;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Compute, if it's necessary, the number of columns
|
|
* necessary to achieve the minimum number of columns
|
|
* fixed by coverage value. */
|
|
oth = utils::roundInt((((baseLine/100.0) - (float) counter.residues/residNumber)) * residNumber);
|
|
if(oth > 0) counter.residues += oth;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Fixed the initial size of blocks as 0.5% of
|
|
* alignment's length */
|
|
for(k = utils::roundInt(0.005 * residNumber); (k >= 0) && (oth > 0); k--) {
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* We start in the alignment middle then we move on
|
|
* right and left side at the same time. */
|
|
for(i = (residNumber/2), j = (i + 1); (((i > 0) || (j < (residNumber - 1))) && (oth > 0)); i--, j++) {
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Left side. Here, we compute the block's size. */
|
|
for(jn = i; ((saveResidues[jn] != -1) && (jn >= 0) && (oth > 0)); jn--) ;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* if block's size is greater or equal than the fixed
|
|
* size then we save all columns that have not been
|
|
* saved previously. */
|
|
/* Here, we only accept column with a conservation's
|
|
* value equal to conservation cut point. */
|
|
if((i - jn) >= k) {
|
|
for( ; ((saveResidues[jn] == -1) && (jn >= 0) && (oth > 0)); jn--) {
|
|
if(ValueVect[jn] == cut) {
|
|
saveResidues[jn] = jn;
|
|
oth--;
|
|
} else
|
|
break;
|
|
}
|
|
}
|
|
i = jn;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Right side. Here, we compute the block's size. */
|
|
for(jn = j; ((saveResidues[jn] != -1) && (jn < residNumber) && (oth > 0)); jn++) ;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* if block's size is greater or equal than the fixed
|
|
* size then we select the column and save the block's
|
|
* size for the next iteraction. it's obvius that we
|
|
* decrease the column's number needed to finish. */
|
|
/* Here, we only accept column with a conservation's
|
|
* value equal to conservation cut point. */
|
|
if((jn - j) >= k) {
|
|
for( ; ((saveResidues[jn] == -1) && (jn < residNumber) && (oth > 0)); jn++) {
|
|
if(ValueVect[jn] == cut) {
|
|
saveResidues[jn] = jn;
|
|
oth--;
|
|
} else
|
|
break;
|
|
}
|
|
}
|
|
j = jn;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* Keep only columns blocks bigger than an input columns block size */
|
|
if(blockSize != 0) {
|
|
|
|
/* Traverse all alignment looking for columns blocks greater than LONGBLOCK,
|
|
* everytime than a column is not selected by the trimming method, check
|
|
* whether the current block size until that point is big enough to be kept
|
|
* or it should be removed from the final alignment */
|
|
for(i = 0, pos = 0, block = 0; i < residNumber; i++) {
|
|
if(saveResidues[i] != -1)
|
|
block++;
|
|
else {
|
|
/* Remove columns from blocks smaller than input blocks size */
|
|
if(block < blockSize)
|
|
for(j = pos; j <= i; j++)
|
|
saveResidues[j] = -1;
|
|
pos = i + 1;
|
|
block = 0;
|
|
}
|
|
}
|
|
/* Check final block separately since it could happen than last block is not
|
|
* big enough but because the loop end could provoke to ignore it */
|
|
if(block < blockSize)
|
|
for(j = pos; j < i; j++)
|
|
saveResidues[j] = -1;
|
|
}
|
|
|
|
/* If the flag -terminalony is set, apply a method to look for internal
|
|
* boundaries and get back columns inbetween them, if they exist */
|
|
if(terminalGapOnly == true)
|
|
if(!removeOnlyTerminal())
|
|
return NULL;
|
|
|
|
/* Once the columns/sequences selection is done, turn it around
|
|
* if complementary flag is active */
|
|
if(complementary == true)
|
|
computeComplementaryAlig(true, false);
|
|
|
|
/* Check for any additional column/sequence to be removed */
|
|
/* Compute new sequences and columns numbers */
|
|
counter = removeCols_SeqsAllGaps();
|
|
|
|
/* Allocate memory for selected sequences/columns */
|
|
matrixAux = new string[counter.sequences];
|
|
newSeqsName = new string[counter.sequences];
|
|
|
|
/* Fill local allocated memory with previously selected data */
|
|
fillNewDataStructure(matrixAux, newSeqsName);
|
|
|
|
/* When we have all parameters, we create the new alignment */
|
|
newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo,
|
|
counter.sequences, counter.residues, iformat, oformat, shortNames, dataType,
|
|
isAligned, reverse, terminalGapOnly, left_boundary, right_boundary,
|
|
keepSequences, keepHeader, sequenNumber, residNumber,
|
|
residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize,
|
|
identities, overlaps);
|
|
|
|
/* Deallocate local memory */
|
|
delete[] matrixAux;
|
|
delete[] newSeqsName;
|
|
|
|
/* Return the new alignment reference */
|
|
return newAlig;
|
|
}
|
|
|
|
/* This method removes those columns that not achieve the similarity threshond,
|
|
* by one hand, and exceed the gaps threshold. If the number of columns that
|
|
* have been selected is lower that a given coverage, the program relaxes both
|
|
* thresholds in order to get back some columns and achieve this minimum
|
|
* number of columns in the new alignment. For that purpose, the program
|
|
* starts at the middle of the original alignment and makes, alternatively,
|
|
* movememts to the right and left sides looking for those columns necessary
|
|
* to achieve the minimum number of columns set by the coverage parameter.
|
|
* This method also can return the complementary alignmnet consists of those
|
|
* columns that will be deleted from the original alignment applying the
|
|
* standard method. */
|
|
alignment *alignment::cleanByCutValue(double cutGaps, const int *gInCol,
|
|
float baseLine, float cutCons, const float *MDK_Win, bool complementary) {
|
|
|
|
int i, j, k, oth, pos, block, jn, blGaps, *vectAuxGaps;
|
|
string *matrixAux, *newSeqsName;
|
|
float blCons, *vectAuxCons;
|
|
alignment *newAlig;
|
|
newValues counter;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Select the columns with a conservation's value
|
|
* greater than the conservation cut point AND
|
|
* less or equal than the gap cut point. */
|
|
for(i = 0, counter.residues = 0; i < residNumber; i++)
|
|
if((MDK_Win[i] > cutCons) && (gInCol[i] <= cutGaps)) counter.residues++;
|
|
else saveResidues[i] = -1;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Compute, if it's necessary, the number of columns
|
|
* necessary to achieve the minimum number of it fixed
|
|
* by the coverage parameter. */
|
|
oth = utils::roundInt((((baseLine/100.0) - (float) counter.residues/residNumber)) * residNumber);
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* If it's needed to add new columns, we compute the
|
|
* news thresholds */
|
|
if(oth > 0) {
|
|
counter.residues += oth;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Allocate memory */
|
|
vectAuxCons = new float[residNumber];
|
|
vectAuxGaps = new int[residNumber];
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Sort a copy of the MDK_Win vector and of the gInCol
|
|
* vector, and take the value of the column that marks
|
|
* the % baseline */
|
|
utils::copyVect((float *) MDK_Win, vectAuxCons, residNumber);
|
|
utils::copyVect((int *) gInCol, vectAuxGaps, residNumber);
|
|
|
|
utils::quicksort(vectAuxCons, 0, residNumber-1);
|
|
utils::quicksort(vectAuxGaps, 0, residNumber-1);
|
|
|
|
blCons = vectAuxCons[(int) ((float)(residNumber - 1) * (100.0 - baseLine)/100.0)];
|
|
blGaps = vectAuxGaps[(int) ((float)(residNumber - 1) * (baseLine)/100.0)];
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Deallocate memory */
|
|
delete [] vectAuxCons;
|
|
delete [] vectAuxGaps;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Fixed the initial size of blocks as 0.5% of
|
|
* alignment's length */
|
|
for(k = utils::roundInt(0.005 * residNumber); (k >= 0) && (oth > 0); k--) {
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* We start in the alignment middle then we move on
|
|
* right and left side at the same time. */
|
|
for(i = (residNumber/2), j = (i + 1); (((i > 0) || (j < (residNumber - 1))) && (oth > 0)); i--, j++) {
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Left side. Here, we compute the block's size. */
|
|
for(jn = i; ((saveResidues[jn] != -1) && (jn >= 0) && (oth > 0)); jn--) ;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* if block's size is greater or equal than the fixed
|
|
* size then we select the column and save the block's
|
|
* size for the next iteraction. it's obvius that we
|
|
* decrease the column's number needed to finish. */
|
|
/* Here, we accept column with a conservation's value
|
|
* greater or equal than the conservation cut point OR
|
|
* less or equal than the gap cut point. */
|
|
if((i - jn) >= k) {
|
|
for( ; ((saveResidues[jn] == -1) && (jn >= 0) && (oth > 0)); jn--) {
|
|
if((MDK_Win[jn] >= blCons) || (gInCol[jn] <= blGaps)) {
|
|
saveResidues[jn] = jn;
|
|
oth--;
|
|
} else
|
|
break;
|
|
}
|
|
}
|
|
i = jn;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* Right side. Here, we compute the block's size. */
|
|
for(jn = j; ((saveResidues[jn] != -1) && (jn < residNumber) && (oth > 0)); jn++) ;
|
|
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
/* if block's size is greater or equal than the fixed
|
|
* size then we select the column and save the block's
|
|
* size for the next iteraction. it's obvius that we
|
|
* decrease the column's number needed to finish. */
|
|
/* Here, we accept column with a conservation's value
|
|
* greater or equal than the conservation cut point OR
|
|
* less or equal than the gap cut point. */
|
|
if((jn - j) >= k) {
|
|
for( ; ((saveResidues[jn] == -1) && (jn < residNumber) && (oth > 0)); jn++) {
|
|
if((MDK_Win[jn] >= blCons) || (gInCol[jn] <= blGaps)) {
|
|
saveResidues[jn] = jn;
|
|
oth--;
|
|
} else
|
|
break;
|
|
}
|
|
}
|
|
j = jn;
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
/* ***** ***** ***** ***** ***** ***** ***** ***** */
|
|
}
|
|
|
|
/* Keep only columns blocks bigger than an input columns block size */
|
|
if(blockSize != 0) {
|
|
|
|
/* Traverse all alignment looking for columns blocks greater than LONGBLOCK,
|
|
* everytime than a column is not selected by the trimming method, check
|
|
* whether the current block size until that point is big enough to be kept
|
|
* or it should be removed from the final alignment */
|
|
for(i = 0, pos = 0, block = 0; i < residNumber; i++) {
|
|
if(saveResidues[i] != -1)
|
|
block++;
|
|
else {
|
|
/* Remove columns from blocks smaller than input blocks size */
|
|
if(block < blockSize)
|
|
for(j = pos; j <= i; j++)
|
|
saveResidues[j] = -1;
|
|
pos = i + 1;
|
|
block = 0;
|
|
}
|
|
}
|
|
/* Check final block separately since it could happen than last block is not
|
|
* big enough but because the loop end could provoke to ignore it */
|
|
if(block < blockSize)
|
|
for(j = pos; j < i; j++)
|
|
saveResidues[j] = -1;
|
|
}
|
|
|
|
/* If the flag -terminalony is set, apply a method to look for internal
|
|
* boundaries and get back columns inbetween them, if they exist */
|
|
if(terminalGapOnly == true)
|
|
if(!removeOnlyTerminal())
|
|
return NULL;
|
|
|
|
/* Once the columns/sequences selection is done, turn it around
|
|
* if complementary flag is active */
|
|
if(complementary == true)
|
|
computeComplementaryAlig(true, false);
|
|
|
|
/* Check for any additional column/sequence to be removed */
|
|
/* Compute new sequences and columns numbers */
|
|
counter = removeCols_SeqsAllGaps();
|
|
|
|
/* Allocate memory for selected sequences/columns */
|
|
matrixAux = new string[counter.sequences];
|
|
newSeqsName = new string[counter.sequences];
|
|
|
|
/* Fill local allocated memory with previously selected data */
|
|
fillNewDataStructure(matrixAux, newSeqsName);
|
|
|
|
/* When we have all parameters, we create the new alignment */
|
|
newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo,
|
|
counter.sequences, counter.residues, iformat, oformat, shortNames, dataType,
|
|
isAligned, reverse, terminalGapOnly, left_boundary, right_boundary,
|
|
keepSequences, keepHeader, sequenNumber, residNumber,
|
|
residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize,
|
|
identities, overlaps);
|
|
|
|
/* Deallocate local memory */
|
|
delete[] matrixAux;
|
|
delete[] newSeqsName;
|
|
|
|
/* Return the new alignment reference */
|
|
return newAlig;
|
|
}
|
|
|
|
/* This method carries out the strict and strict plus method. To trim the
|
|
* alignment in an automated way, the method uses as an input a given gaps
|
|
* thresholds over a gaps vector values, a given similarity threshold over a
|
|
* similarity vector values. With a flag, we can decide which method the
|
|
* program shall apply. With another one, the user can ask for the
|
|
* complementary alignment. In this method, those columns that has been marked
|
|
* to be deleted but has, at least, 3 of 4 surronding neighbour selected are
|
|
* get back to the new alignmnet. At other part of the method, those column
|
|
* blocks that do not have a minimum size fix by the method will be deleted
|
|
* from the trimmed alignment. */
|
|
alignment *alignment::cleanStrict(int gapCut, const int *gInCol, float simCut,
|
|
const float *MDK_W, bool complementary, bool variable) {
|
|
|
|
int i, num, lenBlock;
|
|
alignment *newAlig;
|
|
|
|
deque<int> neighboursBlock;
|
|
|
|
/* Reject columns with gaps number greater than the gap threshold. */
|
|
for(i = 0; i < residNumber; i++)
|
|
if(gInCol[i] > gapCut)
|
|
saveResidues[i] = -1;
|
|
|
|
/* Reject columns with similarity score under the threshold. */
|
|
for(i = 0; i < residNumber; i++)
|
|
if(MDK_W[i] < simCut)
|
|
saveResidues[i] = -1;
|
|
|
|
/* Search for those columns that have been removed but have,
|
|
* at least, 3 out of 4 adjacent columns selected. */
|
|
|
|
/* Load the first 5 columns into the list */
|
|
for(i = 0; i < (residNumber) && i < 5; i++)
|
|
neighboursBlock.push_back(saveResidues[i]);
|
|
|
|
/* Special case #1 - we analyze it before processing the rest positions
|
|
* Column: 1 - Save this column considering its neighbours */
|
|
if((saveResidues[0] == -1) && (neighboursBlock.size() > 2) &&
|
|
(neighboursBlock[1] != -1) &&
|
|
(neighboursBlock[2] != -1))
|
|
saveResidues[0] = 0;
|
|
|
|
/* Special case #2A - we analyze it before processing the rest positions
|
|
* Column: 2 - Save this column considering its neighbours */
|
|
if((saveResidues[1] == -1) && (neighboursBlock.size() > 3) &&
|
|
((neighboursBlock[0] != -1) &&
|
|
(neighboursBlock[2] != -1) &&
|
|
(neighboursBlock[3] != -1)))
|
|
saveResidues[1] = 1;
|
|
|
|
/* Special case #2B - when alignment is 3 columns long - we analyze it before
|
|
* processing the rest positions.
|
|
* Column: 2 - Save this column considering its neighbours */
|
|
if((saveResidues[1] == -1) && (neighboursBlock.size() > 2) &&
|
|
((neighboursBlock[0] != -1) &&
|
|
(neighboursBlock[2] != -1)))
|
|
saveResidues[1] = 1;
|
|
|
|
/* Normal cases */
|
|
for(i = 2, num = 0; i < (residNumber - 2); i++, num = 0) {
|
|
if(saveResidues[i] == -1) {
|
|
if(neighboursBlock[0] != -1)
|
|
num++;
|
|
if(neighboursBlock[1] != -1)
|
|
num++;
|
|
if(neighboursBlock[3] != -1)
|
|
num++;
|
|
if(neighboursBlock[4] != -1)
|
|
num++;
|
|
saveResidues[i] = (num >= 3) ? i : -1;
|
|
|
|
}
|
|
/* We slide the window one position to the right */
|
|
if((i+3) < residNumber) {
|
|
neighboursBlock.pop_front();
|
|
neighboursBlock.push_back(saveResidues[i+3]);
|
|
}
|
|
}
|
|
|
|
/* Special case #3 -
|
|
* Column: 2nd last one - We consider the list rather than the saveResidues
|
|
* vector in order to consider only unmodified values before the previous
|
|
* loop */
|
|
if((saveResidues[(residNumber - 2)] == -1) && (neighboursBlock.size() > 3) &&
|
|
((neighboursBlock[neighboursBlock.size() - 4] != -1) &&
|
|
(neighboursBlock[neighboursBlock.size() - 3] != -1) &&
|
|
(neighboursBlock[neighboursBlock.size() - 1] != -1)))
|
|
saveResidues[(residNumber - 2)] = (residNumber - 2);
|
|
|
|
/* Special case #4 -
|
|
* Column: last one - We consider the list rather than the saveResidues
|
|
* vector in order to consider only unmodified values before the previous
|
|
* loop */
|
|
if((saveResidues[(residNumber - 1)] == -1) && (neighboursBlock.size() > 2) &&
|
|
((neighboursBlock[neighboursBlock.size() - 3] != -1) &&
|
|
(neighboursBlock[neighboursBlock.size() - 2] != -1)))
|
|
saveResidues[(residNumber - 1)] = (residNumber - 1);
|
|
|
|
/* Select blocks size based on user input. It can be set either to 5 or to a
|
|
* variable number between 3 and 12 depending on alignment length (1% alig) */
|
|
if(!variable)
|
|
lenBlock = 5;
|
|
else {
|
|
lenBlock = utils::roundInt(residNumber * 0.01);
|
|
lenBlock = lenBlock > 3 ? (lenBlock < 12 ? lenBlock : 12) : 3;
|
|
}
|
|
|
|
/* Allow to change minimal block size */
|
|
blockSize = blockSize > 0 ? blockSize : lenBlock;
|
|
|
|
/* Keep only columns blocks bigger than either a computed dinamically or
|
|
* set by the user block size */
|
|
removeSmallerBlocks(blockSize);
|
|
|
|
/* If the flag -terminalony is set, apply a method to look for internal
|
|
* boundaries and get back columns inbetween them, if they exist */
|
|
if(terminalGapOnly == true)
|
|
if(!removeOnlyTerminal())
|
|
return NULL;
|
|
|
|
/* Once the columns/sequences selection is done, turn it around
|
|
* if complementary flag is active */
|
|
if(complementary == true)
|
|
computeComplementaryAlig(true, false);
|
|
|
|
/* Check for any additional column/sequence to be removed */
|
|
/* Compute new sequences and columns numbers */
|
|
|
|
newValues counter;
|
|
counter = removeCols_SeqsAllGaps();
|
|
|
|
/* Allocate memory for selected sequences/columns */
|
|
//~ matrixAux = new string[counter.sequences];
|
|
//~ newSeqsName = new string[counter.sequences];
|
|
|
|
/* Fill local allocated memory with previously selected data */
|
|
//~ fillNewDataStructure(matrixAux, newSeqsName);
|
|
|
|
fillNewDataStructure(&counter);
|
|
|
|
//~ cerr << counter -> seqsName[counter -> sequences - 1] << endl;
|
|
|
|
/* When we have all parameters, we create the new alignment */
|
|
//~ newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo, counter.sequences, counter.residues,
|
|
newAlig = new alignment(filename, aligInfo, counter.matrix, counter.seqsName,
|
|
seqsInfo, counter.sequences, counter.residues, iformat, oformat, shortNames,
|
|
dataType, isAligned, reverse, terminalGapOnly, left_boundary, right_boundary,
|
|
keepSequences, keepHeader, sequenNumber,
|
|
residNumber, residuesNumber, saveResidues, saveSequences, ghWindow, shWindow,
|
|
blockSize, identities, overlaps);
|
|
|
|
/* Deallocate local memory */
|
|
//~ delete[] matrixAux;
|
|
//~ delete[] newSeqsName;
|
|
|
|
/* Return the new alignment reference */
|
|
return newAlig;
|
|
}
|
|
|
|
/* Remove those sequences with an overlap less than a given threshold. It can
|
|
* return the complementary alignment if appropiate flag is set. */
|
|
alignment *alignment::cleanOverlapSeq(float minimumOverlap, float *overlapSeq,
|
|
bool complementary) {
|
|
|
|
string *matrixAux, *newSeqsName;
|
|
alignment *newAlig;
|
|
newValues counter;
|
|
int i;
|
|
|
|
/* Keep only those sequences with an overlap value equal or greater than
|
|
* the minimum overlap value set by the user. */
|
|
for(i = 0; i < sequenNumber; i++)
|
|
if(overlapSeq[i] < minimumOverlap)
|
|
saveSequences[i] = -1;
|
|
|
|
/* Once the columns/sequences selection is done, turn it around
|
|
* if complementary flag is active */
|
|
if(complementary == true)
|
|
computeComplementaryAlig(false, true);
|
|
|
|
/* Check for any additional column/sequence to be removed */
|
|
/* Compute new sequences and columns numbers */
|
|
counter = removeCols_SeqsAllGaps();
|
|
|
|
/* Allocate memory for selected sequences/columns */
|
|
matrixAux = new string[counter.sequences];
|
|
newSeqsName = new string[counter.sequences];
|
|
|
|
/* Fill local allocated memory with previously selected data */
|
|
fillNewDataStructure(matrixAux, newSeqsName);
|
|
|
|
/* When we have all parameters, we create the new alignment */
|
|
newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo,
|
|
counter.sequences, counter.residues, iformat, oformat, shortNames, dataType,
|
|
isAligned, reverse, terminalGapOnly, left_boundary, right_boundary,
|
|
keepSequences, keepHeader, sequenNumber, residNumber,
|
|
residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize,
|
|
identities, overlaps);
|
|
|
|
/* Deallocate local memory */
|
|
delete [] matrixAux;
|
|
delete [] newSeqsName;
|
|
|
|
/* Return the new alignment reference */
|
|
return newAlig;
|
|
}
|
|
|
|
/* Remove those columns, expressed as range, set by the user. It can return
|
|
* the complementary alignmnet if appropiate flags is set. */
|
|
alignment *alignment::removeColumns(int *columns, int init, int size,
|
|
bool complementary) {
|
|
|
|
string *matrixAux, *newSeqsName;
|
|
alignment *newAlig;
|
|
newValues counter;
|
|
int i, j;
|
|
|
|
/* Delete those range columns defines in the columns vector */
|
|
for(i = init; i < size + init; i += 2)
|
|
for(j = columns[i]; j <= columns[i+1]; j++)
|
|
saveResidues[j] = -1;
|
|
|
|
/* Once the columns/sequences selection is done, turn it around
|
|
* if complementary flag is active */
|
|
if(complementary == true)
|
|
computeComplementaryAlig(true, false);
|
|
|
|
/* Check for any additional column/sequence to be removed */
|
|
/* Compute new sequences and columns numbers */
|
|
counter = removeCols_SeqsAllGaps();
|
|
|
|
/* Allocate memory for selected sequences/columns */
|
|
matrixAux = new string[counter.sequences];
|
|
newSeqsName = new string[counter.sequences];
|
|
|
|
/* Fill local allocated memory with previously selected data */
|
|
fillNewDataStructure(matrixAux, newSeqsName);
|
|
|
|
/* When we have all parameters, we create the new alignment */
|
|
newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo,
|
|
counter.sequences, counter.residues, iformat, oformat, shortNames, dataType,
|
|
isAligned, reverse, terminalGapOnly, left_boundary, right_boundary,
|
|
keepSequences, keepHeader, sequenNumber, residNumber,
|
|
residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize,
|
|
identities, overlaps);
|
|
|
|
/* Deallocate local memory */
|
|
delete[] matrixAux;
|
|
delete[] newSeqsName;
|
|
|
|
/* Return the new alignment reference */
|
|
return newAlig;
|
|
}
|
|
|
|
/* This method removes those sequences, expressed as range of sequences, set by
|
|
* the user. The method also can return the complementary alignment, it means,
|
|
* those sequences that originally should be removed from the input alignment */
|
|
alignment *alignment::removeSequences(int *seqs, int init, int size,
|
|
bool complementary) {
|
|
|
|
string *matrixAux, *newSeqsName;
|
|
alignment *newAlig;
|
|
newValues counter;
|
|
int i, j;
|
|
|
|
/* Delete those range of sequences defines by the seqs vector */
|
|
for(i = init; i < size + init; i += 2)
|
|
for(j = seqs[i]; j <= seqs[i+1]; j++)
|
|
saveSequences[j] = -1;
|
|
|
|
/* Once the columns/sequences selection is done, turn it around
|
|
* if complementary flag is active */
|
|
if(complementary == true)
|
|
computeComplementaryAlig(false, true);
|
|
|
|
/* Check for any additional column/sequence to be removed */
|
|
/* Compute new sequences and columns numbers */
|
|
counter = removeCols_SeqsAllGaps();
|
|
|
|
/* Allocate memory for selected sequences/columns */
|
|
matrixAux = new string[counter.sequences];
|
|
newSeqsName = new string[counter.sequences];
|
|
|
|
/* Fill local allocated memory with previously selected data */
|
|
fillNewDataStructure(matrixAux, newSeqsName);
|
|
|
|
/* When we have all parameters, we create the new alignment */
|
|
newAlig = new alignment(filename, aligInfo, matrixAux, newSeqsName, seqsInfo,
|
|
counter.sequences, counter.residues, iformat, oformat, shortNames, dataType,
|
|
isAligned, reverse, terminalGapOnly, left_boundary, right_boundary,
|
|
keepSequences, keepHeader, sequenNumber, residNumber,
|
|
residuesNumber, saveResidues, saveSequences, ghWindow, shWindow, blockSize,
|
|
identities, overlaps);
|
|
|
|
/* Free local memory */
|
|
delete [] matrixAux;
|
|
delete [] newSeqsName;
|
|
|
|
/* Return the new alignment reference */
|
|
return newAlig;
|
|
}
|
|
|
|
/* Function for computing the complementary alignment. It just turn around the
|
|
* current columns/sequences selection */
|
|
void alignment::computeComplementaryAlig(bool residues, bool sequences) {
|
|
int i;
|
|
|
|
for(i = 0; i < residNumber && residues; i++)
|
|
saveResidues[i] = (saveResidues[i] == -1) ? i : -1;
|
|
|
|
for(i = 0; i < sequenNumber && sequences; i++)
|
|
saveSequences[i] = (saveSequences[i] == -1) ? i : -1;
|
|
}
|
|
|
|
/* Function designed for identifying right and left borders between central
|
|
* and terminal regions in the alignment. The borders are those columns, first
|
|
* and last, composed by only residues. Everything inbetween left and right
|
|
* borders are keept independently of the applied methods */
|
|
bool alignment::removeOnlyTerminal(void) {
|
|
|
|
int i;
|
|
const int *gInCol;
|
|
|
|
if((left_boundary == -1) and (right_boundary == -1)) {
|
|
/* Get alignments gaps stats and copy it */
|
|
if(calculateGapStats() != true) {
|
|
cerr << endl << "WARNING: Impossible to apply 'terminal-only' method"
|
|
<< endl << endl;
|
|
return false;
|
|
}
|
|
gInCol = sgaps -> getGapsWindow();
|
|
|
|
/* Identify left and right borders. First and last columns with no gaps */
|
|
for(i = 0; i < residNumber && gInCol[i] != 0; i++) ;
|
|
left_boundary = i;
|
|
|
|
for(i = residNumber - 1; i > -1 && gInCol[i] != 0; i--) ;
|
|
right_boundary = i;
|
|
}
|
|
|
|
else if(left_boundary >= right_boundary) {
|
|
cerr << endl << "ERROR: Check your manually set left '"<< left_boundary
|
|
<< "' and right '" << right_boundary << "' boundaries'" << endl << endl;
|
|
return false;
|
|
}
|
|
|
|
/* We increase the right boundary in one position because we use lower strict
|
|
* condition to get back columns inbetween boundaries removed by any method */
|
|
right_boundary += 1;
|
|
|
|
/* Once the interal boundaries have been established, if these limits exist
|
|
* then retrieved all columns inbetween both boundaries. Columns out of these
|
|
* limits will remain selected or not depending on the algorithm applied */
|
|
for(i = left_boundary; i < right_boundary; i++)
|
|
saveResidues[i] = i;
|
|
|
|
return true;
|
|
}
|
|
|
|
/* Function designed to identify and remove those columns blocks smaller than
|
|
* a given size */
|
|
void alignment::removeSmallerBlocks(int blockSize) {
|
|
|
|
int i, j, pos, block;
|
|
|
|
if(blockSize == 0)
|
|
return ;
|
|
|
|
/* Traverse the alignment looking for blocks greater than BLOCKSIZE, everytime
|
|
* than a column hasn't been selected, check whether the current block is big
|
|
* enough to be kept or it should be removed from the final alignment */
|
|
for(i = 0, pos = 0, block = 0; i < residNumber; i++) {
|
|
if(saveResidues[i] != -1)
|
|
block++;
|
|
else {
|
|
/* Remove columns from blocks smaller than input blocks size */
|
|
if(block < blockSize)
|
|
for(j = pos; j <= i; j++)
|
|
saveResidues[j] = -1;
|
|
pos = i + 1;
|
|
block = 0;
|
|
}
|
|
}
|
|
|
|
/* Check final block separately since it could happen than last block is not
|
|
* big enough but because the loop end could provoke to ignore it */
|
|
if(block < blockSize)
|
|
for(j = pos; j < i; j++)
|
|
saveResidues[j] = -1;
|
|
return ;
|
|
}
|
|
|
|
/* Function designed to identify columns and sequences composed only by gaps.
|
|
* Once these columns/sequences have been identified, they are removed from
|
|
* final alignment. */
|
|
newValues alignment::removeCols_SeqsAllGaps(void) {
|
|
int i, j, valid, gaps;
|
|
bool warnings = false;
|
|
newValues counter;
|
|
|
|
/* Check all valid columns looking for those composed by only gaps */
|
|
for(i = 0, counter.residues = 0; i < residNumber; i++) {
|
|
if(saveResidues[i] == -1)
|
|
continue;
|
|
|
|
for(j = 0, valid = 0, gaps = 0; j < sequenNumber; j++) {
|
|
if (saveSequences[j] == -1)
|
|
continue;
|
|
if (sequences[j][i] == '-')
|
|
gaps ++;
|
|
valid ++;
|
|
}
|
|
/* Once a column has been identified, warm about it and remove it */
|
|
if(gaps == valid) {
|
|
if(!warnings)
|
|
cerr << endl;
|
|
warnings = true;
|
|
cerr << "WARNING: Removing column '" << i << "' composed only by gaps"
|
|
<< endl;
|
|
saveResidues[i] = -1;
|
|
} else {
|
|
counter.residues ++;
|
|
}
|
|
}
|
|
|
|
/* Check for those selected sequences to see whether there is anyone with
|
|
* only gaps */
|
|
for(i = 0, counter.sequences = 0; i < sequenNumber; i++) {
|
|
if(saveSequences[i] == -1)
|
|
continue;
|
|
|
|
for(j = 0, valid = 0, gaps = 0; j < residNumber; j++) {
|
|
if(saveResidues[j] == -1)
|
|
continue;
|
|
if (sequences[i][j] == '-')
|
|
gaps ++;
|
|
valid ++;
|
|
}
|
|
/* Warm about it and remove each sequence composed only by gaps */
|
|
if(gaps == valid) {
|
|
if(!warnings)
|
|
cerr << endl;
|
|
warnings = true;
|
|
|
|
if(keepSequences) {
|
|
cerr << "WARNING: Keeping sequence '" << seqsName[i]
|
|
<< "' composed only by gaps" << endl;
|
|
counter.sequences ++;
|
|
} else {
|
|
cerr << "WARNING: Removing sequence '" << seqsName[i]
|
|
<< "' composed only by gaps" << endl;
|
|
saveSequences[i] = -1;
|
|
}
|
|
} else {
|
|
counter.sequences ++;
|
|
}
|
|
}
|
|
if(warnings)
|
|
cerr << endl;
|
|
|
|
counter.matrix = new string[counter.sequences];
|
|
counter.seqsName = new string[counter.sequences];
|
|
|
|
return counter;
|
|
}
|
|
|
|
/* Function for copying to previously allocated memory those data selected
|
|
* for being in the final alignment */
|
|
void alignment::fillNewDataStructure(string *newMatrix, string *newNames) {
|
|
int i, j, k;
|
|
|
|
/* Copy only those sequences/columns selected */
|
|
for(i = 0, j = 0; i < sequenNumber; i++) {
|
|
if(saveSequences[i] == -1)
|
|
continue;
|
|
|
|
newNames[j] = seqsName[i];
|
|
for(k = 0; k < residNumber; k++) {
|
|
if(saveResidues[k] == -1)
|
|
continue;
|
|
newMatrix[j].resize(newMatrix[j].size() + 1, sequences[i][k]);
|
|
}
|
|
j++;
|
|
}
|
|
}
|
|
|
|
/* Function for copying to previously allocated memory those data selected
|
|
* for being in the final alignment */
|
|
void alignment::fillNewDataStructure(newValues *data) {
|
|
int i, j, k;
|
|
|
|
/* Copy only those sequences/columns selected */
|
|
for(i = 0, j = 0; i < sequenNumber; i++) {
|
|
if(saveSequences[i] == -1)
|
|
continue;
|
|
|
|
data -> seqsName[j] = seqsName[i];
|
|
for(k = 0; k < residNumber; k++) {
|
|
if(saveResidues[k] == -1)
|
|
continue;
|
|
data -> matrix[j].resize(data -> matrix[j].size() + 1, sequences[i][k]);
|
|
}
|
|
j++;
|
|
}
|
|
// cerr << data -> seqsName[j-1] << endl;
|
|
}
|
|
|
|
/* Check if CDS file is correct based on: Residues are DNA/RNA (at most). There
|
|
* is not gaps in the whole dataset. Each sequence is multiple of 3. At the same
|
|
* time, the function will remove stop codons if appropiate flags are used */
|
|
bool alignment::prepareCodingSequence(bool splitByStopCodon, bool ignStopCodon,\
|
|
alignment *proteinAlig) {
|
|
|
|
bool warning = false;
|
|
size_t found;
|
|
int i;
|
|
|
|
/* New code: We now care about the presence of wildcards/indeterminations
|
|
* characters such as 'X' or 'B' into protein sequences as well as about the
|
|
* presence of Selenocysteines ('U') or Pyrrolysines ('O'). It only works, by
|
|
* now, with the universal genetic code */
|
|
int *protSeqsLengths, numbProtSeqs, current_prot;
|
|
string *protSeqsNames, *protSequences;
|
|
char aminoAcid;
|
|
|
|
numbProtSeqs = proteinAlig -> getNumSpecies();
|
|
protSeqsNames = new string[numbProtSeqs];
|
|
protSequences = new string[numbProtSeqs];
|
|
protSeqsLengths = new int[numbProtSeqs];
|
|
|
|
proteinAlig -> getSequences(protSeqsNames, protSequences, protSeqsLengths);
|
|
|
|
/* Check read sequences are real DNA/RNA */
|
|
if (getTypeAlignment() == AAType) {
|
|
cerr << endl << "ERROR: Check input CDS file. It seems to content protein "
|
|
<< "residues." << endl << endl;
|
|
return false;
|
|
}
|
|
|
|
for(i = 0; i < sequenNumber; i++) {
|
|
|
|
/* Get protein sequence to compare against any potential stop codon in the
|
|
* coding sequence. If there is not protein sequence for current coding
|
|
* sequence, skip its analysis */
|
|
for(current_prot = 0; current_prot < numbProtSeqs; current_prot++)
|
|
if(protSeqsNames[current_prot] == seqsName[i])
|
|
break;
|
|
if(current_prot == numbProtSeqs)
|
|
continue;
|
|
|
|
if(sequences[i].find("-") != string::npos) {
|
|
if (!warning)
|
|
cerr << endl;
|
|
cerr << "ERROR: Sequence \"" << seqsName[i] << "\" has, at least, one gap"
|
|
<< endl << endl;
|
|
return false;
|
|
}
|
|
|
|
if((sequences[i].length() % 3) != 0) {
|
|
if (!warning)
|
|
cerr << endl;
|
|
warning = true;
|
|
cerr << "WARNING: Sequence length \"" << seqsName[i] << "\" is not "
|
|
<< "multiple of 3 (length: " << sequences[i].length() << ")" << endl;
|
|
}
|
|
|
|
/* Ignore stop codons from the CDS if set by the user */
|
|
if (ignStopCodon)
|
|
continue;
|
|
|
|
/* Detect universal stop codons in the CDS. Then, compare those stop codons
|
|
* against the protein sequence to see whether they are real stop codons or
|
|
* are representing rare amino-acids such as Selenocysteines or Pyrrolysines
|
|
* It also allows stop-codons when there are wildcards/indet characters in
|
|
* the protein sequence. CDS sequences could be splitted using stop codons
|
|
* from the sequence itself */
|
|
|
|
/* Initialize first appearence of a given stop codon to -1.
|
|
* That means that it has not been found yet */
|
|
found = -1;
|
|
do {
|
|
found = sequences[i].find("TGA", found + 1);
|
|
|
|
/* If a stop codon has been found and its position is multiple of 3.
|
|
* Analize it */
|
|
if((found != string::npos) && (((int) found % 3) == 0)) {
|
|
|
|
aminoAcid = (char) toupper(protSequences[current_prot][(int) found/3]);
|
|
/* It may be a Selenocysteine ('TGA') which should be represented as 'U'
|
|
* or wildcard/indet characters such as 'X' or 'B' */
|
|
//~ if ((aminoAcid == 'U') || (aminoAcid == 'X') || (aminoAcid == 'B'))
|
|
/* If a rare amino-acids such as 'U'/'O' or a wildcard/indet character
|
|
* such as 'B'/'X' is present, skip current stop codon */
|
|
if((aminoAcid == 'U') || (aminoAcid == 'O') || (aminoAcid == 'X') || \
|
|
(aminoAcid == 'B'))
|
|
continue;
|
|
|
|
/* If split_by_stop_codon flag is activated then cut input CDS sequence
|
|
* up to first appearance of a stop codon */
|
|
else if(splitByStopCodon) {
|
|
if (!warning)
|
|
cerr << endl;
|
|
warning = true;
|
|
cerr << "WARNING: Cutting sequence \"" << seqsName[i] << "\" at first"
|
|
<< " appearance of stop codon \"TGA\" (residue \"" << aminoAcid
|
|
<< "\") at position " << (int) found + 1 << " (length: "
|
|
<< sequences[i].length() << ")" << endl;
|
|
sequences[i].resize((int) found);
|
|
}
|
|
/* Otherwise, warn about it and return an error */
|
|
else {
|
|
if (!warning)
|
|
cerr << endl;
|
|
cerr << "ERROR: Sequence \"" << seqsName[i] << "\" has stop codon \""
|
|
<< "TGA\" (residue \"" << aminoAcid << "\") at position "
|
|
<< (int) found + 1 << " (length: " << sequences[i].length() << ")"
|
|
<< endl << endl;
|
|
return false;
|
|
}
|
|
}
|
|
/* Iterate over the CDS until not stop codon is found */
|
|
} while(found != string::npos);
|
|
|
|
/* Initialize first appearence of a given stop codon to -1.
|
|
* That means that it has not been found yet */
|
|
found = -1;
|
|
do {
|
|
found = sequences[i].find("TAA", found + 1);
|
|
|
|
/* If a stop codon has been found and its position is multiple of 3.
|
|
* Analize it */
|
|
if((found != string::npos) && (((int) found % 3) == 0)) {
|
|
|
|
aminoAcid = (char) toupper(protSequences[current_prot][(int) found/3]);
|
|
/* Check if there is any wildcard/indet characters such as 'X' or 'B' */
|
|
//~ if ((aminoAcid == 'X') || (aminoAcid == 'B'))
|
|
/* If a rare amino-acids such as 'U'/'O' or a wildcard/indet character
|
|
* such as 'B'/'X' is present, skip current stop codon */
|
|
if((aminoAcid == 'U') || (aminoAcid == 'O') || (aminoAcid == 'X') || \
|
|
(aminoAcid == 'B'))
|
|
continue;
|
|
|
|
/* If split_by_stop_codon flag is activated then cut input CDS sequence
|
|
* up to first appearance of a stop codon */
|
|
else if(splitByStopCodon) {
|
|
if (!warning)
|
|
cerr << endl;
|
|
warning = true;
|
|
cerr << "WARNING: Cutting sequence \"" << seqsName[i] << "\" at first"
|
|
<< " appearance of stop codon \"TAA\" (residue \"" << aminoAcid
|
|
<< "\") at position " << (int) found + 1 << " (length: "
|
|
<< sequences[i].length() << ")" << endl;
|
|
sequences[i].resize((int) found);
|
|
}
|
|
/* Otherwise, warn about it and return an error */
|
|
else {
|
|
if (!warning)
|
|
cerr << endl;
|
|
cerr << "ERROR: Sequence \"" << seqsName[i] << "\" has stop codon \""
|
|
<< "TAA\" (residue \"" << aminoAcid << "\") at position "
|
|
<< (int) found + 1 << " (length: " << sequences[i].length() << ")"
|
|
<< endl << endl;
|
|
return false;
|
|
}
|
|
}
|
|
/* Iterate over the CDS until not stop codon is found */
|
|
} while(found != string::npos);
|
|
|
|
/* Initialize first appearence of a given stop codon to -1.
|
|
* That means that it has not been found yet */
|
|
found = -1;
|
|
do {
|
|
found = sequences[i].find("TAG", found + 1);
|
|
/* If a stop codon has been found and its position is multiple of 3.
|
|
* Analize it */
|
|
if((found != string::npos) && (((int) found % 3) == 0)) {
|
|
|
|
aminoAcid = (char) toupper(protSequences[current_prot][(int) found/3]);
|
|
/* It may be a Pyrrolysine ('TAG') which should be represented as 'O'
|
|
* or wildcard/indet characters such as 'X' or 'B' */
|
|
//~ if ((aminoAcid == 'O') || (aminoAcid == 'X') || (aminoAcid == 'B'))
|
|
/* If a rare amino-acids such as 'U'/'O' or a wildcard/indet character
|
|
* such as 'B'/'X' is present, skip current stop codon */
|
|
if((aminoAcid == 'U') || (aminoAcid == 'O') || (aminoAcid == 'X') || \
|
|
(aminoAcid == 'B'))
|
|
continue;
|
|
|
|
/* If split_by_stop_codon flag is activated then cut input CDS sequence
|
|
* up to first appearance of a stop codon */
|
|
else if(splitByStopCodon) {
|
|
if (!warning)
|
|
cerr << endl;
|
|
warning = true;
|
|
cerr << "WARNING: Cutting sequence \"" << seqsName[i] << "\" at first"
|
|
<< " appearance of stop codon \"TAG\" (residue \"" << aminoAcid
|
|
<< "\") at position " << (int) found + 1 << " (length: "
|
|
<< sequences[i].length() << ")" << endl;
|
|
sequences[i].resize((int) found);
|
|
}
|
|
/* Otherwise, warn about it and return an error */
|
|
else {
|
|
if (!warning)
|
|
cerr << endl;
|
|
cerr << "ERROR: Sequence \"" << seqsName[i] << "\" has stop codon \""
|
|
<< "TAG\" (residue \"" << aminoAcid << "\") at position "
|
|
<< (int) found + 1 << " (length: " << sequences[i].length() << ")"
|
|
<< endl << endl;
|
|
return false;
|
|
}
|
|
}
|
|
/* Iterate over the CDS until not stop codon is found */
|
|
} while(found != string::npos);
|
|
}
|
|
|
|
/* If everything was return an OK to informat about it. */
|
|
return true;
|
|
}
|
|
|
|
/* Function designed to check whether input CDS file is correct or not based on
|
|
* some features: Sequences are in both files (it could be more on CDS file),
|
|
* they have (more or less) same ength. Otherwise, some nucleotides could be
|
|
* excluded or some 'N's added to fit protein length. */
|
|
bool alignment::checkCorrespondence(string *names, int *lengths, int \
|
|
totalInputSeqs, int multiple = 1) {
|
|
|
|
int i, j, seqLength, indet;
|
|
bool warnings = false;
|
|
string tmp;
|
|
|
|
/* For each sequence in the current protein alignment, look for its coding
|
|
* DNA sequence checking that they have the same size. */
|
|
for(i = 0; i < sequenNumber; i++) {
|
|
|
|
/* Get protein sequence length removing any possible gap. Get as well last
|
|
* residue from current sequence */
|
|
|
|
tmp = utils::removeCharacter('-', sequences[i]);
|
|
seqLength = tmp.length() * multiple;
|
|
indet = ((int) tmp.length() - utils::min((int) tmp.find_last_not_of("X"), \
|
|
(int) tmp.find_last_not_of("x"))) - 1;
|
|
|
|
/* Go through all available CDS looking for the one with the same ID */
|
|
for(j = 0; j < totalInputSeqs; j++) {
|
|
|
|
/* Once both ID matchs, compare its lengths */
|
|
if(seqsName[i] == names[j]) {
|
|
|
|
/* If both sequences have the same length, stop the search */
|
|
if(seqLength == lengths[j])
|
|
break;
|
|
|
|
/* If nucleotide sequence is larger than protein sequence, warn about
|
|
* it and continue the verification process. It will used the 'Nth'
|
|
* first nucleotides for the conversion */
|
|
else if(seqLength < lengths[j]) {
|
|
if (!warnings)
|
|
cerr << endl;
|
|
warnings = true;
|
|
cerr << "WARNING: Sequence \"" << seqsName[i] << "\" will be cut at "
|
|
<< "position " << seqLength << " (length: "<< lengths[j] << ")"
|
|
<< endl;
|
|
break;
|
|
}
|
|
|
|
/* It has been detected some indeterminations at the end of the protein
|
|
* sequence. That issue could be cause by some incomplete codons in the
|
|
* nucleotide sequences. This issue is solved adding as much 'N' symbols
|
|
* as it is needed to preserve the backtranslated alignment */
|
|
else if((indet > 0) && (indet > (seqLength - lengths[j])/3)) {
|
|
if (!warnings)
|
|
cerr << endl;
|
|
warnings = true;
|
|
cerr << "WARNING: Sequence \"" << seqsName[i] << "\" has some inde"
|
|
<< "termination symbols 'X' at the end of sequence. They will be"
|
|
<< " included in the final alignment." << endl;
|
|
break;
|
|
}
|
|
|
|
/* If nucleotide sequence is shorter than protein sequence, return an
|
|
* error since it is not feasible to cut the input protein aligment to
|
|
* fit it into CDNA sequences size */
|
|
else {
|
|
if (!warnings)
|
|
cerr << endl;
|
|
warnings = true;
|
|
cerr << "WARNING: Sequence \"" << seqsName[i] << "\" has less nucleo"
|
|
<< "tides (" << lengths[j] << ") than expected (" << seqLength
|
|
<< "). It will be added N's to complete the sequence" << endl;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Warn about a mismatch a sequences name level */
|
|
if(j == totalInputSeqs) {
|
|
cerr << endl << "ERROR: Sequence \"" << seqsName[i] << "\" is not in "
|
|
<< "CDS file." << endl << endl;
|
|
return false;
|
|
}
|
|
}
|
|
|
|
/* If everything is OK, return an appropiate flag */
|
|
return true;
|
|
}
|