using namespace std;
bool alignment::fillMatrices(bool aligned) {
/* Function to determine if a set of sequences, that can be aligned or not,
* have been correctly load and are free of errors. */
int i, j;
/* Initialize some variables */
residuesNumber = new int[sequenNumber];
for(i = 0; i < sequenNumber; i++) {
residuesNumber[i] = sequences[i].size();
}
/* Check whether there are any unknow/no allowed character in the sequences */
for(i = 0; i < sequenNumber; i++)
for(j = 0; j < residuesNumber[i]; j++)
if((!isalpha(sequences[i][j])) && (!ispunct(sequences[i][j]))) {
cerr << endl << "ERROR: The sequence \"" << seqsName[i] << "\" has an "
<< "unknown (" << sequences[i][j] << ") character." << endl;
return false;
}
/* Check whether all sequences have same size or not */
for(i = 1; i < sequenNumber; i++)
if(residuesNumber[i] != residuesNumber[i-1])
break;
/* Set an appropriate flag for indicating if sequences are aligned or not */
isAligned = (i != sequenNumber) ? false : true;
/* Warm about those cases where sequences should be aligned
* and there are not */
if (aligned and !isAligned) {
cerr << endl << "ERROR: Sequences should be aligned (all with same length) "
<< "and there are not. Check your input alignment" << endl;
return false;
}
/* Full-fill some information about input alignment */
if(residNumber == 0)
residNumber = residuesNumber[0];
/* Check whether aligned sequences have the length fixed for the input alig */
for(i = 0; (i < sequenNumber) and (aligned); i++) {
if(residuesNumber[i] != residNumber) {
cerr << endl << "ERROR: The sequence \"" << seqsName[i] << "\" ("
<< residuesNumber[i] << ") does not have the same number of residues "
<< "fixed by the alignment (" << residNumber << ")." << endl;
return false;
}
}
/* If the sequences are aligned, initialize some additional variables.
* These variables will be useful for posterior analysis */
if((aligned) || (isAligned)) {
/* Asign its position to each column. That will be used to determine which
* columns should be kept in output alignment after applying any method
* and which columns should not */
saveResidues = new int[residNumber];
for(i = 0; i < residNumber; i++)
saveResidues[i] = i;
/* Asign its position to each sequence. Similar to the columns numbering
* process, assign to each sequence its position is useful to know which
* sequences will be in the output alignment */
saveSequences = new int[sequenNumber];
for(i = 0; i < sequenNumber; i++)
saveSequences[i] = i;
}
/* Return an flag indicating that everything is fine */
return true;
}
void alignment::getSequences(ostream &file) {
/* Get only residues sequences from input alignment */
int i, j;
string *tmpMatrix;
/* Allocate local memory for generating output alignment */
tmpMatrix = new string[sequenNumber];
/* Depending on alignment orientation: forward or reverse. Copy directly
* sequence information or get firstly the reversed sequences and then copy
* it into local memory. Once orientation has been determined, remove gaps
* from resuting sequences */
for(i = 0; i < sequenNumber; i++)
tmpMatrix[i] = (!reverse) ? utils::removeCharacter('-', sequences[i]) : \
utils::removeCharacter('-', utils::getReverse(sequences[i]));
/* Print output set of sequences in FASTA format*/
for(i = 0; i < sequenNumber; i++) {
file << ">" << seqsName[i] << endl;
for(j = 0; j < (int) tmpMatrix[i].size(); j += 60)
file << tmpMatrix[i].substr(j, 60) << endl;
file << endl;
}
/* Deallocate local memory */
delete [] tmpMatrix;
}
int alignment::formatInputAlignment(char *alignmentFile) {
/* Guess input alignment format */
char c, *firstWord = NULL, *line = NULL;
int format = 0, blocks = 0;
ifstream file;
string nline;
/* Check the file and its content */
file.open(alignmentFile, ifstream::in);
if(!utils::checkFile(file))
return false;
/* Read first valid line in a safer way */
do {
line = utils::readLine(file);
} while ((line == NULL) && (!file.eof()));
/* If the file end is reached without a valid line, warn about it */
if (file.eof())
return false;
/* Otherwise, split line */
firstWord = strtok(line, OTHDELIMITERS);
/* Clustal Format */
if((!strcmp(firstWord, "CLUSTAL")) || (!strcmp(firstWord, "clustal")))
format = 1;
/* NBRF/PIR Format */
else if(firstWord[0] == '>' && firstWord[3] == ';')
format = 3;
/* Fasta Format */
else if(firstWord[0] == '>')
format = 8;
/* Nexus Format */
else if((!strcmp(firstWord, "#NEXUS")) || (!strcmp(firstWord, "#nexus")))
format = 17;
/* Mega Format */
else if((!strcmp(firstWord, "#MEGA")) || (!strcmp(firstWord, "#mega"))) {
/* Determine specific mega format: sequential or interleaved.
* Counting the number of blocks (set of lines starting by "#") in
* the input file. */
blocks = 0;
do {
file.read(&c, 1);
} while((c != '#') && (!file.eof()));
do {
while((c != '\n') && (!file.eof()))
file.read(&c, 1);
file.read(&c, 1);
if(c == '#')
blocks++;
} while((c != '\n') && (!file.eof()));
/* MEGA Sequential (22) or Interleaved (21) */
format = (!blocks) ? 22 : 21;
}
/* Phylip Format */
else {
/* Determine specific phylip format: sequential or interleaved. */
/* Get number of sequences and residues */
sequenNumber = atoi(firstWord);
firstWord = strtok(NULL, DELIMITERS);
if(firstWord != NULL)
residNumber = atoi(firstWord);
/* If there is only one sequence, use by default sequential format since
* it is impossible to determine exactly which phylip format is */
if((sequenNumber == 1) && (residNumber != 0))
format = 12;
/* If there are more than one sequence, analyze sequences distribution to
* determine its format. */
else if((sequenNumber != 0) && (residNumber != 0)) {
blocks = 0;
/* Read line in a safer way */
do {
if (line != NULL)
delete [] line;
line = utils::readLine(file);
} while ((line == NULL) && (!file.eof()));
/* If the file end is reached without a valid line, warn about it */
if (file.eof())
return false;
firstWord = strtok(line, DELIMITERS);
while(firstWord != NULL) {
blocks++;
firstWord = strtok(NULL, DELIMITERS);
}
/* Read line in a safer way */
do {
if (line != NULL)
delete [] line;
line = utils::readLine(file);
} while ((line == NULL) && (!file.eof()));
firstWord = strtok(line, DELIMITERS);
while(firstWord != NULL) {
blocks--;
firstWord = strtok(NULL, DELIMITERS);
}
/* If the file end is reached without a valid line, warn about it */
if (file.eof())
return false;
/* Phylip Interleaved (12) or Sequential (11) */
format = (!blocks) ? 12 : 11;
}
}
/* Close the input file and delete dinamic memory */
file.close();
if (line != NULL)
delete [] line;
/* Return the input alignment format */
return format;
}
bool alignment::loadPhylipAlignment(char *alignmentFile) {
/* PHYLIP/PHYLIP 4 (Sequential) file format parser */
char *str, *line = NULL;
ifstream file;
int i;
/* Check the file and its content */
file.open(alignmentFile, ifstream::in);
if(!utils::checkFile(file))
return false;
/* Store some data about filename for possible uses in other formats */
filename.append("!Title ");
filename.append(alignmentFile);
filename.append(";");
/* Read first valid line in a safer way */
do {
line = utils::readLine(file);
} while ((line == NULL) && (!file.eof()));
/* If the file end is reached without a valid line, warn about it */
if (file.eof())
return false;
/* Read the input sequences and residues for each sequence numbers */
str = strtok(line, DELIMITERS);
sequenNumber = 0;
if(str != NULL)
sequenNumber = atoi(str);
str = strtok(NULL, DELIMITERS);
residNumber = 0;
if(str != NULL)
residNumber = atoi(str);
/* If something is wrong about the sequences or/and residues number,
* return an error to warn about that */
if((sequenNumber == 0) || (residNumber == 0))
return false;
/* Allocate memory for the input data */
sequences = new string[sequenNumber];
seqsName = new string[sequenNumber];
/* Read the lines block containing the sequences name + first fragment */
i = 0;
while((i < sequenNumber) && (!file.eof())){
/* Read lines in a safer way. Destroy previous stored information */
if (line != NULL)
delete [] line;
line = utils::readLine(file);
/* It the input line/s are blank lines, skip the loop iteration */
if(line == NULL)
continue;
/* First token: Sequence name */
str = strtok(line, DELIMITERS);
seqsName[i].append(str, strlen(str));
/* Trim the rest of the line from blank spaces, tabs, etc and store it */
str = strtok(NULL, DELIMITERS);
while(str != NULL) {
sequences[i].append(str, strlen(str));
str = strtok(NULL, DELIMITERS);
}
i++;
}
/* Read the rest of the input file */
while(!file.eof()) {
/* Try to get for each sequences its corresponding residues */
i = 0;
while((i < sequenNumber) && (!file.eof())) {
/* Read lines in a safer way. Destroy previous stored information */
if (line != NULL)
delete [] line;
line = utils::readLine(file);
/* It the input line/s are blank lines, skip the loop iteration */
if(line == NULL)
continue;
/* Remove from the current line non-printable characters and add fragments
* to previous stored sequence */
str = strtok(line, DELIMITERS);
while(str != NULL) {
sequences[i].append(str, strlen(str));
str = strtok(NULL, DELIMITERS);
}
i++;
}
}
/* Close the input file and delete dinamic memory */
file.close();
if (line != NULL)
delete [] line;
/* Check the matrix's content */
return fillMatrices(true);
}
bool alignment::loadPhylip3_2Alignment(char *alignmentFile) {
/* PHYLIP 3.2 (Interleaved) file format parser */
int i, blocksFirstLine, firstLine = true;
char *str, *line = NULL;
ifstream file;
/* Check the file and its content */
file.open(alignmentFile, ifstream::in);
if(!utils::checkFile(file))
return false;
/* Store the file name for futher format conversion*/
filename.append("!Title ");
filename.append(alignmentFile);
filename.append(";");
/* Read first valid line in a safer way */
do {
line = utils::readLine(file);
} while ((line == NULL) && (!file.eof()));
/* If the file end is reached without a valid line, warn about it */
if (file.eof())
return false;
/* Get the sequences and residues numbers. If there is any mistake,
* return a FALSE value to warn about the possible error */
str = strtok(line, DELIMITERS);
sequenNumber = 0;
if(str != NULL)
sequenNumber = atoi(str);
str = strtok(NULL, DELIMITERS);
residNumber = 0;
if(str != NULL)
residNumber = atoi(str);
if((sequenNumber == 0) || (residNumber == 0))
return false;
/* Reserve memory according to the input parameters */
sequences = new string[sequenNumber];
seqsName = new string[sequenNumber];
/* Point to the first sequence in the alignment. Since the alignment could not
* have blank lines to separate the different sequences. Store the blocks size
* for the first line including a sequence identifier */
i = 0;
blocksFirstLine = 0;
do {
/* Read lines in a safer way. Destroy previous stored information */
if (line != NULL)
delete [] line;
line = utils::readLine(file);
/* If there is nothing in the input line, skip the loop instructions */
if(line == NULL)
continue;
str = strtok(line, DELIMITERS);
/* First block: Sequence Name + Sequence fragment. Count how many blocks
* the first sequence line is divided. It could help to identify the
* different sequences from the input file */
if(firstLine) {
seqsName[i].append(str, strlen(str));
str = strtok(NULL, OTHDELIMITERS);
firstLine = 1;
}
/* Sequence fragment */
while(str != NULL) {
sequences[i].append(str, strlen(str));
str = strtok(NULL, OTHDELIMITERS);
/* Count the blocks number for the sequences first line */
if (firstLine)
firstLine += 1;
}
/* Store the blocks number for the first sequence including the name */
if ((blocksFirstLine == 0) and firstLine)
blocksFirstLine = firstLine;
/* If a false positive new sequence was detected, add stored information for
* the current sequence to the previous one and clear the data structure
* for the current sequence. Finally, move the sequence pointer to the
* previous one. */
if ((firstLine != false) and (firstLine != blocksFirstLine)) {
sequences[i-1].append(seqsName[i]);
seqsName[i].clear();
sequences[i-1].append(sequences[i]);
sequences[i].clear();
i --;
}
firstLine = false;
/* There are many ways to detect a new sequence. */
/* One of them -experimental- is just to detect if the residues number for
* the current entry is equal to the residues number for the whole align */
if ((int) sequences[i].size() == residNumber) {
firstLine = true;
i++;
}
} while(!file.eof());
/* Close the input file and delete dinamic memory */
file.close();
if (line != NULL)
delete [] line;
/* Check the matrix's content */
return fillMatrices(true);
}
bool alignment::loadClustalAlignment(char *alignmentFile) {
/* CLUSTAL file format parser */
int i, seqLength, pos, firstBlock;
char *str, *line = NULL;
ifstream file;
/* Check input file and its content */
file.open(alignmentFile, ifstream::in);
if(!utils::checkFile(file))
return false;
/* Store some details about input file to be used in posterior format
* conversions */
filename.append("!Title ");
filename.append(alignmentFile);
filename.append(";");
/* The first valid line corresponding to CLUSTAL label is ignored */
do {
line = utils::readLine(file);
} while ((line == NULL) && (!file.eof()));
/* If the file end is reached without a valid line, warn about it */
if (file.eof())
return false;
/* Ignore blank lines before first sequence block starts */
while(!file.eof()) {
/* Deallocate previously used dynamic memory */
if (line != NULL)
delete [] line;
/* Read lines in safe way */
line = utils::readLine(file);
if (line != NULL)
break;
}
/* The program in only interested in the first blocks of sequences since
* it wants to know how many sequences are in the input file */
sequenNumber = 0;
while(!file.eof()) {
/* If a new line without any valid character is detected
* means the first block is over */
if (line == NULL)
break;
/* Count how many times standard characters as well as
* gap symbol "-" is detected in current line. */
seqLength = (int) strlen(line);
for(pos = 0; pos < seqLength; pos++)
if((isalpha(line[pos])) || (line[pos] == '-'))
break;
/* If not standard characters are detected in current line means that
* the program has found typical line in clustal alignment files to mark
* some scores for some columns. In that case, the first block is over */
if(pos == seqLength)
break;
sequenNumber++;
/* Deallocate previously used dynamic memory */
if (line != NULL)
delete [] line;
/* Read lines in safe way */
line = utils::readLine(file);
}
/* Finish to preprocess the input file. */
file.clear();
file.seekg(0);
/* Allocate memory for the input alignmet */
seqsName = new string[sequenNumber];
sequences = new string[sequenNumber];
/* Read the title line and store it */
line = utils::readLine(file);
if (line == NULL)
return false;
aligInfo.append(line, strlen(line));
/* Ignore blank lines before first sequence block starts */
while(!file.eof()) {
/* Deallocate previously used dynamic memory */
if (line != NULL)
delete [] line;
/* Read lines in safe way */
line = utils::readLine(file);
if (line != NULL)
break;
}
/* Set-up sequences pointer to the first one and the flag to indicate
* the first blocks. That flag implies that sequences names have to be
* stored */
i = 0;
firstBlock = true;
while(!file.eof()) {
if (line == NULL) {
/* Sometimes, clustalw files does not have any marker after first block
* to indicate conservation between its columns residues. In that cases,
* mark the end of first block */
if (i == 0)
firstBlock = false;
/* Read current line and analyze it*/
line = utils::readLine(file);
continue;
}
/* Check whteher current line is a standard line or it is a line to mark
* quality scores for that alignment column */
seqLength = (int) strlen(line);
for(pos = 0; pos < seqLength; pos++)
if((isalpha(line[pos])) || (line[pos] == '-'))
break;
/* Start a new block in the input alignment */
if (pos == seqLength) {
firstBlock = false;
/* Deallocate dinamic memory if it has been used before */
if (line != NULL)
delete [] line;
/* Read current line and analyze it*/
line = utils::readLine(file);
continue;
}
/* If it is a standard line, split it into two parts. The first one contains
* sequence name and the second one the residues. If the "firstBlock" flag
* is active then store the sequence name */
str = strtok(line, OTHDELIMITERS);
if(str != NULL) {
if(firstBlock)
seqsName[i].append(str, strlen(str));
str = strtok(NULL, OTHDELIMITERS);
if(str != NULL)
sequences[i].append(str, strlen(str));
/* Move sequences pointer in a circular way */
i = (i + 1) % sequenNumber;
}
/* Deallocate dinamic memory if it has been used before */
if (line != NULL)
delete [] line;
/* Read current line and analyze it*/
line = utils::readLine(file);
}
/* Close the input file */
file.close();
/* Deallocate dinamic memory */
if (line != NULL)
delete [] line;
/* Check the matrix's content */
return fillMatrices(true);
}
bool alignment::loadFastaAlignment(char *alignmentFile) {
/* FASTA file format parser */
char *str, *line = NULL;
ifstream file;
int i;
/* Check the file and its content */
file.open(alignmentFile, ifstream::in);
if(!utils::checkFile(file))
return false;
/* Store input file name for posterior uses in other formats */
filename.append("!Title ");
filename.append(alignmentFile);
filename.append(";");
/* Compute how many sequences are in the input alignment */
sequenNumber = 0;
while(!file.eof()) {
/* Deallocate previously used dinamic memory */
if (line != NULL)
delete [] line;
/* Read lines in a safe way */
line = utils::readLine(file);
if (line == NULL)
continue;
/* It the line starts by ">" means that a new sequence has been found */
str = strtok(line, DELIMITERS);
if (str == NULL)
continue;
/* If a sequence name flag is detected, increase sequences counter */
if(str[0] == '>')
sequenNumber++;
}
/* Finish to preprocess the input file. */
file.clear();
file.seekg(0);
/* Allocate memory for the input alignmet */
seqsName = new string[sequenNumber];
sequences = new string[sequenNumber];
seqsInfo = new string[sequenNumber];
for(i = -1; (i < sequenNumber) && (!file.eof()); ) {
/* Deallocate previously used dinamic memory */
if (line != NULL)
delete [] line;
/* Read lines in a safe way */
line = utils::readLine(file);
if (line == NULL)
continue;
/* Store original header fom input sequences including non-standard
* characters */
if (line[0] == '>')
seqsInfo[i+1].append(&line[1], strlen(line) - 1);
/* Cut the current line and check whether there are valid characters */
str = strtok(line, OTHDELIMITERS);
if (str == NULL)
continue;
/* Check whether current line belongs to the current sequence
* or it is a new one. In that case, store the sequence name */
if(str[0] == '>') {
/* Move sequence name pointer until a valid string name is obtained */
do {
str = str + 1;
} while(strlen(str) == 0);
seqsName[++i].append(str, strlen(str));
continue;
}
/* Sequence */
while(str != NULL) {
sequences[i].append(str, strlen(str));
str = strtok(NULL, DELIMITERS);
}
}
/* Close the input file */
file.close();
/* Deallocate previously used dinamic memory */
if (line != NULL)
delete [] line;
/* Check the matrix's content */
return fillMatrices(false);
}
bool alignment::loadNexusAlignment(char *alignmentFile) {
/* NEXUS file format parser */
char *frag = NULL, *str = NULL, *line = NULL;
int i, pos, state, firstBlock;
ifstream file;
/* Check the file and its content */
file.open(alignmentFile, ifstream::in);
if(!utils::checkFile(file))
return false;
/* Store input file name for posterior uses in other formats */
/* We store the file name */
filename.append("!Title ");
filename.append(alignmentFile);
filename.append(";");
state = false;
do {
/* Destroy previous assigned memory */
if (line != NULL)
delete [] line;
/* Read line in a safer way */
line = utils::readLine(file);
if (line == NULL)
continue;
/* Discard line where there is not information */
str = strtok(line, DELIMITERS);
if(str == NULL)
continue;
/* If the line has any kind of information, try to catch it */
/* Firstly, convert to capital letters the input line */
for(i = 0; i < (int) strlen(str); i++)
str[i] = toupper(str[i]);
/* ... and then compare it again specific tags */
if(!strcmp(str, "BEGIN"))
state = true;
else if(!strcmp(str, "MATRIX"))
break;
/* Store information about input format file */
else if(!strcmp(str, "FORMAT")) {
str = strtok(NULL, DELIMITERS);
while(str != NULL) {
aligInfo.append(str, strlen(str));
aligInfo.append(" ", strlen(" "));
str = strtok(NULL, DELIMITERS);
}
}
/* In this case, try to get matrix dimensions */
else if((!strcmp(str, "DIMENSIONS")) && state) {
str = strtok(NULL, DELIMITERS);
frag = strtok(NULL, DELIMITERS);
str = strtok(str, "=;");
sequenNumber = atoi(strtok(NULL, "=;"));
frag = strtok(frag, "=;");
residNumber = atoi(strtok(NULL, "=;"));
}
} while(!file.eof());
/* Check all parameters */
if(strcmp(str, "MATRIX") || (sequenNumber == 0) || (residNumber == 0))
return false;
/* Allocate memory for the input alignmet */
seqsName = new string[sequenNumber];
sequences = new string[sequenNumber];
pos = 0;
state = false;
firstBlock = true;
while(!file.eof()) {
/* Destroy previous assigned memory */
if (line != NULL)
delete [] line;
/* Read line in a safer way */
line = utils::readLine(file);
if (line == NULL)
continue;
/* Discard any comments from input file */
for(i = 0; i < (int) strlen(line); i++) {
if (line[i] == '[')
state = true;
else if (line[i] == ']' && state) {
state = false;
break;
}
}
/* If there is a multi-line comments, skip it as well */
if ((state) || (not state && i != (int) strlen(line)))
continue;
/* Check for a specific tag indicating matrix end */
if((!strncmp(line, "end;", 4)) || (!strncmp(line, "END;", 4)))
break;
/* Split input line and check it if it is valid */
str = strtok(line, OTH2DELIMITERS);
if (str == NULL)
continue;
/* Store the sequence name, only from the first block */
if(firstBlock)
seqsName[pos].append(str, strlen(str));
/* Store rest of line as part of sequence */
str = strtok(NULL, OTH2DELIMITERS);
while(str != NULL) {
sequences[pos].append(str, strlen(str));
str = strtok(NULL, OTH2DELIMITERS);
}
/* Move sequences pointer to next one. It if it is last one, move it to
* the beginning and set the first block to false for avoiding to rewrite
* sequences name */
pos = (pos + 1) % sequenNumber;
if (not pos)
firstBlock = false;
}
/* Deallocate memory */
if (line != NULL)
delete [] line;
/* Close the input file */
file.close();
/* Check the matrix's content */
return fillMatrices(true);
}
bool alignment::loadMegaNonInterleavedAlignment(char *alignmentFile) {
/* MEGA sequential file format parser */
char *frag = NULL, *str = NULL, *line = NULL;
ifstream file;
int i;
/* Check the file and its content */
file.open(alignmentFile, ifstream::in);
if(!utils::checkFile(file))
return false;
/* Filename is stored as a title for MEGA input alignment.
* If it is detected later a label "TITLE" in input file, this information
* will be replaced for that one */
filename.append("!Title ");
filename.append(alignmentFile);
filename.append(";");
/* Skip first valid line */
do {
line = utils::readLine(file);
} while ((line == NULL) && (!file.eof()));
/* If the file end is reached without a valid line, warn about it */
if (file.eof())
return false;
/* Try to get input alignment information */
while(!file.eof()) {
/* Destroy previously allocated memory */
if (line != NULL)
delete [] line;
/* Read a new line in a safe way */
line = utils::readLine(file);
if (line == NULL)
continue;
/* If a sequence name flag is found, go out from getting information loop */
if(!strncmp(line, "#", 1))
break;
/* Destroy previously allocated memory */
if (frag != NULL)
delete [] frag;
/* Create a local copy from input line */
frag = new char[strlen(line) + 1];
strcpy(frag, line);
/* Split input line copy into pieces and analize it
* looking for specific labels */
str = strtok(frag, "!: ");
for(i = 0; i < (int) strlen(str); i++)
str[i] = toupper(str[i]);
/* If TITLE label is found, replace previously stored information with
* this info */
if(!strcmp(str, "TITLE")) {
filename.clear();
if(strncmp(line, "!", 1))
filename += "!";
filename += line;
}
/* If FORMAT label is found, try to get some details from input file */
else if(!strcmp(str, "FORMAT"))
aligInfo.append(line, strlen(line));
}
/* Deallocate local memory */
if (frag != NULL)
delete [] frag;
/* Count how many sequences are in input alignment file */
do {
/* Check whether input line is valid or not */
if (line == NULL) {
line = utils::readLine(file);
continue;
}
/* If current line starts by a # means that it is a sequence name */
if (!strncmp(line, "#", 1))
sequenNumber++;
/* Destroy previously allocated memory */
if (line != NULL)
delete [] line;
/* Read a new line in a safe way */
line = utils::readLine(file);
} while(!file.eof());
/* Move file pointer to the beginner */
file.clear();
file.seekg(0);
/* Allocate memory */
seqsName = new string[sequenNumber];
sequences = new string[sequenNumber];
/* Skip first line */
line = utils::readLine(file);
/* Skip lines until first sequence name is found */
while(!file.eof()) {
/* Destroy previously allocated memory */
if (line != NULL)
delete [] line;
/* Read a new line in a safe way */
line = utils::readLine(file);
if (line == NULL)
continue;
/* If sequence name label is found, go out from loop */
if (!strncmp(line, "#", 1))
break;
}
/* First sequence is already detected so its name should be stored */
i = -1;
/* This loop is a bit tricky because first sequence name has been already
* detected, so it is necessary to process it before moving to next line.
* That implies that lines are read at loop ends */
while(!file.eof()) {
/* Skip blank lines */
if (line == NULL) {
line = utils::readLine(file);
continue;
}
/* Skip lines with comments */
if (!strncmp(line, "!", 1)) {
/* Deallocate memory and read a new line */
delete [] line;
line = utils::readLine(file);
continue;
}
/* Remove comments inside of sequences and split input line */
frag = utils::trimLine(line);
/* Skip lines with only comments */
if (frag == NULL) {
/* Deallocate memory and read a new line */
if (line != NULL)
delete [] line;
line = utils::readLine(file);
continue;
}
/* Otherwise, split it into fragments */
str = strtok(frag, " #\n");
/* Sequence Name */
if (!strncmp(line, "#", 1)) {
i += 1;
seqsName[i].append(str, strlen(str));
str = strtok(NULL, " #\n");
}
/* Sequence itself */
while(str != NULL) {
sequences[i].append(str, strlen(str));
str = strtok(NULL, " \n");
}
/* Deallocate dynamic memory */
if (frag != NULL)
delete [] frag;
if (line != NULL)
delete [] line;
/* Read a new line in a safe way */
line = utils::readLine(file);
}
/* Close input file */
file.close();
/* Deallocate dynamic memory */
if (line != NULL)
delete [] line;
/* Check the matrix's content */
return fillMatrices(true);
}
bool alignment::loadMegaInterleavedAlignment(char *alignmentFile) {
/* MEGA interleaved file format parser */
char *frag = NULL, *str = NULL, *line = NULL;
int i, firstBlock = true;
ifstream file;
/* Check the file and its content */
file.open(alignmentFile, ifstream::in);
if(!utils::checkFile(file))
return false;
/* Filename is stored as a title for MEGA input alignment.
* If it is detected later a label "TITLE" in input file, this information
* will be replaced for that one */
filename.append("!Title ");
filename.append(alignmentFile);
filename.append(";");
/* Skip first valid line */
do {
line = utils::readLine(file);
} while ((line == NULL) && (!file.eof()));
/* If the file end is reached without a valid line, warn about it */
if (file.eof())
return false;
/* Try to get input alignment information */
while(!file.eof()) {
/* Destroy previously allocated memory */
if (line != NULL)
delete [] line;
/* Read a new line in a safe way */
line = utils::readLine(file);
if (line == NULL)
continue;
/* If a sequence name flag is found, go out from getting information loop */
if(!strncmp(line, "#", 1))
break;
/* Create a local copy from input line */
frag = new char[strlen(line) + 1];
strcpy(frag, line);
/* Split input line copy into pieces and analize it
* looking for specific labels */
str = strtok(frag, "!: ");
for(i = 0; i < (int) strlen(str); i++)
str[i] = toupper(str[i]);
/* If TITLE label is found, replace previously stored information with
* this info */
if(!strcmp(str, "TITLE")) {
filename.clear();
if(strncmp(line, "!", 1))
filename += "!";
filename += line;
}
/* If FORMAT label is found, try to get some details from input file */
else if(!strcmp(str, "FORMAT"))
aligInfo.append(line, strlen(line));
/* Destroy previously allocated memory */
if (frag != NULL)
delete [] frag;
}
/* Count how many sequences are in input file */
while(!file.eof()) {
/* If a sequence name flag has been detected, increase counter */
if(!strncmp(line, "#", 1))
sequenNumber++;
/* Deallocate dynamic memory */
if(line != NULL)
delete [] line;
/* Read lines in a safe way */
line = utils::readLine(file);
/* If a blank line is detected means first block of sequences is over */
/* Then, break counting sequences loop */
if (line == NULL)
break;
}
/* Finish to preprocess the input file. */
file.clear();
file.seekg(0);
/* Allocate memory */
seqsName = new string[sequenNumber];
sequences = new string[sequenNumber];
/* Skip first line */
line = utils::readLine(file);
/* Skip lines until first # flag is reached */
while(!file.eof()) {
/* Deallocate previously used dynamic memory */
if (line != NULL)
delete [] line;
/* Read line in a safer way */
line = utils::readLine(file);
/* Determine whether a # flag has been found in current string */
if (line != NULL)
if(!strncmp(line, "#", 1))
break;
}
/* Read sequences and get from first block, the sequences names */
i = 0;
firstBlock = true;
while(!file.eof()) {
if (line == NULL) {
/* Read line in a safer way */
line = utils::readLine(file);
continue;
}
if (!strncmp(line, "!", 1)) {
/* Deallocate memory and read a new line */
delete [] line;
line = utils::readLine(file);
continue;
}
/* Trim lines from any kind of comments and split it */
frag = utils::trimLine(line);
str = strtok(frag, " #\n");
/* Check whether a line fragment is valid or not */
if (str == NULL)
continue;
/* Store sequences names if firstBlock flag is TRUE */
if(firstBlock)
seqsName[i].append(str, strlen(str));
/* Store sequence */
str = strtok(NULL, " \n");
while(str != NULL) {
sequences[i].append(str, strlen(str));
str = strtok(NULL, " \n");
}
/* Deallocate previously used dynamic memory */
if (frag != NULL)
delete [] frag;
if (line != NULL)
delete [] line;
/* Read line in a safer way */
line = utils::readLine(file);
i = (i + 1) % sequenNumber;
if (i == 0)
firstBlock = false;
}
/* Close input file */
file.close();
/* Deallocate local memory */
if (line != NULL)
delete [] line;
/* Check the matrix's content */
return fillMatrices(true);
}
bool alignment::loadNBRF_PirAlignment(char *alignmentFile) {
/* NBRF/PIR file format parser */
bool seqIdLine, seqLines;
char *str, *line = NULL;
ifstream file;
int i;
/* Check the file and its content */
file.open(alignmentFile, ifstream::in);
if(!utils::checkFile(file))
return false;
/* Store input file name for posterior uses in other formats */
filename.append("!Title ");
filename.append(alignmentFile);
filename.append(";");
/* Compute how many sequences are in the input alignment */
sequenNumber = 0;
while(!file.eof()) {
/* Deallocate previously used dinamic memory */
if (line != NULL)
delete [] line;
/* Read lines in a safe way */
line = utils::readLine(file);
if (line == NULL)
continue;
/* It the line starts by ">" means that a new sequence has been found */
str = strtok(line, DELIMITERS);
if (str == NULL)
continue;
/* If a sequence name flag is detected, increase sequences counter */
if(str[0] == '>')
sequenNumber++;
}
/* Finish to preprocess the input file. */
file.clear();
file.seekg(0);
/* Allocate memory for the input alignmet */
sequences = new string[sequenNumber];
seqsName = new string[sequenNumber];
seqsInfo = new string[sequenNumber];
/* Initialize some local variables */
seqIdLine = true;
seqLines = false;
i = -1;
/* Read the entire input file */
while(!file.eof()) {
/* Deallocate local memory */
if (line != NULL)
delete [] line;
/* Read lines in a safe way */
line = utils::readLine(file);
if (line == NULL)
continue;
/* Sequence ID line.
* Identification of these kind of lines is based on presence of ">" and ";"
* symbols at positions 0 and 3 respectively */
if((line[0] == '>') && (line[3] == ';') && (seqIdLine)) {
seqIdLine = false;
i += 1;
/* Store information about sequence datatype */
str = strtok(line, ">;");
seqsInfo[i].append(str, strlen(str));
/* and the sequence identifier itself */
str = strtok(NULL, ">;");
seqsName[i].append(str, strlen(str));
}
/* Line just after sequence Id line contains a textual description of
* the sequence. */
else if((!seqIdLine) && (!seqLines)) {
seqLines = true;
seqsInfo[i].append(line, strlen(line));
}
/* Sequence lines itself */
else if (seqLines) {
/* Check whether a sequence end symbol '*' exists in current line.
* In that case, set appropriate flags to read a new sequence */
if (line[strlen(line) - 1] == '*') {
seqLines = false;
seqIdLine = true;
}
/* Process line */
str = strtok(line, OTHDELIMITERS);
while (str != NULL) {
sequences[i].append(str, strlen(str));
str = strtok(NULL, OTHDELIMITERS);
}
/* In case the end symbol '*' has been detected, remove it */
if(sequences[i][sequences[i].size() - 1] == '*')
sequences[i].erase(sequences[i].size()-1);
}
}
/* Close the input file */
file.close();
/* Deallocate dinamic memory */
if (line != NULL)
delete [] line;
/* Check the matrix's content */
return fillMatrices(true);
}
void alignment::alignmentPhylipToFile(ostream &file) {
/* Generate output alignment in PHYLIP/PHYLIP 4 format (sequential) */
int i, j, maxLongName;
string *tmpMatrix;
/* Check whether sequences in the alignment are aligned or not.
* Warn about it if there are not aligned. */
if (!isAligned) {
cerr << endl << "ERROR: Sequences are not aligned. Format (PHYLIP) "
<< "not compatible with unaligned sequences." << endl << endl;
return ;
}
/* Allocate local memory for generating output alignment */
tmpMatrix = new string[sequenNumber];
/* Depending on alignment orientation: forward or reverse. Copy directly
* sequence information or get firstly the reversed sequences and then
* copy it into local memory */
for(i = 0; i < sequenNumber; i++)
tmpMatrix[i] = (!reverse) ? sequences[i] : utils::getReverse(sequences[i]);
/* Depending on if short name flag is activated (limits sequence name up to
* 10 characters) or not, get maximum sequence name length */
maxLongName = PHYLIPDISTANCE;
for(i = 0; (i < sequenNumber) && (!shortNames); i++)
maxLongName = utils::max(maxLongName, seqsName[i].size());
/* Generating output alignment */
/* First Line: Sequences Number & Residued Number */
file << " " << sequenNumber << " " << residNumber << endl;
/* First Block: Sequences Names & First 60 residues */
for(i = 0; i < sequenNumber; i++)
file << setw(maxLongName + 3) << left << seqsName[i].substr(0, maxLongName)
<< tmpMatrix[i].substr(0, 60) << endl;
file << endl;
/* Rest of blocks: Print 60 residues per each blocks of sequences */
for(i = 60; i < residNumber; i += 60) {
for(j = 0; j < sequenNumber; j++)
file << tmpMatrix[j].substr(i, 60) << endl;
file << endl;
}
file << endl;
/* Deallocate local memory */
delete [] tmpMatrix;
}
void alignment::alignmentPhylip3_2ToFile(ostream &file) {
/* Generate output alignment in PHYLIP 3.2 format (interleaved) */
int i, j, k, maxLongName;
string *tmpMatrix;
/* Check whether sequences in the alignment are aligned or not.
* Warn about it if there are not aligned. */
if (!isAligned) {
cerr << endl << "ERROR: Sequences are not aligned. Format (PHYLIP) "
<< "not compatible with unaligned sequences." << endl << endl;
return ;
}
/* Allocate local memory for generating output alignment */
tmpMatrix = new string[sequenNumber];
/* Depending on alignment orientation: forward or reverse. Copy directly
* sequence information or get firstly the reversed sequences and then
* copy it into local memory */
for(i = 0; i < sequenNumber; i++)
tmpMatrix[i] = (!reverse) ? sequences[i] : utils::getReverse(sequences[i]);
/* Depending on if short name flag is activated (limits sequence name up to
* 10 characters) or not, get maximum sequence name length */
maxLongName = PHYLIPDISTANCE;
for(i = 0; (i < sequenNumber) && (!shortNames); i++)
maxLongName = utils::max(maxLongName, seqsName[i].size());
/* Generating output alignment */
/* First Line: Sequences Number & Residued Number */
file << " " << sequenNumber << " " << residNumber << endl;
/* Alignment */
/* For each sequence, print its identifier and then the sequence itself in
* blocks of 50 residues */
for(i = 0; i < sequenNumber; i++) {
/* Sequence Name */
file << setw(maxLongName + 3) << left << seqsName[i].substr(0, maxLongName);
/* Sequence. Each line contains a block of 5 times 10 residues. */
for(j = 0; j < residNumber; j += 50) {
for(k = j; (k < residNumber) && (k < (j + 50)); k += 10)
file << sequences[i].substr(k, 10) << " ";
file << endl;
/* If the sequences end has not been reached, print black spaces
* to follow format specifications */
if((j + 50) < residNumber)
file << setw(maxLongName + 3) << " ";
}
/* Print a blank line to mark sequences separation */
file << endl;
}
/* Deallocate local memory */
delete [] tmpMatrix;
}
void alignment::alignmentPhylip_PamlToFile(ostream &file) {
/* Generate output alignment in PHYLIP format compatible with PAML program */
int i, maxLongName;
string *tmpMatrix;
/* Check whether sequences in the alignment are aligned or not.
* Warn about it if there are not aligned. */
if (!isAligned) {
cerr << endl << "ERROR: Sequences are not aligned. Format (PHYLIP) "
<< "not compatible with unaligned sequences." << endl << endl;
return ;
}
/* Allocate local memory for generating output alignment */
tmpMatrix = new string[sequenNumber];
/* Depending on alignment orientation: forward or reverse. Copy directly
* sequence information or get firstly the reversed sequences and then
* copy it into local memory */
for(i = 0; i < sequenNumber; i++)
tmpMatrix[i] = (!reverse) ? sequences[i] : utils::getReverse(sequences[i]);
/* Depending on if short name flag is activated (limits sequence name up to
* 10 characters) or not, get maximum sequence name length */
maxLongName = PHYLIPDISTANCE;
for(i = 0; (i < sequenNumber) && (!shortNames); i++)
maxLongName = utils::max(maxLongName, seqsName[i].size());
/* Generating output alignment */
/* First Line: Sequences Number & Residued Number */
file << " " << sequenNumber << " " << residNumber << endl;
/* Print alignment */
/* Print sequences name follow by the sequence itself in the same line */
for(i = 0; i < sequenNumber; i++)
file << setw(maxLongName + 3) << left << seqsName[i].substr(0, maxLongName)
<< sequences[i] << endl;
file << endl;
/* Deallocate local memory */
delete [] tmpMatrix;
}
void alignment::alignmentClustalToFile(ostream &file) {
/* Generate output alignment in CLUSTAL format */
int i, j, maxLongName = 0;
string *tmpMatrix;
/* Check whether sequences in the alignment are aligned or not.
* Warn about it if there are not aligned. */
if (!isAligned) {
cerr << endl << "ERROR: Sequences are not aligned. Format (CLUSTAL) "
<< "not compatible with unaligned sequences." << endl << endl;
return ;
}
/* Allocate local memory for generating output alignment */
tmpMatrix = new string[sequenNumber];
/* Depending on alignment orientation: forward or reverse. Copy directly
* sequence information or get firstly the reversed sequences and then
* copy it into local memory */
for(i = 0; i < sequenNumber; i++)
tmpMatrix[i] = (!reverse) ? sequences[i] : utils::getReverse(sequences[i]);
/* Compute maximum sequences name length */
for(i = 0; (i < sequenNumber) && (!shortNames); i++)
maxLongName = utils::max(maxLongName, seqsName[i].size());
/* Print alignment header */
if((aligInfo.size() != 0) && (iformat == oformat))
file << aligInfo << endl << endl;
else
file << "CLUSTAL multiple sequence alignment" << endl << endl;
/* Print alignment itself */
/* Print as many blocks as it is needed of lines composed
* by sequences name and 60 residues */
for(j = 0; j < residNumber; j += 60) {
for(i = 0; i < sequenNumber; i++)
file << setw(maxLongName + 5) << left << seqsName[i]
<< tmpMatrix[i].substr(j, 60) << endl;
file << endl << endl;
}
/* Deallocate local memory */
delete [] tmpMatrix;
}
void alignment::alignmentFastaToFile(ostream &file) {
/* Generate output alignment in FASTA format. Sequences can be unaligned. */
int i, j, maxLongName;
string *tmpMatrix;
/* Allocate local memory for generating output alignment */
tmpMatrix = new string[sequenNumber];
/* Depending on alignment orientation: forward or reverse. Copy directly
* sequence information or get firstly the reversed sequences and then
* copy it into local memory */
for(i = 0; i < sequenNumber; i++)
tmpMatrix[i] = (!reverse) ? sequences[i] : utils::getReverse(sequences[i]);
/* Depending on if short name flag is activated (limits sequence name up to
* 10 characters) or not, get maximum sequence name length. Consider those
* cases when the user has asked to keep original sequence header */
maxLongName = 0;
for(i = 0; i < sequenNumber; i++)
if (!keepHeader)
maxLongName = utils::max(maxLongName, seqsName[i].size());
else if (seqsInfo != NULL)
maxLongName = utils::max(maxLongName, seqsInfo[i].size());
if (shortNames && maxLongName > PHYLIPDISTANCE) {
maxLongName = PHYLIPDISTANCE;
if (keepHeader)
cerr << endl << "WARNING: Original sequence header will be cut by charac"
<< "ter 10" << endl;
}
/* Print alignment. First, sequences name id and then the sequences itself */
for(i = 0; i < sequenNumber; i++) {
if (!keepHeader)
file << ">" << seqsName[i].substr(0, maxLongName) << endl;
else if (seqsInfo != NULL)
file << ">" << seqsInfo[i].substr(0, maxLongName) << endl;
for(j = 0; j < residuesNumber[i]; j+= 60)
file << tmpMatrix[i].substr(j, 60) << endl;
}
/* Deallocate local memory */
delete [] tmpMatrix;
}
void alignment::alignmentNexusToFile(ostream &file) {
/* Generate output alignment in NEXUS format setting only alignment block */
int i, j, k, maxLongName = 0;
string *tmpMatrix;
/* Check whether sequences in the alignment are aligned or not.
* Warn about it if there are not aligned. */
if (!isAligned) {
cerr << endl << "ERROR: Sequences are not aligned. Format (NEXUS) "
<< "not compatible with unaligned sequences." << endl << endl;
return ;
}
/* Allocate local memory for generating output alignment */
tmpMatrix = new string[sequenNumber];
/* Depending on alignment orientation: forward or reverse. Copy directly
* sequence information or get firstly the reversed sequences and then
* copy it into local memory */
for(i = 0; i < sequenNumber; i++)
tmpMatrix[i] = (!reverse) ? sequences[i] : utils::getReverse(sequences[i]);
/* Compute maximum sequences name length */
for(i = 0; (i < sequenNumber) && (!shortNames); i++)
maxLongName = utils::max(maxLongName, seqsName[i].size());
/* Compute output file datatype */
getTypeAlignment();
/* Remove characters like ";" from input alignment information line */
while((int) aligInfo.find(";") != (int) string::npos)
aligInfo.erase(aligInfo.find(";"), 1);
/* Print Alignment header */
file << "#NEXUS" << endl << "BEGIN DATA;" << endl << " DIMENSIONS NTAX="
<< sequenNumber << " NCHAR=" << residNumber <<";" << endl;
/* Print alignment datatype */
if ((dataType == DNAType) || (dataType == DNADeg))
file << "FORMAT DATATYPE=DNA INTERLEAVE=yes GAP=-";
else if ((dataType == RNAType) || (dataType == RNADeg))
file << "FORMAT DATATYPE=RNA INTERLEAVE=yes GAP=-";
else if (dataType == AAType)
file << "FORMAT DATATYPE=PROTEIN INTERLEAVE=yes GAP=-";
i = 0;
/* Using information from input alignment. Use only some tags. */
while((j = aligInfo.find(" ", i)) != (int) string::npos) {
if((aligInfo.substr(i, j - i)).compare(0, 7, "MISSING") == 0 ||
(aligInfo.substr(i, j)).compare(0, 7, "missing") == 0)
file << " " << (aligInfo.substr(i, j - i));
else if((aligInfo.substr(i, j)).compare(0, 9, "MATCHCHAR") == 0 ||
(aligInfo.substr(i, j)).compare(0, 9, "matchchar") == 0)
file << " " << (aligInfo.substr(i, j - i));
i = j + 1;
}
file << ";" << endl;
/* Print sequence name and sequence length */
for(i = 0; i < sequenNumber; i++)
file << "[Name: " << setw(maxLongName + 4) << left << seqsName[i] << "Len: "
<< residNumber << "]" << endl;
file << endl << "MATRIX" << endl;
/* Print alignment itself. Sequence name and 50 residues blocks */
for(j = 0; j < residNumber; j += 50) {
for(i = 0; i < sequenNumber; i++) {
file << setw(maxLongName + 4) << left << seqsName[i];
for(k = j; k < (j + 50) && k < residNumber; k += 10)
file << " " << sequences[i].substr(k, 10);
file << endl;
}
file << endl;
}
file << ";" << endl << "END;" << endl;
/* Deallocate local memory */
delete [] tmpMatrix;
}
void alignment::alignmentMegaToFile(ostream &file) {
/* Generate output alignment in MEGA format */
int i, j, k;
string *tmpMatrix;
/* Check whether sequences in the alignment are aligned or not.
* Warn about it if there are not aligned. */
if (!isAligned) {
cerr << endl << "ERROR: Sequences are not aligned. Format (MEGA) "
<< "not compatible with unaligned sequences." << endl << endl;
return ;
}
/* Allocate local memory for generating output alignment */
tmpMatrix = new string[sequenNumber];
/* Depending on alignment orientation: forward or reverse. Copy directly
* sequence information or get firstly the reversed sequences and then
* copy it into local memory */
for(i = 0; i < sequenNumber; i++)
tmpMatrix[i] = (!reverse) ? sequences[i] : utils::getReverse(sequences[i]);
/* Compute output file datatype */
getTypeAlignment();
/* Print output alignment header */
file << "#MEGA" << endl << filename << endl;
/* Print alignment datatype */
if ((dataType == DNAType) || (dataType == DNADeg))
file << "!Format DataType=DNA ";
else if ((dataType == RNAType) || (dataType == RNADeg))
file << "!Format DataType=RNA ";
else if (dataType == AAType)
file << "!Format DataType=protein ";
/* Print number of sequences and alignment length */
file << "NSeqs=" << sequenNumber << " Nsites=" << residNumber
<< " indel=- CodeTable=Standard;" << endl << endl;
/* Print sequences name and sequences divided into blocks of 50 residues */
for(i = 0; i < sequenNumber; i++) {
file << "#" << seqsName[i] << endl;
for(j = 0; j < residNumber; j += 50) {
for(k = j; ((k < residNumber) && (k < j + 50)); k += 10)
file << tmpMatrix[i].substr(k, 10) << " ";
file << endl;
}
file << endl;
}
/* Deallocate local memory */
delete [] tmpMatrix;
}
void alignment::alignmentNBRF_PirToFile(ostream &file) {
/* Generate output alignment in NBRF/PIR format. Sequences can be unaligned */
int i, j, k;
string alg_datatype, *tmpMatrix;
/* Allocate local memory for generating output alignment */
tmpMatrix = new string[sequenNumber];
/* Depending on alignment orientation: forward or reverse. Copy directly
* sequence information or get firstly the reversed sequences and then
* copy it into local memory */
for(i = 0; i < sequenNumber; i++)
tmpMatrix[i] = (!reverse) ? sequences[i] : utils::getReverse(sequences[i]);
/* Compute output file datatype */
getTypeAlignment();
if ((dataType == DNAType) || (dataType == DNADeg))
alg_datatype = "DL";
else if ((dataType == RNAType) || (dataType == RNADeg))
alg_datatype = "RL";
else if (dataType == AAType)
alg_datatype = "P1";
/* Print alignment */
for(i = 0; i < sequenNumber; i++) {
/* Print sequence datatype and its name */
if((seqsInfo != NULL) && (iformat == oformat))
file << ">" << seqsInfo[i].substr(0, 2) << ";" << seqsName[i]
<< endl << seqsInfo[i].substr(2) << endl;
else
file << ">" << alg_datatype << ";" << seqsName[i] << endl
<< seqsName[i] << " " << residuesNumber[i] << " bases" << endl;
/* Write the sequence */
for(j = 0; j < residuesNumber[i]; j += 50) {
for(k = j; (k < residuesNumber[i]) && (k < (j + 50)); k += 10)
file << " " << tmpMatrix[i].substr(k, 10);
if(k >= residuesNumber[i]) {
if((residuesNumber[i] % 50) == 0)
file << endl << " ";
else if((residuesNumber[i] % 10) == 0)
file << " ";
file << "*";
}
file << endl;
}
file << endl;
}
/* Deallocate local memory */
delete [] tmpMatrix;
}
bool alignment::alignmentSummaryHTML(char *destFile, int residues, int seqs, \
int *selectedRes, int *selectedSeq, float *consValues) {
/* Generate an HTML file with a visual summary about which sequences/columns
* have been selected and which have not */
int i, j, k, kj, upper, minHTML, maxLongName, *gapsValues;
string tmpColumn;
float *simValues;
bool *res, *seq;
ofstream file;
char type;
/* Allocate some local memory */
tmpColumn.reserve(sequenNumber);
/* Check whether sequences in the alignment are aligned or not.
* Warn about it if there are not aligned. */
if (!isAligned) {
cerr << endl << "ERROR: Sequences are not aligned." << endl << endl;
return false;
}
/* Open output file and check that file pointer is valid */
file.open(destFile);
if(!file)
return false;
/* Compute maximum sequences name length. */
maxLongName = 0;
for(i = 0; i < sequenNumber; i++)
maxLongName = utils::max(maxLongName, seqsName[i].size());
/* Compute HTML blank spaces */
minHTML = utils::max(25, maxLongName + 10);
/* Initialize local variables to control which columns/sequences
* will be kept in the output alignment */
res = new bool[residNumber];
for(i = 0; i < residNumber; i++)
res[i] = false;
seq = new bool[sequenNumber];
for(i = 0; i < sequenNumber; i++)
seq[i] = false;
/* Record which columns/sequences from original alignment
* have been kept in the final one */
for(i = 0; i < residues; i++)
res[selectedRes[i]] = true;
for(i = 0; i < seqs; i++)
seq[selectedSeq[i]] = true;
/* Recover some stats about different scores from current alignment */
gapsValues = NULL;
if (sgaps != NULL)
gapsValues = sgaps -> getGapsWindow();
simValues = NULL;
if (scons != NULL)
simValues = scons -> getMdkwVector();
/* Print HTML header into output file */
file << "" << endl << "" << endl << " "
<< endl << " trimAl v1.4 Summary" << endl
<< " \n \n\n" << " \n" << " " << endl;
/* Show information about how many sequences/residues have been selected */
file << " Selected Sequences: " << setw(5) << right << seqs
<<" /Selected Residues: " << setw(7) << right << residues << ""
<< endl << " Deleted Sequences: " << setw(5) << right
<< sequenNumber - seqs << " /Deleted Residues: " << setw(7) << right
<< residNumber - residues << "" << endl;
/* Print headers for different scores derived from input alignment/s */
if (gapsValues != NULL)
file << endl << setw(minHTML) << left << " Gaps Scores: "
<< " =0= <.001 "
<< " <.050 <.100 "
<< " <.150 <.200 "
<< " <.250 <.350 "
<< " <.500 <.750 "
<< " <1.00 =1= ";
if (simValues != NULL)
file << endl << setw(minHTML) << left << " Similarity Scores: "
<< " =0= <1e-6 "
<< " <1e-5 <1e-4 "
<< " <.001 <.010 "
<< " <.100 <.250 "
<< " <.500 <.750 "
<< " <1.00 =1= ";
if (consValues != NULL)
file << endl << setw(minHTML) << left << " Consistency Scores: "
<< " =0= <.001 "
<< " <.050 <.100 "
<< " <.150 <.200 "
<< " <.250 <.350 "
<< " <.500 <.750 "
<< " <1.00 =1= ";
if ((gapsValues != NULL) or (simValues == NULL) or (consValues == NULL))
file << endl;
/* Print Sequences in block of BLOCK_SIZE */
for(j = 0, upper = HTMLBLOCKS; j < residNumber; j += HTMLBLOCKS, upper += \
HTMLBLOCKS) {
/* Print main columns number */
file << endl << setw(minHTML + 10) << right << (j + 10);
for(i = j + 20; ((i <= residNumber) && (i <= upper)); i += 10)
file << setw(10) << right << (i);
/* Print special characters to delimit sequences blocks */
file << endl << setw(minHTML + 1) << right;
for(i = j + 1; ((i <= residNumber) && (i <= upper)); i++)
file << (!(i % 10) ? "+" : "=");
file << endl;
/* Print sequences name */
for(i = 0; i < sequenNumber; i++) {
file << " " : "nsel>") << seqsName[i]
<< "" << setw(minHTML - 4 - seqsName[i].size()) << right << "";
/* Print residues corresponding to current sequences block */
for(k = j; ((k < residNumber) && (k < upper)); k++) {
for(kj = 0, tmpColumn.clear(); kj < sequenNumber; kj++)
tmpColumn += sequences[kj][k];
/* Determine residue color based on residues across the alig column */
type = utils::determineColor(sequences[i][k], tmpColumn);
if (type == 'w')
file << sequences[i][k];
else
file << "" << sequences[i][k] << "";
}
file << endl;
}
file << endl << setw(minHTML) << left << " Selected Cols: ";
for(k = j; ((k < residNumber) && (k < (j + HTMLBLOCKS))); k++)
file << " ";
file << endl;
/* If there is not any score to print, skip this part of the function */
if ((gapsValues == NULL) and (simValues == NULL) and (consValues == NULL))
continue;
/* Print score colors according to certain predefined thresholds */
if (gapsValues != NULL) {
file << endl << setw(minHTML) << left << " Gaps Scores: ";
for(k = j; ((k < residNumber) && (k < (j + HTMLBLOCKS))); k++)
if(gapsValues[k] == 0)
file << " ";
else if(gapsValues[k] == sequenNumber)
file << " ";
else if(1 - (float(gapsValues[k])/sequenNumber) >= .750)
file << " ";
else if(1 - (float(gapsValues[k])/sequenNumber) >= .500)
file << " ";
else if(1 - (float(gapsValues[k])/sequenNumber) >= .350)
file << " ";
else if(1 - (float(gapsValues[k])/sequenNumber) >= .250)
file << " ";
else if(1 - (float(gapsValues[k])/sequenNumber) >= .200)
file << " ";
else if(1 - (float(gapsValues[k])/sequenNumber) >= .150)
file << " ";
else if(1 - (float(gapsValues[k])/sequenNumber) >= .100)
file << " ";
else if(1 - (float(gapsValues[k])/sequenNumber) >= .050)
file << " ";
else if(1 - (float(gapsValues[k])/sequenNumber) >= .001)
file << " ";
else
file << " ";
}
if (simValues != NULL) {
file << endl << setw(minHTML) << left << " Similarity Scores: ";
for(k = j; ((k < residNumber) && (k < (j + HTMLBLOCKS))); k++)
if(simValues[k] == 1)
file << " ";
else if(simValues[k] == 0)
file << " ";
else if(simValues[k] >= .750)
file << " ";
else if(simValues[k] >= .500)
file << " ";
else if(simValues[k] >= .250)
file << " ";
else if(simValues[k] >= .100)
file << " ";
else if(simValues[k] >= .010)
file << " ";
else if(simValues[k] >= .001)
file << " ";
else if(simValues[k] >= 1e-4)
file << " ";
else if(simValues[k] >= 1e-5)
file << " ";
else if(simValues[k] >= 1e-6)
file << " ";
else
file << " ";
}
if (consValues != NULL) {
file << endl << setw(minHTML) << left << " Consistency Scores: ";
for(k = j; ((k < residNumber) && (k < (j + HTMLBLOCKS))); k++)
if(consValues[k] == 1)
file << " ";
else if(consValues[k] == 0)
file << " ";
else if(consValues[k] >= .750)
file << " ";
else if(consValues[k] >= .500)
file << " ";
else if(consValues[k] >= .350)
file << " ";
else if(consValues[k] >= .250)
file << " ";
else if(consValues[k] >= .200)
file << " ";
else if(consValues[k] >= .150)
file << " ";
else if(consValues[k] >= .100)
file << " ";
else if(consValues[k] >= .050)
file << " ";
else if(consValues[k] >= .001)
file << " ";
else
file << " ";
}
file << endl;
}
/* Print HTML footer into output file */
file << " " << endl << " " << endl << "" << endl;
/* Close output file and deallocate local memory */
file.close();
delete [] seq;
delete [] res;
return true;
}
bool alignment::alignmentColourHTML(ostream &file) {
int i, j, kj, upper, k = 0, maxLongName = 0;
string tmpColumn;
char type;
/* Allocate some local memory */
tmpColumn.reserve(sequenNumber);
/* Check whether sequences in the alignment are aligned or not.
* Warn about it if there are not aligned. */
if (!isAligned) {
cerr << endl << "ERROR: Sequences are not aligned." << endl << endl;
return false;
}
/* Compute maximum sequences name length */
maxLongName = 0;
for(i = 0; i < sequenNumber; i++)
maxLongName = utils::max(maxLongName, seqsName[i].size());
/* Print HTML header into output file */
file << "" << endl << "" << endl << " "
<< endl << " readAl v1.4" << endl
<< " \n \n\n" << " \n " << endl;
/* Print sequences colored according to CLUSTAL scheme based on
* physical-chemical properties */
for(j = 0, upper = HTMLBLOCKS; j < residNumber; j += HTMLBLOCKS, upper += \
HTMLBLOCKS) {
file << endl;
/* Print main columns number */
file << setw(maxLongName + 19) << right << (j + 10);
for(i = j + 20; ((i <= residNumber) && (i <= upper)); i += 10)
file << setw(10) << right << i;
/* Print special characters to delimit sequences blocks */
file << endl << setw(maxLongName + 10);
for(i = j + 1; ((i <= residNumber) && (i <= upper)); i++)
file << (!(i % 10) ? "+" : "=");
/* Print sequences themselves */
for(i = 0; i < sequenNumber; i++) {
/* Print sequences name */
file << endl << setw(maxLongName + 9) << left << seqsName[i];
/* Print residues corresponding to current sequences block */
for(k = j; ((k < residNumber) && (k < upper)); k++) {
for(kj = 0, tmpColumn.clear(); kj < sequenNumber; kj++)
tmpColumn += sequences[kj][k];
/* Determine residue color based on residues across the alig column */
type = utils::determineColor(sequences[i][k], tmpColumn);
if (type == 'w')
file << sequences[i][k];
else
file << "" << sequences[i][k] << "";
}
}
file << endl;
}
/* Print HTML footer into output file */
file << " " << endl << " " << endl << "" << endl;
return true;
}
void alignment::printAlignmentInfo(ostream &file) {
/* Print information about sequences number, average sequence length, maximum
* and minimum sequences length, etc */
int i, j, valid_res, max, min, max_pos, min_pos, total_res;
/* Storage which sequences are the longest and shortest ones */
max = 0;
max_pos = 0;
min_pos = 0;
min = residuesNumber[0];
for(i = 0, total_res = 0; i < sequenNumber; i++) {
/* Discard gaps from current sequence and then compute real length */
for(j = 0, valid_res = 0; j < residuesNumber[i]; j++)
valid_res += (sequences[i][j] != '-' ? 1 : 0);
/* Compute the total residues in the alignment to calculate avg. sequence
* length */
total_res += valid_res;
/* Get values for the longest sequence */
max_pos = (max > valid_res) ? max_pos : i;
max = (max > valid_res) ? max : valid_res;
/* Similarily, get values for the shortest sequence */
min_pos = (min < valid_res) ? min_pos : i;
min = (min < valid_res) ? min : valid_res;
}
file << "## Total sequences\t" << sequenNumber << endl;
if (isFileAligned())
file << "## Alignment length\t" << residNumber << endl;
file << "## Avg. sequence length\t" << (float) total_res / sequenNumber << endl
<< "## Longest seq. name\t'" << seqsName[max_pos] << "'" << endl
<< "## Longest seq. length\t" << max << endl
<< "## Shortest seq. name\t'" << seqsName[min_pos] << "'" << endl
<< "## Shortest seq. length\t" << min << endl;
}