mirror of
http://43.156.76.180:8026/YuuMJ/EukPhylo.git
synced 2025-12-28 04:20:25 +08:00
108 lines
4.1 KiB
Python
108 lines
4.1 KiB
Python
#!/usr/bin/python
#
# 'check_codon_alignments.py'
#
#   Script implemented to analyze the back-translated alignments produced by
#   trimAl. The main idea is to remove those codon columns composed only of
#   'N'/'n', the symbol that indicates indeterminate nucleotides.
#
#   [2014] S. Capella-Gutierrez - scapella@crg.es
#
#   this script is free software: you can redistribute it and/or modify it under
#   the terms of the GNU General Public License as published by the Free
#   Software Foundation, the last available version.
#
#   this script is distributed in the hope that it will be useful, but WITHOUT
#   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
#   FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
#   more details on <http://www.gnu.org/licenses/>
#
|
|
from Bio import AlignIO
|
|
import numpy as np
|
|
import argparse
|
|
import sys
|
|
import os
|
|
|
|
def splitSequence(seq, length=80):
    """Split a sequence held on one line into lines of size ``length``.

    Args:
        seq: The sequence to wrap; it is sliced in steps of ``length``.
        length: Maximum number of characters per output line. Must be
            greater than zero.

    Returns:
        The consecutive ``length``-sized slices of ``seq`` joined with
        newline characters (no trailing newline). An empty ``seq`` yields
        an empty string.

    Raises:
        ValueError: If ``length`` is zero or negative. A negative step would
            otherwise produce an empty range and silently drop the sequence.
    """
    if length <= 0:
        raise ValueError("length must be greater than zero, got %r" % (length,))
    return "\n".join(seq[i:i + length] for i in range(0, len(seq), length))
|
|
|
|
if __name__ == "__main__":
    # Command-line driver: read a codon alignment, find the codons (groups of
    # three consecutive columns) whose non-gap characters are all the
    # indetermination symbol, and write the alignment in FASTA layout without
    # those codons (or, with --complementary, without all the other codons).

    parser = argparse.ArgumentParser()

    parser.add_argument("-i", "--in", dest="inFile", required=True, type=str,
                        help="Input alignment")

    parser.add_argument("-o", "--out", dest="outFile", default=None, type=str,
                        help="Set output file")

    parser.add_argument("-f", "--format", dest="inFormat", default="fasta",
                        type=str,
                        choices=["clustal", "fasta-m10", "fasta",
                                 "phylip-relaxed", "phylip-sequential",
                                 "phylip", "nexus"],
                        help="Set input alignment format")

    parser.add_argument("-g", "--gap_symbol", dest="gapSymbol", default='-',
                        type=str,
                        help="Set the gap symbol used in the input alignment")

    parser.add_argument("--indeter_symbol", dest="indeterSymbol", default='N',
                        type=str,
                        help="Set the indetermination symbol used in the "
                             "alignment")

    parser.add_argument("--keep_header", dest="keepHeader", default=False,
                        action="store_true",
                        help="Keep original alignment sequence IDs indepen"
                             "dently of blank spaces on it")

    parser.add_argument("--complementary", dest="complement", default=False,
                        action="store_true",
                        help="Get the complementary output alignment")

    parser.add_argument("-v", "--verbose", dest="verbose", default=False,
                        action="store_true", help="Activate verbosity")

    args = parser.parse_args()

    if not os.path.isfile(args.inFile):
        sys.exit(("ERROR: Check input alignment file '%s'") % (args.inFile))

    alignment, alignment_length = {}, 0
    for record in AlignIO.read(args.inFile, format=args.inFormat):
        sequence_id = record.id if not args.keepHeader else record.description
        sequence = str(record.seq)
        # setdefault keeps the first sequence stored under a repeated ID.
        alignment.setdefault(sequence_id, sequence)

        ## Check all sequences have the same length
        if alignment_length == 0:
            alignment_length = len(sequence)
        if alignment_length != len(sequence):
            sys.exit("ERROR: Check input alignment. Sequences with different "
                     "lengths")

    ## Check input alignment is multiple of 3
    if (alignment_length % 3) != 0:
        sys.exit("ERROR: Check input alignment. Its length is not multiple "
                 "of 3")

    # Start positions of the codons that will be dropped from the output.
    # A set gives constant-time membership tests when filtering below.
    indetermination_cols = set()
    indet = set([args.indeterSymbol.upper()])
    for pos in range(0, alignment_length, 3):

        onlyIndeter = True
        for col in range(pos, pos + 3):
            # Upper-cased, gap-free set of symbols found in this column.
            column = set([alignment[seq][col].upper() for seq in alignment
                          if alignment[seq][col] != args.gapSymbol])
            # A column counts as indeterminate only when its non-gap symbols
            # are exactly the indetermination symbol; an all-gap column
            # (empty set) therefore does not count.
            if column != indet:
                onlyIndeter = False
                break

        # Normal mode drops the indeterminate codons; complementary mode
        # drops every other codon instead.
        if onlyIndeter != args.complement:
            indetermination_cols.add(pos)

    if args.verbose and indetermination_cols:
        output = ",".join(map(str, sorted(indetermination_cols)))
        sys.stderr.write(("%s\t%s\n") % (args.inFile, output))

    # Codon start positions kept in the output; computed once for all
    # sequences.
    kept_positions = [pos for pos in range(0, alignment_length, 3)
                      if pos not in indetermination_cols]

    ofile = open(args.outFile, "w") if args.outFile else sys.stdout
    try:
        for seq_id in alignment:
            output = "".join([alignment[seq_id][pos:pos + 3]
                              for pos in kept_positions])
            ofile.write((">%s\n%s\n") % (seq_id, splitSequence(output)))
    finally:
        # Only close a file this script opened; leave sys.stdout alone.
        if ofile is not sys.stdout:
            ofile.close()
|