EukPhylo/PTL1/Transcriptomes/Scripts/5_GCodeTranslate.py
2024-01-26 10:39:33 -05:00

761 lines
33 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Last updated Sept 2023
# Authors: Xyrus Maurer-Alcala and Auden Cote-L'Heureux
# This script is intended to translate nucleotide sequences. It does this using
# the gcode_output.tsv file output by script 4 and containing in-frame stop codon
# frequencies. The user can use this stop codon information to fill in the last
# column in this file with the genetic code for each taxon. If the user input a
# genetic code or list of genetic codes to script 1, then the gcode_output.tsv will
# be filled automatically. sequences are translated using the Diamond BLASTp results
# from OG assignment as a starting point for determining coding sequence boundaries.
# The first in-frame start codon (if the 5 boundary of the BLASTp hit is not at a start codon)
# and last in-frame stop codon (using the assigned genetic code) outside of these bounds
# are found, while ensuring that in-frame stop codons are not introduced (given the nature
# of transcriptomic data, poor genetic code assignment or low-quality/partial data can
# interfere with this process).
# This script is intended to be run using the wrapper.py as part of the PhyloToL 6 Part 1
# pipeline. It requires that the setup of the 'Output' folder be that as output by script 4
# of this pipeline.
#Dependencies
import argparse, os, re, sys
from argparse import RawTextHelpFormatter,SUPPRESS
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Data.CodonTable import CodonTable
#-------------------------- Set-up Codon Tables (Genetic Codes) --------------------------#
blepharisma_table = CodonTable(forward_table={
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
'TAT': 'Y', 'TAC': 'Y',
'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
start_codons = [ 'ATG'],
stop_codons = ['TAA','TAG'])
condylostoma_table = CodonTable(forward_table={
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Q', 'TAG': 'Q',
'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
start_codons = [ 'ATG'],
stop_codons = [''])
c_uncinata_table = CodonTable(forward_table={
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
'TAT': 'Y', 'TAC': 'Y', 'TAG': 'Q',
'TGT': 'C', 'TGC': 'C', 'TGA': 'Q', 'TGG': 'W',
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
start_codons = [ 'ATG'],
stop_codons = ['TAA'])
euplotes_table = CodonTable(forward_table={
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
'TAT': 'Y', 'TAC': 'Y',
'TGT': 'C', 'TGC': 'C', 'TGA': 'C', 'TGG': 'W',
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
start_codons = [ 'ATG'],
stop_codons = ['TAA','TAG'])
myrionecta_table = CodonTable(forward_table={
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Y', 'TAG': 'Y',
'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
start_codons = [ 'ATG'],
stop_codons = ['TGA'])
no_stop_table = CodonTable(forward_table={
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'X', 'TAG': 'X',
'TGT': 'C', 'TGC': 'C', 'TGA': 'X', 'TGG': 'W',
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
start_codons = [ 'ATG'],
stop_codons = [''])
peritrich_table = CodonTable(forward_table={
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'E', 'TAG': 'E',
'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
start_codons = [ 'ATG'],
stop_codons = ['TGA'])
tag_table = CodonTable(forward_table={
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Q',
'TGT': 'C', 'TGC': 'C', 'TGA': 'Q', 'TGG': 'W',
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
start_codons = [ 'ATG'],
stop_codons = ['TAG'])
#------------------------------ Colors For Print Statements ------------------------------#
class color:
PURPLE = '\033[95m'
CYAN = '\033[96m'
DARKCYAN = '\033[36m'
ORANGE = '\033[38;5;214m'
PURPLE = '\033[94m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
END = '\033[0m'
#------------------------------- Main Functions of Script --------------------------------#
###########################################################################################
###------------------------- Checks the Command Line Arguments -------------------------###
###########################################################################################
def check_args():
parser = argparse.ArgumentParser(description=
color.BOLD + '\n\nThis script will '+color.RED+'Translate '+color.END+color.BOLD+'a '\
'given Fasta file of CDS\nsequences using a given'+color.PURPLE+' Genetic Code.'+color.END+\
color.BOLD+usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter)
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
required_arg_group.add_argument('--input_file','-in', action='store',
help=color.BOLD+color.GREEN+' Fasta file with CDSs\n'+color.END)
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
optional_arg_group.add_argument('--genetic_code','-g', action='store', default='universal',
help=color.BOLD+color.GREEN+' Genetic code to use for translation\n (default = '\
'"universal")\n'+color.END)
optional_arg_group.add_argument('--no_RP','-no_RP', action='store_true',
help=color.BOLD+color.GREEN+' Allows files to "skip" the removal\n of Partial Transcripts\n'\
+color.END)
optional_arg_group.add_argument('--list_codes','-codes', action='store_true',
help=color.BOLD+color.GREEN+' Lists supported genetic codes\n'+color.END)
optional_arg_group.add_argument('-author', action='store_true',
help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
if len(sys.argv[1:]) == 0:
print (parser.description)
print ('\n')
sys.exit()
args = parser.parse_args()
quit_eval = return_more_info(args)
if quit_eval > 0:
sys.exit()
### Adding in names to 'arg' class for more easy use throughout the script
args.ntd_out = args.input_file.split('.fas')[0]+'_'+args.genetic_code.title()+'_NTD.ORF.fasta'
args.aa_out = args.input_file.split('.fas')[0]+'_'+args.genetic_code.title()+'_AA.ORF.fasta'
args.tsv_out = args.input_file.split('.fas')[0]+'_'+args.genetic_code.title()+'_allOGCleanresults.tsv'
args.home_folder = '/'.join(args.input_file.split('/')[:-1])
args.Diamond_Folder = args.home_folder+'/DiamondOG'
args.StopFreq = args.home_folder+'/StopCodonFreq'
args.all_output_folder = '/'.join(args.input_file.split('/')[:-2]) + '/'
args.tsv_file = args.input_file.split('.fas')[0]+ '_allOGCleanresults.tsv'
return args
###########################################################################################
###------------------------------- Script Usage Message --------------------------------###
###########################################################################################
def usage_msg():
return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 5g_GCodeTranslate.py'\
' --input_file ../Stentor_coeruleus.WGS.CDS.Prep/Stentor_coeruleus.WGS.CDS.Renamed.fasta'\
' --genetic_code Universal'+color.END)
##########################################################################################
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
##########################################################################################
def return_more_info(args):
valid_arg = 0
supported_gcodes_names = ['bleph','blepharisma','chilo','chilodonella','condy',\
'condylostoma','none','eup','euplotes','peritrich','vorticella','ciliate','universal',\
'taa','tag','tga','mesodinium']
supported_gcodes_list = ['Blepharisma\t(TGA = W)','Chilodonella\t(TAG/TGA = Q)','Ciliate\t\t(TAR = Q)',\
'Condylostoma\t(TAR = Q, TGA = W)','Euplotes\t(TGA = C)','Peritrich\t(TAR = E)','None\t\t(TGA/TAG/TAA = X)',\
'Universal\t(TGA/TAG/TAA = STOP)','TAA\t\t(TAG/TGA = Q)', 'TAG\t\t(TRA = Q)', 'TGA\t\t(TAR = Q)']
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
' maurerax@gmail.com\n\n'+color.END)
if args.genetic_code != None and args.genetic_code.lower() not in supported_gcodes_names:
print (color.BOLD+color.RED+'\nProvided genetic code is currently unsupported.\n\n'\
'If you have a new genetic code, please contact the author (with some evidence).\n\n'\
'Otherwise, use one of the currently supported genetic codes.\n'+color.END)
print (color.BOLD+color.ORANGE+'\n'.join(supported_gcodes_list)+'\n\n'+color.END)
print (author)
valid_arg += 1
else:
if args.list_codes == True:
print (color.BOLD+color.RED+'\nThese are the currently supported genetic codes.\n'+color.END)
print (color.BOLD+color.ORANGE+'\n'.join(supported_gcodes_list)+'\n\n'+color.END)
valid_arg += 1
if args.author == True:
print (author)
valid_arg += 1
if args.input_file != None:
if os.path.isfile(args.input_file) != False:
if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
valid_arg += 1
elif args.input_file.endswith('WTA_EPU.Renamed.fasta') != True:
print (color.BOLD+'\n\nInvalid Fasta File! Only Fasta Files that were processed'\
' with '+color.GREEN+'3_CountOGsDiamond.py '+color.END+color.BOLD+'are valid\n\n'\
'However, to bypass that issue, Fasta Files MUST end with '+color.CYAN+\
'"WTA_EPU.Renamed.fasta"\n\n'+color.END)
valid_arg += 1
else:
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
valid_arg += 1
return valid_arg
###########################################################################################
###--------------------------- Does the Inital Folder Prep -----------------------------###
###########################################################################################
def prep_folders(args):
OG_folder = '/'.join(args.input_file.split('/')[:-1]) + '/DiamondOG/'
if os.path.isdir(OG_folder) != True:
os.system('mkdir '+OG_folder)
if os.path.isdir(args.all_output_folder + 'TranslatedTranscriptomes') != True:
os.system('mkdir ' + args.all_output_folder + 'TranslatedTranscriptomes')
##########################################################################################
###---------------- Scans 5-Prime End of Transcript for In-Frame "ATG" ----------------###
##########################################################################################
def check_new_start_new(some_seq, low_lim, upper_lim, old_start, codon_table):
## Looks for in-frame STOP codons in the UTR of the transcript
prime5 = str(Seq(some_seq[low_lim:upper_lim]).translate(table=codon_table)).replace('*','x')
in_frame_stops = [stops.start() for stops in re.finditer('x',prime5)]
## Looks for in-frame START codons in the UTR of the transcript
in_frame_starts = [starts.start() for starts in re.finditer('M',prime5)]
## Checks that there are NO in-frame STOP codons between the possible "new" START codon
## and the aligned portion of the transcript -- THIS is double checked!
if len(in_frame_starts) != 0:
if len(in_frame_stops) != 0:
if in_frame_stops[-1] < in_frame_starts[-1]:
new_start = low_lim+in_frame_starts[-1]*3
else:
new_start = old_start
else:
new_start = low_lim+in_frame_starts[-1]*3
else:
new_start = old_start
## Skips the double-checking if there are no GOOD potential START codons
if new_start == old_start:
updated_start = old_start
else:
## Double checks that there are NO IN-FRAME stop codons between the NEW-SUGGESTED Start
## position and the OLD-SUPPORTED stop position!
between_new_old_start = str(Seq(some_seq[new_start:old_start]).translate(table=1)).replace('*','x')
in_frame_stops_check = [stops.start() for stops in re.finditer('x',between_new_old_start)]
in_frame_starts_check = [starts.start() for starts in re.finditer('M',between_new_old_start)]
if len(in_frame_starts_check) != 0:
if len(in_frame_stops_check) != 0:
if in_frame_stops_check[-1] < in_frame_starts_check[-1]:
updated_start = new_start+in_frame_starts_check[-1]*3
else:
updated_start = old_start
else:
updated_start = new_start
else:
updated_start = new_start
return updated_start
##########################################################################################
###--------------- Extracts the ORF from the Fasta File and SpreadSheet ---------------###
##########################################################################################
def extract_ORF(prot_dict, codon_table, args):
print (color.BOLD+'\n\nExtracting '+color.PURPLE+'ORFs'+color.END+color.BOLD+' from'\
' the transcriptomic data-set\n\n'+color.END)
for k, v in prot_dict.items():
## Attempting to find the most-likely START (ATG) position in the transcript (tricky)
## Skips this if the initial Methionine (ATG) is likely present
## (e.g. the alignment position of the protein = '1')
prot_start = int(v[3].split('..')[0])
old_start = v[1]
if prot_start != 1:
min_dist, max_dist = round_down_three(prot_start)
min_start = old_start-min_dist
max_start = old_start-max_dist
if min_start < 0:
min_start = old_start
if max_start < 0:
max_start = min_start%3
# print k+'\tOld_start\t'+str(old_start)+'\tMin_Dist/Start\t'+str(min_dist)+'/'+str(min_start)+'\tMax_Dist/Start\t'+str(max_dist)+'/'+str(max_start)+'\n'
updated_start = check_new_start_new(v[-1], max_start, min_start, old_start, codon_table)
else:
updated_start = old_start
temp = prot_dict[k][-1][updated_start:]
## Uses the given genetic code to identify the stop position of the ORF
temp_prot = str(Seq(temp).translate(table=codon_table))
if '*' in temp_prot:
stop_pos = (temp_prot.index('*')+1)*3
prot_dict[k].append(temp[:stop_pos])
else:
stop_pos = prot_dict[k][2] - prot_dict[k][1]
prot_dict[k].append(temp[:stop_pos])
## Awkward_list is populated with unexpectedly SHORT ORFs!
## Reasons for being short include:
# An error Xyrus introduced
# Not as great genetic code decision (in-frame stop)
# Crummy sequence/assembly quality (false in-frame stop codons)
awkward_list = []
look_good = []
for k, v in prot_dict.items():
expected_min = len(v[-2][v[1]:v[2]])-1
if len(v[-1]) < expected_min:
awkward_list.append(k)
else:
look_good.append(k)
if len(awkward_list) != 0:
with open('ShortTranscripts_FromTranslation.txt','w+') as x:
for entry in awkward_list:
x.write(entry+'\n')
else:
pass
print (color.BOLD+'\n\nTranslating '+color.PURPLE+'ORFs'+color.END+color.BOLD+' from'\
' using the '+color.DARKCYAN+args.genetic_code.title()+' genetic code'+color.END)
for k, v in prot_dict.items():
prot_dict[k].append(str(Seq(v[-1]).translate(table=codon_table)).rstrip('*'))
return prot_dict
##########################################################################################
###------------ Grabs the Coding Coordinates from the OG-BLAST SpreadSheet ------------###
##########################################################################################
def prep_translations(args):
print (color.BOLD+'\n\nGrabbing useful info from the '+color.ORANGE+args.input_file\
.split('/')[-1]+color.END+color.BOLD+' Fasta File\nand from the '+color.ORANGE+args.tsv_file\
.split('/')[-1]+color.END+color.BOLD+' OG-Assignment Spreadsheet'+color.END)
inTSV = ['\t'.join(i.rstrip('\n').split('\t')[:-1]) for i in open(args.tsv_file).readlines() if i != '\n']
inFasta = [i for i in SeqIO.parse(args.input_file,'fasta')]
# ORF identification step here, uses the 'allOGCleanresults.tsv file to identify the ORF
prot_dict = {}
# Special scenario! Only for when the genetic code is not particularly useful ...
if args.genetic_code.lower() == 'none' or args.genetic_code.lower() == 'condylostoma' or args.genetic_code.lower() == 'condy':
for i in inTSV:
prot_dict.setdefault(i.split('\t')[0],[])
if int(i.split('\t')[6]) < int(i.split('\t')[7]):
## Saves the Transcript Orientation (Coding vs. Template Strand)
prot_dict[i.split('\t')[0]].append('F')
## Collects initial Start and Stop positions from the BLAST alignment
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[6])-1)
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[7])+3)
## Implied Amino Acid alignment positions (e.g. does the alignment start at the 1st Methionine?)
prot_dict[i.split('\t')[0]].append('..'.join(i.split('\t')[-4:-2]))
if int(i.split('\t')[7]) < int(i.split('\t')[6]):
## Saves the Transcript Orientation (Coding vs. Template Strand)
prot_dict[i.split('\t')[0]].append('RC')
## Collects initial Start and Stop positions from the BLAST alignment
prot_dict[i.split('\t')[0]].append(int(i.split('_Len')[1].split('_')[0])-int(i.split('\t')[6]))
prot_dict[i.split('\t')[0]].append(int(i.split('_Len')[1].split('_')[0])-int(i.split('\t')[7])+1)
## Implied Amino Acid alignment positions (e.g. does the alignment start at the 1st Methionine?)
prot_dict[i.split('\t')[0]].append('..'.join(i.split('\t')[-4:-2]))
## Makes sure that the dictionary has the transcript in the correct orientation
for i in inFasta:
if i.description in prot_dict.keys():
if 'RC' == prot_dict[i.description][0]:
prot_dict[i.description].append(str(i.seq.reverse_complement()))
else:
prot_dict[i.description].append(str(i.seq))
else:
for i in inTSV:
prot_dict.setdefault(i.split('\t')[0],[])
if int(i.split('\t')[6]) < int(i.split('\t')[7]):
## Saves the Transcript Orientation (Coding vs. Template Strand)
prot_dict[i.split('\t')[0]].append('F')
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[6])-1)
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[7])+3)
## Implied Amino Acid alignment positions (e.g. does the alignment start at the 1st Methionine?)
prot_dict[i.split('\t')[0]].append('..'.join(i.split('\t')[-4:-2]))
if int(i.split('\t')[7]) < int(i.split('\t')[6]):
## Saves the Transcript Orientation (Coding vs. Template Strand)
prot_dict[i.split('\t')[0]].append('RC')
## Collects initial Start and Stop positions from the BLAST alignment (but in the "correct" orientation)
prot_dict[i.split('\t')[0]].append(int(i.split('_Len')[1].split('_')[0])-int(i.split('\t')[6]))
prot_dict[i.split('\t')[0]].append(int(i.split('_Len')[1].split('_')[0])-int(i.split('\t')[7])+1)
## Implied Amino Acid alignment positions (e.g. does the alignment start at the 1st Methionine?)
prot_dict[i.split('\t')[0]].append('..'.join(i.split('\t')[-4:-2]))
## Makes sure that the dictionary has the transcript in the correct orientation
for i in inFasta:
if i.description in prot_dict.keys():
if 'RC' == prot_dict[i.description][0]:
prot_dict[i.description].append(str(i.seq.reverse_complement()))
else:
prot_dict[i.description].append(str(i.seq))
return prot_dict
##########################################################################################
###------------------------ Rounds Down Values to Nearest "3" -------------------------###
##########################################################################################
def round_down_three(num):
min_val = int(num*3*.5)-int(num*3*.5)%3
max_val = int(num*6)-int(num*6)%3
return min_val, max_val
##########################################################################################
###--------------------- Makes Translation Steps (Later) Easier -----------------------###
##########################################################################################
def standardize_gcode(given_code):
if given_code == 'ciliate' or given_code == 'tga':
codon_table = 6
elif given_code == 'chilodonella' or given_code == 'chilo' or given_code == 'taa':
codon_table = c_uncinata_table
elif given_code == 'blepharisma' or given_code == 'bleph':
codon_table = blepharisma_table
elif given_code == 'euplotes' or given_code == 'eup':
codon_table = euplotes_table
elif given_code == 'myrionecta' or given_code == 'mesodinium':
codon_table = myrionecta_table
elif given_code == 'peritrich' or given_code == 'vorticella':
codon_table = peritrich_table
elif given_code == 'none':
codon_table = no_stop_table
elif given_code == 'condylostoma' or given_code == 'condy':
codon_table = condylostoma_table
elif given_code == 'tag':
codon_table = tag_table
elif given_code == 'universal':
codon_table = 1
else:
print (color.BOLD+color.RED+'\n\nNo valid genetic code provided!\n\n'+color.END+\
color.BOLD+'Using the "Universal" genetic code (by default)\n\nPlease check that the'\
' code you wish to use is supported:'+color.CYAN+'\n\npython 5_GCodeTranslate.py'\
' -list_codes\n\n'+color.END)
codon_table = 1
return codon_table
###########################################################################################
###------------------ Updates Spreadsheet with Updated Contig Names --------------------###
###########################################################################################
def update_spreadsheet(args, updated_spreadsheet_dict):
if os.path.isdir(args.home_folder + '/DiamondOG/') != True:
os.system(args.home_folder + '/DiamondOG/')
else:
pass
inTSV = [line.rstrip('\n') for line in open(args.tsv_file).readlines() if line != '\n' and line.split('\t')[0] in updated_spreadsheet_dict.keys()]
updatedTSV = [updated_spreadsheet_dict[line.split('\t')[0]]+'\t'+'\t'.join(line.split('\t')[1:]) for line in inTSV]
with open(args.tsv_out,'w+') as w:
w.write('\n'.join(updatedTSV))
###########################################################################################
###-------------------- Updates Log With OG Assignment Information ---------------------###
###########################################################################################
def update_log(filename, codon_table):
if os.path.isdir('../PostAssembly_Logs/') != True:
os.system('mkdir ../PostAssembly_Logs/')
else:
pass
ntd_ORF = [i for i in SeqIO.parse(filename.split('.fas')[0]+'_'+gcode.title()+'_ORF.fasta','fasta')]
aa_ORF = [i for i in SeqIO.parse(filename.split('.fas')[0]+'_'+gcode.title()+'_ORF.aa.fasta','fasta')]
min_ntd_ORF = str(min([len(i.seq) for i in ntd_ORF]))
max_ntd_ORF = str(max([len(i.seq) for i in ntd_ORF]))
avg_ntd_ORF = '%.2f' % (sum([len(i.seq) for i in ntd_ORF])/float(len(ntd_ORF)))
min_aa_ORF = str(min([len(i.seq) for i in aa_ORF]))
max_aa_ORF = str(max([len(i.seq) for i in aa_ORF]))
avg_aa_ORF = '%.2f' % (sum([len(i.seq) for i in aa_ORF])/float(len(aa_ORF)))
for Logname in os.listdir(os.curdir+'./PostAssembly_Logs/'):
if Logname.startswith(filename.split('/')[2].split('_WTA')[0]) and Logname.endswith('Log.txt'):
with open('../PostAssembly_Logs/'+Logname,'a') as LogFile:
LogFile.write('Nucleotide ORFs\t'+str(len(ntd_ORF))+'\tn/a\tn/a\n')
LogFile.write('Nucleotide ORF Lengths\t'+avg_ntd_ORF+'\t'+min_ntd_ORF+'\t'+max_ntd_ORF+'\n')
LogFile.write('Protein ORFs\t'+str(len(aa_ORF))+'\tn/a\tn/a\n')
LogFile.write('Protein ORF Lengths\t'+avg_aa_ORF+'\t'+min_aa_ORF+'\t'+max_aa_ORF+'\n')
##########################################################################################
###----------------------- Write File with Provided Genetic Code ----------------------###
##########################################################################################
def write_data_out(prot_dict, codon_table, args):
update_spreadsheet_dict = {}
#The code below only works if rnaspades was used; constrained by addition of script 6b
for k, v in prot_dict.items():
#if 'Cov' in k:
new_name = k.split('_Len')[0]+'_Len'+str(len(v[-2]))+'_'+'_'.join(k.split('_')[-3:])
#update_spreadsheet_dict[k] = new_name
update_spreadsheet_dict[k] = k
#else:
#new_name = k.split('_Len')[0]+'_Len'+str(len(v[-2]))+'_'+'_'.join(k.split('_')[-2:])
#update_spreadsheet_dict[k] = new_name
#update_spreadsheet_dict[k] = k
with open(args.ntd_out,'w+') as w:
print (color.BOLD+'\n\nWriting FASTA file with '+color.PURPLE+'ORF'+color.END+color.BOLD\
+' sequences using the '+color.DARKCYAN+args.genetic_code.title()+' genetic code'+color.END)
for k, v in prot_dict.items():
w.write('>'+update_spreadsheet_dict[k]+'\n'+str(v[-2])+'\n')
with open(args.aa_out, 'w+') as w:
print (color.BOLD+'\n\nWriting FASTA file with '+color.PURPLE+'Translated ORF'+color.END+color.BOLD\
+' sequences using the '+color.DARKCYAN+args.genetic_code.title()+' genetic code'+color.END)
for k, v in prot_dict.items():
w.write('>'+update_spreadsheet_dict[k]+'\n'+str(v[-1])+'\n')
return update_spreadsheet_dict
##########################################################################################
###--------------------- Cleans up the Folder and Moves Final Files -------------------###
##########################################################################################
def clean_up(args):
if args.input_file.split('.fas')[0].split('/')[-1] + '_StopCodonStats.tsv' in os.listdir(args.home_folder):
os.system('mv ' + args.input_file.split('.fas')[0] + '_StopCodonStats.tsv ' + args.StopFreq)
os.system('mv '+args.tsv_file+' '+args.Diamond_Folder)
os.system('mv '+args.input_file+' '+args.Diamond_Folder)
if args.no_RP == True:
if os.path.isdir(args.all_output_folder + 'ToRename/') != True:
os.system('mkdir ' + args.all_output_folder + 'ToRename/')
os.system('cp ' + args.ntd_out + ' ' + args.all_output_folder + 'ToRename/')
os.system('cp ' + args.aa_out + ' ' + args.all_output_folder + 'ToRename/')
os.system('cp ' + args.tsv_out + ' ' + args.all_output_folder + 'ToRename/')
else:
os.system('cp ' + args.tsv_out + ' ' + args.all_output_folder)
os.system('cp ' + args.ntd_out + ' ' + args.all_output_folder)
os.system('cp ' + args.aa_out + ' ' + args.all_output_folder)
os.system('mv ' + args.home_folder + ' ' + args.all_output_folder + 'TranslatedTranscriptomes')
###########################################################################################
###-------------------------------- Next Script Message --------------------------------###
###########################################################################################
def next_script(args):
print (color.BOLD+'\n\nLook for '+color.DARKCYAN+args.ntd_out.split('/')[-1]+color.END+\
color.BOLD+',\n'+color.DARKCYAN+args.aa_out.split('/')[-1]+color.END+color.BOLD+', and\n'\
+color.DARKCYAN+args.tsv_out.split('/')[-1]+color.END+color.BOLD+',\nwhich are in the '+\
color.ORANGE+args.home_folder.split('/')[-1]+' Folder'+color.END)
if args.no_RP == True:
print(color.BOLD+'\n\nNext Script is: '+color.GREEN+'7_FinalRename.py'+color.END+color.BOLD+\
' in the '+color.PURPLE+'RemovePartials Folder'+color.END+color.BOLD+'\nwith a copy of'\
' the outputs of this script!'+color.END)
print(color.BOLD+'\n\nRemember that you have chosen '+color.RED+'NOT '+color.END+color.BOLD+\
'to remove partials\nand are skipping to the renaming step!\n\n'+color.END)
else:
print(color.BOLD+'\n\nNext Script is: '+color.GREEN+'6_FilterPartials.py'+color.END+color.BOLD+\
' in the '+color.PURPLE+'FinalizeTranscripts Folder'+color.END+color.BOLD+'\nwith a copy of'\
' the outputs of this script!\n\n'+color.END)
##########################################################################################
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
##########################################################################################
def main():
args = check_args()
prep_folders(args)
codon_table = standardize_gcode(args.genetic_code.lower())
prot_dict_Prepped = prep_translations(args)
prot_dict_Final = extract_ORF(prot_dict_Prepped, codon_table, args)
new_spreadsheet_names = write_data_out(prot_dict_Final, codon_table, args)
update_spreadsheet(args, new_spreadsheet_names)
# update_log(fasta_file, gcode)
clean_up(args)
next_script(args)
main()