mirror of
http://43.156.76.180:8026/YuuMJ/EukPhylo.git
synced 2025-12-27 03:00:24 +08:00
250 lines
10 KiB
Python
250 lines
10 KiB
Python
# Last updated Sept 2017
|
|
# Author: Xyrus Maurer-Alcala
|
|
|
|
# This script is intended to aid in determining which genetic code to use in translating sequences
|
|
# (done in script 3) by checking stop codons used at the end of the input CDS, as well as determining
|
|
# if there is a significant number of in-frame stop codons (>5%). It outputs the results of this
|
|
# stop-codon analysis into the gcode_output.tsv file, which the user can fill in with desired genetic
|
|
# codes to use in translation. If genetic codes were input originally (using the --genetic_code argument in
|
|
# wrapper.py), then no action is needed here. This script is intended to be run as part of the PhyloToL 6
|
|
# Part 1 pipeline using the script wrapper.py.
|
|
|
|
#Dependencies
|
|
import argparse, os, sys
|
|
from argparse import RawTextHelpFormatter,SUPPRESS
|
|
from Bio import SeqIO
|
|
from Bio.Seq import Seq
|
|
|
|
#----------------------------- Colors For Print Statements ------------------------------#
|
|
class color:
    """ANSI escape sequences used to style the script's terminal messages."""

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Resets every attribute/color set by the codes in this class.
    END = '\033[0m'

    # Foreground colors.
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    ORANGE = '\033[38;5;214m'  # 256-color palette entry 214
|
|
|
|
|
|
#------------------------------- Main Functions of Script --------------------------------#
|
|
|
|
###########################################################################################
|
|
###------------------------- Checks the Command Line Arguments -------------------------###
|
|
###########################################################################################
|
|
|
|
def check_args():
    """Build the command-line parser, validate the arguments and return them.

    When the script is started without any arguments, the parser description
    (which ends with the usage example) is printed and the script exits.
    Otherwise the parsed arguments are passed to ``return_more_info``; a
    result greater than zero ends the script.

    Returns:
        argparse.Namespace: the parsed arguments, extended with ``folder`` --
        the part of ``input_file`` before its last ``/`` (empty string when
        the path contains no ``/``).
    """
    bold, end = color.BOLD, color.END

    description = ''.join([
        bold, '\n\nThis script is intended to aid you with ', color.RED,
        'evaluating\n(or checking) ', end, bold, 'the putative ', color.PURPLE,
        'Genetic Code', end, bold,
        ' for a given\nFasta file of annotated (and untranslated) CDSs.\n\nTo do so, this script',
        ' checks for stop codon usages,\n', color.RED, 'suggesting ', end, bold,
        'the use of', color.PURPLE,
        ' published and well-known\nalternate genetic codes', end, bold,
        ' that are supported by the\nnext script: ', end, bold, color.PURPLE,
        '3g_GCodeTranslate.py', usage_msg(),
    ])

    def group_title(title):
        # Section headings in the help output are orange and bold.
        return color.ORANGE + color.BOLD + title + color.END

    def help_text(text):
        # Per-option help strings are bold green.
        return color.BOLD + color.GREEN + text + color.END

    parser = argparse.ArgumentParser(
        description=description,
        usage=SUPPRESS,
        formatter_class=RawTextHelpFormatter,
    )

    required_arg_group = parser.add_argument_group(group_title('Required Options'))
    required_arg_group.add_argument(
        '--input_file', '-in', action='store',
        help=help_text(' Fasta file with CDSs\n'))

    optional_arg_group = parser.add_argument_group(group_title('Options'))
    optional_arg_group.add_argument(
        '--list_codes', '-codes', action='store_true',
        help=help_text(' Lists supported genetic codes\n'))
    optional_arg_group.add_argument(
        '-author', action='store_true',
        help=help_text(' Prints author contact information\n'))

    # No arguments at all: show the description/usage and stop.
    if not sys.argv[1:]:
        print(parser.description)
        print('\n')
        sys.exit()

    args = parser.parse_args()

    # A positive count means an informational flag was handled or the input
    # file failed validation; either way there is nothing left to do.
    if return_more_info(args) > 0:
        sys.exit()

    # Everything before the last '/' of the input path.
    args.folder = args.input_file.rpartition('/')[0]

    return args
|
|
|
|
|
|
###########################################################################################
|
|
###------------------------------- Script Usage Message --------------------------------###
|
|
###########################################################################################
|
|
|
|
def usage_msg():
    """Return the colorized 'Example usage' text appended to the script description."""
    example_command = (
        ' python 2g_GCodeEval.py'
        ' --input_file ../Stentor_coeruleus.WGS.CDS.Prep/Stentor_coeruleus.WGS.CDS.Renamed.fasta'
    )
    pieces = (
        color.BOLD, color.RED, '\n\nExample usage:',
        color.CYAN, example_command,
        color.END,
    )
    return ''.join(pieces)
|
|
|
|
|
|
##########################################################################################
|
|
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
|
|
##########################################################################################
|
|
|
|
def return_more_info(args):
    """Print requested informational text and validate the input Fasta path.

    Args:
        args: parsed command-line namespace; the attributes ``list_codes``,
            ``author`` and ``input_file`` are read.

    Returns:
        int: number of reasons for the caller to stop -- one for each
        informational flag that was handled, plus one when ``input_file`` was
        supplied but is not an existing file.  Zero means the caller may go on.
    """
    valid_arg = 0

    # 'Condylostoma' was previously misspelled here ('Conylostoma'); the
    # spelling now matches the one used when a code is suggested.
    supported_gcodes = [
        'Blepharisma\t(TGA = W)',
        'Chilodonella\t(TAG/TGA = Q)',
        'Ciliate\t\t(TAR = Q)',
        'Condylostoma\t(TAR = Q, TGA = W)',
        'Euplotes\t(TGA = C)',
        'Peritrich\t(TAR = E)',
        'None\t\t(TGA/TAG/TAA = X)',
        'Universal\t(TGA/TAG/TAA = STOP)',
        'TAA\t\t(TAG/TGA = Q)',
        'TAG\t\t(TRA = Q)',
        'TGA\t\t(TAR = Q)',
    ]

    author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'
        ' maurerax@gmail.com\n\n'+color.END)

    if args.list_codes:
        print(color.BOLD+color.RED+'\nThese are the currently supported genetic codes.\n'+color.END)
        print(color.BOLD+color.ORANGE+'\n'.join(supported_gcodes)+'\n\n'+color.END)
        valid_arg += 1

    if args.author:
        print(author)
        valid_arg += 1

    # Everything below needs a path.  Without this guard, running with only
    # --list_codes / -author crashed on ``None.split`` after printing.
    if args.input_file is not None:
        folder, _, file_name = args.input_file.rpartition('/')

        # Echo of the file name and its folder, kept from the original script.
        print(file_name, folder)

        # os.path.isfile is the whole existence check.  The former extra
        # os.listdir(folder) lookup could not reject anything isfile accepts
        # (barring case-insensitive filesystems) and raised FileNotFoundError
        # for a bare file name, because the folder is then the empty string.
        if not os.path.isfile(args.input_file):
            print(color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '
                '('+color.DARKCYAN+file_name+color.END+color.BOLD+')\ndoes not'
                ' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
            valid_arg += 1

    return valid_arg
|
|
|
|
|
|
###########################################################################################
|
|
###-------------------- Counts Several Metrics of Stop Codon Usage ---------------------###
|
|
###########################################################################################
|
|
|
|
def count_stops(args):
    """Tally terminal stop-codon usage and in-frame stop codons of the input CDSs.

    Each record of ``args.input_file`` (Fasta) is read once.  Its last three
    bases are compared, case-insensitively, against TGA/TAG/TAA, and the rest
    of the sequence is translated with Biopython's default ``translate()``
    settings to count ``*`` characters (in-frame stops).

    Args:
        args: parsed command-line namespace; only ``input_file`` is read.

    Returns:
        tuple: ``(end_stop_freq, inFrame_stop_info)`` where

        * ``end_stop_freq`` is ``[TGA, TAG, TAA]`` -- how many sequences end
          with each codon.  When a single codon accounts for more than 95% of
          these, the other two counts are set to zero.
        * ``inFrame_stop_info`` is ``[number of CDSs with at least one
          in-frame stop, 5% of the number of CDSs (rounded), total number of
          in-frame stops]``.
    """
    print(color.BOLD+'\n\nScanning CDSs for In-Frame Stop Codons and Tracking\nFINAL '
        '(Terminal) stop codon usage\n\n'+color.END)

    terminal_counts = {'tga': 0, 'tag': 0, 'taa': 0}
    total_cds = 0
    cds_with_inframe_stop = 0
    total_inframe_stops = 0

    # Single streaming pass: records are not kept in memory after being counted.
    for record in SeqIO.parse(args.input_file, 'fasta'):
        total_cds += 1

        last_codon = str(record.seq)[-3:].lower()
        if last_codon in terminal_counts:
            terminal_counts[last_codon] += 1

        # The terminal codon is excluded so only internal stops are counted.
        stops_here = str(record.seq[:-3].translate()).count('*')
        if stops_here != 0:
            cds_with_inframe_stop += 1
            total_inframe_stops += stops_here

    end_stop_freq = [terminal_counts['tga'], terminal_counts['tag'], terminal_counts['taa']]

    # If one codon clearly dominates (> 95% of terminal stops), treat the
    # remaining two as noise and zero them out.
    most_used = max(end_stop_freq)
    if most_used > 0.95*sum(end_stop_freq):
        keep = end_stop_freq.index(most_used)
        end_stop_freq = [n if idx == keep else 0 for idx, n in enumerate(end_stop_freq)]

    inFrame_stop_info = [cds_with_inframe_stop, int(round(total_cds*0.05)), total_inframe_stops]
    return end_stop_freq, inFrame_stop_info
|
|
|
|
|
|
###########################################################################################
|
|
###-------------------- Suggests Genetic Code Given Stop Codon Usage -------------------###
|
|
###########################################################################################
|
|
|
|
def suggest_code(args):
    """Print and return a genetic-code suggestion based on stop-codon usage.

    Decision order:

    1. No sequence ends in TGA/TAG/TAA -> undetermined.
    2. Fewer CDSs carry in-frame stops than the 5% threshold reported by
       ``count_stops`` (or none carry any) -> Universal code.
    3. Otherwise the suggestion depends on which of TGA/TAG/TAA are still
       used as terminal stops.

    Args:
        args: parsed command-line namespace, forwarded to ``count_stops``.

    Returns:
        tuple: ``(genetic_code, stop_freq)`` -- the suggestion as text and the
        ``[TGA, TAG, TAA]`` terminal-stop counts from ``count_stops``.
    """
    stop_freq, inFrames = count_stops(args)

    if stop_freq.count(0) == 3:
        # color.END added so the bold red does not bleed into later output.
        print(color.BOLD + color.RED + '\n\nNO Stop Codons Present in Data-set\n\n' + color.END)
        return 'None (UNDETERMINED -- NO STOP CODONS)', stop_freq

    headline = color.BOLD + '\n\nSuggested Genetic Code is: ' + color.CYAN

    cds_with_inframe_stops, five_percent_of_cds = inFrames[0], inFrames[1]

    # Crude check for a significant (>= 5%) share of CDSs with in-frame stops.
    # The threshold is a rounded count, so it is 0 for fewer than 10 CDSs;
    # the explicit zero test keeps such small, stop-free data sets from
    # being treated as if they had significant in-frame stops.
    if cds_with_inframe_stops == 0 or cds_with_inframe_stops < five_percent_of_cds:
        print(headline + ' Universal (table = 1)' + color.END)
        return 'Universal (table = 1)', stop_freq

    # Key: (TGA used, TAG used, TAA used) as terminal stops.
    # Value: (returned suggestion, text printed after the headline).
    # The all-False key cannot occur here; it is handled above.
    by_terminal_usage = {
        (True, True, True): (
            'Condylostoma or None',
            ' Condylostoma-Code (No Dedicated Stops) OR None (all stops = "X")'),
        (False, False, True): (
            'Chilodonella or TAA',
            ' Chilodonella-Code (Only Stop = TAA)'),
        (False, True, False): (
            'TAG',
            ' TAG-Code (Only Stop = TAG)'),
        (True, False, False): (
            'Ciliate (table = 6)',
            ' Ciliate-Code (table = 6)'),
        (True, True, False): (
            'TGA/TAG',
            ' TGA/TAG are STOP'),
        (True, False, True): (
            'TGA/TAA',
            ' TGA/TAA are STOP'),
        (False, True, True): (
            'Blepharisma (TGA = W) or Euplotes (TGA = C)',
            ' Blepharisma/Euplotes-Codes' + color.END + color.BOLD + '\n--- NOTE: '
            + color.RED + ' Stop-Codon Reassignments differ! (TGA = W or TGA = C)'),
    }

    usage = tuple(count != 0 for count in stop_freq)
    genetic_code, detail = by_terminal_usage[usage]
    print(headline + detail + color.END)

    return genetic_code, stop_freq
|
|
|
|
|
|
###########################################################################################
|
|
###---------------- Writes Out Currently Crummy Summary of Genetic Codes ---------------###
|
|
###########################################################################################
|
|
|
|
def summarize(args):
    """Write the stop-codon frequencies and the suggested code to a text file.

    The report is written next to the input file as
    ``<input without its .fa* extension>.GeneticCode.txt``.  Only a final
    extension beginning with ``.fa`` (``.fa``, ``.fas``, ``.fasta``, ...) is
    removed; any other name is kept whole and the suffix is appended.

    Args:
        args: parsed command-line namespace; ``input_file`` is read here and
            the namespace is forwarded to ``suggest_code``.
    """
    suggestion, stop_freq = suggest_code(args)

    # Strip the extension only, never an earlier '.fa' inside a directory or
    # file name (e.g. 'runs.fasta_dir/x.fasta' or 'gene.family.fasta'), which
    # a plain split('.fa') would cut at and so misplace the report.
    root, ext = os.path.splitext(args.input_file)
    report_base = root if ext.startswith('.fa') else args.input_file

    with open(report_base + '.GeneticCode.txt', 'w') as w:
        w.write('Stop Codon\tFrequency\n')
        w.write('TGA\t' + str(stop_freq[0]) + '\n')
        w.write('TAG\t' + str(stop_freq[1]) + '\n')
        w.write('TAA\t' + str(stop_freq[2]) + '\n\n')
        w.write('Suggestion For Genetic Code:\t' + suggestion + '\n\n')
|
|
|
|
|
|
##########################################################################################
|
|
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
|
|
##########################################################################################
|
|
|
|
def main():
    """Parse the command line, write the genetic-code summary and name the next script."""
    args = check_args()

    summarize(args)

    print(color.BOLD+'\nNext Script is: '+color.PURPLE+' 3g_GCodeTranslate.py\n\n'+color.END)


# Run only when executed as a script, so importing this module (e.g. for
# testing) does not parse sys.argv or exit the interpreter.
if __name__ == '__main__':
    main()
|