mirror of
http://43.156.76.180:8026/YuuMJ/EukPhylo.git
synced 2025-12-27 07:10:25 +08:00
Add files via upload
This commit is contained in:
parent
3347fc979a
commit
cc83374ce4
217
PTL1/Genomes/Scripts/1_RenameCDS.py
Normal file
217
PTL1/Genomes/Scripts/1_RenameCDS.py
Normal file
@ -0,0 +1,217 @@
|
||||
#!/usr/bin/env python3.5
|
||||
|
||||
##__Updated__: 19_09_2017
|
||||
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
|
||||
##__Usage__: python 1g_RenameCDS.py --help
|
||||
|
||||
|
||||
from Bio import SeqIO
|
||||
from Bio.SeqUtils import GC
|
||||
import argparse, os, sys, time
|
||||
from argparse import RawTextHelpFormatter,SUPPRESS
|
||||
|
||||
|
||||
#----------------------------- Colors For Print Statements ------------------------------#
|
||||
class color:
|
||||
PURPLE = '\033[95m'
|
||||
CYAN = '\033[96m'
|
||||
DARKCYAN = '\033[36m'
|
||||
ORANGE = '\033[38;5;214m'
|
||||
BLUE = '\033[94m'
|
||||
GREEN = '\033[92m'
|
||||
YELLOW = '\033[93m'
|
||||
RED = '\033[91m'
|
||||
BOLD = '\033[1m'
|
||||
UNDERLINE = '\033[4m'
|
||||
END = '\033[0m'
|
||||
|
||||
#------------------------------- Main Functions of Script --------------------------------#
|
||||
###########################################################################################
|
||||
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
|
||||
###########################################################################################
|
||||
|
||||
def check_args():
|
||||
|
||||
parser = argparse.ArgumentParser(description=
|
||||
color.BOLD + '\n\nThis script is intended to extract '+color.RED+'Annotated '+\
|
||||
color.PURPLE+'ORFS\n'+color.END+color.BOLD+'from a provided Genbank formatted file.'\
|
||||
+usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter)
|
||||
|
||||
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
|
||||
|
||||
required_arg_group.add_argument('--input_file','-in', action='store',
|
||||
help=color.BOLD+color.GREEN+' Fasta file with CDSs\n'+color.END)
|
||||
|
||||
required_arg_group.add_argument('--output_dir','-o', action='store',
|
||||
help=color.BOLD+color.GREEN+' Output directory\n'+color.END)
|
||||
|
||||
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
|
||||
|
||||
optional_arg_group.add_argument('--source','-s', action='store', default='GenBank',
|
||||
help=color.BOLD+color.GREEN+' Data Source of CDSs (default = "GenBank")\n'+color.END)
|
||||
|
||||
optional_arg_group.add_argument('--list_source','-lsrc', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Lists supported data sources\n'+color.END)
|
||||
|
||||
optional_arg_group.add_argument('-author', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
|
||||
|
||||
|
||||
if len(sys.argv[1:]) == 0:
|
||||
print (parser.description)
|
||||
print ('\n')
|
||||
sys.exit()
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
more_info = return_more_info(args)
|
||||
if more_info != None:
|
||||
print (parser.description)
|
||||
print (more_info)
|
||||
sys.exit()
|
||||
|
||||
args.folder = args.output_dir + '/' + args.input_file.split('/')[-1][:10]
|
||||
|
||||
return args
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###------------------------------- Script Usage Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def usage_msg():
|
||||
return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 1g_RenameCDS.py'\
|
||||
' --input_file ../Stentor_coeruleus.WGS.CDS.Prep/Stentor_coeruleus.WGS.CDS.fasta --source'\
|
||||
' GenBank'+color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
|
||||
##########################################################################################
|
||||
|
||||
def return_more_info(args):
|
||||
|
||||
acceptable_sources = ['in-house', 'in-lab', 'GenBank', 'gb', 'NCBI']
|
||||
|
||||
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
|
||||
' maurerax@gmail.com\n\n'+color.END)
|
||||
|
||||
if args.author == True:
|
||||
return author
|
||||
|
||||
if args.list_source == True:
|
||||
print (color.BOLD+color.RED+'\nThese are the currently supported data sources.\n'+color.END)
|
||||
print (color.BOLD+color.ORANGE+'\n'.join(acceptable_sources)+'\n\n'+color.END)
|
||||
sys.exit()
|
||||
|
||||
if args.source.lower() not in [i.lower() for i in acceptable_sources]:
|
||||
print (color.BOLD+color.RED+'\nUnsupported source was provided.\n\nEnsure that '\
|
||||
'you are providing a valid data source (see below).\n'+color.END)
|
||||
print (color.BOLD+color.ORANGE+'\n'.join(acceptable_sources)+'\n'+color.END)
|
||||
sys.exit()
|
||||
|
||||
if args.input_file != None:
|
||||
if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
sys.exit()
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------------- Does the Inital Folder Prep -----------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def prep_folders(args):
|
||||
|
||||
if os.path.isdir(args.folder) != True:
|
||||
os.system('mkdir '+args.folder)
|
||||
os.system('cp '+args.input_file+' '+args.folder)
|
||||
args.input_file = args.folder+'/'+args.input_file.split('/')[-1]
|
||||
|
||||
if os.path.isdir(args.folder+'/Original') != True:
|
||||
os.system('mkdir '+args.folder+'/Original')
|
||||
|
||||
os.system('cp '+args.input_file+' '+args.folder+'/Original/')
|
||||
|
||||
###########################################################################################
|
||||
###------------- Renames Protein-Coding CDS Sequences to Standard Format ---------------###
|
||||
###########################################################################################
|
||||
|
||||
def renamed_GenomeCDS(args):
|
||||
|
||||
print (color.BOLD+'\n\nPrepping to rename '+color.GREEN+args.input_file.split('/')[-1]+\
|
||||
color.END+color.BOLD+"'s CDS sequences"+color.END)
|
||||
inFasta = sorted((i for i in SeqIO.parse(args.input_file,'fasta')),key=lambda seq_rec: -len(seq_rec.seq))
|
||||
|
||||
renamed_seqs = []
|
||||
seq_code_dict = {}
|
||||
|
||||
count = 1
|
||||
for seq_rec in inFasta:
|
||||
seq_code_dict.setdefault(seq_rec.description,[]).append('Contig_'+str(count)+'_Len'+str(len(seq_rec.seq)))
|
||||
seq_code_dict[seq_rec.description].append(str(seq_rec.seq).upper())
|
||||
renamed_seqs.append('>Contig_' + str(count) + '_Len' + str(len(seq_rec.seq)) + '\n' + str(seq_rec.seq).upper())
|
||||
count += 1
|
||||
|
||||
## keeps only CDSs that are greater than 30 bp (10 AA --> This is a cut-off in the
|
||||
## phylogenomic pipeline too!)
|
||||
renamed_seqs = [i for i in renamed_seqs if len(i.split('\n')[-1]) > 30]
|
||||
|
||||
print (color.BOLD+'\n\nFor '+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+\
|
||||
color.BOLD+', '+color.RED+str(len(renamed_seqs))+' CDS sequences\n'+color.END+color.BOLD+
|
||||
'were renamed while preserving the '+color.ORANGE+args.source+color.END+color.BOLD+' formatting'\
|
||||
+color.END+'\n')
|
||||
|
||||
with open(args.input_file.replace('.fasta','.Prepped.fasta'),'w+') as w:
|
||||
w.write('\n'.join(renamed_seqs))
|
||||
|
||||
with open(args.input_file.split('/')[-1].replace('.fasta','.SeqCodes.tsv'),'w+') as w:
|
||||
w.write('Original Name\tNew Name\tSeq Length\t Seq GC\n')
|
||||
for k, v in seq_code_dict.items():
|
||||
w.write(k+'\t'+v[0]+'\t'+str(len(v[1]))+'\t'+str(GC(v[1]))+'\n')
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------- Cleans up the Folder and Moves Final Files --------------------###
|
||||
###########################################################################################
|
||||
|
||||
def clean_up(args):
|
||||
|
||||
# os.system('rm '+args.input_file)
|
||||
os.system('mv ' + args.input_file.split('/')[-1].replace('.fasta','.SeqCodes.tsv') + ' ' + args.folder + '/Original/')
|
||||
os.system('mv ' + args.input_file + ' ' + args.folder + '/Original/')
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------------------------------- Next Script Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def next_script(args):
|
||||
|
||||
print (color.BOLD+'\nLook for '+color.DARKCYAN+args.input_file.split('/')[-1].replace('.fasta','.Renamed.fasta')\
|
||||
+'.fasta'+color.END+color.BOLD+'\nin the '+color.ORANGE+args.folder.split('/')[-1]+\
|
||||
' Folder\n\n'+color.END+color.BOLD)
|
||||
|
||||
print ('Next Script(s) are:\n\n'+color.PURPLE+'2g_GCodeEval.py'+color.END+color.BOLD\
|
||||
+' (if Genetic Code is '+color.RED+'Unknown'+color.END+color.BOLD+')\n\nOtherwise:\n\n'+\
|
||||
color.PURPLE+'3g_GCodeTranslate.py\n\n'+color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###----------------------------- Calls on Above Functions -----------------------------###
|
||||
##########################################################################################
|
||||
|
||||
def main():
|
||||
|
||||
args = check_args()
|
||||
|
||||
prep_folders(args)
|
||||
|
||||
renamed_GenomeCDS(args)
|
||||
|
||||
clean_up(args)
|
||||
|
||||
next_script(args)
|
||||
|
||||
main()
|
||||
252
PTL1/Genomes/Scripts/2_GCodeEval.py
Normal file
252
PTL1/Genomes/Scripts/2_GCodeEval.py
Normal file
@ -0,0 +1,252 @@
|
||||
#!/usr/bin/env python3.5
|
||||
|
||||
##__Updated__: 19_09_2017
|
||||
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
|
||||
##__Usage__: python 2g_GCodeEval.py --help
|
||||
|
||||
|
||||
#############################################################################################
|
||||
# #
|
||||
# Suggests which Genetic Code to use based upon Presence/Absence of Specific Stop Codons #
|
||||
# at the end of the CDS sequences. This is to provide a ROUGH gauge for the user. #
|
||||
# #
|
||||
#############################################################################################
|
||||
|
||||
|
||||
import argparse, os, sys
|
||||
from argparse import RawTextHelpFormatter,SUPPRESS
|
||||
from Bio import SeqIO
|
||||
from Bio.Seq import Seq
|
||||
|
||||
#----------------------------- Colors For Print Statements ------------------------------#
|
||||
class color:
|
||||
PURPLE = '\033[95m'
|
||||
CYAN = '\033[96m'
|
||||
DARKCYAN = '\033[36m'
|
||||
ORANGE = '\033[38;5;214m'
|
||||
BLUE = '\033[94m'
|
||||
GREEN = '\033[92m'
|
||||
YELLOW = '\033[93m'
|
||||
RED = '\033[91m'
|
||||
BOLD = '\033[1m'
|
||||
UNDERLINE = '\033[4m'
|
||||
END = '\033[0m'
|
||||
|
||||
|
||||
#------------------------------- Main Functions of Script --------------------------------#
|
||||
|
||||
###########################################################################################
|
||||
###------------------------- Checks the Command Line Arguments -------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def check_args():
|
||||
|
||||
parser = argparse.ArgumentParser(description=
|
||||
color.BOLD + '\n\nThis script is intended to aid you with '+color.RED+'evaluating\n(or checking) '+\
|
||||
color.END+color.BOLD+'the putative '+color.PURPLE+'Genetic Code'+color.END+color.BOLD+\
|
||||
' for a given\nFasta file of annotated (and untranslated) CDSs.\n\nTo do so, this script'\
|
||||
' checks for stop codon usages,\n'+color.RED+'suggesting '+color.END+color.BOLD+'the use of'\
|
||||
+color.PURPLE+' published and well-known\nalternate genetic codes'+color.END+color.BOLD+\
|
||||
' that are supported by the\nnext script: '+color.END+color.BOLD+color.PURPLE+'3g_GCodeTranslate.py'\
|
||||
+usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter)
|
||||
|
||||
|
||||
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
|
||||
|
||||
required_arg_group.add_argument('--input_file','-in', action='store',
|
||||
help=color.BOLD+color.GREEN+' Fasta file with CDSs\n'+color.END)
|
||||
|
||||
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
|
||||
|
||||
optional_arg_group.add_argument('--list_codes','-codes', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Lists supported genetic codes\n'+color.END)
|
||||
|
||||
optional_arg_group.add_argument('-author', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
|
||||
|
||||
|
||||
if len(sys.argv[1:]) == 0:
|
||||
print (parser.description)
|
||||
print ('\n')
|
||||
sys.exit()
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
quit_eval = return_more_info(args)
|
||||
if quit_eval > 0:
|
||||
sys.exit()
|
||||
|
||||
args.folder = '/'.join(args.input_file.split('/')[:-1])
|
||||
|
||||
return args
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###------------------------------- Script Usage Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def usage_msg():
|
||||
return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 2g_GCodeEval.py'\
|
||||
' --input_file ../Stentor_coeruleus.WGS.CDS.Prep/Stentor_coeruleus.WGS.CDS.Renamed.fasta'+color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
|
||||
##########################################################################################
|
||||
|
||||
def return_more_info(args):
|
||||
|
||||
valid_arg = 0
|
||||
|
||||
supported_gcodes = ['Blepharisma\t(TGA = W)','Chilodonella\t(TAG/TGA = Q)','Ciliate\t\t(TAR = Q)',\
|
||||
'Conylostoma\t(TAR = Q, TGA = W)','Euplotes\t(TGA = C)','Peritrich\t(TAR = E)','None\t\t(TGA/TAG/TAA = X)',\
|
||||
'Universal\t(TGA/TAG/TAA = STOP)','TAA\t\t(TAG/TGA = Q)', 'TAG\t\t(TRA = Q)', 'TGA\t\t(TAR = Q)']
|
||||
|
||||
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
|
||||
' maurerax@gmail.com\n\n'+color.END)
|
||||
|
||||
if args.list_codes == True:
|
||||
print (color.BOLD+color.RED+'\nThese are the currently supported genetic codes.\n'+color.END)
|
||||
print (color.BOLD+color.ORANGE+'\n'.join(supported_gcodes)+'\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
if args.author == True:
|
||||
print (author)
|
||||
valid_arg += 1
|
||||
|
||||
print(args.input_file.split('/')[-1], '/'.join(args.input_file.split('/')[:-1]))
|
||||
|
||||
|
||||
if args.input_file != None:
|
||||
if os.path.isfile(args.input_file) != False:
|
||||
|
||||
if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
else:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
return valid_arg
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------------------- Counts Several Metrics of Stop Codon Usage ---------------------###
|
||||
###########################################################################################
|
||||
|
||||
def count_stops(args):
|
||||
|
||||
print (color.BOLD+'\n\nScanning CDSs for In-Frame Stop Codons and Tracking\nFINAL '\
|
||||
'(Terminal) stop codon usage\n\n'+color.END)
|
||||
|
||||
inFasta = [i for i in SeqIO.parse(args.input_file,'fasta')]
|
||||
seq_ends = [str(i.seq)[-3:].lower() for i in inFasta]
|
||||
inFrame_stops_raw = [str(i.seq[:-3].translate()).count('*') for i in inFasta]
|
||||
inFrame_stops_summary = [i for i in inFrame_stops_raw if i != 0]
|
||||
|
||||
tga_end = seq_ends.count('tga')
|
||||
tag_end = seq_ends.count('tag')
|
||||
taa_end = seq_ends.count('taa')
|
||||
|
||||
end_stop_freq = [tga_end, tag_end, taa_end]
|
||||
|
||||
if max(end_stop_freq) > 0.95*sum(end_stop_freq):
|
||||
pos_to_keep = [i for i, j in enumerate(end_stop_freq) if j == max(end_stop_freq)][0]
|
||||
try:
|
||||
if pos_to_keep == 0:
|
||||
end_stop_freq = [end_stop_freq[0],0,0]
|
||||
elif pos_to_keep == 1:
|
||||
end_stop_freq = [0,end_stop_freq[1],0]
|
||||
elif pos_to_keep == 2:
|
||||
end_stop_freq = [0,0,end_stop_freq[2]]
|
||||
except:
|
||||
pass
|
||||
|
||||
inFrame_stop_info = [len(inFrame_stops_summary), int(round(len(inFrame_stops_raw)*0.05)), sum(inFrame_stops_summary)]
|
||||
return end_stop_freq, inFrame_stop_info
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------------------- Suggests Genetic Code Given Stop Codon Usage -------------------###
|
||||
###########################################################################################
|
||||
|
||||
def suggest_code(args):
|
||||
|
||||
stop_freq, inFrames = count_stops(args)
|
||||
|
||||
genetic_code = ''
|
||||
|
||||
if stop_freq.count(0) == 3:
|
||||
print (color.BOLD + color.RED + '\n\nNO Stop Codons Present in Data-set\n\n')
|
||||
genetic_code = 'None (UNDETERMINED -- NO STOP CODONS)'
|
||||
else:
|
||||
## DUMB way of checking if there are a significant (> 5%) number of CDSs with IN-FRAME stop codons
|
||||
if inFrames[0] < inFrames[1]:
|
||||
print (color.BOLD + '\n\nSuggested Genetic Code is: '+color.CYAN+' Universal (table = 1)'+color.END)
|
||||
genetic_code = 'Universal (table = 1)'
|
||||
else:
|
||||
|
||||
if stop_freq[0] != 0 and stop_freq[1] != 0 and stop_freq[2] != 0:
|
||||
print (color.BOLD + '\n\nSuggested Genetic Code is: '+color.CYAN+' Condylostoma-Code'\
|
||||
' (No Dedicated Stops) OR None (all stops = "X")'+color.END)
|
||||
genetic_code = 'Condylostoma or None'
|
||||
if stop_freq[0] == 0 and stop_freq[1] == 0:
|
||||
print (color.BOLD + '\n\nSuggested Genetic Code is: '+color.CYAN+' Chilodonella-Code'\
|
||||
+' (Only Stop = TAA)'+color.END)
|
||||
genetic_code = 'Chilodonella or TAA'
|
||||
if stop_freq[0] == 0 and stop_freq[2] == 0:
|
||||
print (color.BOLD + '\n\nSuggested Genetic Code is: '+color.CYAN+' TAG-Code'\
|
||||
+' (Only Stop = TAG)'+color.END)
|
||||
genetic_code = 'TAG'
|
||||
if stop_freq[1] == 0 and stop_freq[2] == 0:
|
||||
print (color.BOLD + '\n\nSuggested Genetic Code is: '+color.CYAN+' Ciliate-Code'\
|
||||
+' (table = 6)'+color.END)
|
||||
genetic_code = 'Ciliate (table = 6)'
|
||||
if stop_freq[0] != 0 and stop_freq[1] != 0 and stop_freq[2] == 0:
|
||||
print (color.BOLD + '\n\nSuggested Genetic Code is: '+color.CYAN+' TGA/TAG are STOP'+color.END)
|
||||
genetic_code = 'TGA/TAG'
|
||||
if stop_freq[0] != 0 and stop_freq[1] == 0 and stop_freq[2] != 0:
|
||||
print (color.BOLD + '\n\nSuggested Genetic Code is: '+color.CYAN+' TGA/TAA are STOP'+color.END)
|
||||
genetic_code = 'TGA/TAA'
|
||||
if stop_freq[0] == 0 and stop_freq[1] != 0 and stop_freq[2] != 0:
|
||||
print (color.BOLD + '\n\nSuggested Genetic Code is: '+color.CYAN+' Blepharisma/Euplotes-Codes'\
|
||||
+color.END + color.BOLD+'\n--- NOTE: '+color.RED+' Stop-Codon Reassignments'\
|
||||
+' differ! (TGA = W or TGA = C)' + color.END)
|
||||
genetic_code = 'Blepharisma (TGA = W) or Euplotes (TGA = C)'
|
||||
|
||||
return genetic_code, stop_freq
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###---------------- Writes Out Currently Crummy Summary of Genetic Codes ---------------###
|
||||
###########################################################################################
|
||||
|
||||
def summarize(args):
|
||||
|
||||
suggestion, stop_freq = suggest_code(args)
|
||||
|
||||
with open(args.input_file.split('.fa')[0]+'.GeneticCode.txt','w+') as w:
|
||||
w.write('Stop Codon\tFrequency\n')
|
||||
w.write('TGA\t'+str(stop_freq[0])+'\n')
|
||||
w.write('TAG\t'+str(stop_freq[1])+'\n')
|
||||
w.write('TAA\t'+str(stop_freq[2])+'\n\n')
|
||||
w.write('Suggestion For Genetic Code:\t'+suggestion+'\n\n')
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
|
||||
##########################################################################################
|
||||
|
||||
def main():
|
||||
|
||||
args = check_args()
|
||||
|
||||
summarize(args)
|
||||
|
||||
print (color.BOLD+'\nNext Script is: '+color.PURPLE+' 3g_GCodeTranslate.py\n\n'+color.END)
|
||||
|
||||
main()
|
||||
397
PTL1/Genomes/Scripts/3_GCodeTranslate.py
Normal file
397
PTL1/Genomes/Scripts/3_GCodeTranslate.py
Normal file
@ -0,0 +1,397 @@
|
||||
#!/usr/bin/env python3.5
|
||||
|
||||
##__Updated__: 19_09_2017
|
||||
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
|
||||
##__Usage__: python 3g_GCodeTranslate.py --help
|
||||
|
||||
|
||||
##############################################################################
|
||||
## ##
|
||||
## Translates CDSs sequences using the Provided Genetic Code. ##
|
||||
## ##
|
||||
## NOTE: ##
|
||||
## No provided input for genetic code results in Translation with the ##
|
||||
## UNIVERSAL genetic code (as default) ##
|
||||
## ##
|
||||
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
|
||||
## ##
|
||||
##############################################################################
|
||||
|
||||
|
||||
import argparse, os, sys
|
||||
from argparse import RawTextHelpFormatter,SUPPRESS
|
||||
from Bio import SeqIO
|
||||
from Bio.Seq import Seq
|
||||
from Bio.Data.CodonTable import CodonTable
|
||||
|
||||
|
||||
|
||||
#-------------------------- Set-up Codon Tables (Genetic Codes) --------------------------#
|
||||
|
||||
blepharisma_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = ['TAA','TAG'])
|
||||
|
||||
condylostoma_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Q', 'TAG': 'Q',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = [''])
|
||||
|
||||
c_uncinata_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y', 'TAG': 'Q',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGA': 'Q', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = ['TAA'])
|
||||
|
||||
euplotes_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGA': 'C', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = ['TAA','TAG'])
|
||||
|
||||
myrionecta_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Y', 'TAG': 'Y',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = ['TGA'])
|
||||
|
||||
no_stop_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'X', 'TAG': 'X',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGA': 'X', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = [''])
|
||||
|
||||
peritrich_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'E', 'TAG': 'E',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = ['TGA'])
|
||||
|
||||
tag_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Q',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGA': 'Q', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = ['TAG'])
|
||||
|
||||
|
||||
#----------------------------- Colors For Print Statements ------------------------------#
|
||||
class color:
|
||||
PURPLE = '\033[95m'
|
||||
CYAN = '\033[96m'
|
||||
DARKCYAN = '\033[36m'
|
||||
ORANGE = '\033[38;5;214m'
|
||||
BLUE = '\033[94m'
|
||||
GREEN = '\033[92m'
|
||||
YELLOW = '\033[93m'
|
||||
RED = '\033[91m'
|
||||
BOLD = '\033[1m'
|
||||
UNDERLINE = '\033[4m'
|
||||
END = '\033[0m'
|
||||
|
||||
|
||||
#------------------------------- Main Functions of Script --------------------------------#
|
||||
|
||||
###########################################################################################
|
||||
###------------------------- Checks the Command Line Arguments -------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def check_args():
|
||||
|
||||
parser = argparse.ArgumentParser(description=
|
||||
color.BOLD + '\n\nThis script will '+color.RED+'Translate '+color.END+color.BOLD+'a '\
|
||||
'given Fasta file of CDS\nsequences using a given'+color.PURPLE+' Genetic Code.'+color.END+\
|
||||
color.BOLD+usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter)
|
||||
|
||||
|
||||
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
|
||||
|
||||
required_arg_group.add_argument('--input_file','-in', action='store',
|
||||
help=color.BOLD+color.GREEN+' Fasta file with CDSs\n'+color.END)
|
||||
|
||||
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
|
||||
|
||||
optional_arg_group.add_argument('--genetic_code','-g', action='store', default='universal',
|
||||
help=color.BOLD+color.GREEN+' Genetic code to use for translation\n (default = '\
|
||||
'"universal")\n'+color.END)
|
||||
|
||||
optional_arg_group.add_argument('--list_codes','-codes', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Lists supported genetic codes\n'+color.END)
|
||||
|
||||
optional_arg_group.add_argument('-author', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
|
||||
|
||||
|
||||
if len(sys.argv[1:]) == 0:
|
||||
print (parser.description)
|
||||
print ('\n')
|
||||
sys.exit()
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
quit_eval = return_more_info(args)
|
||||
if quit_eval > 0:
|
||||
sys.exit()
|
||||
|
||||
args.folder = '../'+args.input_file.split('/')[1]
|
||||
args.out_name = args.input_file.split('.Prepped')[0]+'.'+args.genetic_code.title()+'.AA.fasta'
|
||||
args.new_ntd_name = args.input_file.split('.Prepped')[0]+'.'+args.genetic_code.title()+'.NTD.fasta'
|
||||
|
||||
return args
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###------------------------------- Script Usage Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def usage_msg():
|
||||
return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 3g_GCodeTranslate.py'\
|
||||
' --input_file ../Stentor_coeruleus.WGS.CDS.Prep/Stentor_coeruleus.WGS.CDS.Prepped.fasta'\
|
||||
' --genetic_code Universal'+color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
|
||||
##########################################################################################
|
||||
|
||||
def return_more_info(args):
|
||||
|
||||
valid_arg = 0
|
||||
|
||||
supported_gcodes_names = ['bleph','blepharisma','chilo','chilodonella','condy',\
|
||||
'condylostoma','none','eup','euplotes','peritrich','vorticella','ciliate','universal',\
|
||||
'taa','tag','tga']
|
||||
|
||||
supported_gcodes_list = ['Blepharisma\t(TGA = W)','Chilodonella\t(TAG/TGA = Q)','Ciliate\t\t(TAR = Q)',\
|
||||
'Conylostoma\t(TAR = Q, TGA = W)','Euplotes\t(TGA = C)','Peritrich\t(TAR = E)','None\t\t(TGA/TAG/TAA = X)',\
|
||||
'Universal\t(TGA/TAG/TAA = STOP)','TAA\t\t(TAG/TGA = Q)', 'TAG\t\t(TRA = Q)', 'TGA\t\t(TAR = Q)']
|
||||
|
||||
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
|
||||
' maurerax@gmail.com\n\n'+color.END)
|
||||
|
||||
|
||||
if args.genetic_code != None and args.genetic_code.lower() not in supported_gcodes_names:
|
||||
print (color.BOLD+color.RED+'\nProvided genetic code is currently unsupported.\n\n'\
|
||||
'If you have a new genetic code, please contact the author (with some evidence).\n\n'\
|
||||
'Otherwise, use one of the currently supported genetic codes.\n'+color.END)
|
||||
print (color.BOLD+color.ORANGE+'\n'.join(supported_gcodes_list)+'\n\n'+color.END)
|
||||
print (author)
|
||||
valid_arg += 1
|
||||
else:
|
||||
if args.list_codes == True:
|
||||
print (color.BOLD+color.RED+'\nThese are the currently supported genetic codes.\n'+color.END)
|
||||
print (color.BOLD+color.ORANGE+'\n'.join(supported_gcodes_list)+'\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
if args.author == True:
|
||||
print (author)
|
||||
valid_arg += 1
|
||||
|
||||
if args.input_file != None:
|
||||
if os.path.isfile(args.input_file) != False:
|
||||
if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
else:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
return valid_arg
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###------------------ Translates CDSs from the Provided Genetic Code ------------------###
|
||||
##########################################################################################
|
||||
|
||||
def translate_seqs(args):
|
||||
|
||||
inFasta = [i for i in SeqIO.parse(args.input_file,'fasta')]
|
||||
|
||||
print (color.BOLD+'\n\n\nTranslating: '+color.CYAN+args.input_file.split('/')[-1]+color.END+\
|
||||
color.BOLD+'\nwith the '+color.GREEN+args.genetic_code.upper()+' Genetic Code\n'+color.END)
|
||||
|
||||
|
||||
if args.genetic_code.lower() == 'ciliate' or args.genetic_code.lower() == 'tga':
|
||||
translated_seqs = ['>'+seq_rec.description+'\n'+str(seq_rec.seq.translate(table=6)).rstrip('*').replace('*','X')+'\n' for seq_rec in inFasta]
|
||||
|
||||
if args.genetic_code.lower() == 'peritrich' or args.genetic_code.lower() == 'vorticella':
|
||||
translated_seqs = ['>'+seq_rec.description+'\n'+str(seq_rec.seq.translate(table=peritrich_table)).rstrip('*').replace('*','X')+'\n' for seq_rec in inFasta]
|
||||
|
||||
if args.genetic_code.lower() == 'tag':
|
||||
translated_seqs = ['>'+seq_rec.description+'\n'+str(seq_rec.seq.translate(table=tag_table)).rstrip('*').replace('*','X')+'\n' for seq_rec in inFasta]
|
||||
|
||||
if args.genetic_code.lower() == 'chilo' or args.genetic_code.lower() == 'chilodonella' or args.genetic_code.lower() == 'taa':
|
||||
translated_seqs = ['>'+seq_rec.description+'\n'+str(seq_rec.seq.translate(table=c_uncinata_table)).rstrip('*').replace('*','X')+'\n' for seq_rec in inFasta]
|
||||
|
||||
if args.genetic_code.lower() == 'bleph' or args.genetic_code.lower() == 'blepharisma':
|
||||
translated_seqs = ['>'+seq_rec.description+'\n'+str(seq_rec.seq.translate(table=blepharisma_table)).rstrip('*').replace('*','X')+'\n' for seq_rec in inFasta]
|
||||
|
||||
if args.genetic_code.lower() == 'eup' or args.genetic_code.lower() == 'euplotes':
|
||||
translated_seqs = ['>'+seq_rec.description+'\n'+str(seq_rec.seq.translate(table=euplotes_table)).rstrip('*').replace('*','X')+'\n' for seq_rec in inFasta]
|
||||
|
||||
if args.genetic_code.lower() == 'universal':
|
||||
translated_seqs = ['>'+seq_rec.description+'\n'+str(seq_rec.seq.translate(table=1)).rstrip('*').replace('*','X')+'\n' for seq_rec in inFasta]
|
||||
|
||||
return translated_seqs
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###---------------------------- Writes Out Translated CDSs ----------------------------###
|
||||
##########################################################################################
|
||||
|
||||
def write_out(args):
|
||||
|
||||
translated_seqs = translate_seqs(args)
|
||||
|
||||
## Keep only ORFs greater than 10 amino acids long
|
||||
translated_seqs = [i for i in translated_seqs if len(i.split('\n')[1]) > 10]
|
||||
|
||||
print (color.BOLD+'\nTranslated '+color.ORANGE+str(len(translated_seqs))+color.END\
|
||||
+color.BOLD+' seqeunces using the '+color.GREEN+args.genetic_code.upper()+' Genetic Code\n\n'+color.END)
|
||||
|
||||
with open(args.out_name,'w+') as w:
|
||||
w.write(''.join(translated_seqs))
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###--------------------- Cleans up the Folder and Moves Final Files -------------------###
|
||||
##########################################################################################
|
||||
|
||||
def clean_up(args):
|
||||
|
||||
os.system('mv '+args.input_file+' '+args.new_ntd_name)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###----------------------------- Calls on Above Functions -----------------------------###
|
||||
##########################################################################################
|
||||
|
||||
def main():
|
||||
|
||||
args = check_args()
|
||||
|
||||
write_out(args)
|
||||
|
||||
clean_up(args)
|
||||
|
||||
print (color.BOLD+'Next Script is: '+color.PURPLE+' 4g_CountOgsUsearch.py\n\n'+color.END)
|
||||
|
||||
main()
|
||||
319
PTL1/Genomes/Scripts/4_CountOGsDiamond.py
Normal file
319
PTL1/Genomes/Scripts/4_CountOGsDiamond.py
Normal file
@ -0,0 +1,319 @@
|
||||
#!/usr/bin/env python3.5
|
||||
|
||||
##__Updated__: 19_09_2017
|
||||
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
|
||||
##__Usage__: python 3g_GCodeTranslate.py --help
|
||||
|
||||
##############################################################################
|
||||
## ##
|
||||
## This scrip will categorize TRANSLATED CDSs into Homologous Gene Families ##
|
||||
## ##
|
||||
## Questions about Gene Family Binning/Source? SEE NOTES at Bottom! ##
|
||||
## ##
|
||||
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
|
||||
## ##
|
||||
##############################################################################
|
||||
|
||||
import argparse, os, re, sys
|
||||
from argparse import RawTextHelpFormatter, SUPPRESS
|
||||
from distutils import spawn
|
||||
from Bio import SeqIO
|
||||
|
||||
|
||||
#----------------------------- Colors For Print Statements ------------------------------#
|
||||
class color:
|
||||
PURPLE = '\033[95m'
|
||||
CYAN = '\033[96m'
|
||||
DARKCYAN = '\033[36m'
|
||||
ORANGE = '\033[38;5;214m'
|
||||
BLUE = '\033[94m'
|
||||
GREEN = '\033[92m'
|
||||
YELLOW = '\033[93m'
|
||||
RED = '\033[91m'
|
||||
BOLD = '\033[1m'
|
||||
UNDERLINE = '\033[4m'
|
||||
END = '\033[0m'
|
||||
|
||||
|
||||
#------------------------------ UPDATE DIAMOND PATH BELOW! -------------------------------#
|
||||
def check_diamond_path():
|
||||
### IF Diamond is IN YOUR PATH then no updating is needed...
|
||||
diamond_path = ''
|
||||
|
||||
if diamond_path == '':
|
||||
diamond_path = spawn.find_executable("diamond")
|
||||
#diamond_path = /path/to/diamond
|
||||
else:
|
||||
pass
|
||||
|
||||
if diamond_path == None:
|
||||
print (color.BOLD + '\n\nPlease open this script and check that you have included'\
|
||||
+ ' the PATH to the' + color.BLUE + ' "diamond" '+ color.END + color.BOLD\
|
||||
+ 'executable.\n\n' + color.END)
|
||||
print (color.BOLD + color.BLUE + 'LOOK FOR:\n\n' + color.RED\
|
||||
+'#------------------------------ UPDATE DIAMOND PATH BELOW! -------------------------------#'\
|
||||
+ color.BLUE + '\n\nThis is somewhere around lines 55 - 80...\n\n' + color.END)
|
||||
|
||||
sys.exit()
|
||||
else:
|
||||
pass
|
||||
|
||||
return diamond_path
|
||||
|
||||
#------------------------------- Main Functions of Script --------------------------------#
|
||||
|
||||
###########################################################################################
|
||||
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
|
||||
###########################################################################################
|
||||
|
||||
def check_args():
|
||||
|
||||
parser = argparse.ArgumentParser(description=
|
||||
color.BOLD + '\n\nThis script will categorize Contigs into'+color.ORANGE+' "Homologous" '\
|
||||
+color.END+color.BOLD+'Gene Families (OGs)\nbased on '+color.RED+'OrthoMCL'+color.END\
|
||||
+color.BOLD+"'s Gene Family Grouping\n\n\nNotes on this script and "+color.GREEN+\
|
||||
'OrthoMCL Families'+color.END+color.BOLD+' can be found\nat the bottom of '+color.GREEN\
|
||||
+'THIS script (4_CountOGsDiamond.py)\n'+color.END+usage_msg(), usage=SUPPRESS,
|
||||
formatter_class=RawTextHelpFormatter)
|
||||
|
||||
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
|
||||
|
||||
required_arg_group.add_argument('--input_file','-in', action='store',
|
||||
help=color.BOLD+color.GREEN+'Fasta file of Nucleotide sequences enriched \nwith'\
|
||||
' Eukaryotic protein coding transcripts'+color.END)
|
||||
required_arg_group.add_argument('--databases','-d', action='store',
|
||||
help=color.BOLD+color.GREEN+'Path to folder containing db_OG'+color.END)
|
||||
required_arg_group.add_argument('--evalue','-e', action='store',
|
||||
help=color.BOLD+color.GREEN+'Maximum OG assignment e-value'+color.END)
|
||||
|
||||
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
|
||||
|
||||
optional_arg_group.add_argument('--threads','-t', default='2',
|
||||
help=color.BOLD+color.GREEN+' Number of threads to use for BLAST\n (default = 2)\n'+color.END)
|
||||
|
||||
optional_arg_group.add_argument('-author', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
|
||||
|
||||
if len(sys.argv[1:]) == 0:
|
||||
print (parser.description)
|
||||
print ('\n')
|
||||
sys.exit()
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
quit_eval = return_more_info(args)
|
||||
if quit_eval > 0:
|
||||
sys.exit()
|
||||
|
||||
args.diamond = check_diamond_path()
|
||||
|
||||
args.home_folder = '/'.join(args.input_file.split('/')[:-1]) + '/'
|
||||
|
||||
args.tsv_out = args.home_folder + args.input_file.split('/')[-1].replace('CDS','CDS.Renamed').replace('.AA.fasta','_allOGCleanresults.tsv')
|
||||
|
||||
args.aa_out = args.home_folder + args.input_file.split('/')[-1].replace('CDS','CDS.Renamed')
|
||||
args.ntd_out = args.home_folder + args.input_file.split('/')[-1].replace('CDS','CDS.Renamed').replace('AA','NTD')
|
||||
|
||||
return args
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###------------------------------- Script Usage Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def usage_msg():
|
||||
return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 4_CountOGsDiamond.py'\
|
||||
' --input_file ../Stentor_coeruleus.WGS.CDS.Prep/Stentor_coeruleus.WGS.CDS.Universal.AA.fasta'+color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
|
||||
##########################################################################################
|
||||
|
||||
def return_more_info(args):
|
||||
|
||||
valid_arg = 0
|
||||
|
||||
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
|
||||
' maurerax@gmail.com\n\n'+color.END)
|
||||
|
||||
if args.author == True:
|
||||
print (author)
|
||||
valid_arg += 1
|
||||
|
||||
if args.input_file != None:
|
||||
if os.path.isfile(args.input_file) != False:
|
||||
if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
elif args.input_file.endswith('AA.fasta') != True:
|
||||
print (color.BOLD+'\n\nInvalid Fasta File! Only Fasta Files that were processed'\
|
||||
' with '+color.GREEN+'3g_GCodeTranslate.py '+color.END+color.BOLD+'are valid\n\n'\
|
||||
'However, to bypass that issue, Fasta Files MUST end with '+color.CYAN+\
|
||||
'"AA.fasta"\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
else:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
if os.path.isdir(args.databases + '/db_OG') != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
|
||||
+color.ORANGE+'db_OG Folder!\n\n'+color.END+color.BOLD+'Ensure that this folder '\
|
||||
'can be found in the main '+color.ORANGE+'Databases Folder'+color.END+color.BOLD\
|
||||
+'\n\nThen try once again\n\n.'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
ogdb_count = 0
|
||||
for file in os.listdir(args.databases + '/db_OG'):
|
||||
if file.endswith('.dmnd'):
|
||||
ogdb_count += 1
|
||||
|
||||
if ogdb_count == 0:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
|
||||
'Diamond formatted '+color.ORANGE+'Gene Family databases!\n\n'+color.END+color.BOLD+\
|
||||
'Ensure that they can be found in the '+color.ORANGE+'db_OG folder'+color.END+\
|
||||
color.BOLD+',\nwhich can be found in the main '+color.ORANGE+'Databases Folder'+\
|
||||
color.END+color.BOLD+'\n\nThen try once again.\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
elif ogdb_count > 1:
|
||||
print('\nMultiple OG databases found. Please only provide 1 database in the db_OG folder.\n')
|
||||
valid_arg += 1
|
||||
|
||||
return valid_arg
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------------- Does the Inital Folder Prep -----------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def prep_folders(args):
|
||||
|
||||
OG_folder = '/'.join(args.input_file.split('/')[:-1])+'/DiamondOG/'
|
||||
|
||||
if os.path.isdir(OG_folder) != True:
|
||||
os.system('mkdir '+OG_folder)
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------- Runs Diamond on Split OrthoMCL Databases ----------------------###
|
||||
###########################################################################################
|
||||
|
||||
def OG_ublast(args):
|
||||
|
||||
db = [file for file in os.listdir(args.databases + '/db_OG') if file.endswith('.dmnd')][0]
|
||||
|
||||
OG_diamond_cmd = args.diamond + ' blastp -q ' + args.input_file + ' -d ' + args.databases + '/db_OG/' + db + ' --evalue ' + args.evalue + ' --subject-cover 0.5 --threads ' + args.threads + ' --outfmt 6 -o ' + args.input_file.split('.fas')[0] + '_allOGresults'
|
||||
os.system(OG_diamond_cmd)
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------- Keeps the Single BEST Hit (HSP-score) Per Transcript ----------------###
|
||||
###########################################################################################
|
||||
|
||||
def keep_best(args):
|
||||
print (color.BOLD+color.PURPLE+'\n\nProcessing OG-database results to keep only the BEST'\
|
||||
'\nmatch for each transcript\n\n'+color.END)
|
||||
|
||||
inTSV = [i for i in open(args.input_file.split('.fas')[0]+'_allOGresults').read().split('\n') if i != '']
|
||||
|
||||
inTSV.sort(key = lambda x: -float(x.split('\t')[-1]))
|
||||
|
||||
keep = []
|
||||
for i in inTSV:
|
||||
if any(i.split('\t')[0] in j for j in keep) != True:
|
||||
keep.append(i)
|
||||
|
||||
updated_lines = list(set([line.split('\t')[0]+'_'+'_'.join(line.split('\t')[1].split('_')[-2:])+\
|
||||
'\t'+'\t'.join(line.split('\t')[1:])+'\n' for line in keep]))
|
||||
|
||||
with open(args.tsv_out, 'w+') as w:
|
||||
for i in updated_lines:
|
||||
w.write(i+'\n')
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------- Copies and Updates Names of Transcripts With OG Hits to New Fasta ----------###
|
||||
###########################################################################################
|
||||
|
||||
def update_fasta(args):
|
||||
|
||||
print (color.BOLD+color.PURPLE+'Updating Sequence Names with their BEST OG hits\n\n'+color.END)
|
||||
|
||||
keep = [i for i in open(args.tsv_out).read().split('\n') if i != '']
|
||||
|
||||
keep_dict = { }
|
||||
for line in keep:
|
||||
try:
|
||||
og_number = re.split('OG.{1}_', line.split('\t')[1])[1][:6]
|
||||
og_prefix = line.split('\t')[1].split(og_number)[0][-4:]
|
||||
og = og_prefix + og_number
|
||||
|
||||
keep_dict.update({ re.split('_OG.{1}_', line.split('\t')[0])[0] : re.split('_OG.{1}_', line.split('\t')[0])[0] + '_' + og_prefix + line.split('\t')[1].split('_')[-1] })
|
||||
except IndexError:
|
||||
pass
|
||||
|
||||
protFasta = [seq_rec for seq_rec in SeqIO.parse(args.input_file,'fasta')]
|
||||
|
||||
ntdFasta = [seq_rec for seq_rec in SeqIO.parse(args.input_file.replace('.AA.','.NTD.'),'fasta')]
|
||||
|
||||
updated_prot_name = ['>'+keep_dict[i.description]+'\n'+str(i.seq).rstrip('*')+'\n' for i in protFasta if i.description in keep_dict.keys()]
|
||||
updated_ntd_name = ['>'+keep_dict[i.description]+'\n'+str(i.seq).rstrip('*')+'\n' for i in ntdFasta if i.description in keep_dict.keys()]
|
||||
|
||||
with open(args.aa_out,'w+') as w:
|
||||
for i in updated_prot_name:
|
||||
w.write(i)
|
||||
|
||||
with open(args.ntd_out,'w+') as x:
|
||||
for i in updated_ntd_name:
|
||||
x.write(i)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###--------------------- Cleans up the Folder and Moves Final Files -------------------###
|
||||
##########################################################################################
|
||||
|
||||
def clean_up(args):
|
||||
|
||||
os.system('mv '+args.input_file.replace('.fasta','_allOGresults')+' '+args.home_folder+\
|
||||
'/DiamondOG')
|
||||
|
||||
os.system('cp '+args.aa_out+' '+args.home_folder+'/DiamondOG/')
|
||||
os.system('cp '+args.ntd_out+' '+args.home_folder+'/DiamondOG/')
|
||||
os.system('cp '+args.tsv_out+' '+args.home_folder+'/DiamondOG/')
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###----------------------------- Calls on Above Functions -----------------------------###
|
||||
##########################################################################################
|
||||
|
||||
def main():
|
||||
|
||||
args = check_args()
|
||||
|
||||
prep_folders(args)
|
||||
|
||||
OG_ublast(args)
|
||||
|
||||
keep_best(args)
|
||||
|
||||
update_fasta(args)
|
||||
|
||||
clean_up(args)
|
||||
|
||||
print (color.BOLD+'Next Script is: '+color.GREEN+'5g_FinalizeName.py\n\n'+color.END)
|
||||
|
||||
main()
|
||||
|
||||
#----------------------------------------- NOTES -----------------------------------------#
|
||||
#
|
||||
# This script uses a "BLAST"-based approach to identify ANCIENT homologous gene families.
|
||||
#
|
||||
# Gene family designations were taken from OrthoMCL.org and serve as the database for
|
||||
# this script's gene family assignments. These gene family assignments are NON-EXHAUSTIVE
|
||||
# and most Lineage-Specific families will be missed!
|
||||
#
|
||||
# If you have any questions contact Xyrus (author): maurerax@gmail.com
|
||||
374
PTL1/Genomes/Scripts/5_FinalizeName.py
Normal file
374
PTL1/Genomes/Scripts/5_FinalizeName.py
Normal file
@ -0,0 +1,374 @@
|
||||
#!/usr/bin/env python3.5
|
||||
|
||||
##__Updated__: 20_09_2017
|
||||
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
|
||||
##__Usage__: python 5g_FinalizeName.py --help
|
||||
|
||||
##################################################################################################
|
||||
## This script is intended to rename the outputs of the FilterPartials script ##
|
||||
## to a given 10-character that is used in the Katz lab Phylogenomic Tree building methods ##
|
||||
## ##
|
||||
## Prior to running this script, ensure the following: ##
|
||||
## ##
|
||||
## 1. You have assembled your transcriptome and COPIED the 'assembly' file ##
|
||||
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ##
|
||||
## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py ##
|
||||
## 3. Removed SSU/LSU sequences from your Fasta File ##
|
||||
## 4. Classified your sequences as Strongly Prokaryotic/Eukaryotic or Undetermined ##
|
||||
## 5. Classified the Non-Strongly Prokaryotic sequences into OGs ##
|
||||
## 6. You either know (or have inferred) the genetic code of the organism ##
|
||||
## 7. You have translated the sequences and checked for the data in the RemovePartials folder ##
|
||||
## 8. Partial sequences have been removed from the transcriptomic data sets ##
|
||||
## ##
|
||||
## COMMAND Example Below ##
|
||||
## Extra Notes at Bottom of Script ##
|
||||
## ##
|
||||
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
|
||||
## ##
|
||||
## Next Script(s) to Run: ##
|
||||
## NONE! You're FINISHED! :D ##
|
||||
## ##
|
||||
##################################################################################################
|
||||
|
||||
import argparse, os, sys
|
||||
from argparse import RawTextHelpFormatter,SUPPRESS
|
||||
|
||||
#----------------------- Solely to Make Print Statements Colorful -----------------------#
|
||||
|
||||
class color:
|
||||
PURPLE = '\033[95m'
|
||||
CYAN = '\033[96m'
|
||||
DARKCYAN = '\033[36m'
|
||||
ORANGE = '\033[38;5;214m'
|
||||
BLUE = '\033[94m'
|
||||
GREEN = '\033[92m'
|
||||
YELLOW = '\033[93m'
|
||||
RED = '\033[91m'
|
||||
BOLD = '\033[1m'
|
||||
UNDERLINE = '\033[4m'
|
||||
END = '\033[0m'
|
||||
|
||||
|
||||
#------------------------------- Main Functions of Script --------------------------------#
|
||||
|
||||
###########################################################################################
|
||||
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
|
||||
###########################################################################################
|
||||
|
||||
def check_args():
|
||||
|
||||
parser = argparse.ArgumentParser(description=
|
||||
color.BOLD + '\n\nThis script is intended to '+color.RED+'Rename '+color.END\
|
||||
+color.BOLD+'the core set of '+color.PURPLE+'ORFS\n'+color.END+color.BOLD+'with a valid '\
|
||||
+color.RED+'10-character code'+color.END+color.BOLD+' for use in the KatzLab\nPhylogenomic Pipeline'\
|
||||
+usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter)
|
||||
|
||||
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
|
||||
|
||||
required_arg_group.add_argument('--input_file','-in', action='store',
|
||||
help=color.BOLD+color.GREEN+' One of the Fasta files that is to be renamed\n'+color.END)
|
||||
required_arg_group.add_argument('--name','-n', action='store',
|
||||
help=color.BOLD+color.GREEN+' A valid 10-Character code for updating the data\n'+color.END)
|
||||
|
||||
|
||||
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
|
||||
|
||||
optional_arg_group.add_argument('-author', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
|
||||
|
||||
if len(sys.argv[1:]) == 0:
|
||||
print (parser.description)
|
||||
print ('\n')
|
||||
sys.exit()
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
quit_eval = return_more_info(args)
|
||||
if quit_eval > 0:
|
||||
print ('\n')
|
||||
sys.exit()
|
||||
|
||||
args.all_output_folder = '/'.join(args.input_file.split('/')[:-3])
|
||||
|
||||
args.r2g_aa = args.all_output_folder + '/ReadyToGo/ReadyToGo_AA/'
|
||||
args.r2g_ntd = args.all_output_folder + '/ReadyToGo/ReadyToGo_NTD/'
|
||||
args.r2g_tsv = args.all_output_folder + '/ReadyToGo/ReadyToGo_TSV/'
|
||||
args.r2g_xml = args.all_output_folder + '/ReadyToGo/ReadyToGo_XML/'
|
||||
|
||||
args.xml_out = args.input_AA.split('/')[-1]+'_1e-10keepall_BlastOutall.oneHit'
|
||||
|
||||
check_code(args)
|
||||
|
||||
return args
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###------------------------------- Script Usage Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def usage_msg():
|
||||
return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 5g_FinalizeName.py'\
|
||||
' --input_file ../Stentor_coeruleus.WGS.CDS.Prep/Stentor_coeruleus.WGS.CDS.Renamed.Universal.AA.fasta'\
|
||||
' --name Sr_ci_Scer'+color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
|
||||
##########################################################################################
|
||||
|
||||
def return_more_info(args):
|
||||
|
||||
valid_args = 0
|
||||
|
||||
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
|
||||
' maurerax@gmail.com\n\n'+color.END)
|
||||
|
||||
if args.author == True:
|
||||
print (author)
|
||||
valid_args += 1
|
||||
|
||||
if args.input_file.endswith('AA.fasta'):
|
||||
args.input_NTD = args.input_file.replace('AA.fasta','NTD.fasta')
|
||||
args.input_AA = args.input_file
|
||||
args.input_TSV = args.input_file.replace('.AA.fasta','_allOGCleanresults.tsv')
|
||||
|
||||
elif args.input_file.endswith('NTD.fasta'):
|
||||
args.input_NTD = args.input_file
|
||||
args.input_AA = args.input_file.replace('NTD.fasta','AA.fasta')
|
||||
args.input_TSV = args.input_file.replace('.NTD.fasta','_allOGCleanresults.tsv')
|
||||
|
||||
if os.path.isfile(args.input_NTD) != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Nucleotide '\
|
||||
'Fasta file ('+color.DARKCYAN+args.input_NTD.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_args += 1
|
||||
|
||||
if os.path.isfile(args.input_AA) != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Protein '\
|
||||
'Fasta file ('+color.DARKCYAN+args.input_AA.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_args += 1
|
||||
|
||||
if os.path.isfile(args.input_TSV) != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Nucleotide '\
|
||||
'Fasta file ('+color.DARKCYAN+args.input_TSV.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_args += 1
|
||||
|
||||
return valid_args
|
||||
|
||||
###########################################################################################
|
||||
###-------------------- Double Checks Format for 10-Character Code ---------------------###
|
||||
###########################################################################################
|
||||
|
||||
def check_code(args):
|
||||
|
||||
check_name = args.name.split('_')
|
||||
|
||||
if len(args.name) != 10:
|
||||
print (color.BOLD+'\n\nNew Species Prefix is not 10 characters long\n\n')
|
||||
print ('Three examples below:\n'+color.CYAN+'\n\tSr_ci_Cunc\n\n\tOp_me_Hsap\n\n\t'\
|
||||
'Am_ar_Ehis\n\n'+color.END)
|
||||
sys.exit()
|
||||
|
||||
elif args.name.count('_') != 2:
|
||||
print (color.BOLD+'\n\nCheck the format of your Species Prefix!\n\n')
|
||||
print ('Three examples below:\n'+color.CYAN+'\n\tSr_ci_Cunc\n\n\tOp_me_Hsap\n\n\t'\
|
||||
'Am_ar_Ehis\n\n'+color.END)
|
||||
sys.exit()
|
||||
|
||||
if len(check_name[0]) == 2 and len(check_name[1]) == 2 and len(check_name[2]) == 4:
|
||||
print (color.BOLD+"\n\nRenaming "+color.ORANGE+args.input_file.split('/')[-1]\
|
||||
.split('_Filtered')[0]+color.END+color.BOLD+"'s files\nusing the following 10-character "\
|
||||
"code: "+color.CYAN+args.name+color.END+'\n')
|
||||
|
||||
else:
|
||||
print (color.BOLD+'\n\nCheck the format of your Species Prefix!\n\n')
|
||||
print ('Three examples below:\n'+color.CYAN+'\n\tSr_ci_Cunc\n\n\tOp_me_Hsap\n\n\t'\
|
||||
'Am_ar_Ehis\n\n'+color.END)
|
||||
sys.exit()
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###------------------------- Creates Folders For Storing Data -------------------------###
|
||||
##########################################################################################
|
||||
|
||||
def prep_folders(args):
|
||||
|
||||
if os.path.isdir(args.all_output_folder + '/ReadyToGo/') != True:
|
||||
os.system('mkdir ' + args.all_output_folder + '/ReadyToGo')
|
||||
|
||||
if os.path.isdir(args.all_output_folder + '/ReadyToGo/ReadyToGo_NTD/') != True:
|
||||
os.system('mkdir '+args.r2g_ntd)
|
||||
if os.path.isdir(args.all_output_folder + '/ReadyToGo/ReadyToGo_AA/') != True:
|
||||
os.system('mkdir '+args.r2g_aa)
|
||||
if os.path.isdir(args.all_output_folder + '/ReadyToGo/ReadyToGo_TSV/') != True:
|
||||
os.system('mkdir '+args.r2g_tsv)
|
||||
if os.path.isdir(args.all_output_folder + '/ReadyToGo/ReadyToGo_XML/') != True:
|
||||
os.system('mkdir '+args.r2g_xml)
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###----------- Renames the NTD and AA CDSs with the Given 10-Character Code ------------###
|
||||
###########################################################################################
|
||||
|
||||
def rename_paralogs(args):
|
||||
|
||||
home_folder = '/'.join(args.input_AA.split('/')[:-2]) + '/'
|
||||
|
||||
print('HOME ' + home_folder)
|
||||
|
||||
print (color.BOLD+'\nRenaming Translated (Protein) '+color.PURPLE+'ORFs\n'+color.END)
|
||||
renamed_Final_Prots = open(args.input_AA).read().replace('>','>'+args.name+'_')
|
||||
|
||||
print (color.BOLD+'\nRenaming Nucleotide '+color.PURPLE+'ORFs\n'+color.END)
|
||||
renamed_Final_Nucs = open(args.input_NTD).read().replace('>','>'+args.name+'_')
|
||||
|
||||
|
||||
print (color.BOLD+'\nUpdating CDS Names in the Spreadsheet'+color.END)
|
||||
if '\n\n' in open(args.input_TSV).read():
|
||||
renamed_Final_tsv = open(args.input_TSV).read().rstrip('\n')\
|
||||
.replace('\n\n','\n'+args.name+'_')
|
||||
else:
|
||||
renamed_Final_tsv = open(args.input_TSV).read().rstrip('\n')\
|
||||
.replace('\n','\n'+args.name+'_')
|
||||
|
||||
with open(home_folder + args.input_AA.split('/')[-1],'w+') as w:
|
||||
w.write(renamed_Final_Prots)
|
||||
|
||||
with open(home_folder + args.input_NTD.split('/')[-1],'w+') as x:
|
||||
x.write(renamed_Final_Nucs)
|
||||
|
||||
with open(home_folder + args.input_TSV.split('/')[-1],'w+') as y:
|
||||
y.write(renamed_Final_tsv)
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------------------- Header/Tail Lines ---------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def header_tail():
|
||||
header = '<?xml version="1.0"?>\n<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">\n'\
|
||||
'<BlastOutput>\n <BlastOutput_program>blastp</BlastOutput_program>\n <BlastOutput_version>BLASTP 2.2.29+</BlastOutput_version>\n'\
|
||||
' <BlastOutput_reference>Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), "Gapped BLAST and PSI-BLAST: a new generation of protein database search programs", Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>\n'\
|
||||
' <BlastOutput_db>../OGBlastDB/renamed_aa_seqs_OrthoMCL-5_12653.fasta</BlastOutput_db>\n <BlastOutput_query-ID>Query_1</BlastOutput_query-ID>\n'
|
||||
|
||||
tail = '</BlastOutput_iterations>\n</BlastOutput>'
|
||||
return header, tail
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###------------------------------- TSV to XML Conversion -------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def convert_TSV_data(args):
|
||||
|
||||
home_folder = '/'.join(args.input_AA.split('/')[:-2])
|
||||
|
||||
TSVforConvert = home_folder+ '/' + args.input_TSV.split('/')[-1]
|
||||
|
||||
inTSV = [line.rstrip('\n') for line in open(TSVforConvert).readlines() if line != '\n']
|
||||
|
||||
iterations = []
|
||||
|
||||
for n in range(len(inTSV)):
|
||||
if n == 0:
|
||||
iterations.append(' <BlastOutput_query-def>'+inTSV[n].split('\t')[0]+'</BlastOutput_query-def>\n <BlastOutput_query-len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'</BlastOutput_query-len>\n'\
|
||||
' <BlastOutput_param>\n <Parameters>\n <Parameters_matrix>BLOSUM62</Parameters_matrix>\n <Parameters_expect>1e-10</Parameters_expect>\n'\
|
||||
' <Parameters_gap-open>11</Parameters_gap-open>\n <Parameters_gap-extend>1</Parameters_gap-extend>\n <Parameters_filter>F</Parameters_filter>\n'\
|
||||
' </Parameters>\n </BlastOutput_param>\n<BlastOutput_iterations>\n<Iteration>\n <Iteration_iter-num>1</Iteration_iter-num>\n <Iteration_query-ID>Query_1</Iteration_query-ID>\n'\
|
||||
' <Iteration_query-def>'+inTSV[n].split('\t')[0]+'</Iteration_query-def>\n <Iteration_query-len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'</Iteration_query-len>\n'\
|
||||
'<Iteration_hits>\n<Hit>\n <Hit_num>1</Hit_num>\n <Hit_id>Fake_Entry</Hit_id>\n <Hit_def>'+inTSV[n].split('\t')[1]+'</Hit_def>\n <Hit_accession>Fake_Accession</Hit_accession>\n'\
|
||||
' <Hit_len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'</Hit_len>\n <Hit_hsps>\n <Hsp>\n <Hsp_num>1</Hsp_num>\n <Hsp_bit-score>1234</Hsp_bit-score>\n'\
|
||||
' <Hsp_score>'+inTSV[n].split('\t')[-1]+'</Hsp_score>\n <Hsp_evalue>'+inTSV[n].split('\t')[-2]+'</Hsp_evalue>\n <Hsp_query-from>'+inTSV[n].split('\t')[-4]+'</Hsp_query-from>\n'\
|
||||
' <Hsp_query-to>'+inTSV[n].split('\t')[-3]+'</Hsp_query-to>\n <Hsp_hit-from>'+inTSV[n].split('\t')[-4]+'</Hsp_hit-from>\n <Hsp_hit-to>'+inTSV[n].split('\t')[-3]+'</Hsp_hit-to>\n'\
|
||||
' <Hsp_query-frame>0</Hsp_query-frame>\n <Hsp_hit-frame>0</Hsp_hit-frame>\n <Hsp_identity>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'</Hsp_identity>\n'\
|
||||
' <Hsp_positive>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'</Hsp_positive>\n <Hsp_gaps>0</Hsp_gaps>\n <Hsp_align-len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'</Hsp_align-len>\n'\
|
||||
' <Hsp_qseq></Hsp_qseq>\n <Hsp_hseq></Hsp_hseq>\n <Hsp_midline></Hsp_midline>\n </Hsp>\n </Hit_hsps>\n</Hit>\n'\
|
||||
'\n</Iteration_hits>\n <Iteration_stat>\n <Statistics>\n <Statistics_db-num>379660</Statistics_db-num>\n <Statistics_db-len>197499634</Statistics_db-len>\n'\
|
||||
' <Statistics_hsp-len>123</Statistics_hsp-len>\n <Statistics_eff-space>184705217500</Statistics_eff-space>\n <Statistics_kappa>0.041</Statistics_kappa>\n'\
|
||||
' <Statistics_lambda>0.267</Statistics_lambda>\n <Statistics_entropy>0.14</Statistics_entropy>\n </Statistics>\n </Iteration_stat>\n</Iteration>\n')
|
||||
else:
|
||||
iterations.append('<Iteration>\n <Iteration_iter-num>'+str(n+1)+'</Iteration_iter-num>\n <Iteration_query-ID>Query_'+str(n+1)+'</Iteration_query-ID>\n'\
|
||||
' <Iteration_query-def>'+inTSV[n].split('\t')[0]+'</Iteration_query-def>\n <Iteration_query-len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'</Iteration_query-len>\n'\
|
||||
'<Iteration_hits>\n<Hit>\n <Hit_num>1</Hit_num>\n <Hit_id>Fake_Entry</Hit_id>\n <Hit_def>'+inTSV[n].split('\t')[1]+'</Hit_def>\n <Hit_accession>Fake_Accession</Hit_accession>\n'\
|
||||
' <Hit_len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'</Hit_len>\n <Hit_hsps>\n <Hsp>\n <Hsp_num>1</Hsp_num>\n <Hsp_bit-score>1234</Hsp_bit-score>\n'\
|
||||
' <Hsp_score>'+inTSV[n].split('\t')[-1]+'</Hsp_score>\n <Hsp_evalue>'+inTSV[n].split('\t')[-2]+'</Hsp_evalue>\n <Hsp_query-from>'+inTSV[n].split('\t')[-4]+'</Hsp_query-from>\n'\
|
||||
' <Hsp_query-to>'+inTSV[n].split('\t')[-3]+'</Hsp_query-to>\n <Hsp_hit-from>'+inTSV[n].split('\t')[-4]+'</Hsp_hit-from>\n <Hsp_hit-to>'+inTSV[n].split('\t')[-3]+'</Hsp_hit-to>\n'\
|
||||
' <Hsp_query-frame>0</Hsp_query-frame>\n <Hsp_hit-frame>0</Hsp_hit-frame>\n <Hsp_identity>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'</Hsp_identity>\n'\
|
||||
' <Hsp_positive>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'</Hsp_positive>\n <Hsp_gaps>0</Hsp_gaps>\n <Hsp_align-len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'</Hsp_align-len>\n'\
|
||||
' <Hsp_qseq></Hsp_qseq>\n <Hsp_hseq></Hsp_hseq>\n <Hsp_midline></Hsp_midline>\n </Hsp>\n </Hit_hsps>\n</Hit>\n'\
|
||||
'\n</Iteration_hits>\n <Iteration_stat>\n <Statistics>\n <Statistics_db-num>379660</Statistics_db-num>\n <Statistics_db-len>197499634</Statistics_db-len>\n'\
|
||||
' <Statistics_hsp-len>123</Statistics_hsp-len>\n <Statistics_eff-space>184705217500</Statistics_eff-space>\n <Statistics_kappa>0.041</Statistics_kappa>\n'\
|
||||
' <Statistics_lambda>0.267</Statistics_lambda>\n <Statistics_entropy>0.14</Statistics_entropy>\n </Statistics>\n </Iteration_stat>\n</Iteration>\n')
|
||||
|
||||
return iterations
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------------- Writes Out the Fake XML File ----------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def write_Fake_XML(args):
|
||||
|
||||
home_folder = '/'.join(args.input_AA.split('/')[:-2]) + '/'
|
||||
|
||||
print (color.BOLD+'\n\nConverting '+color.ORANGE+args.name+'_XX_'+args.input_TSV.split('/')[-1]\
|
||||
+color.END+color.BOLD+' to XML format\n'+color.END)
|
||||
|
||||
header, tail = header_tail()
|
||||
|
||||
iterations = convert_TSV_data(args)
|
||||
|
||||
with open(home_folder+args.xml_out,'w+') as w:
|
||||
w.write(header)
|
||||
w.write(''.join(iterations))
|
||||
w.write(tail)
|
||||
|
||||
##########################################################################################
|
||||
###-------------------- Cleans up the Folder and Moves Final Files --------------------###
|
||||
##########################################################################################
|
||||
def clean_up(args):
|
||||
|
||||
final_folder = '/'.join(args.input_file.split('/')[:-2]) + '/'
|
||||
|
||||
os.system('rm '+args.input_AA)
|
||||
os.system('rm '+args.input_NTD)
|
||||
os.system('rm '+args.input_TSV)
|
||||
|
||||
os.system('cp '+final_folder+'*Renamed.*.AA.fasta '+args.r2g_aa)
|
||||
os.system('cp '+final_folder+'*Renamed.*.NTD.fasta '+args.r2g_ntd)
|
||||
os.system('cp '+final_folder+'*.Renamed.*_allOGCleanresults.tsv '+args.r2g_tsv)
|
||||
os.system('cp '+final_folder+'*oneHit '+args.r2g_xml)
|
||||
|
||||
###########################################################################################
|
||||
###-------------------------------- Next Script Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def next_script(args):
|
||||
|
||||
print (color.BOLD+'\nThere is no next script! The final '+color.ORANGE+args.xml_out\
|
||||
.split('_XX')[0]+color.END+color.BOLD+' files can be\nfound in the '+color.RED+\
|
||||
args.xml_out.split('_XX_')[-1].split('.Renamed')[0]+'.Prep'+color.END+color.BOLD+' and '\
|
||||
+color.RED+'ReadyToGo folders'+color.END+color.BOLD+' and are ready\n'\
|
||||
'for the KatzLab Phylogenomic Tree-Building Steps!\n\n'+color.END)
|
||||
|
||||
##########################################################################################
|
||||
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
|
||||
##########################################################################################
|
||||
|
||||
def main():
|
||||
|
||||
args = check_args()
|
||||
|
||||
prep_folders(args)
|
||||
|
||||
rename_paralogs(args)
|
||||
|
||||
write_Fake_XML(args)
|
||||
|
||||
clean_up(args)
|
||||
|
||||
next_script(args)
|
||||
|
||||
main()
|
||||
274
PTL1/Genomes/Scripts/6_SummaryStats.py
Normal file
274
PTL1/Genomes/Scripts/6_SummaryStats.py
Normal file
@ -0,0 +1,274 @@
|
||||
import os, sys
|
||||
import argparse
|
||||
from Bio import SeqIO
|
||||
import CUB
|
||||
from statistics import mean
|
||||
from math import ceil, floor
|
||||
from tqdm import tqdm
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
|
||||
def get_args():
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
prog = 'PTL6p1 Script 8: Stat Summary',
|
||||
description = "Updated March 31th, 2023 by Auden Cote-L'Heureux"
|
||||
)
|
||||
|
||||
parser.add_argument('-i', '--input', type = str, required = True, help = 'Input path to the "Output" folder produced by PhyloToL Part 1. This folder should contain both the "ReadyToGO" and "Intermediate" folders.')
|
||||
parser.add_argument('-d', '--databases', type = str, default = '../Databases', help = 'Path to databases folder')
|
||||
parser.add_argument('-r', '--r2g_jf', action = 'store_true', help = 'Create ReadyToGo files filtered to only include sequences between the 25th and 75th percentile of silent-site GC content. Please be aware that these are not necessarily the correct or non-contaminant sequences; examine the GC3xENc plots carefully before using these data.')
|
||||
|
||||
#Curate genetic code
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def hook_lens(args):
|
||||
|
||||
print('\nGetting average OG lengths in the Hook DB...')
|
||||
|
||||
len_by_og = { }
|
||||
for file in os.listdir(args.databases + '/db_OG'):
|
||||
if file.endswith('.fasta') and os.path.isfile(args.databases + '/db_OG/' + file.replace('.fasta', '.dmnd')):
|
||||
for rec in tqdm(SeqIO.parse(args.databases + '/db_OG/' + file, 'fasta')):
|
||||
if rec.id[-10:] not in len_by_og:
|
||||
len_by_og.update({ rec.id[-10:] : [] })
|
||||
|
||||
len_by_og[rec.id[-10:]].append(len(str(rec.seq)))
|
||||
|
||||
for og in len_by_og:
|
||||
len_by_og[og] = mean(len_by_og[og])
|
||||
|
||||
return len_by_og
|
||||
|
||||
|
||||
def aa_comp_lengths(args, gcodes):
|
||||
|
||||
print('\nGetting amino acid composition data from ReadyToGo files...')
|
||||
|
||||
r2g_lengths = { }; aa_comp = { }; recid_by_contig_n = { }
|
||||
for file in tqdm([f for f in os.listdir(args.input + '/ReadyToGo/ReadyToGo_AA')]):
|
||||
if file.endswith('.fasta') and file[:10] in gcodes:
|
||||
for rec in SeqIO.parse(args.input + '/ReadyToGo/ReadyToGo_AA/' + file, 'fasta'):
|
||||
r2g_lengths.update({ rec.id : len(str(rec.seq)) * 3 })
|
||||
|
||||
fymink = 0; garp = 0; other = 0; total = 0
|
||||
for char in str(rec.seq):
|
||||
if char in 'FYMINK':
|
||||
fymink += 1
|
||||
elif char in 'GARP':
|
||||
garp += 1
|
||||
else:
|
||||
other += 1
|
||||
|
||||
total += 1
|
||||
|
||||
aa_comp.update({ rec.id : { 'FYMINK' : fymink/total, 'GARP' : garp/total, 'Other' : other/total } })
|
||||
|
||||
recid_by_contig_n.update({ rec.id.split('Contig_')[-1].split('_')[0] : rec.id })
|
||||
|
||||
print('\nGetting transcript sequence data from original assembled transcript files...')
|
||||
|
||||
transcripts = { }; transcript_id_corr = { }
|
||||
for tax in tqdm([f for f in os.listdir(args.input + '/Intermediate/')]):
|
||||
if os.path.isdir(args.input + '/Intermediate/' + tax + '/Original'):
|
||||
for file in os.listdir(args.input + '/Intermediate/' + tax + '/Original'):
|
||||
if file.endswith('_GenBankCDS.fasta'):
|
||||
for rec in SeqIO.parse(args.input + '/Intermediate/' + tax + '/Original/' + file, 'fasta'):
|
||||
transcripts.update({ rec.id : (file[:10], str(rec.seq)) })
|
||||
if rec.id.split('NODE_')[-1].split('_')[0] in recid_by_contig_n:
|
||||
transcript_id_corr.update({ recid_by_contig_n[rec.id.split('NODE_')[-1].split('_')[0]] : rec.id})
|
||||
|
||||
return aa_comp, transcripts, r2g_lengths, transcript_id_corr
|
||||
|
||||
|
||||
def get_nuc_comp(args, gcodes):
|
||||
|
||||
print('\nGetting nucleotide composition data from ReadyToGo files...')
|
||||
|
||||
nuc_comp = { }
|
||||
for file in tqdm([f for f in os.listdir(args.input + '/ReadyToGo/ReadyToGo_NTD')]):
|
||||
if file.endswith('.fasta') and file[:10] in gcodes:
|
||||
cub_out = CUB.CalcRefFasta(args.input + '/ReadyToGo/ReadyToGo_NTD/' + file, gcodes[file[:10]])[0]
|
||||
for k in cub_out:
|
||||
nuc_comp.update({ k : cub_out[k] })
|
||||
|
||||
return nuc_comp
|
||||
|
||||
|
||||
def per_seq(args, nuc_comp, aa_comp, all_transcripts, r2g_lengths, transcript_id_corr):
|
||||
|
||||
og_mean_lens = hook_lens(args)
|
||||
|
||||
if not os.path.isdir(args.input + '/PerSequenceStatSummaries'):
|
||||
os.mkdir(args.input + '/PerSequenceStatSummaries')
|
||||
|
||||
taxa = list(dict.fromkeys([seq[:10] for seq in nuc_comp]))
|
||||
|
||||
for taxon in taxa:
|
||||
with open(args.input + '/PerSequenceStatSummaries/' + taxon + '.csv', 'w') as o:
|
||||
o.write('Sequence,Taxon,OG,Transcript,TranscriptLength,CDSLength,AvgLengthOGinHook,AmbiguousCodons,GC-Overall,GC1,GC2,GC3,GC3-Degen,ExpWrightENc,ObsWrightENc_6Fold,ObsWrightENc_No6Fold,ObsWeightedENc_6Fold,ObsWeightedENc_No6Fold,FYMINK,GARP,OtherAA\n')
|
||||
for rec in nuc_comp:
|
||||
if rec[:10] == taxon:
|
||||
o.write(rec + ',' + rec[:10] + ',' + rec[-10:])
|
||||
|
||||
try:
|
||||
o.write(',' + transcript_id_corr[rec] + ',' + str(len(all_transcripts[transcript_id_corr[rec]][1])))
|
||||
except KeyError:
|
||||
o.write(',NA,NA')
|
||||
|
||||
o.write(',' + str(r2g_lengths[rec]) + ',' + str(og_mean_lens[rec[-10:]]))
|
||||
|
||||
v = nuc_comp[rec]
|
||||
gcs = [str(v.gcOverall), str(v.gc1), str(v.gc2), str(v.gc3), str(v.gc4F)]
|
||||
ENc = [str(v.expENc), str(v.obsENc_6F), str(v.obsENc_No6F), str(v.SunENc_6F),str(v.SunENc_No6F)]
|
||||
o.write(',' + ','.join([str(v.amb_cdn)] + gcs + ENc))
|
||||
|
||||
o.write(',' + str(aa_comp[rec]['FYMINK']) + ',' + str(aa_comp[rec]['GARP']) + ',' + str(aa_comp[rec]['Other']) + '\n')
|
||||
|
||||
|
||||
def per_tax(args, nuc_comp, aa_comp, all_transcripts, r2g_lengths, gcodes):
|
||||
|
||||
taxa = list(dict.fromkeys([seq[:10] for seq in nuc_comp]))
|
||||
|
||||
with open(args.input + '/PerTaxonSummary.csv', 'w') as o:
|
||||
o.write('Taxon,TranscriptsInput,Median_GCTranscripts,IQR_GCTranscripts,Median_LenTranscripts,IRQ_LenTranscripts,SeqsR2G,OGsR2G,Median_GC3R2G,IQR_GC3R2G,Median_ENcR2G,IQR_ENcR2G,Median_LenR2G,IQR_LenR2G,GeneticCode\n')
|
||||
|
||||
for taxon in taxa:
|
||||
try:
|
||||
o.write(taxon)
|
||||
|
||||
transcripts = [all_transcripts[seq][1].upper() for seq in all_transcripts if all_transcripts[seq][0] == taxon]
|
||||
o.write(',' + str(len(transcripts)))
|
||||
|
||||
transcript_gcs = []
|
||||
for transcript in transcripts:
|
||||
transcript_gcs.append((transcript.count('G') + transcript.count('C'))/len(transcript))
|
||||
|
||||
transcript_gcs = sorted(transcript_gcs)
|
||||
o.write(',' + str(transcript_gcs[floor(len(transcripts)*0.5)]))
|
||||
o.write(',' + str(transcript_gcs[floor(len(transcripts)*0.75)] - transcript_gcs[floor(len(transcripts)*0.25)]))
|
||||
|
||||
transcript_lens = sorted([len(transcript) for transcript in transcripts])
|
||||
o.write(',' + str(transcript_lens[floor(len(transcripts)*0.5)]))
|
||||
o.write(',' + str(transcript_lens[floor(len(transcripts)*0.75)] - transcript_lens[floor(len(transcripts)*0.25)]))
|
||||
|
||||
r2g_ntds = [nuc_comp[seq] for seq in nuc_comp if seq[:10] == taxon]
|
||||
o.write(',' + str(len(r2g_ntds)))
|
||||
r2g_ogs = list(dict.fromkeys([seq[-10:] for seq in nuc_comp if seq[:10] == taxon]))
|
||||
o.write(',' + str(len(r2g_ogs)))
|
||||
|
||||
r2g_gc3s = sorted([seq.gc4F for seq in r2g_ntds])
|
||||
o.write(',' + str(r2g_gc3s[floor(len(r2g_ntds)*0.5)]))
|
||||
o.write(',' + str(r2g_gc3s[floor(len(r2g_gc3s)*0.75)] - r2g_gc3s[floor(len(r2g_gc3s)*0.25)]))
|
||||
|
||||
r2g_encs = sorted([seq.obsENc_6F for seq in r2g_ntds])
|
||||
o.write(',' + str(r2g_encs[floor(len(r2g_encs)*0.5)]))
|
||||
o.write(',' + str(r2g_encs[floor(len(r2g_encs)*0.75)] - r2g_encs[floor(len(r2g_encs)*0.25)]))
|
||||
|
||||
tax_r2g_lens = sorted([r2g_lengths[seq] for seq in r2g_lengths if seq[:10] == taxon])
|
||||
o.write(',' + str(tax_r2g_lens[floor(len(tax_r2g_lens)*0.5)]))
|
||||
o.write(',' + str(tax_r2g_lens[floor(len(tax_r2g_lens)*0.75)] - tax_r2g_lens[floor(len(tax_r2g_lens)*0.25)]))
|
||||
|
||||
o.write(',' + gcodes[taxon] + '\n')
|
||||
except:
|
||||
pass
|
||||
|
||||
|
||||
def r2g_jf(args, nuc_comp, gcodes):
|
||||
|
||||
#Q: should there be an maximum IQR cutoff at which we do NOT produce a file here?
|
||||
|
||||
if not os.path.isdir(args.input + '/ReadyToGo/ReadyToGo_NTD_JF'):
|
||||
os.mkdir(args.input + '/ReadyToGo/ReadyToGo_NTD_JF')
|
||||
|
||||
if not os.path.isdir(args.input + '/ReadyToGo/ReadyToGo_AA_JF'):
|
||||
os.mkdir(args.input + '/ReadyToGo/ReadyToGo_AA_JF')
|
||||
|
||||
for file in os.listdir(args.input + '/ReadyToGo/ReadyToGo_NTD'):
|
||||
if file.endswith('.fasta') and file[:10] in gcodes:
|
||||
taxon = file[:10]
|
||||
|
||||
r2g_ntds = [nuc_comp[seq] for seq in nuc_comp if seq[:10] == taxon]
|
||||
r2g_gc3s = sorted([seq.gc4F for seq in r2g_ntds])
|
||||
|
||||
with open(args.input + '/ReadyToGo/ReadyToGo_NTD_JF/' + file.replace('.fasta', '.JF.fasta'), 'w') as o:
|
||||
for rec in SeqIO.parse(args.input + '/ReadyToGo/ReadyToGo_NTD/' + file, 'fasta'):
|
||||
if nuc_comp[rec.id].gc4F > r2g_gc3s[floor(len(r2g_gc3s)*0.25)] and nuc_comp[rec.id].gc4F < r2g_gc3s[floor(len(r2g_gc3s)*0.75)]:
|
||||
o.write('>' + rec.id + '\n' + str(rec.seq) + '\n\n')
|
||||
|
||||
with open(args.input + '/ReadyToGo/ReadyToGo_AA_JF/' + file.replace('.fasta', '.JF.fasta').replace('NTD', 'AA'), 'w') as o:
|
||||
for rec in SeqIO.parse(args.input + '/ReadyToGo/ReadyToGo_AA/' + file.replace('NTD', 'AA'), 'fasta'):
|
||||
if nuc_comp[rec.id].gc4F > r2g_gc3s[floor(len(r2g_gc3s)*0.25)] and nuc_comp[rec.id].gc4F < r2g_gc3s[floor(len(r2g_gc3s)*0.75)]:
|
||||
o.write('>' + rec.id + '\n' + str(rec.seq) + '\n\n')
|
||||
|
||||
|
||||
def plot_jf(args, nuc_comp):
|
||||
|
||||
if not os.path.isdir(args.input + '/GC3xENc_Plots'):
|
||||
os.mkdir(args.input + '/GC3xENc_Plots')
|
||||
|
||||
taxa = list(dict.fromkeys([rec[:10] for rec in nuc_comp]))
|
||||
|
||||
gc3_null = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]
|
||||
enc_null = [31, 31.5958, 32.2032, 32.8221, 33.4525, 34.0942, 34.7471, 35.411, 36.0856, 36.7707, 37.4659, 38.1707, 38.8847, 39.6074, 40.3381, 41.0762, 41.8208, 42.5712, 43.3264, 44.0854, 44.8471, 45.6102, 46.3735, 47.1355, 47.8949, 48.65, 49.3991, 50.1406, 50.8725, 51.593, 52.3, 52.9916, 53.6656, 54.32, 54.9525, 55.561, 56.1434, 56.6975, 57.2211, 57.7124, 58.1692, 58.5898, 58.9723, 59.3151, 59.6167, 59.8757, 60.0912, 60.2619, 60.3873, 60.4668, 60.5, 60.4668, 60.3873, 60.2619, 60.0912, 59.8757, 59.6167, 59.3151, 58.9723, 58.5898, 58.1692, 57.7124, 57.2211, 56.6975, 56.1434, 55.561, 54.9525, 54.32, 53.6656, 52.9916, 52.3, 51.593, 50.8725, 50.1406, 49.3991, 48.65, 47.8949, 47.1355, 46.3735, 45.6102, 44.8471, 44.0854, 43.3264, 42.5712, 41.8208, 41.0762, 40.3381, 39.6074, 38.8847, 38.1707, 37.4659, 36.7707, 36.0856, 35.411, 34.7471, 34.0942, 33.4525, 32.8221, 32.2032, 31.5958, 31]
|
||||
|
||||
for taxon in taxa:
|
||||
comp_data = [(nuc_comp[rec].gc4F, nuc_comp[rec].obsENc_6F) for rec in nuc_comp if rec[:10] == taxon]
|
||||
|
||||
plt.figure()
|
||||
plt.plot(np.array(gc3_null), np.array(enc_null), color = 'black', linewidth=2)
|
||||
plt.scatter(np.array([val[0] for val in comp_data]), np.array([val[1] for val in comp_data]), s = 1)
|
||||
plt.xlabel("GC content (3rd pos, 4-fold sites)")
|
||||
plt.ylabel("Observed Wright ENc (6 Fold)")
|
||||
plt.savefig(args.input + '/GC3xENc_Plots/' + taxon + '.png')
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = get_args()
|
||||
|
||||
valid_codes = ['universal', 'blepharisma', 'chilodonella', 'condylostoma', 'euplotes', 'peritrich', 'vorticella', 'mesodinium', 'tag', 'tga', 'taa', 'none']
|
||||
|
||||
gcodes = { }
|
||||
if os.path.isfile(args.input + '/Intermediate/gcode_output.tsv'):
|
||||
for line in open(args.input + '/Intermediate/gcode_output.tsv'):
|
||||
if len(line.split('\t')) == 5 and line.split('\t')[4].strip().lower() in valid_codes:
|
||||
gcodes.update({ line.split('\t')[0] : line.split('\t')[4].strip() })
|
||||
elif line.split('\t')[4].strip().lower() != '':
|
||||
print('\nInvalid genetic code assignment for taxon ' + line.split('\t')[0] + '. Skipping this taxon in script 6 (summary statistics)\n')
|
||||
else:
|
||||
print('\nGenetic code assignment file (Output/Intermediate/gcode_output.tsv) not found. Quitting script 6 (summary statistics).\n')
|
||||
exit()
|
||||
|
||||
aa_comp, transcripts, r2g_lengths, transcript_id_corr = aa_comp_lengths(args, gcodes)
|
||||
nuc_comp = get_nuc_comp(args, gcodes)
|
||||
|
||||
per_tax(args, nuc_comp, aa_comp, transcripts, r2g_lengths, gcodes)
|
||||
per_seq(args, nuc_comp, aa_comp, transcripts, r2g_lengths, transcript_id_corr)
|
||||
|
||||
if args.r2g_jf:
|
||||
r2g_jf(args, nuc_comp, gcodes)
|
||||
|
||||
plot_jf(args, nuc_comp)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
523
PTL1/Genomes/Scripts/CUB.py
Normal file
523
PTL1/Genomes/Scripts/CUB.py
Normal file
@ -0,0 +1,523 @@
|
||||
#!/usr/bin/env python3
|
||||
# coding=utf-8
|
||||
|
||||
'''Aim of this script is to generate lots of codon usage statistics to aid in
|
||||
identifying useful characteristics for de novo ORF calling'''
|
||||
|
||||
# Author: Xyrus Maurer-Alcalá
|
||||
# Contact: maurerax@gmail.com or xyrus.maurer-alcala@izb.unibe.ch
|
||||
# Last Modified: 2020-09-17
|
||||
# usage: python CUB.py
|
||||
|
||||
# Dependencies:
|
||||
# Python3, numpy, BioPython
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
#import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
#import seaborn as sns
|
||||
|
||||
from Bio import SeqIO
|
||||
from Bio.Seq import Seq
|
||||
from Bio.SeqUtils import GC
|
||||
|
||||
|
||||
class CalcCUB:
|
||||
"""
|
||||
Returns the Effective Number of Codons used (observed and expected)
|
||||
following the equations originally from Wright 1990.
|
||||
"""
|
||||
def expWrightENc(gc3):
|
||||
# Calculates the expected ENc from a sequence's GC3 under Wright 1990
|
||||
if gc3 > 1:
|
||||
# If GC3 looks as though it is > 1 (e.g. 100%), converts to a float ≤ 1.
|
||||
# Calculations expect a value between 0 and 1
|
||||
gc3 = gc3/100
|
||||
exp_enc = 2+gc3+(29/((gc3**2)+(1-gc3)**2))
|
||||
return round(exp_enc, 4)
|
||||
|
||||
def nullENcGC3():
|
||||
# Calculates the expected ENc from the null distribution of GC3
|
||||
# values (0, 100% GC)
|
||||
null = [CalcCUB.expWrightENc(n) for n in np.arange(0,.51,0.01)]
|
||||
null += null[:-1][::-1]
|
||||
return [str(i)+'\t'+str(j) for i, j in zip([n for n in range(0, 101)],null)]
|
||||
|
||||
|
||||
def calcWrightENc(cdnTable):
|
||||
# Follows Wright's (1990) calculations for determining ENc scores.
|
||||
|
||||
def faCalcWright(aa_counts):
|
||||
# Returns the codon homozygosity (fa) for a given "type" of AA (e.g.
|
||||
# 2-fold degeneracy).
|
||||
counts = [i[2] for i in aa_counts]
|
||||
# n_aa --> number of this particular AA
|
||||
n_aa = sum(counts)
|
||||
# fa --> codon homozygosity
|
||||
try:
|
||||
fa = (((n_aa*sum([(i/float(n_aa))**2 for i in counts]))-1)/(n_aa-1))
|
||||
except:
|
||||
fa = 0
|
||||
return fa
|
||||
|
||||
def ENcWright_by_Degen(fa_data):
|
||||
# Same as used in Wright 1990, averages the homozygosity across all codons
|
||||
# of a given class (e.g. 2-fold degeneracy)
|
||||
|
||||
# Codons without any degeneracy (e.g. ATG == M) have 100% homozygosity
|
||||
# and provide a "base" for the ENc score
|
||||
enc = 2
|
||||
for k, v in fa_data.items():
|
||||
non_zero_vals, non_zero_sum = len([i for i in v if i != 0]), sum([i for i in v if i != 0])
|
||||
try:
|
||||
f_aa = non_zero_sum/non_zero_vals
|
||||
except:
|
||||
f_aa = 1
|
||||
enc += k/f_aa
|
||||
return enc
|
||||
|
||||
# Determines the number of degenerate groups to use (i.e. whether 6-Fold
|
||||
# degeneracy is present).
|
||||
degen_cdns = {}
|
||||
for k, v in cdnTable.items():
|
||||
if v[1] not in degen_cdns.keys():
|
||||
degen_cdns[v[1]] = [v[0]]
|
||||
else:
|
||||
if v[0] not in degen_cdns[v[1]]:
|
||||
degen_cdns[v[1]] += [v[0]]
|
||||
|
||||
# Calculates codon homozygosity (fa) for each amino acid. Groups the
|
||||
# resulting values based on the amino acids degeneracy (e.g. 'two-fold').
|
||||
fa_cdns = {len(v):[] for k, v in degen_cdns.items() if 'one' not in k}
|
||||
|
||||
for k, v in degen_cdns.items():
|
||||
# Skip codons lacking degeneracy
|
||||
if 'one' in k:
|
||||
continue
|
||||
|
||||
for aa in v:
|
||||
aa_counts = [cdnTable[k] for k in cdnTable.keys() if cdnTable[k][0] == aa]
|
||||
fa_cdns[len(v)] += [faCalcWright(aa_counts)]
|
||||
enc_val = min(61, round(ENcWright_by_Degen(fa_cdns),4))
|
||||
return enc_val
|
||||
|
||||
def SunEq5(cdnTable):
|
||||
def calcFcf(aa_counts):
|
||||
counts = [i[2] for i in aa_counts]
|
||||
pseudocounts = [i+1 for i in counts]
|
||||
na = sum(pseudocounts)
|
||||
fcf = sum([(i/float(na))**2 for i in pseudocounts]), sum(pseudocounts)
|
||||
return fcf
|
||||
|
||||
ENcWeightedPsuedo = 0
|
||||
degen_cdns = {}
|
||||
|
||||
for k, v in cdnTable.items():
|
||||
if v[1] == 'none':
|
||||
continue
|
||||
if v[1] not in degen_cdns.keys():
|
||||
degen_cdns[v[1]] = [v[0]]
|
||||
else:
|
||||
if v[0] not in degen_cdns[v[1]]:
|
||||
degen_cdns[v[1]] += [v[0]]
|
||||
for k, v in degen_cdns.items():
|
||||
fcf_nc = []
|
||||
for aa in v:
|
||||
aa_counts = [cdnTable[k] for k in cdnTable.keys() if cdnTable[k][0] == aa]
|
||||
fcf_nc.append(calcFcf(aa_counts))
|
||||
weightedENc = (len(fcf_nc) /
|
||||
(sum([i[0]*i[1] for i in fcf_nc]) /
|
||||
sum([i[1] for i in fcf_nc])))
|
||||
ENcWeightedPsuedo += weightedENc
|
||||
return round(ENcWeightedPsuedo,4)
|
||||
|
||||
def calcRCSU(cdnTbl):
|
||||
rscu = {k:[v[0]] for k, v in cdnTbl.items() if v[0].isalpha()}
|
||||
for k, v in rscu.items():
|
||||
try:
|
||||
aa_info = [(key, val[-1]) for key, val in cdnTbl.items() if val[0] == v[0]]
|
||||
aa_cnts = [x[1] for x in aa_info]
|
||||
cdn_rscu = (cdnTbl[k][-1]*len(aa_cnts))/sum(aa_cnts)
|
||||
rscu[k] += [str(round(cdn_rscu,4))]
|
||||
except:
|
||||
rscu[k] += ['0.0']
|
||||
return rscu
|
||||
|
||||
|
||||
class GenUtil(object):
|
||||
"""
|
||||
"Overflow" of functions for now. Just a precaution to make the code a
|
||||
little cleaner/easier to manage.
|
||||
|
||||
This class inclues means to normalize/check the user-provided genetic code,
|
||||
which if not valid will default to the "universal" genetic code.
|
||||
|
||||
Similarly, This class will return the appropriate
|
||||
codon count table and provides a function to update its values.
|
||||
"""
|
||||
def convertGenCode(gCode):
|
||||
# Will interpret the user provided genetic code (gcode) and checks that
|
||||
# it is currently available for use with the NCBI/biopython
|
||||
# supported translation tables. Default is universal.
|
||||
# Dictionary of the possible/functional genetic codes that are supported.
|
||||
# --- Chilodonella and condylostoma are to come!
|
||||
transTable = {'universal':1, 'blepharisma':4,
|
||||
'ciliate':6, 'euplotes':10, 'mesodinium':29, 'myrionecta':29, 'peritrich':30,
|
||||
'1':1, '4':4, '6':6, '10':10, '29':29, '30':30, 'chilo':'chilo'}
|
||||
|
||||
if str(gCode).lower() not in transTable:
|
||||
print("\nWarning: Provided genetic code is not supported (yet).\n")
|
||||
print("Currently running using the UNIVERSAL genetic code.\n\n")
|
||||
print("Alternative genetic codes are as follows (Note: numbers "\
|
||||
"correspond to NCBI genetic code tables):\n")
|
||||
print('\n'.join(list(transTable.keys()))+'\n')
|
||||
return 'Universal',1
|
||||
else:
|
||||
return gCode,transTable[str(gCode).lower()]
|
||||
|
||||
def getCDNtable(gCode):
|
||||
# Returns the appropriate codon table to be used for the ENc calculations.
|
||||
# Universal codon table, with 6-fold degenerate codons split
|
||||
# into four-fold and two-fold groups.
|
||||
universal_no6fold = {
|
||||
'GCT': ['A', 'four', 0], 'GCC': ['A', 'four', 0], 'GCA': ['A', 'four', 0],
|
||||
'GCG': ['A', 'four', 0], 'CGT': ['R', 'four', 0], 'CGC': ['R', 'four', 0],
|
||||
'CGG': ['R', 'four', 0], 'CGA': ['R', 'four', 0], 'AGA': ['R_', 'two', 0],
|
||||
'AGG': ['R_', 'two', 0], 'AAT': ['N', 'two', 0], 'AAC': ['N', 'two', 0],
|
||||
'GAT': ['D', 'two', 0], 'GAC': ['D', 'two', 0], 'TGT': ['C', 'two', 0],
|
||||
'TGC': ['C', 'two', 0], 'CAA': ['Q', 'two', 0], 'CAG': ['Q', 'two', 0],
|
||||
'GAA': ['E', 'two', 0], 'GAG': ['E', 'two', 0], 'GGT': ['G', 'four', 0],
|
||||
'GGC': ['G', 'four', 0], 'GGA': ['G', 'four', 0], 'GGG': ['G', 'four', 0],
|
||||
'CAT': ['H', 'two', 0], 'CAC': ['H', 'two', 0], 'ATT': ['I', 'three', 0],
|
||||
'ATC': ['I', 'three', 0], 'ATA': ['I', 'three', 0], 'ATG': ['M', 'one', 0],
|
||||
'TTA': ['L_', 'two', 0], 'TTG': ['L_', 'two', 0], 'CTT': ['L', 'four', 0],
|
||||
'CTC': ['L', 'four', 0], 'CTA': ['L', 'four', 0], 'CTG': ['L', 'four', 0],
|
||||
'AAA': ['K', 'two', 0], 'AAG': ['K', 'two', 0], 'TTT': ['F', 'two', 0],
|
||||
'TTC': ['F', 'two', 0], 'CCT': ['P', 'four', 0], 'CCC': ['P', 'four', 0],
|
||||
'CCA': ['P', 'four', 0], 'CCG': ['P', 'four', 0], 'TCT': ['S', 'four', 0],
|
||||
'TCC': ['S', 'four', 0], 'TCA': ['S', 'four', 0], 'TCG': ['S', 'four', 0],
|
||||
'AGT': ['S_', 'two', 0], 'AGC': ['S_', 'two', 0], 'ACT': ['T', 'four', 0],
|
||||
'ACC': ['T', 'four', 0], 'ACA': ['T', 'four', 0], 'ACG': ['T', 'four', 0],
|
||||
'TGG': ['W', 'one', 0], 'TAT': ['Y', 'two', 0], 'TAC': ['Y', 'two', 0],
|
||||
'GTT': ['V', 'four', 0], 'GTC': ['V', 'four', 0], 'GTA': ['V', 'four', 0],
|
||||
'GTG': ['V', 'four', 0], 'TAA': ['*', 'none', 0], 'TGA': ['*', 'none', 0],
|
||||
'TAG': ['*', 'none', 0], 'XXX': ['_missing', 'none', 0]}
|
||||
|
||||
# Universal codon table, with 6-fold degenerate codons kept
|
||||
# whole, no splitting! Traditional Universal codon table.
|
||||
universal_6fold = {
|
||||
'GCT': ['A', 'four', 0], 'GCC': ['A', 'four', 0], 'GCA': ['A', 'four', 0],
|
||||
'GCG': ['A', 'four', 0], 'CGT': ['R', 'six', 0], 'CGC': ['R', 'six', 0],
|
||||
'CGG': ['R', 'six', 0], 'CGA': ['R', 'six', 0], 'AGA': ['R', 'six', 0],
|
||||
'AGG': ['R', 'six', 0], 'AAT': ['N', 'two', 0], 'AAC': ['N', 'two', 0],
|
||||
'GAT': ['D', 'two', 0], 'GAC': ['D', 'two', 0], 'TGT': ['C', 'two', 0],
|
||||
'TGC': ['C', 'two', 0], 'CAA': ['Q', 'two', 0], 'CAG': ['Q', 'two', 0],
|
||||
'GAA': ['E', 'two', 0], 'GAG': ['E', 'two', 0], 'GGT': ['G', 'four', 0],
|
||||
'GGC': ['G', 'four', 0], 'GGA': ['G', 'four', 0], 'GGG': ['G', 'four', 0],
|
||||
'CAT': ['H', 'two', 0], 'CAC': ['H', 'two', 0], 'ATT': ['I', 'three', 0],
|
||||
'ATC': ['I', 'three', 0], 'ATA': ['I', 'three', 0], 'ATG': ['M', 'one', 0],
|
||||
'TTA': ['L', 'six', 0], 'TTG': ['L', 'six', 0], 'CTT': ['L', 'six', 0],
|
||||
'CTC': ['L', 'six', 0], 'CTA': ['L', 'six', 0], 'CTG': ['L', 'six', 0],
|
||||
'AAA': ['K', 'two', 0], 'AAG': ['K', 'two', 0], 'TTT': ['F', 'two', 0],
|
||||
'TTC': ['F', 'two', 0], 'CCT': ['P', 'four', 0], 'CCC': ['P', 'four', 0],
|
||||
'CCA': ['P', 'four', 0], 'CCG': ['P', 'four', 0], 'TCT': ['S', 'six', 0],
|
||||
'TCC': ['S', 'six', 0], 'TCA': ['S', 'six', 0], 'TCG': ['S', 'six', 0],
|
||||
'AGT': ['S', 'six', 0], 'AGC': ['S', 'six', 0], 'ACT': ['T', 'four', 0],
|
||||
'ACC': ['T', 'four', 0], 'ACA': ['T', 'four', 0], 'ACG': ['T', 'four', 0],
|
||||
'TGG': ['W', 'one', 0], 'TAT': ['Y', 'two', 0], 'TAC': ['Y', 'two', 0],
|
||||
'GTT': ['V', 'four', 0], 'GTC': ['V', 'four', 0], 'GTA': ['V', 'four', 0],
|
||||
'GTG': ['V', 'four', 0], 'TAA': ['*', 'none', 0], 'TGA': ['*', 'none', 0],
|
||||
'TAG': ['*', 'none', 0], 'XXX': ['_missing', 'none', 0]}
|
||||
|
||||
# Blepharisma (table 4) genetic code codon table, with 6-fold degenerate
|
||||
# codons kept whole, no splitting!
|
||||
blepharisma_6fold = {**universal_6fold,
|
||||
'TGA': ['W', 'two', 0], 'TGG': ['W', 'two', 0],
|
||||
'TAA': ['*', 'two', 0], 'TAG': ['*', 'two', 0]}
|
||||
|
||||
# Blepharisma (table 4) genetic code codon table, with 6-fold degenerate
|
||||
# codons split into four-fold and two-fold groups.
|
||||
blepharisma_no6fold = {**universal_no6fold,
|
||||
'TGA': ['W', 'two', 0], 'TGG': ['W', 'two', 0],
|
||||
'TAA': ['*', 'two', 0], 'TAG': ['*', 'two', 0]}
|
||||
|
||||
# Chilodonella genetic code codon table, with 6-fold degenerate
|
||||
# codons kept whole, no splitting!
|
||||
chilo_6fold = {**universal_6fold,
|
||||
'CAA': ['Q', 'four', 0], 'CAG': ['Q', 'four', 0],
|
||||
'TAA': ['*', 'one', 0], 'TAG': ['Q', 'four', 0],
|
||||
'TGA': ['Q', 'four', 0]}
|
||||
|
||||
# Chilodonella genetic code codon table, with 6-fold degenerate
|
||||
# codons split into four-fold and two-fold groups.
|
||||
# Note that this also splits four-fold degenerate codons that OUGHT to
|
||||
# be in "different" functional categories (e.g. CAG =/= TAG)
|
||||
chilo_no6fold = {**universal_no6fold,
|
||||
'TAA': ['*', 'one', 0], 'TAG': ['Q_', 'one', 0],
|
||||
'TGA': ['Q_', 'one', 0]}
|
||||
|
||||
# Ciliate (table 6) genetic code codon table, with 6-fold degenerate
|
||||
# codons kept whole, no splitting! Traditional ciliate codon table.
|
||||
ciliate_6fold = {**universal_6fold,
|
||||
'CAA': ['Q', 'four', 0], 'CAG': ['Q', 'four', 0],
|
||||
'TAA': ['Q', 'four', 0], 'TAG': ['Q', 'four', 0],
|
||||
'TGA': ['*', 'one', 0]}
|
||||
|
||||
# Ciliate (table 6) genetic code codon table, with 6-fold degenerate
|
||||
# codons split into four-fold and two-fold groups.
|
||||
# Note that this also splits four-fold degenerate codons that OUGHT to
|
||||
# be in "different" functional categories (e.g. CAA =/= TAA)
|
||||
ciliate_no6fold = {**universal_no6fold,
|
||||
'TAA': ['Q_', 'two', 0], 'TAG': ['Q_', 'two', 0],
|
||||
'TGA': ['*', 'one', 0]}
|
||||
|
||||
# Euplotes codon table, with 6-fold degenerate codons kept
|
||||
# whole, no splitting! Traditional Universal codon table.
|
||||
euplotes_6fold = {**universal_6fold,
|
||||
'TGA': ['C', 'three', 0], 'TGT': ['C', 'three', 0],
|
||||
'TGC': ['C', 'three', 0], 'TAA': ['*', 'two', 0],
|
||||
'TAG': ['*', 'two',0]}
|
||||
|
||||
# Euplotes genetic code codon table, with 6-fold degenerate codons
|
||||
# split into four-fold and two-fold groups.
|
||||
euplotes_no6fold = {**universal_no6fold,
|
||||
'TGA': ['C', 'three', 0], 'TGT': ['C', 'three', 0],
|
||||
'TGC': ['C', 'three', 0], 'TAA': ['*', 'two', 0],
|
||||
'TAG': ['*', 'two',0]}
|
||||
|
||||
# Mesodinium/Myrionecta (table 29) genetic code codon table, with 6-fold
|
||||
# degenerate codons kept whole, no splitting! Traditional ciliate codon table.
|
||||
mesodinium_6fold = {**universal_6fold,
|
||||
'TAA': ['Y', 'four', 0], 'TAT': ['Y', 'four', 0],
|
||||
'TAG': ['Y', 'four', 0], 'TAC': ['Y', 'four', 0],
|
||||
'TGA': ['*', 'one', 0]}
|
||||
|
||||
# Mesodinium/Myrionecta (table 29) genetic code codon table, with 6-fold
|
||||
# degenerate codons split into four-fold and two-fold groups.
|
||||
mesodinium_no6fold = {**universal_no6fold,
|
||||
'TAA': ['Y', 'four', 0], 'TAT': ['Y', 'four', 0],
|
||||
'TAG': ['Y', 'four', 0], 'TAC': ['Y', 'four', 0],
|
||||
'TGA': ['*', 'one', 0]}
|
||||
|
||||
# Peritrich (table 30) genetic code codon table, with 6-fold degenerate
|
||||
# codons kept whole, no splitting! Traditional ciliate codon table.
|
||||
peritrich_6fold = {**universal_6fold,
|
||||
'GAA': ['E', 'four', 0], 'GAG': ['E', 'four', 0],
|
||||
'TAA': ['E', 'four', 0], 'TAG': ['E', 'four', 0],
|
||||
'TGA': ['*', 'one', 0]}
|
||||
|
||||
# Peritrich (table 30) genetic code codon table, with 6-fold degenerate
|
||||
# codons split into four-fold and two-fold groups.
|
||||
# Note that this also splits four-fold degenerate codons that OUGHT to
|
||||
# be in "different" functional categories (e.g. CAA =/= TAA)
|
||||
peritrich_no6fold = {**universal_no6fold,
|
||||
'TAA': ['E_', 'two', 0], 'TAG': ['E_', 'two', 0],
|
||||
'TGA': ['*', 'one', 0]}
|
||||
|
||||
cdnTableDict = {1:[universal_no6fold,universal_6fold],
|
||||
4:[blepharisma_no6fold, blepharisma_6fold],
|
||||
6:[ciliate_no6fold,ciliate_6fold],
|
||||
10:[euplotes_no6fold,euplotes_6fold],
|
||||
29:[mesodinium_no6fold,mesodinium_6fold],
|
||||
30:[peritrich_no6fold,peritrich_6fold],
|
||||
'chilodonella':[chilo_no6fold,chilo_6fold],
|
||||
'chilo':[chilo_no6fold,chilo_6fold]}
|
||||
return cdnTableDict[gCode]
|
||||
|
||||
def mapCdns(seq, cdnTable):
|
||||
# Updates the codon counts for a given sequence to the respective codon
|
||||
# count table (e.g. with or without 6-fold degeneracy).
|
||||
codons = [seq[n:n+3] for n in range(0, len(seq)-len(seq)%3, 3)]
|
||||
amb_cdn = 0
|
||||
for c in codons:
|
||||
try:
|
||||
cdnTable[c][-1] += 1
|
||||
except:
|
||||
amb_cdn += 1
|
||||
if cdnTable['TCC'][1] == 'six':
|
||||
return cdnTable, amb_cdn
|
||||
else:
|
||||
return cdnTable
|
||||
|
||||
class GCeval():
|
||||
"""
|
||||
Returns %GC values from DNA sequences of various types.
|
||||
"""
|
||||
def gcTotal(seq):
|
||||
# This function returns global GC content
|
||||
return round(GC(seq), 4)
|
||||
|
||||
def gc1(seq):
|
||||
# This function return the GC content of the first position of a codon
|
||||
return round(GC(''.join([seq[n] for n in range(0, len(seq), 3)])), 4)
|
||||
|
||||
def gc2(seq):
|
||||
# This function return the GC content of the second position of a codon
|
||||
return round(GC(''.join([seq[n] for n in
|
||||
range(1, len(seq)-len(seq[1:]) % 3, 3)])), 4)
|
||||
|
||||
def gc3(seq):
|
||||
# This function return the GC content of the third position of a codon
|
||||
return round(GC(''.join([seq[n] for n in
|
||||
range(2, len(seq)-len(seq[2:]) % 3, 3)])), 4)
|
||||
|
||||
def gc3_4F(cdnTbl):
|
||||
# # This function return the GC content of the third position of four-fold
|
||||
# # degenerate codons
|
||||
FrFold = round(GC(''.join([k[-1]*v[-1] for k, v in cdnTbl.items() if
|
||||
'one' not in v[1]])), 4)
|
||||
return FrFold
|
||||
|
||||
class SeqInfo(object):
|
||||
"""
|
||||
Provides a means to harbor the data for each individual contig/gene in a
|
||||
given fasta file.
|
||||
This includes GC content (various types), Effective Number of codons
|
||||
(ENc; again various calculations), Relative Synonymous Codon Usage (RSCU).
|
||||
"""
|
||||
def __init__(self,seq,gcode='universal'):
|
||||
self.ntd = str(seq)
|
||||
self.gcode, self.transTable = GenUtil.convertGenCode(gcode)
|
||||
# Dictionary of the GC-related functions/calculations
|
||||
self.gcFuncs = {'gcOverall':GCeval.gcTotal,'gc1':GCeval.gc1,'gc2':GCeval.gc2,'gc3':GCeval.gc3}
|
||||
|
||||
def countCodons(self):
|
||||
# Stores the different codon tables and updates their codon counts
|
||||
cdnTbls = GenUtil.getCDNtable(self.transTable)
|
||||
self.cdnCounts_6F,self.amb_cdn = GenUtil.mapCdns(self.ntd, cdnTbls[1])
|
||||
self.cdnCounts_No6F = GenUtil.mapCdns(self.ntd, cdnTbls[0])
|
||||
|
||||
def ENcStats(self):
|
||||
# Stores the various Effective Number of Codons calculations in the class
|
||||
self.expENc = CalcCUB.expWrightENc(self.gc3)
|
||||
self.obsENc_6F = CalcCUB.calcWrightENc(self.cdnCounts_6F)
|
||||
self.obsENc_No6F = CalcCUB.calcWrightENc(self.cdnCounts_No6F)
|
||||
self.SunENc_6F = CalcCUB.SunEq5(self.cdnCounts_6F)
|
||||
self.SunENc_No6F = CalcCUB.SunEq5(self.cdnCounts_No6F)
|
||||
|
||||
def GCstats(self):
|
||||
# Stores the various GC-stats in the class
|
||||
for k, v in self.gcFuncs.items():
|
||||
setattr(self,k,v(self.ntd))
|
||||
self.gc4F = GCeval.gc3_4F(self.cdnCounts_No6F)
|
||||
|
||||
|
||||
def RSCUstats(self):
|
||||
self.rscu_No6Fold = CalcCUB.RSCU(self.cdnCounts_No6F)
|
||||
self.rscu_6Fold = CalcCUB.RSCU(self.cdnCounts_6F)
|
||||
|
||||
|
||||
def prepFolders(outName):
|
||||
if os.path.isdir(outName) == False:
|
||||
os.mkdir(outName)
|
||||
if os.path.isdir(outName+'/Plots') == False:
|
||||
os.mkdir(outName+'/Plots')
|
||||
if os.path.isdir(outName+'/SpreadSheets') == False:
|
||||
os.mkdir(outName+'/SpreadSheets')
|
||||
|
||||
|
||||
def CalcRefFasta(fasta, gCode):
|
||||
seqDB = {i.description:SeqInfo(i.seq, gCode) for i in SeqIO.parse(fasta,'fasta')}
|
||||
GenCDNtable = {}
|
||||
for k, v in seqDB.items():
|
||||
v.countCodons()
|
||||
v.GCstats()
|
||||
v.ENcStats()
|
||||
for k, v in v.cdnCounts_6F.items():
|
||||
if k.isalpha() and k not in GenCDNtable .keys():
|
||||
GenCDNtable[k] = [v[0],v[-1]]
|
||||
else:
|
||||
GenCDNtable[k][-1] += v[-1]
|
||||
RSCU = CalcCUB.calcRCSU(GenCDNtable)
|
||||
return seqDB, RSCU
|
||||
|
||||
|
||||
def WriteWrightOut(seqData, outName, comp):
|
||||
if comp == False:
|
||||
with open(outName+'/SpreadSheets/'+outName.split('/')[-1]+'.ENc.Raw.tsv','w+') as w:
|
||||
w.write('SequenceID\tAmbiguousCodons\tGC-Overall\tGC1\tGC2\tGC3\t'
|
||||
'GC3-Degen\tExpWrightENc\tObsWrightENc_6Fold\tObsWrightENc_No6Fold\t'
|
||||
'ObsWeightedENc_6Fold\tObsWeightedENc_No6Fold\n')
|
||||
for k, v in seqData.items():
|
||||
name = [k]
|
||||
gcs = [str(v.gcOverall),str(v.gc1),str(v.gc2),str(v.gc3),str(v.gc4F)]
|
||||
ENc = [str(v.expENc),str(v.obsENc_6F),str(v.obsENc_No6F),
|
||||
str(v.SunENc_6F),str(v.SunENc_No6F)]
|
||||
w.write('\t'.join(name+[str(v.amb_cdn)]+gcs+ENc)+'\n')
|
||||
else:
|
||||
with open(outName+'/SpreadSheets/'+outName.split('/')[-1]+'.CompTrans.ENc.Raw.tsv','w+') as w:
|
||||
w.write('SequenceID\tAmbiguousCodons\tGC-Overall\tGC1\tGC2\tGC3\t'
|
||||
'GC3-Degen\tExpWrightENc\tObsWrightENc_6Fold\tObsWrightENc_No6Fold\t'
|
||||
'ObsWeightedENc_6Fold\tObsWeightedENc_No6Fold\n')
|
||||
for k, v in seqData.items():
|
||||
name = [k]
|
||||
gcs = [str(v.gcOverall),str(v.gc1),str(v.gc2),str(v.gc3),str(v.gc4F)]
|
||||
ENc = [str(v.expENc),str(v.obsENc_6F),str(v.obsENc_No6F),
|
||||
str(v.SunENc_6F),str(v.SunENc_No6F)]
|
||||
w.write('\t'.join(name+[str(v.amb_cdn)]+gcs+ENc)+'\n')
|
||||
|
||||
|
||||
def getCompFasta(fasta, gCode):
|
||||
print(fasta)
|
||||
stopCDNs = {'1':['TAA','TAG','TGA'], '4':['TAA','TAG'], '6':['TGA'], '10':['TAA','TAG'],
|
||||
'29':['TGA'], '30':['TGA'], 'universal':['TAA','TAG','TGA'], 'blepharisma':['TAA','TAG'],
|
||||
'ciliate':['TGA'],'euplotes':['TAA','TAG'], 'mesodinium':['TGA'], 'peritrich':['TGA'],
|
||||
'chilo':['TAA']}
|
||||
if gCode.lower() not in stopCDNs.keys():
|
||||
stops = stopCDNs['1']
|
||||
else:
|
||||
stops = stopCDNs[gCode]
|
||||
|
||||
with open(fasta.replace('.fasta','.Comp.fasta'),'w+') as w:
|
||||
for i in SeqIO.parse(fasta,'fasta'):
|
||||
#if str(i.seq).upper().startswith('ATG') and str(i.seq).upper()[-3:] in stops:
|
||||
#if str(i.seq).upper()[-3:] in stops:
|
||||
if len(i.seq) % 3 == 0:
|
||||
w.write('>'+i.description+'\n'+str(i.seq)+'\n')
|
||||
|
||||
return fasta.replace('.fasta','.Comp.fasta')
|
||||
|
||||
def WriteNullENcOut(outName):
|
||||
with open(outName+'/SpreadSheets/'+outName.split('/')[-1]+'.ENc.Null.tsv','w+') as w:
|
||||
w.write('GC3\tENc\n')
|
||||
w.write('\n'.join(CalcCUB.nullENcGC3()))
|
||||
|
||||
|
||||
def WriteRSCUtbl(RSCUtbl, outName):
|
||||
with open(outName+'/SpreadSheets/'+outName.split('/')[-1]+'.RSCU.tsv','w+') as w:
|
||||
w.write('Codon\tAmino Acid\tRSCU\n')
|
||||
for k,v in RSCUtbl.items():
|
||||
w.write(k+'\t'+'\t'.join(v)+'\n')
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) < 2:
|
||||
print('\nUsage:\n')
|
||||
print('python CUB.py MyNtds.fasta MyTaxon genetic_code\n')
|
||||
print('\nGenetic Codes:\n')
|
||||
gcd = ['1', '4', '6', '10', '29', '30', 'universal', 'blepharisma',
|
||||
'ciliate','euplotes', 'mesodinium', 'peritrich','chilo']
|
||||
print('\n'.join(gcd)+'\n')
|
||||
sys.exit()
|
||||
fasta = sys.argv[1]
|
||||
try:
|
||||
outName = sys.argv[2]
|
||||
except:
|
||||
print('Missing an output name. Include one, then run again!')
|
||||
sys.exit()
|
||||
try:
|
||||
gCode = sys.argv[3]
|
||||
except:
|
||||
gCode = 'universal'
|
||||
compFasta = getCompFasta(fasta, gCode)
|
||||
prepFolders(outName)
|
||||
fastaDataRaw, RSCUtbl = CalcRefFasta(fasta, gCode)
|
||||
fastaDataComp, RSCUtbl = CalcRefFasta(compFasta, gCode)
|
||||
WriteWrightOut(fastaDataRaw, outName, comp=False)
|
||||
WriteWrightOut(fastaDataComp, outName, comp=True)
|
||||
WriteNullENcOut(outName)
|
||||
WriteRSCUtbl(RSCUtbl, outName)
|
||||
os.system('cp '+fasta+' '+outName+'/')
|
||||
os.system('mv '+compFasta+' '+outName+'/')
|
||||
211
PTL1/Genomes/Scripts/wrapper.py
Normal file
211
PTL1/Genomes/Scripts/wrapper.py
Normal file
@ -0,0 +1,211 @@
|
||||
import os, sys, re
|
||||
import argparse
|
||||
|
||||
|
||||
def get_args():
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
prog = 'PhyloToL v6.0 Part 1 for GenBank Genomes',
|
||||
description = "Updated January 19th, 2023 by Auden Cote-L'Heureux. Link to GitHub: https://github.com/AudenCote/PhyloToL_v6.0"
|
||||
)
|
||||
|
||||
parser.add_argument('-s', '--script', default = -1, type = int, choices = { 1, 2, 3, 4, 5, 6 }, help = 'Script to run if you are only running one script')
|
||||
parser.add_argument('-1', '--first_script', default = -1, type = int, choices = { 1, 2, 3, 4 }, help = 'First script to run')
|
||||
parser.add_argument('-2', '--last_script', default = -1, type = int, choices = { 2, 3, 4, 5 }, help = 'First script to run')
|
||||
parser.add_argument('-c', '--cds', type = str, help = 'Path to a folder of nucleotide CDS. Each file name should start with a unique 10 digit code, and end in "_GenBankCDS.fasta", E.g. Op_me_hsap_GenBankCDS.fasta')
|
||||
parser.add_argument('-o', '--output', default = '../', type = str, help = 'An "Output" folder will be created at this directory to contain all output files. By default this folder will be created at the parent directory of the Scripts folder')
|
||||
parser.add_argument('-g', '--genetic_code', type = str, help = 'If all of your taxa use the same genetic code, you may enter it here (to be used in script 4). Otherwise, stop after script 3 and fill in "gcode_output.tsv" before running script 4')
|
||||
parser.add_argument('-d', '--databases', type = str, default = '../Databases', help = 'Path to databases folder (which should contain db_OG)')
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def script_one(args, ten_digit_codes):
|
||||
|
||||
for file in os.listdir(args.cds):
|
||||
if file[10:] == '_GenBankCDS.fasta' and file[:10] in ten_digit_codes:
|
||||
os.system('python 1_RenameCDS.py -in ' + args.cds + '/' + file + ' -s GenBank -o ' + args.output + '/Output')
|
||||
|
||||
|
||||
def script_two(args):
|
||||
|
||||
valid_codes = ['bleph','blepharisma','chilo','chilodonella','condy', 'condylostoma','none','eup','euplotes','peritrich','vorticella','ciliate','universal','taa','tag','tga','mesodinium']
|
||||
|
||||
for folder in os.listdir(args.output + '/Output'):
|
||||
if os.path.isfile(args.output + '/Output/' + folder + '/' + folder + '_GenBankCDS.Prepped.fasta'):
|
||||
os.system('python 2_GCodeEval.py --input_file ' + args.output + '/Output/' + folder + '/' + folder + '_GenBankCDS.Prepped.fasta')
|
||||
|
||||
gcode_info = []
|
||||
for folder in os.listdir(args.output + '/Output'):
|
||||
if os.path.isfile(args.output + '/Output/' + folder + '/' + folder + '_GenBankCDS.Prepped.GeneticCode.txt'):
|
||||
with open(args.output + '/Output/' + folder + '/' + folder + '_GenBankCDS.Prepped.GeneticCode.txt') as f:
|
||||
gcode_temp = [folder]
|
||||
for line in f:
|
||||
line_sep = line.strip().split('\t')
|
||||
if line_sep[0] == 'TGA':
|
||||
gcode_temp.append(line_sep[1])
|
||||
elif line_sep[0] == 'TAG':
|
||||
gcode_temp.append(line_sep[1])
|
||||
elif line_sep[0] == 'TAA':
|
||||
gcode_temp.append(line_sep[1])
|
||||
|
||||
gcode_info.append(gcode_temp)
|
||||
|
||||
stop = False
|
||||
gcode_file = { }
|
||||
if args.genetic_code.endswith('.txt') or args.genetic_code.endswith('.tsv'):
|
||||
if os.path.isfile(args.genetic_code):
|
||||
for line in open(args.genetic_code):
|
||||
try:
|
||||
if line.split('\t')[1].strip().lower() in valid_codes:
|
||||
gcode_file.update({ line.split('\t')[0] : line.split('\t')[1].strip() })
|
||||
else:
|
||||
print('Genetic code ERROR -- ' + line.split('\t')[1].strip() + ' is not a valid genetic code. Please fill out the "gcode_output.tsv" file and continue with script 3.')
|
||||
except IndexError:
|
||||
print('\nGenetic code ERROR -- it looks like you tried to enter a .txt/.tsv file, but it is improperly formatted. Stopping after script 2; you may fill out the file gcode_output.tsv and continue with script 3.\n')
|
||||
stop = True
|
||||
else:
|
||||
print('\nGenetic code ERROR -- it looks like you tried to enter a .txt/.tsv file, but it could not be found. Stopping after script 2; you may fill out the file gcode_output.tsv and continue with script 3.\n')
|
||||
stop = True
|
||||
|
||||
with open(args.output + '/Output/gcode_output.tsv', 'w') as g:
|
||||
g.writelines('10 Digit Code\tIn-frame TAG Density\tIn-frame TGA Density\tIn-frame TAA Density\tGenetic Code\n')
|
||||
for row in gcode_info:
|
||||
if args.genetic_code == None:
|
||||
g.writelines(row[0] + '\t' + row[1] + '\t' + row[2] + '\t' + row[3] + '\t\n')
|
||||
elif args.genetic_code.lower() in valid_codes:
|
||||
g.writelines(row[0] + '\t' + row[1] + '\t' + row[2] + '\t' + row[3] + '\t' + args.genetic_code + '\n')
|
||||
elif args.genetic_code.endswith('.txt') or args.genetic_code.endswith('.tsv'):
|
||||
try:
|
||||
g.writelines(row[0] + '\t' + row[1] + '\t' + row[2] + '\t' + row[3] + '\t' + gcode_file[row[0]] + '\n')
|
||||
except KeyError:
|
||||
g.writelines(row[0] + '\t' + row[1] + '\t' + row[2] + '\t' + row[3] + '\t\n')
|
||||
print('\nGenetic code ERROR -- it looks like you tried to enter a .txt/.tsv file, but a taxon is missing. Stopping after script 2; you may fill out the file gcode_output.tsv and continue with script 3.\n')
|
||||
stop = True
|
||||
else:
|
||||
stop = True
|
||||
|
||||
if stop or args.genetic_code == None:
|
||||
print('\nStopping after script 2 because genetic code information is incomplete. Please fill out the file "gcode_output.tsv" and continue with script 3.\n')
|
||||
exit()
|
||||
|
||||
|
||||
def script_three(args):
|
||||
|
||||
valid_codes = ['bleph','blepharisma','chilo','chilodonella','condy', 'condylostoma','none','eup','euplotes','peritrich','vorticella','ciliate','universal','taa','tag','tga','mesodinium']
|
||||
|
||||
lines = [line.strip().split('\t') for line in open(args.output + '/Output/gcode_output.tsv', 'r')]
|
||||
with open(args.output + '/Output/gcode_output.tsv', 'r') as g:
|
||||
for folder in os.listdir(args.output + '/Output'):
|
||||
if os.path.isfile(args.output + '/Output/' + folder + '/' + folder + '_GenBankCDS.Prepped.fasta'):
|
||||
for line in lines:
|
||||
if line[0] == folder and line[-1].lower() in valid_codes:
|
||||
os.system('python 3_GCodeTranslate.py --input_file ' + args.output + '/Output/' + folder + '/' + folder + '_GenBankCDS.Prepped.fasta --genetic_code ' + line[-1])
|
||||
elif line[-1].lower() not in valid_codes and line[-1] != 'Genetic Code':
|
||||
print('\n' + line[-1] + ' is not a valid genetic code. Skipping taxon ' + folder + '.\n')
|
||||
|
||||
|
||||
def script_four(args):
|
||||
|
||||
valid_codes = ['universal', 'blepharisma', 'chilodonella', 'condylostoma', 'euplotes', 'peritrich', 'vorticella', 'mesodinium', 'tag', 'tga', 'taa', 'none']
|
||||
|
||||
gcode_by_folder = { line.strip().split('\t')[0] : line.strip().split('\t')[-1] for line in open(args.output + '/Output/gcode_output.tsv', 'r') }
|
||||
for folder in os.listdir(args.output + '/Output'):
|
||||
if os.path.isdir(args.output + '/Output/' + folder):
|
||||
gcode_formatted = gcode_by_folder[folder][0].upper() + gcode_by_folder[folder].lower()[1:]
|
||||
if os.path.isfile(args.output + '/Output/' + folder + '/' + folder + '_GenBankCDS.' + gcode_formatted + '.AA.fasta'):
|
||||
os.system('python 4_CountOGsDiamond.py -in ' + args.output + '/Output/' + folder + '/' + folder + '_GenBankCDS.' + gcode_formatted + '.AA.fasta -t 30 --databases ' + args.databases + ' --evalue 1e-15')
|
||||
|
||||
|
||||
|
||||
def script_five(args):
|
||||
|
||||
gcode_by_folder = { line.strip().split('\t')[0] : line.strip().split('\t')[-1] for line in open(args.output + '/Output/gcode_output.tsv', 'r') }
|
||||
for folder in os.listdir(args.output + '/Output'):
|
||||
if os.path.isdir(args.output + '/Output/' + folder):
|
||||
gcode_formatted = gcode_by_folder[folder][0].upper() + gcode_by_folder[folder].lower()[1:]
|
||||
if os.path.isfile(args.output + '/Output/' + folder + '/' + folder + '_GenBankCDS.Renamed.' + gcode_formatted + '.AA.fasta'):
|
||||
step5_cmd = 'python 5_FinalizeName.py -in ' + args.output + '/Output/' + folder + '/DiamondOG/' + folder + '_GenBankCDS.Renamed.' + gcode_formatted + '.AA.fasta -n ' + folder
|
||||
os.system(step5_cmd)
|
||||
|
||||
os.mkdir(args.output + '/Output/Intermediate')
|
||||
|
||||
for file in os.listdir(args.output + '/Output'):
|
||||
if file != 'ReadyToGo' and file != 'Intermediate':
|
||||
os.system('mv ' + args.output + '/Output/' + file + ' ' + args.output + '/Output/Intermediate')
|
||||
|
||||
os.system('python 6_SummaryStats.py -i ' + args.output + '/Output -d ' + args.databases)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
args = get_args()
|
||||
|
||||
if (args.first_script == 1 or args.script == 1) and not os.path.isdir(args.cds):
|
||||
print('\nIf starting at the first script, a valid path to a folder of nucleotide CDS files (which must end in .fasta) should be input using the --cds argument')
|
||||
quit()
|
||||
|
||||
ten_digit_codes = []
|
||||
if args.first_script == 1 or args.script == 1:
|
||||
for file in os.listdir(args.cds):
|
||||
if file[10:] == '_GenBankCDS.fasta' and '.DS_Store' not in file:
|
||||
ten_digit_codes.append(file[:10])
|
||||
else:
|
||||
if not os.path.isdir(args.output + '/Output'):
|
||||
print('\nA folder called "Output" is not found at the given output path. Enter the correct path for --output or start from script 1.\n')
|
||||
quit()
|
||||
|
||||
if(len(ten_digit_codes) > len(list(dict.fromkeys(ten_digit_codes)))):
|
||||
print('\nDuplicate 10-digit codes are not allowed. Aborting.\n')
|
||||
quit()
|
||||
|
||||
for code in ten_digit_codes:
|
||||
for c, char in enumerate(code):
|
||||
if (c != 2 and c != 5 and char not in 'qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM1234567890') or ((c == 2 or c == 5) and char != '_'):
|
||||
print('\n' + code + ' is an invalid 10-digit code sample identifier. It must of the format Op_me_hsap (Homo sapiens for example). Please ask for help if this does not make sense.\n')
|
||||
quit()
|
||||
|
||||
if os.path.isdir(args.output + '/Output') and (args.first_script == 1 or args.script == 1):
|
||||
print('\nAn "Output" folder already exists at the given path. Please delete or rename this folder and try again.\n')
|
||||
quit()
|
||||
elif os.path.isdir(args.output + '/Output/Intermediate'):
|
||||
print('\nIt looks like this run is already complete. Try deleting/renaming the Output folder and try again.\n')
|
||||
quit()
|
||||
elif not os.path.isdir(args.output + '/Output'):
|
||||
os.mkdir(args.output + '/Output')
|
||||
|
||||
scripts = [0, script_one, script_two, script_three, script_four, script_five]
|
||||
|
||||
if args.script == -1:
|
||||
if args.first_script < args.last_script:
|
||||
for i in range(1 + args.last_script - args.first_script):
|
||||
print('\nRunning script ' + str(i + args.first_script) + '...\n')
|
||||
if i + args.first_script == 1:
|
||||
scripts[i + args.first_script](args, ten_digit_codes)
|
||||
else:
|
||||
scripts[i + args.first_script](args)
|
||||
else:
|
||||
print('\nInvalid script combination: the first script must be less than the last script. If you want to use only once script, use the --script argument.\n')
|
||||
quit()
|
||||
else:
|
||||
if args.script == 1:
|
||||
scripts[args.script](args, ten_digit_codes)
|
||||
else:
|
||||
scripts[args.script](args)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
24
PTL1/Genomes/Scripts/wrapper_submit.sh
Normal file
24
PTL1/Genomes/Scripts/wrapper_submit.sh
Normal file
@ -0,0 +1,24 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
#SBATCH --job-name=PTL1_genome
|
||||
#SBATCH --output=PTL1.%j.out # Stdout (%j expands to jobId)
|
||||
#SBATCH --nodes=1
|
||||
#SBATCH --ntasks=1
|
||||
#SBATCH --ntasks-per-node=64 ##change to number of srun when running multiple instances
|
||||
#SBATCH --mem=160G
|
||||
#SBATCH --mail-type=ALL
|
||||
#SBATCH --mail-user=YOUREMAIL@smith.edu
|
||||
|
||||
module purge #Cleans up any loaded modules
|
||||
module use /gridapps/modules/all #make sure module locations is loaded
|
||||
|
||||
module load slurm
|
||||
module load tqdm
|
||||
module load Biopython/1.75-foss-2019b-Python-3.7.4
|
||||
module load BLAST+/2.9.0-gompi-2019b
|
||||
module load DIAMOND/0.9.30-GCC-8.3.0
|
||||
|
||||
path='/beegfs/fast/katzlab/PTL1/Genomes/'
|
||||
|
||||
srun -D ${path}Scripts python3 ${path}Scripts/wrapper.py -1 1 -2 5 --cds ${path}PTL1GenomesBatches/PTL1GenomesBatch2 -o ${path}Output/PTL1Genomes_OutputBatch2 --genetic_code Universal --databases ${path}Databases &
|
||||
wait
|
||||
Loading…
x
Reference in New Issue
Block a user