Add files via upload

This commit is contained in:
Auden Cote-L'Heureux 2023-06-12 13:30:20 -04:00 committed by GitHub
parent 3347fc979a
commit cc83374ce4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 2591 additions and 0 deletions

View File

@ -0,0 +1,217 @@
#!/usr/bin/env python3.5
##__Updated__: 19_09_2017
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
##__Usage__: python 1g_RenameCDS.py --help
from Bio import SeqIO
from Bio.SeqUtils import GC
import argparse, os, sys, time
from argparse import RawTextHelpFormatter,SUPPRESS
#----------------------------- Colors For Print Statements ------------------------------#
class color:
PURPLE = '\033[95m'
CYAN = '\033[96m'
DARKCYAN = '\033[36m'
ORANGE = '\033[38;5;214m'
BLUE = '\033[94m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
END = '\033[0m'
#------------------------------- Main Functions of Script --------------------------------#
###########################################################################################
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
###########################################################################################
def check_args():
parser = argparse.ArgumentParser(description=
color.BOLD + '\n\nThis script is intended to extract '+color.RED+'Annotated '+\
color.PURPLE+'ORFS\n'+color.END+color.BOLD+'from a provided Genbank formatted file.'\
+usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter)
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
required_arg_group.add_argument('--input_file','-in', action='store',
help=color.BOLD+color.GREEN+' Fasta file with CDSs\n'+color.END)
required_arg_group.add_argument('--output_dir','-o', action='store',
help=color.BOLD+color.GREEN+' Output directory\n'+color.END)
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
optional_arg_group.add_argument('--source','-s', action='store', default='GenBank',
help=color.BOLD+color.GREEN+' Data Source of CDSs (default = "GenBank")\n'+color.END)
optional_arg_group.add_argument('--list_source','-lsrc', action='store_true',
help=color.BOLD+color.GREEN+' Lists supported data sources\n'+color.END)
optional_arg_group.add_argument('-author', action='store_true',
help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
if len(sys.argv[1:]) == 0:
print (parser.description)
print ('\n')
sys.exit()
args = parser.parse_args()
more_info = return_more_info(args)
if more_info != None:
print (parser.description)
print (more_info)
sys.exit()
args.folder = args.output_dir + '/' + args.input_file.split('/')[-1][:10]
return args
###########################################################################################
###------------------------------- Script Usage Message --------------------------------###
###########################################################################################
def usage_msg():
return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 1g_RenameCDS.py'\
' --input_file ../Stentor_coeruleus.WGS.CDS.Prep/Stentor_coeruleus.WGS.CDS.fasta --source'\
' GenBank'+color.END)
##########################################################################################
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
##########################################################################################
def return_more_info(args):
acceptable_sources = ['in-house', 'in-lab', 'GenBank', 'gb', 'NCBI']
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
' maurerax@gmail.com\n\n'+color.END)
if args.author == True:
return author
if args.list_source == True:
print (color.BOLD+color.RED+'\nThese are the currently supported data sources.\n'+color.END)
print (color.BOLD+color.ORANGE+'\n'.join(acceptable_sources)+'\n\n'+color.END)
sys.exit()
if args.source.lower() not in [i.lower() for i in acceptable_sources]:
print (color.BOLD+color.RED+'\nUnsupported source was provided.\n\nEnsure that '\
'you are providing a valid data source (see below).\n'+color.END)
print (color.BOLD+color.ORANGE+'\n'.join(acceptable_sources)+'\n'+color.END)
sys.exit()
if args.input_file != None:
if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
sys.exit()
###########################################################################################
###--------------------------- Does the Inital Folder Prep -----------------------------###
###########################################################################################
def prep_folders(args):
if os.path.isdir(args.folder) != True:
os.system('mkdir '+args.folder)
os.system('cp '+args.input_file+' '+args.folder)
args.input_file = args.folder+'/'+args.input_file.split('/')[-1]
if os.path.isdir(args.folder+'/Original') != True:
os.system('mkdir '+args.folder+'/Original')
os.system('cp '+args.input_file+' '+args.folder+'/Original/')
###########################################################################################
###------------- Renames Protein-Coding CDS Sequences to Standard Format ---------------###
###########################################################################################
def renamed_GenomeCDS(args):
print (color.BOLD+'\n\nPrepping to rename '+color.GREEN+args.input_file.split('/')[-1]+\
color.END+color.BOLD+"'s CDS sequences"+color.END)
inFasta = sorted((i for i in SeqIO.parse(args.input_file,'fasta')),key=lambda seq_rec: -len(seq_rec.seq))
renamed_seqs = []
seq_code_dict = {}
count = 1
for seq_rec in inFasta:
seq_code_dict.setdefault(seq_rec.description,[]).append('Contig_'+str(count)+'_Len'+str(len(seq_rec.seq)))
seq_code_dict[seq_rec.description].append(str(seq_rec.seq).upper())
renamed_seqs.append('>Contig_' + str(count) + '_Len' + str(len(seq_rec.seq)) + '\n' + str(seq_rec.seq).upper())
count += 1
## keeps only CDSs that are greater than 30 bp (10 AA --> This is a cut-off in the
## phylogenomic pipeline too!)
renamed_seqs = [i for i in renamed_seqs if len(i.split('\n')[-1]) > 30]
print (color.BOLD+'\n\nFor '+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+\
color.BOLD+', '+color.RED+str(len(renamed_seqs))+' CDS sequences\n'+color.END+color.BOLD+
'were renamed while preserving the '+color.ORANGE+args.source+color.END+color.BOLD+' formatting'\
+color.END+'\n')
with open(args.input_file.replace('.fasta','.Prepped.fasta'),'w+') as w:
w.write('\n'.join(renamed_seqs))
with open(args.input_file.split('/')[-1].replace('.fasta','.SeqCodes.tsv'),'w+') as w:
w.write('Original Name\tNew Name\tSeq Length\t Seq GC\n')
for k, v in seq_code_dict.items():
w.write(k+'\t'+v[0]+'\t'+str(len(v[1]))+'\t'+str(GC(v[1]))+'\n')
###########################################################################################
###--------------------- Cleans up the Folder and Moves Final Files --------------------###
###########################################################################################
def clean_up(args):
# os.system('rm '+args.input_file)
os.system('mv ' + args.input_file.split('/')[-1].replace('.fasta','.SeqCodes.tsv') + ' ' + args.folder + '/Original/')
os.system('mv ' + args.input_file + ' ' + args.folder + '/Original/')
###########################################################################################
###-------------------------------- Next Script Message --------------------------------###
###########################################################################################
def next_script(args):
print (color.BOLD+'\nLook for '+color.DARKCYAN+args.input_file.split('/')[-1].replace('.fasta','.Renamed.fasta')\
+'.fasta'+color.END+color.BOLD+'\nin the '+color.ORANGE+args.folder.split('/')[-1]+\
' Folder\n\n'+color.END+color.BOLD)
print ('Next Script(s) are:\n\n'+color.PURPLE+'2g_GCodeEval.py'+color.END+color.BOLD\
+' (if Genetic Code is '+color.RED+'Unknown'+color.END+color.BOLD+')\n\nOtherwise:\n\n'+\
color.PURPLE+'3g_GCodeTranslate.py\n\n'+color.END)
##########################################################################################
###----------------------------- Calls on Above Functions -----------------------------###
##########################################################################################
def main():
args = check_args()
prep_folders(args)
renamed_GenomeCDS(args)
clean_up(args)
next_script(args)
main()

View File

@ -0,0 +1,252 @@
#!/usr/bin/env python3.5
##__Updated__: 19_09_2017
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
##__Usage__: python 2g_GCodeEval.py --help
#############################################################################################
# #
# Suggests which Genetic Code to use based upon Presence/Absence of Specific Stop Codons #
# at the end of the CDS sequences. This is to provide a ROUGH gauge for the user. #
# #
#############################################################################################
import argparse, os, sys
from argparse import RawTextHelpFormatter,SUPPRESS
from Bio import SeqIO
from Bio.Seq import Seq
#----------------------------- Colors For Print Statements ------------------------------#
class color:
PURPLE = '\033[95m'
CYAN = '\033[96m'
DARKCYAN = '\033[36m'
ORANGE = '\033[38;5;214m'
BLUE = '\033[94m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
END = '\033[0m'
#------------------------------- Main Functions of Script --------------------------------#
###########################################################################################
###------------------------- Checks the Command Line Arguments -------------------------###
###########################################################################################
def check_args():
parser = argparse.ArgumentParser(description=
color.BOLD + '\n\nThis script is intended to aid you with '+color.RED+'evaluating\n(or checking) '+\
color.END+color.BOLD+'the putative '+color.PURPLE+'Genetic Code'+color.END+color.BOLD+\
' for a given\nFasta file of annotated (and untranslated) CDSs.\n\nTo do so, this script'\
' checks for stop codon usages,\n'+color.RED+'suggesting '+color.END+color.BOLD+'the use of'\
+color.PURPLE+' published and well-known\nalternate genetic codes'+color.END+color.BOLD+\
' that are supported by the\nnext script: '+color.END+color.BOLD+color.PURPLE+'3g_GCodeTranslate.py'\
+usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter)
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
required_arg_group.add_argument('--input_file','-in', action='store',
help=color.BOLD+color.GREEN+' Fasta file with CDSs\n'+color.END)
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
optional_arg_group.add_argument('--list_codes','-codes', action='store_true',
help=color.BOLD+color.GREEN+' Lists supported genetic codes\n'+color.END)
optional_arg_group.add_argument('-author', action='store_true',
help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
if len(sys.argv[1:]) == 0:
print (parser.description)
print ('\n')
sys.exit()
args = parser.parse_args()
quit_eval = return_more_info(args)
if quit_eval > 0:
sys.exit()
args.folder = '/'.join(args.input_file.split('/')[:-1])
return args
###########################################################################################
###------------------------------- Script Usage Message --------------------------------###
###########################################################################################
def usage_msg():
return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 2g_GCodeEval.py'\
' --input_file ../Stentor_coeruleus.WGS.CDS.Prep/Stentor_coeruleus.WGS.CDS.Renamed.fasta'+color.END)
##########################################################################################
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
##########################################################################################
def return_more_info(args):
valid_arg = 0
supported_gcodes = ['Blepharisma\t(TGA = W)','Chilodonella\t(TAG/TGA = Q)','Ciliate\t\t(TAR = Q)',\
'Conylostoma\t(TAR = Q, TGA = W)','Euplotes\t(TGA = C)','Peritrich\t(TAR = E)','None\t\t(TGA/TAG/TAA = X)',\
'Universal\t(TGA/TAG/TAA = STOP)','TAA\t\t(TAG/TGA = Q)', 'TAG\t\t(TRA = Q)', 'TGA\t\t(TAR = Q)']
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
' maurerax@gmail.com\n\n'+color.END)
if args.list_codes == True:
print (color.BOLD+color.RED+'\nThese are the currently supported genetic codes.\n'+color.END)
print (color.BOLD+color.ORANGE+'\n'.join(supported_gcodes)+'\n\n'+color.END)
valid_arg += 1
if args.author == True:
print (author)
valid_arg += 1
print(args.input_file.split('/')[-1], '/'.join(args.input_file.split('/')[:-1]))
if args.input_file != None:
if os.path.isfile(args.input_file) != False:
if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
valid_arg += 1
else:
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
valid_arg += 1
return valid_arg
###########################################################################################
###-------------------- Counts Several Metrics of Stop Codon Usage ---------------------###
###########################################################################################
def count_stops(args):
print (color.BOLD+'\n\nScanning CDSs for In-Frame Stop Codons and Tracking\nFINAL '\
'(Terminal) stop codon usage\n\n'+color.END)
inFasta = [i for i in SeqIO.parse(args.input_file,'fasta')]
seq_ends = [str(i.seq)[-3:].lower() for i in inFasta]
inFrame_stops_raw = [str(i.seq[:-3].translate()).count('*') for i in inFasta]
inFrame_stops_summary = [i for i in inFrame_stops_raw if i != 0]
tga_end = seq_ends.count('tga')
tag_end = seq_ends.count('tag')
taa_end = seq_ends.count('taa')
end_stop_freq = [tga_end, tag_end, taa_end]
if max(end_stop_freq) > 0.95*sum(end_stop_freq):
pos_to_keep = [i for i, j in enumerate(end_stop_freq) if j == max(end_stop_freq)][0]
try:
if pos_to_keep == 0:
end_stop_freq = [end_stop_freq[0],0,0]
elif pos_to_keep == 1:
end_stop_freq = [0,end_stop_freq[1],0]
elif pos_to_keep == 2:
end_stop_freq = [0,0,end_stop_freq[2]]
except:
pass
inFrame_stop_info = [len(inFrame_stops_summary), int(round(len(inFrame_stops_raw)*0.05)), sum(inFrame_stops_summary)]
return end_stop_freq, inFrame_stop_info
###########################################################################################
###-------------------- Suggests Genetic Code Given Stop Codon Usage -------------------###
###########################################################################################
def suggest_code(args):
stop_freq, inFrames = count_stops(args)
genetic_code = ''
if stop_freq.count(0) == 3:
print (color.BOLD + color.RED + '\n\nNO Stop Codons Present in Data-set\n\n')
genetic_code = 'None (UNDETERMINED -- NO STOP CODONS)'
else:
## DUMB way of checking if there are a significant (> 5%) number of CDSs with IN-FRAME stop codons
if inFrames[0] < inFrames[1]:
print (color.BOLD + '\n\nSuggested Genetic Code is: '+color.CYAN+' Universal (table = 1)'+color.END)
genetic_code = 'Universal (table = 1)'
else:
if stop_freq[0] != 0 and stop_freq[1] != 0 and stop_freq[2] != 0:
print (color.BOLD + '\n\nSuggested Genetic Code is: '+color.CYAN+' Condylostoma-Code'\
' (No Dedicated Stops) OR None (all stops = "X")'+color.END)
genetic_code = 'Condylostoma or None'
if stop_freq[0] == 0 and stop_freq[1] == 0:
print (color.BOLD + '\n\nSuggested Genetic Code is: '+color.CYAN+' Chilodonella-Code'\
+' (Only Stop = TAA)'+color.END)
genetic_code = 'Chilodonella or TAA'
if stop_freq[0] == 0 and stop_freq[2] == 0:
print (color.BOLD + '\n\nSuggested Genetic Code is: '+color.CYAN+' TAG-Code'\
+' (Only Stop = TAG)'+color.END)
genetic_code = 'TAG'
if stop_freq[1] == 0 and stop_freq[2] == 0:
print (color.BOLD + '\n\nSuggested Genetic Code is: '+color.CYAN+' Ciliate-Code'\
+' (table = 6)'+color.END)
genetic_code = 'Ciliate (table = 6)'
if stop_freq[0] != 0 and stop_freq[1] != 0 and stop_freq[2] == 0:
print (color.BOLD + '\n\nSuggested Genetic Code is: '+color.CYAN+' TGA/TAG are STOP'+color.END)
genetic_code = 'TGA/TAG'
if stop_freq[0] != 0 and stop_freq[1] == 0 and stop_freq[2] != 0:
print (color.BOLD + '\n\nSuggested Genetic Code is: '+color.CYAN+' TGA/TAA are STOP'+color.END)
genetic_code = 'TGA/TAA'
if stop_freq[0] == 0 and stop_freq[1] != 0 and stop_freq[2] != 0:
print (color.BOLD + '\n\nSuggested Genetic Code is: '+color.CYAN+' Blepharisma/Euplotes-Codes'\
+color.END + color.BOLD+'\n--- NOTE: '+color.RED+' Stop-Codon Reassignments'\
+' differ! (TGA = W or TGA = C)' + color.END)
genetic_code = 'Blepharisma (TGA = W) or Euplotes (TGA = C)'
return genetic_code, stop_freq
###########################################################################################
###---------------- Writes Out Currently Crummy Summary of Genetic Codes ---------------###
###########################################################################################
def summarize(args):
suggestion, stop_freq = suggest_code(args)
with open(args.input_file.split('.fa')[0]+'.GeneticCode.txt','w+') as w:
w.write('Stop Codon\tFrequency\n')
w.write('TGA\t'+str(stop_freq[0])+'\n')
w.write('TAG\t'+str(stop_freq[1])+'\n')
w.write('TAA\t'+str(stop_freq[2])+'\n\n')
w.write('Suggestion For Genetic Code:\t'+suggestion+'\n\n')
##########################################################################################
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
##########################################################################################
def main():
args = check_args()
summarize(args)
print (color.BOLD+'\nNext Script is: '+color.PURPLE+' 3g_GCodeTranslate.py\n\n'+color.END)
main()

View File

@ -0,0 +1,397 @@
#!/usr/bin/env python3.5
##__Updated__: 19_09_2017
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
##__Usage__: python 3g_GCodeTranslate.py --help
##############################################################################
## ##
## Translates CDSs sequences using the Provided Genetic Code. ##
## ##
## NOTE: ##
## No provided input for genetic code results in Translation with the ##
## UNIVERSAL genetic code (as default) ##
## ##
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
## ##
##############################################################################
import argparse, os, sys
from argparse import RawTextHelpFormatter,SUPPRESS
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Data.CodonTable import CodonTable
#-------------------------- Set-up Codon Tables (Genetic Codes) --------------------------#
blepharisma_table = CodonTable(forward_table={
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
'TAT': 'Y', 'TAC': 'Y',
'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
start_codons = [ 'ATG'],
stop_codons = ['TAA','TAG'])
condylostoma_table = CodonTable(forward_table={
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Q', 'TAG': 'Q',
'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
start_codons = [ 'ATG'],
stop_codons = [''])
c_uncinata_table = CodonTable(forward_table={
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
'TAT': 'Y', 'TAC': 'Y', 'TAG': 'Q',
'TGT': 'C', 'TGC': 'C', 'TGA': 'Q', 'TGG': 'W',
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
start_codons = [ 'ATG'],
stop_codons = ['TAA'])
euplotes_table = CodonTable(forward_table={
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
'TAT': 'Y', 'TAC': 'Y',
'TGT': 'C', 'TGC': 'C', 'TGA': 'C', 'TGG': 'W',
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
start_codons = [ 'ATG'],
stop_codons = ['TAA','TAG'])
myrionecta_table = CodonTable(forward_table={
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Y', 'TAG': 'Y',
'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
start_codons = [ 'ATG'],
stop_codons = ['TGA'])
no_stop_table = CodonTable(forward_table={
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'X', 'TAG': 'X',
'TGT': 'C', 'TGC': 'C', 'TGA': 'X', 'TGG': 'W',
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
start_codons = [ 'ATG'],
stop_codons = [''])
peritrich_table = CodonTable(forward_table={
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'E', 'TAG': 'E',
'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
start_codons = [ 'ATG'],
stop_codons = ['TGA'])
tag_table = CodonTable(forward_table={
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Q',
'TGT': 'C', 'TGC': 'C', 'TGA': 'Q', 'TGG': 'W',
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
start_codons = [ 'ATG'],
stop_codons = ['TAG'])
#----------------------------- Colors For Print Statements ------------------------------#
class color:
PURPLE = '\033[95m'
CYAN = '\033[96m'
DARKCYAN = '\033[36m'
ORANGE = '\033[38;5;214m'
BLUE = '\033[94m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
END = '\033[0m'
#------------------------------- Main Functions of Script --------------------------------#
###########################################################################################
###------------------------- Checks the Command Line Arguments -------------------------###
###########################################################################################
def check_args():
parser = argparse.ArgumentParser(description=
color.BOLD + '\n\nThis script will '+color.RED+'Translate '+color.END+color.BOLD+'a '\
'given Fasta file of CDS\nsequences using a given'+color.PURPLE+' Genetic Code.'+color.END+\
color.BOLD+usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter)
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
required_arg_group.add_argument('--input_file','-in', action='store',
help=color.BOLD+color.GREEN+' Fasta file with CDSs\n'+color.END)
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
optional_arg_group.add_argument('--genetic_code','-g', action='store', default='universal',
help=color.BOLD+color.GREEN+' Genetic code to use for translation\n (default = '\
'"universal")\n'+color.END)
optional_arg_group.add_argument('--list_codes','-codes', action='store_true',
help=color.BOLD+color.GREEN+' Lists supported genetic codes\n'+color.END)
optional_arg_group.add_argument('-author', action='store_true',
help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
if len(sys.argv[1:]) == 0:
print (parser.description)
print ('\n')
sys.exit()
args = parser.parse_args()
quit_eval = return_more_info(args)
if quit_eval > 0:
sys.exit()
args.folder = '../'+args.input_file.split('/')[1]
args.out_name = args.input_file.split('.Prepped')[0]+'.'+args.genetic_code.title()+'.AA.fasta'
args.new_ntd_name = args.input_file.split('.Prepped')[0]+'.'+args.genetic_code.title()+'.NTD.fasta'
return args
###########################################################################################
###------------------------------- Script Usage Message --------------------------------###
###########################################################################################
def usage_msg():
return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 3g_GCodeTranslate.py'\
' --input_file ../Stentor_coeruleus.WGS.CDS.Prep/Stentor_coeruleus.WGS.CDS.Prepped.fasta'\
' --genetic_code Universal'+color.END)
##########################################################################################
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
##########################################################################################
def return_more_info(args):
valid_arg = 0
supported_gcodes_names = ['bleph','blepharisma','chilo','chilodonella','condy',\
'condylostoma','none','eup','euplotes','peritrich','vorticella','ciliate','universal',\
'taa','tag','tga']
supported_gcodes_list = ['Blepharisma\t(TGA = W)','Chilodonella\t(TAG/TGA = Q)','Ciliate\t\t(TAR = Q)',\
'Conylostoma\t(TAR = Q, TGA = W)','Euplotes\t(TGA = C)','Peritrich\t(TAR = E)','None\t\t(TGA/TAG/TAA = X)',\
'Universal\t(TGA/TAG/TAA = STOP)','TAA\t\t(TAG/TGA = Q)', 'TAG\t\t(TRA = Q)', 'TGA\t\t(TAR = Q)']
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
' maurerax@gmail.com\n\n'+color.END)
if args.genetic_code != None and args.genetic_code.lower() not in supported_gcodes_names:
print (color.BOLD+color.RED+'\nProvided genetic code is currently unsupported.\n\n'\
'If you have a new genetic code, please contact the author (with some evidence).\n\n'\
'Otherwise, use one of the currently supported genetic codes.\n'+color.END)
print (color.BOLD+color.ORANGE+'\n'.join(supported_gcodes_list)+'\n\n'+color.END)
print (author)
valid_arg += 1
else:
if args.list_codes == True:
print (color.BOLD+color.RED+'\nThese are the currently supported genetic codes.\n'+color.END)
print (color.BOLD+color.ORANGE+'\n'.join(supported_gcodes_list)+'\n\n'+color.END)
valid_arg += 1
if args.author == True:
print (author)
valid_arg += 1
if args.input_file != None:
if os.path.isfile(args.input_file) != False:
if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
valid_arg += 1
else:
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
valid_arg += 1
return valid_arg
##########################################################################################
###------------------ Translates CDSs from the Provided Genetic Code ------------------###
##########################################################################################
def translate_seqs(args):
inFasta = [i for i in SeqIO.parse(args.input_file,'fasta')]
print (color.BOLD+'\n\n\nTranslating: '+color.CYAN+args.input_file.split('/')[-1]+color.END+\
color.BOLD+'\nwith the '+color.GREEN+args.genetic_code.upper()+' Genetic Code\n'+color.END)
if args.genetic_code.lower() == 'ciliate' or args.genetic_code.lower() == 'tga':
translated_seqs = ['>'+seq_rec.description+'\n'+str(seq_rec.seq.translate(table=6)).rstrip('*').replace('*','X')+'\n' for seq_rec in inFasta]
if args.genetic_code.lower() == 'peritrich' or args.genetic_code.lower() == 'vorticella':
translated_seqs = ['>'+seq_rec.description+'\n'+str(seq_rec.seq.translate(table=peritrich_table)).rstrip('*').replace('*','X')+'\n' for seq_rec in inFasta]
if args.genetic_code.lower() == 'tag':
translated_seqs = ['>'+seq_rec.description+'\n'+str(seq_rec.seq.translate(table=tag_table)).rstrip('*').replace('*','X')+'\n' for seq_rec in inFasta]
if args.genetic_code.lower() == 'chilo' or args.genetic_code.lower() == 'chilodonella' or args.genetic_code.lower() == 'taa':
translated_seqs = ['>'+seq_rec.description+'\n'+str(seq_rec.seq.translate(table=c_uncinata_table)).rstrip('*').replace('*','X')+'\n' for seq_rec in inFasta]
if args.genetic_code.lower() == 'bleph' or args.genetic_code.lower() == 'blepharisma':
translated_seqs = ['>'+seq_rec.description+'\n'+str(seq_rec.seq.translate(table=blepharisma_table)).rstrip('*').replace('*','X')+'\n' for seq_rec in inFasta]
if args.genetic_code.lower() == 'eup' or args.genetic_code.lower() == 'euplotes':
translated_seqs = ['>'+seq_rec.description+'\n'+str(seq_rec.seq.translate(table=euplotes_table)).rstrip('*').replace('*','X')+'\n' for seq_rec in inFasta]
if args.genetic_code.lower() == 'universal':
translated_seqs = ['>'+seq_rec.description+'\n'+str(seq_rec.seq.translate(table=1)).rstrip('*').replace('*','X')+'\n' for seq_rec in inFasta]
return translated_seqs
##########################################################################################
###---------------------------- Writes Out Translated CDSs ----------------------------###
##########################################################################################
def write_out(args):
translated_seqs = translate_seqs(args)
## Keep only ORFs greater than 10 amino acids long
translated_seqs = [i for i in translated_seqs if len(i.split('\n')[1]) > 10]
print (color.BOLD+'\nTranslated '+color.ORANGE+str(len(translated_seqs))+color.END\
+color.BOLD+' seqeunces using the '+color.GREEN+args.genetic_code.upper()+' Genetic Code\n\n'+color.END)
with open(args.out_name,'w+') as w:
w.write(''.join(translated_seqs))
##########################################################################################
###--------------------- Cleans up the Folder and Moves Final Files -------------------###
##########################################################################################
def clean_up(args):
os.system('mv '+args.input_file+' '+args.new_ntd_name)
##########################################################################################
###----------------------------- Calls on Above Functions -----------------------------###
##########################################################################################
def main():
args = check_args()
write_out(args)
clean_up(args)
print (color.BOLD+'Next Script is: '+color.PURPLE+' 4g_CountOgsUsearch.py\n\n'+color.END)
main()

View File

@ -0,0 +1,319 @@
#!/usr/bin/env python3.5
##__Updated__: 19_09_2017
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
##__Usage__: python 3g_GCodeTranslate.py --help
##############################################################################
## ##
## This scrip will categorize TRANSLATED CDSs into Homologous Gene Families ##
## ##
## Questions about Gene Family Binning/Source? SEE NOTES at Bottom! ##
## ##
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
## ##
##############################################################################
import argparse, os, re, sys
from argparse import RawTextHelpFormatter, SUPPRESS
from distutils import spawn
from Bio import SeqIO
#----------------------------- Colors For Print Statements ------------------------------#
class color:
PURPLE = '\033[95m'
CYAN = '\033[96m'
DARKCYAN = '\033[36m'
ORANGE = '\033[38;5;214m'
BLUE = '\033[94m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
END = '\033[0m'
#------------------------------ UPDATE DIAMOND PATH BELOW! -------------------------------#
def check_diamond_path():
### IF Diamond is IN YOUR PATH then no updating is needed...
diamond_path = ''
if diamond_path == '':
diamond_path = spawn.find_executable("diamond")
#diamond_path = /path/to/diamond
else:
pass
if diamond_path == None:
print (color.BOLD + '\n\nPlease open this script and check that you have included'\
+ ' the PATH to the' + color.BLUE + ' "diamond" '+ color.END + color.BOLD\
+ 'executable.\n\n' + color.END)
print (color.BOLD + color.BLUE + 'LOOK FOR:\n\n' + color.RED\
+'#------------------------------ UPDATE DIAMOND PATH BELOW! -------------------------------#'\
+ color.BLUE + '\n\nThis is somewhere around lines 55 - 80...\n\n' + color.END)
sys.exit()
else:
pass
return diamond_path
#------------------------------- Main Functions of Script --------------------------------#
###########################################################################################
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
###########################################################################################
def check_args():
parser = argparse.ArgumentParser(description=
color.BOLD + '\n\nThis script will categorize Contigs into'+color.ORANGE+' "Homologous" '\
+color.END+color.BOLD+'Gene Families (OGs)\nbased on '+color.RED+'OrthoMCL'+color.END\
+color.BOLD+"'s Gene Family Grouping\n\n\nNotes on this script and "+color.GREEN+\
'OrthoMCL Families'+color.END+color.BOLD+' can be found\nat the bottom of '+color.GREEN\
+'THIS script (4_CountOGsDiamond.py)\n'+color.END+usage_msg(), usage=SUPPRESS,
formatter_class=RawTextHelpFormatter)
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
required_arg_group.add_argument('--input_file','-in', action='store',
help=color.BOLD+color.GREEN+'Fasta file of Nucleotide sequences enriched \nwith'\
' Eukaryotic protein coding transcripts'+color.END)
required_arg_group.add_argument('--databases','-d', action='store',
help=color.BOLD+color.GREEN+'Path to folder containing db_OG'+color.END)
required_arg_group.add_argument('--evalue','-e', action='store',
help=color.BOLD+color.GREEN+'Maximum OG assignment e-value'+color.END)
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
optional_arg_group.add_argument('--threads','-t', default='2',
help=color.BOLD+color.GREEN+' Number of threads to use for BLAST\n (default = 2)\n'+color.END)
optional_arg_group.add_argument('-author', action='store_true',
help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
if len(sys.argv[1:]) == 0:
print (parser.description)
print ('\n')
sys.exit()
args = parser.parse_args()
quit_eval = return_more_info(args)
if quit_eval > 0:
sys.exit()
args.diamond = check_diamond_path()
args.home_folder = '/'.join(args.input_file.split('/')[:-1]) + '/'
args.tsv_out = args.home_folder + args.input_file.split('/')[-1].replace('CDS','CDS.Renamed').replace('.AA.fasta','_allOGCleanresults.tsv')
args.aa_out = args.home_folder + args.input_file.split('/')[-1].replace('CDS','CDS.Renamed')
args.ntd_out = args.home_folder + args.input_file.split('/')[-1].replace('CDS','CDS.Renamed').replace('AA','NTD')
return args
###########################################################################################
###------------------------------- Script Usage Message --------------------------------###
###########################################################################################
def usage_msg():
return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 4_CountOGsDiamond.py'\
' --input_file ../Stentor_coeruleus.WGS.CDS.Prep/Stentor_coeruleus.WGS.CDS.Universal.AA.fasta'+color.END)
##########################################################################################
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
##########################################################################################
def return_more_info(args):
valid_arg = 0
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
' maurerax@gmail.com\n\n'+color.END)
if args.author == True:
print (author)
valid_arg += 1
if args.input_file != None:
if os.path.isfile(args.input_file) != False:
if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
valid_arg += 1
elif args.input_file.endswith('AA.fasta') != True:
print (color.BOLD+'\n\nInvalid Fasta File! Only Fasta Files that were processed'\
' with '+color.GREEN+'3g_GCodeTranslate.py '+color.END+color.BOLD+'are valid\n\n'\
'However, to bypass that issue, Fasta Files MUST end with '+color.CYAN+\
'"AA.fasta"\n\n'+color.END)
valid_arg += 1
else:
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
valid_arg += 1
if os.path.isdir(args.databases + '/db_OG') != True:
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
+color.ORANGE+'db_OG Folder!\n\n'+color.END+color.BOLD+'Ensure that this folder '\
'can be found in the main '+color.ORANGE+'Databases Folder'+color.END+color.BOLD\
+'\n\nThen try once again\n\n.'+color.END)
valid_arg += 1
ogdb_count = 0
for file in os.listdir(args.databases + '/db_OG'):
if file.endswith('.dmnd'):
ogdb_count += 1
if ogdb_count == 0:
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
'Diamond formatted '+color.ORANGE+'Gene Family databases!\n\n'+color.END+color.BOLD+\
'Ensure that they can be found in the '+color.ORANGE+'db_OG folder'+color.END+\
color.BOLD+',\nwhich can be found in the main '+color.ORANGE+'Databases Folder'+\
color.END+color.BOLD+'\n\nThen try once again.\n\n'+color.END)
valid_arg += 1
elif ogdb_count > 1:
print('\nMultiple OG databases found. Please only provide 1 database in the db_OG folder.\n')
valid_arg += 1
return valid_arg
###########################################################################################
###--------------------------- Does the Inital Folder Prep -----------------------------###
###########################################################################################
def prep_folders(args):
OG_folder = '/'.join(args.input_file.split('/')[:-1])+'/DiamondOG/'
if os.path.isdir(OG_folder) != True:
os.system('mkdir '+OG_folder)
###########################################################################################
###--------------------- Runs Diamond on Split OrthoMCL Databases ----------------------###
###########################################################################################
def OG_ublast(args):
db = [file for file in os.listdir(args.databases + '/db_OG') if file.endswith('.dmnd')][0]
OG_diamond_cmd = args.diamond + ' blastp -q ' + args.input_file + ' -d ' + args.databases + '/db_OG/' + db + ' --evalue ' + args.evalue + ' --subject-cover 0.5 --threads ' + args.threads + ' --outfmt 6 -o ' + args.input_file.split('.fas')[0] + '_allOGresults'
os.system(OG_diamond_cmd)
###########################################################################################
###--------------- Keeps the Single BEST Hit (HSP-score) Per Transcript ----------------###
###########################################################################################
def keep_best(args):
print (color.BOLD+color.PURPLE+'\n\nProcessing OG-database results to keep only the BEST'\
'\nmatch for each transcript\n\n'+color.END)
inTSV = [i for i in open(args.input_file.split('.fas')[0]+'_allOGresults').read().split('\n') if i != '']
inTSV.sort(key = lambda x: -float(x.split('\t')[-1]))
keep = []
for i in inTSV:
if any(i.split('\t')[0] in j for j in keep) != True:
keep.append(i)
updated_lines = list(set([line.split('\t')[0]+'_'+'_'.join(line.split('\t')[1].split('_')[-2:])+\
'\t'+'\t'.join(line.split('\t')[1:])+'\n' for line in keep]))
with open(args.tsv_out, 'w+') as w:
for i in updated_lines:
w.write(i+'\n')
###########################################################################################
###-------- Copies and Updates Names of Transcripts With OG Hits to New Fasta ----------###
###########################################################################################
def update_fasta(args):
print (color.BOLD+color.PURPLE+'Updating Sequence Names with their BEST OG hits\n\n'+color.END)
keep = [i for i in open(args.tsv_out).read().split('\n') if i != '']
keep_dict = { }
for line in keep:
try:
og_number = re.split('OG.{1}_', line.split('\t')[1])[1][:6]
og_prefix = line.split('\t')[1].split(og_number)[0][-4:]
og = og_prefix + og_number
keep_dict.update({ re.split('_OG.{1}_', line.split('\t')[0])[0] : re.split('_OG.{1}_', line.split('\t')[0])[0] + '_' + og_prefix + line.split('\t')[1].split('_')[-1] })
except IndexError:
pass
protFasta = [seq_rec for seq_rec in SeqIO.parse(args.input_file,'fasta')]
ntdFasta = [seq_rec for seq_rec in SeqIO.parse(args.input_file.replace('.AA.','.NTD.'),'fasta')]
updated_prot_name = ['>'+keep_dict[i.description]+'\n'+str(i.seq).rstrip('*')+'\n' for i in protFasta if i.description in keep_dict.keys()]
updated_ntd_name = ['>'+keep_dict[i.description]+'\n'+str(i.seq).rstrip('*')+'\n' for i in ntdFasta if i.description in keep_dict.keys()]
with open(args.aa_out,'w+') as w:
for i in updated_prot_name:
w.write(i)
with open(args.ntd_out,'w+') as x:
for i in updated_ntd_name:
x.write(i)
##########################################################################################
###--------------------- Cleans up the Folder and Moves Final Files -------------------###
##########################################################################################
def clean_up(args):
os.system('mv '+args.input_file.replace('.fasta','_allOGresults')+' '+args.home_folder+\
'/DiamondOG')
os.system('cp '+args.aa_out+' '+args.home_folder+'/DiamondOG/')
os.system('cp '+args.ntd_out+' '+args.home_folder+'/DiamondOG/')
os.system('cp '+args.tsv_out+' '+args.home_folder+'/DiamondOG/')
##########################################################################################
###----------------------------- Calls on Above Functions -----------------------------###
##########################################################################################
def main():
args = check_args()
prep_folders(args)
OG_ublast(args)
keep_best(args)
update_fasta(args)
clean_up(args)
print (color.BOLD+'Next Script is: '+color.GREEN+'5g_FinalizeName.py\n\n'+color.END)
main()
#----------------------------------------- NOTES -----------------------------------------#
#
# This script uses a "BLAST"-based approach to identify ANCIENT homologous gene families.
#
# Gene family designations were taken from OrthoMCL.org and serve as the database for
# this script's gene family assignments. These gene family assignments are NON-EXHAUSTIVE
# and most Lineage-Specific families will be missed!
#
# If you have any questions contact Xyrus (author): maurerax@gmail.com

View File

@ -0,0 +1,374 @@
#!/usr/bin/env python3.5
##__Updated__: 20_09_2017
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
##__Usage__: python 5g_FinalizeName.py --help
##################################################################################################
## This script is intended to rename the outputs of the FilterPartials script ##
## to a given 10-character that is used in the Katz lab Phylogenomic Tree building methods ##
## ##
## Prior to running this script, ensure the following: ##
## ##
## 1. You have assembled your transcriptome and COPIED the 'assembly' file ##
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ##
## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py ##
## 3. Removed SSU/LSU sequences from your Fasta File ##
## 4. Classified your sequences as Strongly Prokaryotic/Eukaryotic or Undetermined ##
## 5. Classified the Non-Strongly Prokaryotic sequences into OGs ##
## 6. You either know (or have inferred) the genetic code of the organism ##
## 7. You have translated the sequences and checked for the data in the RemovePartials folder ##
## 8. Partial sequences have been removed from the transcriptomic data sets ##
## ##
## COMMAND Example Below ##
## Extra Notes at Bottom of Script ##
## ##
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
## ##
## Next Script(s) to Run: ##
## NONE! You're FINISHED! :D ##
## ##
##################################################################################################
import argparse, os, sys
from argparse import RawTextHelpFormatter,SUPPRESS
#----------------------- Solely to Make Print Statements Colorful -----------------------#
class color:
PURPLE = '\033[95m'
CYAN = '\033[96m'
DARKCYAN = '\033[36m'
ORANGE = '\033[38;5;214m'
BLUE = '\033[94m'
GREEN = '\033[92m'
YELLOW = '\033[93m'
RED = '\033[91m'
BOLD = '\033[1m'
UNDERLINE = '\033[4m'
END = '\033[0m'
#------------------------------- Main Functions of Script --------------------------------#
###########################################################################################
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
###########################################################################################
def check_args():
parser = argparse.ArgumentParser(description=
color.BOLD + '\n\nThis script is intended to '+color.RED+'Rename '+color.END\
+color.BOLD+'the core set of '+color.PURPLE+'ORFS\n'+color.END+color.BOLD+'with a valid '\
+color.RED+'10-character code'+color.END+color.BOLD+' for use in the KatzLab\nPhylogenomic Pipeline'\
+usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter)
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
required_arg_group.add_argument('--input_file','-in', action='store',
help=color.BOLD+color.GREEN+' One of the Fasta files that is to be renamed\n'+color.END)
required_arg_group.add_argument('--name','-n', action='store',
help=color.BOLD+color.GREEN+' A valid 10-Character code for updating the data\n'+color.END)
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
optional_arg_group.add_argument('-author', action='store_true',
help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
if len(sys.argv[1:]) == 0:
print (parser.description)
print ('\n')
sys.exit()
args = parser.parse_args()
quit_eval = return_more_info(args)
if quit_eval > 0:
print ('\n')
sys.exit()
args.all_output_folder = '/'.join(args.input_file.split('/')[:-3])
args.r2g_aa = args.all_output_folder + '/ReadyToGo/ReadyToGo_AA/'
args.r2g_ntd = args.all_output_folder + '/ReadyToGo/ReadyToGo_NTD/'
args.r2g_tsv = args.all_output_folder + '/ReadyToGo/ReadyToGo_TSV/'
args.r2g_xml = args.all_output_folder + '/ReadyToGo/ReadyToGo_XML/'
args.xml_out = args.input_AA.split('/')[-1]+'_1e-10keepall_BlastOutall.oneHit'
check_code(args)
return args
###########################################################################################
###------------------------------- Script Usage Message --------------------------------###
###########################################################################################
def usage_msg():
return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 5g_FinalizeName.py'\
' --input_file ../Stentor_coeruleus.WGS.CDS.Prep/Stentor_coeruleus.WGS.CDS.Renamed.Universal.AA.fasta'\
' --name Sr_ci_Scer'+color.END)
##########################################################################################
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
##########################################################################################
def return_more_info(args):
valid_args = 0
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
' maurerax@gmail.com\n\n'+color.END)
if args.author == True:
print (author)
valid_args += 1
if args.input_file.endswith('AA.fasta'):
args.input_NTD = args.input_file.replace('AA.fasta','NTD.fasta')
args.input_AA = args.input_file
args.input_TSV = args.input_file.replace('.AA.fasta','_allOGCleanresults.tsv')
elif args.input_file.endswith('NTD.fasta'):
args.input_NTD = args.input_file
args.input_AA = args.input_file.replace('NTD.fasta','AA.fasta')
args.input_TSV = args.input_file.replace('.NTD.fasta','_allOGCleanresults.tsv')
if os.path.isfile(args.input_NTD) != True:
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Nucleotide '\
'Fasta file ('+color.DARKCYAN+args.input_NTD.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
valid_args += 1
if os.path.isfile(args.input_AA) != True:
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Protein '\
'Fasta file ('+color.DARKCYAN+args.input_AA.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
valid_args += 1
if os.path.isfile(args.input_TSV) != True:
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Nucleotide '\
'Fasta file ('+color.DARKCYAN+args.input_TSV.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
valid_args += 1
return valid_args
###########################################################################################
###-------------------- Double Checks Format for 10-Character Code ---------------------###
###########################################################################################
def check_code(args):
check_name = args.name.split('_')
if len(args.name) != 10:
print (color.BOLD+'\n\nNew Species Prefix is not 10 characters long\n\n')
print ('Three examples below:\n'+color.CYAN+'\n\tSr_ci_Cunc\n\n\tOp_me_Hsap\n\n\t'\
'Am_ar_Ehis\n\n'+color.END)
sys.exit()
elif args.name.count('_') != 2:
print (color.BOLD+'\n\nCheck the format of your Species Prefix!\n\n')
print ('Three examples below:\n'+color.CYAN+'\n\tSr_ci_Cunc\n\n\tOp_me_Hsap\n\n\t'\
'Am_ar_Ehis\n\n'+color.END)
sys.exit()
if len(check_name[0]) == 2 and len(check_name[1]) == 2 and len(check_name[2]) == 4:
print (color.BOLD+"\n\nRenaming "+color.ORANGE+args.input_file.split('/')[-1]\
.split('_Filtered')[0]+color.END+color.BOLD+"'s files\nusing the following 10-character "\
"code: "+color.CYAN+args.name+color.END+'\n')
else:
print (color.BOLD+'\n\nCheck the format of your Species Prefix!\n\n')
print ('Three examples below:\n'+color.CYAN+'\n\tSr_ci_Cunc\n\n\tOp_me_Hsap\n\n\t'\
'Am_ar_Ehis\n\n'+color.END)
sys.exit()
##########################################################################################
###------------------------- Creates Folders For Storing Data -------------------------###
##########################################################################################
def prep_folders(args):
if os.path.isdir(args.all_output_folder + '/ReadyToGo/') != True:
os.system('mkdir ' + args.all_output_folder + '/ReadyToGo')
if os.path.isdir(args.all_output_folder + '/ReadyToGo/ReadyToGo_NTD/') != True:
os.system('mkdir '+args.r2g_ntd)
if os.path.isdir(args.all_output_folder + '/ReadyToGo/ReadyToGo_AA/') != True:
os.system('mkdir '+args.r2g_aa)
if os.path.isdir(args.all_output_folder + '/ReadyToGo/ReadyToGo_TSV/') != True:
os.system('mkdir '+args.r2g_tsv)
if os.path.isdir(args.all_output_folder + '/ReadyToGo/ReadyToGo_XML/') != True:
os.system('mkdir '+args.r2g_xml)
###########################################################################################
###----------- Renames the NTD and AA CDSs with the Given 10-Character Code ------------###
###########################################################################################
def rename_paralogs(args):
home_folder = '/'.join(args.input_AA.split('/')[:-2]) + '/'
print('HOME ' + home_folder)
print (color.BOLD+'\nRenaming Translated (Protein) '+color.PURPLE+'ORFs\n'+color.END)
renamed_Final_Prots = open(args.input_AA).read().replace('>','>'+args.name+'_')
print (color.BOLD+'\nRenaming Nucleotide '+color.PURPLE+'ORFs\n'+color.END)
renamed_Final_Nucs = open(args.input_NTD).read().replace('>','>'+args.name+'_')
print (color.BOLD+'\nUpdating CDS Names in the Spreadsheet'+color.END)
if '\n\n' in open(args.input_TSV).read():
renamed_Final_tsv = open(args.input_TSV).read().rstrip('\n')\
.replace('\n\n','\n'+args.name+'_')
else:
renamed_Final_tsv = open(args.input_TSV).read().rstrip('\n')\
.replace('\n','\n'+args.name+'_')
with open(home_folder + args.input_AA.split('/')[-1],'w+') as w:
w.write(renamed_Final_Prots)
with open(home_folder + args.input_NTD.split('/')[-1],'w+') as x:
x.write(renamed_Final_Nucs)
with open(home_folder + args.input_TSV.split('/')[-1],'w+') as y:
y.write(renamed_Final_tsv)
###########################################################################################
###--------------------------------- Header/Tail Lines ---------------------------------###
###########################################################################################
def header_tail():
header = '<?xml version="1.0"?>\n<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">\n'\
'<BlastOutput>\n <BlastOutput_program>blastp</BlastOutput_program>\n <BlastOutput_version>BLASTP 2.2.29+</BlastOutput_version>\n'\
' <BlastOutput_reference>Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&amp;auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), &quot;Gapped BLAST and PSI-BLAST: a new generation of protein database search programs&quot;, Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>\n'\
' <BlastOutput_db>../OGBlastDB/renamed_aa_seqs_OrthoMCL-5_12653.fasta</BlastOutput_db>\n <BlastOutput_query-ID>Query_1</BlastOutput_query-ID>\n'
tail = '</BlastOutput_iterations>\n</BlastOutput>'
return header, tail
###########################################################################################
###------------------------------- TSV to XML Conversion -------------------------------###
###########################################################################################
def convert_TSV_data(args):
home_folder = '/'.join(args.input_AA.split('/')[:-2])
TSVforConvert = home_folder+ '/' + args.input_TSV.split('/')[-1]
inTSV = [line.rstrip('\n') for line in open(TSVforConvert).readlines() if line != '\n']
iterations = []
for n in range(len(inTSV)):
if n == 0:
iterations.append(' <BlastOutput_query-def>'+inTSV[n].split('\t')[0]+'</BlastOutput_query-def>\n <BlastOutput_query-len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'</BlastOutput_query-len>\n'\
' <BlastOutput_param>\n <Parameters>\n <Parameters_matrix>BLOSUM62</Parameters_matrix>\n <Parameters_expect>1e-10</Parameters_expect>\n'\
' <Parameters_gap-open>11</Parameters_gap-open>\n <Parameters_gap-extend>1</Parameters_gap-extend>\n <Parameters_filter>F</Parameters_filter>\n'\
' </Parameters>\n </BlastOutput_param>\n<BlastOutput_iterations>\n<Iteration>\n <Iteration_iter-num>1</Iteration_iter-num>\n <Iteration_query-ID>Query_1</Iteration_query-ID>\n'\
' <Iteration_query-def>'+inTSV[n].split('\t')[0]+'</Iteration_query-def>\n <Iteration_query-len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'</Iteration_query-len>\n'\
'<Iteration_hits>\n<Hit>\n <Hit_num>1</Hit_num>\n <Hit_id>Fake_Entry</Hit_id>\n <Hit_def>'+inTSV[n].split('\t')[1]+'</Hit_def>\n <Hit_accession>Fake_Accession</Hit_accession>\n'\
' <Hit_len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'</Hit_len>\n <Hit_hsps>\n <Hsp>\n <Hsp_num>1</Hsp_num>\n <Hsp_bit-score>1234</Hsp_bit-score>\n'\
' <Hsp_score>'+inTSV[n].split('\t')[-1]+'</Hsp_score>\n <Hsp_evalue>'+inTSV[n].split('\t')[-2]+'</Hsp_evalue>\n <Hsp_query-from>'+inTSV[n].split('\t')[-4]+'</Hsp_query-from>\n'\
' <Hsp_query-to>'+inTSV[n].split('\t')[-3]+'</Hsp_query-to>\n <Hsp_hit-from>'+inTSV[n].split('\t')[-4]+'</Hsp_hit-from>\n <Hsp_hit-to>'+inTSV[n].split('\t')[-3]+'</Hsp_hit-to>\n'\
' <Hsp_query-frame>0</Hsp_query-frame>\n <Hsp_hit-frame>0</Hsp_hit-frame>\n <Hsp_identity>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'</Hsp_identity>\n'\
' <Hsp_positive>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'</Hsp_positive>\n <Hsp_gaps>0</Hsp_gaps>\n <Hsp_align-len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'</Hsp_align-len>\n'\
' <Hsp_qseq></Hsp_qseq>\n <Hsp_hseq></Hsp_hseq>\n <Hsp_midline></Hsp_midline>\n </Hsp>\n </Hit_hsps>\n</Hit>\n'\
'\n</Iteration_hits>\n <Iteration_stat>\n <Statistics>\n <Statistics_db-num>379660</Statistics_db-num>\n <Statistics_db-len>197499634</Statistics_db-len>\n'\
' <Statistics_hsp-len>123</Statistics_hsp-len>\n <Statistics_eff-space>184705217500</Statistics_eff-space>\n <Statistics_kappa>0.041</Statistics_kappa>\n'\
' <Statistics_lambda>0.267</Statistics_lambda>\n <Statistics_entropy>0.14</Statistics_entropy>\n </Statistics>\n </Iteration_stat>\n</Iteration>\n')
else:
iterations.append('<Iteration>\n <Iteration_iter-num>'+str(n+1)+'</Iteration_iter-num>\n <Iteration_query-ID>Query_'+str(n+1)+'</Iteration_query-ID>\n'\
' <Iteration_query-def>'+inTSV[n].split('\t')[0]+'</Iteration_query-def>\n <Iteration_query-len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'</Iteration_query-len>\n'\
'<Iteration_hits>\n<Hit>\n <Hit_num>1</Hit_num>\n <Hit_id>Fake_Entry</Hit_id>\n <Hit_def>'+inTSV[n].split('\t')[1]+'</Hit_def>\n <Hit_accession>Fake_Accession</Hit_accession>\n'\
' <Hit_len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'</Hit_len>\n <Hit_hsps>\n <Hsp>\n <Hsp_num>1</Hsp_num>\n <Hsp_bit-score>1234</Hsp_bit-score>\n'\
' <Hsp_score>'+inTSV[n].split('\t')[-1]+'</Hsp_score>\n <Hsp_evalue>'+inTSV[n].split('\t')[-2]+'</Hsp_evalue>\n <Hsp_query-from>'+inTSV[n].split('\t')[-4]+'</Hsp_query-from>\n'\
' <Hsp_query-to>'+inTSV[n].split('\t')[-3]+'</Hsp_query-to>\n <Hsp_hit-from>'+inTSV[n].split('\t')[-4]+'</Hsp_hit-from>\n <Hsp_hit-to>'+inTSV[n].split('\t')[-3]+'</Hsp_hit-to>\n'\
' <Hsp_query-frame>0</Hsp_query-frame>\n <Hsp_hit-frame>0</Hsp_hit-frame>\n <Hsp_identity>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'</Hsp_identity>\n'\
' <Hsp_positive>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'</Hsp_positive>\n <Hsp_gaps>0</Hsp_gaps>\n <Hsp_align-len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'</Hsp_align-len>\n'\
' <Hsp_qseq></Hsp_qseq>\n <Hsp_hseq></Hsp_hseq>\n <Hsp_midline></Hsp_midline>\n </Hsp>\n </Hit_hsps>\n</Hit>\n'\
'\n</Iteration_hits>\n <Iteration_stat>\n <Statistics>\n <Statistics_db-num>379660</Statistics_db-num>\n <Statistics_db-len>197499634</Statistics_db-len>\n'\
' <Statistics_hsp-len>123</Statistics_hsp-len>\n <Statistics_eff-space>184705217500</Statistics_eff-space>\n <Statistics_kappa>0.041</Statistics_kappa>\n'\
' <Statistics_lambda>0.267</Statistics_lambda>\n <Statistics_entropy>0.14</Statistics_entropy>\n </Statistics>\n </Iteration_stat>\n</Iteration>\n')
return iterations
###########################################################################################
###--------------------------- Writes Out the Fake XML File ----------------------------###
###########################################################################################
def write_Fake_XML(args):
home_folder = '/'.join(args.input_AA.split('/')[:-2]) + '/'
print (color.BOLD+'\n\nConverting '+color.ORANGE+args.name+'_XX_'+args.input_TSV.split('/')[-1]\
+color.END+color.BOLD+' to XML format\n'+color.END)
header, tail = header_tail()
iterations = convert_TSV_data(args)
with open(home_folder+args.xml_out,'w+') as w:
w.write(header)
w.write(''.join(iterations))
w.write(tail)
##########################################################################################
###-------------------- Cleans up the Folder and Moves Final Files --------------------###
##########################################################################################
def clean_up(args):
final_folder = '/'.join(args.input_file.split('/')[:-2]) + '/'
os.system('rm '+args.input_AA)
os.system('rm '+args.input_NTD)
os.system('rm '+args.input_TSV)
os.system('cp '+final_folder+'*Renamed.*.AA.fasta '+args.r2g_aa)
os.system('cp '+final_folder+'*Renamed.*.NTD.fasta '+args.r2g_ntd)
os.system('cp '+final_folder+'*.Renamed.*_allOGCleanresults.tsv '+args.r2g_tsv)
os.system('cp '+final_folder+'*oneHit '+args.r2g_xml)
###########################################################################################
###-------------------------------- Next Script Message --------------------------------###
###########################################################################################
def next_script(args):
print (color.BOLD+'\nThere is no next script! The final '+color.ORANGE+args.xml_out\
.split('_XX')[0]+color.END+color.BOLD+' files can be\nfound in the '+color.RED+\
args.xml_out.split('_XX_')[-1].split('.Renamed')[0]+'.Prep'+color.END+color.BOLD+' and '\
+color.RED+'ReadyToGo folders'+color.END+color.BOLD+' and are ready\n'\
'for the KatzLab Phylogenomic Tree-Building Steps!\n\n'+color.END)
##########################################################################################
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
##########################################################################################
def main():
args = check_args()
prep_folders(args)
rename_paralogs(args)
write_Fake_XML(args)
clean_up(args)
next_script(args)
main()

View File

@ -0,0 +1,274 @@
import os, sys
import argparse
from Bio import SeqIO
import CUB
from statistics import mean
from math import ceil, floor
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
def get_args():
parser = argparse.ArgumentParser(
prog = 'PTL6p1 Script 8: Stat Summary',
description = "Updated March 31th, 2023 by Auden Cote-L'Heureux"
)
parser.add_argument('-i', '--input', type = str, required = True, help = 'Input path to the "Output" folder produced by PhyloToL Part 1. This folder should contain both the "ReadyToGO" and "Intermediate" folders.')
parser.add_argument('-d', '--databases', type = str, default = '../Databases', help = 'Path to databases folder')
parser.add_argument('-r', '--r2g_jf', action = 'store_true', help = 'Create ReadyToGo files filtered to only include sequences between the 25th and 75th percentile of silent-site GC content. Please be aware that these are not necessarily the correct or non-contaminant sequences; examine the GC3xENc plots carefully before using these data.')
#Curate genetic code
return parser.parse_args()
def hook_lens(args):
print('\nGetting average OG lengths in the Hook DB...')
len_by_og = { }
for file in os.listdir(args.databases + '/db_OG'):
if file.endswith('.fasta') and os.path.isfile(args.databases + '/db_OG/' + file.replace('.fasta', '.dmnd')):
for rec in tqdm(SeqIO.parse(args.databases + '/db_OG/' + file, 'fasta')):
if rec.id[-10:] not in len_by_og:
len_by_og.update({ rec.id[-10:] : [] })
len_by_og[rec.id[-10:]].append(len(str(rec.seq)))
for og in len_by_og:
len_by_og[og] = mean(len_by_og[og])
return len_by_og
def aa_comp_lengths(args, gcodes):
print('\nGetting amino acid composition data from ReadyToGo files...')
r2g_lengths = { }; aa_comp = { }; recid_by_contig_n = { }
for file in tqdm([f for f in os.listdir(args.input + '/ReadyToGo/ReadyToGo_AA')]):
if file.endswith('.fasta') and file[:10] in gcodes:
for rec in SeqIO.parse(args.input + '/ReadyToGo/ReadyToGo_AA/' + file, 'fasta'):
r2g_lengths.update({ rec.id : len(str(rec.seq)) * 3 })
fymink = 0; garp = 0; other = 0; total = 0
for char in str(rec.seq):
if char in 'FYMINK':
fymink += 1
elif char in 'GARP':
garp += 1
else:
other += 1
total += 1
aa_comp.update({ rec.id : { 'FYMINK' : fymink/total, 'GARP' : garp/total, 'Other' : other/total } })
recid_by_contig_n.update({ rec.id.split('Contig_')[-1].split('_')[0] : rec.id })
print('\nGetting transcript sequence data from original assembled transcript files...')
transcripts = { }; transcript_id_corr = { }
for tax in tqdm([f for f in os.listdir(args.input + '/Intermediate/')]):
if os.path.isdir(args.input + '/Intermediate/' + tax + '/Original'):
for file in os.listdir(args.input + '/Intermediate/' + tax + '/Original'):
if file.endswith('_GenBankCDS.fasta'):
for rec in SeqIO.parse(args.input + '/Intermediate/' + tax + '/Original/' + file, 'fasta'):
transcripts.update({ rec.id : (file[:10], str(rec.seq)) })
if rec.id.split('NODE_')[-1].split('_')[0] in recid_by_contig_n:
transcript_id_corr.update({ recid_by_contig_n[rec.id.split('NODE_')[-1].split('_')[0]] : rec.id})
return aa_comp, transcripts, r2g_lengths, transcript_id_corr
def get_nuc_comp(args, gcodes):
print('\nGetting nucleotide composition data from ReadyToGo files...')
nuc_comp = { }
for file in tqdm([f for f in os.listdir(args.input + '/ReadyToGo/ReadyToGo_NTD')]):
if file.endswith('.fasta') and file[:10] in gcodes:
cub_out = CUB.CalcRefFasta(args.input + '/ReadyToGo/ReadyToGo_NTD/' + file, gcodes[file[:10]])[0]
for k in cub_out:
nuc_comp.update({ k : cub_out[k] })
return nuc_comp
def per_seq(args, nuc_comp, aa_comp, all_transcripts, r2g_lengths, transcript_id_corr):
og_mean_lens = hook_lens(args)
if not os.path.isdir(args.input + '/PerSequenceStatSummaries'):
os.mkdir(args.input + '/PerSequenceStatSummaries')
taxa = list(dict.fromkeys([seq[:10] for seq in nuc_comp]))
for taxon in taxa:
with open(args.input + '/PerSequenceStatSummaries/' + taxon + '.csv', 'w') as o:
o.write('Sequence,Taxon,OG,Transcript,TranscriptLength,CDSLength,AvgLengthOGinHook,AmbiguousCodons,GC-Overall,GC1,GC2,GC3,GC3-Degen,ExpWrightENc,ObsWrightENc_6Fold,ObsWrightENc_No6Fold,ObsWeightedENc_6Fold,ObsWeightedENc_No6Fold,FYMINK,GARP,OtherAA\n')
for rec in nuc_comp:
if rec[:10] == taxon:
o.write(rec + ',' + rec[:10] + ',' + rec[-10:])
try:
o.write(',' + transcript_id_corr[rec] + ',' + str(len(all_transcripts[transcript_id_corr[rec]][1])))
except KeyError:
o.write(',NA,NA')
o.write(',' + str(r2g_lengths[rec]) + ',' + str(og_mean_lens[rec[-10:]]))
v = nuc_comp[rec]
gcs = [str(v.gcOverall), str(v.gc1), str(v.gc2), str(v.gc3), str(v.gc4F)]
ENc = [str(v.expENc), str(v.obsENc_6F), str(v.obsENc_No6F), str(v.SunENc_6F),str(v.SunENc_No6F)]
o.write(',' + ','.join([str(v.amb_cdn)] + gcs + ENc))
o.write(',' + str(aa_comp[rec]['FYMINK']) + ',' + str(aa_comp[rec]['GARP']) + ',' + str(aa_comp[rec]['Other']) + '\n')
def per_tax(args, nuc_comp, aa_comp, all_transcripts, r2g_lengths, gcodes):
taxa = list(dict.fromkeys([seq[:10] for seq in nuc_comp]))
with open(args.input + '/PerTaxonSummary.csv', 'w') as o:
o.write('Taxon,TranscriptsInput,Median_GCTranscripts,IQR_GCTranscripts,Median_LenTranscripts,IRQ_LenTranscripts,SeqsR2G,OGsR2G,Median_GC3R2G,IQR_GC3R2G,Median_ENcR2G,IQR_ENcR2G,Median_LenR2G,IQR_LenR2G,GeneticCode\n')
for taxon in taxa:
try:
o.write(taxon)
transcripts = [all_transcripts[seq][1].upper() for seq in all_transcripts if all_transcripts[seq][0] == taxon]
o.write(',' + str(len(transcripts)))
transcript_gcs = []
for transcript in transcripts:
transcript_gcs.append((transcript.count('G') + transcript.count('C'))/len(transcript))
transcript_gcs = sorted(transcript_gcs)
o.write(',' + str(transcript_gcs[floor(len(transcripts)*0.5)]))
o.write(',' + str(transcript_gcs[floor(len(transcripts)*0.75)] - transcript_gcs[floor(len(transcripts)*0.25)]))
transcript_lens = sorted([len(transcript) for transcript in transcripts])
o.write(',' + str(transcript_lens[floor(len(transcripts)*0.5)]))
o.write(',' + str(transcript_lens[floor(len(transcripts)*0.75)] - transcript_lens[floor(len(transcripts)*0.25)]))
r2g_ntds = [nuc_comp[seq] for seq in nuc_comp if seq[:10] == taxon]
o.write(',' + str(len(r2g_ntds)))
r2g_ogs = list(dict.fromkeys([seq[-10:] for seq in nuc_comp if seq[:10] == taxon]))
o.write(',' + str(len(r2g_ogs)))
r2g_gc3s = sorted([seq.gc4F for seq in r2g_ntds])
o.write(',' + str(r2g_gc3s[floor(len(r2g_ntds)*0.5)]))
o.write(',' + str(r2g_gc3s[floor(len(r2g_gc3s)*0.75)] - r2g_gc3s[floor(len(r2g_gc3s)*0.25)]))
r2g_encs = sorted([seq.obsENc_6F for seq in r2g_ntds])
o.write(',' + str(r2g_encs[floor(len(r2g_encs)*0.5)]))
o.write(',' + str(r2g_encs[floor(len(r2g_encs)*0.75)] - r2g_encs[floor(len(r2g_encs)*0.25)]))
tax_r2g_lens = sorted([r2g_lengths[seq] for seq in r2g_lengths if seq[:10] == taxon])
o.write(',' + str(tax_r2g_lens[floor(len(tax_r2g_lens)*0.5)]))
o.write(',' + str(tax_r2g_lens[floor(len(tax_r2g_lens)*0.75)] - tax_r2g_lens[floor(len(tax_r2g_lens)*0.25)]))
o.write(',' + gcodes[taxon] + '\n')
except:
pass
def r2g_jf(args, nuc_comp, gcodes):
#Q: should there be an maximum IQR cutoff at which we do NOT produce a file here?
if not os.path.isdir(args.input + '/ReadyToGo/ReadyToGo_NTD_JF'):
os.mkdir(args.input + '/ReadyToGo/ReadyToGo_NTD_JF')
if not os.path.isdir(args.input + '/ReadyToGo/ReadyToGo_AA_JF'):
os.mkdir(args.input + '/ReadyToGo/ReadyToGo_AA_JF')
for file in os.listdir(args.input + '/ReadyToGo/ReadyToGo_NTD'):
if file.endswith('.fasta') and file[:10] in gcodes:
taxon = file[:10]
r2g_ntds = [nuc_comp[seq] for seq in nuc_comp if seq[:10] == taxon]
r2g_gc3s = sorted([seq.gc4F for seq in r2g_ntds])
with open(args.input + '/ReadyToGo/ReadyToGo_NTD_JF/' + file.replace('.fasta', '.JF.fasta'), 'w') as o:
for rec in SeqIO.parse(args.input + '/ReadyToGo/ReadyToGo_NTD/' + file, 'fasta'):
if nuc_comp[rec.id].gc4F > r2g_gc3s[floor(len(r2g_gc3s)*0.25)] and nuc_comp[rec.id].gc4F < r2g_gc3s[floor(len(r2g_gc3s)*0.75)]:
o.write('>' + rec.id + '\n' + str(rec.seq) + '\n\n')
with open(args.input + '/ReadyToGo/ReadyToGo_AA_JF/' + file.replace('.fasta', '.JF.fasta').replace('NTD', 'AA'), 'w') as o:
for rec in SeqIO.parse(args.input + '/ReadyToGo/ReadyToGo_AA/' + file.replace('NTD', 'AA'), 'fasta'):
if nuc_comp[rec.id].gc4F > r2g_gc3s[floor(len(r2g_gc3s)*0.25)] and nuc_comp[rec.id].gc4F < r2g_gc3s[floor(len(r2g_gc3s)*0.75)]:
o.write('>' + rec.id + '\n' + str(rec.seq) + '\n\n')
def plot_jf(args, nuc_comp):
if not os.path.isdir(args.input + '/GC3xENc_Plots'):
os.mkdir(args.input + '/GC3xENc_Plots')
taxa = list(dict.fromkeys([rec[:10] for rec in nuc_comp]))
gc3_null = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]
enc_null = [31, 31.5958, 32.2032, 32.8221, 33.4525, 34.0942, 34.7471, 35.411, 36.0856, 36.7707, 37.4659, 38.1707, 38.8847, 39.6074, 40.3381, 41.0762, 41.8208, 42.5712, 43.3264, 44.0854, 44.8471, 45.6102, 46.3735, 47.1355, 47.8949, 48.65, 49.3991, 50.1406, 50.8725, 51.593, 52.3, 52.9916, 53.6656, 54.32, 54.9525, 55.561, 56.1434, 56.6975, 57.2211, 57.7124, 58.1692, 58.5898, 58.9723, 59.3151, 59.6167, 59.8757, 60.0912, 60.2619, 60.3873, 60.4668, 60.5, 60.4668, 60.3873, 60.2619, 60.0912, 59.8757, 59.6167, 59.3151, 58.9723, 58.5898, 58.1692, 57.7124, 57.2211, 56.6975, 56.1434, 55.561, 54.9525, 54.32, 53.6656, 52.9916, 52.3, 51.593, 50.8725, 50.1406, 49.3991, 48.65, 47.8949, 47.1355, 46.3735, 45.6102, 44.8471, 44.0854, 43.3264, 42.5712, 41.8208, 41.0762, 40.3381, 39.6074, 38.8847, 38.1707, 37.4659, 36.7707, 36.0856, 35.411, 34.7471, 34.0942, 33.4525, 32.8221, 32.2032, 31.5958, 31]
for taxon in taxa:
comp_data = [(nuc_comp[rec].gc4F, nuc_comp[rec].obsENc_6F) for rec in nuc_comp if rec[:10] == taxon]
plt.figure()
plt.plot(np.array(gc3_null), np.array(enc_null), color = 'black', linewidth=2)
plt.scatter(np.array([val[0] for val in comp_data]), np.array([val[1] for val in comp_data]), s = 1)
plt.xlabel("GC content (3rd pos, 4-fold sites)")
plt.ylabel("Observed Wright ENc (6 Fold)")
plt.savefig(args.input + '/GC3xENc_Plots/' + taxon + '.png')
if __name__ == "__main__":
args = get_args()
valid_codes = ['universal', 'blepharisma', 'chilodonella', 'condylostoma', 'euplotes', 'peritrich', 'vorticella', 'mesodinium', 'tag', 'tga', 'taa', 'none']
gcodes = { }
if os.path.isfile(args.input + '/Intermediate/gcode_output.tsv'):
for line in open(args.input + '/Intermediate/gcode_output.tsv'):
if len(line.split('\t')) == 5 and line.split('\t')[4].strip().lower() in valid_codes:
gcodes.update({ line.split('\t')[0] : line.split('\t')[4].strip() })
elif line.split('\t')[4].strip().lower() != '':
print('\nInvalid genetic code assignment for taxon ' + line.split('\t')[0] + '. Skipping this taxon in script 6 (summary statistics)\n')
else:
print('\nGenetic code assignment file (Output/Intermediate/gcode_output.tsv) not found. Quitting script 6 (summary statistics).\n')
exit()
aa_comp, transcripts, r2g_lengths, transcript_id_corr = aa_comp_lengths(args, gcodes)
nuc_comp = get_nuc_comp(args, gcodes)
per_tax(args, nuc_comp, aa_comp, transcripts, r2g_lengths, gcodes)
per_seq(args, nuc_comp, aa_comp, transcripts, r2g_lengths, transcript_id_corr)
if args.r2g_jf:
r2g_jf(args, nuc_comp, gcodes)
plot_jf(args, nuc_comp)

523
PTL1/Genomes/Scripts/CUB.py Normal file
View File

@ -0,0 +1,523 @@
#!/usr/bin/env python3
# coding=utf-8
'''Aim of this script is to generate lots of codon usage statistics to aid in
identifying useful characteristics for de novo ORF calling'''
# Author: Xyrus Maurer-Alcalá
# Contact: maurerax@gmail.com or xyrus.maurer-alcala@izb.unibe.ch
# Last Modified: 2020-09-17
# usage: python CUB.py
# Dependencies:
# Python3, numpy, BioPython
import os
import re
import sys
#import matplotlib.pyplot as plt
import numpy as np
#import seaborn as sns
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqUtils import GC
class CalcCUB:
"""
Returns the Effective Number of Codons used (observed and expected)
following the equations originally from Wright 1990.
"""
def expWrightENc(gc3):
# Calculates the expected ENc from a sequence's GC3 under Wright 1990
if gc3 > 1:
# If GC3 looks as though it is > 1 (e.g. 100%), converts to a float ≤ 1.
# Calculations expect a value between 0 and 1
gc3 = gc3/100
exp_enc = 2+gc3+(29/((gc3**2)+(1-gc3)**2))
return round(exp_enc, 4)
def nullENcGC3():
# Calculates the expected ENc from the null distribution of GC3
# values (0, 100% GC)
null = [CalcCUB.expWrightENc(n) for n in np.arange(0,.51,0.01)]
null += null[:-1][::-1]
return [str(i)+'\t'+str(j) for i, j in zip([n for n in range(0, 101)],null)]
def calcWrightENc(cdnTable):
# Follows Wright's (1990) calculations for determining ENc scores.
def faCalcWright(aa_counts):
# Returns the codon homozygosity (fa) for a given "type" of AA (e.g.
# 2-fold degeneracy).
counts = [i[2] for i in aa_counts]
# n_aa --> number of this particular AA
n_aa = sum(counts)
# fa --> codon homozygosity
try:
fa = (((n_aa*sum([(i/float(n_aa))**2 for i in counts]))-1)/(n_aa-1))
except:
fa = 0
return fa
def ENcWright_by_Degen(fa_data):
# Same as used in Wright 1990, averages the homozygosity across all codons
# of a given class (e.g. 2-fold degeneracy)
# Codons without any degeneracy (e.g. ATG == M) have 100% homozygosity
# and provide a "base" for the ENc score
enc = 2
for k, v in fa_data.items():
non_zero_vals, non_zero_sum = len([i for i in v if i != 0]), sum([i for i in v if i != 0])
try:
f_aa = non_zero_sum/non_zero_vals
except:
f_aa = 1
enc += k/f_aa
return enc
# Determines the number of degenerate groups to use (i.e. whether 6-Fold
# degeneracy is present).
degen_cdns = {}
for k, v in cdnTable.items():
if v[1] not in degen_cdns.keys():
degen_cdns[v[1]] = [v[0]]
else:
if v[0] not in degen_cdns[v[1]]:
degen_cdns[v[1]] += [v[0]]
# Calculates codon homozygosity (fa) for each amino acid. Groups the
# resulting values based on the amino acids degeneracy (e.g. 'two-fold').
fa_cdns = {len(v):[] for k, v in degen_cdns.items() if 'one' not in k}
for k, v in degen_cdns.items():
# Skip codons lacking degeneracy
if 'one' in k:
continue
for aa in v:
aa_counts = [cdnTable[k] for k in cdnTable.keys() if cdnTable[k][0] == aa]
fa_cdns[len(v)] += [faCalcWright(aa_counts)]
enc_val = min(61, round(ENcWright_by_Degen(fa_cdns),4))
return enc_val
def SunEq5(cdnTable):
def calcFcf(aa_counts):
counts = [i[2] for i in aa_counts]
pseudocounts = [i+1 for i in counts]
na = sum(pseudocounts)
fcf = sum([(i/float(na))**2 for i in pseudocounts]), sum(pseudocounts)
return fcf
ENcWeightedPsuedo = 0
degen_cdns = {}
for k, v in cdnTable.items():
if v[1] == 'none':
continue
if v[1] not in degen_cdns.keys():
degen_cdns[v[1]] = [v[0]]
else:
if v[0] not in degen_cdns[v[1]]:
degen_cdns[v[1]] += [v[0]]
for k, v in degen_cdns.items():
fcf_nc = []
for aa in v:
aa_counts = [cdnTable[k] for k in cdnTable.keys() if cdnTable[k][0] == aa]
fcf_nc.append(calcFcf(aa_counts))
weightedENc = (len(fcf_nc) /
(sum([i[0]*i[1] for i in fcf_nc]) /
sum([i[1] for i in fcf_nc])))
ENcWeightedPsuedo += weightedENc
return round(ENcWeightedPsuedo,4)
def calcRCSU(cdnTbl):
rscu = {k:[v[0]] for k, v in cdnTbl.items() if v[0].isalpha()}
for k, v in rscu.items():
try:
aa_info = [(key, val[-1]) for key, val in cdnTbl.items() if val[0] == v[0]]
aa_cnts = [x[1] for x in aa_info]
cdn_rscu = (cdnTbl[k][-1]*len(aa_cnts))/sum(aa_cnts)
rscu[k] += [str(round(cdn_rscu,4))]
except:
rscu[k] += ['0.0']
return rscu
class GenUtil(object):
"""
"Overflow" of functions for now. Just a precaution to make the code a
little cleaner/easier to manage.
This class inclues means to normalize/check the user-provided genetic code,
which if not valid will default to the "universal" genetic code.
Similarly, This class will return the appropriate
codon count table and provides a function to update its values.
"""
def convertGenCode(gCode):
# Will interpret the user provided genetic code (gcode) and checks that
# it is currently available for use with the NCBI/biopython
# supported translation tables. Default is universal.
# Dictionary of the possible/functional genetic codes that are supported.
# --- Chilodonella and condylostoma are to come!
transTable = {'universal':1, 'blepharisma':4,
'ciliate':6, 'euplotes':10, 'mesodinium':29, 'myrionecta':29, 'peritrich':30,
'1':1, '4':4, '6':6, '10':10, '29':29, '30':30, 'chilo':'chilo'}
if str(gCode).lower() not in transTable:
print("\nWarning: Provided genetic code is not supported (yet).\n")
print("Currently running using the UNIVERSAL genetic code.\n\n")
print("Alternative genetic codes are as follows (Note: numbers "\
"correspond to NCBI genetic code tables):\n")
print('\n'.join(list(transTable.keys()))+'\n')
return 'Universal',1
else:
return gCode,transTable[str(gCode).lower()]
def getCDNtable(gCode):
# Returns the appropriate codon table to be used for the ENc calculations.
# Universal codon table, with 6-fold degenerate codons split
# into four-fold and two-fold groups.
universal_no6fold = {
'GCT': ['A', 'four', 0], 'GCC': ['A', 'four', 0], 'GCA': ['A', 'four', 0],
'GCG': ['A', 'four', 0], 'CGT': ['R', 'four', 0], 'CGC': ['R', 'four', 0],
'CGG': ['R', 'four', 0], 'CGA': ['R', 'four', 0], 'AGA': ['R_', 'two', 0],
'AGG': ['R_', 'two', 0], 'AAT': ['N', 'two', 0], 'AAC': ['N', 'two', 0],
'GAT': ['D', 'two', 0], 'GAC': ['D', 'two', 0], 'TGT': ['C', 'two', 0],
'TGC': ['C', 'two', 0], 'CAA': ['Q', 'two', 0], 'CAG': ['Q', 'two', 0],
'GAA': ['E', 'two', 0], 'GAG': ['E', 'two', 0], 'GGT': ['G', 'four', 0],
'GGC': ['G', 'four', 0], 'GGA': ['G', 'four', 0], 'GGG': ['G', 'four', 0],
'CAT': ['H', 'two', 0], 'CAC': ['H', 'two', 0], 'ATT': ['I', 'three', 0],
'ATC': ['I', 'three', 0], 'ATA': ['I', 'three', 0], 'ATG': ['M', 'one', 0],
'TTA': ['L_', 'two', 0], 'TTG': ['L_', 'two', 0], 'CTT': ['L', 'four', 0],
'CTC': ['L', 'four', 0], 'CTA': ['L', 'four', 0], 'CTG': ['L', 'four', 0],
'AAA': ['K', 'two', 0], 'AAG': ['K', 'two', 0], 'TTT': ['F', 'two', 0],
'TTC': ['F', 'two', 0], 'CCT': ['P', 'four', 0], 'CCC': ['P', 'four', 0],
'CCA': ['P', 'four', 0], 'CCG': ['P', 'four', 0], 'TCT': ['S', 'four', 0],
'TCC': ['S', 'four', 0], 'TCA': ['S', 'four', 0], 'TCG': ['S', 'four', 0],
'AGT': ['S_', 'two', 0], 'AGC': ['S_', 'two', 0], 'ACT': ['T', 'four', 0],
'ACC': ['T', 'four', 0], 'ACA': ['T', 'four', 0], 'ACG': ['T', 'four', 0],
'TGG': ['W', 'one', 0], 'TAT': ['Y', 'two', 0], 'TAC': ['Y', 'two', 0],
'GTT': ['V', 'four', 0], 'GTC': ['V', 'four', 0], 'GTA': ['V', 'four', 0],
'GTG': ['V', 'four', 0], 'TAA': ['*', 'none', 0], 'TGA': ['*', 'none', 0],
'TAG': ['*', 'none', 0], 'XXX': ['_missing', 'none', 0]}
# Universal codon table, with 6-fold degenerate codons kept
# whole, no splitting! Traditional Universal codon table.
universal_6fold = {
'GCT': ['A', 'four', 0], 'GCC': ['A', 'four', 0], 'GCA': ['A', 'four', 0],
'GCG': ['A', 'four', 0], 'CGT': ['R', 'six', 0], 'CGC': ['R', 'six', 0],
'CGG': ['R', 'six', 0], 'CGA': ['R', 'six', 0], 'AGA': ['R', 'six', 0],
'AGG': ['R', 'six', 0], 'AAT': ['N', 'two', 0], 'AAC': ['N', 'two', 0],
'GAT': ['D', 'two', 0], 'GAC': ['D', 'two', 0], 'TGT': ['C', 'two', 0],
'TGC': ['C', 'two', 0], 'CAA': ['Q', 'two', 0], 'CAG': ['Q', 'two', 0],
'GAA': ['E', 'two', 0], 'GAG': ['E', 'two', 0], 'GGT': ['G', 'four', 0],
'GGC': ['G', 'four', 0], 'GGA': ['G', 'four', 0], 'GGG': ['G', 'four', 0],
'CAT': ['H', 'two', 0], 'CAC': ['H', 'two', 0], 'ATT': ['I', 'three', 0],
'ATC': ['I', 'three', 0], 'ATA': ['I', 'three', 0], 'ATG': ['M', 'one', 0],
'TTA': ['L', 'six', 0], 'TTG': ['L', 'six', 0], 'CTT': ['L', 'six', 0],
'CTC': ['L', 'six', 0], 'CTA': ['L', 'six', 0], 'CTG': ['L', 'six', 0],
'AAA': ['K', 'two', 0], 'AAG': ['K', 'two', 0], 'TTT': ['F', 'two', 0],
'TTC': ['F', 'two', 0], 'CCT': ['P', 'four', 0], 'CCC': ['P', 'four', 0],
'CCA': ['P', 'four', 0], 'CCG': ['P', 'four', 0], 'TCT': ['S', 'six', 0],
'TCC': ['S', 'six', 0], 'TCA': ['S', 'six', 0], 'TCG': ['S', 'six', 0],
'AGT': ['S', 'six', 0], 'AGC': ['S', 'six', 0], 'ACT': ['T', 'four', 0],
'ACC': ['T', 'four', 0], 'ACA': ['T', 'four', 0], 'ACG': ['T', 'four', 0],
'TGG': ['W', 'one', 0], 'TAT': ['Y', 'two', 0], 'TAC': ['Y', 'two', 0],
'GTT': ['V', 'four', 0], 'GTC': ['V', 'four', 0], 'GTA': ['V', 'four', 0],
'GTG': ['V', 'four', 0], 'TAA': ['*', 'none', 0], 'TGA': ['*', 'none', 0],
'TAG': ['*', 'none', 0], 'XXX': ['_missing', 'none', 0]}
# Blepharisma (table 4) genetic code codon table, with 6-fold degenerate
# codons kept whole, no splitting!
blepharisma_6fold = {**universal_6fold,
'TGA': ['W', 'two', 0], 'TGG': ['W', 'two', 0],
'TAA': ['*', 'two', 0], 'TAG': ['*', 'two', 0]}
# Blepharisma (table 4) genetic code codon table, with 6-fold degenerate
# codons split into four-fold and two-fold groups.
blepharisma_no6fold = {**universal_no6fold,
'TGA': ['W', 'two', 0], 'TGG': ['W', 'two', 0],
'TAA': ['*', 'two', 0], 'TAG': ['*', 'two', 0]}
# Chilodonella genetic code codon table, with 6-fold degenerate
# codons kept whole, no splitting!
chilo_6fold = {**universal_6fold,
'CAA': ['Q', 'four', 0], 'CAG': ['Q', 'four', 0],
'TAA': ['*', 'one', 0], 'TAG': ['Q', 'four', 0],
'TGA': ['Q', 'four', 0]}
# Chilodonella genetic code codon table, with 6-fold degenerate
# codons split into four-fold and two-fold groups.
# Note that this also splits four-fold degenerate codons that OUGHT to
# be in "different" functional categories (e.g. CAG =/= TAG)
chilo_no6fold = {**universal_no6fold,
'TAA': ['*', 'one', 0], 'TAG': ['Q_', 'one', 0],
'TGA': ['Q_', 'one', 0]}
# Ciliate (table 6) genetic code codon table, with 6-fold degenerate
# codons kept whole, no splitting! Traditional ciliate codon table.
ciliate_6fold = {**universal_6fold,
'CAA': ['Q', 'four', 0], 'CAG': ['Q', 'four', 0],
'TAA': ['Q', 'four', 0], 'TAG': ['Q', 'four', 0],
'TGA': ['*', 'one', 0]}
# Ciliate (table 6) genetic code codon table, with 6-fold degenerate
# codons split into four-fold and two-fold groups.
# Note that this also splits four-fold degenerate codons that OUGHT to
# be in "different" functional categories (e.g. CAA =/= TAA)
ciliate_no6fold = {**universal_no6fold,
'TAA': ['Q_', 'two', 0], 'TAG': ['Q_', 'two', 0],
'TGA': ['*', 'one', 0]}
# Euplotes codon table, with 6-fold degenerate codons kept
# whole, no splitting! Traditional Universal codon table.
euplotes_6fold = {**universal_6fold,
'TGA': ['C', 'three', 0], 'TGT': ['C', 'three', 0],
'TGC': ['C', 'three', 0], 'TAA': ['*', 'two', 0],
'TAG': ['*', 'two',0]}
# Euplotes genetic code codon table, with 6-fold degenerate codons
# split into four-fold and two-fold groups.
euplotes_no6fold = {**universal_no6fold,
'TGA': ['C', 'three', 0], 'TGT': ['C', 'three', 0],
'TGC': ['C', 'three', 0], 'TAA': ['*', 'two', 0],
'TAG': ['*', 'two',0]}
# Mesodinium/Myrionecta (table 29) genetic code codon table, with 6-fold
# degenerate codons kept whole, no splitting! Traditional ciliate codon table.
mesodinium_6fold = {**universal_6fold,
'TAA': ['Y', 'four', 0], 'TAT': ['Y', 'four', 0],
'TAG': ['Y', 'four', 0], 'TAC': ['Y', 'four', 0],
'TGA': ['*', 'one', 0]}
# Mesodinium/Myrionecta (table 29) genetic code codon table, with 6-fold
# degenerate codons split into four-fold and two-fold groups.
mesodinium_no6fold = {**universal_no6fold,
'TAA': ['Y', 'four', 0], 'TAT': ['Y', 'four', 0],
'TAG': ['Y', 'four', 0], 'TAC': ['Y', 'four', 0],
'TGA': ['*', 'one', 0]}
# Peritrich (table 30) genetic code codon table, with 6-fold degenerate
# codons kept whole, no splitting! Traditional ciliate codon table.
peritrich_6fold = {**universal_6fold,
'GAA': ['E', 'four', 0], 'GAG': ['E', 'four', 0],
'TAA': ['E', 'four', 0], 'TAG': ['E', 'four', 0],
'TGA': ['*', 'one', 0]}
# Peritrich (table 30) genetic code codon table, with 6-fold degenerate
# codons split into four-fold and two-fold groups.
# Note that this also splits four-fold degenerate codons that OUGHT to
# be in "different" functional categories (e.g. CAA =/= TAA)
peritrich_no6fold = {**universal_no6fold,
'TAA': ['E_', 'two', 0], 'TAG': ['E_', 'two', 0],
'TGA': ['*', 'one', 0]}
cdnTableDict = {1:[universal_no6fold,universal_6fold],
4:[blepharisma_no6fold, blepharisma_6fold],
6:[ciliate_no6fold,ciliate_6fold],
10:[euplotes_no6fold,euplotes_6fold],
29:[mesodinium_no6fold,mesodinium_6fold],
30:[peritrich_no6fold,peritrich_6fold],
'chilodonella':[chilo_no6fold,chilo_6fold],
'chilo':[chilo_no6fold,chilo_6fold]}
return cdnTableDict[gCode]
def mapCdns(seq, cdnTable):
# Updates the codon counts for a given sequence to the respective codon
# count table (e.g. with or without 6-fold degeneracy).
codons = [seq[n:n+3] for n in range(0, len(seq)-len(seq)%3, 3)]
amb_cdn = 0
for c in codons:
try:
cdnTable[c][-1] += 1
except:
amb_cdn += 1
if cdnTable['TCC'][1] == 'six':
return cdnTable, amb_cdn
else:
return cdnTable
class GCeval():
"""
Returns %GC values from DNA sequences of various types.
"""
def gcTotal(seq):
# This function returns global GC content
return round(GC(seq), 4)
def gc1(seq):
# This function return the GC content of the first position of a codon
return round(GC(''.join([seq[n] for n in range(0, len(seq), 3)])), 4)
def gc2(seq):
# This function return the GC content of the second position of a codon
return round(GC(''.join([seq[n] for n in
range(1, len(seq)-len(seq[1:]) % 3, 3)])), 4)
def gc3(seq):
# This function return the GC content of the third position of a codon
return round(GC(''.join([seq[n] for n in
range(2, len(seq)-len(seq[2:]) % 3, 3)])), 4)
def gc3_4F(cdnTbl):
# # This function return the GC content of the third position of four-fold
# # degenerate codons
FrFold = round(GC(''.join([k[-1]*v[-1] for k, v in cdnTbl.items() if
'one' not in v[1]])), 4)
return FrFold
class SeqInfo(object):
"""
Provides a means to harbor the data for each individual contig/gene in a
given fasta file.
This includes GC content (various types), Effective Number of codons
(ENc; again various calculations), Relative Synonymous Codon Usage (RSCU).
"""
def __init__(self,seq,gcode='universal'):
self.ntd = str(seq)
self.gcode, self.transTable = GenUtil.convertGenCode(gcode)
# Dictionary of the GC-related functions/calculations
self.gcFuncs = {'gcOverall':GCeval.gcTotal,'gc1':GCeval.gc1,'gc2':GCeval.gc2,'gc3':GCeval.gc3}
def countCodons(self):
# Stores the different codon tables and updates their codon counts
cdnTbls = GenUtil.getCDNtable(self.transTable)
self.cdnCounts_6F,self.amb_cdn = GenUtil.mapCdns(self.ntd, cdnTbls[1])
self.cdnCounts_No6F = GenUtil.mapCdns(self.ntd, cdnTbls[0])
def ENcStats(self):
# Stores the various Effective Number of Codons calculations in the class
self.expENc = CalcCUB.expWrightENc(self.gc3)
self.obsENc_6F = CalcCUB.calcWrightENc(self.cdnCounts_6F)
self.obsENc_No6F = CalcCUB.calcWrightENc(self.cdnCounts_No6F)
self.SunENc_6F = CalcCUB.SunEq5(self.cdnCounts_6F)
self.SunENc_No6F = CalcCUB.SunEq5(self.cdnCounts_No6F)
def GCstats(self):
# Stores the various GC-stats in the class
for k, v in self.gcFuncs.items():
setattr(self,k,v(self.ntd))
self.gc4F = GCeval.gc3_4F(self.cdnCounts_No6F)
def RSCUstats(self):
self.rscu_No6Fold = CalcCUB.RSCU(self.cdnCounts_No6F)
self.rscu_6Fold = CalcCUB.RSCU(self.cdnCounts_6F)
def prepFolders(outName):
if os.path.isdir(outName) == False:
os.mkdir(outName)
if os.path.isdir(outName+'/Plots') == False:
os.mkdir(outName+'/Plots')
if os.path.isdir(outName+'/SpreadSheets') == False:
os.mkdir(outName+'/SpreadSheets')
def CalcRefFasta(fasta, gCode):
seqDB = {i.description:SeqInfo(i.seq, gCode) for i in SeqIO.parse(fasta,'fasta')}
GenCDNtable = {}
for k, v in seqDB.items():
v.countCodons()
v.GCstats()
v.ENcStats()
for k, v in v.cdnCounts_6F.items():
if k.isalpha() and k not in GenCDNtable .keys():
GenCDNtable[k] = [v[0],v[-1]]
else:
GenCDNtable[k][-1] += v[-1]
RSCU = CalcCUB.calcRCSU(GenCDNtable)
return seqDB, RSCU
def WriteWrightOut(seqData, outName, comp):
if comp == False:
with open(outName+'/SpreadSheets/'+outName.split('/')[-1]+'.ENc.Raw.tsv','w+') as w:
w.write('SequenceID\tAmbiguousCodons\tGC-Overall\tGC1\tGC2\tGC3\t'
'GC3-Degen\tExpWrightENc\tObsWrightENc_6Fold\tObsWrightENc_No6Fold\t'
'ObsWeightedENc_6Fold\tObsWeightedENc_No6Fold\n')
for k, v in seqData.items():
name = [k]
gcs = [str(v.gcOverall),str(v.gc1),str(v.gc2),str(v.gc3),str(v.gc4F)]
ENc = [str(v.expENc),str(v.obsENc_6F),str(v.obsENc_No6F),
str(v.SunENc_6F),str(v.SunENc_No6F)]
w.write('\t'.join(name+[str(v.amb_cdn)]+gcs+ENc)+'\n')
else:
with open(outName+'/SpreadSheets/'+outName.split('/')[-1]+'.CompTrans.ENc.Raw.tsv','w+') as w:
w.write('SequenceID\tAmbiguousCodons\tGC-Overall\tGC1\tGC2\tGC3\t'
'GC3-Degen\tExpWrightENc\tObsWrightENc_6Fold\tObsWrightENc_No6Fold\t'
'ObsWeightedENc_6Fold\tObsWeightedENc_No6Fold\n')
for k, v in seqData.items():
name = [k]
gcs = [str(v.gcOverall),str(v.gc1),str(v.gc2),str(v.gc3),str(v.gc4F)]
ENc = [str(v.expENc),str(v.obsENc_6F),str(v.obsENc_No6F),
str(v.SunENc_6F),str(v.SunENc_No6F)]
w.write('\t'.join(name+[str(v.amb_cdn)]+gcs+ENc)+'\n')
def getCompFasta(fasta, gCode):
print(fasta)
stopCDNs = {'1':['TAA','TAG','TGA'], '4':['TAA','TAG'], '6':['TGA'], '10':['TAA','TAG'],
'29':['TGA'], '30':['TGA'], 'universal':['TAA','TAG','TGA'], 'blepharisma':['TAA','TAG'],
'ciliate':['TGA'],'euplotes':['TAA','TAG'], 'mesodinium':['TGA'], 'peritrich':['TGA'],
'chilo':['TAA']}
if gCode.lower() not in stopCDNs.keys():
stops = stopCDNs['1']
else:
stops = stopCDNs[gCode]
with open(fasta.replace('.fasta','.Comp.fasta'),'w+') as w:
for i in SeqIO.parse(fasta,'fasta'):
#if str(i.seq).upper().startswith('ATG') and str(i.seq).upper()[-3:] in stops:
#if str(i.seq).upper()[-3:] in stops:
if len(i.seq) % 3 == 0:
w.write('>'+i.description+'\n'+str(i.seq)+'\n')
return fasta.replace('.fasta','.Comp.fasta')
def WriteNullENcOut(outName):
with open(outName+'/SpreadSheets/'+outName.split('/')[-1]+'.ENc.Null.tsv','w+') as w:
w.write('GC3\tENc\n')
w.write('\n'.join(CalcCUB.nullENcGC3()))
def WriteRSCUtbl(RSCUtbl, outName):
with open(outName+'/SpreadSheets/'+outName.split('/')[-1]+'.RSCU.tsv','w+') as w:
w.write('Codon\tAmino Acid\tRSCU\n')
for k,v in RSCUtbl.items():
w.write(k+'\t'+'\t'.join(v)+'\n')
if __name__ == "__main__":
if len(sys.argv) < 2:
print('\nUsage:\n')
print('python CUB.py MyNtds.fasta MyTaxon genetic_code\n')
print('\nGenetic Codes:\n')
gcd = ['1', '4', '6', '10', '29', '30', 'universal', 'blepharisma',
'ciliate','euplotes', 'mesodinium', 'peritrich','chilo']
print('\n'.join(gcd)+'\n')
sys.exit()
fasta = sys.argv[1]
try:
outName = sys.argv[2]
except:
print('Missing an output name. Include one, then run again!')
sys.exit()
try:
gCode = sys.argv[3]
except:
gCode = 'universal'
compFasta = getCompFasta(fasta, gCode)
prepFolders(outName)
fastaDataRaw, RSCUtbl = CalcRefFasta(fasta, gCode)
fastaDataComp, RSCUtbl = CalcRefFasta(compFasta, gCode)
WriteWrightOut(fastaDataRaw, outName, comp=False)
WriteWrightOut(fastaDataComp, outName, comp=True)
WriteNullENcOut(outName)
WriteRSCUtbl(RSCUtbl, outName)
os.system('cp '+fasta+' '+outName+'/')
os.system('mv '+compFasta+' '+outName+'/')

View File

@ -0,0 +1,211 @@
import os, sys, re
import argparse
def get_args():
parser = argparse.ArgumentParser(
prog = 'PhyloToL v6.0 Part 1 for GenBank Genomes',
description = "Updated January 19th, 2023 by Auden Cote-L'Heureux. Link to GitHub: https://github.com/AudenCote/PhyloToL_v6.0"
)
parser.add_argument('-s', '--script', default = -1, type = int, choices = { 1, 2, 3, 4, 5, 6 }, help = 'Script to run if you are only running one script')
parser.add_argument('-1', '--first_script', default = -1, type = int, choices = { 1, 2, 3, 4 }, help = 'First script to run')
parser.add_argument('-2', '--last_script', default = -1, type = int, choices = { 2, 3, 4, 5 }, help = 'First script to run')
parser.add_argument('-c', '--cds', type = str, help = 'Path to a folder of nucleotide CDS. Each file name should start with a unique 10 digit code, and end in "_GenBankCDS.fasta", E.g. Op_me_hsap_GenBankCDS.fasta')
parser.add_argument('-o', '--output', default = '../', type = str, help = 'An "Output" folder will be created at this directory to contain all output files. By default this folder will be created at the parent directory of the Scripts folder')
parser.add_argument('-g', '--genetic_code', type = str, help = 'If all of your taxa use the same genetic code, you may enter it here (to be used in script 4). Otherwise, stop after script 3 and fill in "gcode_output.tsv" before running script 4')
parser.add_argument('-d', '--databases', type = str, default = '../Databases', help = 'Path to databases folder (which should contain db_OG)')
return parser.parse_args()
def script_one(args, ten_digit_codes):
for file in os.listdir(args.cds):
if file[10:] == '_GenBankCDS.fasta' and file[:10] in ten_digit_codes:
os.system('python 1_RenameCDS.py -in ' + args.cds + '/' + file + ' -s GenBank -o ' + args.output + '/Output')
def script_two(args):
valid_codes = ['bleph','blepharisma','chilo','chilodonella','condy', 'condylostoma','none','eup','euplotes','peritrich','vorticella','ciliate','universal','taa','tag','tga','mesodinium']
for folder in os.listdir(args.output + '/Output'):
if os.path.isfile(args.output + '/Output/' + folder + '/' + folder + '_GenBankCDS.Prepped.fasta'):
os.system('python 2_GCodeEval.py --input_file ' + args.output + '/Output/' + folder + '/' + folder + '_GenBankCDS.Prepped.fasta')
gcode_info = []
for folder in os.listdir(args.output + '/Output'):
if os.path.isfile(args.output + '/Output/' + folder + '/' + folder + '_GenBankCDS.Prepped.GeneticCode.txt'):
with open(args.output + '/Output/' + folder + '/' + folder + '_GenBankCDS.Prepped.GeneticCode.txt') as f:
gcode_temp = [folder]
for line in f:
line_sep = line.strip().split('\t')
if line_sep[0] == 'TGA':
gcode_temp.append(line_sep[1])
elif line_sep[0] == 'TAG':
gcode_temp.append(line_sep[1])
elif line_sep[0] == 'TAA':
gcode_temp.append(line_sep[1])
gcode_info.append(gcode_temp)
stop = False
gcode_file = { }
if args.genetic_code.endswith('.txt') or args.genetic_code.endswith('.tsv'):
if os.path.isfile(args.genetic_code):
for line in open(args.genetic_code):
try:
if line.split('\t')[1].strip().lower() in valid_codes:
gcode_file.update({ line.split('\t')[0] : line.split('\t')[1].strip() })
else:
print('Genetic code ERROR -- ' + line.split('\t')[1].strip() + ' is not a valid genetic code. Please fill out the "gcode_output.tsv" file and continue with script 3.')
except IndexError:
print('\nGenetic code ERROR -- it looks like you tried to enter a .txt/.tsv file, but it is improperly formatted. Stopping after script 2; you may fill out the file gcode_output.tsv and continue with script 3.\n')
stop = True
else:
print('\nGenetic code ERROR -- it looks like you tried to enter a .txt/.tsv file, but it could not be found. Stopping after script 2; you may fill out the file gcode_output.tsv and continue with script 3.\n')
stop = True
with open(args.output + '/Output/gcode_output.tsv', 'w') as g:
g.writelines('10 Digit Code\tIn-frame TAG Density\tIn-frame TGA Density\tIn-frame TAA Density\tGenetic Code\n')
for row in gcode_info:
if args.genetic_code == None:
g.writelines(row[0] + '\t' + row[1] + '\t' + row[2] + '\t' + row[3] + '\t\n')
elif args.genetic_code.lower() in valid_codes:
g.writelines(row[0] + '\t' + row[1] + '\t' + row[2] + '\t' + row[3] + '\t' + args.genetic_code + '\n')
elif args.genetic_code.endswith('.txt') or args.genetic_code.endswith('.tsv'):
try:
g.writelines(row[0] + '\t' + row[1] + '\t' + row[2] + '\t' + row[3] + '\t' + gcode_file[row[0]] + '\n')
except KeyError:
g.writelines(row[0] + '\t' + row[1] + '\t' + row[2] + '\t' + row[3] + '\t\n')
print('\nGenetic code ERROR -- it looks like you tried to enter a .txt/.tsv file, but a taxon is missing. Stopping after script 2; you may fill out the file gcode_output.tsv and continue with script 3.\n')
stop = True
else:
stop = True
if stop or args.genetic_code == None:
print('\nStopping after script 2 because genetic code information is incomplete. Please fill out the file "gcode_output.tsv" and continue with script 3.\n')
exit()
def script_three(args):
valid_codes = ['bleph','blepharisma','chilo','chilodonella','condy', 'condylostoma','none','eup','euplotes','peritrich','vorticella','ciliate','universal','taa','tag','tga','mesodinium']
lines = [line.strip().split('\t') for line in open(args.output + '/Output/gcode_output.tsv', 'r')]
with open(args.output + '/Output/gcode_output.tsv', 'r') as g:
for folder in os.listdir(args.output + '/Output'):
if os.path.isfile(args.output + '/Output/' + folder + '/' + folder + '_GenBankCDS.Prepped.fasta'):
for line in lines:
if line[0] == folder and line[-1].lower() in valid_codes:
os.system('python 3_GCodeTranslate.py --input_file ' + args.output + '/Output/' + folder + '/' + folder + '_GenBankCDS.Prepped.fasta --genetic_code ' + line[-1])
elif line[-1].lower() not in valid_codes and line[-1] != 'Genetic Code':
print('\n' + line[-1] + ' is not a valid genetic code. Skipping taxon ' + folder + '.\n')
def script_four(args):
valid_codes = ['universal', 'blepharisma', 'chilodonella', 'condylostoma', 'euplotes', 'peritrich', 'vorticella', 'mesodinium', 'tag', 'tga', 'taa', 'none']
gcode_by_folder = { line.strip().split('\t')[0] : line.strip().split('\t')[-1] for line in open(args.output + '/Output/gcode_output.tsv', 'r') }
for folder in os.listdir(args.output + '/Output'):
if os.path.isdir(args.output + '/Output/' + folder):
gcode_formatted = gcode_by_folder[folder][0].upper() + gcode_by_folder[folder].lower()[1:]
if os.path.isfile(args.output + '/Output/' + folder + '/' + folder + '_GenBankCDS.' + gcode_formatted + '.AA.fasta'):
os.system('python 4_CountOGsDiamond.py -in ' + args.output + '/Output/' + folder + '/' + folder + '_GenBankCDS.' + gcode_formatted + '.AA.fasta -t 30 --databases ' + args.databases + ' --evalue 1e-15')
def script_five(args):
gcode_by_folder = { line.strip().split('\t')[0] : line.strip().split('\t')[-1] for line in open(args.output + '/Output/gcode_output.tsv', 'r') }
for folder in os.listdir(args.output + '/Output'):
if os.path.isdir(args.output + '/Output/' + folder):
gcode_formatted = gcode_by_folder[folder][0].upper() + gcode_by_folder[folder].lower()[1:]
if os.path.isfile(args.output + '/Output/' + folder + '/' + folder + '_GenBankCDS.Renamed.' + gcode_formatted + '.AA.fasta'):
step5_cmd = 'python 5_FinalizeName.py -in ' + args.output + '/Output/' + folder + '/DiamondOG/' + folder + '_GenBankCDS.Renamed.' + gcode_formatted + '.AA.fasta -n ' + folder
os.system(step5_cmd)
os.mkdir(args.output + '/Output/Intermediate')
for file in os.listdir(args.output + '/Output'):
if file != 'ReadyToGo' and file != 'Intermediate':
os.system('mv ' + args.output + '/Output/' + file + ' ' + args.output + '/Output/Intermediate')
os.system('python 6_SummaryStats.py -i ' + args.output + '/Output -d ' + args.databases)
if __name__ == "__main__":
args = get_args()
if (args.first_script == 1 or args.script == 1) and not os.path.isdir(args.cds):
print('\nIf starting at the first script, a valid path to a folder of nucleotide CDS files (which must end in .fasta) should be input using the --cds argument')
quit()
ten_digit_codes = []
if args.first_script == 1 or args.script == 1:
for file in os.listdir(args.cds):
if file[10:] == '_GenBankCDS.fasta' and '.DS_Store' not in file:
ten_digit_codes.append(file[:10])
else:
if not os.path.isdir(args.output + '/Output'):
print('\nA folder called "Output" is not found at the given output path. Enter the correct path for --output or start from script 1.\n')
quit()
if(len(ten_digit_codes) > len(list(dict.fromkeys(ten_digit_codes)))):
print('\nDuplicate 10-digit codes are not allowed. Aborting.\n')
quit()
for code in ten_digit_codes:
for c, char in enumerate(code):
if (c != 2 and c != 5 and char not in 'qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM1234567890') or ((c == 2 or c == 5) and char != '_'):
print('\n' + code + ' is an invalid 10-digit code sample identifier. It must of the format Op_me_hsap (Homo sapiens for example). Please ask for help if this does not make sense.\n')
quit()
if os.path.isdir(args.output + '/Output') and (args.first_script == 1 or args.script == 1):
print('\nAn "Output" folder already exists at the given path. Please delete or rename this folder and try again.\n')
quit()
elif os.path.isdir(args.output + '/Output/Intermediate'):
print('\nIt looks like this run is already complete. Try deleting/renaming the Output folder and try again.\n')
quit()
elif not os.path.isdir(args.output + '/Output'):
os.mkdir(args.output + '/Output')
scripts = [0, script_one, script_two, script_three, script_four, script_five]
if args.script == -1:
if args.first_script < args.last_script:
for i in range(1 + args.last_script - args.first_script):
print('\nRunning script ' + str(i + args.first_script) + '...\n')
if i + args.first_script == 1:
scripts[i + args.first_script](args, ten_digit_codes)
else:
scripts[i + args.first_script](args)
else:
print('\nInvalid script combination: the first script must be less than the last script. If you want to use only once script, use the --script argument.\n')
quit()
else:
if args.script == 1:
scripts[args.script](args, ten_digit_codes)
else:
scripts[args.script](args)

View File

@ -0,0 +1,24 @@
#!/bin/bash
#
#SBATCH --job-name=PTL1_genome
#SBATCH --output=PTL1.%j.out # Stdout (%j expands to jobId)
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --ntasks-per-node=64 ##change to number of srun when running multiple instances
#SBATCH --mem=160G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=YOUREMAIL@smith.edu
module purge #Cleans up any loaded modules
module use /gridapps/modules/all #make sure module locations is loaded
module load slurm
module load tqdm
module load Biopython/1.75-foss-2019b-Python-3.7.4
module load BLAST+/2.9.0-gompi-2019b
module load DIAMOND/0.9.30-GCC-8.3.0
path='/beegfs/fast/katzlab/PTL1/Genomes/'
srun -D ${path}Scripts python3 ${path}Scripts/wrapper.py -1 1 -2 5 --cds ${path}PTL1GenomesBatches/PTL1GenomesBatch2 -o ${path}Output/PTL1Genomes_OutputBatch2 --genetic_code Universal --databases ${path}Databases &
wait