diff --git a/PTL1/Genomes/Scripts/1_RenameCDS.py b/PTL1/Genomes/Scripts/1_RenameCDS.py deleted file mode 100644 index 68c21f9..0000000 --- a/PTL1/Genomes/Scripts/1_RenameCDS.py +++ /dev/null @@ -1,217 +0,0 @@ -#!/usr/bin/env python3.5 - -##__Updated__: 19_09_2017 -##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com -##__Usage__: python 1g_RenameCDS.py --help - - -from Bio import SeqIO -from Bio.SeqUtils import GC -import argparse, os, sys, time -from argparse import RawTextHelpFormatter,SUPPRESS - - -#----------------------------- Colors For Print Statements ------------------------------# -class color: - PURPLE = '\033[95m' - CYAN = '\033[96m' - DARKCYAN = '\033[36m' - ORANGE = '\033[38;5;214m' - BLUE = '\033[94m' - GREEN = '\033[92m' - YELLOW = '\033[93m' - RED = '\033[91m' - BOLD = '\033[1m' - UNDERLINE = '\033[4m' - END = '\033[0m' - -#------------------------------- Main Functions of Script --------------------------------# -########################################################################################### -###--------------------- Parses and Checks Command-Line Arguments ----------------------### -########################################################################################### - -def check_args(): - - parser = argparse.ArgumentParser(description= - color.BOLD + '\n\nThis script is intended to extract '+color.RED+'Annotated '+\ - color.PURPLE+'ORFS\n'+color.END+color.BOLD+'from a provided Genbank formatted file.'\ - +usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter) - - required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END) - - required_arg_group.add_argument('--input_file','-in', action='store', - help=color.BOLD+color.GREEN+' Fasta file with CDSs\n'+color.END) - - required_arg_group.add_argument('--output_dir','-o', action='store', - help=color.BOLD+color.GREEN+' Output directory\n'+color.END) - - optional_arg_group = 
parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END) - - optional_arg_group.add_argument('--source','-s', action='store', default='GenBank', - help=color.BOLD+color.GREEN+' Data Source of CDSs (default = "GenBank")\n'+color.END) - - optional_arg_group.add_argument('--list_source','-lsrc', action='store_true', - help=color.BOLD+color.GREEN+' Lists supported data sources\n'+color.END) - - optional_arg_group.add_argument('-author', action='store_true', - help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END) - - - if len(sys.argv[1:]) == 0: - print (parser.description) - print ('\n') - sys.exit() - - args = parser.parse_args() - - more_info = return_more_info(args) - if more_info != None: - print (parser.description) - print (more_info) - sys.exit() - - args.folder = args.output_dir + '/' + args.input_file.split('/')[-1][:10] - - return args - - -########################################################################################### -###------------------------------- Script Usage Message --------------------------------### -########################################################################################### - -def usage_msg(): - return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 1g_RenameCDS.py'\ - ' --input_file ../Stentor_coeruleus.WGS.CDS.Prep/Stentor_coeruleus.WGS.CDS.fasta --source'\ - ' GenBank'+color.END) - - -########################################################################################## -###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------### -########################################################################################## - -def return_more_info(args): - - acceptable_sources = ['in-house', 'in-lab', 'GenBank', 'gb', 'NCBI'] - - author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? 
Email Xyrus (author) at'\ - ' maurerax@gmail.com\n\n'+color.END) - - if args.author == True: - return author - - if args.list_source == True: - print (color.BOLD+color.RED+'\nThese are the currently supported data sources.\n'+color.END) - print (color.BOLD+color.ORANGE+'\n'.join(acceptable_sources)+'\n\n'+color.END) - sys.exit() - - if args.source.lower() not in [i.lower() for i in acceptable_sources]: - print (color.BOLD+color.RED+'\nUnsupported source was provided.\n\nEnsure that '\ - 'you are providing a valid data source (see below).\n'+color.END) - print (color.BOLD+color.ORANGE+'\n'.join(acceptable_sources)+'\n'+color.END) - sys.exit() - - if args.input_file != None: - if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])): - print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\ - '('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\ - ' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) - sys.exit() - - -########################################################################################### -###--------------------------- Does the Inital Folder Prep -----------------------------### -########################################################################################### - -def prep_folders(args): - - if os.path.isdir(args.folder) != True: - os.system('mkdir '+args.folder) - os.system('cp '+args.input_file+' '+args.folder) - args.input_file = args.folder+'/'+args.input_file.split('/')[-1] - - if os.path.isdir(args.folder+'/Original') != True: - os.system('mkdir '+args.folder+'/Original') - - os.system('cp '+args.input_file+' '+args.folder+'/Original/') - -########################################################################################### -###------------- Renames Protein-Coding CDS Sequences to Standard Format ---------------### 
-########################################################################################### - -def renamed_GenomeCDS(args): - - print (color.BOLD+'\n\nPrepping to rename '+color.GREEN+args.input_file.split('/')[-1]+\ - color.END+color.BOLD+"'s CDS sequences"+color.END) - inFasta = sorted((i for i in SeqIO.parse(args.input_file,'fasta')),key=lambda seq_rec: -len(seq_rec.seq)) - - renamed_seqs = [] - seq_code_dict = {} - - count = 1 - for seq_rec in inFasta: - seq_code_dict.setdefault(seq_rec.description,[]).append('Contig_'+str(count)+'_Len'+str(len(seq_rec.seq))) - seq_code_dict[seq_rec.description].append(str(seq_rec.seq).upper()) - renamed_seqs.append('>Contig_' + str(count) + '_Len' + str(len(seq_rec.seq)) + '\n' + str(seq_rec.seq).upper()) - count += 1 - - ## keeps only CDSs that are greater than 30 bp (10 AA --> This is a cut-off in the - ## phylogenomic pipeline too!) - renamed_seqs = [i for i in renamed_seqs if len(i.split('\n')[-1]) > 30] - - print (color.BOLD+'\n\nFor '+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+\ - color.BOLD+', '+color.RED+str(len(renamed_seqs))+' CDS sequences\n'+color.END+color.BOLD+ - 'were renamed while preserving the '+color.ORANGE+args.source+color.END+color.BOLD+' formatting'\ - +color.END+'\n') - - with open(args.input_file.replace('.fasta','.Prepped.fasta'),'w+') as w: - w.write('\n'.join(renamed_seqs)) - - with open(args.input_file.split('/')[-1].replace('.fasta','.SeqCodes.tsv'),'w+') as w: - w.write('Original Name\tNew Name\tSeq Length\t Seq GC\n') - for k, v in seq_code_dict.items(): - w.write(k+'\t'+v[0]+'\t'+str(len(v[1]))+'\t'+str(GC(v[1]))+'\n') - - -########################################################################################### -###--------------------- Cleans up the Folder and Moves Final Files --------------------### -########################################################################################### - -def clean_up(args): - -# os.system('rm '+args.input_file) - os.system('mv ' + 
args.input_file.split('/')[-1].replace('.fasta','.SeqCodes.tsv') + ' ' + args.folder + '/Original/') - os.system('mv ' + args.input_file + ' ' + args.folder + '/Original/') - - -########################################################################################### -###-------------------------------- Next Script Message --------------------------------### -########################################################################################### - -def next_script(args): - - print (color.BOLD+'\nLook for '+color.DARKCYAN+args.input_file.split('/')[-1].replace('.fasta','.Renamed.fasta')\ - +'.fasta'+color.END+color.BOLD+'\nin the '+color.ORANGE+args.folder.split('/')[-1]+\ - ' Folder\n\n'+color.END+color.BOLD) - - print ('Next Script(s) are:\n\n'+color.PURPLE+'2g_GCodeEval.py'+color.END+color.BOLD\ - +' (if Genetic Code is '+color.RED+'Unknown'+color.END+color.BOLD+')\n\nOtherwise:\n\n'+\ - color.PURPLE+'3g_GCodeTranslate.py\n\n'+color.END) - - -########################################################################################## -###----------------------------- Calls on Above Functions -----------------------------### -########################################################################################## - -def main(): - - args = check_args() - - prep_folders(args) - - renamed_GenomeCDS(args) - - clean_up(args) - - next_script(args) - -main() \ No newline at end of file diff --git a/PTL1/Genomes/Scripts/2_GCodeEval.py b/PTL1/Genomes/Scripts/2_GCodeEval.py deleted file mode 100644 index d580c49..0000000 --- a/PTL1/Genomes/Scripts/2_GCodeEval.py +++ /dev/null @@ -1,252 +0,0 @@ -#!/usr/bin/env python3.5 - -##__Updated__: 19_09_2017 -##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com -##__Usage__: python 2g_GCodeEval.py --help - - -############################################################################################# -# # -# Suggests which Genetic Code to use based upon Presence/Absence of Specific Stop Codons # -# at the end of the CDS 
sequences. This is to provide a ROUGH gauge for the user. # -# # -############################################################################################# - - -import argparse, os, sys -from argparse import RawTextHelpFormatter,SUPPRESS -from Bio import SeqIO -from Bio.Seq import Seq - -#----------------------------- Colors For Print Statements ------------------------------# -class color: - PURPLE = '\033[95m' - CYAN = '\033[96m' - DARKCYAN = '\033[36m' - ORANGE = '\033[38;5;214m' - BLUE = '\033[94m' - GREEN = '\033[92m' - YELLOW = '\033[93m' - RED = '\033[91m' - BOLD = '\033[1m' - UNDERLINE = '\033[4m' - END = '\033[0m' - - -#------------------------------- Main Functions of Script --------------------------------# - -########################################################################################### -###------------------------- Checks the Command Line Arguments -------------------------### -########################################################################################### - -def check_args(): - - parser = argparse.ArgumentParser(description= - color.BOLD + '\n\nThis script is intended to aid you with '+color.RED+'evaluating\n(or checking) '+\ - color.END+color.BOLD+'the putative '+color.PURPLE+'Genetic Code'+color.END+color.BOLD+\ - ' for a given\nFasta file of annotated (and untranslated) CDSs.\n\nTo do so, this script'\ - ' checks for stop codon usages,\n'+color.RED+'suggesting '+color.END+color.BOLD+'the use of'\ - +color.PURPLE+' published and well-known\nalternate genetic codes'+color.END+color.BOLD+\ - ' that are supported by the\nnext script: '+color.END+color.BOLD+color.PURPLE+'3g_GCodeTranslate.py'\ - +usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter) - - - required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END) - - required_arg_group.add_argument('--input_file','-in', action='store', - help=color.BOLD+color.GREEN+' Fasta file with CDSs\n'+color.END) - - 
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END) - - optional_arg_group.add_argument('--list_codes','-codes', action='store_true', - help=color.BOLD+color.GREEN+' Lists supported genetic codes\n'+color.END) - - optional_arg_group.add_argument('-author', action='store_true', - help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END) - - - if len(sys.argv[1:]) == 0: - print (parser.description) - print ('\n') - sys.exit() - - args = parser.parse_args() - - quit_eval = return_more_info(args) - if quit_eval > 0: - sys.exit() - - args.folder = '/'.join(args.input_file.split('/')[:-1]) - - return args - - -########################################################################################### -###------------------------------- Script Usage Message --------------------------------### -########################################################################################### - -def usage_msg(): - return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 2g_GCodeEval.py'\ - ' --input_file ../Stentor_coeruleus.WGS.CDS.Prep/Stentor_coeruleus.WGS.CDS.Renamed.fasta'+color.END) - - -########################################################################################## -###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------### -########################################################################################## - -def return_more_info(args): - - valid_arg = 0 - - supported_gcodes = ['Blepharisma\t(TGA = W)','Chilodonella\t(TAG/TGA = Q)','Ciliate\t\t(TAR = Q)',\ - 'Conylostoma\t(TAR = Q, TGA = W)','Euplotes\t(TGA = C)','Peritrich\t(TAR = E)','None\t\t(TGA/TAG/TAA = X)',\ - 'Universal\t(TGA/TAG/TAA = STOP)','TAA\t\t(TAG/TGA = Q)', 'TAG\t\t(TRA = Q)', 'TGA\t\t(TAR = Q)'] - - author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? 
Email Xyrus (author) at'\ - ' maurerax@gmail.com\n\n'+color.END) - - if args.list_codes == True: - print (color.BOLD+color.RED+'\nThese are the currently supported genetic codes.\n'+color.END) - print (color.BOLD+color.ORANGE+'\n'.join(supported_gcodes)+'\n\n'+color.END) - valid_arg += 1 - - if args.author == True: - print (author) - valid_arg += 1 - - print(args.input_file.split('/')[-1], '/'.join(args.input_file.split('/')[:-1])) - - - if args.input_file != None: - if os.path.isfile(args.input_file) != False: - - if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])): - print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\ - '('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\ - ' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) - valid_arg += 1 - else: - print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\ - '('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\ - ' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) - valid_arg += 1 - - return valid_arg - - -########################################################################################### -###-------------------- Counts Several Metrics of Stop Codon Usage ---------------------### -########################################################################################### - -def count_stops(args): - - print (color.BOLD+'\n\nScanning CDSs for In-Frame Stop Codons and Tracking\nFINAL '\ - '(Terminal) stop codon usage\n\n'+color.END) - - inFasta = [i for i in SeqIO.parse(args.input_file,'fasta')] - seq_ends = [str(i.seq)[-3:].lower() for i in inFasta] - inFrame_stops_raw = [str(i.seq[:-3].translate()).count('*') for i in inFasta] - inFrame_stops_summary = [i for i in inFrame_stops_raw if i != 0] - - tga_end = seq_ends.count('tga') - tag_end = seq_ends.count('tag') - 
taa_end = seq_ends.count('taa') - - end_stop_freq = [tga_end, tag_end, taa_end] - - if max(end_stop_freq) > 0.95*sum(end_stop_freq): - pos_to_keep = [i for i, j in enumerate(end_stop_freq) if j == max(end_stop_freq)][0] - try: - if pos_to_keep == 0: - end_stop_freq = [end_stop_freq[0],0,0] - elif pos_to_keep == 1: - end_stop_freq = [0,end_stop_freq[1],0] - elif pos_to_keep == 2: - end_stop_freq = [0,0,end_stop_freq[2]] - except: - pass - - inFrame_stop_info = [len(inFrame_stops_summary), int(round(len(inFrame_stops_raw)*0.05)), sum(inFrame_stops_summary)] - return end_stop_freq, inFrame_stop_info - - -########################################################################################### -###-------------------- Suggests Genetic Code Given Stop Codon Usage -------------------### -########################################################################################### - -def suggest_code(args): - - stop_freq, inFrames = count_stops(args) - - genetic_code = '' - - if stop_freq.count(0) == 3: - print (color.BOLD + color.RED + '\n\nNO Stop Codons Present in Data-set\n\n') - genetic_code = 'None (UNDETERMINED -- NO STOP CODONS)' - else: - ## DUMB way of checking if there are a significant (> 5%) number of CDSs with IN-FRAME stop codons - if inFrames[0] < inFrames[1]: - print (color.BOLD + '\n\nSuggested Genetic Code is: '+color.CYAN+' Universal (table = 1)'+color.END) - genetic_code = 'Universal (table = 1)' - else: - - if stop_freq[0] != 0 and stop_freq[1] != 0 and stop_freq[2] != 0: - print (color.BOLD + '\n\nSuggested Genetic Code is: '+color.CYAN+' Condylostoma-Code'\ - ' (No Dedicated Stops) OR None (all stops = "X")'+color.END) - genetic_code = 'Condylostoma or None' - if stop_freq[0] == 0 and stop_freq[1] == 0: - print (color.BOLD + '\n\nSuggested Genetic Code is: '+color.CYAN+' Chilodonella-Code'\ - +' (Only Stop = TAA)'+color.END) - genetic_code = 'Chilodonella or TAA' - if stop_freq[0] == 0 and stop_freq[2] == 0: - print (color.BOLD + '\n\nSuggested 
Genetic Code is: '+color.CYAN+' TAG-Code'\ - +' (Only Stop = TAG)'+color.END) - genetic_code = 'TAG' - if stop_freq[1] == 0 and stop_freq[2] == 0: - print (color.BOLD + '\n\nSuggested Genetic Code is: '+color.CYAN+' Ciliate-Code'\ - +' (table = 6)'+color.END) - genetic_code = 'Ciliate (table = 6)' - if stop_freq[0] != 0 and stop_freq[1] != 0 and stop_freq[2] == 0: - print (color.BOLD + '\n\nSuggested Genetic Code is: '+color.CYAN+' TGA/TAG are STOP'+color.END) - genetic_code = 'TGA/TAG' - if stop_freq[0] != 0 and stop_freq[1] == 0 and stop_freq[2] != 0: - print (color.BOLD + '\n\nSuggested Genetic Code is: '+color.CYAN+' TGA/TAA are STOP'+color.END) - genetic_code = 'TGA/TAA' - if stop_freq[0] == 0 and stop_freq[1] != 0 and stop_freq[2] != 0: - print (color.BOLD + '\n\nSuggested Genetic Code is: '+color.CYAN+' Blepharisma/Euplotes-Codes'\ - +color.END + color.BOLD+'\n--- NOTE: '+color.RED+' Stop-Codon Reassignments'\ - +' differ! (TGA = W or TGA = C)' + color.END) - genetic_code = 'Blepharisma (TGA = W) or Euplotes (TGA = C)' - - return genetic_code, stop_freq - - -########################################################################################### -###---------------- Writes Out Currently Crummy Summary of Genetic Codes ---------------### -########################################################################################### - -def summarize(args): - - suggestion, stop_freq = suggest_code(args) - - with open(args.input_file.split('.fa')[0]+'.GeneticCode.txt','w+') as w: - w.write('Stop Codon\tFrequency\n') - w.write('TGA\t'+str(stop_freq[0])+'\n') - w.write('TAG\t'+str(stop_freq[1])+'\n') - w.write('TAA\t'+str(stop_freq[2])+'\n\n') - w.write('Suggestion For Genetic Code:\t'+suggestion+'\n\n') - - -########################################################################################## -###--------------- Checks Command Line Arguments and Calls on Functions ---------------### 
-########################################################################################## - -def main(): - - args = check_args() - - summarize(args) - - print (color.BOLD+'\nNext Script is: '+color.PURPLE+' 3g_GCodeTranslate.py\n\n'+color.END) - -main() \ No newline at end of file diff --git a/PTL1/Genomes/Scripts/3_GCodeTranslate.py b/PTL1/Genomes/Scripts/3_GCodeTranslate.py deleted file mode 100644 index 6b97409..0000000 --- a/PTL1/Genomes/Scripts/3_GCodeTranslate.py +++ /dev/null @@ -1,397 +0,0 @@ -#!/usr/bin/env python3.5 - -##__Updated__: 19_09_2017 -##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com -##__Usage__: python 3g_GCodeTranslate.py --help - - -############################################################################## -## ## -## Translates CDSs sequences using the Provided Genetic Code. ## -## ## -## NOTE: ## -## No provided input for genetic code results in Translation with the ## -## UNIVERSAL genetic code (as default) ## -## ## -## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ## -## ## -############################################################################## - - -import argparse, os, sys -from argparse import RawTextHelpFormatter,SUPPRESS -from Bio import SeqIO -from Bio.Seq import Seq -from Bio.Data.CodonTable import CodonTable - - - -#-------------------------- Set-up Codon Tables (Genetic Codes) --------------------------# - -blepharisma_table = CodonTable(forward_table={ - 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', - 'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', - 'TAT': 'Y', 'TAC': 'Y', - 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', - 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', - 'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', - 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q', - 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', - 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', - 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', - 'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K', - 'AGT': 'S', 'AGC': 'S', 
'AGA': 'R', 'AGG': 'R', - 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', - 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', - 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', - 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'}, - start_codons = [ 'ATG'], - stop_codons = ['TAA','TAG']) - -condylostoma_table = CodonTable(forward_table={ - 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', - 'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', - 'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Q', 'TAG': 'Q', - 'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W', - 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', - 'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', - 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q', - 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', - 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', - 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', - 'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K', - 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R', - 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', - 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', - 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', - 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'}, - start_codons = [ 'ATG'], - stop_codons = ['']) - -c_uncinata_table = CodonTable(forward_table={ - 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', - 'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', - 'TAT': 'Y', 'TAC': 'Y', 'TAG': 'Q', - 'TGT': 'C', 'TGC': 'C', 'TGA': 'Q', 'TGG': 'W', - 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', - 'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', - 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q', - 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', - 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', - 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', - 'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K', - 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R', - 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', - 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', - 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', - 'GGT': 'G', 'GGC': 'G', 
'GGA': 'G', 'GGG': 'G'}, - start_codons = [ 'ATG'], - stop_codons = ['TAA']) - -euplotes_table = CodonTable(forward_table={ - 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', - 'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', - 'TAT': 'Y', 'TAC': 'Y', - 'TGT': 'C', 'TGC': 'C', 'TGA': 'C', 'TGG': 'W', - 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', - 'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', - 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q', - 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', - 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', - 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', - 'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K', - 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R', - 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', - 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', - 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', - 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'}, - start_codons = [ 'ATG'], - stop_codons = ['TAA','TAG']) - -myrionecta_table = CodonTable(forward_table={ - 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', - 'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', - 'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Y', 'TAG': 'Y', - 'TGT': 'C', 'TGC': 'C', 'TGG': 'W', - 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', - 'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', - 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q', - 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', - 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', - 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', - 'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K', - 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R', - 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', - 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', - 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', - 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'}, - start_codons = [ 'ATG'], - stop_codons = ['TGA']) - -no_stop_table = CodonTable(forward_table={ - 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', - 'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', - 
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'X', 'TAG': 'X', - 'TGT': 'C', 'TGC': 'C', 'TGA': 'X', 'TGG': 'W', - 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', - 'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', - 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q', - 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', - 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', - 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', - 'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K', - 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R', - 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', - 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', - 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', - 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'}, - start_codons = [ 'ATG'], - stop_codons = ['']) - -peritrich_table = CodonTable(forward_table={ - 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', - 'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', - 'TAT': 'Y', 'TAC': 'Y', 'TAA': 'E', 'TAG': 'E', - 'TGT': 'C', 'TGC': 'C', 'TGG': 'W', - 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', - 'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', - 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q', - 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', - 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', - 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', - 'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K', - 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R', - 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', - 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', - 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', - 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'}, - start_codons = [ 'ATG'], - stop_codons = ['TGA']) - -tag_table = CodonTable(forward_table={ - 'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', - 'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', - 'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Q', - 'TGT': 'C', 'TGC': 'C', 'TGA': 'Q', 'TGG': 'W', - 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', - 'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', - 'CAT': 'H', 'CAC': 'H', 'CAA': 
'Q', 'CAG': 'Q', - 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', - 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', - 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', - 'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K', - 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R', - 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', - 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', - 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', - 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'}, - start_codons = [ 'ATG'], - stop_codons = ['TAG']) - - -#----------------------------- Colors For Print Statements ------------------------------# -class color: - PURPLE = '\033[95m' - CYAN = '\033[96m' - DARKCYAN = '\033[36m' - ORANGE = '\033[38;5;214m' - BLUE = '\033[94m' - GREEN = '\033[92m' - YELLOW = '\033[93m' - RED = '\033[91m' - BOLD = '\033[1m' - UNDERLINE = '\033[4m' - END = '\033[0m' - - -#------------------------------- Main Functions of Script --------------------------------# - -########################################################################################### -###------------------------- Checks the Command Line Arguments -------------------------### -########################################################################################### - -def check_args(): - - parser = argparse.ArgumentParser(description= - color.BOLD + '\n\nThis script will '+color.RED+'Translate '+color.END+color.BOLD+'a '\ - 'given Fasta file of CDS\nsequences using a given'+color.PURPLE+' Genetic Code.'+color.END+\ - color.BOLD+usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter) - - - required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END) - - required_arg_group.add_argument('--input_file','-in', action='store', - help=color.BOLD+color.GREEN+' Fasta file with CDSs\n'+color.END) - - optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END) - - optional_arg_group.add_argument('--genetic_code','-g', action='store', 
default='universal', - help=color.BOLD+color.GREEN+' Genetic code to use for translation\n (default = '\ - '"universal")\n'+color.END) - - optional_arg_group.add_argument('--list_codes','-codes', action='store_true', - help=color.BOLD+color.GREEN+' Lists supported genetic codes\n'+color.END) - - optional_arg_group.add_argument('-author', action='store_true', - help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END) - - - if len(sys.argv[1:]) == 0: - print (parser.description) - print ('\n') - sys.exit() - - args = parser.parse_args() - - quit_eval = return_more_info(args) - if quit_eval > 0: - sys.exit() - - args.folder = '../'+args.input_file.split('/')[1] - args.out_name = args.input_file.split('.Prepped')[0]+'.'+args.genetic_code.title()+'.AA.fasta' - args.new_ntd_name = args.input_file.split('.Prepped')[0]+'.'+args.genetic_code.title()+'.NTD.fasta' - - return args - - -########################################################################################### -###------------------------------- Script Usage Message --------------------------------### -########################################################################################### - -def usage_msg(): - return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 3g_GCodeTranslate.py'\ - ' --input_file ../Stentor_coeruleus.WGS.CDS.Prep/Stentor_coeruleus.WGS.CDS.Prepped.fasta'\ - ' --genetic_code Universal'+color.END) - - -########################################################################################## -###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------### -########################################################################################## - -def return_more_info(args): - - valid_arg = 0 - - supported_gcodes_names = ['bleph','blepharisma','chilo','chilodonella','condy',\ - 'condylostoma','none','eup','euplotes','peritrich','vorticella','ciliate','universal',\ - 'taa','tag','tga'] - - supported_gcodes_list = 
['Blepharisma\t(TGA = W)','Chilodonella\t(TAG/TGA = Q)','Ciliate\t\t(TAR = Q)',\ - 'Conylostoma\t(TAR = Q, TGA = W)','Euplotes\t(TGA = C)','Peritrich\t(TAR = E)','None\t\t(TGA/TAG/TAA = X)',\ - 'Universal\t(TGA/TAG/TAA = STOP)','TAA\t\t(TAG/TGA = Q)', 'TAG\t\t(TRA = Q)', 'TGA\t\t(TAR = Q)'] - - author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\ - ' maurerax@gmail.com\n\n'+color.END) - - - if args.genetic_code != None and args.genetic_code.lower() not in supported_gcodes_names: - print (color.BOLD+color.RED+'\nProvided genetic code is currently unsupported.\n\n'\ - 'If you have a new genetic code, please contact the author (with some evidence).\n\n'\ - 'Otherwise, use one of the currently supported genetic codes.\n'+color.END) - print (color.BOLD+color.ORANGE+'\n'.join(supported_gcodes_list)+'\n\n'+color.END) - print (author) - valid_arg += 1 - else: - if args.list_codes == True: - print (color.BOLD+color.RED+'\nThese are the currently supported genetic codes.\n'+color.END) - print (color.BOLD+color.ORANGE+'\n'.join(supported_gcodes_list)+'\n\n'+color.END) - valid_arg += 1 - - if args.author == True: - print (author) - valid_arg += 1 - - if args.input_file != None: - if os.path.isfile(args.input_file) != False: - if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])): - print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\ - '('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\ - ' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) - valid_arg += 1 - else: - print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\ - '('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\ - ' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) - valid_arg += 1 - - return valid_arg - - 
-########################################################################################## -###------------------ Translates CDSs from the Provided Genetic Code ------------------### -########################################################################################## - -def translate_seqs(args): - - inFasta = [i for i in SeqIO.parse(args.input_file,'fasta')] - - print (color.BOLD+'\n\n\nTranslating: '+color.CYAN+args.input_file.split('/')[-1]+color.END+\ - color.BOLD+'\nwith the '+color.GREEN+args.genetic_code.upper()+' Genetic Code\n'+color.END) - - - if args.genetic_code.lower() == 'ciliate' or args.genetic_code.lower() == 'tga': - translated_seqs = ['>'+seq_rec.description+'\n'+str(seq_rec.seq.translate(table=6)).rstrip('*').replace('*','X')+'\n' for seq_rec in inFasta] - - if args.genetic_code.lower() == 'peritrich' or args.genetic_code.lower() == 'vorticella': - translated_seqs = ['>'+seq_rec.description+'\n'+str(seq_rec.seq.translate(table=peritrich_table)).rstrip('*').replace('*','X')+'\n' for seq_rec in inFasta] - - if args.genetic_code.lower() == 'tag': - translated_seqs = ['>'+seq_rec.description+'\n'+str(seq_rec.seq.translate(table=tag_table)).rstrip('*').replace('*','X')+'\n' for seq_rec in inFasta] - - if args.genetic_code.lower() == 'chilo' or args.genetic_code.lower() == 'chilodonella' or args.genetic_code.lower() == 'taa': - translated_seqs = ['>'+seq_rec.description+'\n'+str(seq_rec.seq.translate(table=c_uncinata_table)).rstrip('*').replace('*','X')+'\n' for seq_rec in inFasta] - - if args.genetic_code.lower() == 'bleph' or args.genetic_code.lower() == 'blepharisma': - translated_seqs = ['>'+seq_rec.description+'\n'+str(seq_rec.seq.translate(table=blepharisma_table)).rstrip('*').replace('*','X')+'\n' for seq_rec in inFasta] - - if args.genetic_code.lower() == 'eup' or args.genetic_code.lower() == 'euplotes': - translated_seqs = ['>'+seq_rec.description+'\n'+str(seq_rec.seq.translate(table=euplotes_table)).rstrip('*').replace('*','X')+'\n' 
for seq_rec in inFasta] - - if args.genetic_code.lower() == 'universal': - translated_seqs = ['>'+seq_rec.description+'\n'+str(seq_rec.seq.translate(table=1)).rstrip('*').replace('*','X')+'\n' for seq_rec in inFasta] - - return translated_seqs - - -########################################################################################## -###---------------------------- Writes Out Translated CDSs ----------------------------### -########################################################################################## - -def write_out(args): - - translated_seqs = translate_seqs(args) - - ## Keep only ORFs greater than 10 amino acids long - translated_seqs = [i for i in translated_seqs if len(i.split('\n')[1]) > 10] - - print (color.BOLD+'\nTranslated '+color.ORANGE+str(len(translated_seqs))+color.END\ - +color.BOLD+' seqeunces using the '+color.GREEN+args.genetic_code.upper()+' Genetic Code\n\n'+color.END) - - with open(args.out_name,'w+') as w: - w.write(''.join(translated_seqs)) - - -########################################################################################## -###--------------------- Cleans up the Folder and Moves Final Files -------------------### -########################################################################################## - -def clean_up(args): - - os.system('mv '+args.input_file+' '+args.new_ntd_name) - - -########################################################################################## -###----------------------------- Calls on Above Functions -----------------------------### -########################################################################################## - -def main(): - - args = check_args() - - write_out(args) - - clean_up(args) - - print (color.BOLD+'Next Script is: '+color.PURPLE+' 4g_CountOgsUsearch.py\n\n'+color.END) - -main() \ No newline at end of file diff --git a/PTL1/Genomes/Scripts/4_CountOGsDiamond.py b/PTL1/Genomes/Scripts/4_CountOGsDiamond.py deleted file mode 100644 index 88d914a..0000000 --- 
a/PTL1/Genomes/Scripts/4_CountOGsDiamond.py +++ /dev/null @@ -1,301 +0,0 @@ -#!/usr/bin/env python3.5 - -##__Updated__: 19_09_2017 -##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com -##__Usage__: python 3g_GCodeTranslate.py --help - -############################################################################## -## ## -## This scrip will categorize TRANSLATED CDSs into Homologous Gene Families ## -## ## -## Questions about Gene Family Binning/Source? SEE NOTES at Bottom! ## -## ## -## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ## -## ## -############################################################################## - -import argparse, os, re, sys -from argparse import RawTextHelpFormatter, SUPPRESS -from distutils import spawn -from Bio import SeqIO - - -#----------------------------- Colors For Print Statements ------------------------------# -class color: - PURPLE = '\033[95m' - CYAN = '\033[96m' - DARKCYAN = '\033[36m' - ORANGE = '\033[38;5;214m' - BLUE = '\033[94m' - GREEN = '\033[92m' - YELLOW = '\033[93m' - RED = '\033[91m' - BOLD = '\033[1m' - UNDERLINE = '\033[4m' - END = '\033[0m' - - -#------------------------------ UPDATE DIAMOND PATH BELOW! -------------------------------# -def check_diamond_path(): - ### IF Diamond is IN YOUR PATH then no updating is needed... - diamond_path = '' - - if diamond_path == '': - diamond_path = spawn.find_executable("diamond") - #diamond_path = /path/to/diamond - else: - pass - - if diamond_path == None: - print (color.BOLD + '\n\nPlease open this script and check that you have included'\ - + ' the PATH to the' + color.BLUE + ' "diamond" '+ color.END + color.BOLD\ - + 'executable.\n\n' + color.END) - print (color.BOLD + color.BLUE + 'LOOK FOR:\n\n' + color.RED\ - +'#------------------------------ UPDATE DIAMOND PATH BELOW! 
-------------------------------#'\ - + color.BLUE + '\n\nThis is somewhere around lines 55 - 80...\n\n' + color.END) - - sys.exit() - else: - pass - - return diamond_path - -#------------------------------- Main Functions of Script --------------------------------# - -########################################################################################### -###--------------------- Parses and Checks Command-Line Arguments ----------------------### -########################################################################################### - -def check_args(): - - parser = argparse.ArgumentParser(description= - color.BOLD + '\n\nThis script will categorize Contigs into'+color.ORANGE+' "Homologous" '\ - +color.END+color.BOLD+'Gene Families (OGs)\nbased on '+color.RED+'OrthoMCL'+color.END\ - +color.BOLD+"'s Gene Family Grouping\n\n\nNotes on this script and "+color.GREEN+\ - 'OrthoMCL Families'+color.END+color.BOLD+' can be found\nat the bottom of '+color.GREEN\ - +'THIS script (4_CountOGsDiamond.py)\n'+color.END+usage_msg(), usage=SUPPRESS, - formatter_class=RawTextHelpFormatter) - - required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END) - - required_arg_group.add_argument('--input_file','-in', action='store', - help=color.BOLD+color.GREEN+'Fasta file of Nucleotide sequences enriched \nwith'\ - ' Eukaryotic protein coding transcripts'+color.END) - required_arg_group.add_argument('--databases','-d', action='store', - help=color.BOLD+color.GREEN+'Path to folder containing db_OG'+color.END) - required_arg_group.add_argument('--evalue','-e', action='store', - help=color.BOLD+color.GREEN+'Maximum OG assignment e-value'+color.END) - - optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END) - - optional_arg_group.add_argument('--threads','-t', default='2', - help=color.BOLD+color.GREEN+' Number of threads to use for BLAST\n (default = 2)\n'+color.END) - - 
optional_arg_group.add_argument('-author', action='store_true', - help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END) - - if len(sys.argv[1:]) == 0: - print (parser.description) - print ('\n') - sys.exit() - - args = parser.parse_args() - - quit_eval = return_more_info(args) - if quit_eval > 0: - sys.exit() - - args.diamond = check_diamond_path() - - args.home_folder = '/'.join(args.input_file.split('/')[:-1]) + '/' - - args.tsv_out = args.home_folder + args.input_file.split('/')[-1].replace('CDS','CDS.Renamed').replace('.AA.fasta','_allOGCleanresults.tsv') - - args.aa_out = args.home_folder + args.input_file.split('/')[-1].replace('CDS','CDS.Renamed') - args.ntd_out = args.home_folder + args.input_file.split('/')[-1].replace('CDS','CDS.Renamed').replace('AA','NTD') - - return args - - -########################################################################################### -###------------------------------- Script Usage Message --------------------------------### -########################################################################################### - -def usage_msg(): - return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 4_CountOGsDiamond.py'\ - ' --input_file ../Stentor_coeruleus.WGS.CDS.Prep/Stentor_coeruleus.WGS.CDS.Universal.AA.fasta'+color.END) - - -########################################################################################## -###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------### -########################################################################################## - -def return_more_info(args): - - valid_arg = 0 - - author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? 
Email Xyrus (author) at'\ - ' maurerax@gmail.com\n\n'+color.END) - - if args.author == True: - print (author) - valid_arg += 1 - - if args.input_file != None: - if os.path.isfile(args.input_file) != False: - if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])): - print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\ - '('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\ - ' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) - valid_arg += 1 - elif args.input_file.endswith('AA.fasta') != True: - print (color.BOLD+'\n\nInvalid Fasta File! Only Fasta Files that were processed'\ - ' with '+color.GREEN+'3g_GCodeTranslate.py '+color.END+color.BOLD+'are valid\n\n'\ - 'However, to bypass that issue, Fasta Files MUST end with '+color.CYAN+\ - '"AA.fasta"\n\n'+color.END) - valid_arg += 1 - else: - print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\ - '('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\ - ' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) - valid_arg += 1 - - if os.path.isdir(args.databases + '/db_OG') != True: - print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\ - +color.ORANGE+'db_OG Folder!\n\n'+color.END+color.BOLD+'Ensure that this folder '\ - 'can be found in the main '+color.ORANGE+'Databases Folder'+color.END+color.BOLD\ - +'\n\nThen try once again\n\n.'+color.END) - valid_arg += 1 - - elif os.path.isfile(args.databases + '/db_OG/OGSout.dmnd') != True: - print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\ - 'Diamond formatted '+color.ORANGE+'Gene Family databases!\n\n'+color.END+color.BOLD+\ - 'Ensure that they can be found in the '+color.ORANGE+'db_OG folder'+color.END+\ - color.BOLD+',\nwhich can be found in the main '+color.ORANGE+'Databases 
Folder'+\ - color.END+color.BOLD+'\n\nThen try once again.\n\n'+color.END) - valid_arg += 1 - - return valid_arg - - -########################################################################################### -###--------------------------- Does the Inital Folder Prep -----------------------------### -########################################################################################### - -def prep_folders(args): - - OG_folder = '/'.join(args.input_file.split('/')[:-1])+'/DiamondOG/' - - if os.path.isdir(OG_folder) != True: - os.system('mkdir '+OG_folder) - - -########################################################################################### -###--------------------- Runs Diamond on Split OrthoMCL Databases ----------------------### -########################################################################################### - -def OG_ublast(args): - - OG_diamond_cmd = args.diamond + ' blastp -q ' + args.input_file + ' -d ' + args.databases + '/db_OG/OGSout.dmnd --evalue ' + args.evalue + ' --subject-cover 0.5 --threads ' + args.threads + ' --outfmt 6 -o ' + args.input_file.split('.fas')[0] + '_allOGresults' - os.system(OG_diamond_cmd) - - -########################################################################################### -###--------------- Keeps the Single BEST Hit (HSP-score) Per Transcript ----------------### -########################################################################################### - -def keep_best(args): - print (color.BOLD+color.PURPLE+'\n\nProcessing OG-database results to keep only the BEST'\ - '\nmatch for each transcript\n\n'+color.END) - - inTSV = [i for i in open(args.input_file.split('.fas')[0]+'_allOGresults').read().split('\n') if i != ''] - - inTSV.sort(key = lambda x: -float(x.split('\t')[-1])) - - keep = [] - for i in inTSV: - if any(i.split('\t')[0] in j for j in keep) != True: - keep.append(i) - - updated_lines = list(set([line.split('\t')[0]+'_'+'_'.join(line.split('\t')[1].split('_')[-2:])+\ - 
'\t'+'\t'.join(line.split('\t')[1:])+'\n' for line in keep])) - - with open(args.tsv_out, 'w+') as w: - for i in updated_lines: - w.write(i+'\n') - - -########################################################################################### -###-------- Copies and Updates Names of Transcripts With OG Hits to New Fasta ----------### -########################################################################################### - -def update_fasta(args): - - print (color.BOLD+color.PURPLE+'Updating Sequence Names with their BEST OG hits\n\n'+color.END) - - keep = [i for i in open(args.tsv_out).read().split('\n') if i != ''] - - keep_dict = {line.split('\t')[0].split('_OG5')[0]:line.split('\t')[0].split('_OG5')[0]+\ - '_OG5_'+line.split('\t')[1].split('_')[-1] for line in keep if 'OG5' in line.split('\t')[1]} - - protFasta = [seq_rec for seq_rec in SeqIO.parse(args.input_file,'fasta')] - - ntdFasta = [seq_rec for seq_rec in SeqIO.parse(args.input_file.replace('.AA.','.NTD.'),'fasta')] - - updated_prot_name = ['>'+keep_dict[i.description]+'\n'+str(i.seq).rstrip('*')+'\n' for i in protFasta if i.description in keep_dict.keys()] - updated_ntd_name = ['>'+keep_dict[i.description]+'\n'+str(i.seq).rstrip('*')+'\n' for i in ntdFasta if i.description in keep_dict.keys()] - - with open(args.aa_out,'w+') as w: - for i in updated_prot_name: - w.write(i) - - with open(args.ntd_out,'w+') as x: - for i in updated_ntd_name: - x.write(i) - - -########################################################################################## -###--------------------- Cleans up the Folder and Moves Final Files -------------------### -########################################################################################## - -def clean_up(args): - - os.system('mv '+args.input_file.replace('.fasta','_allOGresults')+' '+args.home_folder+\ - '/DiamondOG') - - os.system('cp '+args.aa_out+' '+args.home_folder+'/DiamondOG/') - os.system('cp '+args.ntd_out+' '+args.home_folder+'/DiamondOG/') - 
os.system('cp '+args.tsv_out+' '+args.home_folder+'/DiamondOG/') - - -########################################################################################## -###----------------------------- Calls on Above Functions -----------------------------### -########################################################################################## - -def main(): - - args = check_args() - - prep_folders(args) - - OG_ublast(args) - - keep_best(args) - - update_fasta(args) - - clean_up(args) - - print (color.BOLD+'Next Script is: '+color.GREEN+'5g_FinalizeName.py\n\n'+color.END) - -main() - -#----------------------------------------- NOTES -----------------------------------------# -# -# This script uses a "BLAST"-based approach to identify ANCIENT homologous gene families. -# -# Gene family designations were taken from OrthoMCL.org and serve as the database for -# this script's gene family assignments. These gene family assignments are NON-EXHAUSTIVE -# and most Lineage-Specific families will be missed! -# -# If you have any questions contact Xyrus (author): maurerax@gmail.com \ No newline at end of file diff --git a/PTL1/Genomes/Scripts/5_FinalizeName.py b/PTL1/Genomes/Scripts/5_FinalizeName.py deleted file mode 100644 index bcfe2a1..0000000 --- a/PTL1/Genomes/Scripts/5_FinalizeName.py +++ /dev/null @@ -1,374 +0,0 @@ -#!/usr/bin/env python3.5 - -##__Updated__: 20_09_2017 -##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com -##__Usage__: python 5g_FinalizeName.py --help - -################################################################################################## -## This script is intended to rename the outputs of the FilterPartials script ## -## to a given 10-character that is used in the Katz lab Phylogenomic Tree building methods ## -## ## -## Prior to running this script, ensure the following: ## -## ## -## 1. You have assembled your transcriptome and COPIED the 'assembly' file ## -## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ## -## 2. 
Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py ## -## 3. Removed SSU/LSU sequences from your Fasta File ## -## 4. Classified your sequences as Strongly Prokaryotic/Eukaryotic or Undetermined ## -## 5. Classified the Non-Strongly Prokaryotic sequences into OGs ## -## 6. You either know (or have inferred) the genetic code of the organism ## -## 7. You have translated the sequences and checked for the data in the RemovePartials folder ## -## 8. Partial sequences have been removed from the transcriptomic data sets ## -## ## -## COMMAND Example Below ## -## Extra Notes at Bottom of Script ## -## ## -## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ## -## ## -## Next Script(s) to Run: ## -## NONE! You're FINISHED! :D ## -## ## -################################################################################################## - -import argparse, os, sys -from argparse import RawTextHelpFormatter,SUPPRESS - -#----------------------- Solely to Make Print Statements Colorful -----------------------# - -class color: - PURPLE = '\033[95m' - CYAN = '\033[96m' - DARKCYAN = '\033[36m' - ORANGE = '\033[38;5;214m' - BLUE = '\033[94m' - GREEN = '\033[92m' - YELLOW = '\033[93m' - RED = '\033[91m' - BOLD = '\033[1m' - UNDERLINE = '\033[4m' - END = '\033[0m' - - -#------------------------------- Main Functions of Script --------------------------------# - -########################################################################################### -###--------------------- Parses and Checks Command-Line Arguments ----------------------### -########################################################################################### - -def check_args(): - - parser = argparse.ArgumentParser(description= - color.BOLD + '\n\nThis script is intended to '+color.RED+'Rename '+color.END\ - +color.BOLD+'the core set of '+color.PURPLE+'ORFS\n'+color.END+color.BOLD+'with a valid '\ - +color.RED+'10-character code'+color.END+color.BOLD+' for use in 
the KatzLab\nPhylogenomic Pipeline'\ - +usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter) - - required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END) - - required_arg_group.add_argument('--input_file','-in', action='store', - help=color.BOLD+color.GREEN+' One of the Fasta files that is to be renamed\n'+color.END) - required_arg_group.add_argument('--name','-n', action='store', - help=color.BOLD+color.GREEN+' A valid 10-Character code for updating the data\n'+color.END) - - - optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END) - - optional_arg_group.add_argument('-author', action='store_true', - help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END) - - if len(sys.argv[1:]) == 0: - print (parser.description) - print ('\n') - sys.exit() - - args = parser.parse_args() - - quit_eval = return_more_info(args) - if quit_eval > 0: - print ('\n') - sys.exit() - - args.all_output_folder = '/'.join(args.input_file.split('/')[:-3]) - - args.r2g_aa = args.all_output_folder + '/ReadyToGo/ReadyToGo_AA/' - args.r2g_ntd = args.all_output_folder + '/ReadyToGo/ReadyToGo_NTD/' - args.r2g_tsv = args.all_output_folder + '/ReadyToGo/ReadyToGo_TSV/' - args.r2g_xml = args.all_output_folder + '/ReadyToGo/ReadyToGo_XML/' - - args.xml_out = args.input_AA.split('/')[-1]+'_1e-10keepall_BlastOutall.oneHit' - - check_code(args) - - return args - - -########################################################################################### -###------------------------------- Script Usage Message --------------------------------### -########################################################################################### - -def usage_msg(): - return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 5g_FinalizeName.py'\ - ' --input_file ../Stentor_coeruleus.WGS.CDS.Prep/Stentor_coeruleus.WGS.CDS.Renamed.Universal.AA.fasta'\ - ' --name Sr_ci_Scer'+color.END) - - 
-########################################################################################## -###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------### -########################################################################################## - -def return_more_info(args): - - valid_args = 0 - - author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\ - ' maurerax@gmail.com\n\n'+color.END) - - if args.author == True: - print (author) - valid_args += 1 - - if args.input_file.endswith('AA.fasta'): - args.input_NTD = args.input_file.replace('AA.fasta','NTD.fasta') - args.input_AA = args.input_file - args.input_TSV = args.input_file.replace('.AA.fasta','_allOGCleanresults.tsv') - - elif args.input_file.endswith('NTD.fasta'): - args.input_NTD = args.input_file - args.input_AA = args.input_file.replace('NTD.fasta','AA.fasta') - args.input_TSV = args.input_file.replace('.NTD.fasta','_allOGCleanresults.tsv') - - if os.path.isfile(args.input_NTD) != True: - print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Nucleotide '\ - 'Fasta file ('+color.DARKCYAN+args.input_NTD.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\ - ' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) - valid_args += 1 - - if os.path.isfile(args.input_AA) != True: - print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Protein '\ - 'Fasta file ('+color.DARKCYAN+args.input_AA.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\ - ' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) - valid_args += 1 - - if os.path.isfile(args.input_TSV) != True: - print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Nucleotide '\ - 'Fasta file ('+color.DARKCYAN+args.input_TSV.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\ - ' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) - valid_args += 1 - - 
return valid_args - -########################################################################################### -###-------------------- Double Checks Format for 10-Character Code ---------------------### -########################################################################################### - -def check_code(args): - - check_name = args.name.split('_') - - if len(args.name) != 10: - print (color.BOLD+'\n\nNew Species Prefix is not 10 characters long\n\n') - print ('Three examples below:\n'+color.CYAN+'\n\tSr_ci_Cunc\n\n\tOp_me_Hsap\n\n\t'\ - 'Am_ar_Ehis\n\n'+color.END) - sys.exit() - - elif args.name.count('_') != 2: - print (color.BOLD+'\n\nCheck the format of your Species Prefix!\n\n') - print ('Three examples below:\n'+color.CYAN+'\n\tSr_ci_Cunc\n\n\tOp_me_Hsap\n\n\t'\ - 'Am_ar_Ehis\n\n'+color.END) - sys.exit() - - if len(check_name[0]) == 2 and len(check_name[1]) == 2 and len(check_name[2]) == 4: - print (color.BOLD+"\n\nRenaming "+color.ORANGE+args.input_file.split('/')[-1]\ - .split('_Filtered')[0]+color.END+color.BOLD+"'s files\nusing the following 10-character "\ - "code: "+color.CYAN+args.name+color.END+'\n') - - else: - print (color.BOLD+'\n\nCheck the format of your Species Prefix!\n\n') - print ('Three examples below:\n'+color.CYAN+'\n\tSr_ci_Cunc\n\n\tOp_me_Hsap\n\n\t'\ - 'Am_ar_Ehis\n\n'+color.END) - sys.exit() - - -########################################################################################## -###------------------------- Creates Folders For Storing Data -------------------------### -########################################################################################## - -def prep_folders(args): - - if os.path.isdir(args.all_output_folder + '/ReadyToGo/') != True: - os.system('mkdir ' + args.all_output_folder + '/ReadyToGo') - - if os.path.isdir(args.all_output_folder + '/ReadyToGo/ReadyToGo_NTD/') != True: - os.system('mkdir '+args.r2g_ntd) - if os.path.isdir(args.all_output_folder + '/ReadyToGo/ReadyToGo_AA/') != True: - 
os.system('mkdir '+args.r2g_aa) - if os.path.isdir(args.all_output_folder + '/ReadyToGo/ReadyToGo_TSV/') != True: - os.system('mkdir '+args.r2g_tsv) - if os.path.isdir(args.all_output_folder + '/ReadyToGo/ReadyToGo_XML/') != True: - os.system('mkdir '+args.r2g_xml) - - -########################################################################################### -###----------- Renames the NTD and AA CDSs with the Given 10-Character Code ------------### -########################################################################################### - -def rename_paralogs(args): - - home_folder = '/'.join(args.input_AA.split('/')[:-2]) + '/' - - print('HOME ' + home_folder) - - print (color.BOLD+'\nRenaming Translated (Protein) '+color.PURPLE+'ORFs\n'+color.END) - renamed_Final_Prots = open(args.input_AA).read().replace('>','>'+args.name+'_') - - print (color.BOLD+'\nRenaming Nucleotide '+color.PURPLE+'ORFs\n'+color.END) - renamed_Final_Nucs = open(args.input_NTD).read().replace('>','>'+args.name+'_') - - - print (color.BOLD+'\nUpdating CDS Names in the Spreadsheet'+color.END) - if '\n\n' in open(args.input_TSV).read(): - renamed_Final_tsv = open(args.input_TSV).read().rstrip('\n')\ - .replace('\n\n','\n'+args.name+'_') - else: - renamed_Final_tsv = open(args.input_TSV).read().rstrip('\n')\ - .replace('\n','\n'+args.name+'_') - - with open(home_folder + args.input_AA.split('/')[-1],'w+') as w: - w.write(renamed_Final_Prots) - - with open(home_folder + args.input_NTD.split('/')[-1],'w+') as x: - x.write(renamed_Final_Nucs) - - with open(home_folder + args.input_TSV.split('/')[-1],'w+') as y: - y.write(renamed_Final_tsv) - - -########################################################################################### -###--------------------------------- Header/Tail Lines ---------------------------------### -########################################################################################### - -def header_tail(): - header = '\n\n'\ - '\n blastp\n BLASTP 2.2.29+\n'\ - 
' Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), "Gapped BLAST and PSI-BLAST: a new generation of protein database search programs", Nucleic Acids Res. 25:3389-3402.\n'\ - ' ../OGBlastDB/renamed_aa_seqs_OrthoMCL-5_12653.fasta\n Query_1\n' - - tail = '\n' - return header, tail - - -########################################################################################### -###------------------------------- TSV to XML Conversion -------------------------------### -########################################################################################### - -def convert_TSV_data(args): - - home_folder = '/'.join(args.input_AA.split('/')[:-2]) - - TSVforConvert = home_folder+ '/' + args.input_TSV.split('/')[-1] - - inTSV = [line.rstrip('\n') for line in open(TSVforConvert).readlines() if line != '\n'] - - iterations = [] - - for n in range(len(inTSV)): - if n == 0: - iterations.append(' '+inTSV[n].split('\t')[0]+'\n '+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'\n'\ - ' \n \n BLOSUM62\n 1e-10\n'\ - ' 11\n 1\n F\n'\ - ' \n \n\n\n 1\n Query_1\n'\ - ' '+inTSV[n].split('\t')[0]+'\n '+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'\n'\ - '\n\n 1\n Fake_Entry\n '+inTSV[n].split('\t')[1]+'\n Fake_Accession\n'\ - ' '+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'\n \n \n 1\n 1234\n'\ - ' '+inTSV[n].split('\t')[-1]+'\n '+inTSV[n].split('\t')[-2]+'\n '+inTSV[n].split('\t')[-4]+'\n'\ - ' '+inTSV[n].split('\t')[-3]+'\n '+inTSV[n].split('\t')[-4]+'\n '+inTSV[n].split('\t')[-3]+'\n'\ - ' 0\n 0\n '+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'\n'\ - ' '+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'\n 0\n '+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'\n'\ - ' \n \n \n \n \n\n'\ - '\n\n \n \n 379660\n 197499634\n'\ - ' 123\n 184705217500\n 
0.041\n'\ - ' 0.267\n 0.14\n \n \n\n') - else: - iterations.append('\n '+str(n+1)+'\n Query_'+str(n+1)+'\n'\ - ' '+inTSV[n].split('\t')[0]+'\n '+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'\n'\ - '\n\n 1\n Fake_Entry\n '+inTSV[n].split('\t')[1]+'\n Fake_Accession\n'\ - ' '+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'\n \n \n 1\n 1234\n'\ - ' '+inTSV[n].split('\t')[-1]+'\n '+inTSV[n].split('\t')[-2]+'\n '+inTSV[n].split('\t')[-4]+'\n'\ - ' '+inTSV[n].split('\t')[-3]+'\n '+inTSV[n].split('\t')[-4]+'\n '+inTSV[n].split('\t')[-3]+'\n'\ - ' 0\n 0\n '+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'\n'\ - ' '+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'\n 0\n '+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'\n'\ - ' \n \n \n \n \n\n'\ - '\n\n \n \n 379660\n 197499634\n'\ - ' 123\n 184705217500\n 0.041\n'\ - ' 0.267\n 0.14\n \n \n\n') - - return iterations - - -########################################################################################### -###--------------------------- Writes Out the Fake XML File ----------------------------### -########################################################################################### - -def write_Fake_XML(args): - - home_folder = '/'.join(args.input_AA.split('/')[:-2]) + '/' - - print (color.BOLD+'\n\nConverting '+color.ORANGE+args.name+'_XX_'+args.input_TSV.split('/')[-1]\ - +color.END+color.BOLD+' to XML format\n'+color.END) - - header, tail = header_tail() - - iterations = convert_TSV_data(args) - - with open(home_folder+args.xml_out,'w+') as w: - w.write(header) - w.write(''.join(iterations)) - w.write(tail) - -########################################################################################## -###-------------------- Cleans up the Folder and Moves Final Files --------------------### -########################################################################################## -def 
clean_up(args): - - final_folder = '/'.join(args.input_file.split('/')[:-2]) + '/' - - os.system('rm '+args.input_AA) - os.system('rm '+args.input_NTD) - os.system('rm '+args.input_TSV) - - os.system('cp '+final_folder+'*Renamed.*.AA.fasta '+args.r2g_aa) - os.system('cp '+final_folder+'*Renamed.*.NTD.fasta '+args.r2g_ntd) - os.system('cp '+final_folder+'*.Renamed.*_allOGCleanresults.tsv '+args.r2g_tsv) - os.system('cp '+final_folder+'*oneHit '+args.r2g_xml) - -########################################################################################### -###-------------------------------- Next Script Message --------------------------------### -########################################################################################### - -def next_script(args): - - print (color.BOLD+'\nThere is no next script! The final '+color.ORANGE+args.xml_out\ - .split('_XX')[0]+color.END+color.BOLD+' files can be\nfound in the '+color.RED+\ - args.xml_out.split('_XX_')[-1].split('.Renamed')[0]+'.Prep'+color.END+color.BOLD+' and '\ - +color.RED+'ReadyToGo folders'+color.END+color.BOLD+' and are ready\n'\ - 'for the KatzLab Phylogenomic Tree-Building Steps!\n\n'+color.END) - -########################################################################################## -###--------------- Checks Command Line Arguments and Calls on Functions ---------------### -########################################################################################## - -def main(): - - args = check_args() - - prep_folders(args) - - rename_paralogs(args) - - write_Fake_XML(args) - - clean_up(args) - - next_script(args) - -main() \ No newline at end of file diff --git a/PTL1/Genomes/Scripts/wrapper.py b/PTL1/Genomes/Scripts/wrapper.py deleted file mode 100644 index f984cb7..0000000 --- a/PTL1/Genomes/Scripts/wrapper.py +++ /dev/null @@ -1,170 +0,0 @@ -import os, sys, re -import argparse - - -def get_args(): - - parser = argparse.ArgumentParser( - prog = 'PhyloToL v6.0 Part 1 for GenBank Genomes', - 
def get_args():
    """Build the wrapper's command-line parser and parse ``sys.argv``.

    Returns:
        argparse.Namespace with the attributes ``script``, ``first_script``,
        ``last_script`` (each -1 when not given), ``cds``, ``output``,
        ``xplate_contam``, ``genetic_code``, ``minlen`` and ``databases``.
    """
    parser = argparse.ArgumentParser(
        prog = 'PhyloToL v6.0 Part 1 for GenBank Genomes',
        description = "Updated January 19th, 2023 by Auden Cote-L'Heureux. Link to GitHub: https://github.com/AudenCote/PhyloToL_v6.0"
    )

    # NOTE(review): 6 is accepted for --script/--last_script, but the wrapper's
    # dispatch table only lists scripts 1-5 -- confirm whether 6 should be dropped.
    parser.add_argument('-s', '--script', default = -1, type = int, choices = [1, 2, 3, 4, 5, 6], help = 'Script to run if you are only running one script')
    parser.add_argument('-1', '--first_script', default = -1, type = int, choices = [1, 2, 3, 4, 5], help = 'First script to run')
    # The help text used to be a copy of the --first_script one.
    parser.add_argument('-2', '--last_script', default = -1, type = int, choices = [2, 3, 4, 5, 6], help = 'Last script to run')
    parser.add_argument('-c', '--cds', type = str, help = 'Path to a folder of nucleotide CDS. Each file name should start with a unique 10 digit code, and end in "_GenBankCDS.fasta", E.g. Op_me_hsap_GenBankCDS.fasta')
    parser.add_argument('-o', '--output', default = '../', type = str, help = 'An "Output" folder will be created at this directory to contain all output files. By default this folder will be created at the parent directory of the Scripts folder')
    parser.add_argument('-x', '--xplate_contam', action = 'store_true', help = 'Run cross-plate contamination removal (includes all files)')
    # In this wrapper script 2 writes gcode_output.tsv and script 3 consumes the
    # genetic code, so the help text names those steps.
    parser.add_argument('-g', '--genetic_code', type = str, help = 'If all of your taxa use the same genetic code, you may enter it here (to be used in script 3). Otherwise, stop after script 2 and fill in "gcode_output.tsv" before running script 3')
    parser.add_argument('-l', '--minlen', type = int, default = 200, help = 'Minimum CDS length')
    parser.add_argument('-d', '--databases', type = str, default = '../Databases', help = 'Path to databases folder (which should contain db_OG)')

    return parser.parse_args()
def script_one(args, ten_digit_codes):
    """Run 1_RenameCDS.py on every accepted CDS file in ``args.cds``.

    A file is processed when its name is a ten-character code listed in
    ``ten_digit_codes`` followed by ``_GenBankCDS.fasta``.  Results are sent to
    ``<args.output>/Output``.
    """
    import subprocess

    suffix = '_GenBankCDS.fasta'
    for file_name in os.listdir(args.cds):
        if file_name[10:] == suffix and file_name[:10] in ten_digit_codes:
            # Argument list instead of a shell string: paths with spaces or
            # shell metacharacters are passed through unchanged.
            subprocess.run(['python', '1_RenameCDS.py', '-in', args.cds + '/' + file_name, '-s', 'GenBank', '-o', args.output + '/Output'])


def script_two(args):
    """Run 2_GCodeEval.py per taxon and summarise the results in gcode_output.tsv.

    For every folder in ``<args.output>/Output`` that holds a
    ``<folder>_GenBankCDS.Prepped.fasta`` file, 2_GCodeEval.py is run on it.
    Afterwards each ``<folder>_GenBankCDS.Prepped.GeneticCode.txt`` is read as
    tab-separated ``codon<TAB>value`` lines, and the values for TGA, TAG and
    TAA are written to ``<args.output>/Output/gcode_output.tsv``, one row per
    folder.  The header's last column, "Genetic Code", is not filled in here.

    Values are matched to columns by codon name, so the order of the lines in
    the GeneticCode.txt file does not matter; a codon that is absent is
    written as ``NA`` instead of raising IndexError.
    """
    import subprocess

    output_dir = args.output + '/Output'
    stop_codons = ('TGA', 'TAG', 'TAA')  # same order as the header columns

    for folder in os.listdir(output_dir):
        prepped = output_dir + '/' + folder + '/' + folder + '_GenBankCDS.Prepped.fasta'
        if os.path.isfile(prepped):
            subprocess.run(['python', '2_GCodeEval.py', '--input_file', prepped])

    gcode_info = []
    # Sorted so the rows of the summary come out in a reproducible order.
    for folder in sorted(os.listdir(output_dir)):
        gcode_file = output_dir + '/' + folder + '/' + folder + '_GenBankCDS.Prepped.GeneticCode.txt'
        if not os.path.isfile(gcode_file):
            continue

        densities = {}
        with open(gcode_file) as f:
            for line in f:
                fields = line.strip().split('\t')
                if len(fields) > 1 and fields[0] in stop_codons:
                    # Keep the first value seen for a codon.
                    densities.setdefault(fields[0], fields[1])

        gcode_info.append([folder] + [densities.get(codon, 'NA') for codon in stop_codons])

    with open(output_dir + '/gcode_output.tsv', 'w') as g:
        g.write('10 Digit Code\tIn-frame TGA Density\tIn-frame TAG Density\tIn-frame TAA Density\tGenetic Code\n')
        for row in gcode_info:
            g.write('\t'.join(row) + '\n')
def script_three(args):
    """Run 3_GCodeTranslate.py on every prepped taxon with its genetic code.

    When ``args.genetic_code`` is one of the recognised codes
    (case-insensitive) it is used for every folder in
    ``<args.output>/Output`` that holds a
    ``<folder>_GenBankCDS.Prepped.fasta`` file.  Otherwise the code of each
    taxon is looked up in ``<args.output>/Output/gcode_output.tsv``: the row
    whose first column equals the folder name supplies the code in its last
    column.  A taxon whose own row holds an unrecognised code is reported and
    skipped.

    Raises:
        FileNotFoundError: if the per-taxon lookup is needed and
            gcode_output.tsv does not exist.
    """
    import subprocess

    valid_codes = ['universal', 'blepharisma', 'chilodonella', 'condylostoma', 'euplotes', 'peritrich', 'vorticella', 'mesodinium', 'tag', 'tga', 'taa', 'none']
    output_dir = args.output + '/Output'

    if args.genetic_code is not None and args.genetic_code.lower() in valid_codes:
        for folder in os.listdir(output_dir):
            prepped = output_dir + '/' + folder + '/' + folder + '_GenBankCDS.Prepped.fasta'
            if os.path.isfile(prepped):
                subprocess.run(['python', '3_GCodeTranslate.py', '-in', prepped, '-g', args.genetic_code.lower()])
        return

    # Read the table once and close the file straight away.
    with open(output_dir + '/gcode_output.tsv', 'r') as g:
        rows = [line.strip().split('\t') for line in g]

    for folder in os.listdir(output_dir):
        prepped = output_dir + '/' + folder + '/' + folder + '_GenBankCDS.Prepped.fasta'
        if not os.path.isfile(prepped):
            continue

        for row in rows:
            # Only this taxon's own row matters; previously every row with an
            # unrecognised code was reported against the current folder.
            if row[0] != folder:
                continue

            if row[-1].lower() in valid_codes:
                # Lower-cased to match what the single-code branch passes on.
                subprocess.run(['python', '3_GCodeTranslate.py', '-in', prepped, '-g', row[-1].lower()])
            else:
                print('\n' + row[-1] + ' is not a valid genetic code. Skipping taxon ' + folder + '.\n')
def script_four(args):
    """Run 4_CountOGsDiamond.py on every translated taxon.

    A folder in ``<args.output>/Output`` is processed when it holds a
    ``<folder>_GenBankCDS.Universal.AA.fasta`` file.  The script is called
    with ``-t 30``, ``--evalue 1e-15`` and ``--databases args.databases``.
    """
    import subprocess

    output_dir = args.output + '/Output'
    for folder in os.listdir(output_dir):
        aa_fasta = output_dir + '/' + folder + '/' + folder + '_GenBankCDS.Universal.AA.fasta'
        if os.path.isfile(aa_fasta):
            # Argument list instead of a shell string, so paths with spaces work.
            subprocess.run(['python', '4_CountOGsDiamond.py', '-in', aa_fasta, '-t', '30', '--databases', args.databases, '--evalue', '1e-15'])


def script_five(args):
    """Run 5_FinalizeName.py per taxon, then file everything else under Intermediate.

    After the per-taxon runs, every entry of ``<args.output>/Output`` other
    than ``ReadyToGo`` and ``Intermediate`` is moved into
    ``<args.output>/Output/Intermediate``, which is created if needed (an
    existing one no longer aborts the run).

    Raises:
        shutil.Error: if an entry of the same name already exists inside
            Intermediate.
    """
    import shutil
    import subprocess

    output_dir = args.output + '/Output'

    for folder in os.listdir(output_dir):
        # NOTE(review): the file is checked for directly in the taxon folder,
        # but the path handed to script 5 is inside its DiamondOG subfolder --
        # confirm both locations are expected to hold it.
        if os.path.isfile(output_dir + '/' + folder + '/' + folder + '_GenBankCDS.Renamed.Universal.AA.fasta'):
            renamed_fasta = output_dir + '/' + folder + '/DiamondOG/' + folder + '_GenBankCDS.Renamed.Universal.AA.fasta'
            subprocess.run(['python', '5_FinalizeName.py', '-in', renamed_fasta, '-n', folder])

    intermediate = output_dir + '/Intermediate'
    os.makedirs(intermediate, exist_ok=True)

    for entry in os.listdir(output_dir):
        if entry != 'ReadyToGo' and entry != 'Intermediate':
            shutil.move(output_dir + '/' + entry, intermediate)
if __name__ == "__main__":
    # Command-line driver: validate the request, prepare the Output folder,
    # then run the requested script(s) in order.

    args = get_args()

    starts_at_one = args.first_script == 1 or args.script == 1
    output_dir = args.output + '/Output'

    # Index 0 is a placeholder so that script numbers map directly to indices.
    scripts = [0, script_one, script_two, script_three, script_four, script_five]

    # Work out which scripts were asked for before touching the file system,
    # so an invalid request does not leave an empty Output folder behind.
    if args.script == -1:
        # Both ends must be given: an unset first script (-1) would otherwise
        # index the dispatch table from its end.
        if not (1 <= args.first_script < args.last_script):
            print('\nInvalid script combination: the first script must be less than the last script. If you want to use only one script, use the --script argument.\n')
            sys.exit()
        requested = list(range(args.first_script, args.last_script + 1))
    else:
        requested = [args.script]

    if requested[-1] >= len(scripts):
        print('\nScript ' + str(requested[-1]) + ' does not exist: the available scripts are 1 through ' + str(len(scripts) - 1) + '.\n')
        sys.exit()

    # args.cds is None when --cds is omitted; os.path.isdir(None) would raise.
    if starts_at_one and (args.cds is None or not os.path.isdir(args.cds)):
        print('\nIf starting at the first script, a valid path to a folder of nucleotide CDS files (which must end in .fasta) should be input using the --cds argument')
        sys.exit()

    ten_digit_codes = []
    if starts_at_one:
        for file in os.listdir(args.cds):
            if file[10:] == '_GenBankCDS.fasta':
                ten_digit_codes.append(file[:10])
    elif not os.path.isdir(output_dir):
        print('\nA folder called "Output" is not found at the given output path. Enter the correct path for --output or start from script 1.\n')
        # Abort here: continuing would create an empty Output folder and run
        # the later scripts on nothing.
        sys.exit()

    if len(ten_digit_codes) > len(set(ten_digit_codes)):
        print('\nDuplicate 10-digit codes are not allowed. Aborting.\n')
        sys.exit()

    # A code looks like Op_me_hsap: underscores at positions 2 and 5,
    # ASCII letters or digits everywhere else.
    allowed_chars = 'qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM1234567890'
    for code in ten_digit_codes:
        for c, char in enumerate(code):
            is_separator = c == 2 or c == 5
            if (not is_separator and char not in allowed_chars) or (is_separator and char != '_'):
                print('\n' + code + ' is an invalid 10-digit code sample identifier. It must be of the format Op_me_hsap (Homo sapiens for example). Please ask for help if this does not make sense.\n')
                sys.exit()

    if os.path.isdir(output_dir) and starts_at_one:
        print('\nAn "Output" folder already exists at the given path. Please delete or rename this folder and try again.\n')
        sys.exit()
    elif not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    for number in requested:
        if args.script == -1:
            print('\nRunning script ' + str(number) + '...\n')
        # Only script one takes the list of ten-digit codes.
        if number == 1:
            scripts[number](args, ten_digit_codes)
        else:
            scripts[number](args)
If you want to use only once script, use the --script argument.\n') - quit() - else: - if args.script == 1: - scripts[args.script](args, ten_digit_codes) - else: - scripts[args.script](args) - - - - - - - - - - - - - - - - - diff --git a/PTL1/Genomes/Scripts/wrapper_submit.sh b/PTL1/Genomes/Scripts/wrapper_submit.sh deleted file mode 100644 index 13a673c..0000000 --- a/PTL1/Genomes/Scripts/wrapper_submit.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -# -#SBATCH --job-name=PTL1_genome -#SBATCH --output=PTL1.%j.out # Stdout (%j expands to jobId) -#SBATCH --nodes=1 -#SBATCH --ntasks=1 -#SBATCH --ntasks-per-node=64 ##change to number of srun when running multiple instances -#SBATCH --mem=160G -#SBATCH --mail-type=ALL -#SBATCH --mail-user=YOUREMAIL@smith.edu - -module purge #Cleans up any loaded modules -module use /gridapps/modules/all #make sure module locations is loaded - -module load slurm -module load Biopython/1.75-foss-2019b-Python-3.7.4 -module load BLAST+ -module load DIAMOND/0.9.30-GCC-8.3.0 - -export PATH=$PATH:/Users/katzlab/scratch/katzlab/grid_phylotol_setup/programs/standard-RAxML-master -export PATH=$PATH:/Users/katzlab/scratch/katzlab/grid_vsearch_setup/vsearch-2.15.1-linux-x86_64/bin - -python wrapper.py -1 1 -2 5 --cds ../TestData --genetic_code Universal --databases ../Databases \ No newline at end of file