Add files via upload

2025-12-27 03:30:25 +08:00 · 2023-06-12 13:31:51 -04:00 · 2023-06-12 13:31:51 -04:00 · 6dc90e0d4c
commit 6dc90e0d4c
parent f4b7839ea9
20 changed files with 37278 additions and 0 deletions
--- a/PTL1/Transcriptomes/Scripts/1a_ContigFiltStats.py
+++ b/PTL1/Transcriptomes/Scripts/1a_ContigFiltStats.py
@ -0,0 +1,269 @@
+#!/usr/bin/env python3.6
+
+##__Updated__: 01_04_2023
+##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
+##__Usage__: python 1_ContigFiltStats.py
+##__Options__: python 1_ContigFiltStats.py --help
+
+##########################################################################################
+## This script is intended to remove small transcripts or small contigs below a given   ##
+## minimum size from a transcriptome assembly.                                          ##
+##                                                                                      ##
+## Prior to running this script, ensure the following:                                  ##
+## 1. You have assembled your transcriptome and COPIED the 'assembly' file              ##
+##    (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder                    ##
+##                                                                                      ##
+##                              COMMAND Example Below                                   ##
+##                                                                                      ##
+##          E-mail Xyrus (author) for help if needed: maurerax@gmail.com                ##
+##                                                                                      ##
+##                          Next Script(s) to Run:                                      ##
+##  AutoBactVsEuk.py (removes SSU then Bact) or 2a_removeSSU.py then 2b_removeBact.py   ##
+##                                                                                      ##
+##########################################################################################
+
+
+import argparse, os, sys
+from argparse import RawTextHelpFormatter,SUPPRESS
+from Bio import SeqIO
+from Bio.SeqUtils import GC
+
+
+#----------------------------- Colors For Print Statements ------------------------------#
+
+class color:
+   PURPLE = '\033[95m'
+   CYAN = '\033[96m'
+   DARKCYAN = '\033[36m'
+   ORANGE = '\033[38;5;214m'
+   BLUE = '\033[94m'
+   GREEN = '\033[92m'
+   YELLOW = '\033[93m'
+   RED = '\033[91m'
+   BOLD = '\033[1m'
+   UNDERLINE = '\033[4m'
+   END = '\033[0m'
+
+
+#------------------------------- Main Functions of Script --------------------------------#
+
+###########################################################################################
+###--------------------- Parses and Checks Command-Line Arguments ----------------------###
+###########################################################################################
+
+def check_args():
+
+	parser = argparse.ArgumentParser(description=
+	color.BOLD+'\nThis script will remove Contigs (and provide a summary of statistics)'\
+	+'\nfrom your Assembly that are shorter than a given length.'+color.ORANGE+\
+	'\n\nA good minimum length to start with is 200bp.'+color.END+color.BOLD+\
+	'\n\nThe minimum length value should be adjusted for your data sets.\n'+color.END+usage_msg(), 
+	usage=SUPPRESS,formatter_class=RawTextHelpFormatter)
+	
+	required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
+
+	required_arg_group.add_argument('--input_file','-in', action='store',
+	help=color.BOLD+color.GREEN+" Fasta file of Protein/Nucleotide sequences\n"+color.END)
+
+	required_arg_group.add_argument('--output_file','-out',
+	help=color.BOLD+color.GREEN+" Desired Output Name\n\n"+color.END)
+
+	required_arg_group.add_argument('--minLen','-min', default=200, type=int,
+	help=color.BOLD+color.GREEN+" Minimum number of base pairs for contigs\n (default = 200)"+color.END)
+	required_arg_group.add_argument('--maxLen','-max', default=15000, type=int,
+	help=color.BOLD+color.GREEN+" Minimum number of base pairs for contigs\n (default = 15000)"+color.END)
+
+	optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
+
+	optional_arg_group.add_argument('--spades','-spades', action='store_true',
+	help=color.BOLD+color.GREEN+'rnaSPAdes transcriptome assembly\n'+color.END)
+
+	optional_arg_group.add_argument('--genbank','-gb', action='store_true',
+	help=color.BOLD+color.GREEN+'Assembly from Genbank\n (Will include Accession Number in'\
+	' contig name)\n'+color.END)
+
+	optional_arg_group.add_argument('-author', action='store_true',
+	help=color.BOLD+color.GREEN+' Print author contact information\n'+color.END)
+	
+
+	if len(sys.argv[1:]) == 0:
+		print (parser.description)
+		print ('\n')
+		sys.exit()
+
+	args = parser.parse_args()
+	
+	quit_eval = return_more_info(args)
+	if quit_eval > 0:
+		sys.exit()
+
+	args = parser.parse_args()
+	
+	return args
+
+
+###########################################################################################
+###------------------------------- Script Usage Message --------------------------------###
+###########################################################################################
+
+def usage_msg():
+	return color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 1_ContigFiltStats.py'\
+	' --input_file ../Op_me_Xxma_rnaSPAdes_scaffolds_15_05.fasta --output_file '\
+	'Op_me_Xxma --minLen 200 --spades'+color.END
+
+
+##########################################################################################
+###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
+##########################################################################################
+
+def return_more_info(args):
+
+	valid_arg = 0
+
+	author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
+	' maurerax@gmail.com\n\n'+color.END)
+
+	if args.author == True:
+		print (author)
+		valid_arg += 1
+
+	if args.input_file != None:
+		if os.path.isfile(args.input_file) != False:
+			if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
+				print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
+				'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
+				' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) 
+				valid_arg += 1
+		else:
+			print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
+			'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
+			' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) 
+			valid_arg += 1
+	
+	if args.output_file == None:
+		valid_arg += 1
+
+	return valid_arg
+
+###########################################################################################
+###--------------------------- Does the Inital Folder Prep -----------------------------###
+###########################################################################################
+
+def prep_folders(args):
+	Home_folder_name = args.output_file
+	
+	if os.path.isdir(args.output_file) != True:
+		os.system('mkdir ' + args.output_file)
+
+	if os.path.isdir(args.output_file + '/OriginalFasta/') != True:
+		os.system('mkdir ' + args.output_file +'/OriginalFasta/')
+
+	if os.path.isdir(args.output_file + '/SizeFiltered/') != True:
+		os.system('mkdir ' + args.output_file +'/SizeFiltered/')
+
+	if os.path.isdir('/'.join(args.output_file.split('/')[:-1]) + '/XlaneBleeding/') != True:
+		os.system('mkdir ' + '/'.join(args.output_file.split('/')[:-1]) + '/XlaneBleeding/')
+
+
+###########################################################################################
+###---------- Renames the Contigs, Writes them out, and Calculates Basic Info ----------###
+###########################################################################################
+
+def rename_Transcriptome(args):
+	
+	home_folder = args.output_file + '/SizeFiltered/'
+
+	print (color.BOLD+'\n\nPrepping '+color.GREEN+args.input_file.split('/')[-1]+color.END)
+
+	inFasta = [i for i in SeqIO.parse(args.input_file,'fasta') if len(i.seq) >= args.minLen and len(i.seq) <= args.maxLen]
+	inFasta.sort(key=lambda seq_rec: -len(seq_rec.seq))
+
+	renamed_seqs = []
+	seq_code_dict = {}
+
+	count = 1
+
+	seq_name_start = 'Contig'
+
+	if args.genbank == True:
+		for seq_rec in inFasta:
+			seq_code_dict.setdefault(seq_rec.id,[]).append(seq_rec.id.split('_')[-1].split('.')[0]+'_Contig_'+str(count)+'_Len'+str(len(seq_rec.seq)))
+			seq_code_dict.setdefault(seq_rec.id,[]).append(str(seq_rec.seq).upper())
+			renamed_seqs.append('>'+seq_rec.id.split('_')[-1].split('.')[0]+'_Contig_'+str(count)+'_Len'+str(len(seq_rec.seq))+'\n'+str(seq_rec.seq).upper())
+			count += 1
+	elif args.spades == True:
+		for seq_rec in inFasta:
+			seq_code_dict.setdefault(seq_rec.description,[]).append(seq_name_start+'_'+str(count)+'_Len'+str(len(seq_rec.seq))+'_Cov'+str(int(round(float(seq_rec.description.split('_')[-3])))))
+			seq_code_dict.setdefault(seq_rec.description,[]).append(seq_rec.description.split('_')[5])
+			seq_code_dict.setdefault(seq_rec.description,[]).append(str(seq_rec.seq).upper())
+			renamed_seqs.append('>'+seq_name_start+'_'+str(count)+'_Len'+str(len(seq_rec.seq))+'_Cov'+str(int(round(float(seq_rec.description.split('_')[-3]))))+'\n'+str(seq_rec.seq).upper())
+			count += 1
+	else:
+		for seq_rec in inFasta:
+			seq_code_dict.setdefault(seq_rec.description,[]).append(seq_name_start+'_'+str(count)+'_Len'+str(len(seq_rec.seq)))
+			seq_code_dict.setdefault(seq_rec.description,[]).append(str(seq_rec.seq).upper())
+			renamed_seqs.append('>'+seq_name_start+'_'+str(count)+'_Len'+str(len(seq_rec.seq))+'\n'+str(seq_rec.seq).upper())
+			count += 1
+	
+	print (color.BOLD+'\n\nThere are '+color.RED+str(len(renamed_seqs))+' contigs > '+str(args.minLen)\
+	+color.END+color.BOLD+' in '+color.DARKCYAN+args.input_file.split('/')[-1]+color.END)
+	
+	with open(home_folder + args.output_file.split('/')[-1] + '.' + str(args.minLen)+'bp.fasta','w+') as w:
+		for seq in renamed_seqs:
+			w.write(seq+'\n')
+	
+	if args.spades != True:
+		with open(home_folder + args.output_file.split('/')[-1] + '.' + str(args.minLen) + 'bp.SeqCodes.tsv','w+') as w:
+			w.write('Original Name\tNew Name\tSeq Length\t Seq GC\n')
+			for k, v in seq_code_dict.items():
+				w.write(k+'\t'+v[0]+'\t'+str(len(v[1]))+'\t'+str(GC(v[1]))+'\n')
+	else:
+		with open(home_folder + args.output_file.split('/')[-1] + '.' + str(args.minLen) + 'bp.SeqCodes.tsv','w+') as w:
+			w.write('Original Name\tNew Name\tSeq Length\tSeq GC\tSeq Coverage\n')
+			for k, v in seq_code_dict.items():
+				w.write(k+'\t'+v[0]+'\t'+str(len(v[2]))+'\t'+str(GC(v[2]))+'\t'+str(v[1])+'\n')
+
+	
+###########################################################################################
+###-------------------------- Cleans Up the PostAssembly Folder ------------------------###
+###########################################################################################
+
+def clean_up(args):
+		
+	os.system('cp ' + args.input_file + ' ' + args.output_file + '/OriginalFasta/' + args.input_file.split('/')[-1].replace('.fasta', '.Original.fasta'))
+	
+	os.system('cp ' + args.output_file + '/SizeFiltered/' + args.output_file.split('/')[-1] + '.' + str(args.minLen)+'bp.fasta ' + '/'.join(args.output_file.split('/')[:-1]) + '/XlaneBleeding/')
+
+
+###########################################################################################
+###-------------------------------- Next Script Message --------------------------------###
+###########################################################################################
+
+def next_script(args):
+
+	print (color.BOLD+'\n\nLook for '+color.DARKCYAN+args.output_file+'.'+str(args.minLen)+\
+	'bp.fasta'+color.END+color.BOLD+'\n\n')
+
+	print ('Next Script is: '+color.GREEN+'2_Auto_rRNA_BvE.py'+color.END\
+	+color.BOLD+'\n(Alternatively'+color.GREEN+' 2a_remove_rRNA.py followed by 2b_remove_Bact.py'\
+	+color.END+color.BOLD+')\n\n'+ color.END)
+
+
+##########################################################################################
+###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
+##########################################################################################
+
+def main():
+
+	args = check_args()
+
+	prep_folders(args)
+	
+	temp = rename_Transcriptome(args)
+	
+	clean_up(args)
+	
+	next_script(args)
+	
+main()
+	
--- a/PTL1/Transcriptomes/Scripts/1b_XSpeciesContaminationAgnes.py
+++ b/PTL1/Transcriptomes/Scripts/1b_XSpeciesContaminationAgnes.py
@ -0,0 +1,153 @@
+#!/usr/bin/python3
+
+__author__ = 'Jean-David Grattepanche'
+__version__ = 'ACL fixed sequence naming issue Feb 23, 2022'
+__email__ = 'jeandavid.grattepanche@gmail.com'
+
+
+
+import sys
+import os
+import re
+import time
+import string
+import os.path
+from Bio import SeqIO
+from sys import argv
+listtaxa=[]
+toosim = 0.99
+seqcoverage = 0.7
+
+def merge_files(folder, minlen, conspecific_names):
+	mergefile = open('/'.join(folder.split('/')[:-1]) + '/forclustering.fasta','w+')
+	print("MERGE following files")
+	for taxafile in os.listdir(folder):
+		if taxafile[0] != ".":
+			listtaxa.append(taxafile.split('.' + str(minlen) + 'bp')[0])
+
+			for line2 in SeqIO.parse(folder+'/'+taxafile, 'fasta'):
+				if int(len(str(line2.seq))) >= int(minlen):
+					mergefile.write('>'+taxafile.split('.' + str(minlen) + 'bp')[0] + '_' + line2.description + '\n' + str(line2.seq) + '\n')
+				else:
+					print(line2, " is too short")
+	mergefile.close()
+
+	sort_cluster(folder, listtaxa, minlen, conspecific_names)
+
+
+def sort_cluster(folder, listtaxa, minlen, conspecific_names):
+	if not os.path.exists('/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/'):
+		os.makedirs('/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/')
+
+	fastalist = []; fastadict= {}
+	conspecific_names_dict = { line.split('\t')[0] : line.split('\t')[1].strip() for line in open(conspecific_names) }
+
+	print('CREATE a dictionnary of sequences')
+	for record in SeqIO.parse(open('/'.join(folder.split('/')[:-1]) + '/forclustering.fasta','r'),'fasta'):
+		if record.id[:10] not in conspecific_names_dict:
+			print('\nError in cross-plate contamination assessment: the ten-digit code ' + record.id[:10] + ' is not found in the conspecific names file. Please check that this file is correct and try again.\n')
+			quit()
+
+		IDL  = record.description, int(record.description.split('_Cov')[1].replace('\n',''))
+		fastalist.append(IDL)
+		fastadict[record.description] = record.seq
+
+	print("CLUSTER sequences that overlap at least 70%")
+	os.system('vsearch --cluster_fast ' + '/'.join(folder.split('/')[:-1]) + '/forclustering.fasta --strand both --query_cov '+str(seqcoverage)+' --id '+str(toosim) +' --uc ' + '/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/results_forclustering.uc --threads 60' )
+	
+	#input2 = open('/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/results_forclustering.uc','r')
+	#input2 = open('/Output_PostClusterBackup/clusteringresults_vsearch/results_forclustering.uc','r')
+	cluster_output = '/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/results_forclustering.uc'
+	out2 = open('/'.join(folder.split('/')[:-1]) + '/fastatokeep.fas','w+')
+	out3 = open('/'.join(folder.split('/')[:-1]) + '/fastatoremoved.fas','w+')
+	out4 = open('/'.join(folder.split('/')[:-1]) + '/fastatoremoved.uc','w+')
+	print("CREATE a dictionary with clustering results")
+	clustdict= {}; clustlist = []; allseq = []; clustline = {}; list= []; i=0; j=0
+	for row2 in open(cluster_output, 'r'):
+		if row2.split('\t')[0] == 'C' and int(row2.split('\t')[2]) < 2: # keep all unique sequences
+			out2.write('>'+row2.split('\t')[8] + '\n' + str(fastadict[row2.split('\t')[8]])+ '\n')
+		if row2.split('\t')[0] == 'C' and int(row2.split('\t')[2]) > 1: # create another dictionary
+#			print("create dico: ", row2.split('\t')[8])
+			clustdict.setdefault(row2.split('\t')[8], [row2.split('\t')[8]])
+			clustlist.append(row2.split('\t')[8])
+
+	for row3 in open(cluster_output, 'r'):
+		if row3.split('\t')[0] == 'H':
+#			print("add dico: ", row3.split('\t')[9], row3.split('\t')[8])
+			clustdict[row3.split('\t')[9].replace('\n','')].append(row3.split('\t')[8].replace('\n',''))
+			clustline[row3.split('\t')[8].replace('\n','')] = row3.replace('\n','')
+			clustline[row3.split('\t')[9].replace('\n','')] = row3.replace('\n','')
+
+	print("PARSE the clusters: keep seed sequences (highest coverage) for each cluster")
+	for clust in clustlist:
+		list = sorted(clustdict[clust], reverse = True, key=lambda x: int(x.split('_Cov')[1]))
+		master = list[0]
+		Covmaster = int(list[0].split('_Cov')[1])
+		master8dig = ('_').join(list[0].split('_')[0:3])[:-2]
+		for seq in list:
+			clustered =  seq.replace('\n','')
+			Covclustered = int(clustered.split('_Cov')[1])
+			clustered8dig = ('_').join(clustered.split('_')[0:3])[:-2]
+# 			print(master8dig, Covmaster, '//', clustered8dig, Covclustered)
+			if float(Covmaster/Covclustered) < 10:
+				out2.write('>'+clustered + '\n' + str(fastadict[clustered])+ '\n')
+				i +=1
+			elif conspecific_names_dict[master[:10]] == conspecific_names_dict[clustered[:10]]:
+				out2.write('>'+clustered + '\n' + str(fastadict[clustered])+ '\n')
+				i +=1
+			elif Covclustered >= 50:				
+				out2.write('>'+clustered + '\n' + str(fastadict[clustered])+ '\n')
+				i +=1
+			else:
+				j +=1
+				out4 = open('/'.join(folder.split('/')[:-1]) + '/fastatoremoved.uc','a')
+				out3.write('>'+clustered + '\n' + str(fastadict[clustered])+ '\n')
+				print(clustline[clustered],'\t' , master )
+				out4.write(clustline[clustered]+ '\t' + master + '\n')
+				out4.close()
+
+
+	print('there are ', str(i),' sequences kept and ',str(j),' sequences removed')
+	
+	out2.close()
+	out3.close()
+
+	splittaxa(folder, listtaxa, minlen)
+	
+def splittaxa(folder, listtaxa, minlen):
+	for taxa in listtaxa:
+		tax_sf_path = '/'.join(folder.split('/')[:-1]) + '/' + taxa + '/SizeFiltered/'
+		os.system('mv ' + tax_sf_path + taxa + '.' + str(minlen) + 'bp.fasta' + ' ' + tax_sf_path + taxa + '.' + str(minlen) + 'bp.preXPlate.fasta')
+
+		with open(tax_sf_path + taxa + '.' + str(minlen) + 'bp.fasta','w') as o:
+			for kept in SeqIO.parse('/'.join(folder.split('/')[:-1]) + '/fastatokeep.fas','fasta'):
+				if taxa in kept.description:
+					o.write('>' + kept.description.replace(taxa + '_', '') + '\n' + str(kept.seq) + '\n')
+
+	os.system('mv ' + '/'.join(folder.split('/')[:-1]) + '/fastatokeep.fas ' + '/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/')
+	os.system('mv ' + '/'.join(folder.split('/')[:-1]) + '/fastatoremoved.fas ' + '/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/')
+	os.system('mv ' + '/'.join(folder.split('/')[:-1]) + '/fastatoremoved.uc ' + '/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/')
+	os.system('mv ' + '/'.join(folder.split('/')[:-1]) + '/forclustering.fasta ' + '/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/')
+	
+def main():
+
+	script, folder, minlen, conspecific_names = argv
+	merge_files(folder, minlen, conspecific_names)
+
+main()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/PTL1/Transcriptomes/Scripts/2a_remove_rRNA.py
+++ b/PTL1/Transcriptomes/Scripts/2a_remove_rRNA.py
@ -0,0 +1,285 @@
+#!/usr/bin/env python3.5
+
+##__Updated__: 18_08_2017
+##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
+##__Usage__: python 2a_remove_rDNA.py --help
+
+##########################################################################################
+## This script is intended to identify and isolate SSU/LSU sequences 					##
+## Prior to running this script, ensure the following:									##
+##																						##
+## 1. You have assembled your transcriptome and COPIED the 'assembly' file 				##
+##    (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder					##
+## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py	##
+## 3. Have the Databases set up correctly (e.g. with BLAST or Diamond) and in their 	##
+##	  respective folders! See the manual if you need help								##
+##																						##
+## 								COMMAND Example Below									##
+##																						##
+## 				E-mail Xyrus (author) for help if needed: maurerax@gmail.com			##
+##																						##
+##							Next Script(s) to Run: 										##
+##								2b_removeBact.py										##
+##																						##
+##########################################################################################
+
+
+import argparse, os, sys
+from argparse import RawTextHelpFormatter,SUPPRESS
+from Bio import SeqIO
+
+
+#------------------------------ Colors For Print Statements ------------------------------#
+
+class color:
+   PURPLE = '\033[95m'
+   CYAN = '\033[96m'
+   DARKCYAN = '\033[36m'
+   ORANGE = '\033[38;5;214m'
+   BLUE = '\033[94m'
+   GREEN = '\033[92m'
+   YELLOW = '\033[93m'
+   RED = '\033[91m'
+   BOLD = '\033[1m'
+   UNDERLINE = '\033[4m'
+   END = '\033[0m'
+
+
+#------------------------------- Main Functions of Script --------------------------------#
+
+###########################################################################################
+###--------------------- Parses and Checks Command-Line Arguments ----------------------###
+###########################################################################################
+	
+def check_args():
+
+	parser = argparse.ArgumentParser(description=
+	color.BOLD+'\nThis script will remove '+color.RED+'rDNA contigs (both SSU and LSU)'+color.END\
+	+color.BOLD+'\nfrom your Assembly using a set of '+color.RED+'SSU/LSU rDNAs '+color.END\
+	+color.BOLD+'from diverse\n'+color.ORANGE+'Eukaryotes, Bacteria and Archaea'+color.END\
+	+color.BOLD+'.'+color.END+usage_msg(), usage=SUPPRESS,formatter_class=RawTextHelpFormatter)
+	
+	required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
+ 
+	required_arg_group.add_argument('--input_file','-in', action='store',
+	help=color.BOLD+color.GREEN+"Fasta file of Nucleotide sequences"+color.END)
+	required_arg_group.add_argument('--databases','-d', action='store',
+	help=color.BOLD+color.GREEN+"Path to databases"+color.END)
+
+	optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
+	optional_arg_group.add_argument('--threads','-t', default='2',
+	help=color.BOLD+color.GREEN+' Number of threads to use for BLAST\n (default = 2)\n'+color.END)
+	optional_arg_group.add_argument('-author', action='store_true',
+	help=color.BOLD+color.GREEN+' Print author contact information\n'+color.END)
+
+	if len(sys.argv[1:]) == 0:
+		print (parser.description)
+		print ('\n')
+		sys.exit()
+
+	args = parser.parse_args()
+
+	quit_eval = return_more_info(args)
+	if quit_eval > 0:
+		sys.exit()
+			
+	return args
+
+
+###########################################################################################
+###------------------------------- Script Usage Message --------------------------------###
+###########################################################################################
+
+def usage_msg():
+	return color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 2a_remove_rRNA.py --input_file ../Op_me_Xxma_rna.200bp.fasta'+color.END
+
+
+##########################################################################################
+###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
+##########################################################################################
+
+def return_more_info(args):
+
+	valid_arg = 0
+
+	author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
+	' maurerax@gmail.com\n\n'+color.END)
+
+	if args.author == True:
+		print (author)
+		valid_arg += 1
+
+	if args.input_file != None:
+		if os.path.isfile(args.input_file) != False:
+			if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
+				print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
+				'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
+				' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) 
+				valid_arg += 1
+			elif args.input_file.endswith('bp.fasta') != True:
+				print (color.BOLD + '\n\nCheck that you are giving an appropriately Named/Processed'\
+				'Fasta file(s) to this script\n\nNOTE that this script CURRENTLY expects your'\
+				' Fasta files to contain '+color.RED+ '"rna"'+color.END+color.BOLD+' in \nthe Fasta File'\
+				' Name and must end with ' + color.RED + '"bp.fasta"\n\n' + color.END)
+				valid_arg += 1
+		else:
+			print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
+			'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
+			' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) 
+			valid_arg += 1
+
+	if os.path.isdir(args.databases + '/db_BvsE') != True:
+		print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
+		+color.ORANGE+'db_BvsE Folder!\n\n'+color.END+color.BOLD+'Ensure that this folder '\
+		'can be found in the main '+color.ORANGE+'Databases Folder'+color.END+color.BOLD\
+		+'\n\nThen try once again.')
+		valid_arg += 1
+	elif os.path.isfile(args.databases + '/db_BvsE/SSULSUdb.nhr') != True:
+		print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
+		'BLAST+ formatted '+color.ORANGE+'SSU-LSU databases!\n\n'+color.END+color.BOLD+\
+		'Ensure that they can be found in the '+color.ORANGE+'db_BvsE folder'+color.END+\
+		color.BOLD+',\nwhich can be found in the main '+color.ORANGE+'Databases Folder'+\
+		color.END+color.BOLD+'\n\nThen try once again.')
+		valid_arg += 1
+
+	return valid_arg
+	
+	
+###########################################################################################
+###--------------------------- Does the Inital Folder Prep -----------------------------###
+###########################################################################################
+
+def prep_folders(args):
+	code = args.input_file.split('/')[-1][:10]
+	rRNA_folder = args.input_file.split('SizeFiltered')[0] + '/rRNA_Removal/'
+	
+	if os.path.isdir(rRNA_folder) != True:
+		os.system('mkdir '+rRNA_folder)
+
+	return code, rRNA_folder
+
+
+###########################################################################################
+###---------------------- Uses BLAST to identify SSU/LSU Sequences ---------------------###
+###########################################################################################
+
+def remove_rDNA(args, rRNA_folder): 	
+
+	blast_output = rRNA_folder + args.input_file.split('/')[-1].split('.200bp.fasta')[0]+'_allSSULSUresults.tsv'
+	
+	BLASTN_cmd = 'blastn -query ' + args.input_file + ' -evalue 1e-10 -max_target_seqs 1 -outfmt'\
+	' 6 -db ' + args.databases + '/db_BvsE/SSULSUdb -num_threads 2 -out ' + blast_output
+		
+	print (color.BOLD+'\n\nBLASTing '+color.DARKCYAN+args.input_file.split('/')[-1]+color.END\
+		+color.BOLD+ ' against the rDNA database\n\n' + color.END)
+
+	os.system(BLASTN_cmd)		
+
+	rDNA_Hits = list(set([i.split('\t')[0] for i in open(blast_output).readlines()]))
+
+	print (color.BOLD+'Binning Sequences from '+color.DARKCYAN+args.input_file.split('/')[-1]\
+		+color.END+color.BOLD+'\nas rDNA OR Potentially Protein-Coding\n\n'+color.END)
+
+	no_SSULSU = 0
+	with_SSULSU = 0
+
+	inFasta = [seq_rec for seq_rec in SeqIO.parse(args.input_file,'fasta')]
+
+	with open(rRNA_folder + args.input_file.split('/')[-1].split('.200bp.fasta')[0]+'_rRNAseqs.fasta','w+') as HasSSU:
+		for seq_rec in inFasta:
+			if seq_rec.description in rDNA_Hits:
+				HasSSU.write('>'+seq_rec.description+'\n'+str(seq_rec.seq)+'\n')
+				with_SSULSU += 1
+
+	with open(rRNA_folder + args.input_file.split('/')[-1].split('.200bp.fasta')[0] + '_NorRNAseqs.fasta','w+') as NoSSU:
+		for seq_rec in inFasta:
+			if seq_rec.description not in rDNA_Hits:
+				NoSSU.write('>'+seq_rec.description+'\n'+str(seq_rec.seq)+'\n')
+				no_SSULSU += 1
+
+	return str(with_SSULSU), str(no_SSULSU)
+	
+	
+###########################################################################################
+###--------------------------- Updates Log of SSU/LSU Removal --------------------------###
+###########################################################################################
+	
+def update_log(args, with_SSU, no_SSU):
+
+	if os.path.isdir('../PostAssembly_Logs/') != True:
+		os.system('mkdir ../PostAssembly_Logs/')
+	
+	print (color.BOLD+'There are '+color.RED+with_SSU+' rRNA contigs'+color.END+color.BOLD\
+	+' and '+color.PURPLE+no_SSU+' Putative Protein-coding contigs'+color.END+color.BOLD\
+	+'\nin '+color.DARKCYAN+args.input_file.split('/')[1]+'\n' + color.END)
+	
+	with open('../PostAssembly_Logs/'+args.input_file.split('/')[1].split('.fas')[0]+'.Log.txt','a') as LogFile:
+		LogFile.write('rDNA Contigs\t'+with_SSU+'\tn/a\tn/a\n')
+		LogFile.write('Non-rDNA Contigs\t'+no_SSU+'\tn/a\tn/a\n')
+
+
+###########################################################################################
+###-------------------------------- Next Script Message --------------------------------###
+###########################################################################################
+
+def next_script(args):
+
+	print (color.BOLD+'\nLook for '+color.ORANGE+args.input_file.split('/')[1].split('_rna')[0]\
+		+ '_NorRNAseqs.fasta'+color.END+color.BOLD+'\nin the '+args.input_file.split('/')[1].split('_rna')[0]\
+		+' Folder\n\n' + color.END)
+	print (color.BOLD + 'Next Script is: ' + color.GREEN + '2b_remove_Bact.py\n\n'+ color.END)
+
+
+###########################################################################################
+###-------------------------- Cleans Up the PostAssembly Folder ------------------------###
+###########################################################################################
+
+def clean_up(args):
+	
+	home_folder = args.input_file.split('SizeFiltered')[0]
+	
+	os.system('cp ' + home_folder + 'rRNA_Removal/*NorRNA*.fasta ' + home_folder)
+
+##########################################################################################
+###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
+##########################################################################################
+
+def main():
+
+	args = check_args()
+
+	code, rRNA_folder = prep_folders(args)
+
+	with_SSULSU, no_SSULSU = remove_rDNA(args, rRNA_folder)
+ 	
+	#update_log(args, with_SSULSU, no_SSULSU)
+ 	
+	clean_up(args)
+ 
+	next_script(args)
+
+main()
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/PTL1/Transcriptomes/Scripts/2b_remove_Bact.py
+++ b/PTL1/Transcriptomes/Scripts/2b_remove_Bact.py
@ -0,0 +1,410 @@
+#!/usr/bin/env python3.5
+
+##__Updated__: 18_08_2017
+##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
+##__Usage__: python 2b_remove_Bact.py --help
+
+##########################################################################################
+## This script is intended to identify and isolate SSU/LSU sequences 					##
+## Prior to running this script, ensure the following:									##
+##																						##
+## 1. You have assembled your transcriptome and COPIED the 'assembly' file 				##
+##    (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder					##
+## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py	##
+## 3. Have the Databases set up correctly (e.g. with BLAST or Diamond) and in their 	##
+##	  respective folders! See the manual if you need help								##
+## 4. Run removeSSU.py on your Fasta file												##
+##																						##
+## 								COMMAND Example Below									##
+##																						##
+## 				E-mail Xyrus (author) for help if needed: maurerax@gmail.com			##
+##																						##
+##							Next Script(s) to Run: 										##
+##							 3_CountOGsDiamond.py										##
+##																						##
+##########################################################################################
+
+
+import argparse, os, sys
+from argparse import RawTextHelpFormatter,SUPPRESS
+from distutils import spawn
+from Bio import SeqIO
+
+
+#------------------------------ Colors For Print Statements ------------------------------#
+class color:
+   PURPLE = '\033[95m'
+   CYAN = '\033[96m'
+   DARKCYAN = '\033[36m'
+   ORANGE = '\033[38;5;214m'
+   BLUE = '\033[94m'
+   GREEN = '\033[92m'
+   YELLOW = '\033[93m'
+   RED = '\033[91m'
+   BOLD = '\033[1m'
+   UNDERLINE = '\033[4m'
+   END = '\033[0m'
+
+
+#------------------------------- Main Functions of Script --------------------------------#
+
+###########################################################################################
+###---------------------------- UPDATE DIAMOND PATH BELOW! -----------------------------###
+###########################################################################################
+	## IF Diamond is IN YOUR PATH then no updating is needed...
+
+def check_diamond_path():
+
+	diamond_path = ''
+
+	if diamond_path == '':
+		diamond_path = spawn.find_executable("diamond")
+		#diamond_path = '/path/to/diamond'
+	else:
+		pass
+
+	if diamond_path == None:
+		print (color.BOLD + '\n\nPlease open this script and check that you have included'\
+		+' the PATH to the'+color.BLUE+' "Diamond" '+color.END+color.BOLD+'executable.\n\n'+color.END)
+		print (color.BOLD+color.BLUE+'LOOK FOR:\n\n'+color.RED\
+		+'#------------------------------ UPDATE DIAMOND PATH BELOW! -------------------------------#'\
+		+color.BLUE+'\n\nThis is somewhere around lines 50 - 80...\n\n'+color.END)
+
+		sys.exit()
+	else:
+		pass
+	
+	return diamond_path
+	
+	
+###########################################################################################
+###--------------------- Parses and Checks Command-Line Arguments ----------------------###
+###########################################################################################
+	
+def check_args():
+
+	parser = argparse.ArgumentParser(description=
+	color.BOLD + '\nThis script will categorize Contigs as'+color.ORANGE+' STRONGLY '+color.END\
+	+color.BOLD+color.RED+'Eukaryotic \nOR Prokaryotic'+color.END+color.BOLD+' using a set of Proteins'\
+	' from diverse\n'+color.ORANGE+'Eukaryotes, Bacteria and Archaea'+color.END\
+	+color.BOLD+'.'+color.END+usage_msg(), usage=SUPPRESS,formatter_class=RawTextHelpFormatter)
+	
+	required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
+ 
+	required_arg_group.add_argument('--input_file','-in', action='store',
+	help=color.BOLD+color.GREEN+'Fasta file of Nucleotide sequences (with rRNAs removed)'+color.END)
+	required_arg_group.add_argument('--databases','-d', action='store',
+	help=color.BOLD+color.GREEN+"Path to databases"+color.END)
+
+	optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
+	optional_arg_group.add_argument('-author', action='store_true',
+	help=color.BOLD+color.GREEN+' Print author contact information\n'+color.END)
+
+	if len(sys.argv[1:]) == 0:
+		print (parser.description)
+		print ('\n')
+		sys.exit()
+
+	args = parser.parse_args()
+	
+	quit_eval = return_more_info(args)
+	if quit_eval > 0:
+		sys.exit()
+		
+	return args
+
+
+###########################################################################################
+###------------------------------- Script Usage Message --------------------------------###
+###########################################################################################
+
+def usage_msg():
+	return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 2b_remove_Bact.py --input_file'\
+	' ../Op_me_Xxma/Op_me_Xxma_NorRNAseqs.fasta'+color.END)
+
+
+##########################################################################################
+###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
+##########################################################################################
+
+def return_more_info(args):
+
+	valid_arg = 0
+
+	author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
+	' maurerax@gmail.com\n\n'+color.END)
+
+	if args.author == True:
+		print (author)
+		valid_arg += 1
+
+	print(args.input_file)
+
+	if args.input_file != None:
+		if os.path.isfile(args.input_file) != False:
+			if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
+				print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
+				'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
+				' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) 
+				valid_arg += 1
+			elif args.input_file.endswith('NorRNAseqs.fasta') != True:
+				print (color.BOLD+'\n\nInvalid Fasta File! Only Fasta Files that were processed'\
+				' with '+color.GREEN+'2a_remove_rRNA.py '+color.END+color.BOLD+'are valid\n\n'\
+				'However, to bypass that issue, Fasta Files MUST end with '+color.CYAN+\
+				'"NorRNAseqs.fas"\n\n'+color.END)
+				valid_arg += 1
+		else:
+			print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
+			'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
+			' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) 
+			valid_arg += 1
+
+	if os.path.isdir(args.databases + '/db_BvsE') != True:
+		print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
+		+color.ORANGE+'db_BvsE Folder!\n\n'+color.END+color.BOLD+'Ensure that this folder '\
+		'can be found in the main '+color.ORANGE+'Databases Folder'+color.END+color.BOLD\
+		+'\n\nThen try once again.')
+		valid_arg += 1
+	elif os.path.isfile(args.databases + '/db_BvsE/eukout.dmnd') != True:
+		print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
+		'Diamond formatted '+color.ORANGE+'Eukaryotic Protein database!\n\n'+color.END+color.BOLD+\
+		'Ensure that it can be found in the '+color.ORANGE+'db_BvsE folder'+color.END+\
+		color.BOLD+',\nwhich can be found in the main '+color.ORANGE+'Databases Folder'+\
+		color.END+color.BOLD+'\n\nThen try once again.'+color.END)
+	elif os.path.isfile(args.databases + '/db_BvsE/micout.dmnd') != True:
+		print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
+		'Diamond formatted '+color.ORANGE+'Bacterial/Archaeal Protein database!\n\n'+color.END+color.BOLD+\
+		'Ensure that it can be found in the '+color.ORANGE+'db_BvsE folder'+color.END+\
+		color.BOLD+',\nwhich can be found in the main '+color.ORANGE+'Databases Folder'+\
+		color.END+color.BOLD+'\n\nThen try once again.'+color.END)
+
+		valid_arg += 1
+
+	return valid_arg
+	
+	
+###########################################################################################
+###--------------------------- Does the Inital Folder Prep -----------------------------###
+###########################################################################################
+
+def prep_folders(args):
+
+	BvE_folder = '/'.join(args.input_file.split('/')[:-1]) + '/BvE/'
+	
+	if os.path.isdir(BvE_folder) != True:
+		os.system('mkdir '+BvE_folder)
+
+		
+###########################################################################################
+###---------------- Runs Diamond on Bact and Euk small RefSeq Databases ----------------###
+###########################################################################################
+
+def ublast_BvE(args, diamond_path):
+
+	BvE_folder = '/'.join(args.input_file.split('/')[:-1]) + '/BvE/'
+	mic_output = args.input_file.split('/')[-1]+'micresults.'
+	euk_output = args.input_file.split('/')[-1]+'eukresults.'
+
+	print(color.BOLD+'\n\n"BLAST"-ing against PROK database using DIAMOND: ' + color.DARKCYAN + 'micout.dmnd' + color.END + '\n\n')
+
+	Prok_diamond_cmd = diamond_path + ' blastx -q ' + args.input_file + ' --max-target-seqs 1 -d ' + args.databases + '/db_BvsE/micout.dmnd --evalue 1e-5 --threads 60 --outfmt 6 -o ' + BvE_folder + 'allmicresults.tsv'
+
+	os.system(Prok_diamond_cmd)
+
+	print(color.BOLD+'\n\n"BLAST"-ing against EUK database using DIAMOND: ' + color.DARKCYAN + 'eukout.dmnd' + color.END + '\n\n')
+
+	Euk_diamond_cmd = diamond_path + ' blastx -q ' + args.input_file + ' --max-target-seqs 1 -d ' + args.databases + '/db_BvsE/eukout.dmnd --evalue 1e-5 --threads 60 --outfmt 6 -o ' + BvE_folder + 'alleukresults.tsv'
+
+	os.system(Euk_diamond_cmd)
+
+
+
+###########################################################################################
+###---------------- Compares Bacterial and Euk Hits for Classification -----------------###
+###########################################################################################
+
+def compare_hits(args):
+
+	BvE_folder = '/'.join(args.input_file.split('/')[:-1]) + '/BvE/'
+	
+	EukDict = {}
+	ProkDict = {}
+	CompDict = {}	
+	
+	inFasta = [seq_rec for seq_rec in SeqIO.parse(args.input_file,'fasta')]
+	
+	for seq_rec in inFasta:
+			EukDict[seq_rec.description] = ''
+			ProkDict[seq_rec.description] = ''
+			CompDict[seq_rec.description] = []
+
+	inEukHits = [i for i in open(BvE_folder + 'alleukresults.tsv').readlines()]
+	inEukHits.sort(key=lambda x: (float(x.split('\t')[-2]), -int(x.split('\t')[3])))
+	
+	inProkHits = [i for i in open(BvE_folder + 'allmicresults.tsv').readlines()]
+	inProkHits.sort(key=lambda x: (float(x.split('\t')[-2]), -int(x.split('\t')[3])))
+	
+	for i in inEukHits:
+		if EukDict[i.split('\t')[0]] == '':
+			EukDict[i.split('\t')[0]] = float(i.split('\t')[-2])
+	
+	for i in inProkHits:
+		if ProkDict[i.split('\t')[0]] == '':
+			ProkDict[i.split('\t')[0]] = float(i.split('\t')[-2])
+	
+	for k in CompDict.keys():
+		if EukDict[k] != '':
+			CompDict[k].append(EukDict[k])
+		else:
+			CompDict[k].append('no hit')
+		if ProkDict[k] != '':
+			CompDict[k].append(ProkDict[k])
+		else:
+			CompDict[k].append('no hit')
+
+	for k, v in CompDict.items():
+
+	### Contigs lacking STRONG Eukaryotic OR Prokaryotic Hits
+		if v[0] == 'no hit' and v[1] == 'no hit':
+			CompDict[k].append('UNDETERMINED')
+
+	### Contigs lacking STRONG Eukaryotic with a Prokaryotic Hit	
+		elif v[0] != 'no hit' and v[1] == 'no hit':
+			CompDict[k].append('EUKARYOTIC')
+
+	### Contigs with a Eukaryotic but without a Prokaryotic Hit		
+		elif v[0] == 'no hit' and v[1] != 'no hit':
+			CompDict[k].append('PROKARYOTIC')
+
+	### Uses Basic math to determine if contigs with are MORE Eukaryotic than Prokaryotic				
+		else:
+			try:
+				prok_euk_ratio = float(v[1])/float(v[0])
+				euk_prok_ratio = float(v[0])/float(v[1])
+	
+				if prok_euk_ratio >= 100:
+					CompDict[k].append('EUKARYOTIC')
+	
+				elif  euk_prok_ratio >= 1000:
+					CompDict[k].append('PROKARYOTIC')
+	
+				else:
+					CompDict[k].append('UNDETERMINED')
+	
+			except:
+				CompDict[k].append('divide by zero')
+		
+	with open(BvE_folder + 'comparisons.txt','w+') as w:
+		for k, v in CompDict.items():
+			w.write(k+':'+':'.join([str(i) for i in v])+'\n')
+
+	BvE_folder = '/'.join(args.input_file.split('/')[:-1]) + '/BvE/'
+	BvE_output_base = BvE_folder+args.input_file.split('/')[-1].split('.fas')[0]
+	
+### Gathers the sequences and categorizes them	
+	Euk_Fasta = sorted((seq_rec for seq_rec in inFasta if CompDict[seq_rec.description][-1] == 'EUKARYOTIC'), key=lambda x: -int(len(x.seq)))
+	Prok_Fasta = sorted((seq_rec for seq_rec in inFasta if CompDict[seq_rec.description][-1] == 'PROKARYOTIC'), key=lambda x: -int(len(x.seq)))
+	Und_Fasta = sorted((seq_rec for seq_rec in inFasta if CompDict[seq_rec.description][-1] == 'UNDETERMINED'), key=lambda x: -int(len(x.seq)))
+	Zero_Fasta = sorted((seq_rec for seq_rec in inFasta if CompDict[seq_rec.description][-1] == 'divide by zero'), key=lambda x: -int(len(x.seq)))
+
+
+### Writes out all of the categorized sequences
+	with open(args.input_file.split('NorRNA')[0] + 'WTA_EPU.fasta', 'w') as epu:
+		with open(BvE_output_base+'.Not_Bact.fasta','w+') as nb:
+			for euk_seq in Euk_Fasta:
+				nb.write('>' + euk_seq.description + '\n' + str(euk_seq.seq) + '\n')
+				epu.write('>' + euk_seq.description + '_E' + '\n' + str(euk_seq.seq) + '\n')
+
+
+		with open(BvE_output_base+'.Bact_Hit.fasta','w+') as pr:
+			for prok_seq in Prok_Fasta:
+				pr.write('>' + prok_seq.description + '\n' + str(prok_seq.seq) + '\n')		
+				epu.write('>' + prok_seq.description + '_P' + '\n' + str(prok_seq.seq) + '\n')
+
+		with open(BvE_output_base+'.Undetermined.fasta','w+') as und:
+			for und_seq in Und_Fasta:
+				und.write('>' + und_seq.description + '\n' + str(und_seq.seq) + '\n')
+				epu.write('>' + und_seq.description + '_U' + '\n' + str(und_seq.seq) + '\n')
+
+		if len(Zero_Fasta) != 0:
+			with open(BvE_output_base+'.DivideByZero.fasta','w+') as w:
+				for zero_seq in Zero_Fasta:
+					w.write('>' + zero_seq.description + '\n' + str(zero_seq.seq) + '\n')
+					epu.write('>' + zero_seq.description + '_U' + '\n' + str(zero_seq.seq) + '\n')
+		else:
+			pass
+		
+	return str(len(Euk_Fasta)), str(len(Prok_Fasta)), str(len(Und_Fasta))	
+
+
+###########################################################################################
+###----------------------- Updates Log of Prok vs Euk Comparisons ----------------------###
+###########################################################################################
+	
+def update_log(args, Euk_Contigs, Prok_Contigs, Und_Contigs):
+
+	if os.path.isdir('../PostAssembly_Logs/') != True:
+		os.system('mkdir ../PostAssembly_Logs/')
+	else:
+		pass
+			
+	print (color.BOLD +'\n\nThere are '+color.RED+Prok_Contigs+' Strongly Prokaryotic contigs'+color.END\
+	+color.BOLD+',\n'+color.ORANGE+Euk_Contigs+' Strongly Eukaryotic contigs'+color.END\
+	+color.BOLD+',\nand '+color.PURPLE+Und_Contigs+' Undetermined Contigs\n'+color.END\
+	+color.BOLD+'in '+args.input_file.split('/')[-1]+color.END)
+	
+	for Logname in os.listdir(os.curdir+'./PostAssembly_Logs/'):
+		if Logname.startswith(args.input_file.split('/')[-1].split('_No')[0]) and Logname.endswith('Log.txt'): # ACL - ???
+			with open('../PostAssembly_Logs/'+Logname,'a') as Logfilename:
+				Logfilename.write('Prokaryotic Contigs\t'+Prok_Contigs+'\tn/a\tn/a\n')
+				Logfilename.write('Eukaryotic Contigs\t'+Euk_Contigs+'\tn/a\tn/a\n')
+				Logfilename.write('Undetermined Contigs\t'+Und_Contigs+'\tn/a\tn/a\n')
+
+
+###########################################################################################
+###-------------------------------- Next Script Message --------------------------------###
+###########################################################################################
+
+def next_script(args):
+
+	print (color.BOLD+'\nLook for '+color.DARKCYAN+args.input_file.split('/')[-1]\
+	.split('NorRNA')[0]+'WTA_EPU.fasta'+color.END+color.BOLD+' in the '\
+	+args.input_file.split('/')[1]+' Folder\n\n' + color.END)
+	print (color.BOLD + 'Next Script is: ' + color.GREEN + '3_CountOGsDiamond.py\n\n'+ color.END)
+
+
+##########################################################################################
+###--------------------- Cleans up the Folder and Moves Final Files -------------------###
+##########################################################################################
+
+def clean_up(args):
+
+	home_folder = '/'.join(args.input_file.split('/')[:-1])
+	
+	os.system('cp '+home_folder+'/*WTA_EPU.fasta '+home_folder+'/BvE/')
+	os.system('mv '+home_folder+'/*NorRNA*fasta '+home_folder+'/rRNA_Removal/')
+
+
+##########################################################################################
+###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
+##########################################################################################
+
+def main():
+
+	usearch_path = check_diamond_path()
+
+	args = check_args()
+
+	prep_folders(args)
+	
+	ublast_BvE(args, usearch_path)
+
+	Euk_Contigs, Prok_Contigs, Und_Contigs = compare_hits(args)
+	
+	#update_log(args, Euk_Contigs, Prok_Contigs, Und_Contigs)
+	
+	clean_up(args)
+	
+	next_script(args)
+
+main()
--- a/PTL1/Transcriptomes/Scripts/3_CountOGsDiamond.py
+++ b/PTL1/Transcriptomes/Scripts/3_CountOGsDiamond.py
@ -0,0 +1,372 @@
+#!/usr/bin/env python3.5
+
+##__Updated__: 16_10_2017
+##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
+##__Usage__: python 3_CountOGsDiamond.py --help
+
+
+##########################################################################################
+## This script is intended to classify the STRONGLY Eukaryotic and UNDETERMINED/UNKNOWN ##
+## contigs into different OGs (e.g. orthologous gene-families)							##
+##																						##
+## For more info about the OGs, check out: OrthoMCL.org									##
+##																						##
+## Prior to running this script, ensure the following:									##
+##																						##
+## 1. You have assembled your transcriptome and COPIED the 'assembly' file 				##
+##    (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder					##
+## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py	##
+## 3. Removed SSU/LSU sequences from your Fasta File									## 
+##																						##
+## 			E-mail Xyrus (author) for help if needed: maurerax@gmail.com				##
+##																						##
+##								Next Script(s) to Run: 									##
+##							 	 4_StopFrequency.py										##
+##																						##
+##########################################################################################
+
+import argparse, os, sys, re
+from argparse import RawTextHelpFormatter,SUPPRESS
+from distutils import spawn
+from Bio import SeqIO
+
+
+#------------------------------ Colors For Print Statements ------------------------------#
+class color:
+   PURPLE = '\033[95m'
+   CYAN = '\033[96m'
+   DARKCYAN = '\033[36m'
+   ORANGE = '\033[38;5;214m'
+   BLUE = '\033[94m'
+   GREEN = '\033[92m'
+   YELLOW = '\033[93m'
+   RED = '\033[91m'
+   BOLD = '\033[1m'
+   UNDERLINE = '\033[4m'
+   END = '\033[0m'
+
+#------------------------------- Main Functions of Script --------------------------------#
+
+###########################################################################################
+###---------------------------- UPDATE DIAMOND PATH BELOW! -----------------------------###
+###########################################################################################
+	## IF Diamond is IN YOUR PATH then no updating is needed...
+
+def check_diamond_path():
+
+	diamond_path = ''
+
+	if diamond_path == '':
+		diamond_path = spawn.find_executable("diamond")
+		#diamond_path = '/path/to/diamond'
+	else:
+		pass
+
+	if diamond_path == None:
+		print (color.BOLD + '\n\nPlease open this script and check that you have included'\
+		+' the PATH to the'+color.BLUE+' "usearch" '+color.END+color.BOLD+'executable.\n\n'+color.END)
+		print (color.BOLD+color.BLUE+'LOOK FOR:\n\n'+color.RED\
+		+'#------------------------------ UPDATE USEARCH PATH BELOW! -------------------------------#'\
+		+color.BLUE+'\n\nThis is somewhere around lines 50 - 80...\n\n'+color.END)
+
+		sys.exit()
+	else:
+		pass
+	
+	return diamond_path
+
+
+###########################################################################################
+###--------------------- Parses and Checks Command-Line Arguments ----------------------###
+###########################################################################################
+
+def check_args():
+
+	parser = argparse.ArgumentParser(description=
+	color.BOLD + '\n\nThis script will categorize Contigs into'+color.ORANGE+' "Homologous" '\
+	+color.END+color.BOLD+'Gene Families (OGs)\nbased on '+color.RED+'OrthoMCL'+color.END\
+	+color.BOLD+"'s Gene Family Grouping\n\n\nNotes on this script and "+color.GREEN+\
+	'OrthoMCL Families'+color.END+color.BOLD+' can be found\nat the bottom of '+color.GREEN\
+	+'THIS script (3_CountOGsDiamond.py)\n'+color.END+usage_msg(), usage=SUPPRESS,
+	formatter_class=RawTextHelpFormatter)
+	
+	required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
+ 
+	required_arg_group.add_argument('--input_file','-in', action='store',
+	help=color.BOLD+color.GREEN+'Fasta file of Nucleotide sequences enriched \nwith'\
+	' Eukaryotic protein coding transcripts'+color.END)
+	required_arg_group.add_argument('--databases','-g', action='store',
+	help=color.BOLD+color.GREEN+"Path to fasta file with Hook sequences"+color.END)
+
+	optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
+	optional_arg_group.add_argument('--threads','-t', default='2',
+	help=color.BOLD+color.GREEN+' Number of threads to use for BLAST\n (default = 2)\n'+color.END)
+	optional_arg_group.add_argument('--evalue','-e', default=1e-5, type = float,
+	help=color.BOLD+color.GREEN+' Maximum e-value for OG assignment\n (default = 1e-5)\n'+color.END)
+	optional_arg_group.add_argument('-author', action='store_true',
+	help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
+
+	if len(sys.argv[1:]) == 0:
+		print (parser.description)
+		print ('\n')
+		sys.exit()
+
+	args = parser.parse_args()
+	
+	quit_eval = return_more_info(args)
+	if quit_eval > 0:
+		sys.exit()
+
+	return args
+
+		
+###########################################################################################
+###------------------------------- Script Usage Message --------------------------------###
+###########################################################################################
+
+def usage_msg():
+	return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 3_CountOGsDiamond.py'\
+	' --input_file ../Op_me_Xxma/Op_me_Xxma_WTA_NBU.fasta'+color.END)
+
+
+##########################################################################################
+###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
+##########################################################################################
+
+def return_more_info(args):
+
+	valid_arg = 0
+
+	author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
+	' maurerax@gmail.com\n\n'+color.END)
+
+	if args.author == True:
+		print (author)
+		valid_arg += 1
+
+	if args.input_file != None:
+		if os.path.isfile(args.input_file) != False:
+			if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
+				print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
+				'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
+				' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) 
+				valid_arg += 1
+			elif args.input_file.endswith('WTA_EPU.fasta') != True:
+				print (color.BOLD+'\n\nInvalid Fasta File! Only Fasta Files that were processed'\
+				' with '+color.GREEN+'2b_remove_Bact.py '+color.END+color.BOLD+'are valid\n\n'\
+				'However, to bypass that issue, Fasta Files MUST end with '+color.CYAN+\
+				'"WTA_NBU.fasta"\n\n'+color.END)
+				valid_arg += 1
+		else:
+			print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
+			'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
+			' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) 
+			valid_arg += 1
+
+	if os.path.isdir(args.databases + '/db_OG') != True:
+		print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
+		+color.ORANGE+'db_OG Folder!\n\n'+color.END+color.BOLD+'Ensure that this folder '\
+		'can be found in the main '+color.ORANGE+'Databases Folder'+color.END+color.BOLD\
+		+'\n\nThen try once again\n\n.'+color.END)
+		valid_arg += 1
+
+	ogdb_count = 0
+	for file in os.listdir(args.databases + '/db_OG'):
+		if file.endswith('.dmnd'):
+			ogdb_count += 1
+
+	if ogdb_count == 0:
+		print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
+		'Diamond formatted '+color.ORANGE+'Gene Family databases!\n\n'+color.END+color.BOLD+\
+		'Ensure that they can be found in the '+color.ORANGE+'db_OG folder'+color.END+\
+		color.BOLD+',\nwhich can be found in the main '+color.ORANGE+'Databases Folder'+\
+		color.END+color.BOLD+'\n\nThen try once again.\n\n'+color.END)
+		valid_arg += 1
+	elif ogdb_count > 1:
+		print('\nMultiple OG databases found. Please only provide 1 database in the db_OG folder.\n')
+		valid_arg += 1
+
+	return valid_arg
+	
+	
+###########################################################################################
+###--------------------------- Does the Inital Folder Prep -----------------------------###
+###########################################################################################
+
+def prep_folders(args):
+
+	OG_folder = '/'.join(args.input_file.split('/')[:-1]) + '/DiamondOG/'
+	
+	if os.path.isdir(OG_folder) != True:
+		os.system('mkdir '+OG_folder)	
+
+		
+###########################################################################################
+###--------------------- Runs Diamond on Split OrthoMCL Databases ----------------------###
+###########################################################################################
+
+def OG_diamond(args, diamond_path):
+	
+	print (color.BOLD+'\nStarting to "BLAST" against OG databases'+color.END)
+		
+	OG_folder = '/'.join(args.input_file.split('/')[:-1]) + '/DiamondOG/'
+	db = [file for file in os.listdir(args.databases + '/db_OG') if file.endswith('.dmnd')][0]
+
+	print (color.BOLD + '\n\n"BLAST"-ing against OG database using DIAMOND: ' + color.DARKCYAN + db + color.END + '\n\n')
+			
+	OG_diamond_cmd = diamond_path + ' blastx -q ' + args.input_file + ' -d ' + args.databases + '/db_OG/' + db + ' --evalue ' + str(args.evalue) + ' --threads 60 --subject-cover 0.35 --outfmt 6 -o ' + OG_folder + 'allOGresults.tsv'
+	
+	os.system(OG_diamond_cmd)	
+
+
+###########################################################################################
+###--------------- Keeps the Single BEST Hit (HSP-score) Per Transcript ----------------###
+###########################################################################################
+
+def keep_best(args):
+
+	print (color.BOLD+color.PURPLE+'\n\nProcessing OG-database results to keep only the BEST match for each transcript\n\n'+color.END)
+	
+	OG_folder = '/'.join(args.input_file.split('/')[:-1]) + '/DiamondOG/'
+		
+	inTSV = [i for i in open(OG_folder + 'allOGresults.tsv').readlines()]
+	
+	inTSV.sort(key = lambda x: -float(x.split('\t')[-1]))
+	
+	keep = []
+	for i in inTSV:
+		if any(i.split('\t')[0] in j for j in keep) != True:
+			keep.append(i)
+
+	updated_lines = list(set([line.split('\t')[0]+'_'+'_'.join(line.split('\t')[1].split('_')[-2:])+'\t'+'\t'.join(line.split('\t')[1:]) for line in keep]))
+		
+	with open(args.input_file.replace('.fasta','.Renamed_allOGCleanresults.tsv'), 'w+') as w:
+		for i in updated_lines:
+			w.write(i)
+	
+
+###########################################################################################
+###-------- Copies and Updates Names of Transcripts With OG Hits to New Fasta ----------###
+###########################################################################################
+
+def update_fasta(args):
+
+	print (color.BOLD+color.PURPLE+'Updating Fasta File Sequence Names with their BEST OG hits\n\n'+color.END)
+
+	Renamed_TSV = args.input_file.replace('.fasta','.Renamed_allOGCleanresults.tsv')
+
+	keep = [i for i in open(Renamed_TSV).readlines() if i != '\n']
+	
+	keep_dict = { }
+	for line in keep:
+		try:
+			og_number = re.split('OG.{1}_', line.split('\t')[1])[1][:6]
+			og_prefix = line.split('\t')[1].split(og_number)[0][-4:]
+			og = og_prefix + og_number
+
+			keep_dict.update({ re.split('_OG.{1}_', line.split('\t')[0])[0]  : re.split('_OG.{1}_', line.split('\t')[0])[0] + '_' + og_prefix + line.split('\t')[1].split('_')[-1] })
+		except IndexError:
+			pass
+
+	inFasta = [i for i in SeqIO.parse(args.input_file,'fasta')]
+	
+	updated_seq_name = ['>'+keep_dict[i.description]+'\n'+str(i.seq)+'\n' for i in inFasta if i.description in keep_dict.keys()]
+
+	seqs_without_OG = ['>'+i.description+'\n'+str(i.seq)+'\n' for i in inFasta if i.description not in keep_dict.keys()]
+	
+	with open(args.input_file.replace('.fasta','.Renamed.fasta'),'w+') as w:
+		for i in updated_seq_name:
+			w.write(i)		
+
+	with open(args.input_file.replace('.fasta','.LackOG.fasta'),'w+') as x:
+		for i in seqs_without_OG:
+			x.write(i)		
+
+
+###########################################################################################
+###-------------------- Updates Log With OG Assignment Information ---------------------###
+###########################################################################################
+	
+def update_log(args):
+
+	if os.path.isdir('../PostAssembly_Logs/') != True:
+		os.system('mkdir ../PostAssembly_Logs/')
+	else:
+		pass
+
+	home_folder = '/'.join(args.input_file.split('/')[:-1]) + '/'
+	
+	Renamed_TSV = home_folder+args.input_file.split('/')[-1].replace('.fasta','.Renamed_allOGCleanresults.tsv')
+
+	keep = [line for line in open(Renamed_TSV).readlines()]
+	all_ogs = [line.split('\t')[1].split('_')[-1] for line in keep if len(re.split('_OG.{1}_', line.split('\t')[1])) > 1]
+
+	total_with_ogs = str(len(all_ogs))
+	unique_ogs = str(len(set(all_ogs)))
+	
+	print (color.BOLD +'There are '+color.BLUE +total_with_ogs+' Contigs'+color.END\
+	+color.BOLD+' that hit '+color.DARKCYAN+unique_ogs+' Unique OGs\n'+color.END)
+
+	
+	for Logname in os.listdir(os.curdir+'./PostAssembly_Logs/'):
+		if Logname.startswith(args.input_file.split('/')[2].split('_WTA')[0]) and Logname.endswith('Log.txt'):
+			with open('../PostAssembly_Logs/'+Logname,'a') as LogFile:
+				LogFile.write('Contigs With OG\t'+total_with_ogs+'\tn/a\tn/a\n')
+				LogFile.write('Unique OGs\t'+unique_ogs+'\tn/a\tn/a\n')
+
+
+##########################################################################################
+###--------------------- Cleans up the Folder and Moves Final Files -------------------###
+##########################################################################################
+
+def clean_up(args):
+
+	OG_folder = '/'.join(args.input_file.split('/')[:-1]) + '/DiamondOG/'
+	
+	os.system('rm ' + args.input_file)
+	
+	os.system('cp ' + args.input_file.replace('.fasta','.Renamed.fasta') + ' ' + OG_folder)
+
+	os.system('cp ' + args.input_file.replace('.fasta','.Renamed_allOGCleanresults.tsv') + ' ' + OG_folder)
+
+
+###########################################################################################
+###-------------------------------- Next Script Message --------------------------------###
+###########################################################################################
+
+def next_script(args):
+
+	home_folder = '../'+args.input_file.split('/')[1]+'/'
+
+	print (color.BOLD+'\nLook for '+color.DARKCYAN+args.input_file.split('/')[-1]\
+	.replace('.fasta','WTA_EPU.fasta')+color.END+color.BOLD+' in the '+home_folder\
+	+' Folder\n\n' + color.END)
+	
+	print (color.BOLD+'Next Script is: '+color.GREEN+'4_InFrameStopFreq.py\n\n'+ color.END)
+
+
+##########################################################################################
+###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
+##########################################################################################
+				
+def main():
+
+	usearch_path = check_diamond_path()
+
+	args = check_args()
+
+	prep_folders(args)
+
+	OG_diamond(args, usearch_path)
+
+	keep_best(args)
+
+	update_fasta(args)
+
+	#update_log(args)
+
+	clean_up(args)
+
+	next_script(args)
+
+main()
--- a/PTL1/Transcriptomes/Scripts/4_InFrameStopFreq.py
+++ b/PTL1/Transcriptomes/Scripts/4_InFrameStopFreq.py
@ -0,0 +1,790 @@
+#!/usr/bin/env python
+
+##__Updated__: 18_08_2017
+##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
+##__Usage__: python 4_InFrameStopFreq.py --help
+
+
+##########################################################################################
+## This script is intended to aid in identifying the genetic code of the data given		##
+##																						##
+## Prior to running this script, ensure the following:									##
+##																						##
+## 1. You have assembled your transcriptome and COPIED the 'assembly' file 				##
+##    (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder					##
+## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py	##
+## 3. Removed SSU/LSU sequences from your Fasta File									##
+## 4. Classified your sequences as Strongly Prokaryotic/Eukaryotic or Undetermined		##
+## 5. Classified the Non-Strongly Prokaryotic sequences into OGs 						##
+##																						##
+## 								COMMAND Example Below									##
+##							Extra Notes at Bottom of Script								##
+##																						##
+## 			E-mail Xyrus (author) for help if needed: maurerax@gmail.com				##
+##																						##
+##								Next Script(s) to Run: 									##
+##							 	 5_GCodeTranslate.py									##
+##																						##
+##########################################################################################
+
+
+import argparse, os, sys
+from argparse import RawTextHelpFormatter,SUPPRESS
+from distutils import spawn
+
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.Data.CodonTable import CodonTable
+
+
+#-------------------------- Set-up Codon Tables (Genetic Codes) --------------------------#
+
+tag_table = CodonTable(forward_table={
+	'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
+	'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
+	'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Q',            
+	'TGT': 'C', 'TGC': 'C', 'TGA': 'Q', 'TGG': 'W',
+	'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
+	'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
+	'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
+	'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
+	'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
+	'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
+	'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
+	'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
+	'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
+	'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
+	'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
+	'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
+	start_codons = [ 'ATG'],
+	stop_codons = ['TAG'])
+
+c_uncinata_table = CodonTable(forward_table={
+	'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
+	'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
+	'TAT': 'Y', 'TAC': 'Y',             'TAG': 'Q',
+	'TGT': 'C', 'TGC': 'C', 'TGA': 'Q', 'TGG': 'W',
+	'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
+	'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
+	'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
+	'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
+	'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
+	'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
+	'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
+	'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
+	'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
+	'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
+	'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
+	'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
+	start_codons = [ 'ATG'],
+	stop_codons = ['TAA'])
+
+#------------------------------ Colors For Print Statements ------------------------------#
+class color:
+   PURPLE = '\033[95m'
+   CYAN = '\033[96m'
+   DARKCYAN = '\033[36m'
+   ORANGE = '\033[38;5;214m'
+   BLUE = '\033[94m'
+   GREEN = '\033[92m'
+   YELLOW = '\033[93m'
+   RED = '\033[91m'
+   BOLD = '\033[1m'
+   UNDERLINE = '\033[4m'
+   END = '\033[0m'
+
+#------------------------------- Main Functions of Script --------------------------------#
+
+###########################################################################################
+###---------------------------- UPDATE DIAMOND PATH BELOW! -----------------------------###
+###########################################################################################
+	## IF Diamond is IN YOUR PATH then no updating is needed...
+
+def check_diamond_path():
+
+	diamond_path = ''
+
+	if diamond_path == '':
+		diamond_path = spawn.find_executable("diamond")
+		#diamond_path = '/path/to/diamond'
+	else:
+		pass
+
+	if diamond_path == None:
+		print (color.BOLD + '\n\nPlease open this script and check that you have included'\
+		+' the PATH to the'+color.BLUE+' "diamond" '+color.END+color.BOLD+'executable.\n\n'+color.END)
+		print (color.BOLD+color.BLUE+'LOOK FOR:\n\n'+color.RED\
+		+'#------------------------------ UPDATE DIAMOND PATH BELOW! -------------------------------#'\
+		+color.BLUE+'\n\nThis is somewhere around lines 50 - 80...\n\n'+color.END)
+
+		sys.exit()
+	else:
+		pass
+	
+	return diamond_path
+
+
+###########################################################################################
+###--------------------- Parses and Checks Command-Line Arguments ----------------------###
+###########################################################################################
+
+def check_args():
+
+	parser = argparse.ArgumentParser(description=
+	color.BOLD+'\n\nThis script is intended to '+color.RED+'AID You '+color.END+color.BOLD\
+	+'in determining the '+color.RED+'\nLikely Genetic Code'+color.END+color.BOLD+' of a'\
+	' given Fasta File of transcripts\n\nInterpretation of the output (StopFreq.tsv) is difficult \nand so '+color.ORANGE\
+	+'TWO EXAMPLES'+color.END+color.BOLD+' can be found in the '+color.CYAN+'NOTES Section'\
+	+color.END+color.BOLD+' of\nTHIS Script '+color.GREEN+'(4_InFrameStopFreq.py)\n'+color.END\
+	+usage_msg(), usage=SUPPRESS,formatter_class=RawTextHelpFormatter)
+	
+	required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
+ 
+	required_arg_group.add_argument('--input_file','-in', action='store', required=True,
+	help=color.BOLD+color.GREEN+'Fasta file of Nucleotide sequences enriched \nwith'\
+	' Eukaryotic protein coding transcripts'+color.END)
+	required_arg_group.add_argument('--databases','-d', action='store',
+	help=color.BOLD+color.GREEN+"Path to databases"+color.END)
+
+	optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
+	optional_arg_group.add_argument('-author', action='store_true',
+	help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
+
+	if len(sys.argv[1:]) == 0:
+		print (parser.description)
+		print ('\n')
+		sys.exit()
+
+	args = parser.parse_args()
+	
+	quit_eval = return_more_info(args)
+	if quit_eval > 0:
+		sys.exit()
+		
+	return args
+
+
+###########################################################################################
+###------------------------------- Script Usage Message --------------------------------###
+###########################################################################################
+
+def usage_msg():
+	return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 4_InFrameStopFreq.py'\
+	' --input_file ../Op_me_Xxma/Op_me_Xxma_WTA_EPU.Renamed.fasta'+color.END)
+
+
+##########################################################################################
+###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
+##########################################################################################
+
+def return_more_info(args):
+
+	valid_arg = 0
+
+	author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
+	' maurerax@gmail.com\n\n'+color.END)
+
+	if args.author == True:
+		print (author)
+		valid_arg += 1
+
+	if args.input_file != None:
+		if os.path.isfile(args.input_file) != False:
+			if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
+				print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
+				'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
+				' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) 
+				valid_arg += 1
+			elif args.input_file.endswith('WTA_EPU.Renamed.fasta') != True:
+				print (color.BOLD+'\n\nInvalid Fasta File! Only Fasta Files that were processed'\
+				' with '+color.GREEN+'3_CountOGsUsearcy.py '+color.END+color.BOLD+'are valid\n\n'\
+				'However, to bypass that issue, Fasta Files MUST end with '+color.CYAN+\
+				'"WTA_NBU.Renamed.fasta"\n\n'+color.END)
+				valid_arg += 1
+		else:
+			print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
+			'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
+			' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) 
+			valid_arg += 1
+
+	if os.path.isdir(args.databases + '/db_StopFreq') != True:
+		print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
+		+color.ORANGE+'db_StopFreq Folder!\n\n'+color.END+color.BOLD+'Ensure that this folder '\
+		'can be found in the main '+color.ORANGE+'Databases Folder'+color.END+color.BOLD\
+		+'\n\nThen try once again\n\n.'+color.END)
+		valid_arg += 1
+
+	elif os.path.isfile(args.databases + '/db_StopFreq/RepEukProts.dmnd') != True:
+		print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
+		'Diamond formatted '+color.ORANGE+'Representative Eukaryotic Protein Database!\n\n'+color.END+color.BOLD+\
+		'Ensure that it can be found in the '+color.ORANGE+'db_StopFreq folder'+color.END+\
+		color.BOLD+',\nwhich can be found in the main '+color.ORANGE+'Databases Folder'+\
+		color.END+color.BOLD+'\n\nThen try once again.\n\n'+color.END)
+		valid_arg += 1
+
+	return valid_arg
+	
+	
+###########################################################################################
+###--------------------------- Does the Inital Folder Prep -----------------------------###
+###########################################################################################
+
+def prep_folders(args):
+
+	Stop_folder = '../'+args.input_file.split('/')[1]+'/StopCodonFreq/'
+	
+	if os.path.isdir(Stop_folder) != True:
+		os.system('mkdir '+Stop_folder)
+
+	if os.path.isdir(Stop_folder+'StopCodonFastas') != True:
+		os.system('mkdir '+Stop_folder+'StopCodonFastas')			
+
+	if os.path.isdir(Stop_folder+'SpreadSheets') != True:
+		os.system('mkdir '+Stop_folder+'SpreadSheets')			
+
+	return Stop_folder+'StopCodonFastas/'
+
+
+###########################################################################################
+###--------------------- Translates Sequences with Each Stop Codon ---------------------###
+###########################################################################################
+
+def prep_translations(args):
+	print (color.BOLD+'\nIdentifying ORFs in the Fasta file based on the output of 3_CountOGsDiamond.py\n'+color.END)
+	
+	intsv = [i for i in open(args.input_file.replace('.fasta','_allOGCleanresults.tsv')).readlines() if i != '\n']
+
+	inFasta = [i for i in SeqIO.parse(args.input_file,'fasta')]
+
+	prot_dict = {}
+
+	for i in intsv:
+#		print i
+		prot_dict.setdefault(i.split('\t')[0],[])
+		if int(i.split('\t')[6]) < int(i.split('\t')[7]):
+			prot_dict[i.split('\t')[0]].append('F')
+			if (int(i.split('\t')[6])) < 5:
+				prot_dict[i.split('\t')[0]].append(int(i.split('\t')[6])-1)
+			else:
+				prot_dict[i.split('\t')[0]].append(int(i.split('\t')[6])-1)
+			prot_dict[i.split('\t')[0]].append(int(i.split('\t')[7])+3)
+		if int(i.split('\t')[7]) < int(i.split('\t')[6]):
+			prot_dict[i.split('\t')[0]].append('RC')
+			prot_dict[i.split('\t')[0]].append(int(i.split('\t')[6]))
+			if (int(i.split('\t')[7])-4) < 5:
+				prot_dict[i.split('\t')[0]].append(int(i.split('\t')[7]))
+			else:
+				prot_dict[i.split('\t')[0]].append(int(i.split('\t')[7])-4)
+
+
+	#------------- Prep translation with 'TAA' as the only Stop -------------#
+	
+	print (color.BOLD+'\n\nTranslating DNA using'+color.RED+' TAA'+color.END\
+	+color.BOLD+' as the sole STOP codon\n'+color.END)
+
+	for key, value in prot_dict.items():
+		for seq_rec in inFasta:
+			if key in seq_rec.description:
+				stop_pos = 0
+				if prot_dict[key][0] == 'F':
+					temp = seq_rec.seq[prot_dict[key][1]:]
+					temp_prot = str(temp.translate(table=c_uncinata_table))
+					if '*' in temp_prot:		
+						stop_pos = (temp_prot.index('*')+1)*3
+						prot_dict[key].append(temp[:stop_pos])
+					else:
+						prot_dict[key].append(seq_rec.seq[prot_dict[key][1]:prot_dict[key][2]])
+				if prot_dict[key][0] == 'RC':
+					temp = seq_rec.seq[:prot_dict[key][1]].reverse_complement()
+					temp_prot = str(temp.translate(table=c_uncinata_table))
+					if '*' in temp_prot:		
+						stop_pos = (temp_prot.index('*')+1)*3
+						prot_dict[key].append(temp[:stop_pos])
+					else:
+						prot_dict[key].append(seq_rec.seq[prot_dict[key][2]:prot_dict[key][1]].reverse_complement())
+
+
+	#------------- Prep translation with 'TGA' as the only Stop -------------#
+	print (color.BOLD+'\n\nTranslating DNA using'+color.RED+' TGA'+color.END\
+	+color.BOLD+' as the sole STOP codon\n'+color.END)	
+	
+	for key, value in prot_dict.items():
+		for seq_rec in inFasta:
+			if key in seq_rec.description:
+				stop_pos = 0
+				if prot_dict[key][0] == 'F':
+					temp = seq_rec.seq[prot_dict[key][1]:]
+					temp_prot = str(temp.translate(table=6))
+					if '*' in temp_prot:		
+						stop_pos = (temp_prot.index('*')+1)*3
+						prot_dict[key].append(temp[:stop_pos])
+					else:
+						prot_dict[key].append(seq_rec.seq[prot_dict[key][1]:prot_dict[key][2]])
+				if prot_dict[key][0] == 'RC':
+					temp = seq_rec.seq[:prot_dict[key][1]].reverse_complement()
+					temp_prot = str(temp.translate(table=6))
+					if '*' in temp_prot:		
+						stop_pos = (temp_prot.index('*')+1)*3
+						prot_dict[key].append(temp[:stop_pos])
+					else:
+						prot_dict[key].append(seq_rec.seq[prot_dict[key][2]:prot_dict[key][1]].reverse_complement())
+	
+			
+	#------------- Prep translation with 'TAG' as the only Stop -------------#
+	print (color.BOLD+'\n\nTranslating DNA using'+color.RED+' TAG'+color.END\
+	+color.BOLD+' as the sole STOP codon\n'+color.END)
+	
+	
+	for key, value in prot_dict.items():
+		for seq_rec in inFasta:
+			if key in seq_rec.description:
+				stop_pos = 0
+				if prot_dict[key][0] == 'F':
+					temp = seq_rec.seq[prot_dict[key][1]:]
+					temp_prot = str(temp.translate(table=tag_table))
+					if '*' in temp_prot:		
+						stop_pos = (temp_prot.index('*')+1)*3
+						prot_dict[key].append(temp[:stop_pos])
+					else:
+						prot_dict[key].append(seq_rec.seq[prot_dict[key][1]:prot_dict[key][2]])
+				if prot_dict[key][0] == 'RC':
+					temp = seq_rec.seq[:prot_dict[key][1]].reverse_complement()
+					temp_prot = str(temp.translate(table=tag_table))
+					if '*' in temp_prot:		
+						stop_pos = (temp_prot.index('*')+1)*3
+						prot_dict[key].append(temp[:stop_pos])
+					else:
+						prot_dict[key].append(seq_rec.seq[prot_dict[key][2]:prot_dict[key][1]].reverse_complement())
+					
+	#------------ Parsing through data to maintain OG assignments ------------#
+	inOGs = intsv
+	inOGs = [i.split('\t')[0]+';'+i.split('\t')[1][-10:] for i in inOGs]
+	inOGs2 = []
+	for i in inOGs:
+		if 'no_group' not in i.split(';')[1]:
+			inOGs2.append(i)
+		else:
+			inOGs2.append(i.split(';')[0]+';no_group')
+	inOGs2 = list(set(inOGs2))
+
+	#---------------- Write file with 'TAA' is the only Stop ----------------#
+						
+	with open(args.input_file.split('.fas')[0]+'_taa_ORF.fasta','w+') as w:
+		print (color.BOLD+'\n\nWriting FASTA files with ORF and Protein sequences with'+color.RED\
+		+' TAA '+color.END+color.BOLD+'as only STOP codon\n'+color.END)
+		
+		for key, value in prot_dict.items():
+			for j in inOGs2:
+				if key == j.split(';')[0]:
+					if len(prot_dict[key]) < 4:
+						pass
+					else:
+						w.write('>'+key+'_'+j.split(';')[1]+'\n'+str(value[-3]).upper()+'\n')
+						
+	with open(args.input_file.split('.fas')[0]+'_taa_ORF.aa.fasta','w+') as w:
+		for key, value in prot_dict.items():
+			for j in inOGs2:
+				if key == j.split(';')[0]:
+					if len(prot_dict[key]) < 4:
+						pass
+					else:
+						w.write('>'+key+'_'+j.split(';')[1]+'\n'+str(Seq(str(value[-3])).translate(table=c_uncinata_table)).upper()+'\n')					
+
+	#---------------- Write file with 'TGA' is the only Stop ----------------#
+	
+	with open(args.input_file.split('.fas')[0]+'_tga_ORF.fasta','w+') as w:
+		print (color.BOLD+'\n\nWriting FASTA files with ORF and Protein sequences with'+color.RED\
+		+' TGA '+color.END+color.BOLD+'as only STOP codon\n'+color.END)
+	
+		for key, value in prot_dict.items():
+			for j in inOGs2:
+				if key == j.split(';')[0]:
+					if len(prot_dict[key]) < 4:
+						pass
+					else:
+						w.write('>'+key+'_'+j.split(';')[1]+'\n'+str(value[-2]).upper()+'\n')
+					
+	with open(args.input_file.split('.fas')[0]+'_tga_ORF.aa.fasta','w+') as w:
+		for key, value in prot_dict.items():
+			for j in inOGs2:
+				if key == j.split(';')[0]:
+					if len(prot_dict[key]) < 4:
+						pass
+					else:
+						w.write('>'+key+'_'+j.split(';')[1]+'\n'+str(Seq(str(value[-2])).translate(table=6)).upper()+'\n')
+						
+	#---------------- Write file with 'TAG' is the only Stop ----------------#
+	
+	with open(args.input_file.split('.fas')[0]+'_tag_ORF.fasta','w+') as w:
+		print (color.BOLD+'\n\nWriting FASTA files with ORF and Protein sequences with'+color.RED\
+		+' TAG '+color.END+color.BOLD+'as only STOP codon\n'+color.END)
+		
+		for key, value in prot_dict.items():
+			for j in inOGs2:
+				if key == j.split(';')[0]:
+					if len(prot_dict[key]) < 4:
+						pass
+					else:
+						w.write('>'+key+'_'+j.split(';')[1]+'\n'+str(value[-1]).upper()+'\n')
+	
+	with open(args.input_file.split('.fas')[0]+'_tag_ORF.aa.fasta','w+') as w:
+		for key, value in prot_dict.items():
+			for j in inOGs2:
+				if key == j.split(';')[0]:
+					if len(prot_dict[key]) < 4:
+						pass
+					else:
+						w.write('>'+key+'_'+j.split(';')[1]+'\n'+str(Seq(str(value[-1])).translate(table=tag_table)).upper()+'\n')				
+
+
+###########################################################################################		
+###---------- Diamonds the Translations Against a SMALL Euk Protein Database ----------###
+###########################################################################################
+
+def diamond_ProtDB(args, diamond_path):
+	os.system(diamond_path + ' blastp -q ' + args.input_file.split('.fas')[0] + '_tag_ORF.aa.fasta -d ' + args.databases + '/db_StopFreq/RepEukProts.dmnd --evalue 1e-5 --max-target-seqs 1 --threads 60 --outfmt 6 -o ' + args.input_file.split('.fas')[0] + '_tag_ORF.RepEukProts.tsv')
+
+	os.system(diamond_path + ' blastp -q ' + args.input_file.split('.fas')[0] + '_tga_ORF.aa.fasta -d ' + args.databases + '/db_StopFreq/RepEukProts.dmnd --evalue 1e-5 --max-target-seqs 1 --threads 60 --outfmt 6 -o ' + args.input_file.split('.fas')[0] + '_tga_ORF.RepEukProts.tsv')
+
+	os.system(diamond_path + ' blastp -q ' + args.input_file.split('.fas')[0] + '_taa_ORF.aa.fasta -d ' + args.databases + '/db_StopFreq/RepEukProts.dmnd --evalue 1e-5 --max-target-seqs 1 --threads 60 --outfmt 6 -o ' + args.input_file.split('.fas')[0] + '_taa_ORF.RepEukProts.tsv')
+
+
+###########################################################################################	
+###-------------------- Manages the search for In-Frame Stop Codons --------------------###
+###########################################################################################
+
+
+def hunt_for_stops(args):
+		
+	#------------------------ Open Fasta Files ------------------------#
+	try: 
+		TAGinFasta = [i for i in SeqIO.parse(args.input_file.split('.fas')[0]+'_tag_ORF.fasta','fasta') if str(i.seq).endswith('TAG')]
+		print (color.BOLD+'\n\nGathering Sequence information from FASTA and TSV files\n'+color.END)
+		
+	except:
+		print (color.BOLD+color.RED+'\n\nMissing Necessary Inputs: Open Script for Usage'\
+		' Information\n\n'+color.END)
+		sys.exit()
+		
+	TGAinFasta = [i for i in SeqIO.parse(args.input_file.split('.fas')[0]+'_tga_ORF.fasta','fasta') if str(i.seq).endswith('TGA')]
+
+	TAAinFasta = [i for i in SeqIO.parse(args.input_file.split('.fas')[0]+'_taa_ORF.fasta','fasta') if str(i.seq).endswith('TAA')]
+	
+	## This section originally ONLY considered sequences WITH OG assignments:
+	## TAAinFasta = [i for i in TAAinFasta if 'no_group' not in i.description and str(i.seq).endswith('TAA')]
+	## This has been taken out for now
+	
+	#----------------------- Open BLAST Reports -----------------------#
+
+	TAGinTSV = [i for i in open(args.input_file.split('.fas')[0]+'_tag_ORF.RepEukProts.tsv').read().split('\n') if i != '']
+
+	TGAinTSV = [i for i in open(args.input_file.split('.fas')[0]+'_tga_ORF.RepEukProts.tsv').read().split('\n') if i != '']
+
+	TAAinTSV = [i for i in open(args.input_file.split('.fas')[0]+'_taa_ORF.RepEukProts.tsv').read().split('\n') if i != '']
+
+## This section originally ONLY considered sequences WITH OG assignments:
+	## TAAinTSV = i for i in TAAinTSV if i != ''and 'no_group' not in i.split('\t')[0]]
+	## This has been taken out for now
+
+	
+	#------------ Set-up Genetic Code Specific Dictionaries ------------#
+	
+	tag_dict = {}
+	for i in TAGinTSV:
+		tag_dict.setdefault(i.split('\t')[0].replace('_TAG',''),[]).append(int(i.split('\t')[-6]))
+		tag_dict.setdefault(i.split('\t')[0].replace('_TAG',''),[]).append(int(i.split('\t')[-5]))
+
+	tga_dict = {}
+	for i in TGAinTSV:
+		tga_dict.setdefault(i.split('\t')[0].replace('_Ciliate',''),[]).append(int(i.split('\t')[-6]))
+		tga_dict.setdefault(i.split('\t')[0].replace('_Ciliate',''),[]).append(int(i.split('\t')[-5]))
+
+	taa_dict = {}
+	for i in TAAinTSV:
+		taa_dict.setdefault(i.split('\t')[0].replace('_Chilo',''),[]).append(int(i.split('\t')[-6]))
+		taa_dict.setdefault(i.split('\t')[0].replace('_Chilo',''),[]).append(int(i.split('\t')[-5]))
+
+	#-------------- Preparing In-Frame Stop Codon Counts --------------#
+
+# All the data when TGA is the sole stop codon
+	tga_codons = 0
+	tga_data_tag = 0
+	tga_data_tga = 0
+	tga_data_taa = 0
+	tga_seq_count = 0 
+
+# All the data when TAG is the sole stop codon	
+	tag_codons = 0
+	tag_data_tag = 0
+	tag_data_tga = 0
+	tag_data_taa = 0
+	tag_seq_count = 0
+	
+# All the data when TAA is the sole stop codon	
+	taa_codons = 0
+	taa_data_tag = 0
+	taa_data_tga = 0
+	taa_data_taa = 0
+	taa_seq_count = 0
+	
+# All the data for each stop codon combined
+	tga_inframe = 0
+	tag_inframe = 0
+	taa_inframe = 0
+	total_codons = 0
+	total_seq_counts = len(open(args.input_file).read().split('>'))-1
+
+
+	#-------- Gathering In-frame Stop Codon Density Information --------#
+	
+### Collect in-frame stop information for "TAA" and "TAG" when TGA is the ONLY stop	
+	print (color.BOLD+'\nCollecting in-frame stop codon information when'+color.RED\
+	+' TGA'+color.END+color.BOLD+' is the only STOP\n'+color.END)
+	
+	for i in TGAinFasta:
+		try:
+			if tga_dict[i.description][0] == 1:
+				for n in range((tga_dict[i.description][0]-1),((tga_dict[i.description][1])*3)-3,3):
+					if str(i.seq).upper()[n:n+3] == 'TAG':
+						tga_data_tag += 1
+						tag_inframe += 1
+					if str(i.seq).upper()[n:n+3].upper() == 'TAA':
+						tga_data_taa += 1
+						taa_inframe += 1
+					if str(i.seq).upper()[n:n+3].upper() == 'TGA':
+						tga_data_tga += 1
+						tga_inframe += 1
+					tga_codons += 1
+					total_codons += 1
+				tga_seq_count += 1 
+					
+			else:
+				for n in range(((tga_dict[i.description][0]-1)*3),((tga_dict[i.description][1])*3)-3,3):
+					if str(i.seq).upper()[n:n+3] == 'TAG':
+						tga_data_tag += 1
+						tag_inframe += 1
+					if str(i.seq).upper()[n:n+3].upper() == 'TAA':
+						tga_data_taa += 1
+						taa_inframe += 1
+					if str(i.seq).upper()[n:n+3].upper() == 'TGA':
+						tga_data_tga += 1
+						tga_inframe += 1
+					tga_codons += 1 
+					total_codons += 1
+				tga_seq_count += 1
+		except:
+			pass		
+			
+### Collect in-frame stop information for "TAA" and "TGA" when TAG is the ONLY stop	
+	print (color.BOLD+'\nCollecting in-frame stop codon information when'+color.RED\
+	+' TAG'+color.END+color.BOLD+' is the only STOP\n'+color.END)	
+
+	for i in TAGinFasta:
+		try:
+			if tag_dict[i.description][0] == 1:
+				for n in range((tag_dict[i.description][0]-1),((tag_dict[i.description][1])*3)-3,3):
+					if str(i.seq).upper()[n:n+3] == 'TAG':
+						tag_data_tag += 1
+						tag_inframe += 1
+					if str(i.seq).upper()[n:n+3].upper() == 'TAA':	
+						tag_data_taa += 1
+						taa_inframe += 1
+					if str(i.seq).upper()[n:n+3].upper() == 'TGA':	
+						tag_data_tga += 1
+						tga_inframe += 1
+					tag_codons += 1
+					total_codons += 1 
+				tag_seq_count += 1
+				
+			else:
+				for n in range(((tag_dict[i.description][0]-1)*3),(tag_dict[i.description][1]*3)-3,3):
+					if str(i.seq).upper()[n:n+3] == 'TAG':
+						tag_data_tag += 1
+						tag_inframe += 1
+					if str(i.seq).upper()[n:n+3].upper() == 'TAA':	
+						tag_data_taa += 1
+						taa_inframe += 1
+					if str(i.seq).upper()[n:n+3].upper() == 'TGA':	
+						tag_data_tga += 1
+						tga_inframe += 1
+					tag_codons += 1 
+					total_codons += 1
+				tag_seq_count += 1
+		except:
+			pass
+
+
+### Collect in-frame stop information for "TGA" and "TAG" when TAA is the ONLY stop	
+	print (color.BOLD+'\nCollecting in-frame stop codon information when'+color.RED\
+	+' TAA'+color.END+color.BOLD+' is the only STOP\n'+color.END)
+		
+	for i in TAAinFasta:
+		try:
+			if taa_dict[i.description][0] == 1:
+				for n in range((taa_dict[i.description][0]-1),((taa_dict[i.description][1])*3)-3,3):
+					if str(i.seq).upper()[n:n+3] == 'TAG':
+						taa_data_tag += 1
+						tag_inframe += 1
+					if str(i.seq).upper()[n:n+3].upper() == 'TAA':	
+						taa_data_taa += 1
+						taa_inframe += 1
+					if str(i.seq).upper()[n:n+3].upper() == 'TGA':	
+						taa_data_tga += 1
+						tga_inframe += 1
+					taa_codons += 1 
+					total_codons += 1
+				taa_seq_count += 1
+				
+			else:
+				for n in range(((taa_dict[i.description][0]-1)*3),(taa_dict[i.description][1]*3)-3,3):
+					if str(i.seq).upper()[n:n+3] == 'TAG':
+						taa_data_tag += 1
+						tag_inframe += 1
+					if str(i.seq).upper()[n:n+3].upper() == 'TAA':	
+						taa_data_taa += 1
+						taa_inframe += 1
+					if str(i.seq).upper()[n:n+3].upper() == 'TGA':	
+						taa_data_tga += 1
+						tga_inframe += 1
+					tag_codons += 1 
+					total_codons += 1
+				taa_seq_count += 1
+		except:
+			pass
+
+	#-------------- Writing Data Out and Print Statement --------------#	
+
+	with open(args.input_file.split('.fas')[0]+'_StopCodonStats.tsv','w+') as w:
+		w.write('Stop Codon\tNumber of Seqs Analyzed\tIn-frame TAG\tIn-frame TGA\tIn-frame TAA\tTotal Codons\tIn-frame TAG density\tIn-frame TGA density\tIn-frame TAA density\n')
+		if tga_codons != 0:
+			w.write('TGA\t'+str(tga_seq_count)+'\t'+str(tga_data_tag)+'\t'+str(tga_data_tga)+'\t'+str(tga_data_taa)+'\t'+str(tga_codons)\
+			+'\t'+"%.2f" % ((float(tga_data_tag)*1000)/float(tga_codons))+'\t'+"%.2f" % ((float(tga_data_tga)*1000)/float(tga_codons))+'\t'\
+			+"%.2f" % ((float(tga_data_taa)*1000)/float(tga_codons))+'\n')
+		else:
+			w.write('TGA\t0\t0\t0\t0\t0\t0\t0\t0\n')
+	
+		if tag_codons != 0:
+			w.write('TAG\t'+str(tag_seq_count)+'\t'+str(tag_data_tag)+'\t'+str(tag_data_tga)+'\t'+str(tag_data_taa)+'\t'+str(tag_codons)\
+			+'\t'+"%.2f" % ((float(tag_data_tag)*1000)/float(tag_codons))+'\t'+"%.2f" % ((float(tag_data_tga)*1000)/float(tag_codons))+'\t'\
+			+"%.2f" % ((float(tag_data_taa)*1000)/float(tag_codons))+'\n')
+		else:
+			w.write('TAG\t0\t0\t0\t0\t0\t0\t0\t0\n')
+		if taa_codons != 0:
+			w.write('TAA\t'+str(taa_seq_count)+'\t'+str(taa_data_tag)+'\t'+str(taa_data_tga)+'\t'+str(taa_data_taa)+'\t'+str(taa_codons)\
+			+'\t'+"%.2f" % ((float(taa_data_tag)*1000)/float(taa_codons))+'\t'+"%.2f" % ((float(taa_data_tga)*1000)/float(taa_codons))+'\t'\
+			+"%.2f" % ((float(taa_data_taa)*1000)/float(taa_codons))+'\n')
+		else:
+			w.write('TAA\t0\t0\t0\t0\t0\t0\t0\t0\n')
+	
+		w.write('\n \n')
+		w.write('Summary\t'+str(tga_seq_count+tag_seq_count+taa_seq_count)+'\t'+str(tag_inframe)+'\t'+str(tga_inframe)+'\t'+str(taa_inframe)\
+		+'\t'+str(total_codons)+'\t'+"%.2f" % ((float(tag_inframe)*1000)/float(total_codons))+'\t'+"%.2f" % ((float(tga_inframe)*1000)/float(total_codons))\
+		+'\t'+"%.2f" % ((float(taa_inframe)*1000)/float(total_codons))+'\n')
+		w.write('\nTotal Seqs in Fasta\t'+str(total_seq_counts))
+	
+#	print color.BOLD + color.BLUE + '\nSummary\t'+str(tag_inframe)+'\t'+str(tga_inframe)+'\t'+str(taa_inframe)+'\t'+str(total_codons)+'\t'+"%.2f" % ((float(tag_inframe)*1000)/float(total_codons))+'\t'\
+#		+"%.2f" % ((float(tga_inframe)*1000)/float(total_codons))+'\t'+"%.2f" % ((float(taa_inframe)*1000)/float(total_codons))+'\n\n'\
+#		+ str(tag_seq_count) + '\t' + str(tga_seq_count) + '\t' + str(taa_seq_count) + color.END
+
+
+##########################################################################################
+###--------------------- Cleans up the Folder and Moves Final Files -------------------###
+##########################################################################################
+
+def clean_up(args):
+	if os.path.isdir('/'.join(args.input_file.split('/')[:-1]) + '/StopCodonFreq') != True:
+		os.system('mkdir ' + '/'.join(args.input_file.split('/')[:-1]) + '/StopCodonFreq/')
+	else:
+		pass
+			
+	os.system('mkdir ' + '/'.join(args.input_file.split('/')[:-1]) + '/StopCodonFreq/StopCodonFastas/')
+	os.system('mkdir ' + '/'.join(args.input_file.split('/')[:-1]) + '/StopCodonFreq/SpreadSheets/')
+	os.system('mv ' + args.input_file.split('.fas')[0]+'_t*_ORF.*fasta ' + '/'.join(args.input_file.split('/')[:-1]) + '/StopCodonFreq/StopCodonFastas/')
+	os.system('mv ' + args.input_file.split('.fas')[0]+'_t*Prots.tsv ' + '/'.join(args.input_file.split('/')[:-1]) + '/StopCodonFreq/SpreadSheets/')
+
+
+###########################################################################################
+###-------------------------------- Next Script Message --------------------------------###
+###########################################################################################
+
+def next_script(args):
+
+	home_folder = '/'.join(args.input_file.split('/')[:-1])
+
+	print (color.BOLD+'\nLook for '+color.DARKCYAN+args.input_file.split('/')[-1]\
+	.replace('.fasta','_StopCodonStats.tsv')+color.END+color.BOLD+' in the '+home_folder\
+	+' Folder\n\n' + color.END)
+	
+	print (color.BOLD+'Next Script is: '+color.GREEN+'5_GCodeTranslate.py\n\n'+ color.END)
+
+
+##########################################################################################
+###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
+##########################################################################################
+
+def main():
+
+	diamond_path = check_diamond_path()
+
+	args = check_args()
+	
+	prep_translations(args)
+	
+	diamond_ProtDB(args, diamond_path)
+
+	hunt_for_stops(args)
+	
+	clean_up(args)
+	
+	next_script(args)
+ 
+main()
+
+
+#----------------------------------------- NOTES -----------------------------------------#
+#
+# This script is designed to HELP you make an informed decision about the genetic code being
+# used by your particular organism. Be aware that it will be limited by the quality of the
+# data given to it! 
+#
+# You will need:
+#
+# Diamond, BioPython, AND the output from '3_CountOGSDiamond.py'
+#
+# If you are not using the Author's database, update your database name(s) in lines: 345-360
+#
+# katzlab$ python StopFrequency.py YourFastaFile.fasta
+#
+#
+#------------------------------- Interpretation of Results -------------------------------#
+#
+# FORMATTED BELOW WITH TEXTWRANGLER... 
+#
+# Example output using CILIATE (TGA) genetic Code (NOTE THE In-Frame Densities):
+#
+# Stop Codon   Number_of_Seqs_Analyzed   In-frame TAG   In-frame TGA   In-frame TAA   Total Codons   In-frame TAG density   In-frame TGA density   In-frame TAA density
+# TGA                   341                   14              0			    	  22            113156              1.2                     0                     0.92
+# TAG                   424                    0              0			    	  34            140085               0                      0                     0.78
+# TAA                   205                   14              0			    	   0             16714              0.84                    0                      0
+# Summary               970                   28              0			    	  56            269955              2.04                    0                     1.7
+#
+# VALUES in summary line (OR SUM of Density) that are > 1.5 likely indicate that the STOP 
+# codon has been reassigned... in the case above, TAG and TAA look like they have been
+# reassigned.
+#
+#
+# Example output using UNIVERSAL genetic Code (NOTE THE In-Frame Densities):
+#
+# Stop Codon   Number_of_Seqs_Analyzed   In-frame TAG   In-frame TGA   In-frame TAA   Total Codons   In-frame TAG density   In-frame TGA density   In-frame TAA density
+# TGA                   341                    1              0			    	   2            113156              0.2                     0                     0.05
+# TAG                   424                    0              2			    	   4            140085               0                      0                     0.08
+# TAA                   205                    1              0			    	   0             16714              0.04                    0                      0
+# Summary               970                    2              2			    	   6            269955              0.15                    0                     0.06
+#
+# VALUES in summary line (OR SUM of Density) that are > 0.5 likely indicate that the STOP 
+# codon still acts as STOP... in the case above, TAG, TGA and TAA look like they still behave
+# as a stop codon.
+#
+# THIS IS A ROUGH GUIDE FOR INTERPRETING THE RESULTS!!!! BE VERY VERY WARY! NUMBER OF TOTAL
+# SEQUENCES AND TOTAL CODONS OBSERVED ARE IMPORTANT (TOO FEW AND ANY INTERPRETATION IS DEVOID
+# OF ANY MEANING).
--- a/PTL1/Transcriptomes/Scripts/5_GCodeTranslate.py
+++ b/PTL1/Transcriptomes/Scripts/5_GCodeTranslate.py
@ -0,0 +1,770 @@
+#!/usr/bin/env python3.5
+
+##__Updated__: 20_09_2017
+##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
+##__Usage__: python 5_GCodeTranslate.py --help
+
+
+##########################################################################################
+## This script is intended to aid in identifying the genetic code of the data given		##
+##                                                                                      ##
+## Prior to running this script, ensure the following:									##
+##                                                                                      ##
+## 1. You have assembled your transcriptome and COPIED the 'assembly' file              ##
+##    (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder                    ##
+## 2. Removed small sequences (usually sequences < 300bp) with 1_ContigFiltStats.py     ##
+## 3. Removed SSU/LSU sequences from your Fasta File                                    ##
+## 4. Classified your sequences as Strongly Prokaryotic/Eukaryotic or Undetermined      ##
+## 5. Classified the Non-Strongly Prokaryotic sequences into OGs                        ##
+## 6. You either know (or have inferred) the genetic code of the organism               ##
+##                                                                                      ##
+##             E-mail Xyrus (author) for help if needed: maurerax@gmail.com             ##
+##                                                                                      ##
+##                              Next Script(s) to Run:                                  ##
+##                 6_FilterPatials.py (in FinalizeTranscripts Folder)                   ##
+##                                                                                      ##
+##########################################################################################
+
+import argparse, os, re, sys
+from argparse import RawTextHelpFormatter,SUPPRESS
+
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.Data.CodonTable import CodonTable
+
+
+#-------------------------- Set-up Codon Tables (Genetic Codes) --------------------------#
+
+blepharisma_table = CodonTable(forward_table={
+	'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
+	'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
+	'TAT': 'Y', 'TAC': 'Y',                       
+	'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
+	'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
+	'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
+	'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
+	'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
+	'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
+	'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
+	'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
+	'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
+	'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
+	'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
+	'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
+	'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
+	start_codons = [ 'ATG'],
+	stop_codons = ['TAA','TAG'])
+
+condylostoma_table = CodonTable(forward_table={
+	'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
+	'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
+	'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Q', 'TAG': 'Q',
+	'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
+	'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
+	'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
+	'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
+	'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
+	'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
+	'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
+	'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
+	'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
+	'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
+	'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
+	'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
+	'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
+	start_codons = [ 'ATG'],
+	stop_codons = [''])
+
+c_uncinata_table = CodonTable(forward_table={
+	'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
+	'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
+	'TAT': 'Y', 'TAC': 'Y',             'TAG': 'Q',
+	'TGT': 'C', 'TGC': 'C', 'TGA': 'Q', 'TGG': 'W',
+	'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
+	'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
+	'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
+	'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
+	'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
+	'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
+	'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
+	'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
+	'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
+	'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
+	'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
+	'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
+	start_codons = [ 'ATG'],
+	stop_codons = ['TAA'])
+
+euplotes_table = CodonTable(forward_table={
+	'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
+	'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
+	'TAT': 'Y', 'TAC': 'Y',                       
+	'TGT': 'C', 'TGC': 'C', 'TGA': 'C', 'TGG': 'W',
+	'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
+	'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
+	'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
+	'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
+	'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
+	'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
+	'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
+	'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
+	'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
+	'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
+	'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
+	'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
+	start_codons = [ 'ATG'],
+	stop_codons = ['TAA','TAG'])
+
+myrionecta_table = CodonTable(forward_table={
+	'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
+	'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
+	'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Y', 'TAG': 'Y',
+	'TGT': 'C', 'TGC': 'C',             'TGG': 'W',
+	'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
+	'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
+	'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
+	'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
+	'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
+	'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
+	'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
+	'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
+	'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
+	'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
+	'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
+	'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
+	start_codons = [ 'ATG'],
+	stop_codons = ['TGA'])
+
+no_stop_table = CodonTable(forward_table={
+	'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
+	'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
+	'TAT': 'Y', 'TAC': 'Y', 'TAA': 'X', 'TAG': 'X',
+	'TGT': 'C', 'TGC': 'C', 'TGA': 'X', 'TGG': 'W',
+	'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
+	'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
+	'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
+	'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
+	'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
+	'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
+	'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
+	'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
+	'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
+	'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
+	'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
+	'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
+	start_codons = [ 'ATG'],
+	stop_codons = [''])
+
+peritrich_table = CodonTable(forward_table={
+	'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
+	'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
+	'TAT': 'Y', 'TAC': 'Y', 'TAA': 'E', 'TAG': 'E',
+	'TGT': 'C', 'TGC': 'C',             'TGG': 'W',
+	'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
+	'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
+	'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
+	'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
+	'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
+	'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
+	'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
+	'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
+	'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
+	'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
+	'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
+	'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
+	start_codons = [ 'ATG'],
+	stop_codons = ['TGA'])
+	
+tag_table = CodonTable(forward_table={
+	'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
+	'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
+	'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Q',            
+	'TGT': 'C', 'TGC': 'C', 'TGA': 'Q', 'TGG': 'W',
+	'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
+	'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
+	'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
+	'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
+	'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
+	'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
+	'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
+	'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
+	'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
+	'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
+	'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
+	'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
+	start_codons = [ 'ATG'],
+	stop_codons = ['TAG'])
+
+
+#------------------------------ Colors For Print Statements ------------------------------#
+class color:
+   PURPLE = '\033[95m'
+   CYAN = '\033[96m'
+   DARKCYAN = '\033[36m'
+   ORANGE = '\033[38;5;214m'
+   PURPLE = '\033[94m'
+   GREEN = '\033[92m'
+   YELLOW = '\033[93m'
+   RED = '\033[91m'
+   BOLD = '\033[1m'
+   UNDERLINE = '\033[4m'
+   END = '\033[0m'
+
+
+#------------------------------- Main Functions of Script --------------------------------#
+
+###########################################################################################
+###------------------------- Checks the Command Line Arguments -------------------------###
+###########################################################################################
+
+def check_args():
+
+	parser = argparse.ArgumentParser(description=
+	color.BOLD + '\n\nThis script will '+color.RED+'Translate '+color.END+color.BOLD+'a '\
+	'given Fasta file of CDS\nsequences using a given'+color.PURPLE+' Genetic Code.'+color.END+\
+	color.BOLD+usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter)
+
+	
+	required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
+ 
+	required_arg_group.add_argument('--input_file','-in', action='store',
+	help=color.BOLD+color.GREEN+' Fasta file with CDSs\n'+color.END)
+
+	optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
+
+	optional_arg_group.add_argument('--genetic_code','-g', action='store', default='universal',
+	help=color.BOLD+color.GREEN+' Genetic code to use for translation\n (default = '\
+	'"universal")\n'+color.END)
+
+	optional_arg_group.add_argument('--no_RP','-no_RP', action='store_true',
+	help=color.BOLD+color.GREEN+' Allows files to "skip" the removal\n of Partial Transcripts\n'\
+	+color.END)
+
+	optional_arg_group.add_argument('--list_codes','-codes', action='store_true',
+	help=color.BOLD+color.GREEN+' Lists supported genetic codes\n'+color.END)
+
+	optional_arg_group.add_argument('-author', action='store_true',
+	help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
+
+
+	if len(sys.argv[1:]) == 0:
+		print (parser.description)
+		print ('\n')
+		sys.exit()
+
+	args = parser.parse_args()
+	
+	quit_eval = return_more_info(args)
+	if quit_eval > 0:
+		sys.exit()
+		
+	### Adding in names to 'arg' class for more easy use throughout the script
+	args.ntd_out = args.input_file.split('.fas')[0]+'_'+args.genetic_code.title()+'_NTD.ORF.fasta'
+	args.aa_out = args.input_file.split('.fas')[0]+'_'+args.genetic_code.title()+'_AA.ORF.fasta'
+	args.tsv_out = args.input_file.split('.fas')[0]+'_'+args.genetic_code.title()+'_allOGCleanresults.tsv'
+
+	args.home_folder = '/'.join(args.input_file.split('/')[:-1])
+	args.Diamond_Folder = args.home_folder+'/DiamondOG'
+	args.StopFreq = args.home_folder+'/StopCodonFreq'
+	args.all_output_folder = '/'.join(args.input_file.split('/')[:-2]) + '/'
+	
+	args.tsv_file = args.input_file.split('.fas')[0]+ '_allOGCleanresults.tsv'
+	
+	
+	
+	
+	
+	
+	return args
+	
+	
+###########################################################################################
+###------------------------------- Script Usage Message --------------------------------###
+###########################################################################################
+
+def usage_msg():
+	return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 5g_GCodeTranslate.py'\
+	' --input_file ../Stentor_coeruleus.WGS.CDS.Prep/Stentor_coeruleus.WGS.CDS.Renamed.fasta'\
+	' --genetic_code Universal'+color.END)
+
+
+##########################################################################################
+###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
+##########################################################################################
+
+def return_more_info(args):
+
+	valid_arg = 0
+
+	supported_gcodes_names = ['bleph','blepharisma','chilo','chilodonella','condy',\
+	'condylostoma','none','eup','euplotes','peritrich','vorticella','ciliate','universal',\
+	'taa','tag','tga','mesodinium']
+
+	supported_gcodes_list = ['Blepharisma\t(TGA = W)','Chilodonella\t(TAG/TGA = Q)','Ciliate\t\t(TAR = Q)',\
+	'Condylostoma\t(TAR = Q, TGA = W)','Euplotes\t(TGA = C)','Peritrich\t(TAR = E)','None\t\t(TGA/TAG/TAA = X)',\
+	'Universal\t(TGA/TAG/TAA = STOP)','TAA\t\t(TAG/TGA = Q)', 'TAG\t\t(TRA = Q)', 'TGA\t\t(TAR = Q)']
+
+	author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
+	' maurerax@gmail.com\n\n'+color.END)
+
+
+	if args.genetic_code != None and args.genetic_code.lower() not in supported_gcodes_names:
+		print (color.BOLD+color.RED+'\nProvided genetic code is currently unsupported.\n\n'\
+		'If you have a new genetic code, please contact the author (with some evidence).\n\n'\
+		'Otherwise, use one of the currently supported genetic codes.\n'+color.END)
+		print (color.BOLD+color.ORANGE+'\n'.join(supported_gcodes_list)+'\n\n'+color.END)
+		print (author)
+		valid_arg += 1
+	else:	
+		if args.list_codes == True:
+			print (color.BOLD+color.RED+'\nThese are the currently supported genetic codes.\n'+color.END)
+			print (color.BOLD+color.ORANGE+'\n'.join(supported_gcodes_list)+'\n\n'+color.END)
+			valid_arg += 1	
+
+		if args.author == True:
+			print (author)
+			valid_arg += 1
+
+	if args.input_file != None:
+		if os.path.isfile(args.input_file) != False:
+			if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
+				print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
+				'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
+				' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) 
+				valid_arg += 1
+			elif args.input_file.endswith('WTA_EPU.Renamed.fasta') != True:
+				print (color.BOLD+'\n\nInvalid Fasta File! Only Fasta Files that were processed'\
+				' with '+color.GREEN+'3_CountOGsDiamond.py '+color.END+color.BOLD+'are valid\n\n'\
+				'However, to bypass that issue, Fasta Files MUST end with '+color.CYAN+\
+				'"WTA_EPU.Renamed.fasta"\n\n'+color.END)
+				valid_arg += 1
+		else:
+			print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
+			'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
+			' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) 
+			valid_arg += 1
+
+	return valid_arg
+	
+
+###########################################################################################
+###--------------------------- Does the Inital Folder Prep -----------------------------###
+###########################################################################################
+
+def prep_folders(args):
+
+	OG_folder = '/'.join(args.input_file.split('/')[:-1]) + '/DiamondOG/'
+	
+	if os.path.isdir(OG_folder) != True:
+		os.system('mkdir '+OG_folder)		
+		
+	if os.path.isdir(args.all_output_folder + 'TranslatedTranscriptomes') != True:
+		os.system('mkdir ' + args.all_output_folder + 'TranslatedTranscriptomes')
+
+
+
+##########################################################################################
+###---------------- Scans 5-Prime End of Transcript for In-Frame "ATG" ----------------###
+##########################################################################################
+
+def check_new_start_new(some_seq, low_lim, upper_lim, old_start, codon_table):
+
+	## Looks for in-frame STOP codons in the UTR of the transcript
+	prime5 = str(Seq(some_seq[low_lim:upper_lim]).translate(table=codon_table)).replace('*','x')
+	in_frame_stops = [stops.start() for stops in re.finditer('x',prime5)]
+
+	## Looks for in-frame START codons in the UTR of the transcript
+	in_frame_starts = [starts.start() for starts in re.finditer('M',prime5)]
+
+	## Checks that there are NO in-frame STOP codons between the possible "new" START codon 
+	## and the aligned portion of the transcript -- THIS is double checked!
+	if len(in_frame_starts) != 0:
+		if len(in_frame_stops) != 0:
+			if in_frame_stops[-1] < in_frame_starts[-1]:
+				new_start = low_lim+in_frame_starts[-1]*3
+			else:
+				new_start = old_start
+		else:
+			new_start = low_lim+in_frame_starts[-1]*3
+	else:
+		new_start = old_start
+
+	## Skips the double-checking if there are no GOOD potential START codons
+	if new_start == old_start:
+		updated_start = old_start 
+
+	else:
+	## Double checks that there are NO IN-FRAME stop codons between the NEW-SUGGESTED Start
+	## position and the OLD-SUPPORTED stop position! 
+		between_new_old_start = str(Seq(some_seq[new_start:old_start]).translate(table=1)).replace('*','x')
+		in_frame_stops_check = [stops.start() for stops in re.finditer('x',between_new_old_start)]
+		in_frame_starts_check = [starts.start() for starts in re.finditer('M',between_new_old_start)]
+		if len(in_frame_starts_check) != 0:
+			if len(in_frame_stops_check) != 0:
+				if in_frame_stops_check[-1] < in_frame_starts_check[-1]:
+					updated_start = new_start+in_frame_starts_check[-1]*3
+				else:
+					updated_start = old_start
+			else:
+				updated_start = new_start
+		else:
+			updated_start = new_start
+	
+	return updated_start
+
+
+##########################################################################################
+###--------------- Extracts the ORF from the Fasta File and SpreadSheet ---------------###
+##########################################################################################
+
+def extract_ORF(prot_dict, codon_table, args):
+
+	print (color.BOLD+'\n\nExtracting '+color.PURPLE+'ORFs'+color.END+color.BOLD+' from'\
+	' the transcriptomic data-set\n\n'+color.END)
+
+	for k, v in prot_dict.items():
+
+	## Attempting to find the most-likely START (ATG) position in the transcript (tricky)
+	## Skips this if the initial Methionine (ATG) is likely present 
+	## (e.g. the alignment position of the protein = '1')
+		prot_start = int(v[3].split('..')[0])
+		old_start = v[1]
+		if prot_start != 1:
+			min_dist, max_dist = round_down_three(prot_start)
+			min_start = old_start-min_dist
+			max_start = old_start-max_dist
+			if min_start < 0:
+				min_start = old_start
+			if max_start < 0:
+				max_start = min_start%3
+#			print k+'\tOld_start\t'+str(old_start)+'\tMin_Dist/Start\t'+str(min_dist)+'/'+str(min_start)+'\tMax_Dist/Start\t'+str(max_dist)+'/'+str(max_start)+'\n'
+			updated_start = check_new_start_new(v[-1], max_start, min_start, old_start, codon_table)
+		else:
+			updated_start = old_start
+		temp = prot_dict[k][-1][updated_start:]
+	
+	## Uses the given genetic code to identify the stop position of the ORF
+		temp_prot = str(Seq(temp).translate(table=codon_table))						
+		if '*' in temp_prot:		
+			stop_pos = (temp_prot.index('*')+1)*3
+			prot_dict[k].append(temp[:stop_pos])
+		else:
+			stop_pos = prot_dict[k][2] - prot_dict[k][1]
+			prot_dict[k].append(temp[:stop_pos])
+
+	## Awkward_list is populated with unexpectedly SHORT ORFs!
+	## Reasons for being short include:
+		# An error Xyrus introduced
+		# Not as great genetic code decision (in-frame stop)
+		# Crummy sequence/assembly quality (false in-frame stop codons)
+		
+	awkward_list = []
+	look_good = []
+
+	for k, v in prot_dict.items():
+		expected_min = len(v[-2][v[1]:v[2]])-1
+		if len(v[-1]) < expected_min:
+			awkward_list.append(k)
+		else:
+			look_good.append(k)
+
+	if len(awkward_list) != 0:
+		with open('UnexpexctedShortStuffBlameXyrus.txt','w+') as x:
+			for entry in awkward_list:
+				x.write(entry+'\n')
+	else:
+		pass
+		
+	print (color.BOLD+'\n\nTranslating '+color.PURPLE+'ORFs'+color.END+color.BOLD+' from'\
+	' using the '+color.DARKCYAN+args.genetic_code.title()+' genetic code'+color.END)
+	
+	for k, v in prot_dict.items():
+		prot_dict[k].append(str(Seq(v[-1]).translate(table=codon_table)).rstrip('*'))	
+
+	return prot_dict
+	
+##########################################################################################
+###------------ Grabs the Coding Coordinates from the OG-BLAST SpreadSheet ------------###
+##########################################################################################
+	
+def prep_translations(args):
+
+	print (color.BOLD+'\n\nGrabbing useful info from the '+color.ORANGE+args.input_file\
+	.split('/')[-1]+color.END+color.BOLD+' Fasta File\nand from the '+color.ORANGE+args.tsv_file\
+	.split('/')[-1]+color.END+color.BOLD+' OG-Assignment Spreadsheet'+color.END)		
+
+	inTSV = ['\t'.join(i.rstrip('\n').split('\t')[:-1]) for i in open(args.tsv_file).readlines() if i != '\n']
+	inFasta = [i for i in SeqIO.parse(args.input_file,'fasta')]
+
+	# ORF identification step here, uses the 'allOGCleanresults.tsv file to identify the ORF
+	prot_dict = {}
+
+	# Special scenario! Only for when the genetic code is not particularly useful ...
+	if args.genetic_code.lower() == 'none' or args.genetic_code.lower() == 'condylostoma' or args.genetic_code.lower() == 'condy':
+		for i in inTSV:
+			prot_dict.setdefault(i.split('\t')[0],[])
+			if int(i.split('\t')[6]) < int(i.split('\t')[7]):
+	## Saves the Transcript Orientation (Coding vs. Template Strand)			
+				prot_dict[i.split('\t')[0]].append('F')
+	## Collects initial Start and Stop positions from the BLAST alignment
+				prot_dict[i.split('\t')[0]].append(int(i.split('\t')[6])-1)
+				prot_dict[i.split('\t')[0]].append(int(i.split('\t')[7])+3)
+	## Implied Amino Acid alignment positions (e.g. does the alignment start at the 1st Methionine?)
+				prot_dict[i.split('\t')[0]].append('..'.join(i.split('\t')[-4:-2]))
+
+			if int(i.split('\t')[7]) < int(i.split('\t')[6]):
+	## Saves the Transcript Orientation (Coding vs. Template Strand)			
+				prot_dict[i.split('\t')[0]].append('RC')
+	## Collects initial Start and Stop positions from the BLAST alignment
+				prot_dict[i.split('\t')[0]].append(int(i.split('_Len')[1].split('_')[0])-int(i.split('\t')[6]))
+				prot_dict[i.split('\t')[0]].append(int(i.split('_Len')[1].split('_')[0])-int(i.split('\t')[7])+1)
+	## Implied Amino Acid alignment positions (e.g. does the alignment start at the 1st Methionine?)
+				prot_dict[i.split('\t')[0]].append('..'.join(i.split('\t')[-4:-2]))
+
+	## Makes sure that the dictionary has the transcript in the correct orientation	
+		for i in inFasta:
+			if i.description in prot_dict.keys():
+				if 'RC' == prot_dict[i.description][0]:
+					prot_dict[i.description].append(str(i.seq.reverse_complement()))
+				else:
+					prot_dict[i.description].append(str(i.seq))
+
+	else:
+		for i in inTSV:
+			prot_dict.setdefault(i.split('\t')[0],[])
+			if int(i.split('\t')[6]) < int(i.split('\t')[7]):
+	## Saves the Transcript Orientation (Coding vs. Template Strand)			
+				prot_dict[i.split('\t')[0]].append('F')
+				prot_dict[i.split('\t')[0]].append(int(i.split('\t')[6])-1)
+				prot_dict[i.split('\t')[0]].append(int(i.split('\t')[7])+3)
+	## Implied Amino Acid alignment positions (e.g. does the alignment start at the 1st Methionine?)
+				prot_dict[i.split('\t')[0]].append('..'.join(i.split('\t')[-4:-2]))	
+			if int(i.split('\t')[7]) < int(i.split('\t')[6]):
+	## Saves the Transcript Orientation (Coding vs. Template Strand)
+				prot_dict[i.split('\t')[0]].append('RC')
+	## Collects initial Start and Stop positions from the BLAST alignment (but in the "correct" orientation)
+				prot_dict[i.split('\t')[0]].append(int(i.split('_Len')[1].split('_')[0])-int(i.split('\t')[6]))
+				prot_dict[i.split('\t')[0]].append(int(i.split('_Len')[1].split('_')[0])-int(i.split('\t')[7])+1)
+	## Implied Amino Acid alignment positions (e.g. does the alignment start at the 1st Methionine?)
+				prot_dict[i.split('\t')[0]].append('..'.join(i.split('\t')[-4:-2]))
+
+	## Makes sure that the dictionary has the transcript in the correct orientation				
+		for i in inFasta:
+			if i.description in prot_dict.keys():
+				if 'RC' == prot_dict[i.description][0]:
+					prot_dict[i.description].append(str(i.seq.reverse_complement()))
+				else:
+					prot_dict[i.description].append(str(i.seq))
+					
+	return prot_dict
+
+
+##########################################################################################
+###------------------------ Rounds Down Values to Nearest "3" -------------------------###
+##########################################################################################
+
+def round_down_three(num):
+	min_val = int(num*3*.5)-int(num*3*.5)%3
+	max_val = int(num*6)-int(num*6)%3
+	return min_val, max_val
+
+
+##########################################################################################
+###--------------------- Makes Translation Steps (Later) Easier -----------------------###
+##########################################################################################
+
+def standardize_gcode(given_code):
+	if given_code == 'ciliate' or given_code == 'tga':
+		codon_table = 6	
+	elif given_code == 'chilodonella' or given_code == 'chilo' or given_code == 'taa':
+		codon_table = c_uncinata_table
+	elif given_code == 'blepharisma' or given_code == 'bleph':
+		codon_table = blepharisma_table
+	elif given_code == 'euplotes' or given_code == 'eup':
+		codon_table = euplotes_table
+	elif given_code == 'myrionecta' or given_code == 'mesodinium':
+		codon_table = myrionecta_table
+	elif given_code == 'peritrich' or given_code == 'vorticella':
+		codon_table = peritrich_table
+	elif given_code == 'none':
+		codon_table = no_stop_table
+	elif given_code == 'condylostoma' or given_code == 'condy':
+		codon_table = condylostoma_table
+	elif given_code == 'tag':
+		codon_table = tag_table
+	elif given_code == 'universal':
+		codon_table = 1
+	else:
+		print (color.BOLD+color.RED+'\n\nNo valid genetic code provided!\n\n'+color.END+\
+		color.BOLD+'Using the "Universal" genetic code (by default)\n\nPlease check that the'\
+		' code you wish to use is supported:'+color.CYAN+'\n\npython 5_GCodeTranslate.py'\
+		' -list_codes\n\n'+color.END)
+		codon_table = 1
+
+	return codon_table
+	
+
+###########################################################################################
+###------------------ Updates Spreadsheet with Updated Contig Names --------------------###
+###########################################################################################
+	
+def update_spreadsheet(args, updated_spreadsheet_dict):
+	if os.path.isdir(args.home_folder + '/DiamondOG/') != True:
+		os.system(args.home_folder + '/DiamondOG/')
+	else:
+		pass
+
+	inTSV = [line.rstrip('\n') for line in open(args.tsv_file).readlines() if line != '\n' and line.split('\t')[0] in updated_spreadsheet_dict.keys()]
+	
+	updatedTSV = [updated_spreadsheet_dict[line.split('\t')[0]]+'\t'+'\t'.join(line.split('\t')[1:]) for line in inTSV]
+	
+	with open(args.tsv_out,'w+') as w:
+		w.write('\n'.join(updatedTSV))
+
+
+###########################################################################################
+###-------------------- Updates Log With OG Assignment Information ---------------------###
+###########################################################################################
+	
+def update_log(filename, codon_table):
+
+	if os.path.isdir('../PostAssembly_Logs/') != True:
+		os.system('mkdir ../PostAssembly_Logs/')
+	else:
+		pass
+
+	ntd_ORF = [i for i in SeqIO.parse(filename.split('.fas')[0]+'_'+gcode.title()+'_ORF.fasta','fasta')]
+	aa_ORF = [i for i in SeqIO.parse(filename.split('.fas')[0]+'_'+gcode.title()+'_ORF.aa.fasta','fasta')]
+	
+	min_ntd_ORF = str(min([len(i.seq) for i in ntd_ORF]))
+	max_ntd_ORF = str(max([len(i.seq) for i in ntd_ORF]))
+	avg_ntd_ORF = '%.2f' % (sum([len(i.seq) for i in ntd_ORF])/float(len(ntd_ORF)))
+	
+	min_aa_ORF = str(min([len(i.seq) for i in aa_ORF]))
+	max_aa_ORF = str(max([len(i.seq) for i in aa_ORF]))
+	avg_aa_ORF = '%.2f' % (sum([len(i.seq) for i in aa_ORF])/float(len(aa_ORF)))
+	
+	for Logname in os.listdir(os.curdir+'./PostAssembly_Logs/'):
+		if Logname.startswith(filename.split('/')[2].split('_WTA')[0]) and Logname.endswith('Log.txt'):
+			with open('../PostAssembly_Logs/'+Logname,'a') as LogFile:
+				LogFile.write('Nucleotide ORFs\t'+str(len(ntd_ORF))+'\tn/a\tn/a\n')
+				LogFile.write('Nucleotide ORF Lengths\t'+avg_ntd_ORF+'\t'+min_ntd_ORF+'\t'+max_ntd_ORF+'\n')
+				LogFile.write('Protein ORFs\t'+str(len(aa_ORF))+'\tn/a\tn/a\n')
+				LogFile.write('Protein ORF Lengths\t'+avg_aa_ORF+'\t'+min_aa_ORF+'\t'+max_aa_ORF+'\n')
+
+	
+##########################################################################################
+###----------------------- Write File with Provided Genetic Code ----------------------###
+##########################################################################################
+
+def write_data_out(prot_dict, codon_table, args):		
+	
+	update_spreadsheet_dict = {}
+
+	#The code below only works if rnaspades was used; constrained by addition of script 6b
+	for k, v in prot_dict.items():
+		#if 'Cov' in k:
+		new_name = k.split('_Len')[0]+'_Len'+str(len(v[-2]))+'_'+'_'.join(k.split('_')[-3:])
+		#update_spreadsheet_dict[k] = new_name
+		update_spreadsheet_dict[k] = k
+		#else:
+			#new_name = k.split('_Len')[0]+'_Len'+str(len(v[-2]))+'_'+'_'.join(k.split('_')[-2:])
+			#update_spreadsheet_dict[k] = new_name
+			#update_spreadsheet_dict[k] = k			
+
+	with open(args.ntd_out,'w+') as w:
+		print (color.BOLD+'\n\nWriting FASTA file with '+color.PURPLE+'ORF'+color.END+color.BOLD\
+		+' sequences using the '+color.DARKCYAN+args.genetic_code.title()+' genetic code'+color.END)
+	
+		for k, v in prot_dict.items():
+			w.write('>'+update_spreadsheet_dict[k]+'\n'+str(v[-2])+'\n')
+
+	with open(args.aa_out, 'w+') as w:
+		print (color.BOLD+'\n\nWriting FASTA file with '+color.PURPLE+'Translated ORF'+color.END+color.BOLD\
+		+' sequences using the '+color.DARKCYAN+args.genetic_code.title()+' genetic code'+color.END)
+
+		for k, v in prot_dict.items():
+			w.write('>'+update_spreadsheet_dict[k]+'\n'+str(v[-1])+'\n')
+				
+	return update_spreadsheet_dict
+
+
+##########################################################################################
+###--------------------- Cleans up the Folder and Moves Final Files -------------------###
+##########################################################################################
+					
+def clean_up(args):
+	
+	if args.input_file.split('.fas')[0].split('/')[-1] + '_StopCodonStats.tsv' in os.listdir(args.home_folder):
+		os.system('mv ' + args.input_file.split('.fas')[0] + '_StopCodonStats.tsv ' + args.StopFreq)
+	
+	os.system('mv '+args.tsv_file+' '+args.Diamond_Folder)
+	os.system('mv '+args.input_file+' '+args.Diamond_Folder)
+
+	if args.no_RP == True:
+		if os.path.isdir(args.all_output_folder + 'ToRename/') != True:
+			os.system('mkdir ' + args.all_output_folder + 'ToRename/')
+		
+		os.system('cp ' + args.ntd_out + ' ' + args.all_output_folder + 'ToRename/')
+		os.system('cp ' + args.aa_out + ' ' + args.all_output_folder + 'ToRename/')
+		os.system('cp ' + args.tsv_out + ' ' + args.all_output_folder + 'ToRename/')
+
+	else:	
+		os.system('cp ' + args.tsv_out + ' ' + args.all_output_folder)
+		os.system('cp ' + args.ntd_out + ' ' + args.all_output_folder)
+		os.system('cp ' + args.aa_out + ' ' + args.all_output_folder)
+		
+	os.system('mv ' + args.home_folder + ' ' + args.all_output_folder + 'TranslatedTranscriptomes')
+
+
+###########################################################################################
+###-------------------------------- Next Script Message --------------------------------###
+###########################################################################################
+
+def next_script(args):
+
+	print (color.BOLD+'\n\nLook for '+color.DARKCYAN+args.ntd_out.split('/')[-1]+color.END+\
+	color.BOLD+',\n'+color.DARKCYAN+args.aa_out.split('/')[-1]+color.END+color.BOLD+', and\n'\
+	+color.DARKCYAN+args.tsv_out.split('/')[-1]+color.END+color.BOLD+',\nwhich are in the '+\
+	color.ORANGE+args.home_folder.split('/')[-1]+' Folder'+color.END)
+	
+	if args.no_RP == True:
+		print(color.BOLD+'\n\nNext Script is: '+color.GREEN+'7_FinalRename.py'+color.END+color.BOLD+\
+		' in the '+color.PURPLE+'RemovePartials Folder'+color.END+color.BOLD+'\nwith a copy of'\
+		' the outputs of this script!'+color.END)
+		print(color.BOLD+'\n\nRemember that you have chosen '+color.RED+'NOT '+color.END+color.BOLD+\
+		'to remove partials\nand are skipping to the renaming step!\n\n'+color.END)
+	
+	else:
+		print(color.BOLD+'\n\nNext Script is: '+color.GREEN+'6_FilterPartials.py'+color.END+color.BOLD+\
+		' in the '+color.PURPLE+'FinalizeTranscripts Folder'+color.END+color.BOLD+'\nwith a copy of'\
+		' the outputs of this script!\n\n'+color.END)
+
+
+##########################################################################################
+###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
+##########################################################################################
+							
+def main():
+
+	args = check_args()
+	
+	prep_folders(args)
+
+	codon_table = standardize_gcode(args.genetic_code.lower())
+	
+	prot_dict_Prepped = prep_translations(args)
+
+	prot_dict_Final = extract_ORF(prot_dict_Prepped, codon_table, args)
+
+	new_spreadsheet_names = write_data_out(prot_dict_Final, codon_table, args)
+	
+	update_spreadsheet(args, new_spreadsheet_names)
+	
+#	update_log(fasta_file, gcode)
+	
+	clean_up(args) 
+
+	next_script(args)
+	
+main()
--- a/PTL1/Transcriptomes/Scripts/6_FilterPartials.py
+++ b/PTL1/Transcriptomes/Scripts/6_FilterPartials.py
@ -0,0 +1,652 @@
+#!/usr/bin/env python3.5
+
+##__Updated__: 2020-11-29
+##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com; xyrus.maurer-alcala@izb.unibe.ch
+##__Usage__: python 6_FilterPartials.py --help
+
+
+##################################################################################################
+## This script is intended to remove incomplete transcripts that have a more complete mate		##
+##																								##
+## Prior to running this script, ensure the following:											##
+##																								##
+## 1. You have assembled your transcriptome and COPIED the 'assembly' file 						##
+##    (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder							##
+## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py			##
+## 3. Removed SSU/LSU sequences from your Fasta File											##
+## 4. Classified your sequences as Strongly Prokaryotic/Eukaryotic or Undetermined				##
+## 5. Classified the Non-Strongly Prokaryotic sequences into OGs 								##
+## 6. You either know (or have inferred) the genetic code of the organism						##
+## 7. You have translated the sequences and checked for the data in the RemovePartials folder	##
+##																								##
+## 					E-mail Xyrus (author) for help if needed: maurerax@gmail.com				##
+##																								##
+##										Next Script(s) to Run: 									##
+##						 	  			  7_FinalRename.py										##
+##																								##
+##################################################################################################
+
+from Bio import SeqIO
+from Bio.Seq import Seq
+from statistics import mean
+
+from distutils import spawn
+import argparse, os, sys, time, re
+from argparse import RawTextHelpFormatter,SUPPRESS
+
+
+
+#------------------------------ Colors For Print Statements ------------------------------#
+class color:
+   PURPLE = '\033[95m'
+   CYAN = '\033[96m'
+   DARKCYAN = '\033[36m'
+   ORANGE = '\033[38;5;214m'
+   BLUE = '\033[94m'
+   GREEN = '\033[92m'
+   YELLOW = '\033[93m'
+   RED = '\033[91m'
+   BOLD = '\033[1m'
+   UNDERLINE = '\033[4m'
+   END = '\033[0m'
+
+
+#------------------------------- Main Functions of Script --------------------------------#
+
+###########################################################################################
+###---------------------------- UPDATE DIAMOND PATH BELOW! -----------------------------###
+###########################################################################################
+	## IF Diamond is IN YOUR PATH then no updating is needed...
+
+def check_diamond_path():
+
+	diamond_path = ''
+
+	if diamond_path == '':
+		diamond_path = spawn.find_executable("diamond")
+		#diamond_path = '/path/to/diamond'
+	else:
+		pass
+
+	if diamond_path == None:
+		print (color.BOLD + '\n\nPlease open this script and check that you have included'\
+		+' the PATH to the'+color.BLUE+' "diamond" '+color.END+color.BOLD+'executable.\n\n'+color.END)
+		print (color.BOLD+color.BLUE+'LOOK FOR:\n\n'+color.RED\
+		+'#------------------------------ UPDATE DIAMOND PATH BELOW! -------------------------------#'\
+		+color.BLUE+'\n\nThis is somewhere around lines 50 - 80...\n\n'+color.END)
+
+		sys.exit()
+	else:
+		pass
+
+	return diamond_path
+
+###########################################################################################
+###--------------------- Parses and Checks Command-Line Arguments ----------------------###
+###########################################################################################
+
+def check_args():
+
+	parser = argparse.ArgumentParser(description=
+	color.BOLD + '\n\nThis script is intended to '+color.RED+'Identify and Collapse '+color.END\
+	+color.BOLD+'partial '+color.PURPLE+'ORFS\n'+color.END+color.BOLD+'present within a '\
+	+color.RED+'Given'+color.END+color.BOLD+' transcriptome (or replicate) transcriptome(s)'\
+	+usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter)
+
+	required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
+
+	required_arg_group.add_argument('--file_prefix','-fp', action='store',
+	help=color.BOLD+color.GREEN+' File prefix that is unique (or common)\n to the files '\
+	'to be processed\n'+color.END)
+
+
+	optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
+
+	optional_arg_group.add_argument('--identity','-id', type=float, action='store', default=0.98,
+	help=color.BOLD+color.GREEN+' Identity threshold for identifying \n "partials" to larger'\
+	' contigs\n (default = 0.98)\n'+color.END)
+	optional_arg_group.add_argument('-author', action='store_true',
+	help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
+	optional_arg_group.add_argument('--hook_fasta','-f', help='Path to the fasta file of the Hook DB in the Databases/db_OG folder')
+
+	if len(sys.argv[1:]) == 0:
+		print (parser.description)
+		print ('\n')
+		sys.exit()
+
+	args = parser.parse_args()
+
+	args.id_print = str(int(float(args.identity)*100))
+
+	args.all_output_folder = '/'.join(args.file_prefix.split('/')[:-1]) + '/'
+	args.file_prefix = args.file_prefix.split('/')[-1]
+
+	args.file_listNTD = [args.all_output_folder + i for i in os.listdir(args.all_output_folder) if args.file_prefix in i and i.endswith('NTD.ORF.fasta')]
+
+	args.file_listAA = [args.all_output_folder + i for i in os.listdir(args.all_output_folder) if args.file_prefix in i and i.endswith('AA.ORF.fasta')]
+
+	args.file_listTSV = [args.all_output_folder + i for i in os.listdir(args.all_output_folder) if args.file_prefix in i and i.endswith('results.tsv')]
+
+	quit_eval = return_more_info(args)
+	if quit_eval > 0:
+		print ('\n')
+		sys.exit()
+
+	return args
+
+
+###########################################################################################
+###------------------------------- Script Usage Message --------------------------------###
+###########################################################################################
+
+def usage_msg():
+	return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 6_RemovePartials.py'\
+	' --file_prefix Op_me_Xxma'+color.END)
+
+
+##########################################################################################
+###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
+##########################################################################################
+
+def return_more_info(args):
+
+	valid_arg = 0
+
+	author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
+	' maurerax@gmail.com\n\n'+color.END)
+
+	if args.author == True:
+		print (author)
+		valid_arg += 1
+
+	if args.file_listNTD == []:
+		print (color.BOLD+'\n\nNo '+color.ORANGE+'Nucleotide Fasta Files'+color.END+color.BOLD+\
+		' found!\n\nCheck that your'+color.GREEN+' File Prefix'+color.END+color.BOLD+\
+		'is present in\nthe files of interest')
+		valid_arg += 1
+
+	if args.file_listAA == []:
+		print (color.BOLD+'\n\nNo '+color.ORANGE+'Protein Fasta Files'+color.END+color.BOLD+\
+		' found!\n\nCheck that your'+color.GREEN+' File Prefix'+color.END+color.BOLD+\
+		'is present in\nthe files of interest')
+		valid_arg += 1
+
+	if args.file_listTSV == []:
+		print (color.BOLD+'\n\nNo '+color.ORANGE+'OG-Assignment Spreadsheets'+color.END+color.BOLD+\
+		' found!\n\nCheck that your'+color.GREEN+' File Prefix'+color.END+color.BOLD+\
+		'is present in\nthe files of interest')
+		valid_arg += 1
+
+	if len(args.file_listNTD) == len(args.file_listAA) == len(args.file_listTSV):
+		pass
+	else:
+		print (color.BOLD+color.RED+'\n\nError:'+color.END+color.BOLD+' Unequal numbers of'\
+		' input files found.\n\nDouble-check that there are:'+color.CYAN+'SINGLE'+color.END\
+		+color.BOLD+' Nucleotide and Protein fasta files and OG-assignment Spreadsheet for'\
+		' each transcriptome\n\nThen try once again.'+color.END)
+		valid_arg += 1
+
+	return valid_arg
+
+
+##########################################################################################
+###------------------------- Creates Folders For Storing Data -------------------------###
+##########################################################################################
+
+def prep_folders(args):
+
+	if os.path.isdir(args.all_output_folder + 'ToRename') != True:
+		os.system('mkdir ' + args.all_output_folder + 'ToRename')
+
+	if os.path.isdir(args.all_output_folder + args.file_prefix) != True:
+		os.system('mkdir ' + args.all_output_folder + args.file_prefix)
+
+	if os.path.isdir(args.all_output_folder + args.file_prefix + '/Original') != True:
+		os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Original')
+		os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Original/SpreadSheets')
+		os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Original/Concatenated/')
+		os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Original/Concatenated/SpreadSheets')
+
+	if os.path.isdir(args.all_output_folder + args.file_prefix + '/Processed') != True:
+		os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Processed')
+		os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Processed/SpreadSheets')
+
+
+##########################################################################################
+###-------------------- Merges Fasta Files When Replicates Present --------------------###
+##########################################################################################
+
+def merge_fasta_replicates(args, type):
+
+	cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/'
+
+	count = 0
+	fasta_to_merge = []
+
+	if type == 'NTD':
+		fasta_list = args.file_listNTD
+	else:
+		fasta_list = args.file_listAA
+
+	for file in fasta_list:
+		fasta_to_merge += ['>'+str(count)+'_'+i for i in open(file).read().split('>') if i != '']
+		count += 1
+
+	with open(cat_folder+args.file_prefix+'.'+type+'.Concatenated.fasta','w+') as w:
+		w.write(''.join(fasta_to_merge))
+
+	time.sleep(.75)
+
+
+##########################################################################################
+###--------------------- Merges TSV Files When Replicates Present ---------------------###
+##########################################################################################
+
+def merge_tsv_replicates(args):
+
+	cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/SpreadSheets/'
+
+	count = 0
+	tsv_to_merge = []
+
+	for file in args.file_listTSV:
+		tsv_to_merge += [str(count)+'_'+i for i in open(file).read().split('\n') if i != '']
+		count += 1
+
+	with open(cat_folder+args.file_prefix+'_Concatenated.allOGCleanresults.tsv','w+') as w:
+		w.write('\n'.join(tsv_to_merge))
+
+	time.sleep(.75)
+
+
+##########################################################################################
+###------------------ Calls on the other Merge Functions by Data Type -----------------###
+##########################################################################################
+
+def merge_relevant_data(args):
+
+	print (color.BOLD+'\n\nMerging Transcriptome data together.'+color.END)
+
+	merge_fasta_replicates(args, 'NTD')
+	merge_fasta_replicates(args, 'AA')
+	merge_tsv_replicates(args)
+
+
+##########################################################################################
+###------------------- Uses Diamond to perform Self-vs-Self "BLAST" -------------------###
+##########################################################################################
+
+def self_blast(args, diamond_path):
+
+	cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/'
+
+	diamond_makedb = diamond_path + ' makedb --in ' + cat_folder + args.file_prefix + '.AA.Concatenated.fasta -d ' + cat_folder + args.file_prefix + '.AA.Concatenated'
+
+	diamond_self = diamond_path + ' blastp -q ' + cat_folder + args.file_prefix + '.AA.Concatenated.fasta -d ' + cat_folder + args.file_prefix + '.AA.Concatenated --strand plus --no-self-hits --id '+str(args.identity)+\
+	' --query-cover 0.7 --evalue 1e-15 --threads 60 --outfmt 6 -o ' + cat_folder + 'SpreadSheets/' + args.file_prefix + '.Concatenated.Self.'+str(args.id_print)+'ID.tsv'
+
+	print (color.BOLD+'\n\nBinning ALL '+color.ORANGE+'Nucleotide ORFs'+color.END+color.BOLD\
+	+' for '+color.GREEN+args.file_prefix+color.END+color.BOLD+' at '+args.id_print\
+	+'% identity.\n\n'+color.END)
+
+	os.system(diamond_makedb)
+	os.system(diamond_self)
+
+	return cat_folder+'SpreadSheets/'+args.file_prefix+'.Concatenated.Self.'+str(args.id_print)+'ID.tsv'
+
+
+##########################################################################################
+###------------------- Uses USearch to perform Self-vs-Self "BLAST" -------------------###
+##########################################################################################
+
+def check_Self_vs_Self(tsv_file):
+
+	evaluation = ''
+
+	tsv_in = [i for i in open(tsv_file).read().split('\n') if i != '']
+
+	if len(tsv_in) == 0:
+		evaluation = 'empty'
+		with open(tsv_file,'w+') as w:
+			w.write('No Self-vs-Self hits were found')
+	else:
+		evaluation = 'continue'
+
+	return evaluation
+
+
+
+##########################################################################################
+###-------------------- Removes Nearly Identical ORFs from Data Set -------------------###
+##########################################################################################
+
+def filter_NTD_data(args):
+
+	cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/'
+	proc_folder = args.all_output_folder + args.file_prefix + '/Processed/'
+
+	##########################################
+	## Set-up Useful Lists and Dictionaries ##
+	##########################################
+
+	nuc_Above98_hit = {}
+	seqs_to_toss = []
+	prepped_NTD = []
+	prepped_AA = []
+
+	nuc_tsv_100 = 0
+
+	replicates = ''
+
+	if len(args.file_listNTD) > 1:
+		replicates = 'yes'
+	else:
+		replicates = 'nope'
+
+	OGLenDB = {}
+	for rec in SeqIO.parse(args.hook_fasta, 'fasta'):
+		if rec.id[-10:] not in OGLenDB:
+			OGLenDB.update({ rec.id[-10:] : [] })
+
+		OGLenDB[rec.id[-10:]].append(len(str(rec.seq)))
+
+	for og in OGLenDB:
+		OGLenDB[og] = mean(OGLenDB[og])
+
+	print (color.BOLD+'\n\nRemoving Partial '+color.PURPLE+'ORFs'+color.END+color.BOLD+\
+	' with >'+args.id_print+'% Nucleotide Identity over >70% of\ntheir length when '\
+	'compared to more complete '+color.PURPLE+'ORFs '+color.END+color.BOLD+'from: '\
+	+color.CYAN+args.file_prefix+'\n\n'+color.END)
+
+	#####################################################################
+	## Self-v-self BLAST Output Parsing - first checks for Seq-length! ##
+	#####################################################################
+
+	nuc_tsv_raw = [i.rstrip('\n') for i in open(cat_folder+'SpreadSheets/'+args.file_prefix\
+	+'.Concatenated.Self.'+str(args.id_print)+'ID.tsv').readlines() if i != '\n']
+
+	too_long = 0
+	for line in nuc_tsv_raw:
+		og_number = re.split('OG.{1}_', line)[-1][:6]
+		og_prefix = line.split(og_number)[0][-4:]
+		og = og_prefix + og_number
+
+		if og in OGLenDB.keys():
+			if int(line.split('Len')[-1].split('_')[0]) > 4.5*OGLenDB[og] or int(line.split('Len')[-1].split('_')[0]) < 1.5*OGLenDB[og]:
+				seqs_to_toss.append(line.split('\t')[1])
+				too_long += 1
+
+	nuc_tsv = [line for line in nuc_tsv_raw if line.split('\t')[1] not in seqs_to_toss]
+
+	if len(nuc_tsv) > 0:
+		if 'Cov' in nuc_tsv[0].split('\t')[0].split('_')[-3]:
+			nuc_tsv.sort(key=lambda x: (-int(x.split('\t')[1].split('Len')[-1].split('_')[0]),-int(x.split('\t')[1].split('Cov')[-1].split('_')[0])))
+		else:
+			nuc_tsv.sort(key=lambda x: -int(x.split('\t')[1].split('Len')[-1].split('_')[0]))
+
+
+	for line in nuc_tsv:
+		if line.split('\t')[1] not in seqs_to_toss:
+			nuc_Above98_hit.setdefault(line.split('\t')[1],[]).append(line.split('\t')[0])
+			seqs_to_toss.append(line.split('\t')[0])
+			if line.split('\t')[2] == '100.0':
+				nuc_tsv_100 += 1
+
+	seqs_to_toss = list(set(seqs_to_toss))
+	inFasta_NTD_rawLen = [i for i in SeqIO.parse(cat_folder+args.file_prefix+'.NTD.Concatenated.fasta', 'fasta') if i.description]
+	inFasta_NTD = [i for i in inFasta_NTD_rawLen if i.description not in seqs_to_toss]
+	inFasta_AA = [i for i in SeqIO.parse(cat_folder+args.file_prefix+'.AA.Concatenated.fasta','fasta') if i.description not in seqs_to_toss]
+
+	if replicates != '':
+		for i in inFasta_NTD:
+			if i.description not in nuc_Above98_hit.keys():
+				prepped_NTD.append('>'+'_'.join(i.description.split('_')[1:])+'_Trans1\n'+str(i.seq))
+			else:
+				Rep_Num = str(len(set([i.description.split('_')[0]]+[j.split('_')[0] for j in nuc_Above98_hit[i.description]])))
+				prepped_NTD.append('>'+'_'.join(i.description.split('_')[1:])+'_Trans'+Rep_Num+'\n'+str(i.seq))
+		for i in inFasta_AA:
+			if i.description not in nuc_Above98_hit.keys():
+				prepped_AA.append('>'+'_'.join(i.description.split('_')[1:])+'_Trans1\n'+str(i.seq).replace('*','X'))
+			else:
+				Rep_Num = str(len(set([i.description.split('_')[0]]+[j.split('_')[0] for j in nuc_Above98_hit[i.description]])))
+				prepped_AA.append('>'+'_'.join(i.description.split('_')[1:])+'_Trans'+Rep_Num+'\n'+str(i.seq).replace('*','X'))
+	else:
+		for i in inFasta_NTD:
+			if i.description not in nuc_Above98_hit.keys():
+				prepped_NTD.append('>'+i.description+'\n'+str(i.seq))
+			else:
+				prepped_NTD.append('>'+i.description+'\n'+str(i.seq))
+		for i in inFasta_AA:
+			if i.description not in nuc_Above98_hit.keys():
+				prepped_AA.append('>'+i.description+'\n'+str(i.seq).replace('*','X'))
+			else:
+				prepped_AA.append('>'+i.description+'\n'+str(i.seq).replace('*','X'))
+
+	with open(args.all_output_folder + args.file_prefix + '/'+args.file_prefix+'_SeqPairsAbove98.txt','w+') as w:
+		for k, v in nuc_Above98_hit.items():
+			w.write(k+'\t'+'\t'.join(v)+'\n')
+
+	###################################################################################
+	## Check for abnormally short sequences for the taxon for every Gene Family (OG) ##
+	###################################################################################
+
+	print (color.BOLD+'Removing Abnormally Short (70% length) OR Long (200% length)'\
+	+color.PURPLE+' ORFs'+color.END+color.BOLD+'\ncompared to typical '+color.ORANGE+'Gene '\
+	'Family '+color.END+color.BOLD+'member length for: '+color.CYAN+args.file_prefix+'\n\n'+color.END)
+
+	self_OGLenDB={} ##
+	seqs_to_toss = [] ##
+	too_long = too_short = 0 ##
+
+	for i in prepped_NTD:
+		og_number = re.split('OG.{1}_', i.split('\n')[0])[-1][:6]
+		og_prefix = i.split('\n')[0].split(og_number)[0][-4:]
+		og = og_prefix + og_number
+
+		self_OGLenDB.setdefault(og,[]).append(len(i.split('\n')[-1]))
+
+	good_NTD_names = []
+	for i in prepped_NTD:
+		og_number = re.split('OG.{1}_', i.split('\n')[0])[-1][:6]
+		og_prefix = i.split('\n')[0].split(og_number)[0][-4:]
+		og = og_prefix + og_number
+
+		if (0.7*sum(self_OGLenDB[og])/float(len(self_OGLenDB[og]))) <= len(i.split('\n')[-1]) <= (2*sum(self_OGLenDB[og])/float(len(self_OGLenDB[og]))):
+			good_NTD_names.append(i.split('\n')[0])
+
+	good_NTD_seqs = [i for i in prepped_NTD if i.split('\n')[0] in good_NTD_names]
+	good_AA_seqs = [i for i in prepped_AA if i.split('\n')[0] in good_NTD_names]
+
+	too_short = len(prepped_NTD) - len(good_NTD_names)
+
+	####################################################################
+	## Finalized Outputs are Summarized and Written Out to New Fastas ##
+	####################################################################
+
+	print (color.BOLD+'There were '+color.CYAN+str(len(inFasta_NTD_rawLen))+color.END+color.BOLD\
+	+color.PURPLE+' ORFs '+color.END+color.BOLD+'originally, with '+color.ORANGE+\
+	str(nuc_tsv_100)+color.END+color.BOLD+' Partial '+color.PURPLE+'ORFs'+color.END+\
+	color.BOLD+' that\nwere '+color.RED+'100% Identical'+color.END+color.BOLD+' to larger'\
+	+color.PURPLE+' ORFs.\n\n'+color.END)
+
+	print(color.BOLD+'Of the '+color.CYAN+str(len(inFasta_NTD_rawLen))+color.END+color.BOLD\
+	+' original'+color.PURPLE+' ORFs'+color.END+color.BOLD+', '+color.ORANGE+str(len(set(seqs_to_toss)))+\
+	color.END+color.BOLD+' are '+color.PURPLE+'Partial ORFs '+color.END+color.BOLD+'(e.g. '+\
+	color.RED+'> '+args.id_print+'%'+color.END+color.BOLD+'\nNUCLEOTIDE identity) to larger'\
+	+color.PURPLE+' ORFs'+color.END+color.BOLD+' with '+color.ORANGE+str(too_short+too_long)\
+	+color.END+color.BOLD+' additional'+color.PURPLE+' ORFs\n'+color.END+color.BOLD+'that were either '+\
+	color.RED+'TOO LONG or SHORT.\n\n'+color.END)
+
+	print (color.BOLD+'Overall, there are '+color.GREEN+str(len(good_NTD_seqs))+' Unique ORFs'\
+	+color.END+color.BOLD+' for '+color.CYAN+args.file_prefix+'\n'+color.END)
+
+	with open(proc_folder+args.file_prefix+'_Filtered.Final.NTD.ORF.fasta','w+') as w:
+		for i in good_NTD_seqs:
+			w.write(i+'\n')
+	with open(proc_folder+args.file_prefix+'_Filtered.Final.AA.ORF.fasta','w+') as x:
+		for i in good_AA_seqs:
+			x.write(i+'\n')
+
+	return good_NTD_names
+
+
+##########################################################################################
+###------------------- Updates SpreadSheet with Update Sequence Names -----------------###
+##########################################################################################
+
+def update_tsv(args, NTD_list_names):
+
+	cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/SpreadSheets/'
+	proc_folder = args.all_output_folder + args.file_prefix + '/Processed/'
+
+	inTSV = {'_'.join(i.split('\t')[0].split('_')[1:]):'\t'.join(i.split('\t')[1:]) for i in open(cat_folder+\
+	args.file_prefix+'_Concatenated.allOGCleanresults.tsv').readlines() if i != '\n'}
+
+	Updated_inTSV = [i.strip('>')+'\t'+inTSV[i.split('_Trans')[0].strip('>')] for i in NTD_list_names]
+
+	with open(proc_folder+'/SpreadSheets/'+args.file_prefix+'_Filtered.Final.allOGCleanresults.tsv','w+') as w:
+		for line in Updated_inTSV:
+			w.write(line+'\n')
+
+
+def no_partials_present(args, OGLenDB):
+
+	print (color.BOLD+color.RED+'\n\nWarning:'+color.END+color.BOLD+' No partial sequences'\
+	' were found with > '+str(args.id_print)+'% nucleotide identity.\n\nThe data will still be '\
+	'checked for ORFs that are unexpectedly '+color.ORANGE+'Short'+color.END+color.BOLD+' or'\
+	+color.ORANGE+' Long.\n\n'+color.END)
+
+	cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/'
+	proc_folder = args.all_output_folder + args.file_prefix + '/Processed/'
+
+	NTD_file = cat_folder+args.file_prefix+'.NTD.Concatenated.fasta'
+	AA_file = cat_folder+args.file_prefix+'.AA.Concatenated.fasta'
+	TSV_file = cat_folder+'/SpreadSheets/'+args.file_prefix+'_Concatenated.allOGCleanresults.tsv'
+
+	OGLenDB = {}
+	for rec in SeqIO.parse(args.hook_fasta, 'fasta'):
+		if rec.id[-10:] not in OGLenDB:
+			OGLenDB.update({ rec.id[-10:] : [] })
+
+		OGLenDB[rec.id[-10:]].append(len(str(rec.seq)))
+
+	for og in OGLenDB:
+		OGLenDB[og] = mean(OGLenDB[og])
+
+	self_OGLenDB = {}
+	seqs_to_toss = []
+	too_long, too_short = 0, 0
+
+    ## Small changes in this section for Auden (ought to work now)
+    ## Lists -> Dictionaries and some data curation steps
+
+	inFasta = {i.description:str(i.seq) for i in SeqIO.parse(NTD_file,'fasta')}
+
+	for k,v in inFasta.items():
+		og_number = re.split('OG.{1}_', k)[-1][:6]
+		og_prefix = k.split(og_number)[0][-4:]
+		og = og_prefix + og_number
+
+		if len(v) >= 4.5*OGLenDB[og]:
+			seqs_to_toss.append(k)
+			too_long+= 1
+
+	prepped_NTD = [i for i in inFasta if i not in seqs_to_toss]
+
+	print (color.BOLD+'Removing Abnormally Short (70% length) OR Long (200% length)'\
+	+color.PURPLE+' ORFs'+color.END+color.BOLD+'\ncompared to typical '+color.ORANGE+'Gene '\
+	'Family '+color.END+color.BOLD+'member length for: '+color.CYAN+args.file_prefix+'\n\n'+color.END)
+
+    ## toss those sequences from the sequence dictonary (less headache)
+	for crap_seq in seqs_to_toss:
+		del inFasta[crap_seq]
+
+	for k, v in inFasta.items():
+		og_number = re.split('OG.{1}_', k)[-1][:6]
+		og_prefix = k.split(og_number)[0][-4:]
+		og = og_prefix + og_number
+
+		self_OGLenDB.setdefault(og,[]).append(len(v))
+
+	self_OGLenDB_Final = {k:sum(v)/len(v) for k, v in self_OGLenDB.items()}
+
+	good_NTD_data = { }
+	for k, v in inFasta.items():
+		og_number = re.split('OG.{1}_', k)[-1][:6]
+		og_prefix = k.split(og_number)[0][-4:]
+		og = og_prefix + og_number
+
+		if 0.7*self_OGLenDB_Final[og] <= len(v) <= 2*self_OGLenDB_Final[og]:
+			good_NTD_data.update({ k : v })
+
+
+	good_AA_data = {i.description:str(i.seq) for i in SeqIO.parse(AA_file,'fasta') if i.description in good_NTD_data.keys()}
+
+	good_TSV_data = [i for i in open(cat_folder+'/SpreadSheets/'+args.file_prefix+'_Concatenated.allOGCleanresults.tsv')\
+		.read().split('\n') if i != '' and i.split('\t')[0] in good_NTD_data.keys()]
+
+	renamed_TSV_data = [i.split('\t')[0]+'_Trans1\t'+'\t'.join(i.split('\t')[1:]) for i in good_TSV_data]
+
+
+	with open(proc_folder+args.file_prefix+'_Filtered.Final.NTD.ORF.fasta','w+') as w:
+		for k,v in good_NTD_data.items():
+			w.write('>'+k+'_Trans1\n'+v+'\n')
+
+	with open(proc_folder+args.file_prefix+'_Filtered.Final.AA.ORF.fasta','w+') as x:
+		for k, v in good_AA_data.items():
+			x.write('>'+k+'_Trans1\n'+v+'\n')
+
+	with open(proc_folder+'/SpreadSheets/'+args.file_prefix+'_Filtered.Final.allOGCleanresults.tsv','w+') as y:
+		y.write('\n'.join(renamed_TSV_data))
+
+
+##########################################################################################
+###--------------------- Cleans up the Folder and Moves Final Files -------------------###
+##########################################################################################
+
+def clean_up(args):
+
+	for i in args.file_listNTD:
+		os.system('mv ' + i + ' ' + args.all_output_folder + args.file_prefix + '/Original/')
+		os.system('mv ' + i.replace('NTD.ORF.fasta','AA.ORF.fasta') + ' ' + args.all_output_folder + args.file_prefix + '/Original/')
+		os.system('mv ' + i.split('named')[0]+'named*allOGCleanresults.tsv ' + args.all_output_folder + args.file_prefix + '/Original/SpreadSheets/')
+
+
+###########################################################################################
+###-------------------------------- Next Script Message --------------------------------###
+###########################################################################################
+
+def next_script():
+
+	print(color.BOLD+'\nNext Script is: '+color.GREEN+'6b_update_cov_post_removepartials.py\n\n'+color.END)
+
+
+##########################################################################################
+###------------------- Checks Command Line Arguments and Calls Steps ------------------###
+##########################################################################################
+
+def main():
+
+	diamond_path = check_diamond_path()
+
+	args = check_args()
+
+	prep_folders(args)
+
+	merge_relevant_data(args)
+
+	self_BLAST_out = self_blast(args, diamond_path)
+
+	evaluation = check_Self_vs_Self(self_BLAST_out)
+
+	if evaluation != 'empty':
+		NTD_names = filter_NTD_data(args)
+		update_tsv(args, NTD_names)
+	else:
+		no_partials_present(args)
+
+	clean_up(args)
+
+	next_script()
+
+main()
--- a/PTL1/Transcriptomes/Scripts/6b_update_cov_post_removepartials.py
+++ b/PTL1/Transcriptomes/Scripts/6b_update_cov_post_removepartials.py
@ -0,0 +1,88 @@
+#!/usr/bin/python
+from __future__ import print_function
+
+__author__ = "Jean-David Grattepanche"
+__version__ = "2, August 28, 2017"
+__email__ = "jeandavid.grattepanche@gmail.com"
+
+
+import sys
+import os
+import re
+import time
+import string
+import os.path
+from Bio import SeqIO
+from sys import argv
+
+def Addcoverage(code):
+	seqfolder = code
+	all_output_folder = '/'.join(code.split('/')[:-1])
+	code = code.split('/')[-1]
+	
+	covupd = {}
+	for seqcoll in open(seqfolder + '/' + code + '_SeqPairsAbove98.txt','r'):
+		CL = 0
+		for transc in seqcoll.split('\t'):
+			if CL == 0:
+				reftrans = ('_').join(transc.split('_')[1:])
+			coverage = int(transc.split('Cov')[1].split('_')[0])
+			Length = int(transc.split('Len')[1].split('_')[0])
+			CL += coverage * Length
+		covupd[reftrans] = CL 
+
+	if os.path.isdir(seqfolder + '/Updated_Coverage/') != True:
+		os.system('mkdir ' + seqfolder + '/Updated_Coverage/')
+	if os.path.isdir(seqfolder + '/Updated_Coverage/SpreadSheets/') != True:
+		os.system('mkdir ' + seqfolder + '/Updated_Coverage/SpreadSheets/')
+
+	for spreadsh in os.listdir(seqfolder + '/Processed/SpreadSheets/'):
+		if spreadsh.endswith('.tsv'):
+			outtsvtokeep = open(seqfolder + '/Updated_Coverage/SpreadSheets/' + spreadsh.split('Final')[0] + 'UC.Final' + spreadsh.split('Final')[1],'w+')
+			for row in open(seqfolder + '/Processed/SpreadSheets/'+ spreadsh, 'r'):
+				if row.split('_Trans')[0] in covupd:
+					og_number = re.split('OG.{1}_', row)[-1][:6]
+					og_prefix = row.split(og_number)[0][-4:]
+					og = og_prefix + og_number
+
+					newcov2 = round(covupd[row.split('_Trans')[0]] / int(row.split('_Len')[1].split('_')[0]))
+					outtsvtokeep.write(row.split('Cov')[0]+'Cov'+str(newcov2)+'_' + og_prefix +row.split(og_prefix)[1].split('_Trans')[0] +'\t' +('\t').join(row.split('\t')[1:]))
+				else: 
+					if 'Trans' in row:
+						outtsvtokeep.write(row.split('_Trans')[0]+ '\t' +('\t').join(row.split('\t')[1:]))
+					else:
+						outtsvtokeep.write(row)
+			outtsvtokeep.close()
+
+	for seqfile in os.listdir(seqfolder + '/Processed'):
+		if seqfile.endswith('.fasta'):
+			outseqtokeep = open(seqfolder + '/Updated_Coverage/' + seqfile.split('Final')[0] + 'UC.Final' + seqfile.split('Final')[1],'w+')
+			for Seq in SeqIO.parse(seqfolder + '/Processed/' + seqfile ,'fasta'):
+				if Seq.description.split('_Trans')[0] not in covupd:
+					outseqtokeep.write('>'+Seq.description.split('_Trans')[0]+ '\n'+str(Seq.seq) +'\n')
+				else:
+					og_number = re.split('OG.{1}_', Seq.description)[-1][:6]
+					og_prefix = Seq.description.split(og_number)[0][-4:]
+					og = og_prefix + og_number
+
+					newcov = round(covupd[Seq.description.split('_Trans')[0]] / int(Seq.description.split('_Len')[1].split('_')[0]))
+					outseqtokeep.write('>'+Seq.description.split('Cov')[0]+'Cov'+str(newcov)+'_' + Seq.description.split(og)[0][-2:] + og + '\n'+str(Seq.seq) +'\n')
+			outseqtokeep.close()
+
+	if os.path.isdir(all_output_folder + '/ToRename') != True:
+		os.system('mkdir ' + all_output_folder + '/ToRename')
+
+	os.system('cp ' + seqfolder + '/Updated_Coverage/*fasta ' + all_output_folder + '/ToRename/')
+	os.system('cp ' + seqfolder + '/Updated_Coverage/SpreadSheets/*tsv ' + all_output_folder + '/ToRename/')
+			
+			
+def main():
+	script, code = argv
+	Addcoverage(code)
+main()
+
+
+
+
+
+
--- a/PTL1/Transcriptomes/Scripts/7_FinalizeName.py
+++ b/PTL1/Transcriptomes/Scripts/7_FinalizeName.py
@ -0,0 +1,398 @@
+#!/usr/bin/env python3.5
+
+##__Updated__: 31_08_2017
+##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
+##__Usage__: python 6_FilterPartials.py --help
+
+##################################################################################################
+## This script is intended to rename the outputs of the FilterPartials script					##
+## to a given 10-character that is used in the Katz lab Phylogenomic Tree building methods		##
+##																								##
+## Prior to r`ning this script, ensure the following:											##
+##																								##
+## 1. You have assembled your transcriptome and COPIED the 'assembly' file 						##
+##    (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder							##
+## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py			##
+## 3. Removed SSU/LSU sequences from your Fasta File											##
+## 4. Classified your sequences as Strongly Prokaryotic/Eukaryotic or Undetermined				##
+## 5. Classified the Non-Strongly Prokaryotic sequences into OGs 								##
+## 6. You either know (or have inferred) the genetic code of the organism						##
+## 7. You have translated the sequences and checked for the data in the RemovePartials folder	##
+## 8. Partial sequences have been removed from the transcriptomic data sets						##
+##																								##
+## 										COMMAND Example Below									##
+##									Extra Notes at Bottom of Script								##
+##																								##
+## 					E-mail Xyrus (author) for help if needed: maurerax@gmail.com				##
+##																								##
+##										Next Script(s) to Run: 									##
+##                                     NONE! You're FINISHED! :D                                ##
+##																								##
+##################################################################################################
+
+import argparse, os, sys
+from argparse import RawTextHelpFormatter,SUPPRESS
+
+#----------------------- Solely to Make Print Statements Colorful -----------------------#
+
+class color:
+   PURPLE = '\033[95m'
+   CYAN = '\033[96m'
+   DARKCYAN = '\033[36m'
+   ORANGE = '\033[38;5;214m'
+   BLUE = '\033[94m'
+   GREEN = '\033[92m'
+   YELLOW = '\033[93m'
+   RED = '\033[91m'
+   BOLD = '\033[1m'
+   UNDERLINE = '\033[4m'
+   END = '\033[0m'
+
+
+#------------------------------- Main Functions of Script --------------------------------#
+
+###########################################################################################
+###--------------------- Parses and Checks Command-Line Arguments ----------------------###
+###########################################################################################
+
+def check_args():
+
+	parser = argparse.ArgumentParser(description=
+	color.BOLD + '\n\nThis script is intended to '+color.RED+'Rename '+color.END\
+	+color.BOLD+'the core set of '+color.PURPLE+'ORFS\n'+color.END+color.BOLD+'with a valid '\
+	+color.RED+'10-character code'+color.END+color.BOLD+' for use in the KatzLab\nPhylogenomic Pipeline'\
+	+usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter)
+	
+	required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
+ 
+	required_arg_group.add_argument('--input_file','-in', action='store',
+	help=color.BOLD+color.GREEN+' One of the Fasta files that is to be renamed\n'+color.END)
+	required_arg_group.add_argument('--name','-n', action='store',
+	help=color.BOLD+color.GREEN+' A valid 10-Character code for updating the data\n'+color.END)
+
+
+	optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
+
+	optional_arg_group.add_argument('-author', action='store_true',
+	help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
+
+	if len(sys.argv[1:]) == 0:
+		print (parser.description)
+		print ('\n')
+		sys.exit()
+
+	args = parser.parse_args()
+	
+	quit_eval = return_more_info(args)
+	if quit_eval > 0:
+		print ('\n')
+		sys.exit()
+
+	args.all_output_folder = '/'.join(args.input_file.split('/')[:-2])
+
+	if '.allOGCleanresults' in args.input_TSV:
+		args.out_XML = args.name+'_XX_'+args.input_TSV.split('/')[-1].replace('.allOGCleanresults.','.AA.ORF.')\
+		.replace('.tsv','.fasta')+'_1e-10keepall_BlastOutall.oneHit'
+	else:
+		args.out_XML = args.name+'_XX_'+args.input_TSV.split('/')[-1].replace('_allOGCleanresults.','_AA.ORF.')\
+		.replace('.tsv','.fasta')+'_1e-10keepall_BlastOutall.oneHit'
+	
+	args.file_prefix = args.input_file.split('/')[-1].split('_Filtered.Final')[0]
+	if 'fasta' in args.file_prefix:
+		args.file_prefix = args.name
+
+	args.r2g_aa = args.all_output_folder + '/ReadyToGo/ReadyToGo_AA/'
+	args.r2g_ntd = args.all_output_folder + '/ReadyToGo/ReadyToGo_NTD/'
+	args.r2g_tsv = args.all_output_folder + '/ReadyToGo/ReadyToGo_TSV/'
+	args.r2g_xml = args.all_output_folder + '/ReadyToGo/ReadyToGo_XML/'
+	
+
+	return args
+
+
+###########################################################################################
+###------------------------------- Script Usage Message --------------------------------###
+###########################################################################################
+
+def usage_msg():
+	return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 7_FinalizeName.py'\
+	' --input_file ../ToRename/Op_me_Xxma_Filtered.Final.AA.ORF.fasta --name Op_me_Xxma'+color.END)
+
+
+##########################################################################################
+###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
+##########################################################################################
+
+def return_more_info(args):
+
+	valid_args = 0
+
+	author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
+	' maurerax@gmail.com\n\n'+color.END)
+
+	if args.author == True:
+		print (author)
+		valid_args += 1
+
+	if args.input_file.endswith('AA.ORF.fasta'):
+		args.input_NTD = args.input_file.replace('AA.ORF.fasta','NTD.ORF.fasta')
+		args.input_AA = args.input_file
+# 		args.input_TSV = ('/').join(args.input_file.split('/')[:-1])+'/SpreadSheets/'+args.input_file.split('/')[-1].replace('AA.ORF.fasta','allOGCleanresults.tsv')
+		args.input_TSV = args.input_file.replace('AA.ORF.fasta','allOGCleanresults.tsv')
+
+	elif args.input_file.endswith('NTD.ORF.fasta'):
+		args.input_NTD = args.input_file
+		args.input_AA = args.input_file.replace('NTD.ORF.fasta','AA.ORF.fasta')
+# 		args.input_TSV = ('/').join(args.input_file.split('/')[:-1])+'/SpreadSheets/'+args.input_file.split('/')[-1].replace('NTD.ORF.fasta','allOGCleanresults.tsv')
+		args.input_TSV = args.input_file.replace('AA.ORF.fasta','allOGCleanresults.tsv')
+		print(args.input_TSV)
+
+	if os.path.isfile(args.input_NTD) != True:
+		print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Nucleotide '\
+		'Fasta file ('+color.DARKCYAN+args.input_NTD.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
+		' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) 
+		valid_args += 1
+
+	if os.path.isfile(args.input_AA) != True:
+		print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Protein '\
+		'Fasta file ('+color.DARKCYAN+args.input_AA.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
+		' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) 
+		valid_args += 1
+
+	if os.path.isfile(args.input_TSV) != True:
+		print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided TSV '\
+		' file ('+color.DARKCYAN+args.input_TSV.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
+		' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) 
+		valid_args += 1
+
+	return valid_args
+
+###########################################################################################
+###-------------------- Double Checks Format for 10-Character Code ---------------------###
+###########################################################################################
+
+def check_code(args):
+	
+	check_name = args.name.split('_')
+	
+	if len(args.name) != 10:
+		print (color.BOLD+'\n\nNew Species Prefix is not 10 characters long\n\n')
+		print ('Three examples below:\n'+color.CYAN+'\n\tSr_ci_Cunc\n\n\tOp_me_Hsap\n\n\t'\
+		'Am_ar_Ehis\n\n'+color.END) 
+		sys.exit()
+
+	elif args.name.count('_') != 2:
+		print (color.BOLD+'\n\nCheck the format of your Species Prefix!\n\n')
+		print ('Three examples below:\n'+color.CYAN+'\n\tSr_ci_Cunc\n\n\tOp_me_Hsap\n\n\t'\
+		'Am_ar_Ehis\n\n'+color.END) 
+
+		sys.exit()
+
+	if len(check_name[0]) == 2 and len(check_name[1]) == 2 and len(check_name[2]) == 4:
+		print (color.BOLD+"\n\nRenaming "+color.ORANGE+args.input_file.split('/')[-1]\
+		.split('_Filtered')[0]+color.END+color.BOLD+"'s files with the following 10-character\n"\
+		"code: "+color.CYAN+args.name+color.END+'\n')
+	else:
+		print (color.BOLD+'\n\nCheck the format of your Species Prefix!\n\n')
+		print ('Three examples below:\n'+color.CYAN+'\n\tSr_ci_Cunc\n\n\tOp_me_Hsap\n\n\t'\
+		'Am_ar_Ehis\n\n'+color.END) 
+		sys.exit()
+
+			
+##########################################################################################
+###------------------------- Creates Folders For Storing Data -------------------------###
+##########################################################################################
+
+def prep_folders(args):
+	
+
+	if os.path.isdir(args.all_output_folder + '/ReadyToGo/') != True:
+		os.system('mkdir ' + args.all_output_folder + '/ReadyToGo')
+
+
+	if os.path.isdir(args.r2g_ntd) != True:
+		os.system('mkdir ' + args.r2g_ntd)
+	if os.path.isdir(args.r2g_aa) != True:
+		os.system('mkdir ' + args.r2g_aa)
+	if os.path.isdir(args.r2g_tsv) != True:
+		os.system('mkdir ' + args.r2g_tsv)
+	if os.path.isdir(args.r2g_xml) != True:
+		os.system('mkdir ' + args.r2g_xml)
+
+	if os.path.isdir(args.all_output_folder + '/' + args.file_prefix + '/Renamed') != True:
+		os.system('mkdir ' + args.all_output_folder + '/' + args.file_prefix + '/Renamed')
+
+###########################################################################################
+###----------- Renames the NTD and AA CDSs with the Given 10-Character Code ------------###
+###########################################################################################
+
+def rename_paralogs(args):
+
+	home_folder = args.all_output_folder + '/' + args.file_prefix + '/Renamed/'
+
+	print (color.BOLD+'\nRenaming Translated (Protein) '+color.PURPLE+'ORFs\n'+color.END)
+	renamed_Final_Prots = open(args.input_AA).read().replace('>','>'+args.name+'_XX_')
+	
+	print (color.BOLD+'\nRenaming Nucleotide '+color.PURPLE+'ORFs\n'+color.END)
+	renamed_Final_Nucs = open(args.input_NTD).read().replace('>','>'+args.name+'_XX_')
+
+	
+	print (color.BOLD+'\nUpdating CDS Names in the Spreadsheet'+color.END)
+	if '\n\n' in open(args.input_TSV).read():
+		renamed_Final_tsv = args.name+'_XX_'+open(args.input_TSV).read().rstrip('\n')\
+		.replace('\n\n','\n'+args.name+'_XX_')
+	else:
+		renamed_Final_tsv = args.name+'_XX_'+open(args.input_TSV).read().rstrip('\n')\
+		.replace('\n','\n'+args.name+'_XX_')
+		
+	with open(home_folder+args.name+'_XX_'+args.input_AA.split('/')[-1],'w+') as w:
+		w.write(renamed_Final_Prots)
+
+	with open(home_folder+args.name+'_XX_'+args.input_NTD.split('/')[-1],'w+') as x:
+		x.write(renamed_Final_Nucs)
+
+	
+	with open(home_folder+args.name+'_XX_'+args.input_TSV.split('/')[-1],'w+') as y:
+		y.write(renamed_Final_tsv)
+
+
+###########################################################################################
+###--------------------------------- Header/Tail Lines ---------------------------------###
+###########################################################################################
+
+def header_tail():
+	header = '<?xml version="1.0"?>\n<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">\n'\
+		'<BlastOutput>\n  <BlastOutput_program>blastp</BlastOutput_program>\n  <BlastOutput_version>BLASTP 2.2.29+</BlastOutput_version>\n'\
+		'  <BlastOutput_reference>Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&amp;auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), &quot;Gapped BLAST and PSI-BLAST: a new generation of protein database search programs&quot;, Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>\n'\
+		'  <BlastOutput_db>../OGBlastDB/renamed_aa_seqs_OrthoMCL-5_12653.fasta</BlastOutput_db>\n  <BlastOutput_query-ID>Query_1</BlastOutput_query-ID>\n'
+
+	tail = '</BlastOutput_iterations>\n</BlastOutput>'
+	return header, tail
+
+
+###########################################################################################
+###------------------------------- TSV to XML Conversion -------------------------------###
+###########################################################################################
+
+def convert_TSV_data(args):
+
+	home_folder = args.all_output_folder + '/' + args.file_prefix + '/Renamed/'
+
+	TSVforConvert = home_folder+args.name+'_XX_'+args.input_TSV.split('/')[-1]
+
+	inTSV = [line.rstrip('\n') for line in open(TSVforConvert).readlines() if line != '\n']
+
+	iterations = []
+
+	for n in range(len(inTSV)):
+		if n == 0:
+			iterations.append('  <BlastOutput_query-def>'+inTSV[n].split('\t')[0]+'</BlastOutput_query-def>\n  <BlastOutput_query-len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'</BlastOutput_query-len>\n'\
+				'  <BlastOutput_param>\n    <Parameters>\n      <Parameters_matrix>BLOSUM62</Parameters_matrix>\n      <Parameters_expect>1e-10</Parameters_expect>\n'\
+				'      <Parameters_gap-open>11</Parameters_gap-open>\n      <Parameters_gap-extend>1</Parameters_gap-extend>\n      <Parameters_filter>F</Parameters_filter>\n'\
+				'    </Parameters>\n  </BlastOutput_param>\n<BlastOutput_iterations>\n<Iteration>\n  <Iteration_iter-num>1</Iteration_iter-num>\n  <Iteration_query-ID>Query_1</Iteration_query-ID>\n'\
+				'  <Iteration_query-def>'+inTSV[n].split('\t')[0]+'</Iteration_query-def>\n  <Iteration_query-len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'</Iteration_query-len>\n'\
+				'<Iteration_hits>\n<Hit>\n  <Hit_num>1</Hit_num>\n  <Hit_id>Fake_Entry</Hit_id>\n  <Hit_def>'+inTSV[n].split('\t')[1]+'</Hit_def>\n  <Hit_accession>Fake_Accession</Hit_accession>\n'\
+				'  <Hit_len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'</Hit_len>\n  <Hit_hsps>\n    <Hsp>\n      <Hsp_num>1</Hsp_num>\n      <Hsp_bit-score>1234</Hsp_bit-score>\n'\
+				'      <Hsp_score>'+inTSV[n].split('\t')[-1]+'</Hsp_score>\n      <Hsp_evalue>'+inTSV[n].split('\t')[-2]+'</Hsp_evalue>\n      <Hsp_query-from>'+inTSV[n].split('\t')[-4]+'</Hsp_query-from>\n'\
+				'      <Hsp_query-to>'+inTSV[n].split('\t')[-3]+'</Hsp_query-to>\n      <Hsp_hit-from>'+inTSV[n].split('\t')[-4]+'</Hsp_hit-from>\n      <Hsp_hit-to>'+inTSV[n].split('\t')[-3]+'</Hsp_hit-to>\n'\
+				'      <Hsp_query-frame>0</Hsp_query-frame>\n      <Hsp_hit-frame>0</Hsp_hit-frame>\n      <Hsp_identity>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'</Hsp_identity>\n'\
+				'      <Hsp_positive>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'</Hsp_positive>\n      <Hsp_gaps>0</Hsp_gaps>\n      <Hsp_align-len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'</Hsp_align-len>\n'\
+				'      <Hsp_qseq></Hsp_qseq>\n      <Hsp_hseq></Hsp_hseq>\n      <Hsp_midline></Hsp_midline>\n    </Hsp>\n  </Hit_hsps>\n</Hit>\n'\
+				'\n</Iteration_hits>\n  <Iteration_stat>\n    <Statistics>\n      <Statistics_db-num>379660</Statistics_db-num>\n      <Statistics_db-len>197499634</Statistics_db-len>\n'\
+				'      <Statistics_hsp-len>123</Statistics_hsp-len>\n      <Statistics_eff-space>184705217500</Statistics_eff-space>\n      <Statistics_kappa>0.041</Statistics_kappa>\n'\
+				'      <Statistics_lambda>0.267</Statistics_lambda>\n      <Statistics_entropy>0.14</Statistics_entropy>\n    </Statistics>\n  </Iteration_stat>\n</Iteration>\n')
+		else:
+			iterations.append('<Iteration>\n  <Iteration_iter-num>'+str(n+1)+'</Iteration_iter-num>\n  <Iteration_query-ID>Query_'+str(n+1)+'</Iteration_query-ID>\n'\
+				'  <Iteration_query-def>'+inTSV[n].split('\t')[0]+'</Iteration_query-def>\n  <Iteration_query-len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'</Iteration_query-len>\n'\
+				'<Iteration_hits>\n<Hit>\n  <Hit_num>1</Hit_num>\n  <Hit_id>Fake_Entry</Hit_id>\n  <Hit_def>'+inTSV[n].split('\t')[1]+'</Hit_def>\n  <Hit_accession>Fake_Accession</Hit_accession>\n'\
+				'  <Hit_len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'</Hit_len>\n  <Hit_hsps>\n    <Hsp>\n      <Hsp_num>1</Hsp_num>\n      <Hsp_bit-score>1234</Hsp_bit-score>\n'\
+				'      <Hsp_score>'+inTSV[n].split('\t')[-1]+'</Hsp_score>\n      <Hsp_evalue>'+inTSV[n].split('\t')[-2]+'</Hsp_evalue>\n      <Hsp_query-from>'+inTSV[n].split('\t')[-4]+'</Hsp_query-from>\n'\
+				'      <Hsp_query-to>'+inTSV[n].split('\t')[-3]+'</Hsp_query-to>\n      <Hsp_hit-from>'+inTSV[n].split('\t')[-4]+'</Hsp_hit-from>\n      <Hsp_hit-to>'+inTSV[n].split('\t')[-3]+'</Hsp_hit-to>\n'\
+				'      <Hsp_query-frame>0</Hsp_query-frame>\n      <Hsp_hit-frame>0</Hsp_hit-frame>\n      <Hsp_identity>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'</Hsp_identity>\n'\
+				'      <Hsp_positive>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'</Hsp_positive>\n      <Hsp_gaps>0</Hsp_gaps>\n      <Hsp_align-len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'</Hsp_align-len>\n'\
+				'      <Hsp_qseq></Hsp_qseq>\n      <Hsp_hseq></Hsp_hseq>\n      <Hsp_midline></Hsp_midline>\n    </Hsp>\n  </Hit_hsps>\n</Hit>\n'\
+				'\n</Iteration_hits>\n  <Iteration_stat>\n    <Statistics>\n      <Statistics_db-num>379660</Statistics_db-num>\n      <Statistics_db-len>197499634</Statistics_db-len>\n'\
+				'      <Statistics_hsp-len>123</Statistics_hsp-len>\n      <Statistics_eff-space>184705217500</Statistics_eff-space>\n      <Statistics_kappa>0.041</Statistics_kappa>\n'\
+				'      <Statistics_lambda>0.267</Statistics_lambda>\n      <Statistics_entropy>0.14</Statistics_entropy>\n    </Statistics>\n  </Iteration_stat>\n</Iteration>\n')
+
+	return iterations
+
+
+###########################################################################################
+###--------------------------- Writes Out the Fake XML File ----------------------------###
+###########################################################################################
+
+def write_Fake_XML(args):
+
+	home_folder = args.all_output_folder + '/' + args.file_prefix + '/'
+
+	print (color.BOLD+'\n\nConverting '+color.ORANGE+args.input_file.split('/')[-1]+color.END\
+	+color.BOLD+' to XML format\n'+color.END)
+
+	header, tail = header_tail()
+	
+	iterations = convert_TSV_data(args)	
+	
+	with open(home_folder+args.out_XML,'w+') as w:
+		w.write(header)
+		w.write(''.join(iterations))
+		w.write(tail)
+		
+##########################################################################################
+###-------------------- Cleans up the Folder and Moves Final Files --------------------###
+##########################################################################################
+def clean_up(args):
+
+	home_folder = args.all_output_folder + '/' + args.file_prefix + '/Renamed/'
+
+	os.system('cp ' + args.all_output_folder + '/' + args.file_prefix+'/'+args.out_XML+' '+args.r2g_xml)
+
+	os.system('cp '+home_folder+'*tsv '+args.r2g_tsv)
+
+	os.system('cp '+home_folder+'*_XX_*AA.ORF.fasta '+args.r2g_aa)
+	os.system('cp '+home_folder+'*_XX_*NTD.ORF.fasta '+args.r2g_ntd)
+
+	os.system('cp '+home_folder+'*_XX_*tsv ' + args.all_output_folder + '/' + args.file_prefix)
+	os.system('cp '+home_folder+'*_XX_*AA.ORF.fasta ' + args.all_output_folder + '/' + args.file_prefix)
+	os.system('cp '+home_folder+'*_XX_*NTD.ORF.fasta ' + args.all_output_folder + '/' + args.file_prefix)
+	
+	os.system('rm ' + args.all_output_folder + '/ToRename/*'+args.file_prefix+'*')
+	
+	if os.path.isdir(args.all_output_folder + '/Finished/') != True:
+		os.system('mkdir ' + args.all_output_folder + '/Finished')
+	
+	os.system('mv ' + args.all_output_folder + '/' + args.file_prefix + ' ' + args.all_output_folder + '/Finished')
+
+###########################################################################################
+###-------------------------------- Next Script Message --------------------------------###
+###########################################################################################
+
+def next_script(args):
+
+	print (color.BOLD+'\nThere is no next script! The final '+color.ORANGE+args.out_XML\
+	.split('_XX')[0]+color.END+color.BOLD+' files can be\nfound in the '+color.RED+\
+	args.out_XML.split('_XX_')[-1].split('_Filtered')[0]+color.END+color.BOLD+' and '\
+	+color.RED+'ReadyToGo folders'+color.END+color.BOLD+' and are ready\n'\
+	'for the KatzLab Phylogenomic Tree-Building Steps!\n\n'+color.END)
+
+##########################################################################################
+###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
+##########################################################################################
+			
+def main():
+
+	args = check_args()
+	
+	check_code(args)
+	
+	prep_folders(args)
+	
+	rename_paralogs(args)
+	
+	write_Fake_XML(args)
+	
+	clean_up(args)
+	
+	next_script(args)
+	
+main()
--- a/PTL1/Transcriptomes/Scripts/8_SummaryStats.py
+++ b/PTL1/Transcriptomes/Scripts/8_SummaryStats.py
@ -0,0 +1,269 @@
+import os, sys
+import argparse
+from Bio import SeqIO
+import CUB
+from statistics import mean
+from math import ceil, floor
+from tqdm import tqdm
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+def get_args():
+
+	parser = argparse.ArgumentParser(
+		prog = 'PTL6p1 Script 8: Stat Summary',
+		description = "Updated March 31th, 2023 by Auden Cote-L'Heureux"
+	)
+
+	parser.add_argument('-i', '--input', type = str, required = True, help = 'Input path to the "Output" folder produced by PhyloToL Part 1. This folder should contain both the "ReadyToGO" and "Intermediate" folders.')
+	parser.add_argument('-d', '--databases', type = str, default = '../Databases', help = 'Path to databases folder')
+	parser.add_argument('-r', '--r2g_jf', action = 'store_true', help = 'Create ReadyToGo files filtered to only include sequences between the 25th and 75th percentile of silent-site GC content. Please be aware that these are not necessarily the correct or non-contaminant sequences; examine the GC3xENc plots carefully before using these data.')
+
+	return parser.parse_args()
+
+
+def hook_lens(args):
+
+	print('\nGetting average OG lengths in the Hook DB...')
+
+	len_by_og = { }
+	for file in os.listdir(args.databases + '/db_OG'):
+		if file.endswith('.fasta') and os.path.isfile(args.databases + '/db_OG/' + file.replace('.fasta', '.dmnd')):
+			for rec in tqdm(SeqIO.parse(args.databases + '/db_OG/' + file, 'fasta')):
+				if rec.id[-10:] not in len_by_og:
+					len_by_og.update({ rec.id[-10:] : [] })
+
+				len_by_og[rec.id[-10:]].append(len(str(rec.seq)))
+
+	for og in len_by_og:
+		len_by_og[og] = mean(len_by_og[og])
+
+	return len_by_og
+
+
+def aa_comp_lengths(args, gcodes):
+
+	print('\nGetting amino acid composition data from ReadyToGo files...')
+
+	r2g_lengths = { }; aa_comp = { }; recid_by_contig_n = { }
+	for file in tqdm([f for f in os.listdir(args.input + '/ReadyToGo/ReadyToGo_AA')]):
+		if file.endswith('.fasta') and file[:10] in gcodes:
+			for rec in SeqIO.parse(args.input + '/ReadyToGo/ReadyToGo_AA/' + file, 'fasta'):
+				r2g_lengths.update({ rec.id : len(str(rec.seq)) * 3 })
+
+				fymink = 0; garp = 0; other = 0; total = 0
+				for char in str(rec.seq):
+					if char in 'FYMINK':
+						fymink += 1
+					elif char in 'GARP':
+						garp += 1
+					else:
+						other += 1
+
+					total += 1
+
+				aa_comp.update({ rec.id : { 'FYMINK' : fymink/total, 'GARP' : garp/total, 'Other' : other/total } })
+
+				recid_by_contig_n.update({ rec.id.split('Contig_')[-1].split('_')[0] : rec.id })
+
+	print('\nGetting transcript sequence data from original assembled transcript files...')
+
+	transcripts = { }; transcript_id_corr = { }
+	for tax in tqdm([f for f in os.listdir(args.input + '/Intermediate/TranslatedTranscriptomes')]):
+		if os.path.isdir(args.input + '/Intermediate/TranslatedTranscriptomes/' + tax + '/OriginalFasta'):
+			for file in os.listdir(args.input + '/Intermediate/TranslatedTranscriptomes/' + tax + '/OriginalFasta'):
+				if file.endswith('Original.fasta') and file[:10] in gcodes:
+					for rec in SeqIO.parse(args.input + '/Intermediate/TranslatedTranscriptomes/' + tax + '/OriginalFasta/' + file, 'fasta'):
+						transcripts.update({ rec.id : (file[:10], str(rec.seq)) })
+						if rec.id.split('NODE_')[-1].split('_')[0] in recid_by_contig_n:
+							transcript_id_corr.update({ recid_by_contig_n[rec.id.split('NODE_')[-1].split('_')[0]] : rec.id})
+
+	return aa_comp, transcripts, r2g_lengths, transcript_id_corr
+
+
+def get_nuc_comp(args, gcodes):
+
+	print('\nGetting nucleotide composition data from ReadyToGo files...')
+
+	nuc_comp = { }
+	for file in tqdm([f for f in os.listdir(args.input + '/ReadyToGo/ReadyToGo_NTD')]):
+		if file.endswith('.fasta') and file[:10] in gcodes:
+			cub_out = CUB.CalcRefFasta(args.input + '/ReadyToGo/ReadyToGo_NTD/' + file, gcodes[file[:10]])[0]
+			for k in cub_out:
+				nuc_comp.update({ k : cub_out[k] })
+
+	return nuc_comp
+
+
+def per_seq(args, nuc_comp, aa_comp, all_transcripts, r2g_lengths, transcript_id_corr):
+
+	og_mean_lens = hook_lens(args)
+
+	if not os.path.isdir(args.input + '/PerSequenceStatSummaries'):
+		os.mkdir(args.input + '/PerSequenceStatSummaries')
+
+	taxa = list(dict.fromkeys([seq[:10] for seq in nuc_comp]))
+
+	for taxon in taxa:
+		with open(args.input + '/PerSequenceStatSummaries/' + taxon + '.csv', 'w') as o:
+			o.write('Sequence,Taxon,OG,Transcript,TranscriptLength,CDSLength,AvgLengthOGinHook,AmbiguousCodons,GC-Overall,GC1,GC2,GC3,GC3-Degen,ExpWrightENc,ObsWrightENc_6Fold,ObsWrightENc_No6Fold,ObsWeightedENc_6Fold,ObsWeightedENc_No6Fold,FYMINK,GARP,OtherAA\n')
+			for rec in nuc_comp:
+				if rec[:10] == taxon:
+					o.write(rec + ',' + rec[:10] + ',' + rec[-10:])
+
+					try:
+						o.write(',' + transcript_id_corr[rec] + ',' + str(len(all_transcripts[transcript_id_corr[rec]][1])))
+					except KeyError:
+						o.write(',NA,NA')
+
+					o.write(',' + str(r2g_lengths[rec]) + ',' + str(og_mean_lens[rec[-10:]]))
+
+					v = nuc_comp[rec]
+					gcs = [str(v.gcOverall), str(v.gc1), str(v.gc2), str(v.gc3), str(v.gc4F)]
+					ENc = [str(v.expENc), str(v.obsENc_6F), str(v.obsENc_No6F), str(v.SunENc_6F),str(v.SunENc_No6F)]
+					o.write(',' + ','.join([str(v.amb_cdn)] + gcs + ENc))
+
+					o.write(',' + str(aa_comp[rec]['FYMINK']) + ',' + str(aa_comp[rec]['GARP']) + ',' + str(aa_comp[rec]['Other']) + '\n')
+
+
+def per_tax(args, nuc_comp, aa_comp, all_transcripts, r2g_lengths, gcodes):
+
+	taxa = list(dict.fromkeys([seq[:10] for seq in nuc_comp]))
+
+	with open(args.input + '/PerTaxonSummary.csv', 'w') as o:
+		o.write('Taxon,TranscriptsInput,Median_GCTranscripts,IQR_GCTranscripts,Median_LenTranscripts,IRQ_LenTranscripts,SeqsR2G,OGsR2G,Median_GC3R2G,IQR_GC3R2G,Median_ENcR2G,IQR_ENcR2G,Median_LenR2G,IQR_LenR2G,GeneticCode\n')
+
+		for taxon in taxa:
+			o.write(taxon)
+
+			transcripts = [all_transcripts[seq][1].upper() for seq in all_transcripts if all_transcripts[seq][0] == taxon]
+			o.write(',' + str(len(transcripts)))
+
+			transcript_gcs = []
+			for transcript in transcripts:
+				transcript_gcs.append((transcript.count('G') + transcript.count('C'))/len(transcript))
+
+			transcript_gcs = sorted(transcript_gcs)
+			o.write(',' + str(transcript_gcs[floor(len(transcripts)*0.5)]))
+			o.write(',' + str(transcript_gcs[floor(len(transcripts)*0.75)] - transcript_gcs[floor(len(transcripts)*0.25)]))
+
+			transcript_lens = sorted([len(transcript) for transcript in transcripts])
+			o.write(',' + str(transcript_lens[floor(len(transcripts)*0.5)]))
+			o.write(',' + str(transcript_lens[floor(len(transcripts)*0.75)] - transcript_lens[floor(len(transcripts)*0.25)]))
+
+			r2g_ntds = [nuc_comp[seq] for seq in nuc_comp if seq[:10] == taxon]
+			o.write(',' + str(len(r2g_ntds)))
+			r2g_ogs = list(dict.fromkeys([seq[-10:] for seq in nuc_comp if seq[:10] == taxon]))
+			o.write(',' + str(len(r2g_ogs)))
+
+			r2g_gc3s = sorted([seq.gc4F for seq in r2g_ntds])
+			o.write(',' + str(r2g_gc3s[floor(len(r2g_ntds)*0.5)]))
+			o.write(',' + str(r2g_gc3s[floor(len(r2g_gc3s)*0.75)] - r2g_gc3s[floor(len(r2g_gc3s)*0.25)]))
+
+			r2g_encs = sorted([seq.obsENc_6F for seq in r2g_ntds])
+			o.write(',' + str(r2g_encs[floor(len(r2g_encs)*0.5)]))
+			o.write(',' + str(r2g_encs[floor(len(r2g_encs)*0.75)] - r2g_encs[floor(len(r2g_encs)*0.25)]))
+
+			tax_r2g_lens = sorted([r2g_lengths[seq] for seq in r2g_lengths if seq[:10] == taxon])
+			o.write(',' + str(tax_r2g_lens[floor(len(tax_r2g_lens)*0.5)]))
+			o.write(',' + str(tax_r2g_lens[floor(len(tax_r2g_lens)*0.75)] - tax_r2g_lens[floor(len(tax_r2g_lens)*0.25)]))
+
+			o.write(',' + gcodes[taxon] + '\n')
+
+
+def r2g_jf(args, nuc_comp, gcodes):
+
+	#Q: should there be an maximum IQR cutoff at which we do NOT produce a file here?
+
+	if not os.path.isdir(args.input + '/ReadyToGo/ReadyToGo_NTD_JF'):
+		os.mkdir(args.input + '/ReadyToGo/ReadyToGo_NTD_JF')
+
+	if not os.path.isdir(args.input + '/ReadyToGo/ReadyToGo_AA_JF'):
+		os.mkdir(args.input + '/ReadyToGo/ReadyToGo_AA_JF')
+
+	for file in os.listdir(args.input + '/ReadyToGo/ReadyToGo_NTD'):
+		if file.endswith('.fasta') and file[:10] in gcodes:
+			taxon = file[:10]
+
+			r2g_ntds = [nuc_comp[seq] for seq in nuc_comp if seq[:10] == taxon]
+			r2g_gc3s = sorted([seq.gc4F for seq in r2g_ntds])
+
+			with open(args.input + '/ReadyToGo/ReadyToGo_NTD_JF/' + file.replace('.fasta', '.JF.fasta'), 'w') as o:
+				for rec in SeqIO.parse(args.input + '/ReadyToGo/ReadyToGo_NTD/' + file, 'fasta'):
+					if nuc_comp[rec.id].gc4F > r2g_gc3s[floor(len(r2g_gc3s)*0.25)] and nuc_comp[rec.id].gc4F < r2g_gc3s[floor(len(r2g_gc3s)*0.75)]:
+						o.write('>' + rec.id + '\n' + str(rec.seq) + '\n\n')
+
+			with open(args.input + '/ReadyToGo/ReadyToGo_AA_JF/' + file.replace('.fasta', '.JF.fasta').replace('NTD', 'AA'), 'w') as o:
+				for rec in SeqIO.parse(args.input + '/ReadyToGo/ReadyToGo_AA/' + file.replace('NTD', 'AA'), 'fasta'):
+					if nuc_comp[rec.id].gc4F > r2g_gc3s[floor(len(r2g_gc3s)*0.25)] and nuc_comp[rec.id].gc4F < r2g_gc3s[floor(len(r2g_gc3s)*0.75)]:
+						o.write('>' + rec.id + '\n' + str(rec.seq) + '\n\n')
+
+
+def plot_jf(args, nuc_comp):
+
+	if not os.path.isdir(args.input + '/GC3xENc_Plots'):
+		os.mkdir(args.input + '/GC3xENc_Plots')
+
+	taxa = list(dict.fromkeys([rec[:10] for rec in nuc_comp]))
+
+	gc3_null = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]
+	enc_null = [31, 31.5958, 32.2032, 32.8221, 33.4525, 34.0942, 34.7471, 35.411, 36.0856, 36.7707, 37.4659, 38.1707, 38.8847, 39.6074, 40.3381, 41.0762, 41.8208, 42.5712, 43.3264, 44.0854, 44.8471, 45.6102, 46.3735, 47.1355, 47.8949, 48.65, 49.3991, 50.1406, 50.8725, 51.593, 52.3, 52.9916, 53.6656, 54.32, 54.9525, 55.561, 56.1434, 56.6975, 57.2211, 57.7124, 58.1692, 58.5898, 58.9723, 59.3151, 59.6167, 59.8757, 60.0912, 60.2619, 60.3873, 60.4668, 60.5, 60.4668, 60.3873, 60.2619, 60.0912, 59.8757, 59.6167, 59.3151, 58.9723, 58.5898, 58.1692, 57.7124, 57.2211, 56.6975, 56.1434, 55.561, 54.9525, 54.32, 53.6656, 52.9916, 52.3, 51.593, 50.8725, 50.1406, 49.3991, 48.65, 47.8949, 47.1355, 46.3735, 45.6102, 44.8471, 44.0854, 43.3264, 42.5712, 41.8208, 41.0762, 40.3381, 39.6074, 38.8847, 38.1707, 37.4659, 36.7707, 36.0856, 35.411, 34.7471, 34.0942, 33.4525, 32.8221, 32.2032, 31.5958, 31]
+
+	for taxon in taxa:
+		comp_data = [(nuc_comp[rec].gc4F, nuc_comp[rec].obsENc_6F) for rec in nuc_comp if rec[:10] == taxon]
+
+		plt.figure()
+		plt.plot(np.array(gc3_null), np.array(enc_null), color = 'black', linewidth=2)
+		plt.scatter(np.array([val[0] for val in comp_data]), np.array([val[1] for val in comp_data]), s = 1)
+		plt.xlabel("GC content (3rd pos, 4-fold sites)")
+		plt.ylabel("Observed Wright ENc (6 Fold)")
+		plt.savefig(args.input + '/GC3xENc_Plots/' + taxon + '.png')
+	
+if __name__ == "__main__":
+	args = get_args()
+
+	valid_codes = ['universal', 'blepharisma', 'chilodonella', 'condylostoma', 'euplotes', 'peritrich', 'vorticella', 'mesodinium', 'tag', 'tga', 'taa', 'none']
+
+	gcodes = { }
+	if os.path.isfile(args.input + '/Intermediate/gcode_output.tsv'):
+		for line in open(args.input + '/Intermediate/gcode_output.tsv'):
+			if len(line.split('\t')) == 5 and line.split('\t')[4].strip().lower() in valid_codes:
+				gcodes.update({ line.split('\t')[0] : line.split('\t')[4].strip() })
+			elif line.split('\t')[4].strip().lower() != '':
+				print('\nInvalid genetic code assignment for taxon ' + line.split('\t')[0] + '. Skipping this taxon in script 8 (summary statistics)\n')
+	else:
+		print('\nGenetic code assignment file (Output/Intermediate/gcode_output.tsv) not found. Quitting script 8 (summary statistics).\n')
+		exit()
+
+	aa_comp, transcripts, r2g_lengths, transcript_id_corr = aa_comp_lengths(args, gcodes)
+	nuc_comp = get_nuc_comp(args, gcodes)
+
+	per_tax(args, nuc_comp, aa_comp, transcripts, r2g_lengths, gcodes)
+	per_seq(args, nuc_comp, aa_comp, transcripts, r2g_lengths, transcript_id_corr)
+
+	if args.r2g_jf:
+		r2g_jf(args, nuc_comp, gcodes)
+
+	plot_jf(args, nuc_comp)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/PTL1/Transcriptomes/Scripts/CUB.py
+++ b/PTL1/Transcriptomes/Scripts/CUB.py
@ -0,0 +1,523 @@
+#!/usr/bin/env python3
+# coding=utf-8
+
+'''Aim of this script is to generate lots of codon usage statistics to aid in
+identifying useful characteristics for de novo ORF calling'''
+
+# Author: Xyrus Maurer-Alcalá
+# Contact: maurerax@gmail.com or xyrus.maurer-alcala@izb.unibe.ch
+# Last Modified: 2020-09-17
+# usage: python CUB.py
+
+# Dependencies:
+# Python3, numpy, BioPython
+
+import os
+import re
+import sys
+#import matplotlib.pyplot as plt
+import numpy as np
+#import seaborn as sns
+
+from Bio import SeqIO
+from Bio.Seq import Seq
+from Bio.SeqUtils import GC
+
+
+class CalcCUB:
+    """
+    Returns the Effective Number of Codons used (observed and expected)
+    following the equations originally from Wright 1990.
+    """
+    def expWrightENc(gc3):
+        # Calculates the expected ENc from a sequence's GC3 under Wright 1990
+        if gc3 > 1:
+            # If GC3 looks as though it is > 1 (e.g. 100%), converts to a float ≤ 1.
+            # Calculations expect a value between 0 and 1
+            gc3 = gc3/100
+        exp_enc = 2+gc3+(29/((gc3**2)+(1-gc3)**2))
+        return round(exp_enc, 4)
+
+    def nullENcGC3():
+        # Calculates the expected ENc from the null distribution of GC3
+        # values (0, 100% GC)
+        null = [CalcCUB.expWrightENc(n) for n in np.arange(0,.51,0.01)]
+        null += null[:-1][::-1]
+        return [str(i)+'\t'+str(j) for i, j in zip([n for n in range(0, 101)],null)]
+
+
+    def calcWrightENc(cdnTable):
+        # Follows Wright's (1990) calculations for determining ENc scores.
+
+        def faCalcWright(aa_counts):
+            # Returns the codon homozygosity (fa) for a given "type" of AA (e.g.
+            # 2-fold degeneracy).
+            counts = [i[2] for i in aa_counts]
+            # n_aa --> number of this particular AA
+            n_aa = sum(counts)
+            # fa --> codon homozygosity
+            try:
+                fa = (((n_aa*sum([(i/float(n_aa))**2 for i in counts]))-1)/(n_aa-1))
+            except:
+                fa = 0
+            return fa
+
+        def ENcWright_by_Degen(fa_data):
+            # Same as used in Wright 1990, averages the homozygosity across all codons
+            # of a given class (e.g. 2-fold degeneracy)
+
+            # Codons without any degeneracy (e.g. ATG == M) have 100% homozygosity
+            # and provide a "base" for the ENc score
+            enc = 2
+            for k, v in fa_data.items():
+                non_zero_vals, non_zero_sum = len([i for i in v if i != 0]), sum([i for i in v if i != 0])
+                try:
+                    f_aa = non_zero_sum/non_zero_vals
+                except:
+                    f_aa = 1
+                enc += k/f_aa
+            return enc
+
+        # Determines the number of degenerate groups to use (i.e. whether 6-Fold
+        # degeneracy is present).
+        degen_cdns = {}
+        for k, v in cdnTable.items():
+            if v[1] not in degen_cdns.keys():
+                degen_cdns[v[1]] = [v[0]]
+            else:
+                if v[0] not in degen_cdns[v[1]]:
+                    degen_cdns[v[1]] += [v[0]]
+
+        # Calculates codon homozygosity (fa) for each amino acid. Groups the
+        # resulting values based on the amino acids degeneracy (e.g. 'two-fold').
+        fa_cdns = {len(v):[] for k, v in degen_cdns.items() if 'one' not in k}
+
+        for k, v in degen_cdns.items():
+            # Skip codons lacking degeneracy
+            if 'one' in k:
+                continue
+
+            for aa in v:
+                aa_counts = [cdnTable[k] for k in cdnTable.keys() if cdnTable[k][0] == aa]
+                fa_cdns[len(v)] += [faCalcWright(aa_counts)]
+        enc_val = min(61, round(ENcWright_by_Degen(fa_cdns),4))
+        return enc_val
+
+    def SunEq5(cdnTable):
+        def calcFcf(aa_counts):
+            counts = [i[2] for i in aa_counts]
+            pseudocounts = [i+1 for i in counts]
+            na = sum(pseudocounts)
+            fcf = sum([(i/float(na))**2 for i in pseudocounts]), sum(pseudocounts)
+            return fcf
+
+        ENcWeightedPsuedo = 0
+        degen_cdns = {}
+
+        for k, v in cdnTable.items():
+            if v[1] == 'none':
+                continue
+            if v[1] not in degen_cdns.keys():
+                degen_cdns[v[1]] = [v[0]]
+            else:
+                if v[0] not in degen_cdns[v[1]]:
+                    degen_cdns[v[1]] += [v[0]]
+        for k, v in degen_cdns.items():
+            fcf_nc = []
+            for aa in v:
+                aa_counts = [cdnTable[k] for k in cdnTable.keys() if cdnTable[k][0] == aa]
+                fcf_nc.append(calcFcf(aa_counts))
+            weightedENc = (len(fcf_nc) /
+                (sum([i[0]*i[1] for i in fcf_nc]) /
+                sum([i[1] for i in fcf_nc])))
+            ENcWeightedPsuedo += weightedENc
+        return round(ENcWeightedPsuedo,4)
+
+    def calcRCSU(cdnTbl):
+        rscu = {k:[v[0]] for k, v in cdnTbl.items() if v[0].isalpha()}
+        for k, v in rscu.items():
+            try:
+                aa_info = [(key, val[-1]) for key, val in cdnTbl.items() if val[0] == v[0]]
+                aa_cnts = [x[1] for x in aa_info]
+                cdn_rscu = (cdnTbl[k][-1]*len(aa_cnts))/sum(aa_cnts)
+                rscu[k] += [str(round(cdn_rscu,4))]
+            except:
+                rscu[k] += ['0.0']
+        return rscu
+
+
+class GenUtil(object):
+    """
+    "Overflow" of functions for now. Just a precaution to make the code a
+    little cleaner/easier to manage.
+
+    This class inclues means to normalize/check the user-provided genetic code,
+    which if not valid will default to the "universal" genetic code.
+
+    Similarly, This class will return the appropriate
+    codon count table and provides a function to update its values.
+    """
+    def convertGenCode(gCode):
+        # Will interpret the user provided genetic code (gcode) and checks that
+        # it is currently available for use with the NCBI/biopython
+        # supported translation tables. Default is universal.
+        # Dictionary of the possible/functional genetic codes that are supported.
+        # --- Chilodonella and condylostoma are to come!
+        transTable = {'universal':1, 'blepharisma':4,
+            'ciliate':6, 'euplotes':10, 'mesodinium':29, 'myrionecta':29, 'peritrich':30,
+            '1':1, '4':4, '6':6, '10':10, '29':29, '30':30, 'chilo':'chilo'}
+
+        if str(gCode).lower() not in transTable:
+            print("\nWarning: Provided genetic code is not supported (yet).\n")
+            print("Currently running using the UNIVERSAL genetic code.\n\n")
+            print("Alternative genetic codes are as follows (Note: numbers "\
+                "correspond to NCBI genetic code tables):\n")
+            print('\n'.join(list(transTable.keys()))+'\n')
+            return 'Universal',1
+        else:
+            return gCode,transTable[str(gCode).lower()]
+
+    def getCDNtable(gCode):
+        # Returns the appropriate codon table to be used for the ENc calculations.
+        # Universal codon table, with 6-fold degenerate codons split
+        # into four-fold and two-fold groups.
+        universal_no6fold = {
+        	'GCT': ['A', 'four', 0], 'GCC': ['A', 'four', 0], 'GCA': ['A', 'four', 0],
+        	'GCG': ['A', 'four', 0], 'CGT': ['R', 'four', 0], 'CGC': ['R', 'four', 0],
+        	'CGG': ['R', 'four', 0], 'CGA': ['R', 'four', 0], 'AGA': ['R_', 'two', 0],
+        	'AGG': ['R_', 'two', 0], 'AAT': ['N', 'two', 0], 'AAC': ['N', 'two', 0],
+        	'GAT': ['D', 'two', 0], 'GAC': ['D', 'two', 0], 'TGT': ['C', 'two', 0],
+        	'TGC': ['C', 'two', 0], 'CAA': ['Q', 'two', 0], 'CAG': ['Q', 'two', 0],
+        	'GAA': ['E', 'two', 0], 'GAG': ['E', 'two', 0], 'GGT': ['G', 'four', 0],
+        	'GGC': ['G', 'four', 0], 'GGA': ['G', 'four', 0], 'GGG': ['G', 'four', 0],
+        	'CAT': ['H', 'two', 0], 'CAC': ['H', 'two', 0], 'ATT': ['I', 'three', 0],
+        	'ATC': ['I', 'three', 0], 'ATA': ['I', 'three', 0], 'ATG': ['M', 'one', 0],
+        	'TTA': ['L_', 'two', 0], 'TTG': ['L_', 'two', 0], 'CTT': ['L', 'four', 0],
+        	'CTC': ['L', 'four', 0], 'CTA': ['L', 'four', 0], 'CTG': ['L', 'four', 0],
+        	'AAA': ['K', 'two', 0], 'AAG': ['K', 'two', 0], 'TTT': ['F', 'two', 0],
+        	'TTC': ['F', 'two', 0], 'CCT': ['P', 'four', 0], 'CCC': ['P', 'four', 0],
+        	'CCA': ['P', 'four', 0], 'CCG': ['P', 'four', 0], 'TCT': ['S', 'four', 0],
+        	'TCC': ['S', 'four', 0], 'TCA': ['S', 'four', 0], 'TCG': ['S', 'four', 0],
+        	'AGT': ['S_', 'two', 0], 'AGC': ['S_', 'two', 0], 'ACT': ['T', 'four', 0],
+        	'ACC': ['T', 'four', 0], 'ACA': ['T', 'four', 0], 'ACG': ['T', 'four', 0],
+        	'TGG': ['W', 'one', 0], 'TAT': ['Y', 'two', 0], 'TAC': ['Y', 'two', 0],
+        	'GTT': ['V', 'four', 0], 'GTC': ['V', 'four', 0], 'GTA': ['V', 'four', 0],
+        	'GTG': ['V', 'four', 0], 'TAA': ['*', 'none', 0], 'TGA': ['*', 'none', 0],
+        	'TAG': ['*', 'none', 0], 'XXX': ['_missing', 'none', 0]}
+
+        # Universal codon table, with 6-fold degenerate codons kept
+        # whole, no splitting! Traditional Universal codon table.
+        universal_6fold = {
+        	'GCT': ['A', 'four', 0], 'GCC': ['A', 'four', 0], 'GCA': ['A', 'four', 0],
+        	'GCG': ['A', 'four', 0], 'CGT': ['R', 'six', 0], 'CGC': ['R', 'six', 0],
+        	'CGG': ['R', 'six', 0], 'CGA': ['R', 'six', 0], 'AGA': ['R', 'six', 0],
+        	'AGG': ['R', 'six', 0], 'AAT': ['N', 'two', 0], 'AAC': ['N', 'two', 0],
+        	'GAT': ['D', 'two', 0], 'GAC': ['D', 'two', 0], 'TGT': ['C', 'two', 0],
+        	'TGC': ['C', 'two', 0], 'CAA': ['Q', 'two', 0], 'CAG': ['Q', 'two', 0],
+        	'GAA': ['E', 'two', 0], 'GAG': ['E', 'two', 0], 'GGT': ['G', 'four', 0],
+        	'GGC': ['G', 'four', 0], 'GGA': ['G', 'four', 0], 'GGG': ['G', 'four', 0],
+        	'CAT': ['H', 'two', 0], 'CAC': ['H', 'two', 0], 'ATT': ['I', 'three', 0],
+        	'ATC': ['I', 'three', 0], 'ATA': ['I', 'three', 0], 'ATG': ['M', 'one', 0],
+        	'TTA': ['L', 'six', 0], 'TTG': ['L', 'six', 0], 'CTT': ['L', 'six', 0],
+        	'CTC': ['L', 'six', 0], 'CTA': ['L', 'six', 0], 'CTG': ['L', 'six', 0],
+        	'AAA': ['K', 'two', 0], 'AAG': ['K', 'two', 0], 'TTT': ['F', 'two', 0],
+        	'TTC': ['F', 'two', 0], 'CCT': ['P', 'four', 0], 'CCC': ['P', 'four', 0],
+        	'CCA': ['P', 'four', 0], 'CCG': ['P', 'four', 0], 'TCT': ['S', 'six', 0],
+        	'TCC': ['S', 'six', 0], 'TCA': ['S', 'six', 0], 'TCG': ['S', 'six', 0],
+        	'AGT': ['S', 'six', 0], 'AGC': ['S', 'six', 0], 'ACT': ['T', 'four', 0],
+        	'ACC': ['T', 'four', 0], 'ACA': ['T', 'four', 0], 'ACG': ['T', 'four', 0],
+        	'TGG': ['W', 'one', 0], 'TAT': ['Y', 'two', 0], 'TAC': ['Y', 'two', 0],
+        	'GTT': ['V', 'four', 0], 'GTC': ['V', 'four', 0], 'GTA': ['V', 'four', 0],
+        	'GTG': ['V', 'four', 0], 'TAA': ['*', 'none', 0], 'TGA': ['*', 'none', 0],
+        	'TAG': ['*', 'none', 0], 'XXX': ['_missing', 'none', 0]}
+
+        # Blepharisma (table 4) genetic code codon table, with 6-fold degenerate
+        # codons kept whole, no splitting!
+        blepharisma_6fold = {**universal_6fold,
+            'TGA': ['W', 'two', 0], 'TGG': ['W', 'two', 0],
+            'TAA': ['*', 'two', 0], 'TAG': ['*', 'two', 0]}
+
+        # Blepharisma (table 4) genetic code codon table, with 6-fold degenerate
+        # codons split into four-fold and two-fold groups.
+        blepharisma_no6fold = {**universal_no6fold,
+            'TGA': ['W', 'two', 0], 'TGG': ['W', 'two', 0],
+            'TAA': ['*', 'two', 0], 'TAG': ['*', 'two', 0]}
+
+        # Chilodonella genetic code codon table, with 6-fold degenerate
+        # codons kept whole, no splitting!
+        chilo_6fold = {**universal_6fold,
+            'CAA': ['Q', 'four', 0], 'CAG': ['Q', 'four', 0],
+            'TAA': ['*', 'one', 0], 'TAG': ['Q', 'four', 0],
+            'TGA': ['Q', 'four', 0]}
+
+        # Chilodonella genetic code codon table, with 6-fold degenerate
+        # codons split into four-fold and two-fold groups.
+        # Note that this also splits four-fold degenerate codons that OUGHT to
+        # be in "different" functional categories (e.g. CAG =/= TAG)
+        chilo_no6fold = {**universal_no6fold,
+            'TAA': ['*', 'one', 0], 'TAG': ['Q_', 'one', 0],
+            'TGA': ['Q_', 'one', 0]}
+
+        # Ciliate (table 6) genetic code codon table, with 6-fold degenerate
+        # codons kept whole, no splitting! Traditional ciliate codon table.
+        ciliate_6fold = {**universal_6fold,
+            'CAA': ['Q', 'four', 0], 'CAG': ['Q', 'four', 0],
+            'TAA': ['Q', 'four', 0], 'TAG': ['Q', 'four', 0],
+            'TGA': ['*', 'one', 0]}
+
+        # Ciliate (table 6) genetic code codon table, with 6-fold degenerate
+        # codons split into four-fold and two-fold groups.
+        # Note that this also splits four-fold degenerate codons that OUGHT to
+        # be in "different" functional categories (e.g. CAA =/= TAA)
+        ciliate_no6fold = {**universal_no6fold,
+            'TAA': ['Q_', 'two', 0], 'TAG': ['Q_', 'two', 0],
+            'TGA': ['*', 'one', 0]}
+
+        # Euplotes codon table, with 6-fold degenerate codons kept
+        # whole, no splitting! Traditional Universal codon table.
+        euplotes_6fold = {**universal_6fold,
+            'TGA': ['C', 'three', 0], 'TGT': ['C', 'three', 0],
+            'TGC': ['C', 'three', 0], 'TAA': ['*', 'two', 0],
+            'TAG': ['*', 'two',0]}
+
+        # Euplotes genetic code codon table, with 6-fold degenerate codons
+        # split into four-fold and two-fold groups.
+        euplotes_no6fold = {**universal_no6fold,
+            'TGA': ['C', 'three', 0], 'TGT': ['C', 'three', 0],
+            'TGC': ['C', 'three', 0], 'TAA': ['*', 'two', 0],
+            'TAG': ['*', 'two',0]}
+
+        # Mesodinium/Myrionecta (table 29) genetic code codon table, with 6-fold
+        # degenerate codons kept whole, no splitting! Traditional ciliate codon table.
+        mesodinium_6fold = {**universal_6fold,
+            'TAA': ['Y', 'four', 0], 'TAT': ['Y', 'four', 0],
+            'TAG': ['Y', 'four', 0], 'TAC': ['Y', 'four', 0],
+            'TGA': ['*', 'one', 0]}
+
+        # Mesodinium/Myrionecta (table 29) genetic code codon table, with 6-fold
+        # degenerate codons split into four-fold and two-fold groups.
+        mesodinium_no6fold = {**universal_no6fold,
+            'TAA': ['Y', 'four', 0], 'TAT': ['Y', 'four', 0],
+            'TAG': ['Y', 'four', 0], 'TAC': ['Y', 'four', 0],
+            'TGA': ['*', 'one', 0]}
+
+        # Peritrich (table 30) genetic code codon table, with 6-fold degenerate
+        # codons kept whole, no splitting! Traditional ciliate codon table.
+        peritrich_6fold = {**universal_6fold,
+            'GAA': ['E', 'four', 0], 'GAG': ['E', 'four', 0],
+            'TAA': ['E', 'four', 0], 'TAG': ['E', 'four', 0],
+            'TGA': ['*', 'one', 0]}
+
+        # Peritrich (table 30) genetic code codon table, with 6-fold degenerate
+        # codons split into four-fold and two-fold groups.
+        # Note that this also splits four-fold degenerate codons that OUGHT to
+        # be in "different" functional categories (e.g. CAA =/= TAA)
+        peritrich_no6fold = {**universal_no6fold,
+            'TAA': ['E_', 'two', 0], 'TAG': ['E_', 'two', 0],
+            'TGA': ['*', 'one', 0]}
+
+        cdnTableDict = {1:[universal_no6fold,universal_6fold],
+            4:[blepharisma_no6fold, blepharisma_6fold],
+            6:[ciliate_no6fold,ciliate_6fold],
+            10:[euplotes_no6fold,euplotes_6fold],
+            29:[mesodinium_no6fold,mesodinium_6fold],
+            30:[peritrich_no6fold,peritrich_6fold],
+            'chilodonella':[chilo_no6fold,chilo_6fold],
+            'chilo':[chilo_no6fold,chilo_6fold]}
+        return cdnTableDict[gCode]
+
+    def mapCdns(seq, cdnTable):
+        # Updates the codon counts for a given sequence to the respective codon
+        # count table (e.g. with or without 6-fold degeneracy).
+        codons = [seq[n:n+3] for n in range(0, len(seq)-len(seq)%3, 3)]
+        amb_cdn = 0
+        for c in codons:
+            try:
+                cdnTable[c][-1] += 1
+            except:
+                amb_cdn += 1
+        if cdnTable['TCC'][1] == 'six':
+            return cdnTable, amb_cdn
+        else:
+            return cdnTable
+
+class GCeval():
+    """
+    Returns %GC values from DNA sequences of various types.
+    """
+    def gcTotal(seq):
+        # This function returns global GC content
+        return round(GC(seq), 4)
+
+    def gc1(seq):
+        # This function return the GC content of the first position of a codon
+        return round(GC(''.join([seq[n] for n in range(0, len(seq), 3)])), 4)
+
+    def gc2(seq):
+        # This function return the GC content of the second position of a codon
+        return round(GC(''.join([seq[n] for n in
+            range(1, len(seq)-len(seq[1:]) % 3, 3)])), 4)
+
+    def gc3(seq):
+        # This function return the GC content of the third position of a codon
+        return round(GC(''.join([seq[n] for n in
+            range(2, len(seq)-len(seq[2:]) % 3, 3)])), 4)
+
+    def gc3_4F(cdnTbl):
+    #     # This function return the GC content of the third position of four-fold
+    #     # degenerate codons
+        FrFold = round(GC(''.join([k[-1]*v[-1] for k, v in cdnTbl.items() if
+            'one' not in v[1]])), 4)
+        return FrFold
+
+class SeqInfo(object):
+    """
+    Provides a means to harbor the data for each individual contig/gene in a
+    given fasta file.
+    This includes GC content (various types), Effective Number of codons
+    (ENc; again various calculations), Relative Synonymous Codon Usage (RSCU).
+    """
+    def __init__(self,seq,gcode='universal'):
+        self.ntd = str(seq)
+        self.gcode, self.transTable = GenUtil.convertGenCode(gcode)
+        # Dictionary of the GC-related functions/calculations
+        self.gcFuncs = {'gcOverall':GCeval.gcTotal,'gc1':GCeval.gc1,'gc2':GCeval.gc2,'gc3':GCeval.gc3}
+
+    def countCodons(self):
+        # Stores the different codon tables and updates their codon counts
+        cdnTbls = GenUtil.getCDNtable(self.transTable)
+        self.cdnCounts_6F,self.amb_cdn = GenUtil.mapCdns(self.ntd, cdnTbls[1])
+        self.cdnCounts_No6F = GenUtil.mapCdns(self.ntd, cdnTbls[0])
+
+    def ENcStats(self):
+        # Stores the various Effective Number of Codons calculations in the class
+        self.expENc = CalcCUB.expWrightENc(self.gc3)
+        self.obsENc_6F = CalcCUB.calcWrightENc(self.cdnCounts_6F)
+        self.obsENc_No6F = CalcCUB.calcWrightENc(self.cdnCounts_No6F)
+        self.SunENc_6F = CalcCUB.SunEq5(self.cdnCounts_6F)
+        self.SunENc_No6F = CalcCUB.SunEq5(self.cdnCounts_No6F)
+
+    def GCstats(self):
+        # Stores the various GC-stats in the class
+        for k, v in self.gcFuncs.items():
+            setattr(self,k,v(self.ntd))
+        self.gc4F = GCeval.gc3_4F(self.cdnCounts_No6F)
+
+
+    def RSCUstats(self):
+        self.rscu_No6Fold = CalcCUB.RSCU(self.cdnCounts_No6F)
+        self.rscu_6Fold = CalcCUB.RSCU(self.cdnCounts_6F)
+
+
+def prepFolders(outName):
+    if os.path.isdir(outName) == False:
+        os.mkdir(outName)
+    if os.path.isdir(outName+'/Plots') == False:
+        os.mkdir(outName+'/Plots')
+    if os.path.isdir(outName+'/SpreadSheets') == False:
+        os.mkdir(outName+'/SpreadSheets')
+
+
+def CalcRefFasta(fasta, gCode):
+    seqDB = {i.description:SeqInfo(i.seq, gCode) for i in SeqIO.parse(fasta,'fasta')}
+    GenCDNtable = {}
+    for k, v in seqDB.items():
+        v.countCodons()
+        v.GCstats()
+        v.ENcStats()
+        for k, v in v.cdnCounts_6F.items():
+            if k.isalpha() and k not in GenCDNtable .keys():
+                GenCDNtable[k] = [v[0],v[-1]]
+            else:
+                GenCDNtable[k][-1] += v[-1]
+    RSCU = CalcCUB.calcRCSU(GenCDNtable)
+    return seqDB, RSCU
+
+
+def WriteWrightOut(seqData, outName, comp):
+    if comp == False:
+        with open(outName+'/SpreadSheets/'+outName.split('/')[-1]+'.ENc.Raw.tsv','w+') as w:
+            w.write('SequenceID\tAmbiguousCodons\tGC-Overall\tGC1\tGC2\tGC3\t'
+                'GC3-Degen\tExpWrightENc\tObsWrightENc_6Fold\tObsWrightENc_No6Fold\t'
+                'ObsWeightedENc_6Fold\tObsWeightedENc_No6Fold\n')
+            for k, v in seqData.items():
+                name = [k]
+                gcs = [str(v.gcOverall),str(v.gc1),str(v.gc2),str(v.gc3),str(v.gc4F)]
+                ENc = [str(v.expENc),str(v.obsENc_6F),str(v.obsENc_No6F),
+                str(v.SunENc_6F),str(v.SunENc_No6F)]
+                w.write('\t'.join(name+[str(v.amb_cdn)]+gcs+ENc)+'\n')
+    else:
+        with open(outName+'/SpreadSheets/'+outName.split('/')[-1]+'.CompTrans.ENc.Raw.tsv','w+') as w:
+            w.write('SequenceID\tAmbiguousCodons\tGC-Overall\tGC1\tGC2\tGC3\t'
+                'GC3-Degen\tExpWrightENc\tObsWrightENc_6Fold\tObsWrightENc_No6Fold\t'
+                'ObsWeightedENc_6Fold\tObsWeightedENc_No6Fold\n')
+            for k, v in seqData.items():
+                name = [k]
+                gcs = [str(v.gcOverall),str(v.gc1),str(v.gc2),str(v.gc3),str(v.gc4F)]
+                ENc = [str(v.expENc),str(v.obsENc_6F),str(v.obsENc_No6F),
+                str(v.SunENc_6F),str(v.SunENc_No6F)]
+                w.write('\t'.join(name+[str(v.amb_cdn)]+gcs+ENc)+'\n')
+
+
+def getCompFasta(fasta, gCode):
+    print(fasta)
+    stopCDNs = {'1':['TAA','TAG','TGA'], '4':['TAA','TAG'], '6':['TGA'], '10':['TAA','TAG'],
+    '29':['TGA'], '30':['TGA'], 'universal':['TAA','TAG','TGA'], 'blepharisma':['TAA','TAG'],
+    'ciliate':['TGA'],'euplotes':['TAA','TAG'], 'mesodinium':['TGA'], 'peritrich':['TGA'],
+    'chilo':['TAA']}
+    if gCode.lower() not in stopCDNs.keys():
+        stops = stopCDNs['1']
+    else:
+        stops = stopCDNs[gCode]
+
+    with open(fasta.replace('.fasta','.Comp.fasta'),'w+') as w:
+        for i in SeqIO.parse(fasta,'fasta'):
+        	#if str(i.seq).upper().startswith('ATG') and str(i.seq).upper()[-3:] in stops:
+        	#if str(i.seq).upper()[-3:] in stops:
+            if len(i.seq) % 3 == 0:
+                w.write('>'+i.description+'\n'+str(i.seq)+'\n')
+
+    return fasta.replace('.fasta','.Comp.fasta')
+
+def WriteNullENcOut(outName):
+    with open(outName+'/SpreadSheets/'+outName.split('/')[-1]+'.ENc.Null.tsv','w+') as w:
+        w.write('GC3\tENc\n')
+        w.write('\n'.join(CalcCUB.nullENcGC3()))
+
+
+def WriteRSCUtbl(RSCUtbl, outName):
+    with open(outName+'/SpreadSheets/'+outName.split('/')[-1]+'.RSCU.tsv','w+') as w:
+        w.write('Codon\tAmino Acid\tRSCU\n')
+        for k,v in RSCUtbl.items():
+            w.write(k+'\t'+'\t'.join(v)+'\n')
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print('\nUsage:\n')
+        print('python CUB.py MyNtds.fasta MyTaxon genetic_code\n')
+        print('\nGenetic Codes:\n')
+        gcd = ['1', '4', '6', '10', '29', '30', 'universal', 'blepharisma',
+        'ciliate','euplotes', 'mesodinium', 'peritrich','chilo']
+        print('\n'.join(gcd)+'\n')
+        sys.exit()
+    fasta = sys.argv[1]
+    try:
+        outName = sys.argv[2]
+    except:
+        print('Missing an output name. Include one, then run again!')
+        sys.exit()
+    try:
+        gCode = sys.argv[3]
+    except:
+        gCode = 'universal'
+    compFasta = getCompFasta(fasta, gCode)
+    prepFolders(outName)
+    fastaDataRaw, RSCUtbl = CalcRefFasta(fasta, gCode)
+    fastaDataComp, RSCUtbl = CalcRefFasta(compFasta, gCode)
+    WriteWrightOut(fastaDataRaw, outName, comp=False)
+    WriteWrightOut(fastaDataComp, outName, comp=True)
+    WriteNullENcOut(outName)
+    WriteRSCUtbl(RSCUtbl, outName)
+    os.system('cp '+fasta+' '+outName+'/')
+    os.system('mv '+compFasta+' '+outName+'/')
--- a/PTL1/Transcriptomes/Scripts/PTL1.2831.out
+++ b/PTL1/Transcriptomes/Scripts/PTL1.2831.out
--- a/PTL1/Transcriptomes/Scripts/UnexpexctedShortStuffBlameXyrus.txt
+++ b/PTL1/Transcriptomes/Scripts/UnexpexctedShortStuffBlameXyrus.txt
@ -0,0 +1,143 @@
+Contig_204_Len5938_Cov7976_P_OG6_102770
+Contig_3323_Len1201_Cov3_U_OG6_148486
+Contig_313_Len4904_Cov3_E_OG6_101730
+Contig_3269_Len1219_Cov2_U_OG6_102957
+Contig_9180_Len427_Cov1_P_OG6_100774
+Contig_4698_Len874_Cov1_U_OG6_111564
+Contig_6408_Len634_Cov2_U_OG6_102446
+Contig_667_Len3323_Cov11_U_OG6_116443
+Contig_2271_Len1620_Cov1_E_OG6_101536
+Contig_4504_Len909_Cov11_P_OG6_101629
+Contig_333_Len4757_Cov12241_P_OG6_102770
+Contig_811_Len2973_Cov4_U_OG6_116443
+Contig_4707_Len871_Cov7779_P_OG6_102770
+Contig_5488_Len751_Cov2_U_OG6_103872
+Contig_317_Len4879_Cov6_E_OG6_107629
+Contig_5624_Len732_Cov1_U_OG6_100648
+Contig_1895_Len1829_Cov9_U_OG6_107629
+Contig_6636_Len610_Cov1_P_OG6_100598
+Contig_4338_Len944_Cov2_U_OG6_118109
+Contig_3061_Len1290_Cov2_P_OG6_105259
+Contig_786_Len3043_Cov3_E_OG6_101725
+Contig_4267_Len958_Cov2_U_OG6_107629
+Contig_157_Len6644_Cov11_E_OG6_102358
+Contig_24_Len10475_Cov4_E_OG6_106943
+Contig_145_Len6822_Cov9_U_OG6_145244
+Contig_536_Len3701_Cov3_E_OG6_103917
+Contig_421_Len4239_Cov31_P_OG6_102198
+Contig_2102_Len1703_Cov4_E_OG6_102675
+Contig_40_Len9764_Cov77_E_OG6_105576
+Contig_4490_Len912_Cov1_P_OG6_106264
+Contig_6491_Len626_Cov2_P_OG6_100328
+Contig_7231_Len557_Cov1_U_OG6_107629
+Contig_756_Len3094_Cov12165_P_OG6_102770
+Contig_3009_Len1307_Cov13_U_OG6_116443
+Contig_4049_Len1006_Cov3_U_OG6_103879
+Contig_426_Len4213_Cov26_U_OG6_108411
+Contig_5227_Len788_Cov1_P_OG6_100289
+Contig_701_Len3225_Cov2_P_OG6_106492
+Contig_4905_Len840_Cov2_P_OG6_115036
+Contig_28_Len10287_Cov699_E_OG6_100777
+Contig_117_Len7490_Cov12_E_OG6_102035
+Contig_748_Len3109_Cov7889_P_OG6_102770
+Contig_3013_Len1306_Cov6_U_OG6_116443
+Contig_5988_Len682_Cov2_U_OG6_107629
+Contig_2349_Len1580_Cov1_P_OG6_124813
+Contig_349_Len4667_Cov4_U_OG6_116443
+Contig_841_Len2909_Cov21_E_OG6_107629
+Contig_8720_Len444_Cov1_P_OG6_115892
+Contig_2350_Len1578_Cov8_U_OG6_107629
+Contig_7003_Len576_Cov3_P_OG6_115725
+Contig_8605_Len452_Cov1_U_OG6_107629
+Contig_3057_Len1292_Cov6_U_OG6_102073
+Contig_168_Len6512_Cov4_E_OG6_102446
+Contig_3018_Len1303_Cov9938_P_OG6_102770
+Contig_3148_Len1260_Cov6057_P_OG6_102770
+Contig_2866_Len1358_Cov8209_P_OG6_102770
+Contig_1129_Len2472_Cov5_E_OG6_100425
+Contig_4087_Len998_Cov2_E_OG6_102109
+Contig_9116_Len429_Cov2_E_OG6_100420
+Contig_2878_Len1356_Cov2_E_OG6_102565
+Contig_1436_Len2164_Cov2_U_OG6_121413
+Contig_125_Len7242_Cov67_U_OG6_105015
+Contig_4250_Len962_Cov4_U_OG6_107629
+Contig_9689_Len411_Cov1_P_OG6_110848
+Contig_488_Len3892_Cov12068_P_OG6_102770
+Contig_5963_Len686_Cov1_P_OG6_101423
+Contig_5913_Len692_Cov3_U_OG6_116443
+Contig_1768_Len1907_Cov2_P_OG6_106950
+Contig_1477_Len2126_Cov2_P_OG6_104145
+Contig_7577_Len529_Cov1_E_OG6_116443
+Contig_9982_Len402_Cov2_P_OG6_104843
+Contig_373_Len4515_Cov26_U_OG6_108411
+Contig_4656_Len883_Cov2_E_OG6_102774
+Contig_3145_Len1262_Cov2_P_OG6_110223
+Contig_5222_Len788_Cov2_E_OG6_116443
+Contig_6139_Len665_Cov1_P_OG6_129320
+Contig_987_Len2641_Cov4_E_OG6_103026
+Contig_213_Len5837_Cov11847_P_OG6_102770
+Contig_5568_Len739_Cov1_E_OG6_102109
+Contig_3238_Len1232_Cov5_U_OG6_116443
+Contig_1549_Len2061_Cov3_E_OG6_102774
+Contig_3274_Len1217_Cov2_E_OG6_107629
+Contig_6478_Len627_Cov4_U_OG6_107219
+Contig_1710_Len1945_Cov2_P_OG6_101690
+Contig_172_Len6427_Cov11653_P_OG6_102770
+Contig_2759_Len1398_Cov2_U_OG6_102446
+Contig_4836_Len848_Cov10167_E_OG6_101051
+Contig_665_Len3326_Cov6_E_OG6_103961
+Contig_7896_Len504_Cov1_U_OG6_116443
+Contig_6369_Len640_Cov2_U_OG6_107629
+Contig_218_Len5773_Cov12260_P_OG6_102770
+Contig_250_Len5432_Cov11_U_OG6_102109
+Contig_1503_Len2102_Cov2_P_OG6_104631
+Contig_1781_Len1898_Cov4_E_OG6_103110
+Contig_7718_Len517_Cov4_U_OG6_107629
+Contig_2524_Len1486_Cov10283_U_OG6_102770
+Contig_207_Len5911_Cov4_P_OG6_104171
+Contig_1989_Len1770_Cov9572_P_OG6_102770
+Contig_6893_Len587_Cov1_P_OG6_103083
+Contig_1403_Len2189_Cov6_E_OG6_100617
+Contig_4699_Len873_Cov2_U_OG6_102109
+Contig_7520_Len533_Cov1_P_OG6_103438
+Contig_115_Len7578_Cov15_U_OG6_107629
+Contig_1732_Len1931_Cov2_E_OG6_101536
+Contig_6482_Len627_Cov1_P_OG6_101673
+Contig_849_Len2894_Cov3_U_OG6_116443
+Contig_3913_Len1037_Cov5_U_OG6_107629
+Contig_9808_Len408_Cov1_P_OG6_113435
+Contig_439_Len4172_Cov3_E_OG6_107629
+Contig_749_Len3109_Cov7_U_OG6_101143
+Contig_376_Len4497_Cov28_E_OG6_116443
+Contig_1579_Len2041_Cov3_E_OG6_103026
+Contig_787_Len3042_Cov8_U_OG6_101143
+Contig_4099_Len996_Cov1_U_OG6_105091
+Contig_8084_Len489_Cov1_P_OG6_101427
+Contig_2431_Len1534_Cov37_E_OG6_101143
+Contig_110_Len7741_Cov24_U_OG6_101172
+Contig_4412_Len930_Cov1_P_OG6_101345
+Contig_7241_Len556_Cov1_E_OG6_107629
+Contig_574_Len3608_Cov16_P_OG6_107278
+Contig_739_Len3127_Cov4_E_OG6_102109
+Contig_858_Len2877_Cov9661_P_OG6_102770
+Contig_3857_Len1055_Cov1_U_OG6_100774
+Contig_7308_Len551_Cov1_P_OG6_100463
+Contig_61_Len9077_Cov26_E_OG6_101172
+Contig_5256_Len784_Cov1_U_OG6_107629
+Contig_6823_Len592_Cov1_E_OG6_100210
+Contig_2634_Len1443_Cov2_U_OG6_102109
+Contig_2511_Len1491_Cov2_P_OG6_118575
+Contig_3656_Len1108_Cov2_P_OG6_106027
+Contig_7423_Len542_Cov1_U_OG6_132867
+Contig_889_Len2819_Cov18_U_OG6_116443
+Contig_5665_Len726_Cov1_E_OG6_100769
+Contig_3821_Len1064_Cov2_P_OG6_100469
+Contig_6635_Len610_Cov1_P_OG6_100578
+Contig_573_Len3614_Cov53_U_OG6_102121
+Contig_7211_Len558_Cov3_U_OG6_101143
+Contig_1293_Len2294_Cov87_E_OG6_106711
+Contig_131_Len7117_Cov11286_P_OG6_102770
+Contig_5544_Len742_Cov1_P_OG6_111848
+Contig_806_Len2987_Cov21_E_OG6_103192
+Contig_219_Len5773_Cov12245_P_OG6_102770
+Contig_136_Len7040_Cov8_U_OG6_100080
--- a/PTL1/Transcriptomes/Scripts/pycache/CUB.cpython-37.pyc
+++ b/PTL1/Transcriptomes/Scripts/pycache/CUB.cpython-37.pyc
--- a/PTL1/Transcriptomes/Scripts/pycache/CUB.cpython-38.pyc
+++ b/PTL1/Transcriptomes/Scripts/pycache/CUB.cpython-38.pyc
--- a/PTL1/Transcriptomes/Scripts/pycache/CUB.cpython-39.pyc
+++ b/PTL1/Transcriptomes/Scripts/pycache/CUB.cpython-39.pyc
--- a/PTL1/Transcriptomes/Scripts/scratch.py
+++ b/PTL1/Transcriptomes/Scripts/scratch.py
@ -0,0 +1,7 @@
+import re
+
+
+og = 'OG6_123456'
+
+ogv = og.split(re.split('OG.{1}_', og)[1])[0][-4:]
+print(ogv, re.split('OG.{1}_', og)[1])
--- a/PTL1/Transcriptomes/Scripts/wrapper.py
+++ b/PTL1/Transcriptomes/Scripts/wrapper.py
@ -0,0 +1,261 @@
+#Dependencies
+import os, sys, re
+import shutil
+import argparse
+
+
+def get_args():
+
+	parser = argparse.ArgumentParser(
+                prog = 'PhyloToL v6.0 Part 1 for Transcriptomes',
+                description = "Updated January 19th, 2023 by Auden Cote-L'Heureux. Link to GitHub: https://github.com/AudenCote/PhyloToL_v6.0"
+                )
+
+	parser.add_argument('-s', '--script', default = -1, type = int, choices = { 1, 2, 3, 4, 5, 6, 7 }, help = 'Script to run if you are only running one script')
+	parser.add_argument('-n', '--conspecific_names', type = str, help = 'A .txt or .tsv file with two tab-separated columns; the first should have 10 digit codes, the second species or other identifying names. This is used to determine which sequences to remove (only between "species") in cross-plate contamination assessment')
+	parser.add_argument('-1', '--first_script', default = -1, type = int, choices = { 1, 2, 3, 4, 5, 6 }, help = 'First script to run')
+	parser.add_argument('-2', '--last_script', default = -1, type = int, choices = { 2, 3, 4, 5, 6, 7 }, help = 'First script to run')
+	parser.add_argument('-a', '--assembled_transcripts', type = str, help = 'Path to a folder of assembled transcripts, assembled by rnaSPAdes. Each assembled transcript file name should start with a unique 10 digit code, and end in "_assembledTranscripts.fasta", E.g. Op_me_hsap_assembledTranscripts.fasta')
+	parser.add_argument('-o', '--output', default = '../', type = str, help = 'An "Output" folder will be created at this directory to contain all output files. By default this folder will be created at the parent directory of the Scripts folder')
+	parser.add_argument('-x', '--xplate_contam', action = 'store_true', help = 'Run cross-plate contamination removal (includes all files)')
+	parser.add_argument('-g', '--genetic_code', type = str, help = 'If all of your taxa use the same genetic code, you may enter it here (to be used in script 5). Alternatively, if you need to use a variety of genetic codes but know which codes to use, you may fill give here the path to a .txt or .tsv with two tab-separated columns, the first with the ten-digit codes and the second column with the corresponding genetics codes. Otherwise, stop at script 4 and fill in "gcode_output.tsv" before running script 5')
+	parser.add_argument('-min', '--minlen', type = int, default = 200, help = 'Minimum transcript length')
+	parser.add_argument('-max', '--maxlen', type = int, default = 12000, help = 'Maximum transcript length')
+	parser.add_argument('-d', '--databases', type = str, default = '../Databases', help = 'Path to databases folder')
+
+	return parser.parse_args()
+
+		
+#running the first script on all the bare files
+def script_one(args, ten_digit_codes):
+	for file in os.listdir(args.assembled_transcripts):
+		if file[10:] == '_assembledTranscripts.fasta' and file[:10] in ten_digit_codes:
+			os.system('python 1a_ContigFiltStats.py --input_file ' + args.assembled_transcripts + '/' + file + ' --output_file ' + args.output + '/Output/' + file[:10] + ' --minLen ' + str(args.minlen) + ' --maxLen ' + str(args.maxlen) + ' --spades') #SPADES ARGUMENT??
+
+	if args.xplate_contam:
+		os.system('python 1b_XSpeciesContaminationAgnes.py ' + args.output + '/Output/XlaneBleeding ' + str(args.minlen) + ' ' + args.conspecific_names)
+
+
+def script_two(args):
+
+	for folder in os.listdir(args.output + '/Output/'):
+		if os.path.isfile(args.output + '/Output/' + folder + '/SizeFiltered/' + folder + '.' + str(args.minlen) + 'bp.fasta'):
+			os.system('python 2a_remove_rRNA.py --input_file ' + args.output + '/Output/' + folder + '/SizeFiltered/' + folder + '.' + str(args.minlen) + 'bp.fasta --databases ' + args.databases)
+
+			fasta_withBact = args.output + '/Output/' + folder + '/' + folder + '_NorRNAseqs.fasta'
+			os.system('python 2b_remove_Bact.py --input_file ' + fasta_withBact + ' --databases ' + args.databases)
+
+#NEED TO SORT OUT FILE NAMES ETC. BELOW HERE
+
+#running the third script
+def script_three(args):
+
+	for folder in os.listdir(args.output + '/Output'):
+		if os.path.isfile(args.output + '/Output/' + folder + '/' + folder + '_WTA_EPU.fasta'):
+			os.system('python 3_CountOGsDiamond.py --input_file ' + args.output + '/Output/' + folder + '/' + folder + '_WTA_EPU.fasta --evalue 1e-15 --databases ' + args.databases)
+
+
+#running the fourth script
+def script_four(args):			
+	for folder in os.listdir(args.output + '/Output'):
+		if os.path.isfile(args.output + '/Output/' + folder + '/' + folder + '_WTA_EPU.Renamed.fasta'):
+				os.system('python 4_InFrameStopFreq.py --input_file ' + args.output + '/Output/' + folder + '/' + folder + '_WTA_EPU.Renamed.fasta --databases ' + args.databases)
+				
+	#putting all of the gcode summaries produced by fourth script into a spreadsheet
+	gcode_info = []
+	for folder in os.listdir(args.output + '/Output'):
+		if os.path.isfile(args.output + '/Output/' + folder + '/' + folder + '_WTA_EPU.Renamed_StopCodonStats.tsv'):
+				with open(args.output + '/Output/' + folder + '/' + folder + '_WTA_EPU.Renamed_StopCodonStats.tsv') as f:
+					for line in f:
+						line_sep = line.split('\t')
+						if line_sep[0] == 'Summary':
+							gcode_info.append([folder, line_sep[6], line_sep[7], line_sep[8][:-1]])
+
+	valid_codes = ['bleph','blepharisma','chilo','chilodonella','condy', 'condylostoma','none','eup','euplotes','peritrich','vorticella','ciliate','universal','taa','tag','tga','mesodinium']
+	
+	stop = False
+	gcode_file = { }
+	if args.genetic_code.endswith('.txt') or args.genetic_code.endswith('.tsv'):
+		if os.path.isfile(args.genetic_code):
+			for line in open(args.genetic_code):
+				try:
+					if line.split('\t')[1].strip().lower() in valid_codes:
+						gcode_file.update({ line.split('\t')[0] : line.split('\t')[1].strip() })
+					else:
+						print('Genetic code ERROR -- ' + line.split('\t')[1].strip() + ' is not a valid genetic code. Please fill out the "gcode_output.tsv" file and continue with script 5.')
+				except IndexError:
+					print('\nGenetic code ERROR -- it looks like you tried to enter a .txt/.tsv file, but it is improperly formatted. Stopping after script 4; you may fill out the file gcode_output.tsv and continue with script 5.\n')
+					stop = True
+		else:
+			print('\nGenetic code ERROR -- it looks like you tried to enter a .txt/.tsv file, but it could not be found. Stopping after script 4; you may fill out the file gcode_output.tsv and continue with script 5.\n')
+			stop = True
+
+	with open(args.output + '/Output/gcode_output.tsv', 'w') as g:
+		g.writelines('10 Digit Code\tIn-frame TAG Density\tIn-frame TGA Density\tIn-frame TAA Density\tGenetic Code\n')
+		for row in gcode_info:
+			if args.genetic_code == None:
+				g.writelines(row[0] + '\t' + row[1] + '\t' + row[2] + '\t' + row[3] + '\t\n')
+			elif args.genetic_code.lower() in valid_codes:
+				g.writelines(row[0] + '\t' + row[1] + '\t' + row[2] + '\t' + row[3] + '\t' + args.genetic_code + '\n')
+			elif args.genetic_code.endswith('.txt') or args.genetic_code.endswith('.tsv'):
+				try:
+					g.writelines(row[0] + '\t' + row[1] + '\t' + row[2] + '\t' + row[3]+ '\t' + gcode_file[row[0]] + '\n')
+				except KeyError:
+					g.writelines(row[0] + '\t' + row[1] + '\t' + row[2] + '\t' + row[3]+ '\t' + 'Universal' + '\n')
+					print('\nDefaulting to Universal genetic code for taxon ' + row[0] + '\n')
+					#print('\nGenetic code ERROR -- it looks like you tried to enter a .txt/.tsv file, but a taxon is missing. Stopping after script 4; you may fill out the file gcode_output.tsv and continue with script 5.\n')
+					#stop = True
+			else:	
+				stop = True
+
+	if stop or args.genetic_code == None:
+		print('\nStopping after script 4 because genetic code information is incomplete. Please fill out the file "gcode_output.tsv" and continue with script 5.\n')
+		exit()
+
+
+def script_five(args):
+	
+	valid_codes = ['bleph','blepharisma','chilo','chilodonella','condy', 'condylostoma','none','eup','euplotes','peritrich','vorticella','ciliate','universal','taa','tag','tga','mesodinium']
+	
+	lines = [line.strip().split('\t') for line in open(args.output + '/Output/gcode_output.tsv', 'r')]
+	with open(args.output + '/Output/gcode_output.tsv', 'r') as g:
+		for folder in os.listdir(args.output + '/Output'):
+			if os.path.isfile(args.output + '/Output/' + folder + '/' + folder + '_WTA_EPU.Renamed.fasta'):
+				for line in lines:
+					if line[0] == folder and line[-1].lower() in valid_codes:
+						os.system('python 5_GCodeTranslate.py --input_file ' + args.output + '/Output/' + folder + '/' + folder + '_WTA_EPU.Renamed.fasta --genetic_code ' + line[-1])
+					elif line[-1].lower() not in valid_codes and line[-1] != 'Genetic Code':
+						print('\n' + line[-1] + ' is not a valid genetic code. Skipping taxon ' + folder + '.\n')
+
+
+def script_six(args):
+
+	prefixes = []
+	for file in os.listdir(args.output + '/Output'):
+		if file.endswith('_AA.ORF.fasta'): 
+			prefixes.append(file[:10])
+	
+	unique_prefixes = list(dict.fromkeys(prefixes))
+
+	hook_fasta = ''
+	for file in os.listdir(args.databases + '/db_OG'):
+		if file.split('.')[-1] in ('fasta', 'fas', 'fa', 'faa'):
+			hook_fasta = args.databases + '/db_OG/' + file
+
+	if hook_fasta == '':
+		print('\nNo .fasta file could be found containing Hook sequences. This should be supplied along with the .dmnd-formatted database file in the Databases/db_OG folder. Quitting before script 6.\n')
+
+	for prefix in unique_prefixes:
+		os.system('python 6_FilterPartials.py --file_prefix ' + args.output + '/Output/' + prefix + ' --hook_fasta ' + hook_fasta)
+			
+	for prefix in unique_prefixes:
+		os.system('python 6b_update_cov_post_removepartials.py ' + args.output + '/Output/' + prefix)
+
+
+def script_seven(args):
+
+	for file in os.listdir(args.output + '/Output/ToRename'):
+		if '.AA.ORF.fasta' in file:
+			os.system('python 7_FinalizeName.py --input_file ' + args.output + '/Output/ToRename/' + file + ' --name ' + file[:10])
+
+	os.mkdir(args.output + '/Output/Intermediate')
+
+	for file in os.listdir(args.output + '/Output'):
+		if file != 'ReadyToGo' and file != 'Intermediate':
+			os.system('mv ' + args.output + '/Output/' + file + ' ' + args.output + '/Output/Intermediate')
+
+	os.system('python 8_SummaryStats.py -i ' + args.output + '/Output -d ' + args.databases)
+
+
+if __name__ == "__main__":
+
+	args = get_args()
+
+	if (args.first_script == 1 or args.script == 1) and (args.assembled_transcripts == None or not os.path.isdir(args.assembled_transcripts)):
+		print('\nERROR: If starting at the first script, a valid path to a folder of assembled transcript files (which must end in .fasta, .fa, or .fna) should be input using the --assembled_transcripts argument')
+		quit()
+
+	if args.genetic_code == None and args.script == -1:
+		if args.first_script < 5 and args.last_script >= 5:
+			print('\nERROR: You cannot run script 5 without giving a genetic code! If all of the taxa in the run use the same genetic code, then use the --genetic_code argument (e.g. -g Universal). Otherwise, stop after script 4, fill out the spreadsheet called "gcode_translate.tsv," and then run scripts 5-7. If this does not make sense, please ask for help.')
+			quit()
+
+	ten_digit_codes = []
+	if args.first_script == 1 or args.script == 1:
+		for file in os.listdir(args.assembled_transcripts):
+			if file[10:] == '_assembledTranscripts.fasta':
+				ten_digit_codes.append(file[:10])
+	else:
+		if not os.path.isdir(args.output + '/Output'):
+			print('\nERROR: A folder called "Output" is not found at the given output path. Enter the correct path for --output or start from script 1.\n')
+			quit()
+
+	if(len(ten_digit_codes) > len(list(dict.fromkeys(ten_digit_codes)))):
+		print('\nERROR: Duplicate 10-digit codes are not allowed.\n')
+		quit()
+
+	for code in ten_digit_codes:
+		for c, char in enumerate(code):
+			if (c != 2 and c != 5 and char not in 'qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM1234567890') or ((c == 2 or c == 5) and char != '_'):
+				print('\nERROR: ' + code + ' is an invalid 10-digit code sample identifier. It must of the format Op_me_hsap (Homo sapiens for example). Please ask for help if this does not make sense.\n')
+				quit()
+
+	if os.path.isdir(args.output + '/Output') and (args.first_script == 1 or args.script == 1):
+		print('\nERROR: An "Output" folder already exists at the given path. Please delete or rename this folder and try again.\n')
+		quit()
+	elif os.path.isdir(args.output + '/Output/Intermediate'):
+		print('\nIt looks like this run is already complete. Try deleting/renaming the Output folder and try again.\n')
+		quit()
+	elif not os.path.isdir(args.output + '/Output'):
+		os.mkdir(args.output + '/Output')
+
+	scripts = [0, script_one, script_two, script_three, script_four, script_five, script_six, script_seven]
+
+	if args.script == -1:
+		if args.first_script < args.last_script:
+			for i in range(1 + args.last_script - args.first_script):
+				print('\nRunning script ' + str(i + args.first_script) + '...\n')
+				if i + args.first_script == 1:
+					if len(ten_digit_codes) == 0:
+						print('\nNo properly-named assembled transcripts files found.\n')
+						quit()
+					else:
+						scripts[i + args.first_script](args, ten_digit_codes)
+				else:
+					scripts[i + args.first_script](args)
+		else:
+			print('\nERROR: Invalid script combination: the first script must be less than the last script. If you want to use only once script, use the --script argument.\n')
+			quit()
+	else:
+		if args.script == 1:
+			if len(ten_digit_codes) == 0:
+				print('\nNo properly-named assembled transcripts files found.\n')
+				quit()
+			else:
+				scripts[args.script](args, ten_digit_codes)
+		else:
+			scripts[args.script](args)
+
+		
+		
+						
+				
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
--- a/PTL1/Transcriptomes/Scripts/wrapper_submit.sh
+++ b/PTL1/Transcriptomes/Scripts/wrapper_submit.sh
@ -0,0 +1,29 @@
+#!/bin/bash
+#
+#SBATCH --job-name=PTL1_GBF
+#SBATCH --output=PTL1.%j.out # Stdout (%j expands to jobId)
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --ntasks-per-node=64 ##change to number of srun when running multiple instances
+#SBATCH --mem=160G
+#SBATCH --mail-type=ALL
+#SBATCH --mail-user=YOUREMAIL@smith.edu
+
+module purge       #Cleans up any loaded modules
+module use /gridapps/modules/all    #make sure module locations is loaded
+
+module load slurm
+module load tqdm
+module load Biopython/1.75-foss-2019b-Python-3.7.4
+module load BLAST+/2.9.0-gompi-2019b
+module load DIAMOND/0.9.30-GCC-8.3.0
+module load VSEARCH/2.21.1-GCC-10.3.0
+
+parent='/beegfs/fast/katzlab/becky/PTL1/Transcriptomes/Forams/'
+
+srun -D ${parent}Scripts python3 ${parent}Scripts/wrapper.py -1 1 -2 7 -x --assembled_transcripts ${parent}AssembledTranscripts -o ${parent}  -n ${parent}Conspecific.txt --genetic_code Universal &
+#srun -D ${parent}HQ/Scripts python3 ${parent}HQ/Scripts/wrapper.py -1 2 -2 7 -x --assembled_transcripts ${parent}Plate4/Assembled_Transcripts -o ${parent}Plate4  -n ${parent}Plate4/Conspecific.txt --genetic_code ${parent}Plate4/Gcodes.txt &
+#srun -D ${parent}HQ/Scripts python3 ${parent}HQ/Scripts/wrapper.py -1 2 -2 7 -x --assembled_transcripts ${parent}Plate7/Assembled_Transcripts -o ${parent}Plate7  -n ${parent}Plate7/Conspecific.txt --genetic_code ${parent}Plate7/Gcodes.txt &
+#srun -D ${parent}HQ/Scripts python3 ${parent}HQ/Scripts/wrapper.py -1 1 -2 7 -x --assembled_transcripts ${parent}Plate11/Assembled_Transcripts -o ${parent}Plate11  -n ${parent}Plate11/Conspecific.txt --genetic_code ${parent}Plate11/Gcodes.txt &
+#srun -D ${parent}HQ/Scripts python3 ${parent}HQ/Scripts/wrapper.py -1 2 -2 7 -x --assembled_transcripts ${parent}Plate18/Assembled_Transcripts -o ${parent}Plate18  -n ${parent}Plate18/Conspecific.txt --genetic_code ${parent}Plate18/Gcodes.txt &
+wait