New script 6 with revised length filtration

2026-02-10 22:30:24 +08:00 · 2023-09-18 10:06:36 -04:00 · 2023-09-18 10:06:36 -04:00 · 7d54bb7416
commit 7d54bb7416
parent 8ccb6eb9a7
1 changed files with 698 additions and 0 deletions
--- a/PTL1/Transcriptomes/Scripts/6_FilterPartials.py
+++ b/PTL1/Transcriptomes/Scripts/6_FilterPartials.py
@ -0,0 +1,698 @@
+#!/usr/bin/env python3.5
+
+##__Updated__: 2023-09-18
+##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com; xyrus.maurer-alcala@izb.unibe.ch
+##__Usage__: python 6_FilterPartials.py --help
+
+
+##################################################################################################
+## This script is intended to remove incomplete transcripts that have a more complete mate		##
+##																								##
+## Prior to running this script, ensure the following:											##
+##																								##
+## 1. You have assembled your transcriptome and COPIED the 'assembly' file 						##
+##    (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder							##
+## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py			##
+## 3. Removed SSU/LSU sequences from your Fasta File											##
+## 4. Classified your sequences as Strongly Prokaryotic/Eukaryotic or Undetermined				##
+## 5. Classified the Non-Strongly Prokaryotic sequences into OGs 								##
+## 6. You either know (or have inferred) the genetic code of the organism						##
+## 7. You have translated the sequences and checked for the data in the RemovePartials folder	##
+##																								##
+## 					E-mail Xyrus (author) for help if needed: maurerax@gmail.com				##
+##																								##
+##										Next Script(s) to Run: 									##
+##						 	  			  7_FinalRename.py										##
+##																								##
+##################################################################################################
+
+from Bio import SeqIO
+from Bio.Seq import Seq
+from statistics import mean
+
+from distutils import spawn
+import argparse, os, sys, time, re
+from argparse import RawTextHelpFormatter,SUPPRESS
+
+
+
+#------------------------------ Colors For Print Statements ------------------------------#
+class color:
+   PURPLE = '\033[95m'
+   CYAN = '\033[96m'
+   DARKCYAN = '\033[36m'
+   ORANGE = '\033[38;5;214m'
+   BLUE = '\033[94m'
+   GREEN = '\033[92m'
+   YELLOW = '\033[93m'
+   RED = '\033[91m'
+   BOLD = '\033[1m'
+   UNDERLINE = '\033[4m'
+   END = '\033[0m'
+
+
+#------------------------------- Main Functions of Script --------------------------------#
+
+###########################################################################################
+###---------------------------- UPDATE DIAMOND PATH BELOW! -----------------------------###
+###########################################################################################
+	## IF Diamond is IN YOUR PATH then no updating is needed...
+
+def check_diamond_path():
+
+	diamond_path = ''
+
+	if diamond_path == '':
+		diamond_path = spawn.find_executable("diamond")
+		#diamond_path = '/path/to/diamond'
+	else:
+		pass
+
+	if diamond_path == None:
+		print (color.BOLD + '\n\nPlease open this script and check that you have included'\
+		+' the PATH to the'+color.BLUE+' "diamond" '+color.END+color.BOLD+'executable.\n\n'+color.END)
+		print (color.BOLD+color.BLUE+'LOOK FOR:\n\n'+color.RED\
+		+'#------------------------------ UPDATE DIAMOND PATH BELOW! -------------------------------#'\
+		+color.BLUE+'\n\nThis is somewhere around lines 50 - 80...\n\n'+color.END)
+
+		sys.exit()
+	else:
+		pass
+
+	return diamond_path
+
+###########################################################################################
+###--------------------- Parses and Checks Command-Line Arguments ----------------------###
+###########################################################################################
+
+def check_args():
+
+	parser = argparse.ArgumentParser(description=
+	color.BOLD + '\n\nThis script is intended to '+color.RED+'Identify and Collapse '+color.END\
+	+color.BOLD+'partial '+color.PURPLE+'ORFS\n'+color.END+color.BOLD+'present within a '\
+	+color.RED+'Given'+color.END+color.BOLD+' transcriptome (or replicate) transcriptome(s)'\
+	+usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter)
+
+	required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
+
+	required_arg_group.add_argument('--file_prefix','-fp', action='store',
+	help=color.BOLD+color.GREEN+' File prefix that is unique (or common)\n to the files '\
+	'to be processed\n'+color.END)
+
+
+	optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
+
+	optional_arg_group.add_argument('--identity','-id', type=float, action='store', default=0.98,
+	help=color.BOLD+color.GREEN+' Identity threshold for identifying \n "partials" to larger'\
+	' contigs\n (default = 0.98)\n'+color.END)
+	optional_arg_group.add_argument('-author', action='store_true',
+	help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
+	optional_arg_group.add_argument('--hook_fasta','-f', help='Path to the fasta file of the Hook DB in the Databases/db_OG folder')
+
+	if len(sys.argv[1:]) == 0:
+		print (parser.description)
+		print ('\n')
+		sys.exit()
+
+	args = parser.parse_args()
+
+	args.id_print = str(int(float(args.identity)*100))
+
+	args.all_output_folder = '/'.join(args.file_prefix.split('/')[:-1]) + '/'
+	args.file_prefix = args.file_prefix.split('/')[-1]
+
+	args.file_listNTD = [args.all_output_folder + i for i in os.listdir(args.all_output_folder) if args.file_prefix in i and i.endswith('NTD.ORF.fasta')]
+
+	args.file_listAA = [args.all_output_folder + i for i in os.listdir(args.all_output_folder) if args.file_prefix in i and i.endswith('AA.ORF.fasta')]
+
+	args.file_listTSV = [args.all_output_folder + i for i in os.listdir(args.all_output_folder) if args.file_prefix in i and i.endswith('results.tsv')]
+
+	quit_eval = return_more_info(args)
+	if quit_eval > 0:
+		print ('\n')
+		sys.exit()
+
+	return args
+
+
+###########################################################################################
+###------------------------------- Script Usage Message --------------------------------###
+###########################################################################################
+
+def usage_msg():
+	return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 6_RemovePartials.py'\
+	' --file_prefix Op_me_Xxma'+color.END)
+
+
+##########################################################################################
+###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
+##########################################################################################
+
+def return_more_info(args):
+
+	valid_arg = 0
+
+	author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
+	' maurerax@gmail.com\n\n'+color.END)
+
+	if args.author == True:
+		print (author)
+		valid_arg += 1
+
+	if args.file_listNTD == []:
+		print (color.BOLD+'\n\nNo '+color.ORANGE+'Nucleotide Fasta Files'+color.END+color.BOLD+\
+		' found!\n\nCheck that your'+color.GREEN+' File Prefix'+color.END+color.BOLD+\
+		'is present in\nthe files of interest')
+		valid_arg += 1
+
+	if args.file_listAA == []:
+		print (color.BOLD+'\n\nNo '+color.ORANGE+'Protein Fasta Files'+color.END+color.BOLD+\
+		' found!\n\nCheck that your'+color.GREEN+' File Prefix'+color.END+color.BOLD+\
+		'is present in\nthe files of interest')
+		valid_arg += 1
+
+	if args.file_listTSV == []:
+		print (color.BOLD+'\n\nNo '+color.ORANGE+'OG-Assignment Spreadsheets'+color.END+color.BOLD+\
+		' found!\n\nCheck that your'+color.GREEN+' File Prefix'+color.END+color.BOLD+\
+		'is present in\nthe files of interest')
+		valid_arg += 1
+
+	if len(args.file_listNTD) == len(args.file_listAA) == len(args.file_listTSV):
+		pass
+	else:
+		print (color.BOLD+color.RED+'\n\nError:'+color.END+color.BOLD+' Unequal numbers of'\
+		' input files found.\n\nDouble-check that there are:'+color.CYAN+'SINGLE'+color.END\
+		+color.BOLD+' Nucleotide and Protein fasta files and OG-assignment Spreadsheet for'\
+		' each transcriptome\n\nThen try once again.'+color.END)
+		valid_arg += 1
+
+	return valid_arg
+
+
+##########################################################################################
+###------------------------- Creates Folders For Storing Data -------------------------###
+##########################################################################################
+
+def prep_folders(args):
+
+	if os.path.isdir(args.all_output_folder + 'ToRename') != True:
+		os.system('mkdir ' + args.all_output_folder + 'ToRename')
+
+	if os.path.isdir(args.all_output_folder + args.file_prefix) != True:
+		os.system('mkdir ' + args.all_output_folder + args.file_prefix)
+
+	if os.path.isdir(args.all_output_folder + args.file_prefix + '/Original') != True:
+		os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Original')
+		os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Original/SpreadSheets')
+		os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Original/Concatenated/')
+		os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Original/Concatenated/SpreadSheets')
+
+	if os.path.isdir(args.all_output_folder + args.file_prefix + '/Processed') != True:
+		os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Processed')
+		os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Processed/SpreadSheets')
+
+
+##########################################################################################
+###-------------------- Merges Fasta Files When Replicates Present --------------------###
+##########################################################################################
+
+def merge_fasta_replicates(args, type):
+
+	cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/'
+
+	count = 0
+	fasta_to_merge = []
+
+	if type == 'NTD':
+		fasta_list = args.file_listNTD
+	else:
+		fasta_list = args.file_listAA
+
+	for file in fasta_list:
+		fasta_to_merge += ['>'+str(count)+'_'+i for i in open(file).read().split('>') if i != '']
+		count += 1
+
+	with open(cat_folder+args.file_prefix+'.'+type+'.Concatenated.fasta','w+') as w:
+		w.write(''.join(fasta_to_merge))
+
+	time.sleep(.75)
+
+
+##########################################################################################
+###--------------------- Merges TSV Files When Replicates Present ---------------------###
+##########################################################################################
+
+def merge_tsv_replicates(args):
+
+	cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/SpreadSheets/'
+
+	count = 0
+	tsv_to_merge = []
+
+	for file in args.file_listTSV:
+		tsv_to_merge += [str(count)+'_'+i for i in open(file).read().split('\n') if i != '']
+		count += 1
+
+	with open(cat_folder+args.file_prefix+'_Concatenated.allOGCleanresults.tsv','w+') as w:
+		w.write('\n'.join(tsv_to_merge))
+
+	time.sleep(.75)
+
+
+##########################################################################################
+###------------------ Calls on the other Merge Functions by Data Type -----------------###
+##########################################################################################
+
+def merge_relevant_data(args):
+
+	print (color.BOLD+'\n\nMerging Transcriptome data together.'+color.END)
+
+	merge_fasta_replicates(args, 'NTD')
+	merge_fasta_replicates(args, 'AA')
+	merge_tsv_replicates(args)
+
+
+##########################################################################################
+###------------------- Uses Diamond to perform Self-vs-Self "BLAST" -------------------###
+##########################################################################################
+
+def self_blast(args, diamond_path):
+
+	cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/'
+
+	diamond_makedb = diamond_path + ' makedb --in ' + cat_folder + args.file_prefix + '.AA.Concatenated.fasta -d ' + cat_folder + args.file_prefix + '.AA.Concatenated'
+
+	diamond_self = diamond_path + ' blastp -q ' + cat_folder + args.file_prefix + '.AA.Concatenated.fasta -d ' + cat_folder + args.file_prefix + '.AA.Concatenated --strand plus --no-self-hits --id '+str(args.identity)+\
+	' --query-cover 0.7 --evalue 1e-15 --threads 60 --outfmt 6 -o ' + cat_folder + 'SpreadSheets/' + args.file_prefix + '.Concatenated.Self.'+str(args.id_print)+'ID.tsv'
+
+	print (color.BOLD+'\n\nBinning ALL '+color.ORANGE+'Nucleotide ORFs'+color.END+color.BOLD\
+	+' for '+color.GREEN+args.file_prefix+color.END+color.BOLD+' at '+args.id_print\
+	+'% identity.\n\n'+color.END)
+
+	os.system(diamond_makedb)
+	os.system(diamond_self)
+
+	return cat_folder+'SpreadSheets/'+args.file_prefix+'.Concatenated.Self.'+str(args.id_print)+'ID.tsv'
+
+
+##########################################################################################
+###------------------- Uses USearch to perform Self-vs-Self "BLAST" -------------------###
+##########################################################################################
+
+def check_Self_vs_Self(tsv_file):
+
+	evaluation = ''
+
+	tsv_in = [i for i in open(tsv_file).read().split('\n') if i != '']
+
+	if len(tsv_in) == 0:
+		evaluation = 'empty'
+		with open(tsv_file,'w+') as w:
+			w.write('No Self-vs-Self hits were found')
+	else:
+		evaluation = 'continue'
+
+	return evaluation
+
+
+
+##########################################################################################
+###-------------------- Removes Nearly Identical ORFs from Data Set -------------------###
+##########################################################################################
+
+def filter_NTD_data(args, OGLenDB):
+
+	cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/'
+	proc_folder = args.all_output_folder + args.file_prefix + '/Processed/'
+
+	##########################################
+	## Set-up Useful Lists and Dictionaries ##
+	##########################################
+
+	nuc_Above98_hit = {}
+	seqs_to_toss = []
+	prepped_NTD = []
+	prepped_AA = []
+
+	nuc_tsv_100 = 0
+
+	orf_lens = { rec.id : len(str(rec.seq)) for file in args.file_listNTD for rec in SeqIO.parse(file, 'fasta') }
+
+	replicates = ''
+
+	if len(args.file_listNTD) > 1:
+		replicates = 'yes'
+	else:
+		replicates = 'nope'
+
+	print (color.BOLD+'\n\nRemoving Partial '+color.PURPLE+'ORFs'+color.END+color.BOLD+\
+	' with >'+args.id_print+'% Nucleotide Identity over >70% of\ntheir length when '\
+	'compared to more complete '+color.PURPLE+'ORFs '+color.END+color.BOLD+'from: '\
+	+color.CYAN+args.file_prefix+'\n\n'+color.END)
+
+	#####################################################################
+	## Self-v-self BLAST Output Parsing - first checks for Seq-length! ##
+	#####################################################################
+
+	nuc_tsv_raw = [i.rstrip('\n') for i in open(cat_folder+'SpreadSheets/'+args.file_prefix\
+	+'.Concatenated.Self.'+str(args.id_print)+'ID.tsv').readlines() if i != '\n']
+
+	too_long = 0
+	for line in nuc_tsv_raw:
+		og_number = re.split('OG.{1}_', line)[-1][:6]
+		og_prefix = line.split(og_number)[0][-4:]
+		og = og_prefix + og_number
+
+		if og in OGLenDB.keys():
+			if orf_lens['_'.join(line.split('\t')[1].split('_')[1:])] > 4.5*OGLenDB[og] or orf_lens['_'.join(line.split('\t')[1].split('_')[1:])] < 1.5*OGLenDB[og]:
+				seqs_to_toss.append(line.split('\t')[1])
+				too_long += 1
+
+	nuc_tsv = [line for line in nuc_tsv_raw if line.split('\t')[1] not in seqs_to_toss]
+
+	if len(nuc_tsv) > 0:
+		if 'Cov' in nuc_tsv[0].split('\t')[0].split('_')[-3]:
+			nuc_tsv.sort(key=lambda x: (-int(x.split('\t')[1].split('Len')[-1].split('_')[0]),-int(x.split('\t')[1].split('Cov')[-1].split('_')[0])))
+		else:
+			nuc_tsv.sort(key=lambda x: -int(x.split('\t')[1].split('Len')[-1].split('_')[0]))
+
+	for line in nuc_tsv:
+		if line.split('\t')[1] not in seqs_to_toss:
+			nuc_Above98_hit.setdefault(line.split('\t')[1],[]).append(line.split('\t')[0])
+			seqs_to_toss.append(line.split('\t')[0])
+			if line.split('\t')[2] == '100.0':
+				nuc_tsv_100 += 1
+
+	seqs_to_toss = list(set(seqs_to_toss))
+	inFasta_NTD_rawLen = [i for i in SeqIO.parse(cat_folder+args.file_prefix+'.NTD.Concatenated.fasta', 'fasta') if i.description]
+	inFasta_NTD = [i for i in inFasta_NTD_rawLen if i.description not in seqs_to_toss]
+	inFasta_AA = [i for i in SeqIO.parse(cat_folder+args.file_prefix+'.AA.Concatenated.fasta','fasta') if i.description not in seqs_to_toss]
+
+	if replicates != '':
+		for i in inFasta_NTD:
+			if i.description not in nuc_Above98_hit.keys():
+				prepped_NTD.append('>'+'_'.join(i.description.split('_')[1:])+'_Trans1\n'+str(i.seq))
+			else:
+				Rep_Num = str(len(set([i.description.split('_')[0]]+[j.split('_')[0] for j in nuc_Above98_hit[i.description]])))
+				prepped_NTD.append('>'+'_'.join(i.description.split('_')[1:])+'_Trans'+Rep_Num+'\n'+str(i.seq))
+		for i in inFasta_AA:
+			if i.description not in nuc_Above98_hit.keys():
+				prepped_AA.append('>'+'_'.join(i.description.split('_')[1:])+'_Trans1\n'+str(i.seq).replace('*','X'))
+			else:
+				Rep_Num = str(len(set([i.description.split('_')[0]]+[j.split('_')[0] for j in nuc_Above98_hit[i.description]])))
+				prepped_AA.append('>'+'_'.join(i.description.split('_')[1:])+'_Trans'+Rep_Num+'\n'+str(i.seq).replace('*','X'))
+	else:
+		for i in inFasta_NTD:
+			if i.description not in nuc_Above98_hit.keys():
+				prepped_NTD.append('>'+i.description+'\n'+str(i.seq))
+			else:
+				prepped_NTD.append('>'+i.description+'\n'+str(i.seq))
+		for i in inFasta_AA:
+			if i.description not in nuc_Above98_hit.keys():
+				prepped_AA.append('>'+i.description+'\n'+str(i.seq).replace('*','X'))
+			else:
+				prepped_AA.append('>'+i.description+'\n'+str(i.seq).replace('*','X'))
+
+	with open(args.all_output_folder + args.file_prefix + '/'+args.file_prefix+'_SeqPairsAbove98.txt','w+') as w:
+		for k, v in nuc_Above98_hit.items():
+			w.write(k+'\t'+'\t'.join(v)+'\n')
+
+	############################################################################################
+	## Check for abnormally short and long sequences for the taxon for every Gene Family (OG) ##
+	##																													   ##	
+	##	IMPORTANT: As of September 2023, ACL and LAK think that the following filter, which	   ##
+	## filters ORFs by length relative to the length of other ORFs from the taxon, may be     ##
+	## worth revising/removing in the next version of PhyloToL Part 1.							   ##
+	##																													   ##
+	############################################################################################
+
+	print (color.BOLD+'Removing Abnormally Short (30% length) OR Long (300% length)'\
+	+color.PURPLE+' ORFs'+color.END+color.BOLD+'\ncompared to typical '+color.ORANGE+'Gene '\
+	'Family '+color.END+color.BOLD+'member length for: '+color.CYAN+args.file_prefix+'\n\n'+color.END)
+
+	self_OGLenDB={} ##
+	seqs_to_toss = [] ##
+	too_long = too_short = 0 ##
+
+	for i in prepped_NTD:
+		og_number = re.split('OG.{1}_', i.split('\n')[0])[-1][:6]
+		og_prefix = i.split('\n')[0].split(og_number)[0][-4:]
+		og = og_prefix + og_number
+
+		self_OGLenDB.setdefault(og,[]).append(len(i.split('\n')[-1]))
+
+	good_NTD_names = []
+	for i in prepped_NTD:
+		og_number = re.split('OG.{1}_', i.split('\n')[0])[-1][:6]
+		og_prefix = i.split('\n')[0].split(og_number)[0][-4:]
+		og = og_prefix + og_number
+
+		if (0.3*sum(self_OGLenDB[og])/float(len(self_OGLenDB[og]))) <= len(i.split('\n')[-1]) <= (3*sum(self_OGLenDB[og])/float(len(self_OGLenDB[og]))):
+			good_NTD_names.append(i.split('\n')[0])
+
+	good_NTD_seqs = [i for i in prepped_NTD if i.split('\n')[0] in good_NTD_names]
+	good_AA_seqs = [i for i in prepped_AA if i.split('\n')[0] in good_NTD_names]
+
+	too_short = len(prepped_NTD) - len(good_NTD_names)
+
+	####################################################################
+	## Finalized Outputs are Summarized and Written Out to New Fastas ##
+	####################################################################
+
+	print (color.BOLD+'There were '+color.CYAN+str(len(inFasta_NTD_rawLen))+color.END+color.BOLD\
+	+color.PURPLE+' ORFs '+color.END+color.BOLD+'originally, with '+color.ORANGE+\
+	str(nuc_tsv_100)+color.END+color.BOLD+' Partial '+color.PURPLE+'ORFs'+color.END+\
+	color.BOLD+' that\nwere '+color.RED+'100% Identical'+color.END+color.BOLD+' to larger'\
+	+color.PURPLE+' ORFs.\n\n'+color.END)
+
+	print(color.BOLD+'Of the '+color.CYAN+str(len(inFasta_NTD_rawLen))+color.END+color.BOLD\
+	+' original'+color.PURPLE+' ORFs'+color.END+color.BOLD+', '+color.ORANGE+str(len(set(seqs_to_toss)))+\
+	color.END+color.BOLD+' are '+color.PURPLE+'Partial ORFs '+color.END+color.BOLD+'(e.g. '+\
+	color.RED+'> '+args.id_print+'%'+color.END+color.BOLD+'\nNUCLEOTIDE identity) to larger'\
+	+color.PURPLE+' ORFs'+color.END+color.BOLD+' with '+color.ORANGE+str(too_short+too_long)\
+	+color.END+color.BOLD+' additional'+color.PURPLE+' ORFs\n'+color.END+color.BOLD+'that were either '+\
+	color.RED+'TOO LONG or SHORT.\n\n'+color.END)
+
+	print (color.BOLD+'Overall, there are '+color.GREEN+str(len(good_NTD_seqs))+' Unique ORFs'\
+	+color.END+color.BOLD+' for '+color.CYAN+args.file_prefix+'\n'+color.END)
+
+	with open(proc_folder+args.file_prefix+'_Filtered.Final.NTD.ORF.fasta','w+') as w:
+		for i in good_NTD_seqs:
+			w.write(i+'\n')
+	with open(proc_folder+args.file_prefix+'_Filtered.Final.AA.ORF.fasta','w+') as x:
+		for i in good_AA_seqs:
+			x.write(i+'\n')
+
+	return good_NTD_names
+
+
+##########################################################################################
+###------------------- Updates SpreadSheet with Update Sequence Names -----------------###
+##########################################################################################
+
+def update_tsv(args, NTD_list_names):
+
+	cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/SpreadSheets/'
+	proc_folder = args.all_output_folder + args.file_prefix + '/Processed/'
+
+	inTSV = {'_'.join(i.split('\t')[0].split('_')[1:]):'\t'.join(i.split('\t')[1:]) for i in open(cat_folder+\
+	args.file_prefix+'_Concatenated.allOGCleanresults.tsv').readlines() if i != '\n'}
+
+	Updated_inTSV = [i.strip('>')+'\t'+inTSV[i.split('_Trans')[0].strip('>')] for i in NTD_list_names]
+
+	with open(proc_folder+'/SpreadSheets/'+args.file_prefix+'_Filtered.Final.allOGCleanresults.tsv','w+') as w:
+		for line in Updated_inTSV:
+			w.write(line+'\n')
+
+
+############################################################################################
+##																													   ##	
+##	IMPORTANT: As of September 2023, ACL and LAK think that the following filter, which	   ##
+## filters ORFs by length that do NOT hit other ORFs from the taxon by self-BLAST, may be ##
+## worth revising/removing in the next version of PhyloToL Part 1.							   ##
+##																													   ##
+############################################################################################
+
+
+def no_partials_present(args, OGLenDB):
+
+	print (color.BOLD+color.RED+'\n\nWarning:'+color.END+color.BOLD+' No partial sequences'\
+	' were found with > '+str(args.id_print)+'% nucleotide identity.\n\nThe data will still be '\
+	'checked for ORFs that are unexpectedly '+color.ORANGE+'Short'+color.END+color.BOLD+' or'\
+	+color.ORANGE+' Long.\n\n'+color.END)
+
+	cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/'
+	proc_folder = args.all_output_folder + args.file_prefix + '/Processed/'
+
+	NTD_file = cat_folder+args.file_prefix+'.NTD.Concatenated.fasta'
+	AA_file = cat_folder+args.file_prefix+'.AA.Concatenated.fasta'
+	TSV_file = cat_folder+'/SpreadSheets/'+args.file_prefix+'_Concatenated.allOGCleanresults.tsv'
+
+	self_OGLenDB = {}
+	seqs_to_toss = []
+	too_long, too_short = 0, 0
+
+    ## Small changes in this section for Auden (ought to work now)
+    ## Lists -> Dictionaries and some data curation steps
+
+	inFasta = {i.description:str(i.seq) for i in SeqIO.parse(NTD_file,'fasta')}
+
+	for k,v in inFasta.items():
+		og_number = re.split('OG.{1}_', k)[-1][:6]
+		og_prefix = k.split(og_number)[0][-4:]
+		og = og_prefix + og_number
+
+		if len(v) >= 4.5*OGLenDB[og]:
+			seqs_to_toss.append(k)
+			too_long+= 1
+
+	prepped_NTD = [i for i in inFasta if i not in seqs_to_toss]
+
+	print (color.BOLD+'Removing Abnormally Short (30% length) OR Long (300% length)'\
+	+color.PURPLE+' ORFs'+color.END+color.BOLD+'\ncompared to typical '+color.ORANGE+'Gene '\
+	'Family '+color.END+color.BOLD+'member length for: '+color.CYAN+args.file_prefix+'\n\n'+color.END)
+
+    ## toss those sequences from the sequence dictonary (less headache)
+	for crap_seq in seqs_to_toss:
+		del inFasta[crap_seq]
+
+	for k, v in inFasta.items():
+		og_number = re.split('OG.{1}_', k)[-1][:6]
+		og_prefix = k.split(og_number)[0][-4:]
+		og = og_prefix + og_number
+
+		self_OGLenDB.setdefault(og,[]).append(len(v))
+
+	self_OGLenDB_Final = {k:sum(v)/len(v) for k, v in self_OGLenDB.items()}
+
+	good_NTD_data = { }
+	for k, v in inFasta.items():
+		og_number = re.split('OG.{1}_', k)[-1][:6]
+		og_prefix = k.split(og_number)[0][-4:]
+		og = og_prefix + og_number
+
+		if 0.3*self_OGLenDB_Final[og] <= len(v) <= 3*self_OGLenDB_Final[og]:
+			good_NTD_data.update({ k : v })
+
+	good_AA_data = {i.description:str(i.seq) for i in SeqIO.parse(AA_file,'fasta') if i.description in good_NTD_data.keys()}
+
+	good_TSV_data = [i for i in open(cat_folder+'/SpreadSheets/'+args.file_prefix+'_Concatenated.allOGCleanresults.tsv')\
+		.read().split('\n') if i != '' and i.split('\t')[0] in good_NTD_data.keys()]
+
+	renamed_TSV_data = [i.split('\t')[0]+'\t'+'\t'.join(i.split('\t')[1:]) for i in good_TSV_data]
+
+	with open(proc_folder+args.file_prefix+'_Filtered.Final.NTD.ORF.fasta','w+') as w:
+		for k,v in good_NTD_data.items():
+			w.write('>'+k+'\n'+v+'\n')
+
+	with open(proc_folder+args.file_prefix+'_Filtered.Final.AA.ORF.fasta','w+') as x:
+		for k, v in good_AA_data.items():
+			x.write('>'+k+'\n'+v+'\n')
+
+	with open(proc_folder+'/SpreadSheets/'+args.file_prefix+'_Filtered.Final.allOGCleanresults.tsv','w+') as y:
+		y.write('\n'.join(renamed_TSV_data))
+
+
+############################################################################################
+##																													   ##	
+##	IMPORTANT: The following function was added in September 2023 by ACL and LAK to        ##
+##	replace the pre-Guidance filter that removes sequences <50% and >150% the average      ##
+## length of the OG in the Hook. It may be worth revisiting in the next version of        ##
+## PhyloToL part 1.							   																##
+##																													   ##
+############################################################################################
+
+
+def filter_all_by_hook(args, OGLenDB):
+
+	proc_folder = args.all_output_folder + args.file_prefix + '/Processed/'
+
+	remaining_NTD_seqs = { rec.id : str(rec.seq) for rec in SeqIO.parse(proc_folder+args.file_prefix+'_Filtered.Final.NTD.ORF.fasta', 'fasta') }
+	remaining_AA_seqs = { rec.id : str(rec.seq) for rec in SeqIO.parse(proc_folder+args.file_prefix+'_Filtered.Final.AA.ORF.fasta', 'fasta') }
+
+	short_from_translation = []
+	if os.path.isfile('UnexpexctedShortStuffBlameXyrus.txt'):
+		for line in open('UnexpexctedShortStuffBlameXyrus.txt'):
+			short_from_translation.append(line.strip())
+
+	good_NTD_seqs = []; good_AA_seqs = []
+	for rec in remaining_NTD_seqs:
+		og_number = re.split('OG.{1}_', rec)[-1][:6]
+		og_prefix = rec.split(og_number)[0][-4:]
+		og = og_prefix + og_number
+
+		if len(remaining_AA_seqs[rec]) <= 1.5*OGLenDB[og] and len(remaining_AA_seqs[rec]) >= 0.33*OGLenDB[og] and rec.split('_Trans')[0] not in short_from_translation:
+			good_NTD_seqs.append((rec, remaining_NTD_seqs[rec]))
+			good_AA_seqs.append((rec, remaining_AA_seqs[rec]))
+
+	with open(proc_folder+args.file_prefix+'_Filtered.Final.NTD.ORF.fasta','w') as w:
+		for rec in good_NTD_seqs:
+			w.write('>' + rec[0] + '\n' + rec[1] + '\n\n')
+
+	with open(proc_folder+args.file_prefix+'_Filtered.Final.AA.ORF.fasta','w') as x:
+		for rec in good_AA_seqs:
+			x.write('>' + rec[0] + '\n' + rec[1] + '\n\n')
+
+
+##########################################################################################
+###--------------------- Cleans up the Folder and Moves Final Files -------------------###
+##########################################################################################
+
+def clean_up(args):
+
+	for i in args.file_listNTD:
+		os.system('mv ' + i + ' ' + args.all_output_folder + args.file_prefix + '/Original/')
+		os.system('mv ' + i.replace('NTD.ORF.fasta','AA.ORF.fasta') + ' ' + args.all_output_folder + args.file_prefix + '/Original/')
+		os.system('mv ' + i.split('named')[0]+'named*allOGCleanresults.tsv ' + args.all_output_folder + args.file_prefix + '/Original/SpreadSheets/')
+
+
+###########################################################################################
+###-------------------------------- Next Script Message --------------------------------###
+###########################################################################################
+
+def next_script():
+
+	print(color.BOLD+'\nNext Script is: '+color.GREEN+'6b_update_cov_post_removepartials.py\n\n'+color.END)
+
+
+##########################################################################################
+###------------------- Checks Command Line Arguments and Calls Steps ------------------###
+##########################################################################################
+
+def main():
+
+	diamond_path = check_diamond_path()
+
+	args = check_args()
+
+	prep_folders(args)
+
+	merge_relevant_data(args)
+
+	self_BLAST_out = self_blast(args, diamond_path)
+
+	evaluation = check_Self_vs_Self(self_BLAST_out)
+
+	OGLenDB = {}
+	for rec in SeqIO.parse(args.hook_fasta, 'fasta'):
+		if rec.id[-10:] not in OGLenDB:
+			OGLenDB.update({ rec.id[-10:] : [] })
+
+		OGLenDB[rec.id[-10:]].append(len(str(rec.seq)))
+
+	for og in OGLenDB:
+		OGLenDB[og] = mean(OGLenDB[og])
+
+	if evaluation != 'empty':
+		NTD_names = filter_NTD_data(args, OGLenDB)
+		update_tsv(args, NTD_names)
+	else:
+		no_partials_present(args, OGLenDB)
+
+	filter_all_by_hook(args, OGLenDB)
+
+	clean_up(args)
+
+	next_script()
+
+main()