From 3015aaa2eb3de39d77e42055c597d8b7d614c2ed Mon Sep 17 00:00:00 2001
From: Auden Cote-L'Heureux <52716489+AudenCote@users.noreply.github.com>
Date: Mon, 18 Sep 2023 10:02:06 -0400
Subject: [PATCH] New Script 4 with added Hook-relative length filter

---
 PTL1/Genomes/Scripts/4_CountOGsDiamond.py | 338 ++++++++++++++++++++++
 1 file changed, 338 insertions(+)
 create mode 100644 PTL1/Genomes/Scripts/4_CountOGsDiamond.py

diff --git a/PTL1/Genomes/Scripts/4_CountOGsDiamond.py b/PTL1/Genomes/Scripts/4_CountOGsDiamond.py
new file mode 100644
index 0000000..942ef02
--- /dev/null
+++ b/PTL1/Genomes/Scripts/4_CountOGsDiamond.py
@@ -0,0 +1,338 @@
+#!/usr/bin/env python3.5
+
+##__Updated__: 19_09_2017
+##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
+##__Usage__: python 3g_GCodeTranslate.py --help
+
+##############################################################################
+##                                                                          ##
+## This scrip will categorize TRANSLATED CDSs into Homologous Gene Families ##
+##                                                                          ##
+##     Questions about Gene Family Binning/Source? SEE NOTES at Bottom!     ##
+##                                                                          ##
+##      E-mail Xyrus (author) for help if needed: maurerax@gmail.com        ##
+##                                                                          ##
+##############################################################################
+
+import argparse, os, re, sys
+from argparse import RawTextHelpFormatter, SUPPRESS
+from distutils import spawn
+from Bio import SeqIO
+from statistics import mean
+
+
+#----------------------------- Colors For Print Statements ------------------------------#
+class color:
+   PURPLE = '\033[95m'
+   CYAN = '\033[96m'
+   DARKCYAN = '\033[36m'
+   ORANGE = '\033[38;5;214m'
+   BLUE = '\033[94m'
+   GREEN = '\033[92m'
+   YELLOW = '\033[93m'
+   RED = '\033[91m'
+   BOLD = '\033[1m'
+   UNDERLINE = '\033[4m'
+   END = '\033[0m'
+   
+
+#------------------------------ UPDATE DIAMOND PATH BELOW! -------------------------------#
+def check_diamond_path():
+	### IF Diamond is IN YOUR PATH then no updating is needed...
+	diamond_path = ''
+
+	if diamond_path == '':
+		diamond_path = spawn.find_executable("diamond")
+		#diamond_path = /path/to/diamond
+	else:
+		pass
+
+	if diamond_path == None:
+		print (color.BOLD + '\n\nPlease open this script and check that you have included'\
+		+ ' the PATH to the' + color.BLUE + ' "diamond" '+ color.END + color.BOLD\
+		+ 'executable.\n\n' + color.END)
+		print (color.BOLD + color.BLUE + 'LOOK FOR:\n\n' + color.RED\
+		+'#------------------------------ UPDATE DIAMOND PATH BELOW! -------------------------------#'\
+		+ color.BLUE + '\n\nThis is somewhere around lines 55 - 80...\n\n' + color.END)
+
+		sys.exit()
+	else:
+		pass
+
+	return diamond_path
+
+#------------------------------- Main Functions of Script --------------------------------#
+
+###########################################################################################
+###--------------------- Parses and Checks Command-Line Arguments ----------------------###
+###########################################################################################
+
+def check_args():
+
+	parser = argparse.ArgumentParser(description=
+	color.BOLD + '\n\nThis script will categorize Contigs into'+color.ORANGE+' "Homologous" '\
+	+color.END+color.BOLD+'Gene Families (OGs)\nbased on '+color.RED+'OrthoMCL'+color.END\
+	+color.BOLD+"'s Gene Family Grouping\n\n\nNotes on this script and "+color.GREEN+\
+	'OrthoMCL Families'+color.END+color.BOLD+' can be found\nat the bottom of '+color.GREEN\
+	+'THIS script (4_CountOGsDiamond.py)\n'+color.END+usage_msg(), usage=SUPPRESS,
+	formatter_class=RawTextHelpFormatter)
+	
+	required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
+ 
+	required_arg_group.add_argument('--input_file','-in', action='store',
+	help=color.BOLD+color.GREEN+'Fasta file of Nucleotide sequences enriched \nwith'\
+	' Eukaryotic protein coding transcripts'+color.END)
+	required_arg_group.add_argument('--databases','-d', action='store',
+	help=color.BOLD+color.GREEN+'Path to folder containing db_OG'+color.END)
+	required_arg_group.add_argument('--evalue','-e', action='store',
+	help=color.BOLD+color.GREEN+'Maximum OG assignment e-value'+color.END)
+
+	optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
+
+	optional_arg_group.add_argument('--threads','-t', default='2',
+	help=color.BOLD+color.GREEN+' Number of threads to use for BLAST\n (default = 2)\n'+color.END)
+
+	optional_arg_group.add_argument('-author', action='store_true',
+	help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
+
+	if len(sys.argv[1:]) == 0:
+		print (parser.description)
+		print ('\n')
+		sys.exit()
+
+	args = parser.parse_args()
+	
+	quit_eval = return_more_info(args)
+	if quit_eval > 0:
+		sys.exit()
+
+	args.diamond = check_diamond_path()
+	
+	args.home_folder = '/'.join(args.input_file.split('/')[:-1]) + '/'
+	
+	args.tsv_out = args.home_folder + args.input_file.split('/')[-1].replace('CDS','CDS.Renamed').replace('.AA.fasta','_allOGCleanresults.tsv')
+	
+	args.aa_out = args.home_folder + args.input_file.split('/')[-1].replace('CDS','CDS.Renamed')
+	args.ntd_out = args.home_folder + args.input_file.split('/')[-1].replace('CDS','CDS.Renamed').replace('AA','NTD')
+
+	return args
+
+		
+###########################################################################################
+###------------------------------- Script Usage Message --------------------------------###
+###########################################################################################
+
+def usage_msg():
+	return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 4_CountOGsDiamond.py'\
+	' --input_file ../Stentor_coeruleus.WGS.CDS.Prep/Stentor_coeruleus.WGS.CDS.Universal.AA.fasta'+color.END)
+
+
+##########################################################################################
+###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
+##########################################################################################
+
+def return_more_info(args):
+
+	valid_arg = 0
+
+	author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
+	' maurerax@gmail.com\n\n'+color.END)
+
+	if args.author == True:
+		print (author)
+		valid_arg += 1
+
+	if args.input_file != None:
+		if os.path.isfile(args.input_file) != False:
+			if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
+				print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
+				'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
+				' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) 
+				valid_arg += 1
+			elif args.input_file.endswith('AA.fasta') != True:
+				print (color.BOLD+'\n\nInvalid Fasta File! Only Fasta Files that were processed'\
+				' with '+color.GREEN+'3g_GCodeTranslate.py '+color.END+color.BOLD+'are valid\n\n'\
+				'However, to bypass that issue, Fasta Files MUST end with '+color.CYAN+\
+				'"AA.fasta"\n\n'+color.END)
+				valid_arg += 1
+		else:
+			print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
+			'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
+			' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) 
+			valid_arg += 1
+
+	if os.path.isdir(args.databases + '/db_OG') != True:
+		print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
+		+color.ORANGE+'db_OG Folder!\n\n'+color.END+color.BOLD+'Ensure that this folder '\
+		'can be found in the main '+color.ORANGE+'Databases Folder'+color.END+color.BOLD\
+		+'\n\nThen try once again\n\n.'+color.END)
+		valid_arg += 1
+
+	ogdb_count = 0
+	for file in os.listdir(args.databases + '/db_OG'):
+		if file.endswith('.dmnd'):
+			ogdb_count += 1
+
+	if ogdb_count == 0:
+		print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
+		'Diamond formatted '+color.ORANGE+'Gene Family databases!\n\n'+color.END+color.BOLD+\
+		'Ensure that they can be found in the '+color.ORANGE+'db_OG folder'+color.END+\
+		color.BOLD+',\nwhich can be found in the main '+color.ORANGE+'Databases Folder'+\
+		color.END+color.BOLD+'\n\nThen try once again.\n\n'+color.END)
+		valid_arg += 1
+	elif ogdb_count > 1:
+		print('\nMultiple OG databases found. Please only provide 1 database in the db_OG folder.\n')
+		valid_arg += 1
+
+	return valid_arg
+
+
+###########################################################################################
+###--------------------------- Does the Inital Folder Prep -----------------------------###
+###########################################################################################
+
+def prep_folders(args):
+
+	OG_folder = '/'.join(args.input_file.split('/')[:-1])+'/DiamondOG/'
+	
+	if os.path.isdir(OG_folder) != True:
+		os.system('mkdir '+OG_folder)		
+
+		
+###########################################################################################
+###--------------------- Runs Diamond on Split OrthoMCL Databases ----------------------###
+###########################################################################################
+
+def OG_ublast(args):
+
+	db = [file for file in os.listdir(args.databases + '/db_OG') if file.endswith('.dmnd')][0]
+
+	OG_diamond_cmd = args.diamond + ' blastp -q ' + args.input_file + ' -d ' + args.databases + '/db_OG/' + db + ' --evalue ' + args.evalue + ' --subject-cover 0.5 --threads ' + args.threads + ' --outfmt 6 -o ' + args.input_file.split('.fas')[0] + '_allOGresults'	
+	os.system(OG_diamond_cmd)
+
+
+###########################################################################################
+###--------------- Keeps the Single BEST Hit (HSP-score) Per Transcript ----------------###
+###########################################################################################
+
+def keep_best(args):
+	print (color.BOLD+color.PURPLE+'\n\nProcessing OG-database results to keep only the BEST'\
+	'\nmatch for each transcript\n\n'+color.END)
+	
+	inTSV = [i for i in open(args.input_file.split('.fas')[0]+'_allOGresults').read().split('\n') if i != '']
+	
+	inTSV.sort(key = lambda x: -float(x.split('\t')[-1]))
+	
+	keep = []
+	for i in inTSV:
+		if any(i.split('\t')[0] in j for j in keep) != True:
+			keep.append(i)
+
+	updated_lines = list(set([line.split('\t')[0]+'_'+'_'.join(line.split('\t')[1].split('_')[-2:])+\
+	'\t'+'\t'.join(line.split('\t')[1:]) for line in keep]))
+		
+	with open(args.tsv_out, 'w+') as w:
+		for i in updated_lines:
+			w.write(i+'\n')
+	
+
+###########################################################################################
+###-------- Copies and Updates Names of Transcripts With OG Hits to New Fasta ----------###
+###########################################################################################
+
+def update_fasta(args):
+
+	print (color.BOLD+color.PURPLE+'Updating Sequence Names with their BEST OG hits\n\n'+color.END)
+
+	hook_fasta = [file for file in os.listdir(args.databases + '/db_OG') if file.endswith('.fasta')][0]
+
+	OGLenDB = {}
+	for rec in SeqIO.parse(args.databases + '/db_OG/' + hook_fasta, 'fasta'):
+		if rec.id[-10:] not in OGLenDB:
+			OGLenDB.update({ rec.id[-10:] : [] })
+
+		OGLenDB[rec.id[-10:]].append(len(str(rec.seq)))
+
+	for og in OGLenDB:
+		OGLenDB[og] = mean(OGLenDB[og])
+
+	keep = [i for i in open(args.tsv_out).read().split('\n') if i != '']
+	
+	keep_dict = { }
+	for line in keep:
+		og_number = re.split('OG.{1}_', line.split('\t')[1])[1][:6]
+		og_prefix = line.split('\t')[1].split(og_number)[0][-4:]
+		og = og_prefix + og_number
+
+		keep_dict.update({ re.split('_OG.{1}_', line.split('\t')[0])[0]  : re.split('_OG.{1}_', line.split('\t')[0])[0] + '_' + og_prefix + line.split('\t')[1].split('_')[-1] })
+
+	protFasta = [seq_rec for seq_rec in SeqIO.parse(args.input_file,'fasta')]
+	ntdFasta = [seq_rec for seq_rec in SeqIO.parse(args.input_file.replace('.AA.','.NTD.'),'fasta')]
+
+	updated_prot_name = { keep_dict[i.description] : str(i.seq).rstrip('*') for i in protFasta if i.description in keep_dict.keys() }
+	updated_ntd_name = { keep_dict[i.description] : str(i.seq).rstrip('*') for i in ntdFasta if i.description in keep_dict.keys() }
+
+	with open(args.aa_out,'w+') as w:
+		for rec in updated_prot_name:
+			og_number = re.split('OG.{1}_', rec)[1][:6]
+			og_prefix = rec.split(og_number)[0][-4:]
+			og = og_prefix + og_number
+
+			if len(updated_prot_name[rec]) > 0.33*OGLenDB[og] and len(updated_prot_name[rec]) < 1.5*OGLenDB[og]:
+				w.write('>' + rec + '\n' + updated_prot_name[rec] + '\n\n')
+
+	with open(args.ntd_out,'w+') as x:
+		for rec in updated_ntd_name:
+			og_number = re.split('OG.{1}_', rec)[1][:6]
+			og_prefix = rec.split(og_number)[0][-4:]
+			og = og_prefix + og_number
+
+			if len(updated_ntd_name[rec]) > OGLenDB[og] and len(updated_ntd_name[rec]) < 4.5*OGLenDB[og]:
+				x.write('>' + rec + '\n' + updated_ntd_name[rec] + '\n\n')
+
+
+##########################################################################################
+###--------------------- Cleans up the Folder and Moves Final Files -------------------###
+##########################################################################################
+
+def clean_up(args):
+
+	os.system('mv '+args.input_file.replace('.fasta','_allOGresults')+' '+args.home_folder+\
+	'/DiamondOG')
+
+	os.system('cp '+args.aa_out+' '+args.home_folder+'/DiamondOG/')
+	os.system('cp '+args.ntd_out+' '+args.home_folder+'/DiamondOG/')
+	os.system('cp '+args.tsv_out+' '+args.home_folder+'/DiamondOG/')
+
+
+##########################################################################################
+###----------------------------- Calls on Above Functions -----------------------------###
+##########################################################################################	
+				
+def main():
+
+	args = check_args()
+	
+	prep_folders(args)
+
+	OG_ublast(args)
+
+	keep_best(args)
+
+	update_fasta(args)
+
+	clean_up(args)
+
+	print (color.BOLD+'Next Script is: '+color.GREEN+'5g_FinalizeName.py\n\n'+color.END)
+
+main()
+
+#----------------------------------------- NOTES -----------------------------------------#
+#
+# This script uses a "BLAST"-based approach to identify ANCIENT homologous gene families.
+#
+# Gene family designations were taken from OrthoMCL.org and serve as the database for 
+# this script's gene family assignments. These gene family assignments are NON-EXHAUSTIVE
+# and most Lineage-Specific families will be missed!
+#
+# If you have any questions contact Xyrus (author): maurerax@gmail.com
\ No newline at end of file