From b6954c127a8746fcfca155fbec283cccbfb3bf11 Mon Sep 17 00:00:00 2001
From: Auden Cote-L'Heureux <52716489+AudenCote@users.noreply.github.com>
Date: Fri, 26 Jan 2024 11:17:11 -0500
Subject: [PATCH] Annotating wrapper.py

---
 PTL1/Transcriptomes/Scripts/wrapper.py | 39 +++++++++++++++++++++-----
 1 file changed, 32 insertions(+), 7 deletions(-)

diff --git a/PTL1/Transcriptomes/Scripts/wrapper.py b/PTL1/Transcriptomes/Scripts/wrapper.py
index ba83d46..bd31d6f 100644
--- a/PTL1/Transcriptomes/Scripts/wrapper.py
+++ b/PTL1/Transcriptomes/Scripts/wrapper.py
@@ -1,3 +1,15 @@
+# Last updated Sept 2023
+# Author: Auden Cote-L'Heureux
+
+# This script is a WRAPPER for the PhyloToL Part 1 TRANSCRIPTOMES pipeline. Users should
+# use this script to run the pipeline, rather than running any of the sub-scripts (number 1a through 7b)
+# independently. To run an individual step in the pipeline, use --script X where X is the number (1 through 7).
+# To run multiple sets (usually all of them), use --first script 1 --last_script 7, or whichever first
+# and last scripts are desired. Run "python wrapper.py --help" for details on how to run this script. Before
+# running this script ensure that the databases are correctly located and named, and that input assembled
+# transcripts are named in the format Op_me_Hsap_assembledTranscripts.fasta, where Op_me_Hsap can be replaced
+# with any 10-digit sample identifier.
+
 #Dependencies
 import os, sys, re
 import shutil
@@ -5,6 +17,7 @@ import argparse
 import CheckSetup
 
 
+#Reading in the arguments
 def get_args():
 
 	parser = argparse.ArgumentParser(
@@ -32,10 +45,12 @@ def script_one(args, ten_digit_codes):
 
 	CheckSetup.run(args)
 
+	#Running script 1a on all files
 	for file in os.listdir(args.assembled_transcripts):
 		if file[10:] == '_assembledTranscripts.fasta' and file[:10] in ten_digit_codes:
 			os.system('python 1a_TranscriptLengthFilter.py --input_file ' + args.assembled_transcripts + '/' + file + ' --output_file ' + args.output + '/Output/' + file[:10] + ' --minLen ' + str(args.minlen) + ' --maxLen ' + str(args.maxlen) + ' --spades') #SPADES ARGUMENT??
 
+	#Run script 1b if the XPC step is being run
 	if args.xplate_contam:
 		if not os.path.isfile(args.conspecific_names):
 			print('\nERROR: If you are running cross-plate contamination, a file designating species assignments is required for the --conspecific_names argument\n')
@@ -46,6 +61,7 @@ def script_one(args, ten_digit_codes):
 
 def script_two(args):
 
+	#Run scripts 2a and 2b on all files.
 	for folder in os.listdir(args.output + '/Output/'):
 		if os.path.isfile(args.output + '/Output/' + folder + '/SizeFiltered/' + folder + '.' + str(args.minlen) + 'bp.fasta'):
 			os.system('python 2a_Identify_rRNA.py --input_file ' + args.output + '/Output/' + folder + '/SizeFiltered/' + folder + '.' + str(args.minlen) + 'bp.fasta --databases ' + args.databases)
@@ -55,7 +71,7 @@ def script_two(args):
 
 #NEED TO SORT OUT FILE NAMES ETC. BELOW HERE
 
-#running the third script
+#running the third script on all files
 def script_three(args):
 
 	for folder in os.listdir(args.output + '/Output'):
@@ -79,8 +95,10 @@ def script_four(args):
 						if line_sep[0] == 'Summary':
 							gcode_info.append([folder, line_sep[6], line_sep[7], line_sep[8][:-1]])
 
+	#Available genetic codes
 	valid_codes = ['bleph','blepharisma','chilo','chilodonella','condy', 'condylostoma','none','eup','euplotes','peritrich','vorticella','ciliate','universal','taa','tag','tga','mesodinium']
-	
+
+	#Reading the input genetic codes file if given (should be tab-separated)
 	stop = False
 	gcode_file = { }
 	if args.genetic_code.endswith('.txt') or args.genetic_code.endswith('.tsv'):
@@ -98,6 +116,8 @@ def script_four(args):
 			print('\nGenetic code ERROR -- it looks like you tried to enter a .txt/.tsv file, but it could not be found. Stopping after script 4; you may fill out the file gcode_output.tsv and continue with script 5.\n')
 			stop = True
 
+	#Writing the gcode_output.tsv file with in-frame stop codon frequencies. If you didn't input genetic codes originally,
+	#edit the last column in this file to contain your desired genetic codes for each sample.
 	with open(args.output + '/Output/gcode_output.tsv', 'w') as g:
 		g.writelines('10 Digit Code\tIn-frame TAG Density\tIn-frame TGA Density\tIn-frame TAA Density\tGenetic Code\n')
 		for row in gcode_info:
@@ -121,6 +141,7 @@ def script_four(args):
 		exit()
 
 
+#Running script 5 on all taxa
 def script_five(args):
 	
 	valid_codes = ['bleph','blepharisma','chilo','chilodonella','condy', 'condylostoma','none','eup','euplotes','peritrich','vorticella','ciliate','universal','taa','tag','tga','mesodinium']
@@ -132,10 +153,11 @@ def script_five(args):
 				for line in lines:
 					if line[0] == folder and line[-1].lower() in valid_codes:
 						os.system('python 5_GCodeTranslate.py --input_file ' + args.output + '/Output/' + folder + '/' + folder + '_WTA_EPU.Renamed.fasta --genetic_code ' + line[-1])
-					elif line[-1].lower() not in valid_codes and line[-1] != 'Genetic Code':
+					#Taxa without valid genetic codes will be skipped.
+					elif line[-1].lower() not in valid_codes and 'Genetic Code' not in line:
 						print('\n' + line[-1] + ' is not a valid genetic code. Skipping taxon ' + folder + '.\n')
 
-
+#Run script 6 on all files
 def script_six(args):
 
 	prefixes = []
@@ -145,6 +167,7 @@ def script_six(args):
 	
 	unique_prefixes = list(dict.fromkeys(prefixes))
 
+	#Checking to make sure there is only one fasta file for the OG reference database, and recording its name.
 	hook_fasta = ''
 	for file in os.listdir(args.databases + '/db_OG'):
 		if file.split('.')[-1] in ('fasta', 'fas', 'fa', 'faa'):
@@ -152,11 +175,12 @@ def script_six(args):
 
 	if hook_fasta == '':
 		print('\nNo .fasta file could be found containing Hook sequences. This should be supplied along with the .dmnd-formatted database file in the Databases/db_OG folder. Quitting before script 6.\n')
-
+		exit()
+	
 	for prefix in unique_prefixes:
 		os.system('python 6_FilterPartials.py --file_prefix ' + args.output + '/Output/' + prefix + ' --hook_fasta ' + hook_fasta)
 		
-
+#Running scripts 7a and 7b on all taxa
 def script_seven(args):
 
 	for file in os.listdir(args.output + '/Output/ToRename'):
@@ -176,6 +200,7 @@ if __name__ == "__main__":
 
 	args = get_args()
 
+	#The following code checks to make sure combinations of input arguments are compatible
 	if (args.first_script == 1 or args.script == 1) and (args.assembled_transcripts == None or not os.path.isdir(args.assembled_transcripts)):
 		print('\nERROR: If starting at the first script, a valid path to a folder of assembled transcript files (which must end in .fasta, .fa, or .fna) should be input using the --assembled_transcripts argument')
 		quit()
@@ -184,7 +209,7 @@ if __name__ == "__main__":
 		if args.first_script < 5 and args.last_script >= 5:
 			print('\nERROR: You cannot run script 5 without giving a genetic code! If all of the taxa in the run use the same genetic code, then use the --genetic_code argument (e.g. -g Universal). Otherwise, stop after script 4, fill out the spreadsheet called "gcode_translate.tsv," and then run scripts 5-7. If this does not make sense, please ask for help.')
 			quit()
-
+			
 	ten_digit_codes = []
 	if args.first_script == 1 or args.script == 1:
 		for file in os.listdir(args.assembled_transcripts):