Compare commits

Comparing v1.0 ... main (37 commits)

Author | SHA1 | Message | Date
Godwin N. Ani | 1814246c04 | Update wrapper.py | 2025-09-23 16:05:44 -04:00
Godwin N. Ani | 591e32d5e5 | Update wrapper.py | 2025-09-23 16:01:59 -04:00
Adri K. Grow | fbf51f815a | Update run_eukphylo.sh | 2025-09-19 12:19:51 -04:00
Adri K. Grow | ac84339565 | Update run_eukphylo.sh | 2025-09-19 12:19:10 -04:00
Adri K. Grow | bdec8612ee | Update run_eukphylo.sh | 2025-09-19 11:46:14 -04:00
Adri K. Grow | db49d4965e | Update run_eukphylo.sh | 2025-09-18 15:46:48 -04:00
Adri K. Grow | f2d8fd9e6c | Update wrapper_submit.sh | 2025-09-04 11:21:53 -04:00
Adri K. Grow | fbaa61f23e | Update wrapper_submit.sh | 2025-09-04 11:21:17 -04:00
Adri K. Grow | 8222b6404e | Update ReadMapping.py | 2025-08-28 23:26:39 -04:00
Adri K. Grow | f5ca94b51c | Update ReadMapping.py | 2025-08-27 13:18:28 -04:00
Godwin N. Ani | 033ed1237e | Add files via upload | 2025-08-27 10:49:37 -04:00
Adri K. Grow | 4f400d61c8 | Update wrapper_submit.sh | 2025-08-25 17:21:34 -04:00
Adri K. Grow | 692eabc6ad | Update run_eukphylo.sh | 2025-08-25 17:20:28 -04:00
Adri K. Grow | f17b43ffc9 | Update wrapper_submit.sh | 2025-08-25 17:20:21 -04:00
Adri K. Grow | 12870b2007 | Update run_eukphylo.sh | 2025-08-25 17:18:13 -04:00
Adri K. Grow | 10c4dda6b7 | Update run_eukphylo.sh | 2025-08-25 17:15:31 -04:00
Godwin N. Ani | 5bee8e55d2 | Update wrapper_submit.sh | 2025-08-25 17:05:56 -04:00
Adri K. Grow | a00e51523f | Update wrapper_submit.sh | 2025-08-25 11:49:11 -04:00
Adri K. Grow | cb71db1f72 | Update wrapper_submit.sh | 2025-08-25 11:47:10 -04:00
Adri K. Grow | e785532921 | Update wrapper_submit.sh | 2025-08-25 11:42:05 -04:00
Godwin N. Ani | c042f62249 | Update wrapper_submit.sh | 2025-08-25 10:59:29 -04:00
Godwin N. Ani | efebf01057 | Update wrapper_submit.sh | 2025-08-25 10:58:25 -04:00
Adri K. Grow | 0fc18547d6 | Update ReadMapping.py | 2025-08-22 13:16:37 -04:00
Adri K. Grow | fee7125729 | Update ProcessAndRenameAssembledData.py | 2025-08-21 10:40:08 -04:00
Adri K. Grow | c92cfb1b19 | Update ReadMapping.py (making things more universally applicable rather than keeping the hardcoded index range) | 2025-08-18 14:13:11 -04:00
Adri K. Grow | 5fcd3b937e | Update CladeGrabbing.py | 2025-08-18 10:56:17 -04:00
Adri K. Grow | 20d3926f10 | Update CladeGrabbing.py (adding new optional arguments to support two-step clade filtering within one run of the script) | 2025-08-18 10:54:25 -04:00
GiuliaRibeiro | 873862adbc | Update guidance.py (fixed 70gapTrimmed line) | 2025-08-07 09:53:42 -04:00
Godwin N. Ani | 333a7bc063 | Update run_eukphylo.sh | 2025-07-29 17:43:37 -04:00
Godwin N. Ani | b582321c3a | Update guidance.py | 2025-07-17 14:47:39 -04:00
Godwin N. Ani | 7a2320284a | Update utils.py | 2025-07-17 14:46:52 -04:00
Godwin N. Ani | f53b6926c5 | Add files via upload | 2025-07-17 11:42:28 -04:00
Godwin N. Ani | 399811662c | Delete PTL2/Black_and_Grey_list_GuidanceSisSubsisters_RemovedSeq_Conserved_PTLms_June2024_ML (1).txt | 2025-07-17 11:42:19 -04:00
Godwin N. Ani | a05f94c9fa | Add files via upload | 2025-07-17 11:41:09 -04:00
Godwin N. Ani | 343496b598 | Delete PTL2/Black_and_Grey_list_GuidanceSisSubsisters_RemovedSeq_Conserved_PTLms_June2024_ML.txt | 2025-07-17 11:40:56 -04:00
Godwin N. Ani | 3f9ef410a7 | Rename Black_and_Grey_list_GuidanceSisSubsisters_RemovedSeq_Conserved_PTLms_June2024_ML (1).txt to Black_and_Grey_list_GuidanceSisSubsisters_RemovedSeq_Conserved_PTLms_June2024_ML.txt | 2025-07-17 11:31:34 -04:00
Godwin N. Ani | fb5353fe96 | Blacklist_and_greylist | 2025-07-17 11:31:09 -04:00
10 changed files with 67816 additions and 140 deletions

View File

@@ -18,12 +18,23 @@
 #SBATCH --mail-user=email@email.edu
 module purge #Cleans up any loaded modules
-module load slurm
-module load tqdm/4.62.3-GCCcore-11.2.0
-module load Biopython/1.79-foss-2021b
-module load BLAST+/2.12.0-gompi-2021b
-module load DIAMOND/2.0.13-GCC-11.2.0
+#Unity server
+module use /gridapps/modules/all
+module load conda/latest
+module load uri/main
+module load diamond/2.1.7
+module load VSEARCH/2.22.1-GCC-11.3.0
+conda activate /work/pi_lkatz_smith_edu/Conda_PTL6p1
+#Grid server
+module use /gridapps/modules/all
+module load slurm
+module load tqdm/4.66.1-GCCcore-12.3.0
+module load Biopython/1.79-gfbf-2023a
+module load BLAST+/2.14.1-gompi-2023a
+module load DIAMOND/2.1.8-GCC-12.3.0
+module load VSEARCH/2.25.0-GCC-12.3.0
 parent='/Your/Home/Folder/'

View File

@@ -92,7 +92,7 @@ def script_four(args):
     if os.path.exists(args.databases + '/Taxa_with_few_sequences.txt'):
         with open(args.databases + '/Taxa_with_few_sequences.txt', 'r') as f:
             content = f.read()
-            print(f'These samples do not run through PTL6p1, perhaps because they has no good hits to the hook. We suggest you remove them and restart.')
+            print(f'These samples did not run through EukPhylo part1 because they have no good hits to the hook database or the Diamond sequence aligner ran out of memory. We suggest you remove them and restart.')
             print(content)
             print('Stopping Run.')
         os.remove(args.databases + '/Taxa_with_few_sequences.txt')

View File

@@ -6,29 +6,49 @@
 ## needs and restrictions, followed by some example commands taken from the GitHub Wiki, more detail for which can be found
 ## here: https://github.com/Katzlab/EukPhylo/wiki/EukPhylo-Part-1:-GF-assignment
 ## SLURM-SPECIFIC SETUP BELOW
+############### FOR SMITH GRID HPC ############### (DELETE section if not applicable):
 ## Slurm specific code
 #SBATCH --job-name=EukPhylo
 #SBATCH --output=EukPhylo.%j.out # Stdout (%j expands to jobId)
 #SBATCH --nodes=1
 #SBATCH --ntasks=1
-#SBATCH --ntasks-per-node=1 # #change to number of srun when running multiple instances
+#SBATCH --ntasks-per-node=1 ##change to number of srun when running multiple instances
 #SBATCH --mem=160G
 #SBATCH --mail-type=ALL
 #SBATCH --mail-user=email@xxx.edu ##add your email address for job updates
 module purge #Cleans up any loaded modules
 module use /gridapps/modules/all
 module load slurm
-module load tqdm/4.62.3-GCCcore-11.2.0
-module load Biopython/1.79-foss-2021b
-module load BLAST+/2.12.0-gompi-2021b
-module load DIAMOND/2.0.13-GCC-11.2.0
-module load VSEARCH/2.22.1-GCC-11.3.0
+module load tqdm/4.66.1-GCCcore-12.3.0
+module load Biopython/1.79-gfbf-2023a
+module load BLAST+/2.14.1-gompi-2023a
+module load DIAMOND/2.1.8-GCC-12.3.0
+module load VSEARCH/2.25.0-GCC-12.3.0
+############### FOR UMASS UNITY HPC ############### (DELETE section if not applicable):
+## Slurm specific code
+#SBATCH --job-name=EukPhylo
+#SBATCH --output=EukPhylo.%j.out # Stdout (%j expands to jobId)
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --ntasks-per-node=64
+#SBATCH --mem=40G
+#SBATCH --mail-type=ALL
+#SBATCH --mail-user=email@xxx.edu
+module purge #Cleans up any loaded modules
+module use /gridapps/modules/all
+module load conda/latest
+module load uri/main
+module load diamond/2.1.7
+module load VSEARCH/2.22.1-GCC-11.3.0
+conda activate /work/pi_lkatz_smith_edu/Conda_PTL6p1
 ## PROVIDE YOUR PARENT PATH
 parent='/Your/Home/Folder/'
-## Example commands
+## EXAMPLE RUN COMMANDS BELOW
 # A simple run that goes from script 1 to script 7 (the last script) using the Universal genetic code
 srun -D ${parent}Scripts python3 ${parent}Scripts/wrapper.py --first_script 1 --last_script 7 --assembled_transcripts ${parent}AssembledTranscripts -o ${parent}Out --genetic_code ${parent}Gcode.txt --databases ${parent}Databases > log.out
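For comparison, a hedged variant of the command above (not from the repo; it assumes the same placeholder folder layout) using the --first_script/--last_script flags to stop the pipeline partway through:
# A shorter run that executes only scripts 1 through 3
srun -D ${parent}Scripts python3 ${parent}Scripts/wrapper.py --first_script 1 --last_script 3 --assembled_transcripts ${parent}AssembledTranscripts -o ${parent}Out --genetic_code ${parent}Gcode.txt --databases ${parent}Databases > log.out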

View File

@@ -183,10 +183,10 @@ def run(params):
 os.system('mafft ' + tax_guidance_outdir + '/postGuidance_preTrimAl_unaligned.fasta > ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta')
 #Gap trimming
-os.system('Scripts/trimal-trimAl/source/trimal -in ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta -out ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fasta -gapthreshold ' + str(params.trimal_cutoff) + ' -fasta')
+os.system('Scripts/trimal-trimAl/source/trimal -in ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta -out ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.70gapTrimmed.fasta -gapthreshold ' + str(params.trimal_cutoff) + ' -fasta')
 #Copying over final aligments (pre and post gap trimming) into output folder.
-os.system('cp ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fasta ' + params.output + '/Output/Guidance/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fasta')
+os.system('cp ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.70gapTrimmed.fasta ' + params.output + '/Output/Guidance/' + file.split('.')[0].split('_preguidance')[0] + '.70gapTrimmed.fasta')
 os.system('cp ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta ' + params.output + '/Output/NotGapTrimmed/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta')
 #Removing intermediate files if not --keep_temp
@@ -225,5 +225,6 @@ def run(params):

View File

@@ -44,7 +44,7 @@ def get_params():
 core.add_argument('--col_cutoff', default = 0.0, type = float, help = 'During guidance, columns are removed if their score is below this cutoff')
 core.add_argument('--res_cutoff', default = 0.0, type = float, help = 'During guidance, residues are removed if their score is below this cutoff')
 core.add_argument('--guidance_threads', default = 20, type = int, help = 'Number of threads to allocate to Guidance')
-core.add_argument('--trimal_cutoff', default = 0.05, type = float, help = 'Gap masking threshold for TrimAl. The maximum proportion of sequences without gaps for a site to be removed (i.e. to remove sites with 95% or more gaps, set this parameter to 0.05).')
+core.add_argument('--trimal_cutoff', default = 0.3, type = float, help = 'Gap masking threshold for TrimAl. The maximum proportion of sequences without gaps for a site to be removed (i.e. to remove sites with 70% or more gaps, set this parameter to 0.3).')
 core.add_argument('--allow_large_files', action = 'store_true', help = 'Allow files with more than 2,000 sequences to run through Guidance.')
 CL = parser.add_argument_group('Contamination loop parameters')
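To make the new default concrete, here is a minimal illustrative sketch (hypothetical names, not EukPhylo or TrimAl code) of the rule stated in the help text: a column survives only when its fraction of ungapped sequences exceeds the cutoff.

def column_kept(column, cutoff=0.3):
    #Fraction of sequences with a residue (not a gap, '-') at this column
    non_gap = sum(1 for residue in column if residue != '-') / len(column)
    #Per the help text, a column is removed once (1 - cutoff) or more of its characters are gaps
    return non_gap > cutoff

print(column_kept(list('AC-------G'), 0.3)) #3/10 ungapped -> False (70% gaps, removed)
print(column_kept(list('ACGT---ACG'), 0.3)) #7/10 ungapped -> True (30% gaps, kept)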

View File

@@ -1,44 +1,55 @@
 #!/bin/bash
-## Last updated Jan 2025 by Auden Cote-L'Heureux; modified Feb 2025 by Adri K. Grow
+## Last updated Jan 2025 by Auden Cote-L'Heureux; modified Sept. 2025 by Adri K. Grow
 ## This shell script is used for running EukPhylo part 2, and includes a general setup for use on an HPC that uses
 ## the Slurm workload manager. It also includes several example run commands, which correspond to examples explained in more detail in the
 ## EukPhylo Wiki (https://github.com/Katzlab/EukPhylo/wiki/EukPhylo-Part-2:-MSAs,-trees,-and-contamination-loop).
 ## These run commands can also be copied and run in the terminal / command line separately, without a shell script.
 ## For the contamination loop, We recommend iterating the sister/subsisters loop multiple times as branches will shift. In contrast, we recommend only running clade grabbing once
 ## SLURM-SPECIFIC SETUP BELOW
+############### FOR UMASS UNITY HPC ############### (DELETE section if not applicable):
+#SBATCH --job-name=EukPhylo
+#SBATCH -n 10 # Number of Cores per Task
+#SBATCH --mem=125G # Requested Memory
+#SBATCH -p cpu # Partition
+#SBATCH -q long # long QOS
+#SBATCH -t 334:00:00 # Job time limit
+#SBATCH --output=Run_EP.%A_%a.out # Stdout (%j expands to jobId)
+#SBATCH --mail-type=ALL
+#SBATCH --mail-user=email@email.edu
+#SBATCH --array=1-600%50
+module purge #Cleans up any loaded modules
+module load conda/latest
+module load mafft/7.505
+module load diamond/2.1.7
+conda activate /work/pi_lkatz_smith_edu/Conda_PTL6p2/envs/PTL/
+############### FOR SMITH GRID HPC ############### (DELETE section if not applicable):
 #SBATCH --job-name=EukPhylo # Job name
 #SBATCH --output=Run_EukPhylo.%j.out # Stdout (%j expands to jobId)
 #SBATCH --nodes=1
 #SBATCH --ntasks=10 ## On the Smith College HPC (Grid), we have to change this to be double the number of task/batches you want to launch
 #SBATCH --mail-type=ALL
 #SBATCH --mail-user=email@xxx.edu ##add your email address for job updates
-## UMass HPC (Unity) requirements below (DELETE section if not applicable):
-#SBATCH --mem=125G # Requested Memory
-#SBATCH -c 24 # Number of Cores per Task
-#SBATCH -q long # Partition
-#SBATCH -t 336:00:00 # Job time limit
-#SBATCH --mail-user=email@email.edu ##add your email address for job updates
 #Load required modules
-module purge #Cleans up any loaded modules
-module use /gridapps/modules/all #make sure module locations is loaded
+module purge # Cleans up any loaded modules
+module use /gridapps/modules/all # make sure module locations is loaded
 module load slurm
-module load ETE
-module load Biopython/1.79-foss-2021b
-module load DIAMOND/2.0.13-GCC-11.2.0
-module load MAFFT
-module load RAxML
-module load IQ-TREE/2.1.2-gompi-2021b
-module load tqdm/4.64.1-GCCcore-12.2.0
-module load Python/3.9.6-GCCcore-11.2.0
-module load Guidance_mid #Smith College HPC specific
-export PATH=$PATH:/beegfs/fast/katzlab/grid_phylotol_setup/programs/standard-RAxML-master #Smith College HPC specific #export PATH=$PATH:/Path/To/Executable/Files
+module load ETE/3.1.3-foss-2024a
+module load Biopython/1.79-gfbf-2023a
+module load DIAMOND/2.1.8-GCC-12.3.0
+module load MAFFT/7.526-GCC-13.3.0-with-extensions
+module load RAxML-NG/1.2.2-GCC-13.2.0
+module load IQ-TREE/2.3.6-gompi-2023a
+module load tqdm/4.66.1-GCCcore-12.3.0
+module load Python/3.12.3-GCCcore-13.3.0
+module load Guidance_mid/2.1b-foss-2023a #Smith College HPC specific
+export PATH=$PATH:/beegfs/fast/katzlab/grid_phylotol_setup/programs/standard-RAxML-master #Smith College HPC specific
+#export PATH=$PATH:/Path/To/Executable/Files
 ## PROVIDE YOUR PARENT PATH
 parent='/Your/Home/Folder/' # The folder where you are running EukPhylo (this should contain the Scripts and input data folders)
 ## EXAMPLE RUN COMMANDS BELOW
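As background for the array directive above, a minimal hypothetical sketch (not EukPhylo code) of how a job launched with "#SBATCH --array=1-600%50" typically selects its work: Slurm runs 600 tasks, at most 50 concurrently, and each task sees a distinct SLURM_ARRAY_TASK_ID. The "Input" folder name here is an assumption for illustration.

import os

#Slurm exports a distinct 1-based index to each array task
task_id = int(os.environ.get('SLURM_ARRAY_TASK_ID', '1'))
inputs = sorted(os.listdir('Input')) #hypothetical folder of per-task inputs
my_file = inputs[task_id - 1]        #task N processes the Nth file
print(f'Array task {task_id} -> {my_file}')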

View File

@@ -4,7 +4,7 @@ Author & Date: Adri K. Grow + ChatGPT, Nov 11th 2024
 Motivation: assess and rename assembled transcript or genome files for use in EukPhylo Part 1
 Intention: warn if any 'transcripts.fasta' or 'contigs.fasta' files are missing or empty for an LKH, otherwise rename and copy them with their assigned 10-digit code by LKH
 Input:
-- a base directory containing subdirectories for each LKH assembled file, named 'WTA_LKH<xxxx>' or 'WGA_LKH<xxxx>', each containing a 'transcripts.fasta' or 'contigs.fasta' file
+- a base directory containing subdirectories for each LKH, named either 'WTA_LKH<xxxx>' or 'WGA_LKH<xxxx>', each containing a 'transcripts.fasta' or 'contigs.fasta' file
 - a mapping .txt file with LKH#s tab-separated with corresponding 10-digit codes
 Output:
 - a folder named 'renamed_transcripts|contigs' with assembled files now named by 10-digit codes; e.g. "Sr_rh_Ro04_assembledTranscripts.fasta"
@@ -83,3 +83,4 @@ def main():
 if __name__ == "__main__":
     main()

View File

@@ -1,51 +1,56 @@
 '''
 #Author, date: ?
-#Uploaded: updated by Adri Grow, 2024
-#Intent: map a group of trimmed reads to a reference.
-#Dependencies: Python3, hisat2, samtools, sambamba
-#EDIT LINES: 18 & 32
-#Inputs: Folder named 'TrimmedReads' containing all the trimmed reads.
-#Outputs: Folders with the names of the LKHs containing the sam/bam files.
-#Example: python ReadMapping.py
+#Uploaded: updated by Adri Grow, Aug 2025
+#Intent: map a group of trimmed reads to a reference
+#Dependencies: Python, HISAT2, samtools, (optional: sambamba)
+#EDIT LINES: 19 & 36
+#Inputs: Folder named 'TrimmedReads' containing the forward and reverse trimmed reads that start with the same unique identifier for each sample/cell
+#Outputs: Folders with the names of the unique identifier (e.g. LKHs) containing the bam files
+#Usage: python3 ReadMapping.py
+#IMPORTANT: Lines 34-42 manipulate the output files in several different ways including converting .sam to .bam, sorting, optional deduplicating, optional quality filtering, and retaining only mapped reads. It is the responsibility of the user to determine exactly which commands are needed for their dataset.
 '''
 import os
 from Bio import SeqIO
-#this first command builds your reference with Hisat.
-#If you've already done this, DON'T run this command! Instead, comment it out (use a # in front of it).
-#It will output several files. Don't worry about them, Hisat will know what to do.
-os.system("hisat2-build Foram_reference.fasta Foram_Index") #change to your reference.fasta and rename the index
-folder = os.listdir("TrimmedReads") #Insert the name of the folder which has your trimmed reads inside the quotes
-folder.sort() #This sorts the folder so that all the LKHs are in order.
+#This first command builds your reference with HISAT
+#If you've already done this, DON'T run this command! Instead, comment it out (use a # in front of it)
+#It will output several files. Don't worry about them, HISAT will know what to do
+os.system("hisat2-build Foram_reference.fasta Foram_Index") #Replace "Foram_reference.fasta" with your reference fasta name, and optionally change "Foram_Index" to your preferred index name
+folder = os.listdir("TrimmedReads") #Replace "TrimmedReads" with the name of the folder containing your trimmed reads, if different than TrimmedReads
+folder.sort() #This sorts the trimmed reads folder so that all the files are passed in order
 for x in folder:
-    if "LKH" in x and "FPE" in x: #assigning a variable to forward reads. Make sure you have both forward and reverse reads for each cell!
-        FPE = x
-    if "LKH" in x and "RPE" in x: #assigning a variable to reverse reads.
+    #This is specific for file names starting with 'LKH' unique identifiers formatted similar to 'LKH###_FPE.fastq.gz'
+    if "LKH" in x and "FPE" in x: #Assigning a variable to forward reads. Make sure you have both forward and reverse reads for each cell!
+        FPE = x
+        sample_id = FPE.split("_FPE")[0]
+    if "LKH" in x and "RPE" in x: #Assigning a variable to reverse reads
         RPE = x
-        if(FPE[:7] == RPE[:7]):
-            #The next few lines are several Hisat commands that will create new files.
-            #EDIT the name of the index and the name of the trimmed reads folder in the first command below
-            os.system("hisat2 -x Foram_Index -1 TrimmedReads/" +FPE+ " -2 TrimmedReads/" +RPE+ " -S sample.sam")
-            os.system("samtools view -bS sample.sam > sample.bam")
-            os.system("samtools fixmate -O bam sample.bam fixmate_sample.bam")
-            os.system("samtools sort -O bam -o sorted_sample.bam fixmate_sample.bam")
-            os.system("sambamba markdup -r sorted_sample.bam sorted_sample.dedup.bam")
-            os.system("samtools view -h -b -q 40 sorted_sample.dedup.bam > sorted_sample.q40.bam")
-            os.system("samtools view -h -b -q 20 sorted_sample.dedup.bam > sorted_sample.q20.bam")
-            os.system("samtools view -h -F 4 -b sorted_sample.dedup.bam > defaultparameters_sample.bam")
-            if not os.path.isdir(x[:7]):
-                os.mkdir(x[0:7]) #making folders with the names of the LKHs
-            for file in os.listdir('.'): #These lines move the sam/bam files that Hisat creates into the new LKH folders.
+        if FPE.split("_FPE")[0] == RPE.split("_RPE")[0]: #Match sample IDs dynamically
+            #The next few lines are several HISAT commands that will create new files
+            #If necessary, EDIT the name of the index and the name of the trimmed reads folder in the very next line only
+            os.system("hisat2 -x Foram_Index -1 TrimmedReads/" +FPE+ " -2 TrimmedReads/" +RPE+ " -S sample.sam") #running HISAT2
+            os.system("samtools view -bS sample.sam > sample.bam") #converts .sam file to .bam file
+            os.remove("sample.sam") #remove the .sam file (already converted to .bam, sam files are large and unnecessary to keep)
+            #os.system("samtools fixmate -O bam sample.bam fixmate_sample.bam") #use this command if you will be using the sambamba markdup command to remove duplicate reads (Katzlab default for transcriptomics and amplicon is to not remove duplicates)
+            os.system("samtools sort -O bam -o sorted_sample.bam sample.bam") #sorts the .bam file alignments by leftmost coordinates
+            #os.system("sambamba markdup -r sorted_sample.bam sorted_sample.dedup.bam") #removes duplicate reads - may not be appropriate for your study or protocols, user will need to determine if this is best practice for their study
+            #os.system("samtools view -h -b -q 40 sorted_sample.dedup.bam > sorted_sample.q40.bam") #only keeps reads with mapping quality ≥ 40, input is the dedup file but can easily be modified to use the sorted .bam file
+            #os.system("samtools view -h -b -q 20 sorted_sample.dedup.bam > sorted_sample.q20.bam") #only keeps reads with mapping quality ≥ 20, input is the dedup file but can easily be modified to use the sorted .bam file
+            os.system("samtools view -h -F 4 -b sorted_sample.bam > sorted_mapped_sample.bam") #only keeps mapped reads, using the sorted .bam file as input - this is the Katzlab transcriptomic and amplicon final output that should be used for continued analyses
+            if not os.path.isdir(sample_id):
+                os.mkdir(sample_id) #making folders with the names of the LKHs or unique identifiers
+            for file in os.listdir('.'): #These lines move the bam files created into the new LKH/unique identifier folders
                 if(file.endswith('.sam') or file.endswith('.bam')):
-                    os.rename(file,x[:7] + '/' + file)
-print("~~~~~~~~~~~:>~") #When the snake appears in terminal, your script has finished running!
+                    os.rename(file, f"{sample_id}/{file}")
-print("~~~~~~~~~~~:>~") #When the snake appears in terminal, your script has finished running!
+print("~~~~~~~~~~~:>~") #When the snake appears in terminal, the script has finished running for all samples/cells!
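A possible follow-up, offered as an assumption rather than part of the script (it requires samtools on the PATH), to spot-check each final BAM written by the loop above:

import os

#Report mapping statistics for each sample's final BAM (hypothetical add-on)
for sample_id in sorted(d for d in os.listdir('.') if d.startswith('LKH') and os.path.isdir(d)):
    bam = os.path.join(sample_id, 'sorted_mapped_sample.bam')
    if os.path.isfile(bam):
        os.system('samtools flagstat ' + bam) #prints read counts by category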

View File

@@ -1,10 +1,10 @@
-#Author, date: Auden Cote-L'Heureux, last updated Apr 1st 2024 by GA
+#Author, date: Auden Cote-L'Heureux, last updated Aug 18th 2025 by AKG
 #Motivation: Select robust sequences from trees
 #Intent: Select clades of interest from large trees using taxonomic specifications
 #Dependencies: Python3, ete3, Biopython
 #Inputs: A folder containing: all PTLp2 output trees and all corresponding unaligned .fasta (pre-guidance) files
 #Outputs: A folder of grabbed clades and filtered unaligned fasta files
-#Example: python CladeGrabbing.py --input /Path/To/TreesandPreGuidance --target Sr_rh --min_presence 20
+#Example: python3 CladeGrabbing.py --input /Path/To/TreesandPreGuidance --target Sr_rh --min_presence 20
 #IMPORTANT: key parameters explained in "add_argument" section below
 #Dependencies
@@ -18,7 +18,7 @@ def get_args():
 parser = argparse.ArgumentParser(
     prog = 'Clade grabber, Version 2.1',
-    description = "Updated Aug 1st, 2023 by Auden Cote-L'Heureux, modified by GA Feb 13th 2024"
+    description = "Updated Aug 1st, 2023 by Auden Cote-L'Heureux, modified by AKG Aug 18th 2025"
 )
 #add_argument section with parameters explained
 parser.add_argument('-i', '--input', type = str, required = True, help = 'Path to a folder containing input trees (which must have the file extension .tre, .tree, .treefile, or .nex)')
@@ -28,6 +28,8 @@ def get_args():
 parser.add_argument('-nr', '--required_taxa_num', type = int, default = 0, help = 'The number of species belonging to taxa in the --required_taxa list that must be present in the clade. Default is 0.')
 parser.add_argument('-o', '--outgroup', type = str, default = '', help = 'A comma-separated list of any number of digits/characters (e.g. Sr_ci_S OR Am_t), or a file with the extension .txt containing a list of complete or partial taxon codes, to describe taxa that will be included as outgroups in the output unaligned fasta files (which will contain only sequences from a single selected clade, and all outgroup sequences in the tree captured by this argument).')
 parser.add_argument('-c', '--contaminants', type = float, default = 2, help = 'The number of non-ingroup contaminants allowed in a clade, or if less than 1 the proportion of sequences in a clade that can be non-ingroup (i.e. presumed contaminants). Default is to allow 2 contaminants.')
+parser.add_argument('-ft', '--first_target', type=str, default='', help='[Optional] A comma-separated list or .txt file of complete/partial taxon codes for an initial, broad clade search. If provided, the script will first find clades with these taxa before applying the main --target filter.')
+parser.add_argument('-fm', '--first_min_presence', type=int, default=0, help='[Optional] Minimum number of sequences from --first_target required in a clade for it to be used in the second-stage search. Ignored if --first_target is not provided.')
 return parser.parse_args()
@@ -85,86 +87,155 @@ def reroot(tree):
 def get_subtrees(args, file):
     newick = get_newick(args.input + '/' + file)
     tree = ete3.Tree(newick)
     majs = list(dict.fromkeys([leaf.name[:2] for leaf in tree]))
-    #Only try to reroot trees with more than 2 major clades. This was added to fix the ETE3 "Cannot set myself as outgroup" error
+    # Only try to reroot trees with more than 2 major clades (original behavior)
     if len(majs) > 2:
         tree = reroot(tree)
-    #Getting a clean list of all target taxa
-    if '.' in args.target:
-        try:
-            target_codes = [l.strip() for l in open(args.target, 'r').readlines() if l.strip() != '']
-        except AttributeError:
-            print('\n\nError: invalid "target" argument. This must be a comma-separated list of any number of digits/characters to describe focal taxa (e.g. Sr_ci_S OR Am_t), or a file with the extension .txt containing a list of complete or partial taxon codes. All sequences containing the complete/partial code will be identified as belonging to target taxa.\n\n')
-    else:
-        target_codes = [code.strip() for code in args.target.split(',') if code.strip() != '']
-    #Getting a clean list of all "at least" taxa
-    if '.' in args.required_taxa:
-        try:
-            required_taxa_codes = [l.strip() for l in open(args.required_taxa, 'r').readlines() if l.strip() != '']
-        except AttributeError:
-            print('\n\nError: invalid "required_taxa" argument. This must be a comma-separated list of any number of digits/characters (e.g. Sr_ci_S OR Am_t), or a file with the extension .txt containing a list of complete or partial taxon codes, to describe taxa that MUST be present in a clade for it to be selected (e.g. you may want at least one whole genome).\n\n')
-    else:
-        required_taxa_codes = [code.strip() for code in args.required_taxa.split(',') if code.strip() != '']
-    target_codes = list(dict.fromkeys(target_codes + required_taxa_codes))
-    #Creating a record of selected subtrees, and all of the leaves in those subtrees
-    selected_nodes = []; seen_leaves = []
-    #Iterating through all nodes in tree, starting at "root" then working towards leaves
-    for node in tree.traverse('levelorder'):
-        #If a node is large enough and is not contained in an already selected clade
-        if len(node) >= args.min_presence and len(list(set(seen_leaves) & set([leaf.name for leaf in node]))) == 0:
-            leaves = [leaf.name for leaf in node]
-            #Accounting for cases where e.g. one child is a contaminant, and the other child is a good clade with 1 fewer than the max number of contaminants
-            children_keep = 0
-            for child in node.children:
-                for code in target_codes:
-                    taken = False
-                    for leaf in child:
-                        if leaf.name.startswith(code):
-                            children_keep += 1
-                            taken = True
-                            break
-                    if taken:
-                        break
-            if children_keep == len(node.children):
-                target_leaves = set(); required_taxa_leaves = set()
-                for code in target_codes:
-                    for leaf in leaves[::-1]:
-                        #print(leaf)
-                        if leaf.startswith(code):
-                            target_leaves.add(leaf[:10])
-                            for req in required_taxa_codes:
-                                if leaf.startswith(req):
-                                    required_taxa_leaves.add(leaf[:10])
-                                    break
-                            leaves.remove(leaf)
-                #Grab a clade as a subtree if 1) it has enough target taxa; 2) it has enough "at least" taxa; 3) it does not have too many contaminants
-                if len(target_leaves) >= args.min_presence and len(required_taxa_leaves) >= args.required_taxa_num and ((args.contaminants < 1 and len(leaves) <= args.contaminants * len(target_leaves)) or len(leaves) <= args.contaminants):
-                    selected_nodes.append(node)
-                    seen_leaves.extend([leaf.name for leaf in node])
-    #Write the subtrees to output .tre files
-    for i, node in enumerate(selected_nodes[::-1]):
-        with open('Subtrees/' + '.'.join(file.split('.')[:-1]) + '_' + str(i) + '.tre', 'w') as o:
-            o.write(node.write())
+    # -------------------------------
+    # FIRST-STAGE (optional) FILTER
+    # -------------------------------
+    def get_outer_leafsets():
+        """
+        Return a list of sets, each set = leaf names of an outer clade
+        that passes --first_target, --first_min_presence, children_keep,
+        and contaminants logic (using args.contaminants).
+        If --first_target is not used, return one set containing ALL leaves.
+        """
+        if not args.first_target or args.first_min_presence == 0:
+            return [set(leaf.name for leaf in tree)] # no outer filter → whole tree
+        # Parse first_target codes
+        if '.' in args.first_target:
+            first_target_codes = [l.strip() for l in open(args.first_target, 'r').readlines() if l.strip() != '']
+        else:
+            first_target_codes = [code.strip() for code in args.first_target.split(',') if code.strip() != '']
+        outer_sets = []
+        seen_leaves = []
+        for node in tree.traverse('levelorder'):
+            # large enough and not subsumed by already accepted outer node
+            if len(node) >= args.first_min_presence and len(set(seen_leaves) & set([leaf.name for leaf in node])) == 0:
+                leaves = [leaf.name for leaf in node]
+                # children_keep logic but for first_target
+                children_keep = 0
+                for child in node.children:
+                    taken = False
+                    for code in first_target_codes:
+                        for leaf in child:
+                            if leaf.name.startswith(code):
+                                children_keep += 1
+                                taken = True
+                                break
+                        if taken:
+                            break
+                if children_keep != len(node.children):
+                    continue
+                # count first-target hits (use [:10] uniqueness like original)
+                first_hits = set()
+                for code in first_target_codes:
+                    for leaf in leaves[::-1]:
+                        if leaf.startswith(code):
+                            first_hits.add(leaf[:10])
+                            leaves.remove(leaf)
+                # contaminants logic applied to FIRST-STAGE (reuse args.contaminants)
+                passes_contam = ((args.contaminants < 1 and len(leaves) <= args.contaminants * len(first_hits)) or
+                    (args.contaminants >= 1 and len(leaves) <= args.contaminants))
+                if len(first_hits) >= args.first_min_presence and passes_contam:
+                    outer_sets.append(set(leaf.name for leaf in node))
+                    seen_leaves.extend([leaf.name for leaf in node])
+        return outer_sets
+    # Build outer sets; if user supplied first-stage args, we'll restrict inner search to these
+    using_first = bool(args.first_target) and args.first_min_presence > 0
+    outer_leafsets = get_outer_leafsets()
+    # --------------------------------
+    # ORIGINAL INNER FILTER (unchanged)
+    # --------------------------------
+    # Getting a clean list of all target taxa
+    if '.' in args.target:
+        try:
+            target_codes = [l.strip() for l in open(args.target, 'r').readlines() if l.strip() != '']
+        except AttributeError:
+            print('\n\nError: invalid "target" argument. This must be a comma-separated list of any number of digits/characters to describe focal taxa (e.g. Sr_ci_S OR Am_t), or a file with the extension .txt containing a list of complete or partial taxon codes. All sequences containing the complete/partial code will be identified as belonging to target taxa.\n\n')
+    else:
+        target_codes = [code.strip() for code in args.target.split(',') if code.strip() != '']
+    # Getting a clean list of all "at least" taxa
+    if '.' in args.required_taxa:
+        try:
+            required_taxa_codes = [l.strip() for l in open(args.required_taxa, 'r').readlines() if l.strip() != '']
+        except AttributeError:
+            print('\n\nError: invalid "required_taxa" argument. This must be a comma-separated list of any number of digits/characters (e.g. Sr_ci_S OR Am_t), or a file with the extension .txt containing a list of complete or partial taxon codes, to describe taxa that MUST be present in a clade for it to be selected (e.g. you may want at least one whole genome).\n\n')
+    else:
+        required_taxa_codes = [code.strip() for code in args.required_taxa.split(',') if code.strip() != '']
+    target_codes = list(dict.fromkeys(target_codes + required_taxa_codes))
+    # Creating a record of selected subtrees, and all of the leaves in those subtrees
+    selected_nodes = []; seen_leaves = []
+    # Iterating through all nodes in tree, starting at "root" then working towards leaves
+    for node in tree.traverse('levelorder'):
+        # If using first-stage filter, only consider nodes fully inside some outer clade
+        if using_first:
+            node_leafs = set(leaf.name for leaf in node)
+            # require subset (node fully contained in an accepted outer clade)
+            if not any(node_leafs.issubset(S) for S in outer_leafsets):
+                continue
+        # If a node is large enough and is not contained in an already selected clade
+        if len(node) >= args.min_presence and len(list(set(seen_leaves) & set([leaf.name for leaf in node]))) == 0:
+            leaves = [leaf.name for leaf in node]
+            # Accounting for cases where e.g. one child is a contaminant, and the other child is a good clade
+            children_keep = 0
+            for child in node.children:
+                for code in target_codes:
+                    taken = False
+                    for leaf in child:
+                        if leaf.name.startswith(code):
+                            children_keep += 1
+                            taken = True
+                            break
+                    if taken:
+                        break
+            if children_keep == len(node.children):
+                target_leaves = set(); required_taxa_leaves = set()
+                for code in target_codes:
+                    for leaf in leaves[::-1]:
+                        if leaf.startswith(code):
+                            target_leaves.add(leaf[:10])
+                            for req in required_taxa_codes:
+                                if leaf.startswith(req):
+                                    required_taxa_leaves.add(leaf[:10])
+                                    break
+                            leaves.remove(leaf)
+                # Grab a clade as a subtree if it passes all filters
+                if len(target_leaves) >= args.min_presence and len(required_taxa_leaves) >= args.required_taxa_num and ((args.contaminants < 1 and len(leaves) <= args.contaminants * len(target_leaves)) or len(leaves) <= args.contaminants):
+                    selected_nodes.append(node)
+                    seen_leaves.extend([leaf.name for leaf in node])
+    # Write the subtrees to output .tre files
+    for i, node in enumerate(selected_nodes[::-1]):
+        with open('Subtrees/' + '.'.join(file.split('.')[:-1]) + '_' + str(i) + '.tre', 'w') as o:
+            o.write(node.write())
def make_new_unaligned(args):
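A hedged example invocation of the two-stage filter introduced above, in the style of the header's #Example line (the path and taxon codes are placeholders); per get_outer_leafsets, the first stage only runs when --first_target is given and --first_min_presence is nonzero:
python3 CladeGrabbing.py --input /Path/To/TreesandPreGuidance --first_target Sr --first_min_presence 10 --target Sr_rh --min_presence 20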