Compare commits

Comparing v1.0 ... main (37 commits)

Author | SHA1 | Message | Date
Godwin N. Ani | 1814246c04 | Update wrapper.py | 2025-09-23 16:05:44 -04:00
Godwin N. Ani | 591e32d5e5 | Update wrapper.py | 2025-09-23 16:01:59 -04:00
Adri K. Grow | fbf51f815a | Update run_eukphylo.sh | 2025-09-19 12:19:51 -04:00
Adri K. Grow | ac84339565 | Update run_eukphylo.sh | 2025-09-19 12:19:10 -04:00
Adri K. Grow | bdec8612ee | Update run_eukphylo.sh | 2025-09-19 11:46:14 -04:00
Adri K. Grow | db49d4965e | Update run_eukphylo.sh | 2025-09-18 15:46:48 -04:00
Adri K. Grow | f2d8fd9e6c | Update wrapper_submit.sh | 2025-09-04 11:21:53 -04:00
Adri K. Grow | fbaa61f23e | Update wrapper_submit.sh | 2025-09-04 11:21:17 -04:00
Adri K. Grow | 8222b6404e | Update ReadMapping.py | 2025-08-28 23:26:39 -04:00
Adri K. Grow | f5ca94b51c | Update ReadMapping.py | 2025-08-27 13:18:28 -04:00
Godwin N. Ani | 033ed1237e | Add files via upload | 2025-08-27 10:49:37 -04:00
Adri K. Grow | 4f400d61c8 | Update wrapper_submit.sh | 2025-08-25 17:21:34 -04:00
Adri K. Grow | 692eabc6ad | Update run_eukphylo.sh | 2025-08-25 17:20:28 -04:00
Adri K. Grow | f17b43ffc9 | Update wrapper_submit.sh | 2025-08-25 17:20:21 -04:00
Adri K. Grow | 12870b2007 | Update run_eukphylo.sh | 2025-08-25 17:18:13 -04:00
Adri K. Grow | 10c4dda6b7 | Update run_eukphylo.sh | 2025-08-25 17:15:31 -04:00
Godwin N. Ani | 5bee8e55d2 | Update wrapper_submit.sh | 2025-08-25 17:05:56 -04:00
Adri K. Grow | a00e51523f | Update wrapper_submit.sh | 2025-08-25 11:49:11 -04:00
Adri K. Grow | cb71db1f72 | Update wrapper_submit.sh | 2025-08-25 11:47:10 -04:00
Adri K. Grow | e785532921 | Update wrapper_submit.sh | 2025-08-25 11:42:05 -04:00
Godwin N. Ani | c042f62249 | Update wrapper_submit.sh | 2025-08-25 10:59:29 -04:00
Godwin N. Ani | efebf01057 | Update wrapper_submit.sh | 2025-08-25 10:58:25 -04:00
Adri K. Grow | 0fc18547d6 | Update ReadMapping.py | 2025-08-22 13:16:37 -04:00
Adri K. Grow | fee7125729 | Update ProcessAndRenameAssembledData.py | 2025-08-21 10:40:08 -04:00
Adri K. Grow | c92cfb1b19 | Update ReadMapping.py (making things more universally applicable rather than keeping the hardcoded index range) | 2025-08-18 14:13:11 -04:00
Adri K. Grow | 5fcd3b937e | Update CladeGrabbing.py | 2025-08-18 10:56:17 -04:00
Adri K. Grow | 20d3926f10 | Update CladeGrabbing.py (adding new optional arguments to support two-step clade filtering within one run of the script) | 2025-08-18 10:54:25 -04:00
GiuliaRibeiro | 873862adbc | Update guidance.py (fixed 70gapTrimmed line) | 2025-08-07 09:53:42 -04:00
Godwin N. Ani | 333a7bc063 | Update run_eukphylo.sh | 2025-07-29 17:43:37 -04:00
Godwin N. Ani | b582321c3a | Update guidance.py | 2025-07-17 14:47:39 -04:00
Godwin N. Ani | 7a2320284a | Update utils.py | 2025-07-17 14:46:52 -04:00
Godwin N. Ani | f53b6926c5 | Add files via upload | 2025-07-17 11:42:28 -04:00
Godwin N. Ani | 399811662c | Delete PTL2/Black_and_Grey_list_GuidanceSisSubsisters_RemovedSeq_Conserved_PTLms_June2024_ML (1).txt | 2025-07-17 11:42:19 -04:00
Godwin N. Ani | a05f94c9fa | Add files via upload | 2025-07-17 11:41:09 -04:00
Godwin N. Ani | 343496b598 | Delete PTL2/Black_and_Grey_list_GuidanceSisSubsisters_RemovedSeq_Conserved_PTLms_June2024_ML.txt | 2025-07-17 11:40:56 -04:00
Godwin N. Ani | 3f9ef410a7 | Rename Black_and_Grey_list_GuidanceSisSubsisters_RemovedSeq_Conserved_PTLms_June2024_ML (1).txt to Black_and_Grey_list_GuidanceSisSubsisters_RemovedSeq_Conserved_PTLms_June2024_ML.txt | 2025-07-17 11:31:34 -04:00
Godwin N. Ani | fb5353fe96 | Blacklist_and_greylist | 2025-07-17 11:31:09 -04:00
10 changed files with 67816 additions and 140 deletions

View File

@@ -18,12 +18,23 @@
 #SBATCH --mail-user=email@email.edu
 module purge #Cleans up any loaded modules
-module load slurm
-module load tqdm/4.62.3-GCCcore-11.2.0
-module load Biopython/1.79-foss-2021b
-module load BLAST+/2.12.0-gompi-2021b
-module load DIAMOND/2.0.13-GCC-11.2.0
+#Unity server
+module use /gridapps/modules/all
+module load conda/latest
+module load uri/main
+module load diamond/2.1.7
+module load VSEARCH/2.22.1-GCC-11.3.0
+conda activate /work/pi_lkatz_smith_edu/Conda_PTL6p1
+#Grid server
+module use /gridapps/modules/all
+module load slurm
+module load tqdm/4.66.1-GCCcore-12.3.0
+module load Biopython/1.79-gfbf-2023a
+module load BLAST+/2.14.1-gompi-2023a
+module load DIAMOND/2.1.8-GCC-12.3.0
+module load VSEARCH/2.25.0-GCC-12.3.0
 parent='/Your/Home/Folder/'

View File

@@ -92,7 +92,7 @@ def script_four(args):
     if os.path.exists(args.databases + '/Taxa_with_few_sequences.txt'):
         with open(args.databases + '/Taxa_with_few_sequences.txt', 'r') as f:
             content = f.read()
-            print(f'These samples do not run through PTL6p1, perhaps because they has no good hits to the hook. We suggest you remove them and restart.')
+            print(f'These samples did not run through EukPhylo part1 because they have no good hits to the hook database or the Diamond sequence aligner ran out of memory. We suggest you remove them and restart.')
             print(content)
             print('Stopping Run.')
         os.remove(args.databases + '/Taxa_with_few_sequences.txt')

View File

@@ -6,29 +6,49 @@
 ## needs and restrictions, followed by some example commands taken from the GitHub Wiki, more detail for which can be found
 ## here: https://github.com/Katzlab/EukPhylo/wiki/EukPhylo-Part-1:-GF-assignment
 ## SLURM-SPECIFIC SETUP BELOW
+############### FOR SMITH GRID HPC ############### (DELETE section if not applicable):
 ## Slurm specific code
 #SBATCH --job-name=EukPhylo
 #SBATCH --output=EukPhylo.%j.out # Stdout (%j expands to jobId)
 #SBATCH --nodes=1
 #SBATCH --ntasks=1
-#SBATCH --ntasks-per-node=1 # #change to number of srun when running multiple instances
+#SBATCH --ntasks-per-node=1 ##change to number of srun when running multiple instances
 #SBATCH --mem=160G
 #SBATCH --mail-type=ALL
 #SBATCH --mail-user=email@xxx.edu ##add your email address for job updates
 module purge #Cleans up any loaded modules
 module use /gridapps/modules/all
 module load slurm
-module load tqdm/4.62.3-GCCcore-11.2.0
-module load Biopython/1.79-foss-2021b
-module load BLAST+/2.12.0-gompi-2021b
-module load DIAMOND/2.0.13-GCC-11.2.0
-module load VSEARCH/2.22.1-GCC-11.3.0
+module load tqdm/4.66.1-GCCcore-12.3.0
+module load Biopython/1.79-gfbf-2023a
+module load BLAST+/2.14.1-gompi-2023a
+module load DIAMOND/2.1.8-GCC-12.3.0
+module load VSEARCH/2.25.0-GCC-12.3.0
+############### FOR UMASS UNITY HPC ############### (DELETE section if not applicable):
+## Slurm specific code
+#SBATCH --job-name=EukPhylo
+#SBATCH --output=EukPhylo.%j.out # Stdout (%j expands to jobId)
+#SBATCH --nodes=1
+#SBATCH --ntasks=1
+#SBATCH --ntasks-per-node=64
+#SBATCH --mem=40G
+#SBATCH --mail-type=ALL
+#SBATCH --mail-user=email@xxx.edu
+module purge #Cleans up any loaded modules
+module use /gridapps/modules/all
+module load conda/latest
+module load uri/main
+module load diamond/2.1.7
+module load VSEARCH/2.22.1-GCC-11.3.0
+conda activate /work/pi_lkatz_smith_edu/Conda_PTL6p1
 ## PROVIDE YOUR PARENT PATH
 parent='/Your/Home/Folder/'
-## Example commands
+## EXAMPLE RUN COMMANDS BELOW
 # A simple run that goes from script 1 to script 7 (the last script) using the Universal genetic code
 srun -D ${parent}Scripts python3 ${parent}Scripts/wrapper.py --first_script 1 --last_script 7 --assembled_transcripts ${parent}AssembledTranscripts -o ${parent}Out --genetic_code ${parent}Gcode.txt --databases ${parent}Databases > log.out
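For comparison, a hedged variant of the command above (not from the repo; it assumes the same placeholder folder layout) using the --first_script/--last_script flags to stop the pipeline partway through:
# A shorter run that executes only scripts 1 through 3
srun -D ${parent}Scripts python3 ${parent}Scripts/wrapper.py --first_script 1 --last_script 3 --assembled_transcripts ${parent}AssembledTranscripts -o ${parent}Out --genetic_code ${parent}Gcode.txt --databases ${parent}Databases > log.out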

View File

@@ -183,10 +183,10 @@ def run(params):
 os.system('mafft ' + tax_guidance_outdir + '/postGuidance_preTrimAl_unaligned.fasta > ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta')
 #Gap trimming
-os.system('Scripts/trimal-trimAl/source/trimal -in ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta -out ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fasta -gapthreshold ' + str(params.trimal_cutoff) + ' -fasta')
+os.system('Scripts/trimal-trimAl/source/trimal -in ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta -out ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.70gapTrimmed.fasta -gapthreshold ' + str(params.trimal_cutoff) + ' -fasta')
 #Copying over final aligments (pre and post gap trimming) into output folder.
-os.system('cp ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fasta ' + params.output + '/Output/Guidance/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fasta')
+os.system('cp ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.70gapTrimmed.fasta ' + params.output + '/Output/Guidance/' + file.split('.')[0].split('_preguidance')[0] + '.70gapTrimmed.fasta')
 os.system('cp ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta ' + params.output + '/Output/NotGapTrimmed/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta')
 #Removing intermediate files if not --keep_temp
@@ -225,5 +225,6 @@ def run(params):

View File

@@ -44,7 +44,7 @@ def get_params():
 core.add_argument('--col_cutoff', default = 0.0, type = float, help = 'During guidance, columns are removed if their score is below this cutoff')
 core.add_argument('--res_cutoff', default = 0.0, type = float, help = 'During guidance, residues are removed if their score is below this cutoff')
 core.add_argument('--guidance_threads', default = 20, type = int, help = 'Number of threads to allocate to Guidance')
-core.add_argument('--trimal_cutoff', default = 0.05, type = float, help = 'Gap masking threshold for TrimAl. The maximum proportion of sequences without gaps for a site to be removed (i.e. to remove sites with 95% or more gaps, set this parameter to 0.05).')
+core.add_argument('--trimal_cutoff', default = 0.3, type = float, help = 'Gap masking threshold for TrimAl. The maximum proportion of sequences without gaps for a site to be removed (i.e. to remove sites with 70% or more gaps, set this parameter to 0.3).')
 core.add_argument('--allow_large_files', action = 'store_true', help = 'Allow files with more than 2,000 sequences to run through Guidance.')
 CL = parser.add_argument_group('Contamination loop parameters')
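To make the new default concrete, here is a minimal illustrative sketch (hypothetical names, not EukPhylo or TrimAl code) of the rule stated in the help text: a column survives only when its fraction of ungapped sequences exceeds the cutoff.

def column_kept(column, cutoff=0.3):
    #Fraction of sequences with a residue (not a gap, '-') at this column
    non_gap = sum(1 for residue in column if residue != '-') / len(column)
    #Per the help text, a column is removed once (1 - cutoff) or more of its characters are gaps
    return non_gap > cutoff

print(column_kept(list('AC-------G'), 0.3)) #3/10 ungapped -> False (70% gaps, removed)
print(column_kept(list('ACGT---ACG'), 0.3)) #7/10 ungapped -> True (30% gaps, kept)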

View File

@@ -1,44 +1,55 @@
 #!/bin/bash
-## Last updated Jan 2025 by Auden Cote-L'Heureux; modified Feb 2025 by Adri K. Grow
+## Last updated Jan 2025 by Auden Cote-L'Heureux; modified Sept. 2025 by Adri K. Grow
 ## This shell script is used for running EukPhylo part 2, and includes a general setup for use on an HPC that uses
 ## the Slurm workload manager. It also includes several example run commands, which correspond to examples explained in more detail in the
 ## EukPhylo Wiki (https://github.com/Katzlab/EukPhylo/wiki/EukPhylo-Part-2:-MSAs,-trees,-and-contamination-loop).
 ## These run commands can also be copied and run in the terminal / command line separately, without a shell script.
 ## For the contamination loop, We recommend iterating the sister/subsisters loop multiple times as branches will shift. In contrast, we recommend only running clade grabbing once
 ## SLURM-SPECIFIC SETUP BELOW
+############### FOR UMASS UNITY HPC ############### (DELETE section if not applicable):
+#SBATCH --job-name=EukPhylo
+#SBATCH -n 10 # Number of Cores per Task
+#SBATCH --mem=125G # Requested Memory
+#SBATCH -p cpu # Partition
+#SBATCH -q long # long QOS
+#SBATCH -t 334:00:00 # Job time limit
+#SBATCH --output=Run_EP.%A_%a.out # Stdout (%j expands to jobId)
+#SBATCH --mail-type=ALL
+#SBATCH --mail-user=email@email.edu
+#SBATCH --array=1-600%50
+module purge #Cleans up any loaded modules
+module load conda/latest
+module load mafft/7.505
+module load diamond/2.1.7
+conda activate /work/pi_lkatz_smith_edu/Conda_PTL6p2/envs/PTL/
+############### FOR SMITH GRID HPC ############### (DELETE section if not applicable):
 #SBATCH --job-name=EukPhylo # Job name
 #SBATCH --output=Run_EukPhylo.%j.out # Stdout (%j expands to jobId)
 #SBATCH --nodes=1
 #SBATCH --ntasks=10 ## On the Smith College HPC (Grid), we have to change this to be double the number of task/batches you want to launch
 #SBATCH --mail-type=ALL
 #SBATCH --mail-user=email@xxx.edu ##add your email address for job updates
-## UMass HPC (Unity) requirements below (DELETE section if not applicable):
-#SBATCH --mem=125G # Requested Memory
-#SBATCH -c 24 # Number of Cores per Task
-#SBATCH -q long # Partition
-#SBATCH -t 336:00:00 # Job time limit
-#SBATCH --mail-user=email@email.edu ##add your email address for job updates
 #Load required modules
-module purge #Cleans up any loaded modules
-module use /gridapps/modules/all #make sure module locations is loaded
+module purge # Cleans up any loaded modules
+module use /gridapps/modules/all # make sure module locations is loaded
 module load slurm
-module load ETE
-module load Biopython/1.79-foss-2021b
-module load DIAMOND/2.0.13-GCC-11.2.0
-module load MAFFT
-module load RAxML
-module load IQ-TREE/2.1.2-gompi-2021b
-module load tqdm/4.64.1-GCCcore-12.2.0
-module load Python/3.9.6-GCCcore-11.2.0
-module load Guidance_mid #Smith College HPC specific
-export PATH=$PATH:/beegfs/fast/katzlab/grid_phylotol_setup/programs/standard-RAxML-master #Smith College HPC specific #export PATH=$PATH:/Path/To/Executable/Files
+module load ETE/3.1.3-foss-2024a
+module load Biopython/1.79-gfbf-2023a
+module load DIAMOND/2.1.8-GCC-12.3.0
+module load MAFFT/7.526-GCC-13.3.0-with-extensions
+module load RAxML-NG/1.2.2-GCC-13.2.0
+module load IQ-TREE/2.3.6-gompi-2023a
+module load tqdm/4.66.1-GCCcore-12.3.0
+module load Python/3.12.3-GCCcore-13.3.0
+module load Guidance_mid/2.1b-foss-2023a #Smith College HPC specific
+export PATH=$PATH:/beegfs/fast/katzlab/grid_phylotol_setup/programs/standard-RAxML-master #Smith College HPC specific
+#export PATH=$PATH:/Path/To/Executable/Files
 ## PROVIDE YOUR PARENT PATH
 parent='/Your/Home/Folder/' # The folder where you are running EukPhylo (this should contain the Scripts and input data folders)
 ## EXAMPLE RUN COMMANDS BELOW
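As background for the array directive above, a minimal hypothetical sketch (not EukPhylo code) of how a job launched with "#SBATCH --array=1-600%50" typically selects its work: Slurm runs 600 tasks, at most 50 concurrently, and each task sees a distinct SLURM_ARRAY_TASK_ID. The "Input" folder name here is an assumption for illustration.

import os

#Slurm exports a distinct 1-based index to each array task
task_id = int(os.environ.get('SLURM_ARRAY_TASK_ID', '1'))
inputs = sorted(os.listdir('Input')) #hypothetical folder of per-task inputs
my_file = inputs[task_id - 1]        #task N processes the Nth file
print(f'Array task {task_id} -> {my_file}')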

View File

@@ -4,7 +4,7 @@ Author & Date: Adri K. Grow + ChatGPT, Nov 11th 2024
 Motivation: assess and rename assembled transcript or genome files for use in EukPhylo Part 1
 Intention: warn if any 'transcripts.fasta' or 'contigs.fasta' files are missing or empty for an LKH, otherwise rename and copy them with their assigned 10-digit code by LKH
 Input:
-- a base directory containing subdirectories for each LKH assembled file, named 'WTA_LKH<xxxx>' or 'WGA_LKH<xxxx>', each containing a 'transcripts.fasta' or 'contigs.fasta' file
+- a base directory containing subdirectories for each LKH, named either 'WTA_LKH<xxxx>' or 'WGA_LKH<xxxx>', each containing a 'transcripts.fasta' or 'contigs.fasta' file
 - a mapping .txt file with LKH#s tab-separated with corresponding 10-digit codes
 Output:
 - a folder named 'renamed_transcripts|contigs' with assembled files now named by 10-digit codes; e.g. "Sr_rh_Ro04_assembledTranscripts.fasta"
@@ -83,3 +83,4 @@ def main():
 if __name__ == "__main__":
     main()

View File

@@ -1,51 +1,56 @@
 '''
 #Author, date: ?
-#Uploaded: updated by Adri Grow, 2024
-#Intent: map a group of trimmed reads to a reference.
-#Dependencies: Python3, hisat2, samtools, sambamba
-#EDIT LINES: 18 & 32
-#Inputs: Folder named 'TrimmedReads' containing all the trimmed reads.
-#Outputs: Folders with the names of the LKHs containing the sam/bam files.
-#Example: python ReadMapping.py
+#Uploaded: updated by Adri Grow, Aug 2025
+#Intent: map a group of trimmed reads to a reference
+#Dependencies: Python, HISAT2, samtools, (optional: sambamba)
+#EDIT LINES: 19 & 36
+#Inputs: Folder named 'TrimmedReads' containing the forward and reverse trimmed reads that start with the same unique identifier for each sample/cell
+#Outputs: Folders with the names of the unique identifier (e.g. LKHs) containing the bam files
+#Usage: python3 ReadMapping.py
+#IMPORTANT: Lines 34-42 manipulate the output files in several different ways including converting .sam to .bam, sorting, optional deduplicating, optional quality filtering, and retaining only mapped reads. It is the responsibility of the user to determine exactly which commands are needed for their dataset.
 '''
 import os
 from Bio import SeqIO
-#this first command builds your reference with Hisat.
-#If you've already done this, DON'T run this command! Instead, comment it out (use a # in front of it).
-#It will output several files. Don't worry about them, Hisat will know what to do.
-os.system("hisat2-build Foram_reference.fasta Foram_Index") #change to your reference.fasta and rename the index
-folder = os.listdir("TrimmedReads") #Insert the name of the folder which has your trimmed reads inside the quotes
-folder.sort() #This sorts the folder so that all the LKHs are in order.
+#This first command builds your reference with HISAT
+#If you've already done this, DON'T run this command! Instead, comment it out (use a # in front of it)
+#It will output several files. Don't worry about them, HISAT will know what to do
+os.system("hisat2-build Foram_reference.fasta Foram_Index") #Replace "Foram_reference.fasta" with your reference fasta name, and optionally change "Foram_Index" to your preferred index name
+folder = os.listdir("TrimmedReads") #Replace "TrimmedReads" with the name of the folder containing your trimmed reads, if different than TrimmedReads
+folder.sort() #This sorts the trimmed reads folder so that all the files are passed in order
 for x in folder:
-    if "LKH" in x and "FPE" in x: #assigning a variable to forward reads. Make sure you have both forward and reverse reads for each cell!
-        FPE = x
-    if "LKH" in x and "RPE" in x: #assigning a variable to reverse reads.
+    #This is specific for file names starting with 'LKH' unique identifiers formatted similar to 'LKH###_FPE.fastq.gz'
+    if "LKH" in x and "FPE" in x: #Assigning a variable to forward reads. Make sure you have both forward and reverse reads for each cell!
+        FPE = x
+        sample_id = FPE.split("_FPE")[0]
+    if "LKH" in x and "RPE" in x: #Assigning a variable to reverse reads
         RPE = x
-        if(FPE[:7] == RPE[:7]):
-            #The next few lines are several Hisat commands that will create new files.
-            #EDIT the name of the index and the name of the trimmed reads folder in the first command below
-            os.system("hisat2 -x Foram_Index -1 TrimmedReads/" +FPE+ " -2 TrimmedReads/" +RPE+ " -S sample.sam")
-            os.system("samtools view -bS sample.sam > sample.bam")
-            os.system("samtools fixmate -O bam sample.bam fixmate_sample.bam")
-            os.system("samtools sort -O bam -o sorted_sample.bam fixmate_sample.bam")
-            os.system("sambamba markdup -r sorted_sample.bam sorted_sample.dedup.bam")
-            os.system("samtools view -h -b -q 40 sorted_sample.dedup.bam > sorted_sample.q40.bam")
-            os.system("samtools view -h -b -q 20 sorted_sample.dedup.bam > sorted_sample.q20.bam")
-            os.system("samtools view -h -F 4 -b sorted_sample.dedup.bam > defaultparameters_sample.bam")
-            if not os.path.isdir(x[:7]):
-                os.mkdir(x[0:7]) #making folders with the names of the LKHs
-            for file in os.listdir('.'): #These lines move the sam/bam files that Hisat creates into the new LKH folders.
+        if FPE.split("_FPE")[0] == RPE.split("_RPE")[0]: #Match sample IDs dynamically
+            #The next few lines are several HISAT commands that will create new files
+            #If necessary, EDIT the name of the index and the name of the trimmed reads folder in the very next line only
+            os.system("hisat2 -x Foram_Index -1 TrimmedReads/" +FPE+ " -2 TrimmedReads/" +RPE+ " -S sample.sam") #running HISAT2
+            os.system("samtools view -bS sample.sam > sample.bam") #converts .sam file to .bam file
+            os.remove("sample.sam") #remove the .sam file (already converted to .bam, sam files are large and unnecessary to keep)
+            #os.system("samtools fixmate -O bam sample.bam fixmate_sample.bam") #use this command if you will be using the sambamba markdup command to remove duplicate reads (Katzlab default for transcriptomics and amplicon is to not remove duplicates)
+            os.system("samtools sort -O bam -o sorted_sample.bam sample.bam") #sorts the .bam file alignments by leftmost coordinates
+            #os.system("sambamba markdup -r sorted_sample.bam sorted_sample.dedup.bam") #removes duplicate reads - may not be appropriate for your study or protocols, user will need to determine if this is best practice for their study
+            #os.system("samtools view -h -b -q 40 sorted_sample.dedup.bam > sorted_sample.q40.bam") #only keeps reads with mapping quality ≥ 40, input is the dedup file but can easily be modified to use the sorted .bam file
+            #os.system("samtools view -h -b -q 20 sorted_sample.dedup.bam > sorted_sample.q20.bam") #only keeps reads with mapping quality ≥ 20, input is the dedup file but can easily be modified to use the sorted .bam file
+            os.system("samtools view -h -F 4 -b sorted_sample.bam > sorted_mapped_sample.bam") #only keeps mapped reads, using the sorted .bam file as input - this is the Katzlab transcriptomic and amplicon final output that should be used for continued analyses
+            if not os.path.isdir(sample_id):
+                os.mkdir(sample_id) #making folders with the names of the LKHs or unique identifiers
+            for file in os.listdir('.'): #These lines move the bam files created into the new LKH/unique identifier folders
                 if(file.endswith('.sam') or file.endswith('.bam')):
-                    os.rename(file,x[:7] + '/' + file)
-print("~~~~~~~~~~~:>~") #When the snake appears in terminal, your script has finished running!
+                    os.rename(file, f"{sample_id}/{file}")
-print("~~~~~~~~~~~:>~") #When the snake appears in terminal, your script has finished running!
+print("~~~~~~~~~~~:>~") #When the snake appears in terminal, the script has finished running for all samples/cells!
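A possible follow-up, offered as an assumption rather than part of the script (it requires samtools on the PATH), to spot-check each final BAM written by the loop above:

import os

#Report mapping statistics for each sample's final BAM (hypothetical add-on)
for sample_id in sorted(d for d in os.listdir('.') if d.startswith('LKH') and os.path.isdir(d)):
    bam = os.path.join(sample_id, 'sorted_mapped_sample.bam')
    if os.path.isfile(bam):
        os.system('samtools flagstat ' + bam) #prints read counts by category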

View File

@@ -1,10 +1,10 @@
-#Author, date: Auden Cote-L'Heureux, last updated Apr 1st 2024 by GA
+#Author, date: Auden Cote-L'Heureux, last updated Aug 18th 2025 by AKG
 #Motivation: Select robust sequences from trees
 #Intent: Select clades of interest from large trees using taxonomic specifications
 #Dependencies: Python3, ete3, Biopython
 #Inputs: A folder containing: all PTLp2 output trees and all corresponding unaligned .fasta (pre-guidance) files
 #Outputs: A folder of grabbed clades and filtered unaligned fasta files
-#Example: python CladeGrabbing.py --input /Path/To/TreesandPreGuidance --target Sr_rh --min_presence 20
+#Example: python3 CladeGrabbing.py --input /Path/To/TreesandPreGuidance --target Sr_rh --min_presence 20
 #IMPORTANT: key parameters explained in "add_argument" section below
 #Dependencies
@@ -18,7 +18,7 @@ def get_args():
 parser = argparse.ArgumentParser(
     prog = 'Clade grabber, Version 2.1',
-    description = "Updated Aug 1st, 2023 by Auden Cote-L'Heureux, modified by GA Feb 13th 2024"
+    description = "Updated Aug 1st, 2023 by Auden Cote-L'Heureux, modified by AKG Aug 18th 2025"
 )
 #add_argument section with parameters explained
 parser.add_argument('-i', '--input', type = str, required = True, help = 'Path to a folder containing input trees (which must have the file extension .tre, .tree, .treefile, or .nex)')
@@ -28,6 +28,8 @@ def get_args():
 parser.add_argument('-nr', '--required_taxa_num', type = int, default = 0, help = 'The number of species belonging to taxa in the --required_taxa list that must be present in the clade. Default is 0.')
 parser.add_argument('-o', '--outgroup', type = str, default = '', help = 'A comma-separated list of any number of digits/characters (e.g. Sr_ci_S OR Am_t), or a file with the extension .txt containing a list of complete or partial taxon codes, to describe taxa that will be included as outgroups in the output unaligned fasta files (which will contain only sequences from a single selected clade, and all outgroup sequences in the tree captured by this argument).')
 parser.add_argument('-c', '--contaminants', type = float, default = 2, help = 'The number of non-ingroup contaminants allowed in a clade, or if less than 1 the proportion of sequences in a clade that can be non-ingroup (i.e. presumed contaminants). Default is to allow 2 contaminants.')
+parser.add_argument('-ft', '--first_target', type=str, default='', help='[Optional] A comma-separated list or .txt file of complete/partial taxon codes for an initial, broad clade search. If provided, the script will first find clades with these taxa before applying the main --target filter.')
+parser.add_argument('-fm', '--first_min_presence', type=int, default=0, help='[Optional] Minimum number of sequences from --first_target required in a clade for it to be used in the second-stage search. Ignored if --first_target is not provided.')
 return parser.parse_args()
@@ -85,86 +87,155 @@ def reroot(tree):
 def get_subtrees(args, file):
     newick = get_newick(args.input + '/' + file)
     tree = ete3.Tree(newick)
     majs = list(dict.fromkeys([leaf.name[:2] for leaf in tree]))
-    #Only try to reroot trees with more than 2 major clades. This was added to fix the ETE3 "Cannot set myself as outgroup" error
+    # Only try to reroot trees with more than 2 major clades (original behavior)
     if len(majs) > 2:
         tree = reroot(tree)
-    #Getting a clean list of all target taxa
-    if '.' in args.target:
-        try:
-            target_codes = [l.strip() for l in open(args.target, 'r').readlines() if l.strip() != '']
-        except AttributeError:
-            print('\n\nError: invalid "target" argument. This must be a comma-separated list of any number of digits/characters to describe focal taxa (e.g. Sr_ci_S OR Am_t), or a file with the extension .txt containing a list of complete or partial taxon codes. All sequences containing the complete/partial code will be identified as belonging to target taxa.\n\n')
-    else:
-        target_codes = [code.strip() for code in args.target.split(',') if code.strip() != '']
-    #Getting a clean list of all "at least" taxa
-    if '.' in args.required_taxa:
-        try:
-            required_taxa_codes = [l.strip() for l in open(args.required_taxa, 'r').readlines() if l.strip() != '']
-        except AttributeError:
-            print('\n\nError: invalid "required_taxa" argument. This must be a comma-separated list of any number of digits/characters (e.g. Sr_ci_S OR Am_t), or a file with the extension .txt containing a list of complete or partial taxon codes, to describe taxa that MUST be present in a clade for it to be selected (e.g. you may want at least one whole genome).\n\n')
-    else:
-        required_taxa_codes = [code.strip() for code in args.required_taxa.split(',') if code.strip() != '']
-    target_codes = list(dict.fromkeys(target_codes + required_taxa_codes))
-    #Creating a record of selected subtrees, and all of the leaves in those subtrees
-    selected_nodes = []; seen_leaves = []
-    #Iterating through all nodes in tree, starting at "root" then working towards leaves
-    for node in tree.traverse('levelorder'):
-        #If a node is large enough and is not contained in an already selected clade
-        if len(node) >= args.min_presence and len(list(set(seen_leaves) & set([leaf.name for leaf in node]))) == 0:
-            leaves = [leaf.name for leaf in node]
-            #Accounting for cases where e.g. one child is a contaminant, and the other child is a good clade with 1 fewer than the max number of contaminants
-            children_keep = 0
-            for child in node.children:
-                for code in target_codes:
-                    taken = False
-                    for leaf in child:
-                        if leaf.name.startswith(code):
-                            children_keep += 1
-                            taken = True
-                            break
-                    if taken:
-                        break
-            if children_keep == len(node.children):
-                target_leaves = set(); required_taxa_leaves = set()
-                for code in target_codes:
-                    for leaf in leaves[::-1]:
-                        #print(leaf)
-                        if leaf.startswith(code):
-                            target_leaves.add(leaf[:10])
-                            for req in required_taxa_codes:
-                                if leaf.startswith(req):
-                                    required_taxa_leaves.add(leaf[:10])
-                                    break
-                            leaves.remove(leaf)
-                #Grab a clade as a subtree if 1) it has enough target taxa; 2) it has enough "at least" taxa; 3) it does not have too many contaminants
-                if len(target_leaves) >= args.min_presence and len(required_taxa_leaves) >= args.required_taxa_num and ((args.contaminants < 1 and len(leaves) <= args.contaminants * len(target_leaves)) or len(leaves) <= args.contaminants):
-                    selected_nodes.append(node)
-                    seen_leaves.extend([leaf.name for leaf in node])
-    #Write the subtrees to output .tre files
-    for i, node in enumerate(selected_nodes[::-1]):
-        with open('Subtrees/' + '.'.join(file.split('.')[:-1]) + '_' + str(i) + '.tre', 'w') as o:
-            o.write(node.write())
+    # -------------------------------
+    # FIRST-STAGE (optional) FILTER
+    # -------------------------------
+    def get_outer_leafsets():
+        """
+        Return a list of sets, each set = leaf names of an outer clade
+        that passes --first_target, --first_min_presence, children_keep,
+        and contaminants logic (using args.contaminants).
+        If --first_target is not used, return one set containing ALL leaves.
+        """
+        if not args.first_target or args.first_min_presence == 0:
+            return [set(leaf.name for leaf in tree)] # no outer filter → whole tree
+        # Parse first_target codes
+        if '.' in args.first_target:
+            first_target_codes = [l.strip() for l in open(args.first_target, 'r').readlines() if l.strip() != '']
+        else:
+            first_target_codes = [code.strip() for code in args.first_target.split(',') if code.strip() != '']
+        outer_sets = []
+        seen_leaves = []
+        for node in tree.traverse('levelorder'):
+            # large enough and not subsumed by already accepted outer node
+            if len(node) >= args.first_min_presence and len(set(seen_leaves) & set([leaf.name for leaf in node])) == 0:
+                leaves = [leaf.name for leaf in node]
+                # children_keep logic but for first_target
+                children_keep = 0
+                for child in node.children:
+                    taken = False
+                    for code in first_target_codes:
+                        for leaf in child:
+                            if leaf.name.startswith(code):
+                                children_keep += 1
+                                taken = True
+                                break
+                        if taken:
+                            break
+                if children_keep != len(node.children):
+                    continue
+                # count first-target hits (use [:10] uniqueness like original)
+                first_hits = set()
+                for code in first_target_codes:
+                    for leaf in leaves[::-1]:
+                        if leaf.startswith(code):
+                            first_hits.add(leaf[:10])
+                            leaves.remove(leaf)
+                # contaminants logic applied to FIRST-STAGE (reuse args.contaminants)
+                passes_contam = ((args.contaminants < 1 and len(leaves) <= args.contaminants * len(first_hits)) or
+                    (args.contaminants >= 1 and len(leaves) <= args.contaminants))
+                if len(first_hits) >= args.first_min_presence and passes_contam:
+                    outer_sets.append(set(leaf.name for leaf in node))
+                    seen_leaves.extend([leaf.name for leaf in node])
+        return outer_sets
+    # Build outer sets; if user supplied first-stage args, we'll restrict inner search to these
+    using_first = bool(args.first_target) and args.first_min_presence > 0
+    outer_leafsets = get_outer_leafsets()
+    # --------------------------------
+    # ORIGINAL INNER FILTER (unchanged)
+    # --------------------------------
+    # Getting a clean list of all target taxa
+    if '.' in args.target:
+        try:
+            target_codes = [l.strip() for l in open(args.target, 'r').readlines() if l.strip() != '']
+        except AttributeError:
+            print('\n\nError: invalid "target" argument. This must be a comma-separated list of any number of digits/characters to describe focal taxa (e.g. Sr_ci_S OR Am_t), or a file with the extension .txt containing a list of complete or partial taxon codes. All sequences containing the complete/partial code will be identified as belonging to target taxa.\n\n')
+    else:
+        target_codes = [code.strip() for code in args.target.split(',') if code.strip() != '']
+    # Getting a clean list of all "at least" taxa
+    if '.' in args.required_taxa:
+        try:
+            required_taxa_codes = [l.strip() for l in open(args.required_taxa, 'r').readlines() if l.strip() != '']
+        except AttributeError:
+            print('\n\nError: invalid "required_taxa" argument. This must be a comma-separated list of any number of digits/characters (e.g. Sr_ci_S OR Am_t), or a file with the extension .txt containing a list of complete or partial taxon codes, to describe taxa that MUST be present in a clade for it to be selected (e.g. you may want at least one whole genome).\n\n')
+    else:
+        required_taxa_codes = [code.strip() for code in args.required_taxa.split(',') if code.strip() != '']
+    target_codes = list(dict.fromkeys(target_codes + required_taxa_codes))
+    # Creating a record of selected subtrees, and all of the leaves in those subtrees
+    selected_nodes = []; seen_leaves = []
+    # Iterating through all nodes in tree, starting at "root" then working towards leaves
+    for node in tree.traverse('levelorder'):
+        # If using first-stage filter, only consider nodes fully inside some outer clade
+        if using_first:
+            node_leafs = set(leaf.name for leaf in node)
+            # require subset (node fully contained in an accepted outer clade)
+            if not any(node_leafs.issubset(S) for S in outer_leafsets):
+                continue
+        # If a node is large enough and is not contained in an already selected clade
+        if len(node) >= args.min_presence and len(list(set(seen_leaves) & set([leaf.name for leaf in node]))) == 0:
+            leaves = [leaf.name for leaf in node]
+            # Accounting for cases where e.g. one child is a contaminant, and the other child is a good clade
+            children_keep = 0
+            for child in node.children:
+                for code in target_codes:
+                    taken = False
+                    for leaf in child:
+                        if leaf.name.startswith(code):
+                            children_keep += 1
+                            taken = True
+                            break
+                    if taken:
+                        break
+            if children_keep == len(node.children):
+                target_leaves = set(); required_taxa_leaves = set()
+                for code in target_codes:
+                    for leaf in leaves[::-1]:
+                        if leaf.startswith(code):
+                            target_leaves.add(leaf[:10])
+                            for req in required_taxa_codes:
+                                if leaf.startswith(req):
+                                    required_taxa_leaves.add(leaf[:10])
+                                    break
+                            leaves.remove(leaf)
+                # Grab a clade as a subtree if it passes all filters
+                if len(target_leaves) >= args.min_presence and len(required_taxa_leaves) >= args.required_taxa_num and ((args.contaminants < 1 and len(leaves) <= args.contaminants * len(target_leaves)) or len(leaves) <= args.contaminants):
+                    selected_nodes.append(node)
+                    seen_leaves.extend([leaf.name for leaf in node])
+    # Write the subtrees to output .tre files
+    for i, node in enumerate(selected_nodes[::-1]):
+        with open('Subtrees/' + '.'.join(file.split('.')[:-1]) + '_' + str(i) + '.tre', 'w') as o:
+            o.write(node.write())
def make_new_unaligned(args):
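A hedged example invocation of the two-stage filter introduced above, in the style of the header's #Example line (the path and taxon codes are placeholders); per get_outer_leafsets, the first stage only runs when --first_target is given and --first_min_presence is nonzero:
python3 CladeGrabbing.py --input /Path/To/TreesandPreGuidance --first_target Sr --first_min_presence 10 --target Sr_rh --min_presence 20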