mirror of http://43.156.76.180:8026/YuuMJ/EukPhylo.git
synced 2025-12-27 06:40:25 +08:00
Compare commits
No commits in common. "main" and "v1.0" have entirely different histories.
@@ -18,23 +18,12 @@
#SBATCH --mail-user=email@email.edu

module purge #Cleans up any loaded modules

#Unity server
module use /gridapps/modules/all
module load conda/latest
module load uri/main
module load diamond/2.1.7
module load VSEARCH/2.22.1-GCC-11.3.0
conda activate /work/pi_lkatz_smith_edu/Conda_PTL6p1

#Grid server
module use /gridapps/modules/all
module load slurm
module load tqdm/4.66.1-GCCcore-12.3.0
module load Biopython/1.79-gfbf-2023a
module load BLAST+/2.14.1-gompi-2023a
module load DIAMOND/2.1.8-GCC-12.3.0
module load VSEARCH/2.25.0-GCC-12.3.0
module load tqdm/4.62.3-GCCcore-11.2.0
module load Biopython/1.79-foss-2021b
module load BLAST+/2.12.0-gompi-2021b
module load DIAMOND/2.0.13-GCC-11.2.0
module load VSEARCH/2.22.1-GCC-11.3.0

parent='/Your/Home/Folder/'
@@ -92,7 +92,7 @@ def script_four(args):
    if os.path.exists(args.databases + '/Taxa_with_few_sequences.txt'):
        with open(args.databases + '/Taxa_with_few_sequences.txt', 'r') as f:
            content = f.read()
        print(f'These samples did not run through EukPhylo part 1 because they have no good hits to the hook database or the Diamond sequence aligner ran out of memory. We suggest you remove them and restart.')
        print(f'These samples do not run through PTL6p1, perhaps because they have no good hits to the hook. We suggest you remove them and restart.')
        print(content)
        print('Stopping Run.')
        os.remove(args.databases + '/Taxa_with_few_sequences.txt')
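The guard above is a small, self-contained pattern: read the marker file, surface its contents, delete it, and halt. A minimal sketch of the same check factored into a reusable helper (the function name and the sys.exit call are illustrative, not part of the repository):

import os, sys

def stop_if_failed_taxa(databases):
    #Same check as in script_four above: a marker file lists samples with no
    #good hook hits (or Diamond out-of-memory failures); illustrative helper
    marker = os.path.join(databases, 'Taxa_with_few_sequences.txt')
    if os.path.exists(marker):
        with open(marker) as f:
            print('These samples failed EukPhylo part 1; remove them and restart:')
            print(f.read())
        os.remove(marker)
        sys.exit('Stopping Run.')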
@@ -6,49 +6,29 @@
## needs and restrictions, followed by some example commands taken from the GitHub Wiki, more detail on which can be found
## here: https://github.com/Katzlab/EukPhylo/wiki/EukPhylo-Part-1:-GF-assignment

## SLURM-SPECIFIC SETUP BELOW

############### FOR SMITH GRID HPC ############### (DELETE section if not applicable):
## Slurm specific code

#SBATCH --job-name=EukPhylo
#SBATCH --output=EukPhylo.%j.out # Stdout (%j expands to jobId)
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --ntasks-per-node=1 ##change to the number of srun calls when running multiple instances
#SBATCH --ntasks-per-node=1 # #change to number of srun when running multiple instances
#SBATCH --mem=160G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=email@xxx.edu ##add your email address for job updates

module purge #Cleans up any loaded modules
module use /gridapps/modules/all
module load slurm
module load tqdm/4.66.1-GCCcore-12.3.0
module load Biopython/1.79-gfbf-2023a
module load BLAST+/2.14.1-gompi-2023a
module load DIAMOND/2.1.8-GCC-12.3.0
module load VSEARCH/2.25.0-GCC-12.3.0

############### FOR UMASS UNITY HPC ############### (DELETE section if not applicable):
## Slurm specific code
#SBATCH --job-name=EukPhylo
#SBATCH --output=EukPhylo.%j.out # Stdout (%j expands to jobId)
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --ntasks-per-node=64
#SBATCH --mem=40G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=email@xxx.edu
module purge #Cleans up any loaded modules
module use /gridapps/modules/all
module load conda/latest
module load uri/main
module load diamond/2.1.7
module load tqdm/4.62.3-GCCcore-11.2.0
module load Biopython/1.79-foss-2021b
module load BLAST+/2.12.0-gompi-2021b
module load DIAMOND/2.0.13-GCC-11.2.0
module load VSEARCH/2.22.1-GCC-11.3.0
conda activate /work/pi_lkatz_smith_edu/Conda_PTL6p1

## PROVIDE YOUR PARENT PATH
parent='/Your/Home/Folder/'

## EXAMPLE RUN COMMANDS BELOW
## Example commands

# A simple run that goes from script 1 to script 7 (the last script) using the Universal genetic code
srun -D ${parent}Scripts python3 ${parent}Scripts/wrapper.py --first_script 1 --last_script 7 --assembled_transcripts ${parent}AssembledTranscripts -o ${parent}Out --genetic_code ${parent}Gcode.txt --databases ${parent}Databases > log.out
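Before submitting, it can help to confirm that everything the example command references actually exists under parent. A minimal pre-flight sketch in Python, assuming only the layout implied by the srun line above (this script is illustrative, not part of EukPhylo):

import os, sys

parent = '/Your/Home/Folder/'  #same placeholder as above
#Folders/files referenced by the example srun command
expected = ['Scripts/wrapper.py', 'AssembledTranscripts', 'Gcode.txt', 'Databases']
missing = [p for p in expected if not os.path.exists(os.path.join(parent, p))]
if missing:
    sys.exit('Missing inputs under parent: ' + ', '.join(missing))
print('Layout looks complete; safe to submit the job.')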
File diff suppressed because it is too large
@@ -183,10 +183,10 @@ def run(params):
    os.system('mafft ' + tax_guidance_outdir + '/postGuidance_preTrimAl_unaligned.fasta > ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta')

    #Gap trimming
    os.system('Scripts/trimal-trimAl/source/trimal -in ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta -out ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.70gapTrimmed.fasta -gapthreshold ' + str(params.trimal_cutoff) + ' -fasta')
    os.system('Scripts/trimal-trimAl/source/trimal -in ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta -out ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fasta -gapthreshold ' + str(params.trimal_cutoff) + ' -fasta')

    #Copying over final alignments (pre and post gap trimming) into output folder.
    os.system('cp ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.70gapTrimmed.fasta ' + params.output + '/Output/Guidance/' + file.split('.')[0].split('_preguidance')[0] + '.70gapTrimmed.fasta')
    os.system('cp ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fasta ' + params.output + '/Output/Guidance/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fasta')
    os.system('cp ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta ' + params.output + '/Output/NotGapTrimmed/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta')

    #Removing intermediate files if not --keep_temp
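The repeated expression file.split('.')[0].split('_preguidance')[0] derives the gene-family stem that names every output above. A quick illustration with a hypothetical input file name (the OG identifier is made up):

file = 'OG5_126569_preguidance.fasta'  #hypothetical pre-guidance input file
stem = file.split('.')[0].split('_preguidance')[0]
print(stem)                            #OG5_126569
print(stem + '.70gapTrimmed.fasta')    #name written by the trimal call above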
@@ -225,6 +225,5 @@ def run(params):
@@ -44,7 +44,7 @@ def get_params():
    core.add_argument('--col_cutoff', default = 0.0, type = float, help = 'During guidance, columns are removed if their score is below this cutoff')
    core.add_argument('--res_cutoff', default = 0.0, type = float, help = 'During guidance, residues are removed if their score is below this cutoff')
    core.add_argument('--guidance_threads', default = 20, type = int, help = 'Number of threads to allocate to Guidance')
    core.add_argument('--trimal_cutoff', default = 0.3, type = float, help = 'Gap masking threshold for TrimAl. The maximum proportion of sequences without gaps for a site to be removed (i.e. to remove sites with 70% or more gaps, set this parameter to 0.3).')
    core.add_argument('--trimal_cutoff', default = 0.05, type = float, help = 'Gap masking threshold for TrimAl. The maximum proportion of sequences without gaps for a site to be removed (i.e. to remove sites with 95% or more gaps, set this parameter to 0.05).')
    core.add_argument('--allow_large_files', action = 'store_true', help = 'Allow files with more than 2,000 sequences to run through Guidance.')

    CL = parser.add_argument_group('Contamination loop parameters')
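Both help strings encode the same arithmetic: a column survives only if at least trimal_cutoff of its sequences have a residue there, so the most gaps a surviving column can carry is 1 minus the cutoff. A worked check (the helper name is illustrative):

def max_gap_fraction(trimal_cutoff):
    #A column is kept only if at least `trimal_cutoff` of sequences have a
    #residue there, so a surviving column holds at most 1 - cutoff gaps
    return 1 - trimal_cutoff

print(max_gap_fraction(0.3))   #0.7  -> sites with >=70% gaps are removed
print(max_gap_fraction(0.05))  #0.95 -> sites with >=95% gaps are removed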
@@ -1,55 +1,44 @@
#!/bin/bash
## Last updated Jan 2025 by Auden Cote-L'Heureux; modified Sept. 2025 by Adri K. Grow
## Last updated Jan 2025 by Auden Cote-L'Heureux; modified Feb 2025 by Adri K. Grow

## This shell script is used for running EukPhylo part 2, and includes a general setup for use on an HPC that uses
## the Slurm workload manager. It also includes several example run commands, which correspond to examples explained in more detail in the
## EukPhylo Wiki (https://github.com/Katzlab/EukPhylo/wiki/EukPhylo-Part-2:-MSAs,-trees,-and-contamination-loop).
## These run commands can also be copied and run in the terminal / command line separately, without a shell script.
## For the contamination loop, we recommend iterating the sister/subsisters loop multiple times, as branches will shift. In contrast, we recommend running clade grabbing only once.

## SLURM-SPECIFIC SETUP BELOW

############### FOR UMASS UNITY HPC ############### (DELETE section if not applicable):
#SBATCH --job-name=EukPhylo
#SBATCH -n 10 # Number of Cores per Task
#SBATCH --mem=125G # Requested Memory
#SBATCH -p cpu # Partition
#SBATCH -q long # long QOS
#SBATCH -t 334:00:00 # Job time limit
#SBATCH --output=Run_EP.%A_%a.out # Stdout (%A expands to jobId, %a to the array task ID)
#SBATCH --mail-type=ALL
#SBATCH --mail-user=email@email.edu
#SBATCH --array=1-600%50
module purge #Cleans up any loaded modules
module load conda/latest
module load mafft/7.505
module load diamond/2.1.7
conda activate /work/pi_lkatz_smith_edu/Conda_PTL6p2/envs/PTL/

############### FOR SMITH GRID HPC ############### (DELETE section if not applicable):
#SBATCH --job-name=EukPhylo # Job name
#SBATCH --output=Run_EukPhylo.%j.out # Stdout (%j expands to jobId)
#SBATCH --nodes=1
#SBATCH --ntasks=10 ## On the Smith College HPC (Grid), change this to be double the number of tasks/batches you want to launch
#SBATCH --mail-type=ALL
#SBATCH --mail-user=email@email.edu ##add your email address for job updates
#SBATCH --mail-user=email@xxx.edu ##add your email address for job updates

## UMass HPC (Unity) requirements below (DELETE section if not applicable):
#SBATCH --mem=125G # Requested Memory
#SBATCH -c 24 # Number of Cores per Task
#SBATCH -q long # Partition
#SBATCH -t 336:00:00 # Job time limit

#Load required modules
module purge # Cleans up any loaded modules
module use /gridapps/modules/all # make sure the module location is loaded
module purge #Cleans up any loaded modules
module use /gridapps/modules/all #make sure the module location is loaded
module load slurm
module load ETE/3.1.3-foss-2024a
module load Biopython/1.79-gfbf-2023a
module load DIAMOND/2.1.8-GCC-12.3.0
module load MAFFT/7.526-GCC-13.3.0-with-extensions
module load RAxML-NG/1.2.2-GCC-13.2.0
module load IQ-TREE/2.3.6-gompi-2023a
module load tqdm/4.66.1-GCCcore-12.3.0
module load Python/3.12.3-GCCcore-13.3.0
module load Guidance_mid/2.1b-foss-2023a #Smith College HPC specific
export PATH=$PATH:/beegfs/fast/katzlab/grid_phylotol_setup/programs/standard-RAxML-master #Smith College HPC specific #export PATH=$PATH:/Path/To/Executable/Files
module load ETE
module load Biopython/1.79-foss-2021b
module load DIAMOND/2.0.13-GCC-11.2.0
module load MAFFT
module load RAxML
module load IQ-TREE/2.1.2-gompi-2021b
module load tqdm/4.64.1-GCCcore-12.2.0
module load Python/3.9.6-GCCcore-11.2.0
module load Guidance_mid #Smith College HPC specific

export PATH=$PATH:/beegfs/fast/katzlab/grid_phylotol_setup/programs/standard-RAxML-master #Smith College HPC specific
#export PATH=$PATH:/Path/To/Executable/Files

## PROVIDE YOUR PARENT PATH
parent='/Your/Home/Folder/' # The folder where you are running EukPhylo (this should contain the Scripts and input data folders)

## EXAMPLE RUN COMMANDS BELOW
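The --array=1-600%50 directive in the Unity section runs this script as 600 array elements, at most 50 at a time; Slurm gives each element its own SLURM_ARRAY_TASK_ID environment variable. A sketch of how a wrapper could use that variable to pick its share of the work (the input folder name and batch math are illustrative, not EukPhylo's actual batching):

import os

task_id = int(os.environ.get('SLURM_ARRAY_TASK_ID', '1'))  #set by Slurm per array element
files = sorted(os.listdir('PreGuidance'))                  #hypothetical per-gene-family inputs
n_tasks = 600                                              #should match the --array range
batch = files[task_id - 1::n_tasks]                        #element k takes every 600th file
print(f'Array task {task_id} will process {len(batch)} files')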
@@ -4,7 +4,7 @@ Author & Date: Adri K. Grow + ChatGPT, Nov 11th 2024
Motivation: assess and rename assembled transcript or genome files for use in EukPhylo Part 1
Intention: warn if any 'transcripts.fasta' or 'contigs.fasta' files are missing or empty for an LKH, otherwise rename and copy them with their assigned 10-digit code by LKH
Input:
- a base directory containing subdirectories for each LKH, named either 'WTA_LKH<xxxx>' or 'WGA_LKH<xxxx>', each containing a 'transcripts.fasta' or 'contigs.fasta' file
- a base directory containing subdirectories for each LKH assembled file, named 'WTA_LKH<xxxx>' or 'WGA_LKH<xxxx>', each containing a 'transcripts.fasta' or 'contigs.fasta' file
- a mapping .txt file with LKH#s tab-separated with corresponding 10-digit codes
Output:
- a folder named 'renamed_transcripts|contigs' with assembled files now named by 10-digit codes; e.g. "Sr_rh_Ro04_assembledTranscripts.fasta"
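A condensed sketch of the flow this docstring describes, assuming the WTA_/WGA_ directory layout and the tab-separated mapping file above (function and folder names are illustrative, not the repository's implementation):

import os, shutil, sys

def rename_assemblies(base_dir, mapping_txt, out_dir='renamed_transcripts'):
    #Mapping file: LKH number <tab> 10-digit code, one pair per line
    codes = dict(line.strip().split('\t') for line in open(mapping_txt) if line.strip())
    os.makedirs(out_dir, exist_ok=True)
    for sub in os.listdir(base_dir):
        if not sub.startswith(('WTA_LKH', 'WGA_LKH')):
            continue
        lkh = sub.split('_', 1)[1]  #e.g. 'LKH0042'
        kind = 'transcripts.fasta' if sub.startswith('WTA') else 'contigs.fasta'
        src = os.path.join(base_dir, sub, kind)
        if not os.path.exists(src) or os.path.getsize(src) == 0:
            print(f'WARNING: missing or empty {kind} for {lkh}', file=sys.stderr)
            continue
        tag = 'assembledTranscripts' if sub.startswith('WTA') else 'contigs'
        shutil.copy(src, os.path.join(out_dir, f'{codes[lkh]}_{tag}.fasta'))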
@@ -83,4 +83,3 @@ def main():

if __name__ == "__main__":
    main()
@@ -1,56 +1,51 @@
'''
#Author, date: ?
#Uploaded: updated by Adri Grow, Aug 2025
#Intent: map a group of trimmed reads to a reference
#Dependencies: Python, HISAT2, samtools, (optional: sambamba)
#EDIT LINES: 19 & 36
#Inputs: Folder named 'TrimmedReads' containing the forward and reverse trimmed reads that start with the same unique identifier for each sample/cell
#Outputs: Folders with the names of the unique identifier (e.g. LKHs) containing the bam files
#Usage: python3 ReadMapping.py
#IMPORTANT: Lines 34-42 manipulate the output files in several different ways, including converting .sam to .bam, sorting, optional deduplicating, optional quality filtering, and retaining only mapped reads. It is the responsibility of the user to determine exactly which commands are needed for their dataset.
#Uploaded: updated by Adri Grow, 2024
#Intent: map a group of trimmed reads to a reference.
#Dependencies: Python3, hisat2, samtools, sambamba
#EDIT LINES: 18 & 32
#Inputs: Folder named 'TrimmedReads' containing all the trimmed reads.
#Outputs: Folders with the names of the LKHs containing the sam/bam files.
#Example: python ReadMapping.py
'''

import os
from Bio import SeqIO

#This first command builds your reference with HISAT
#If you've already done this, DON'T run this command! Instead, comment it out (use a # in front of it)
#It will output several files. Don't worry about them, HISAT will know what to do
os.system("hisat2-build Foram_reference.fasta Foram_Index") #Replace "Foram_reference.fasta" with your reference fasta name, and optionally change "Foram_Index" to your preferred index name
#this first command builds your reference with Hisat.
#If you've already done this, DON'T run this command! Instead, comment it out (use a # in front of it).
#It will output several files. Don't worry about them, Hisat will know what to do.
os.system("hisat2-build Foram_reference.fasta Foram_Index") #change to your reference.fasta and rename the index

folder = os.listdir("TrimmedReads") #Replace "TrimmedReads" with the name of the folder containing your trimmed reads, if different than TrimmedReads

folder.sort() #This sorts the trimmed reads folder so that all the files are passed in order
folder = os.listdir("TrimmedReads") #Insert the name of the folder which has your trimmed reads inside the quotes
folder.sort() #This sorts the folder so that all the LKHs are in order.

for x in folder:
    #This is specific to file names starting with 'LKH' unique identifiers, formatted like 'LKH###_FPE.fastq.gz'
    if "LKH" in x and "FPE" in x: #Assigning a variable to forward reads. Make sure you have both forward and reverse reads for each cell!
    if "LKH" in x and "FPE" in x: #assigning a variable to forward reads. Make sure you have both forward and reverse reads for each cell!
        FPE = x
        sample_id = FPE.split("_FPE")[0]
    if "LKH" in x and "RPE" in x: #Assigning a variable to reverse reads
    if "LKH" in x and "RPE" in x: #assigning a variable to reverse reads.
        RPE = x

    if FPE.split("_FPE")[0] == RPE.split("_RPE")[0]: #Match sample IDs dynamically
        #The next few lines are several HISAT commands that will create new files
        #If necessary, EDIT the name of the index and the name of the trimmed reads folder in the very next line only
        os.system("hisat2 -x Foram_Index -1 TrimmedReads/" +FPE+ " -2 TrimmedReads/" +RPE+ " -S sample.sam") #running HISAT2
        os.system("samtools view -bS sample.sam > sample.bam") #converts .sam file to .bam file
        os.remove("sample.sam") #remove the .sam file (already converted to .bam; sam files are large and unnecessary to keep)
        #os.system("samtools fixmate -O bam sample.bam fixmate_sample.bam") #use this command if you will be using the sambamba markdup command to remove duplicate reads (Katzlab default for transcriptomics and amplicons is to not remove duplicates)
        os.system("samtools sort -O bam -o sorted_sample.bam sample.bam") #sorts the .bam file alignments by leftmost coordinates
        #os.system("sambamba markdup -r sorted_sample.bam sorted_sample.dedup.bam") #removes duplicate reads - may not be appropriate for your study or protocols; the user will need to determine if this is best practice for their study
        #os.system("samtools view -h -b -q 40 sorted_sample.dedup.bam > sorted_sample.q40.bam") #only keeps reads with mapping quality ≥ 40; input is the dedup file but can easily be modified to use the sorted .bam file
        #os.system("samtools view -h -b -q 20 sorted_sample.dedup.bam > sorted_sample.q20.bam") #only keeps reads with mapping quality ≥ 20; input is the dedup file but can easily be modified to use the sorted .bam file
        os.system("samtools view -h -F 4 -b sorted_sample.bam > sorted_mapped_sample.bam") #only keeps mapped reads, using the sorted .bam file as input - this is the Katzlab transcriptomic and amplicon final output that should be used for continued analyses
    if(FPE[:7] == RPE[:7]):
        #The next few lines are several Hisat commands that will create new files.
        #EDIT the name of the index and the name of the trimmed reads folder in the first command below
        os.system("hisat2 -x Foram_Index -1 TrimmedReads/" +FPE+ " -2 TrimmedReads/" +RPE+ " -S sample.sam")
        os.system("samtools view -bS sample.sam > sample.bam")
        os.system("samtools fixmate -O bam sample.bam fixmate_sample.bam")
        os.system("samtools sort -O bam -o sorted_sample.bam fixmate_sample.bam")
        os.system("sambamba markdup -r sorted_sample.bam sorted_sample.dedup.bam")
        os.system("samtools view -h -b -q 40 sorted_sample.dedup.bam > sorted_sample.q40.bam")
        os.system("samtools view -h -b -q 20 sorted_sample.dedup.bam > sorted_sample.q20.bam")
        os.system("samtools view -h -F 4 -b sorted_sample.dedup.bam > defaultparameters_sample.bam")

        if not os.path.isdir(sample_id):
            os.mkdir(sample_id) #making folders with the names of the LKHs or unique identifiers
        if not os.path.isdir(x[:7]):
            os.mkdir(x[0:7]) #making folders with the names of the LKHs

        for file in os.listdir('.'): #These lines move the bam files created into the new LKH/unique identifier folders
        for file in os.listdir('.'): #These lines move the sam/bam files that Hisat creates into the new LKH folders.
            if(file.endswith('.sam') or file.endswith('.bam')):
                os.rename(file, f"{sample_id}/{file}")
                os.rename(file, x[:7] + '/' + file)

print("~~~~~~~~~~~:>~") #When the snake appears in terminal, the script has finished running for all samples/cells!
print("~~~~~~~~~~~:>~") #When the snake appears in terminal, your script has finished running!
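Because every iteration above writes to the same sample.sam/sample.bam names, a per-sample wrapper with unique file names can make reruns and debugging safer. A minimal sketch of the default (keep-duplicates, mapped-reads-only) path, assuming hisat2 and samtools are on PATH; the function name and file-name scheme are illustrative:

import os

def map_one_sample(fpe, rpe, index='Foram_Index', reads_dir='TrimmedReads'):
    #Hypothetical per-sample wrapper for the default path above: map,
    #convert, sort, keep mapped reads, with sample-specific file names
    #so nothing is overwritten between iterations
    sample = fpe.split('_FPE')[0]
    os.system(f'hisat2 -x {index} -1 {reads_dir}/{fpe} -2 {reads_dir}/{rpe} -S {sample}.sam')
    os.system(f'samtools view -bS {sample}.sam > {sample}.bam')
    os.remove(f'{sample}.sam')
    os.system(f'samtools sort -O bam -o sorted_{sample}.bam {sample}.bam')
    os.system(f'samtools view -h -F 4 -b sorted_{sample}.bam > sorted_mapped_{sample}.bam')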
@@ -1,10 +1,10 @@
#Author, date: Auden Cote-L'Heureux, last updated Aug 18th 2025 by AKG
#Author, date: Auden Cote-L'Heureux, last updated Apr 1st 2024 by GA
#Motivation: Select robust sequences from trees
#Intent: Select clades of interest from large trees using taxonomic specifications
#Dependencies: Python3, ete3, Biopython
#Inputs: A folder containing: all PTLp2 output trees and all corresponding unaligned .fasta (pre-guidance) files
#Outputs: A folder of grabbed clades and filtered unaligned fasta files
#Example: python3 CladeGrabbing.py --input /Path/To/TreesandPreGuidance --target Sr_rh --min_presence 20
#Example: python CladeGrabbing.py --input /Path/To/TreesandPreGuidance --target Sr_rh --min_presence 20
#IMPORTANT: key parameters explained in "add_argument" section below

#Dependencies
@@ -18,7 +18,7 @@ def get_args():

    parser = argparse.ArgumentParser(
        prog = 'Clade grabber, Version 2.1',
        description = "Updated Aug 1st, 2023 by Auden Cote-L'Heureux, modified by AKG Aug 18th 2025"
        description = "Updated Aug 1st, 2023 by Auden Cote-L'Heureux, modified by GA Feb 13th 2024"
    )
    #add_argument section with parameters explained
    parser.add_argument('-i', '--input', type = str, required = True, help = 'Path to a folder containing input trees (which must have the file extension .tre, .tree, .treefile, or .nex)')
@@ -28,8 +28,6 @@ def get_args():
    parser.add_argument('-nr', '--required_taxa_num', type = int, default = 0, help = 'The number of species belonging to taxa in the --required_taxa list that must be present in the clade. Default is 0.')
    parser.add_argument('-o', '--outgroup', type = str, default = '', help = 'A comma-separated list of any number of digits/characters (e.g. Sr_ci_S OR Am_t), or a file with the extension .txt containing a list of complete or partial taxon codes, to describe taxa that will be included as outgroups in the output unaligned fasta files (which will contain only sequences from a single selected clade, and all outgroup sequences in the tree captured by this argument).')
    parser.add_argument('-c', '--contaminants', type = float, default = 2, help = 'The number of non-ingroup contaminants allowed in a clade, or if less than 1 the proportion of sequences in a clade that can be non-ingroup (i.e. presumed contaminants). Default is to allow 2 contaminants.')
    parser.add_argument('-ft', '--first_target', type=str, default='', help='[Optional] A comma-separated list or .txt file of complete/partial taxon codes for an initial, broad clade search. If provided, the script will first find clades with these taxa before applying the main --target filter.')
    parser.add_argument('-fm', '--first_min_presence', type=int, default=0, help='[Optional] Minimum number of sequences from --first_target required in a clade for it to be used in the second-stage search. Ignored if --first_target is not provided.')

    return parser.parse_args()
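For instance, a two-stage search might first look for broad Sr clades and then apply the stricter Sr_rh filter inside them. An illustrative invocation combining the flags above, following the example in the header (paths and thresholds are placeholders):

#Example: python3 CladeGrabbing.py --input /Path/To/TreesandPreGuidance --first_target Sr --first_min_presence 10 --target Sr_rh --min_presence 20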
@@ -87,155 +85,86 @@ def reroot(tree):

def get_subtrees(args, file):

    newick = get_newick(args.input + '/' + file)
    tree = ete3.Tree(newick)
    newick = get_newick(args.input + '/' + file)

    majs = list(dict.fromkeys([leaf.name[:2] for leaf in tree]))
    tree = ete3.Tree(newick)

    # Only try to reroot trees with more than 2 major clades (original behavior)
    if len(majs) > 2:
        tree = reroot(tree)
    majs = list(dict.fromkeys([leaf.name[:2] for leaf in tree]))

    # -------------------------------
    # FIRST-STAGE (optional) FILTER
    # -------------------------------
    def get_outer_leafsets():
        """
        Return a list of sets, each set = leaf names of an outer clade
        that passes --first_target, --first_min_presence, children_keep,
        and contaminants logic (using args.contaminants).
        If --first_target is not used, return one set containing ALL leaves.
        """
        if not args.first_target or args.first_min_presence == 0:
            return [set(leaf.name for leaf in tree)] # no outer filter → whole tree
    #Only try to reroot trees with more than 2 major clades. This was added to fix the ETE3 "Cannot set myself as outgroup" error
    if len(majs) > 2:
        tree = reroot(tree)

        # Parse first_target codes
        if '.' in args.first_target:
            first_target_codes = [l.strip() for l in open(args.first_target, 'r').readlines() if l.strip() != '']
        else:
            first_target_codes = [code.strip() for code in args.first_target.split(',') if code.strip() != '']
    #Getting a clean list of all target taxa
    if '.' in args.target:
        try:
            target_codes = [l.strip() for l in open(args.target, 'r').readlines() if l.strip() != '']
        except AttributeError:
            print('\n\nError: invalid "target" argument. This must be a comma-separated list of any number of digits/characters to describe focal taxa (e.g. Sr_ci_S OR Am_t), or a file with the extension .txt containing a list of complete or partial taxon codes. All sequences containing the complete/partial code will be identified as belonging to target taxa.\n\n')
    else:
        target_codes = [code.strip() for code in args.target.split(',') if code.strip() != '']

        outer_sets = []
        seen_leaves = []
    #Getting a clean list of all "at least" taxa
    if '.' in args.required_taxa:
        try:
            required_taxa_codes = [l.strip() for l in open(args.required_taxa, 'r').readlines() if l.strip() != '']
        except AttributeError:
            print('\n\nError: invalid "required_taxa" argument. This must be a comma-separated list of any number of digits/characters (e.g. Sr_ci_S OR Am_t), or a file with the extension .txt containing a list of complete or partial taxon codes, to describe taxa that MUST be present in a clade for it to be selected (e.g. you may want at least one whole genome).\n\n')
    else:
        required_taxa_codes = [code.strip() for code in args.required_taxa.split(',') if code.strip() != '']

        for node in tree.traverse('levelorder'):
            # large enough and not subsumed by already accepted outer node
            if len(node) >= args.first_min_presence and len(set(seen_leaves) & set([leaf.name for leaf in node])) == 0:
                leaves = [leaf.name for leaf in node]
    target_codes = list(dict.fromkeys(target_codes + required_taxa_codes))

                # children_keep logic but for first_target
                children_keep = 0
                for child in node.children:
                    taken = False
                    for code in first_target_codes:
                        for leaf in child:
                            if leaf.name.startswith(code):
                                children_keep += 1
                                taken = True
                                break
                        if taken:
                            break
                if children_keep != len(node.children):
                    continue
    #Creating a record of selected subtrees, and all of the leaves in those subtrees
    selected_nodes = []; seen_leaves = []

                # count first-target hits (use [:10] uniqueness like original)
                first_hits = set()
                for code in first_target_codes:
                    for leaf in leaves[::-1]:
                        if leaf.startswith(code):
                            first_hits.add(leaf[:10])
                            leaves.remove(leaf)
    #Iterating through all nodes in tree, starting at "root" then working towards leaves
    for node in tree.traverse('levelorder'):
        #If a node is large enough and is not contained in an already selected clade

                # contaminants logic applied to FIRST-STAGE (reuse args.contaminants)
                passes_contam = ((args.contaminants < 1 and len(leaves) <= args.contaminants * len(first_hits)) or
                                 (args.contaminants >= 1 and len(leaves) <= args.contaminants))
        if len(node) >= args.min_presence and len(list(set(seen_leaves) & set([leaf.name for leaf in node]))) == 0:
            leaves = [leaf.name for leaf in node]

                if len(first_hits) >= args.first_min_presence and passes_contam:
                    outer_sets.append(set(leaf.name for leaf in node))
                    seen_leaves.extend([leaf.name for leaf in node])
            #Accounting for cases where e.g. one child is a contaminant, and the other child is a good clade with 1 fewer than the max number of contaminants
            children_keep = 0
            for child in node.children:
                for code in target_codes:
                    taken = False
                    for leaf in child:
                        if leaf.name.startswith(code):
                            children_keep += 1
                            taken = True
                            break
                    if taken:
                        break

        return outer_sets
            if children_keep == len(node.children):
                target_leaves = set(); required_taxa_leaves = set()

                for code in target_codes:
                    for leaf in leaves[::-1]:
                        #print(leaf)
                        if leaf.startswith(code):
                            target_leaves.add(leaf[:10])

    # Build outer sets; if user supplied first-stage args, we'll restrict inner search to these
    using_first = bool(args.first_target) and args.first_min_presence > 0
    outer_leafsets = get_outer_leafsets()
                            for req in required_taxa_codes:
                                if leaf.startswith(req):
                                    required_taxa_leaves.add(leaf[:10])
                                    break
                            leaves.remove(leaf)

    # --------------------------------
    # ORIGINAL INNER FILTER (unchanged)
    # --------------------------------

    # Getting a clean list of all target taxa
    if '.' in args.target:
        try:
            target_codes = [l.strip() for l in open(args.target, 'r').readlines() if l.strip() != '']
        except AttributeError:
            print('\n\nError: invalid "target" argument. This must be a comma-separated list of any number of digits/characters to describe focal taxa (e.g. Sr_ci_S OR Am_t), or a file with the extension .txt containing a list of complete or partial taxon codes. All sequences containing the complete/partial code will be identified as belonging to target taxa.\n\n')
    else:
        target_codes = [code.strip() for code in args.target.split(',') if code.strip() != '']

    # Getting a clean list of all "at least" taxa
    if '.' in args.required_taxa:
        try:
            required_taxa_codes = [l.strip() for l in open(args.required_taxa, 'r').readlines() if l.strip() != '']
        except AttributeError:
            print('\n\nError: invalid "required_taxa" argument. This must be a comma-separated list of any number of digits/characters (e.g. Sr_ci_S OR Am_t), or a file with the extension .txt containing a list of complete or partial taxon codes, to describe taxa that MUST be present in a clade for it to be selected (e.g. you may want at least one whole genome).\n\n')
    else:
        required_taxa_codes = [code.strip() for code in args.required_taxa.split(',') if code.strip() != '']

    target_codes = list(dict.fromkeys(target_codes + required_taxa_codes))

    # Creating a record of selected subtrees, and all of the leaves in those subtrees
    selected_nodes = []; seen_leaves = []

    # Iterating through all nodes in tree, starting at "root" then working towards leaves
    for node in tree.traverse('levelorder'):
        # If using first-stage filter, only consider nodes fully inside some outer clade
        if using_first:
            node_leafs = set(leaf.name for leaf in node)
            # require subset (node fully contained in an accepted outer clade)
            if not any(node_leafs.issubset(S) for S in outer_leafsets):
                continue

        # If a node is large enough and is not contained in an already selected clade
        if len(node) >= args.min_presence and len(list(set(seen_leaves) & set([leaf.name for leaf in node]))) == 0:
            leaves = [leaf.name for leaf in node]

            # Accounting for cases where e.g. one child is a contaminant, and the other child is a good clade
            children_keep = 0
            for child in node.children:
                for code in target_codes:
                    taken = False
                    for leaf in child:
                        if leaf.name.startswith(code):
                            children_keep += 1
                            taken = True
                            break
                    if taken:
                        break

            if children_keep == len(node.children):
                target_leaves = set(); required_taxa_leaves = set()

                for code in target_codes:
                    for leaf in leaves[::-1]:
                        if leaf.startswith(code):
                            target_leaves.add(leaf[:10])

                            for req in required_taxa_codes:
                                if leaf.startswith(req):
                                    required_taxa_leaves.add(leaf[:10])
                                    break
                            leaves.remove(leaf)

                # Grab a clade as a subtree if it passes all filters
                if len(target_leaves) >= args.min_presence and len(required_taxa_leaves) >= args.required_taxa_num and ((args.contaminants < 1 and len(leaves) <= args.contaminants * len(target_leaves)) or len(leaves) <= args.contaminants):
                    selected_nodes.append(node)
                    seen_leaves.extend([leaf.name for leaf in node])

    # Write the subtrees to output .tre files
    for i, node in enumerate(selected_nodes[::-1]):
        with open('Subtrees/' + '.'.join(file.split('.')[:-1]) + '_' + str(i) + '.tre', 'w') as o:
            o.write(node.write())
                #Grab a clade as a subtree if 1) it has enough target taxa; 2) it has enough "at least" taxa; 3) it does not have too many contaminants
                if len(target_leaves) >= args.min_presence and len(required_taxa_leaves) >= args.required_taxa_num and ((args.contaminants < 1 and len(leaves) <= args.contaminants * len(target_leaves)) or len(leaves) <= args.contaminants):
                    selected_nodes.append(node)
                    seen_leaves.extend([leaf.name for leaf in node])
    #Write the subtrees to output .tre files
    for i, node in enumerate(selected_nodes[::-1]):
        with open('Subtrees/' + '.'.join(file.split('.')[:-1]) + '_' + str(i) + '.tre', 'w') as o:
            o.write(node.write())


def make_new_unaligned(args):
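One detail worth calling out in the filters above: args.contaminants switches meaning at 1, acting as a proportion of target leaves below that value and as an absolute count at or above it. A small sketch of that rule in isolation (the function and variable names are illustrative):

def clade_passes(n_target, n_other, contaminants):
    #Mirror of the contaminant check above: values < 1 are read as a
    #proportion of target leaves, values >= 1 as an absolute count
    if contaminants < 1:
        return n_other <= contaminants * n_target
    return n_other <= contaminants

print(clade_passes(20, 2, 2))    #True: 2 contaminants allowed by default
print(clade_passes(20, 3, 2))    #False: over the absolute cap
print(clade_passes(20, 3, 0.2))  #True: 3 <= 0.2 * 20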