annotating contamination.py

This commit is contained in:
Auden Cote-L'Heureux 2024-02-07 12:48:38 -05:00 committed by GitHub
parent 23b81f83f7
commit 920b3b4ad4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,6 +1,21 @@
#Updated by ACL on 1/16 to fix subsisters # Last updated Jan 2024
# Authors: Auden Cote-L'Heureux, Mario Ceron-Romero.
# This script contains the entirety of the contamination loop, an iterative tool to assess
# and remove contamination based on analyses of single gene trees. This tool allows for the
# use of one or both of two main methods of contamination assessment informed by tree topology.
# The first method “clade-based contamination removal” is intended for cases when the
# user is interested in genes present in a group of organisms with multiple representative
# samples and/or species in the gene trees. For a given set of target taxa, this method identifies
# robust, monophyletic clades containing those taxa within each gene tree, and removes all other
# sequences belonging to the group of target taxa. The second available method is 'sister-based
# contamination removal', where sequences are removed based on their sister relationship to
# user-specified contaminants. After sequences are removed in each iteration, the loop re-aligns and
# re-builds each tree excluding all removed sequences, and then repeats a specified number of times
# (--n_loops). Running in botb of these modes requires the user to input several parameters,
# which can be found in the manual, and in the 'CL' argument group in utils.py.
#Dependencies
import os, sys, re import os, sys, re
from Bio import SeqIO from Bio import SeqIO
import ete3 import ete3
@ -8,7 +23,7 @@ import guidance
import trees import trees
from statistics import mean from statistics import mean
#Utility function to extract Newick strings from Nexus files.
def get_newick(fname): def get_newick(fname):
newick = '' newick = ''
@ -59,7 +74,7 @@ def reroot(tree):
return tree return tree
#Clade-based contamination removal
def get_subtrees(args, file): def get_subtrees(args, file):
newick = get_newick(file) newick = get_newick(file)
@ -71,6 +86,7 @@ def get_subtrees(args, file):
except: except:
print('\nUnable to re-root the tree ' + file + ' (maybe it had only 1 major clade, or an inconvenient polytomy). Skipping this step and continuing to try to grab robust clades from the tree.\n') print('\nUnable to re-root the tree ' + file + ' (maybe it had only 1 major clade, or an inconvenient polytomy). Skipping this step and continuing to try to grab robust clades from the tree.\n')
#Reading in clade grabbing exceptions
exceptions = [] exceptions = []
if args.clade_grabbing_exceptions != None: if args.clade_grabbing_exceptions != None:
if os.path.isfile(args.clade_grabbing_exceptions): if os.path.isfile(args.clade_grabbing_exceptions):
@ -79,6 +95,7 @@ def get_subtrees(args, file):
print('\nError: it looks like you tried to input a clade grabbing exceptions file, but it could not be found.\n') print('\nError: it looks like you tried to input a clade grabbing exceptions file, but it could not be found.\n')
exit() exit()
#Reading clade grabbing rules
rules_per_clade = [] rules_per_clade = []
if args.clade_grabbing_rules_file != None: if args.clade_grabbing_rules_file != None:
if os.path.isfile(args.clade_grabbing_rules_file): if os.path.isfile(args.clade_grabbing_rules_file):
@ -95,7 +112,8 @@ def get_subtrees(args, file):
exit() exit()
else: else:
rules_per_clade.append({ 'target_taxa' : args.target_taxa, 'num_contams' : args.num_contams, 'min_target_presence' : args.min_target_presence, 'required_taxa' : args.required_taxa, 'required_taxa_num' : args.required_taxa_num }) rules_per_clade.append({ 'target_taxa' : args.target_taxa, 'num_contams' : args.num_contams, 'min_target_presence' : args.min_target_presence, 'required_taxa' : args.required_taxa, 'required_taxa_num' : args.required_taxa_num })
#Reformatting rules
for clade in rules_per_clade: for clade in rules_per_clade:
if os.path.isfile(clade['target_taxa']): if os.path.isfile(clade['target_taxa']):
try: try:
@ -122,6 +140,7 @@ def get_subtrees(args, file):
#Creating a record of selected subtrees, and all of the leaves in those subtrees #Creating a record of selected subtrees, and all of the leaves in those subtrees
selected_leaves = [] selected_leaves = []
#For each set of rules (set of target taxa)
for clade in rules_per_clade: for clade in rules_per_clade:
seen_leaves = [] seen_leaves = []
@ -174,7 +193,7 @@ def get_subtrees(args, file):
return seqs2keep return seqs2keep
#Sisters-based contamination removal
def get_sisters(args, file, sister_contam_per_tax, subsister_contam_per_tax): def get_sisters(args, file, sister_contam_per_tax, subsister_contam_per_tax):
seqs2remove = [] seqs2remove = []
@ -192,6 +211,7 @@ def get_sisters(args, file, sister_contam_per_tax, subsister_contam_per_tax):
all_taxa_in_tree = [leaf.name[:10] for leaf in tree] all_taxa_in_tree = [leaf.name[:10] for leaf in tree]
#Reading in cocontaminants
coconts = { } coconts = { }
if args.cocontaminants != None: if args.cocontaminants != None:
if os.path.isfile(args.cocontaminants): if os.path.isfile(args.cocontaminants):
@ -237,6 +257,7 @@ def get_sisters(args, file, sister_contam_per_tax, subsister_contam_per_tax):
#Create a record of the sister sequences #Create a record of the sister sequences
sisters = list(dict.fromkeys(sisters)) sisters = list(dict.fromkeys(sisters))
#Getting list of removable sequences by sister relationships
sisters_removable = []; bls = [] sisters_removable = []; bls = []
for contam in bad_sisters: for contam in bad_sisters:
for sister in sisters: for sister in sisters:
@ -244,6 +265,7 @@ def get_sisters(args, file, sister_contam_per_tax, subsister_contam_per_tax):
sisters_removable.append(sister) sisters_removable.append(sister)
bls.append(bad_sisters[contam]) bls.append(bad_sisters[contam])
#Getting list of removable sequences by sub-sister relationships
subsisters_removable = [] subsisters_removable = []
for contam in bad_subsisters: for contam in bad_subsisters:
for sub_sister in sub_sisters: for sub_sister in sub_sisters:
@ -262,7 +284,7 @@ def get_sisters(args, file, sister_contam_per_tax, subsister_contam_per_tax):
return [leaf.name for leaf in tree if leaf.name not in seqs2remove] return [leaf.name for leaf in tree if leaf.name not in seqs2remove]
#Creating new unaligned file without the removed sequences
def write_new_preguidance(params, seqs2keep, seqs_per_og, tree_file): def write_new_preguidance(params, seqs2keep, seqs_per_og, tree_file):
if params.cl_exclude_taxa != None: if params.cl_exclude_taxa != None:
@ -294,6 +316,7 @@ def write_new_preguidance(params, seqs2keep, seqs_per_og, tree_file):
return seq_file[0], seqs_removed_from_og return seq_file[0], seqs_removed_from_og
#Utility function to run MAFFT in between iterations (if this is the chosen alignment method)
def cl_mafft(params): def cl_mafft(params):
for file in os.listdir(params.output + '/Output/Pre-Guidance'): for file in os.listdir(params.output + '/Output/Pre-Guidance'):
@ -302,14 +325,14 @@ def cl_mafft(params):
os.system('Scripts/trimal-trimAl/source/trimal -in ' + params.output + '/Output/NotGapTrimmed/' + file + ' -out ' + params.output + '/Output/Guidance/' + file.split('.')[0] + '.95gapTrimmed.fasta' + ' -gapthreshold 0.05 -fasta') os.system('Scripts/trimal-trimAl/source/trimal -in ' + params.output + '/Output/NotGapTrimmed/' + file + ' -out ' + params.output + '/Output/Guidance/' + file.split('.')[0] + '.95gapTrimmed.fasta' + ' -gapthreshold 0.05 -fasta')
#Utility function to run FastTree in between iterations (if this is the chosen tree-building method)
def cl_fasttree(params): def cl_fasttree(params):
for file in os.listdir(params.output + '/Output/Guidance'): for file in os.listdir(params.output + '/Output/Guidance'):
if file.split('.')[-1] in ('fasta', 'fas', 'faa'): if file.split('.')[-1] in ('fasta', 'fas', 'faa'):
os.system('FastTree ' + params.output + '/Output/Guidance/' + file + ' > ' + params.output + '/Output/Trees/' + file.split('.')[0] + '.FastTree.tre') os.system('FastTree ' + params.output + '/Output/Guidance/' + file + ' > ' + params.output + '/Output/Trees/' + file.split('.')[0] + '.FastTree.tre')
#Wrapper script to manage parameters and iteration
def run(params): def run(params):
seqs_removed = [] seqs_removed = []
@ -318,8 +341,11 @@ def run(params):
with open('SequencesRemoved_ContaminationLoop.txt', 'w') as o: with open('SequencesRemoved_ContaminationLoop.txt', 'w') as o:
o.write('Sequence\tLoopRemoved\n') o.write('Sequence\tLoopRemoved\n')
#For each iteration
for loop in range(params.nloops): for loop in range(params.nloops):
seqs_removed_loop = [] seqs_removed_loop = []
#Finding input files
if params.start == 'raw': if params.start == 'raw':
seqs_per_og = { file : { rec.id : str(rec.seq) for rec in SeqIO.parse(params.output + '/Output/Pre-Guidance/' + file, 'fasta') } for file in os.listdir(params.output + '/Output/Pre-Guidance') if file.split('.')[-1] in ('fasta', 'fas', 'faa') } seqs_per_og = { file : { rec.id : str(rec.seq) for rec in SeqIO.parse(params.output + '/Output/Pre-Guidance/' + file, 'fasta') } for file in os.listdir(params.output + '/Output/Pre-Guidance') if file.split('.')[-1] in ('fasta', 'fas', 'faa') }
elif params.start in ('unaligned', 'aligned', 'trees'): elif params.start in ('unaligned', 'aligned', 'trees'):
@ -329,12 +355,12 @@ def run(params):
for file in os.listdir(params.data): for file in os.listdir(params.data):
if file.split('.')[-1] in ('tre', 'tree', 'treefile'): if file.split('.')[-1] in ('tre', 'tree', 'treefile'):
os.system('cp ' + params.data + '/' + file + ' ' + params.output + '/Output/Trees') os.system('cp ' + params.data + '/' + file + ' ' + params.output + '/Output/Trees')
if loop > 0 or params.start == 'raw': if loop > 0 or params.start == 'raw':
os.system('mv ' + params.output + '/Output/Pre-Guidance ' + params.output + '/Output/Pre-Guidance_' + str(loop)) os.system('mv ' + params.output + '/Output/Pre-Guidance ' + params.output + '/Output/Pre-Guidance_' + str(loop))
os.mkdir(params.output + '/Output/Pre-Guidance') os.mkdir(params.output + '/Output/Pre-Guidance')
#Wrapper for running clade-based contamination removal on all trees
if params.contamination_loop == 'clade': if params.contamination_loop == 'clade':
for tree_file in os.listdir(params.output + '/Output/Trees'): for tree_file in os.listdir(params.output + '/Output/Trees'):
if tree_file.split('.')[-1] in ('tre', 'tree', 'treefile', 'nex') and tree_file not in completed_ogs: if tree_file.split('.')[-1] in ('tre', 'tree', 'treefile', 'nex') and tree_file not in completed_ogs:
@ -346,7 +372,7 @@ def run(params):
completed_ogs.append(tree_file) completed_ogs.append(tree_file)
else: else:
seqs_removed_loop += [seq for seq in seqs_per_og[seq_file] if seq not in seqs2keep and seq not in seqs_removed] seqs_removed_loop += [seq for seq in seqs_per_og[seq_file] if seq not in seqs2keep and seq not in seqs_removed]
#Wrapper for running sisters-based contamination removal on all trees
elif params.contamination_loop == 'seq': elif params.contamination_loop == 'seq':
sister_contam_per_tax = { } sister_contam_per_tax = { }
@ -385,12 +411,14 @@ def run(params):
completed_ogs.append(tree_file) completed_ogs.append(tree_file)
else: else:
seqs_removed_loop += [seq for seq in seqs_per_og[seq_file] if seq not in seqs2keep and seq not in seqs_removed] seqs_removed_loop += [seq for seq in seqs_per_og[seq_file] if seq not in seqs2keep and seq not in seqs_removed]
#Keeping record of removed sequences
seqs_removed += seqs_removed_loop seqs_removed += seqs_removed_loop
with open(params.output + '/Output/SequencesRemoved_ContaminationLoop.txt', 'a') as o: with open(params.output + '/Output/SequencesRemoved_ContaminationLoop.txt', 'a') as o:
for seq in seqs_removed_loop: for seq in seqs_removed_loop:
o.write(seq + '\t' + str(loop) + '\n') o.write(seq + '\t' + str(loop) + '\n')
#Writing output files with sequences removed, with the iteration labeled
os.system('mv ' + params.output + '/Output/Trees ' + params.output + '/Output/Trees_' + str(loop)) os.system('mv ' + params.output + '/Output/Trees ' + params.output + '/Output/Trees_' + str(loop))
os.mkdir(params.output + '/Output/Trees') os.mkdir(params.output + '/Output/Trees')
@ -399,11 +427,12 @@ def run(params):
os.system('mv ' + params.output + '/Output/NotGapTrimmed ' + params.output + '/Output/NotGapTrimmed_' + str(loop)) os.system('mv ' + params.output + '/Output/NotGapTrimmed ' + params.output + '/Output/NotGapTrimmed_' + str(loop))
os.mkdir(params.output + '/Output/NotGapTrimmed') os.mkdir(params.output + '/Output/NotGapTrimmed')
params.start = 'unaligned' params.start = 'unaligned'
params.end = 'trees' params.end = 'trees'
params.tree_method = params.cl_tree_method params.tree_method = params.cl_tree_method
#Re-aligning and building trees without contaminant sequences... then ready for next iteration.
if params.cl_alignment_method == 'mafft_only': if params.cl_alignment_method == 'mafft_only':
cl_mafft(params) cl_mafft(params)
else: else: