mirror of
http://43.156.76.180:8026/YuuMJ/EukPhylo.git
synced 2026-02-10 21:10:25 +08:00
annotating contamination.py
This commit is contained in:
parent
23b81f83f7
commit
920b3b4ad4
@ -1,6 +1,21 @@
|
|||||||
#Updated by ACL on 1/16 to fix subsisters
|
# Last updated Jan 2024
|
||||||
|
# Authors: Auden Cote-L'Heureux, Mario Ceron-Romero.
|
||||||
|
|
||||||
|
# This script contains the entirety of the contamination loop, an iterative tool to assess
|
||||||
|
# and remove contamination based on analyses of single gene trees. This tool allows for the
|
||||||
|
# use of one or both of two main methods of contamination assessment informed by tree topology.
|
||||||
|
# The first method – “clade-based contamination removal” – is intended for cases when the
|
||||||
|
# user is interested in genes present in a group of organisms with multiple representative
|
||||||
|
# samples and/or species in the gene trees. For a given set of target taxa, this method identifies
|
||||||
|
# robust, monophyletic clades containing those taxa within each gene tree, and removes all other
|
||||||
|
# sequences belonging to the group of target taxa. The second available method is 'sister-based
|
||||||
|
# contamination removal', where sequences are removed based on their sister relationship to
|
||||||
|
# user-specified contaminants. After sequences are removed in each iteration, the loop re-aligns and
|
||||||
|
# re-builds each tree excluding all removed sequences, and then repeats a specified number of times
|
||||||
|
# (--n_loops). Running in botb of these modes requires the user to input several parameters,
|
||||||
|
# which can be found in the manual, and in the 'CL' argument group in utils.py.
|
||||||
|
|
||||||
|
#Dependencies
|
||||||
import os, sys, re
|
import os, sys, re
|
||||||
from Bio import SeqIO
|
from Bio import SeqIO
|
||||||
import ete3
|
import ete3
|
||||||
@ -8,7 +23,7 @@ import guidance
|
|||||||
import trees
|
import trees
|
||||||
from statistics import mean
|
from statistics import mean
|
||||||
|
|
||||||
|
#Utility function to extract Newick strings from Nexus files.
|
||||||
def get_newick(fname):
|
def get_newick(fname):
|
||||||
|
|
||||||
newick = ''
|
newick = ''
|
||||||
@ -59,7 +74,7 @@ def reroot(tree):
|
|||||||
|
|
||||||
return tree
|
return tree
|
||||||
|
|
||||||
|
#Clade-based contamination removal
|
||||||
def get_subtrees(args, file):
|
def get_subtrees(args, file):
|
||||||
|
|
||||||
newick = get_newick(file)
|
newick = get_newick(file)
|
||||||
@ -71,6 +86,7 @@ def get_subtrees(args, file):
|
|||||||
except:
|
except:
|
||||||
print('\nUnable to re-root the tree ' + file + ' (maybe it had only 1 major clade, or an inconvenient polytomy). Skipping this step and continuing to try to grab robust clades from the tree.\n')
|
print('\nUnable to re-root the tree ' + file + ' (maybe it had only 1 major clade, or an inconvenient polytomy). Skipping this step and continuing to try to grab robust clades from the tree.\n')
|
||||||
|
|
||||||
|
#Reading in clade grabbing exceptions
|
||||||
exceptions = []
|
exceptions = []
|
||||||
if args.clade_grabbing_exceptions != None:
|
if args.clade_grabbing_exceptions != None:
|
||||||
if os.path.isfile(args.clade_grabbing_exceptions):
|
if os.path.isfile(args.clade_grabbing_exceptions):
|
||||||
@ -79,6 +95,7 @@ def get_subtrees(args, file):
|
|||||||
print('\nError: it looks like you tried to input a clade grabbing exceptions file, but it could not be found.\n')
|
print('\nError: it looks like you tried to input a clade grabbing exceptions file, but it could not be found.\n')
|
||||||
exit()
|
exit()
|
||||||
|
|
||||||
|
#Reading clade grabbing rules
|
||||||
rules_per_clade = []
|
rules_per_clade = []
|
||||||
if args.clade_grabbing_rules_file != None:
|
if args.clade_grabbing_rules_file != None:
|
||||||
if os.path.isfile(args.clade_grabbing_rules_file):
|
if os.path.isfile(args.clade_grabbing_rules_file):
|
||||||
@ -95,7 +112,8 @@ def get_subtrees(args, file):
|
|||||||
exit()
|
exit()
|
||||||
else:
|
else:
|
||||||
rules_per_clade.append({ 'target_taxa' : args.target_taxa, 'num_contams' : args.num_contams, 'min_target_presence' : args.min_target_presence, 'required_taxa' : args.required_taxa, 'required_taxa_num' : args.required_taxa_num })
|
rules_per_clade.append({ 'target_taxa' : args.target_taxa, 'num_contams' : args.num_contams, 'min_target_presence' : args.min_target_presence, 'required_taxa' : args.required_taxa, 'required_taxa_num' : args.required_taxa_num })
|
||||||
|
|
||||||
|
#Reformatting rules
|
||||||
for clade in rules_per_clade:
|
for clade in rules_per_clade:
|
||||||
if os.path.isfile(clade['target_taxa']):
|
if os.path.isfile(clade['target_taxa']):
|
||||||
try:
|
try:
|
||||||
@ -122,6 +140,7 @@ def get_subtrees(args, file):
|
|||||||
#Creating a record of selected subtrees, and all of the leaves in those subtrees
|
#Creating a record of selected subtrees, and all of the leaves in those subtrees
|
||||||
selected_leaves = []
|
selected_leaves = []
|
||||||
|
|
||||||
|
#For each set of rules (set of target taxa)
|
||||||
for clade in rules_per_clade:
|
for clade in rules_per_clade:
|
||||||
seen_leaves = []
|
seen_leaves = []
|
||||||
|
|
||||||
@ -174,7 +193,7 @@ def get_subtrees(args, file):
|
|||||||
|
|
||||||
return seqs2keep
|
return seqs2keep
|
||||||
|
|
||||||
|
#Sisters-based contamination removal
|
||||||
def get_sisters(args, file, sister_contam_per_tax, subsister_contam_per_tax):
|
def get_sisters(args, file, sister_contam_per_tax, subsister_contam_per_tax):
|
||||||
|
|
||||||
seqs2remove = []
|
seqs2remove = []
|
||||||
@ -192,6 +211,7 @@ def get_sisters(args, file, sister_contam_per_tax, subsister_contam_per_tax):
|
|||||||
|
|
||||||
all_taxa_in_tree = [leaf.name[:10] for leaf in tree]
|
all_taxa_in_tree = [leaf.name[:10] for leaf in tree]
|
||||||
|
|
||||||
|
#Reading in cocontaminants
|
||||||
coconts = { }
|
coconts = { }
|
||||||
if args.cocontaminants != None:
|
if args.cocontaminants != None:
|
||||||
if os.path.isfile(args.cocontaminants):
|
if os.path.isfile(args.cocontaminants):
|
||||||
@ -237,6 +257,7 @@ def get_sisters(args, file, sister_contam_per_tax, subsister_contam_per_tax):
|
|||||||
#Create a record of the sister sequences
|
#Create a record of the sister sequences
|
||||||
sisters = list(dict.fromkeys(sisters))
|
sisters = list(dict.fromkeys(sisters))
|
||||||
|
|
||||||
|
#Getting list of removable sequences by sister relationships
|
||||||
sisters_removable = []; bls = []
|
sisters_removable = []; bls = []
|
||||||
for contam in bad_sisters:
|
for contam in bad_sisters:
|
||||||
for sister in sisters:
|
for sister in sisters:
|
||||||
@ -244,6 +265,7 @@ def get_sisters(args, file, sister_contam_per_tax, subsister_contam_per_tax):
|
|||||||
sisters_removable.append(sister)
|
sisters_removable.append(sister)
|
||||||
bls.append(bad_sisters[contam])
|
bls.append(bad_sisters[contam])
|
||||||
|
|
||||||
|
#Getting list of removable sequences by sub-sister relationships
|
||||||
subsisters_removable = []
|
subsisters_removable = []
|
||||||
for contam in bad_subsisters:
|
for contam in bad_subsisters:
|
||||||
for sub_sister in sub_sisters:
|
for sub_sister in sub_sisters:
|
||||||
@ -262,7 +284,7 @@ def get_sisters(args, file, sister_contam_per_tax, subsister_contam_per_tax):
|
|||||||
|
|
||||||
return [leaf.name for leaf in tree if leaf.name not in seqs2remove]
|
return [leaf.name for leaf in tree if leaf.name not in seqs2remove]
|
||||||
|
|
||||||
|
#Creating new unaligned file without the removed sequences
|
||||||
def write_new_preguidance(params, seqs2keep, seqs_per_og, tree_file):
|
def write_new_preguidance(params, seqs2keep, seqs_per_og, tree_file):
|
||||||
|
|
||||||
if params.cl_exclude_taxa != None:
|
if params.cl_exclude_taxa != None:
|
||||||
@ -294,6 +316,7 @@ def write_new_preguidance(params, seqs2keep, seqs_per_og, tree_file):
|
|||||||
return seq_file[0], seqs_removed_from_og
|
return seq_file[0], seqs_removed_from_og
|
||||||
|
|
||||||
|
|
||||||
|
#Utility function to run MAFFT in between iterations (if this is the chosen alignment method)
|
||||||
def cl_mafft(params):
|
def cl_mafft(params):
|
||||||
|
|
||||||
for file in os.listdir(params.output + '/Output/Pre-Guidance'):
|
for file in os.listdir(params.output + '/Output/Pre-Guidance'):
|
||||||
@ -302,14 +325,14 @@ def cl_mafft(params):
|
|||||||
|
|
||||||
os.system('Scripts/trimal-trimAl/source/trimal -in ' + params.output + '/Output/NotGapTrimmed/' + file + ' -out ' + params.output + '/Output/Guidance/' + file.split('.')[0] + '.95gapTrimmed.fasta' + ' -gapthreshold 0.05 -fasta')
|
os.system('Scripts/trimal-trimAl/source/trimal -in ' + params.output + '/Output/NotGapTrimmed/' + file + ' -out ' + params.output + '/Output/Guidance/' + file.split('.')[0] + '.95gapTrimmed.fasta' + ' -gapthreshold 0.05 -fasta')
|
||||||
|
|
||||||
|
#Utility function to run FastTree in between iterations (if this is the chosen tree-building method)
|
||||||
def cl_fasttree(params):
|
def cl_fasttree(params):
|
||||||
|
|
||||||
for file in os.listdir(params.output + '/Output/Guidance'):
|
for file in os.listdir(params.output + '/Output/Guidance'):
|
||||||
if file.split('.')[-1] in ('fasta', 'fas', 'faa'):
|
if file.split('.')[-1] in ('fasta', 'fas', 'faa'):
|
||||||
os.system('FastTree ' + params.output + '/Output/Guidance/' + file + ' > ' + params.output + '/Output/Trees/' + file.split('.')[0] + '.FastTree.tre')
|
os.system('FastTree ' + params.output + '/Output/Guidance/' + file + ' > ' + params.output + '/Output/Trees/' + file.split('.')[0] + '.FastTree.tre')
|
||||||
|
|
||||||
|
#Wrapper script to manage parameters and iteration
|
||||||
def run(params):
|
def run(params):
|
||||||
|
|
||||||
seqs_removed = []
|
seqs_removed = []
|
||||||
@ -318,8 +341,11 @@ def run(params):
|
|||||||
with open('SequencesRemoved_ContaminationLoop.txt', 'w') as o:
|
with open('SequencesRemoved_ContaminationLoop.txt', 'w') as o:
|
||||||
o.write('Sequence\tLoopRemoved\n')
|
o.write('Sequence\tLoopRemoved\n')
|
||||||
|
|
||||||
|
#For each iteration
|
||||||
for loop in range(params.nloops):
|
for loop in range(params.nloops):
|
||||||
seqs_removed_loop = []
|
seqs_removed_loop = []
|
||||||
|
|
||||||
|
#Finding input files
|
||||||
if params.start == 'raw':
|
if params.start == 'raw':
|
||||||
seqs_per_og = { file : { rec.id : str(rec.seq) for rec in SeqIO.parse(params.output + '/Output/Pre-Guidance/' + file, 'fasta') } for file in os.listdir(params.output + '/Output/Pre-Guidance') if file.split('.')[-1] in ('fasta', 'fas', 'faa') }
|
seqs_per_og = { file : { rec.id : str(rec.seq) for rec in SeqIO.parse(params.output + '/Output/Pre-Guidance/' + file, 'fasta') } for file in os.listdir(params.output + '/Output/Pre-Guidance') if file.split('.')[-1] in ('fasta', 'fas', 'faa') }
|
||||||
elif params.start in ('unaligned', 'aligned', 'trees'):
|
elif params.start in ('unaligned', 'aligned', 'trees'):
|
||||||
@ -329,12 +355,12 @@ def run(params):
|
|||||||
for file in os.listdir(params.data):
|
for file in os.listdir(params.data):
|
||||||
if file.split('.')[-1] in ('tre', 'tree', 'treefile'):
|
if file.split('.')[-1] in ('tre', 'tree', 'treefile'):
|
||||||
os.system('cp ' + params.data + '/' + file + ' ' + params.output + '/Output/Trees')
|
os.system('cp ' + params.data + '/' + file + ' ' + params.output + '/Output/Trees')
|
||||||
|
|
||||||
if loop > 0 or params.start == 'raw':
|
if loop > 0 or params.start == 'raw':
|
||||||
os.system('mv ' + params.output + '/Output/Pre-Guidance ' + params.output + '/Output/Pre-Guidance_' + str(loop))
|
os.system('mv ' + params.output + '/Output/Pre-Guidance ' + params.output + '/Output/Pre-Guidance_' + str(loop))
|
||||||
|
|
||||||
os.mkdir(params.output + '/Output/Pre-Guidance')
|
os.mkdir(params.output + '/Output/Pre-Guidance')
|
||||||
|
|
||||||
|
#Wrapper for running clade-based contamination removal on all trees
|
||||||
if params.contamination_loop == 'clade':
|
if params.contamination_loop == 'clade':
|
||||||
for tree_file in os.listdir(params.output + '/Output/Trees'):
|
for tree_file in os.listdir(params.output + '/Output/Trees'):
|
||||||
if tree_file.split('.')[-1] in ('tre', 'tree', 'treefile', 'nex') and tree_file not in completed_ogs:
|
if tree_file.split('.')[-1] in ('tre', 'tree', 'treefile', 'nex') and tree_file not in completed_ogs:
|
||||||
@ -346,7 +372,7 @@ def run(params):
|
|||||||
completed_ogs.append(tree_file)
|
completed_ogs.append(tree_file)
|
||||||
else:
|
else:
|
||||||
seqs_removed_loop += [seq for seq in seqs_per_og[seq_file] if seq not in seqs2keep and seq not in seqs_removed]
|
seqs_removed_loop += [seq for seq in seqs_per_og[seq_file] if seq not in seqs2keep and seq not in seqs_removed]
|
||||||
|
#Wrapper for running sisters-based contamination removal on all trees
|
||||||
elif params.contamination_loop == 'seq':
|
elif params.contamination_loop == 'seq':
|
||||||
|
|
||||||
sister_contam_per_tax = { }
|
sister_contam_per_tax = { }
|
||||||
@ -385,12 +411,14 @@ def run(params):
|
|||||||
completed_ogs.append(tree_file)
|
completed_ogs.append(tree_file)
|
||||||
else:
|
else:
|
||||||
seqs_removed_loop += [seq for seq in seqs_per_og[seq_file] if seq not in seqs2keep and seq not in seqs_removed]
|
seqs_removed_loop += [seq for seq in seqs_per_og[seq_file] if seq not in seqs2keep and seq not in seqs_removed]
|
||||||
|
|
||||||
|
#Keeping record of removed sequences
|
||||||
seqs_removed += seqs_removed_loop
|
seqs_removed += seqs_removed_loop
|
||||||
with open(params.output + '/Output/SequencesRemoved_ContaminationLoop.txt', 'a') as o:
|
with open(params.output + '/Output/SequencesRemoved_ContaminationLoop.txt', 'a') as o:
|
||||||
for seq in seqs_removed_loop:
|
for seq in seqs_removed_loop:
|
||||||
o.write(seq + '\t' + str(loop) + '\n')
|
o.write(seq + '\t' + str(loop) + '\n')
|
||||||
|
|
||||||
|
#Writing output files with sequences removed, with the iteration labeled
|
||||||
os.system('mv ' + params.output + '/Output/Trees ' + params.output + '/Output/Trees_' + str(loop))
|
os.system('mv ' + params.output + '/Output/Trees ' + params.output + '/Output/Trees_' + str(loop))
|
||||||
os.mkdir(params.output + '/Output/Trees')
|
os.mkdir(params.output + '/Output/Trees')
|
||||||
|
|
||||||
@ -399,11 +427,12 @@ def run(params):
|
|||||||
|
|
||||||
os.system('mv ' + params.output + '/Output/NotGapTrimmed ' + params.output + '/Output/NotGapTrimmed_' + str(loop))
|
os.system('mv ' + params.output + '/Output/NotGapTrimmed ' + params.output + '/Output/NotGapTrimmed_' + str(loop))
|
||||||
os.mkdir(params.output + '/Output/NotGapTrimmed')
|
os.mkdir(params.output + '/Output/NotGapTrimmed')
|
||||||
|
|
||||||
params.start = 'unaligned'
|
params.start = 'unaligned'
|
||||||
params.end = 'trees'
|
params.end = 'trees'
|
||||||
params.tree_method = params.cl_tree_method
|
params.tree_method = params.cl_tree_method
|
||||||
|
|
||||||
|
#Re-aligning and building trees without contaminant sequences... then ready for next iteration.
|
||||||
if params.cl_alignment_method == 'mafft_only':
|
if params.cl_alignment_method == 'mafft_only':
|
||||||
cl_mafft(params)
|
cl_mafft(params)
|
||||||
else:
|
else:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user