EukPhylo/PTL2/Scripts/concatenate.py
2025-01-19 11:06:23 -05:00

309 lines
12 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Last updated Jan 2024
# Authors: Auden Cote-L'Heureux and Mario Ceron-Romero
# This script chooses orthologs to concatenate OGs. This can be done as part of an end-to-end EukPhylo run,
# or by inputting already complete alignments and gene trees and running only the concatenation step.
# Use the --concatenate flag to run this step, and optionally use the argument --concat_target_taxa to input
# a file containing a list of taxon codes to be included in the concatenated alignment. If a GF has more
# than one sequence from a taxon, a representative ortholog must be chosen to include in the concatenated alignment.
# To do this, for each taxon EukPhylo keeps only the sequences falling in the monophyletic clade in the tree
# that contains the greatest number of species of the taxons minor clade (or major clade, if the target taxon list
# uses major-clade codes). If multiple sequences from the taxon fall into this largest clade, then the sequence
# with the highest score (defined as length times k-mer coverage for transcriptomic data with k-mer coverage
# in the sequence ID as formatted by rnaSpades, and otherwise just length) is kept for the concatenated alignment.
# If a GF is not present as a taxon, its missing data are filled in with gaps in the concatenated alignment.
# Along with the concatenated alignment, this part of the pipeline outputs individual alignments with orthologs
# selected (and re-aligned with MAFFT), in case a user wants to construct a model-partitioned or other specialized
# kind of species tree.
#Dependencies
import os, sys
from Bio import SeqIO
import ete3
import argparse
from tqdm import tqdm
#Small utility function to extract newick strings from nexus file
def get_newick(fname):
newick = ''
for line in open(fname):
line = line.split(' ')[-1]
if(line.startswith('(') or line.startswith('tree1=')):
newick = line.split('tree1=')[-1].replace("'", '').replace('\\', '')
return newick
#This function reroots the tree on the largest Ba/Za clade. If there is no prokaryote clade,
#it roots on the largest Op clade, then Pl, then Am, then Ex, then Sr.
def reroot(tree):
#This nested function returns the largest clade of a given taxonomic group
def get_best_clade(taxon):
best_size = 0; best_clade = []; seen_leaves = []
#Traverse all nodes
for node in tree.traverse('levelorder'):
#If the node is big enough and not subsumed by a node we've already accepted
if len(node) >= 3 and len(list(set(seen_leaves) & set([leaf.name for leaf in node]))) == 0:
leaves = [leaf.name for leaf in node]
#Create a record of leaves that belong to the taxonomic group
target_leaves = set()
for leaf in leaves[::-1]:
if leaf[:2] in taxon:
target_leaves.add(leaf[:10])
leaves.remove(leaf)
#If this clade is better than any clade we've seen before, grab it
if len(target_leaves) > best_size and len(leaves) <= 2:
best_clade = node
best_size = len(target_leaves)
seen_leaves.extend([leaf.name for leaf in node])
return best_clade
#Get the biggest clade for each taxonomic group (stops once it finds one)
for taxon in [('Ba', 'Za'), ('Op'), ('Pl'), ('Am'), ('Ex'), ('Sr')]:
clade = get_best_clade(taxon)
if len([leaf for leaf in clade if leaf.name[:2] in taxon]) > 3:
tree.set_outgroup( clade)
break
return tree
#Function to select sequences to use per tree
def remove_paralogs(params):
seqs_per_og = { }
for file in tqdm(os.listdir(params.output + '/Output/Guidance')):
if file.split('.')[-1] in ('fasta', 'fas', 'faa'):
prefix = '.'.join(file.split('.')[:-1])
tre_f = [t for t in os.listdir(params.output + '/Output/Trees') if t.startswith(prefix)]
if len(tre_f) == 0:
tre_f = [t for t in os.listdir(params.output + '/Output/Trees') if t.startswith(prefix.split('.')[0])]
if len(tre_f) == 0:
tre_f = [t for t in os.listdir(params.output + '/Output/Trees') if t.startswith(file[:10])]
if len(tre_f) == 0:
print('\nNo tree file found for alignment ' + file + '. Skipping this gene family.\n')
continue
elif len(tre_f) > 1:
print('\nMore than one tree file found matching the alignment file ' + file + '. Please make your file names unique: there should be one alignment file for every tree file, with a matching unique prefix (everything before the first "."). Skipping this gene family.\n')
continue
elif len(tre_f) > 1:
print('\nMore than one tree file found matching the alignment file ' + file + '. Please make your file names unique: there should be one sequence file for every tree file, with a matching unique prefix (everything before the first "."). Skipping this gene family.\n')
continue
elif len(tre_f) > 1:
print('\nMore than one tree file found matching the alignment file ' + file + '. Please make your file names unique: there should be one sequence file for every tree file, with a matching unique prefix (everything before the first "."). Skipping this gene family.\n')
continue
seqs_per_og.update({ file : [] })
og_recs = { rec.id : rec for rec in SeqIO.parse(params.output + '/Output/Guidance/' + file, 'fasta')}
newick = get_newick(params.output + '/Output/Trees/' + tre_f[0])
tree = ete3.Tree(newick)
try:
tree = reroot(tree)
except:
print('\nUnable to re-root the tree ' + file + ' (maybe it had only 1 major clade, or an inconvenient polytomy). Skipping this step and continuing to try to grab robust clades from the tree.\n')
#Getting a clean list of all target taxa
if os.path.isfile(params.concat_target_taxa):
try:
target_codes = [l.strip() for l in open(params.concat_target_taxa).readlines() if l.strip() != '']
except AttributeError:
print('\n\nError: invalid "concat_target_taxa" argument. This must be a comma-separated list of any number of digits/characters to describe focal taxa (e.g. Sr_ci_S OR Am_tu), or a file with the extension .txt containing a list of complete or partial taxon codes. All sequences containing the complete/partial code will be identified as belonging to target taxa.\n\n')
elif params.concat_target_taxa != None:
target_codes = [params.concat_target_taxa]
else:
print('\nERROR: missing --concat_target_taxa argument. When concatenating, you need to give the taxonomic group (sequence prefix), groups, or a file containing a list of groups (multiple prefixes) for which to select sequences to construct a concatenated alignment\n')
exit()
monophyletic_clades = { }
#Create list of relevant major/minor clades for clade grabbing
for taxon in target_codes:
if len(taxon) < 5 and taxon[:2] not in monophyletic_clades:
monophyletic_clades.update({ taxon : [] })
elif len(taxon) >= 5 and taxon[:5] not in monophyletic_clades:
monophyletic_clades.update({ taxon[:5] : [] })
#Grab best clades from all target groups
seen_leaves = []
for clade in monophyletic_clades:
for node in tree.traverse('levelorder'):
#If the node is big enough and not subsumed by a node we've already accepted
if len(list(set(seen_leaves) & set([leaf.name for leaf in node]))) == 0:
leaves = [leaf.name for leaf in node]
#Create a record of leaves that belong to the taxonomic group
target_leaves = set()
for leaf in leaves[::-1]:
if leaf[:2] == clade or leaf[:5] == clade:
target_leaves.add(leaf[:10])
leaves.remove(leaf)
#If the clade is monophyletic
if len(leaves) == 0:
monophyletic_clades[clade].append(node)
seen_leaves.extend([leaf.name for leaf in node])
#Get all target taxa in the alignment
taxa = []
for seq in tree:
for code in target_codes:
if code in seq.name:
taxa.append(seq.name[:10])
break
taxa = list(dict.fromkeys(taxa))
#For each taxon, get its best sequence
for tax in taxa:
#Get all sequences belonging to the taxon
taxseqs = [seq.name for seq in tree if seq.name[:10] == tax]
score = False
#If there's more than one sequence
if len(taxseqs) > 1:
clades = { }
#Get the size of the clade in which each sequence falls (at minor clade level if available, otherwise major clade)
if tax[:5] in monophyletic_clades:
clades = { seq : len([leaf for clade in monophyletic_clades[tax[:5]] for leaf in clade if seq in [l.name for l in clade]]) for seq in taxseqs }
elif tax[:2] in monophyletic_clades:
clades = { seq : len([leaf for clade in monophyletic_clades[tax[:2]] for leaf in clade if seq in [l.name for l in clade]]) for seq in taxseqs }
#If there's more than one sequence that falls in a robust clade
if len(clades) > 0:
#Filter the list of sequences to those that fall in clades
taxseqs = [seq for seq in taxseqs if seq in clades]
#Get the largest clade in which a sequence from the taxon falls
best_size = max(list(clades.values()))
#Get a list of sequences in a clade of that size
best_seqs = [seq for seq in taxseqs if clades[seq] == best_size]
#If there is only one sequence in the best-sized clade, take it and finish
if len(best_seqs) == 1:
seqs_per_og[file].append(og_recs[best_seqs[0]])
#Otherwise, need to take the sequence with the best score that falls into a clade of that size
else:
taxseqs = best_seqs
score = True
#Otherwise, of all sequences that don't fall in any clade, take the one with the best score
else:
score = True
#If there's only one sequence for the taxon, no problem
elif len(taxseqs) == 1:
seqs_per_og[file].append(og_recs[taxseqs[0]])
#If scoring is necessary, do it on the filter set of sequences for the taxon and keep the best
if score:
use_cov = True
for seq in taxseqs:
if 'Cov' not in seq[10:]:
use_cov = False
break
if use_cov:
taxseqs = sorted(taxseqs, key = lambda x : -len(og_recs[x].seq.replace('-', '')) * float(x.split('Cov')[-1].split('_')[0]))
else:
taxseqs = sorted(taxseqs, key = lambda x : -len(og_recs[x].seq.replace('-', '')))
seqs_per_og[file].append(og_recs[taxseqs[0]])
return seqs_per_og
#Function to concatenate all the selected sequences into one alignment file
def concat(seqs_per_og, params):
taxa = list(dict.fromkeys([rec.id[:10] for og in seqs_per_og for rec in seqs_per_og[og]]))
seqs_per_og = { og : { rec.id : str(rec.seq).replace('-', '') for rec in seqs_per_og[og] } for og in seqs_per_og }
if not os.path.isdir(params.output + '/Output/DataToConcatenate'):
os.mkdir(params.output + '/Output/DataToConcatenate')
os.mkdir(params.output + '/Output/DataToConcatenate/Unaligned')
os.mkdir(params.output + '/Output/DataToConcatenate/Aligned')
for og in seqs_per_og:
with open(params.output + '/Output/DataToConcatenate/Unaligned/' + '.'.join(og.split('.')[:-1]) + '_TargetTaxaUnaligned.fasta', 'w') as o:
for tax in seqs_per_og[og]:
o.write('>' + tax + '\n' + seqs_per_og[og][tax] + '\n\n')
os.system('mafft ' + params.output + '/Output/DataToConcatenate/Unaligned/' + '.'.join(og.split('.')[:-1]) + '_TargetTaxaUnaligned.fasta > ' + params.output + '/Output/DataToConcatenate/Aligned/' + '.'.join(og.split('.')[:-1]) + '_TargetTaxaAligned.fasta')
seqs_per_og[og] = { rec.id[:10] : str(rec.seq) for rec in SeqIO.parse(params.output + '/Output/DataToConcatenate/Aligned/' + '.'.join(og.split('.')[:-1]) + '_TargetTaxaAligned.fasta', 'fasta') }
concat_seqs_per_tax = { tax : '' for tax in taxa }
for taxon in taxa:
for og in seqs_per_og:
if taxon in seqs_per_og[og]:
concat_seqs_per_tax[taxon] += seqs_per_og[og][taxon]
else:
print(list(seqs_per_og[og].values()))
print(og)
concat_seqs_per_tax[taxon] += ''.join(['-' for i in range(len(list(seqs_per_og[og].values())[0]))])
with open(params.output + '/Output/ConcatenatedAlignment.fasta', 'w') as o:
for tax in concat_seqs_per_tax:
o.write('>' + tax + '\n' + concat_seqs_per_tax[tax] + '\n\n')
#wrapper
def run(params):
if not os.path.isdir(params.output + '/Output/Guidance') or not os.path.isdir(params.output + '/Output/Trees'):
print('\nERROR in concatenation: cannot find alignments and/or trees (looking in ' + params.output + '/Output/Guidance and ' + params.output + '/Output/Trees)')
exit()
else:
seqs_per_og = remove_paralogs(params)
concat(seqs_per_og, params)