diff --git a/PTL2/Scripts/concatenate.py b/PTL2/Scripts/concatenate.py deleted file mode 100644 index 9edd7b2..0000000 --- a/PTL2/Scripts/concatenate.py +++ /dev/null @@ -1,309 +0,0 @@ -# Last updated Jan 2024 -# Authors: Auden Cote-L'Heureux and Mario Ceron-Romero - -# This script chooses orthologs to concatenate OGs. This can be done as part of an end-to-end PhyloToL run, -# or by inputting already complete alignments and gene trees and running only the concatenation step. -# Use the --concatenate flag to run this step, and optionally use the argument --concat_target_taxa to input -# a file containing a list of taxon codes to be included in the concatenated alignment. If a GF has more -# than one sequence from a taxon, a representative ortholog must be chosen to include in the concatenated alignment. -# To do this, for each taxon PhyloToL keeps only the sequences falling in the monophyletic clade in the tree -# that contains the greatest number of species of the taxon’s minor clade (or major clade, if the ‘target taxon list’ -# uses major-clade codes). If multiple sequences from the taxon fall into this largest clade, then the sequence -# with the highest ‘score’ (defined as length times k-mer coverage for transcriptomic data with k-mer coverage -# in the sequence ID as formatted by rnaSpades, and otherwise just length) is kept for the concatenated alignment. -# If a GF is not present as a taxon, its missing data are filled in with gaps in the concatenated alignment. -# Along with the concatenated alignment, this part of the pipeline outputs individual alignments with orthologs -# selected (and re-aligned with MAFFT), in case a user wants to construct a model-partitioned or other specialized -# kind of species tree. - -#Dependencies -import os, sys -from Bio import SeqIO -import ete3 -import argparse -from tqdm import tqdm - - -#Small utility function to extract newick strings from nexus file -def get_newick(fname): - - newick = '' - for line in open(fname): - line = line.split(' ')[-1] - if(line.startswith('(') or line.startswith('tree1=')): - newick = line.split('tree1=')[-1].replace("'", '').replace('\\', '') - - return newick - - -#This function reroots the tree on the largest Ba/Za clade. If there is no prokaryote clade, -#it roots on the largest Op clade, then Pl, then Am, then Ex, then Sr. -def reroot(tree): - - #This nested function returns the largest clade of a given taxonomic group - def get_best_clade(taxon): - - best_size = 0; best_clade = []; seen_leaves = [] - #Traverse all nodes - for node in tree.traverse('levelorder'): - #If the node is big enough and not subsumed by a node we've already accepted - if len(node) >= 3 and len(list(set(seen_leaves) & set([leaf.name for leaf in node]))) == 0: - leaves = [leaf.name for leaf in node] - - #Create a record of leaves that belong to the taxonomic group - target_leaves = set() - for leaf in leaves[::-1]: - if leaf[:2] in taxon: - target_leaves.add(leaf[:10]) - leaves.remove(leaf) - - #If this clade is better than any clade we've seen before, grab it - if len(target_leaves) > best_size and len(leaves) <= 2: - best_clade = node - best_size = len(target_leaves) - seen_leaves.extend([leaf.name for leaf in node]) - - return best_clade - - #Get the biggest clade for each taxonomic group (stops once it finds one) - for taxon in [('Ba', 'Za'), ('Op'), ('Pl'), ('Am'), ('Ex'), ('Sr')]: - clade = get_best_clade(taxon) - if len([leaf for leaf in clade if leaf.name[:2] in taxon]) > 3: - tree.set_outgroup( clade) - - break - - return tree - - -#Function to select sequences to use per tree -def remove_paralogs(params): - - seqs_per_og = { } - for file in tqdm(os.listdir(params.output + '/Output/Guidance')): - if file.split('.')[-1] in ('fasta', 'fas', 'faa'): - - prefix = '.'.join(file.split('.')[:-1]) - tre_f = [t for t in os.listdir(params.output + '/Output/Trees') if t.startswith(prefix)] - if len(tre_f) == 0: - tre_f = [t for t in os.listdir(params.output + '/Output/Trees') if t.startswith(prefix.split('.')[0])] - - if len(tre_f) == 0: - - tre_f = [t for t in os.listdir(params.output + '/Output/Trees') if t.startswith(file[:10])] - - if len(tre_f) == 0: - print('\nNo tree file found for alignment ' + file + '. Skipping this gene family.\n') - continue - elif len(tre_f) > 1: - print('\nMore than one tree file found matching the alignment file ' + file + '. Please make your file names unique: there should be one alignment file for every tree file, with a matching unique prefix (everything before the first "."). Skipping this gene family.\n') - continue - elif len(tre_f) > 1: - print('\nMore than one tree file found matching the alignment file ' + file + '. Please make your file names unique: there should be one sequence file for every tree file, with a matching unique prefix (everything before the first "."). Skipping this gene family.\n') - continue - elif len(tre_f) > 1: - print('\nMore than one tree file found matching the alignment file ' + file + '. Please make your file names unique: there should be one sequence file for every tree file, with a matching unique prefix (everything before the first "."). Skipping this gene family.\n') - continue - - seqs_per_og.update({ file : [] }) - og_recs = { rec.id : rec for rec in SeqIO.parse(params.output + '/Output/Guidance/' + file, 'fasta')} - - newick = get_newick(params.output + '/Output/Trees/' + tre_f[0]) - tree = ete3.Tree(newick) - - try: - tree = reroot(tree) - except: - print('\nUnable to re-root the tree ' + file + ' (maybe it had only 1 major clade, or an inconvenient polytomy). Skipping this step and continuing to try to grab robust clades from the tree.\n') - - #Getting a clean list of all target taxa - - if os.path.isdir(params.concat_target_taxa): - if os.path.isfile(params.concat_target_taxa): - try: - target_codes = [l.strip() for l in open(params.concat_target_taxa).readlines() if l.strip() != ''] - except AttributeError: - print('\n\nError: invalid "concat_target_taxa" argument. This must be a comma-separated list of any number of digits/characters to describe focal taxa (e.g. Sr_ci_S OR Am_tu), or a file with the extension .txt containing a list of complete or partial taxon codes. All sequences containing the complete/partial code will be identified as belonging to target taxa.\n\n') - elif params.concat_target_taxa != None: - target_codes = [params.concat_target_taxa] - else: - print('\nERROR: missing --concat_target_taxa argument. When concatenating, you need to give the taxonomic group (sequence prefix), groups, or a file containing a list of groups (multiple prefixes) for which to select sequences to construct a concatenated alignment\n') - exit() - - monophyletic_clades = { } - - #Create list of relevant major/minor clades for clade grabbing - for taxon in target_codes: - if len(taxon) < 5 and taxon[:2] not in monophyletic_clades: - monophyletic_clades.update({ taxon : [] }) - elif len(taxon) >= 5 and taxon[:5] not in monophyletic_clades: - monophyletic_clades.update({ taxon[:5] : [] }) - - #Grab best clades from all target groups - seen_leaves = [] - for clade in monophyletic_clades: - for node in tree.traverse('levelorder'): - #If the node is big enough and not subsumed by a node we've already accepted - if len(list(set(seen_leaves) & set([leaf.name for leaf in node]))) == 0: - leaves = [leaf.name for leaf in node] - - #Create a record of leaves that belong to the taxonomic group - target_leaves = set() - for leaf in leaves[::-1]: - if leaf[:2] == clade or leaf[:5] == clade: - target_leaves.add(leaf[:10]) - leaves.remove(leaf) - - #If the clade is monophyletic - if len(leaves) == 0: - monophyletic_clades[clade].append(node) - seen_leaves.extend([leaf.name for leaf in node]) - - #Get all target taxa in the alignment - taxa = [] - for seq in tree: - for code in target_codes: - if code in seq.name: - taxa.append(seq.name[:10]) - break - - taxa = list(dict.fromkeys(taxa)) - - #For each taxon, get its best sequence - for tax in taxa: - - #Get all sequences belonging to the taxon - taxseqs = [seq.name for seq in tree if seq.name[:10] == tax] - - score = False - - #If there's more than one sequence - if len(taxseqs) > 1: - - clades = { } - - #Get the size of the clade in which each sequence falls (at minor clade level if available, otherwise major clade) - if tax[:5] in monophyletic_clades: - clades = { seq : len([leaf for clade in monophyletic_clades[tax[:5]] for leaf in clade if seq in [l.name for l in clade]]) for seq in taxseqs } - elif tax[:2] in monophyletic_clades: - clades = { seq : len([leaf for clade in monophyletic_clades[tax[:2]] for leaf in clade if seq in [l.name for l in clade]]) for seq in taxseqs } - - #If there's more than one sequence that falls in a robust clade - if len(clades) > 0: - - #Filter the list of sequences to those that fall in clades - taxseqs = [seq for seq in taxseqs if seq in clades] - - #Get the largest clade in which a sequence from the taxon falls - best_size = max(list(clades.values())) - - #Get a list of sequences in a clade of that size - best_seqs = [seq for seq in taxseqs if clades[seq] == best_size] - - #If there is only one sequence in the best-sized clade, take it and finish - if len(best_seqs) == 1: - seqs_per_og[file].append(og_recs[best_seqs[0]]) - #Otherwise, need to take the sequence with the best score that falls into a clade of that size - else: - taxseqs = best_seqs - score = True - #Otherwise, of all sequences that don't fall in any clade, take the one with the best score - else: - score = True - #If there's only one sequence for the taxon, no problem - elif len(taxseqs) == 1: - seqs_per_og[file].append(og_recs[taxseqs[0]]) - - #If scoring is necessary, do it on the filter set of sequences for the taxon and keep the best - if score: - use_cov = True - for seq in taxseqs: - if 'Cov' not in seq[10:]: - use_cov = False - break - - if use_cov: - taxseqs = sorted(taxseqs, key = lambda x : -len(og_recs[x].seq.replace('-', '')) * float(x.split('Cov')[-1].split('_')[0])) - else: - taxseqs = sorted(taxseqs, key = lambda x : -len(og_recs[x].seq.replace('-', ''))) - - seqs_per_og[file].append(og_recs[taxseqs[0]]) - - return seqs_per_og - - -#Function to concatenate all the selected sequences into one alignment file -def concat(seqs_per_og, params): - - taxa = list(dict.fromkeys([rec.id[:10] for og in seqs_per_og for rec in seqs_per_og[og]])) - - seqs_per_og = { og : { rec.id : str(rec.seq).replace('-', '') for rec in seqs_per_og[og] } for og in seqs_per_og } - - if not os.path.isdir(params.output + '/Output/DataToConcatenate'): - os.mkdir(params.output + '/Output/DataToConcatenate') - os.mkdir(params.output + '/Output/DataToConcatenate/Unaligned') - os.mkdir(params.output + '/Output/DataToConcatenate/Aligned') - - for og in seqs_per_og: - with open(params.output + '/Output/DataToConcatenate/Unaligned/' + '.'.join(og.split('.')[:-1]) + '_TargetTaxaUnaligned.fasta', 'w') as o: - for tax in seqs_per_og[og]: - o.write('>' + tax + '\n' + seqs_per_og[og][tax] + '\n\n') - - os.system('mafft ' + params.output + '/Output/DataToConcatenate/Unaligned/' + '.'.join(og.split('.')[:-1]) + '_TargetTaxaUnaligned.fasta > ' + params.output + '/Output/DataToConcatenate/Aligned/' + '.'.join(og.split('.')[:-1]) + '_TargetTaxaAligned.fasta') - - seqs_per_og[og] = { rec.id[:10] : str(rec.seq) for rec in SeqIO.parse(params.output + '/Output/DataToConcatenate/Aligned/' + '.'.join(og.split('.')[:-1]) + '_TargetTaxaAligned.fasta', 'fasta') } - - concat_seqs_per_tax = { tax : '' for tax in taxa } - for taxon in taxa: - for og in seqs_per_og: - if taxon in seqs_per_og[og]: - concat_seqs_per_tax[taxon] += seqs_per_og[og][taxon] - else: - print(list(seqs_per_og[og].values())) - print(og) - concat_seqs_per_tax[taxon] += ''.join(['-' for i in range(len(list(seqs_per_og[og].values())[0]))]) - - with open(params.output + '/Output/ConcatenatedAlignment.fasta', 'w') as o: - for tax in concat_seqs_per_tax: - o.write('>' + tax + '\n' + concat_seqs_per_tax[tax] + '\n\n') - - -#wrapper -def run(params): - - if not os.path.isdir(params.output + '/Output/Guidance') or not os.path.isdir(params.output + '/Output/Trees'): - print('\nERROR in concatenation: cannot find alignments and/or trees (looking in ' + params.output + '/Output/Guidance and ' + params.output + '/Output/Trees)') - exit() - else: - seqs_per_og = remove_paralogs(params) - - concat(seqs_per_og, params) - - - - - - - - - - - - - - - - - - - - - - - - - - - - -