From 9cf716d431f416d0d0c17e65d13d7e8db6884f46 Mon Sep 17 00:00:00 2001 From: Auden Cote-L'Heureux <52716489+AudenCote@users.noreply.github.com> Date: Fri, 1 Dec 2023 12:15:21 -0500 Subject: [PATCH] Added clade-size (()()()) script --- Utilities/for_trees/CladeSizes_v2.0.py | 183 +++++++++++++++++++++++++ 1 file changed, 183 insertions(+) create mode 100644 Utilities/for_trees/CladeSizes_v2.0.py diff --git a/Utilities/for_trees/CladeSizes_v2.0.py b/Utilities/for_trees/CladeSizes_v2.0.py new file mode 100644 index 0000000..2116294 --- /dev/null +++ b/Utilities/for_trees/CladeSizes_v2.0.py @@ -0,0 +1,183 @@ +#Author, date: Auden Cote-L'Heureux, last updated Dec 1st 2023 +#Motivation: Understand the topology of trees +#Intent: Describe clade sizes for different taxonomic groups +#Dependencies: Python3, ete3 +#Inputs: A folder of trees +#Outputs: a spreadsheet describing clade sizes +#Example: python CladeSizes_v2.0.py -i /path/to/trees + + +import os, sys, re +import ete3 +import argparse +from tqdm import tqdm + + +#Extract Newick string +def get_newick(fname): + + newick = '' + for line in open(fname): + line = line.split(' ')[-1] + if(line.startswith('(') or line.startswith('tree1=')): + newick = line.split('tree1=')[-1].replace("'", '').replace('\\', '') + + return newick + + +#This function reroots the tree on the largest Ba/Za clade. If there is no prokaryote clade, +#it roots on the largest Op clade, then Pl, then Am, then Ex, then Sr. +def reroot(tree): + + #This nested function returns the largest clade of a given taxonomic group + def get_best_clade(taxon): + + best_size = 0; best_clade = []; seen_leaves = [] + #Traverse all nodes + for node in tree.traverse('levelorder'): + #If the node is big enough and not subsumed by a node we've already accepted + if len(node) >= 3 and len(list(set(seen_leaves) & set([leaf.name for leaf in node]))) == 0: + leaves = [leaf.name for leaf in node] + + #Create a record of leaves that belong to the taxonomic group + target_leaves = set() + for leaf in leaves[::-1]: + if leaf[:2] in taxon: + target_leaves.add(leaf[:10]) + leaves.remove(leaf) + + #If this clade is better than any clade we've seen before, grab it + if len(target_leaves) > best_size and len(leaves) <= 2: + best_clade = node + best_size = len(target_leaves) + seen_leaves.extend([leaf.name for leaf in node]) + + return best_clade + + #Get the biggest clade for each taxonomic group (stops once it finds one) + for taxon in [('Ba', 'Za'), ('Op'), ('Pl'), ('Am'), ('Ex'), ('Sr')]: + clade = get_best_clade(taxon) + if len([leaf for leaf in clade if leaf.name[:2] in taxon]) > 3: + tree.set_outgroup( clade) + + break + + return tree + + +def get_clades(file): + + newick = get_newick(file) + + tree = ete3.Tree(newick) + + #Root tree if possible + try: + tree = reroot(tree) + except: + print('\nUnable to re-root the tree ' + file + ' (maybe it had only 1 major clade, or an inconvenient polytomy). Skipping this step and continuing to try to grab robust clades from the tree.\n') + + majs = list(dict.fromkeys([leaf.name[:2] for leaf in tree])) + mins = list(dict.fromkeys([leaf.name[:5] for leaf in tree])) + + #For each major and minor clade, find all monophyletic clades of that taxon + clades_per_tax = { } + for clade_code in majs + mins: + clades_per_tax.update({ clade_code : [] }) + + seen_leaves = [] + for node in tree.traverse('levelorder'): + if len(list(set(seen_leaves) & set([leaf.name for leaf in node]))) == 0: + leaves = [leaf.name for leaf in node] + + target_leaves = set() + for leaf in leaves[::-1]: + if leaf.startswith(clade_code): + target_leaves.add(leaf[:10]) + + if len(target_leaves) == len(leaves): + clades_per_tax[clade_code].append(len(list(dict.fromkeys([l[:10] for l in target_leaves])))) + seen_leaves.extend([leaf.name for leaf in node]) + + clades_per_tax = { clade : sorted(clades_per_tax[clade], key = lambda x : -x ) for clade in clades_per_tax } + + return clades_per_tax + + +#Write the output spreadsheet +def write_output(clades_per_tax_per_file): + + clades = sorted(list(dict.fromkeys([clade for file in clades_per_tax_per_file for clade in clades_per_tax_per_file[file] if len(clade) == 2]))) + sorted(list(dict.fromkeys([clade for file in clades_per_tax_per_file for clade in clades_per_tax_per_file[file] if len(clade) == 5]))) + with open('CladeSizesPerTaxon.csv', 'w') as o: + o.write('OG,' + ','.join(clades) + '\n') + for tree_file in clades_per_tax_per_file: + o.write(tree_file) + for clade in clades: + if clade in clades_per_tax_per_file[tree_file]: + if len(clades_per_tax_per_file[tree_file][clade]) > 1: + o.write(',' + ' '.join(['(' + str(size) + ')' for size in clades_per_tax_per_file[tree_file][clade]])) + elif len(clades_per_tax_per_file[tree_file][clade]) == 1: + o.write(',' + str(clades_per_tax_per_file[tree_file][clade][0])) + else: + o.write(',') + else: + o.write(',') + + o.write('\n') + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser( + prog = 'Clade sizes utility version 2.0', + description = "Updated Dec 1, 2023 by Auden Cote-L'Heureux" + ) + + parser.add_argument('-i', '--input', required = True, help = 'Path to folder of tree files') + + args = parser.parse_args() + + clades_per_tax_per_file = { } + for tree_file in tqdm(os.listdir(args.input)): + if tree_file.split('.')[-1] in ('tre', 'tree', 'treefile', 'nex'): + clades_per_tax = get_clades(args.input + '/' + tree_file) + clades_per_tax_per_file.update({ tree_file.split('.')[0] : clades_per_tax }) + + write_output(clades_per_tax_per_file) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +