From 9cf716d431f416d0d0c17e65d13d7e8db6884f46 Mon Sep 17 00:00:00 2001
From: Auden Cote-L'Heureux <52716489+AudenCote@users.noreply.github.com>
Date: Fri, 1 Dec 2023 12:15:21 -0500
Subject: [PATCH] Added clade-size (()()()) script

---
 Utilities/for_trees/CladeSizes_v2.0.py | 183 +++++++++++++++++++++++++
 1 file changed, 183 insertions(+)
 create mode 100644 Utilities/for_trees/CladeSizes_v2.0.py

diff --git a/Utilities/for_trees/CladeSizes_v2.0.py b/Utilities/for_trees/CladeSizes_v2.0.py
new file mode 100644
index 0000000..2116294
--- /dev/null
+++ b/Utilities/for_trees/CladeSizes_v2.0.py
@@ -0,0 +1,183 @@
+#Author, date: Auden Cote-L'Heureux, last updated Dec 1st 2023
+#Motivation: Understand the topology of trees
+#Intent: Describe clade sizes for different taxonomic groups
+#Dependencies: Python3, ete3
+#Inputs: A folder of trees
+#Outputs: a spreadsheet describing clade sizes
+#Example: python CladeSizes_v2.0.py -i /path/to/trees
+
+
+import os, sys, re
+import ete3
+import argparse
+from tqdm import tqdm
+
+
+#Extract Newick string
+def get_newick(fname):
+	
+	newick = ''
+	for line in open(fname):
+		line = line.split(' ')[-1]
+		if(line.startswith('(') or line.startswith('tree1=')):
+			newick = line.split('tree1=')[-1].replace("'", '').replace('\\', '')
+
+	return newick
+
+
+#This function reroots the tree on the largest Ba/Za clade. If there is no prokaryote clade,
+#it roots on the largest Op clade, then Pl, then Am, then Ex, then Sr.
+def reroot(tree):
+
+	#This nested function returns the largest clade of a given taxonomic group
+	def get_best_clade(taxon):
+
+		best_size = 0; best_clade = []; seen_leaves = []
+		#Traverse all nodes
+		for node in tree.traverse('levelorder'):
+			#If the node is big enough and not subsumed by a node we've already accepted
+			if len(node) >= 3 and len(list(set(seen_leaves) & set([leaf.name for leaf in node]))) == 0:
+				leaves = [leaf.name for leaf in node]
+				
+				#Create a record of leaves that belong to the taxonomic group
+				target_leaves = set()
+				for leaf in leaves[::-1]:
+					if leaf[:2] in taxon:
+						target_leaves.add(leaf[:10])
+						leaves.remove(leaf)
+
+				#If this clade is better than any clade we've seen before, grab it
+				if len(target_leaves) > best_size and len(leaves) <= 2:
+					best_clade = node
+					best_size = len(target_leaves)
+					seen_leaves.extend([leaf.name for leaf in node])
+
+		return best_clade
+
+	#Get the biggest clade for each taxonomic group (stops once it finds one)
+	for taxon in [('Ba', 'Za'), ('Op'), ('Pl'), ('Am'), ('Ex'), ('Sr')]:
+		clade = get_best_clade(taxon)
+		if len([leaf for leaf in clade if leaf.name[:2] in taxon]) > 3:
+			tree.set_outgroup( clade)
+
+			break
+
+	return tree
+	
+	
+def get_clades(file):
+
+	newick = get_newick(file)	
+
+	tree = ete3.Tree(newick)
+
+	#Root tree if possible
+	try:
+		tree = reroot(tree)
+	except:
+		print('\nUnable to re-root the tree ' + file + ' (maybe it had only 1 major clade, or an inconvenient polytomy). Skipping this step and continuing to try to grab robust clades from the tree.\n')					
+
+	majs = list(dict.fromkeys([leaf.name[:2] for leaf in tree]))
+	mins = list(dict.fromkeys([leaf.name[:5] for leaf in tree]))
+
+	#For each major and minor clade, find all monophyletic clades of that taxon
+	clades_per_tax = { }
+	for clade_code in majs + mins:
+		clades_per_tax.update({ clade_code : [] })
+
+		seen_leaves = []
+		for node in tree.traverse('levelorder'):
+			if len(list(set(seen_leaves) & set([leaf.name for leaf in node]))) == 0:
+				leaves = [leaf.name for leaf in node]
+
+				target_leaves = set()
+				for leaf in leaves[::-1]:
+					if leaf.startswith(clade_code):
+						target_leaves.add(leaf[:10])
+
+				if len(target_leaves) == len(leaves):
+					clades_per_tax[clade_code].append(len(list(dict.fromkeys([l[:10] for l in target_leaves]))))
+					seen_leaves.extend([leaf.name for leaf in node])
+
+	clades_per_tax = { clade : sorted(clades_per_tax[clade], key = lambda x : -x ) for clade in clades_per_tax }
+
+	return clades_per_tax
+
+
+#Write the output spreadsheet
+def write_output(clades_per_tax_per_file):
+
+	clades = sorted(list(dict.fromkeys([clade for file in clades_per_tax_per_file for clade in clades_per_tax_per_file[file] if len(clade) == 2]))) + sorted(list(dict.fromkeys([clade for file in clades_per_tax_per_file for clade in clades_per_tax_per_file[file] if len(clade) == 5])))
+	with open('CladeSizesPerTaxon.csv', 'w') as o:
+		o.write('OG,' + ','.join(clades) + '\n')
+		for tree_file in clades_per_tax_per_file:
+			o.write(tree_file)
+			for clade in clades:
+				if clade in clades_per_tax_per_file[tree_file]:
+					if len(clades_per_tax_per_file[tree_file][clade]) > 1:
+						o.write(',' + ' '.join(['(' + str(size) + ')' for size in clades_per_tax_per_file[tree_file][clade]]))
+					elif len(clades_per_tax_per_file[tree_file][clade]) == 1:
+						o.write(',' + str(clades_per_tax_per_file[tree_file][clade][0]))
+					else:
+						o.write(',')
+				else:
+					o.write(',')
+
+			o.write('\n')
+
+
+if __name__ == '__main__':
+
+	parser = argparse.ArgumentParser(
+                    prog = 'Clade sizes utility version 2.0',
+                    description = "Updated Dec 1, 2023 by Auden Cote-L'Heureux"
+                    )
+
+	parser.add_argument('-i', '--input', required = True, help = 'Path to folder of tree files')
+
+	args = parser.parse_args()
+	
+	clades_per_tax_per_file = { }
+	for tree_file in tqdm(os.listdir(args.input)):
+		if tree_file.split('.')[-1] in ('tre', 'tree', 'treefile', 'nex'):
+			clades_per_tax = get_clades(args.input + '/' + tree_file)
+			clades_per_tax_per_file.update({ tree_file.split('.')[0] : clades_per_tax })
+
+	write_output(clades_per_tax_per_file)
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+