Delete Utilities/for_trees/CladeSizes_v2.0.py

2026-02-11 00:50:25 +08:00 · 2023-12-19 13:49:13 -05:00 · 2023-12-19 13:49:13 -05:00 · 2688df4a98
commit 2688df4a98
parent c655354a42
1 changed files with 0 additions and 183 deletions
--- a/Utilities/for_trees/CladeSizes_v2.0.py
+++ b/Utilities/for_trees/CladeSizes_v2.0.py
@ -1,183 +0,0 @@
-#Author, date: Auden Cote-L'Heureux, last updated Dec 1st 2023
-#Motivation: Understand the topology of trees
-#Intent: Describe clade sizes for different taxonomic groups
-#Dependencies: Python3, ete3
-#Inputs: A folder of trees
-#Outputs: a spreadsheet describing clade sizes
-#Example: python CladeSizes_v2.0.py -i /path/to/trees
-
-
-import os, sys, re
-import ete3
-import argparse
-from tqdm import tqdm
-
-
-#Extract Newick string
-def get_newick(fname):
-	
-	newick = ''
-	for line in open(fname):
-		line = line.split(' ')[-1]
-		if(line.startswith('(') or line.startswith('tree1=')):
-			newick = line.split('tree1=')[-1].replace("'", '').replace('\\', '')
-
-	return newick
-
-
-#This function reroots the tree on the largest Ba/Za clade. If there is no prokaryote clade,
-#it roots on the largest Op clade, then Pl, then Am, then Ex, then Sr.
-def reroot(tree):
-
-	#This nested function returns the largest clade of a given taxonomic group
-	def get_best_clade(taxon):
-
-		best_size = 0; best_clade = []; seen_leaves = []
-		#Traverse all nodes
-		for node in tree.traverse('levelorder'):
-			#If the node is big enough and not subsumed by a node we've already accepted
-			if len(node) >= 3 and len(list(set(seen_leaves) & set([leaf.name for leaf in node]))) == 0:
-				leaves = [leaf.name for leaf in node]
-				
-				#Create a record of leaves that belong to the taxonomic group
-				target_leaves = set()
-				for leaf in leaves[::-1]:
-					if leaf[:2] in taxon:
-						target_leaves.add(leaf[:10])
-						leaves.remove(leaf)
-
-				#If this clade is better than any clade we've seen before, grab it
-				if len(target_leaves) > best_size and len(leaves) <= 2:
-					best_clade = node
-					best_size = len(target_leaves)
-					seen_leaves.extend([leaf.name for leaf in node])
-
-		return best_clade
-
-	#Get the biggest clade for each taxonomic group (stops once it finds one)
-	for taxon in [('Ba', 'Za'), ('Op'), ('Pl'), ('Am'), ('Ex'), ('Sr')]:
-		clade = get_best_clade(taxon)
-		if len([leaf for leaf in clade if leaf.name[:2] in taxon]) > 3:
-			tree.set_outgroup( clade)
-
-			break
-
-	return tree
-	
-	
-def get_clades(file):
-
-	newick = get_newick(file)	
-
-	tree = ete3.Tree(newick)
-
-	#Root tree if possible
-	try:
-		tree = reroot(tree)
-	except:
-		print('\nUnable to re-root the tree ' + file + ' (maybe it had only 1 major clade, or an inconvenient polytomy). Skipping this step and continuing to try to grab robust clades from the tree.\n')					
-
-	majs = list(dict.fromkeys([leaf.name[:2] for leaf in tree]))
-	mins = list(dict.fromkeys([leaf.name[:5] for leaf in tree]))
-
-	#For each major and minor clade, find all monophyletic clades of that taxon
-	clades_per_tax = { }
-	for clade_code in majs + mins:
-		clades_per_tax.update({ clade_code : [] })
-
-		seen_leaves = []
-		for node in tree.traverse('levelorder'):
-			if len(list(set(seen_leaves) & set([leaf.name for leaf in node]))) == 0:
-				leaves = [leaf.name for leaf in node]
-
-				target_leaves = set()
-				for leaf in leaves[::-1]:
-					if leaf.startswith(clade_code):
-						target_leaves.add(leaf[:10])
-
-				if len(target_leaves) == len(leaves):
-					clades_per_tax[clade_code].append(len(list(dict.fromkeys([l[:10] for l in target_leaves]))))
-					seen_leaves.extend([leaf.name for leaf in node])
-
-	clades_per_tax = { clade : sorted(clades_per_tax[clade], key = lambda x : -x ) for clade in clades_per_tax }
-
-	return clades_per_tax
-
-
-#Write the output spreadsheet
-def write_output(clades_per_tax_per_file):
-
-	clades = sorted(list(dict.fromkeys([clade for file in clades_per_tax_per_file for clade in clades_per_tax_per_file[file] if len(clade) == 2]))) + sorted(list(dict.fromkeys([clade for file in clades_per_tax_per_file for clade in clades_per_tax_per_file[file] if len(clade) == 5])))
-	with open('CladeSizesPerTaxon.csv', 'w') as o:
-		o.write('OG,' + ','.join(clades) + '\n')
-		for tree_file in clades_per_tax_per_file:
-			o.write(tree_file)
-			for clade in clades:
-				if clade in clades_per_tax_per_file[tree_file]:
-					if len(clades_per_tax_per_file[tree_file][clade]) > 1:
-						o.write(',' + ' '.join(['(' + str(size) + ')' for size in clades_per_tax_per_file[tree_file][clade]]))
-					elif len(clades_per_tax_per_file[tree_file][clade]) == 1:
-						o.write(',' + str(clades_per_tax_per_file[tree_file][clade][0]))
-					else:
-						o.write(',')
-				else:
-					o.write(',')
-
-			o.write('\n')
-
-
-if __name__ == '__main__':
-
-	parser = argparse.ArgumentParser(
-                    prog = 'Clade sizes utility version 2.0',
-                    description = "Updated Dec 1, 2023 by Auden Cote-L'Heureux"
-                    )
-
-	parser.add_argument('-i', '--input', required = True, help = 'Path to folder of tree files')
-
-	args = parser.parse_args()
-	
-	clades_per_tax_per_file = { }
-	for tree_file in tqdm(os.listdir(args.input)):
-		if tree_file.split('.')[-1] in ('tre', 'tree', 'treefile', 'nex'):
-			clades_per_tax = get_clades(args.input + '/' + tree_file)
-			clades_per_tax_per_file.update({ tree_file.split('.')[0] : clades_per_tax })
-
-	write_output(clades_per_tax_per_file)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-