From 160c4e58cc422fd18a4437efcde9fa1d192d3431 Mon Sep 17 00:00:00 2001 From: Godwin Ani Date: Mon, 3 Jun 2024 16:03:13 -0400 Subject: [PATCH] Fix copying the files twice --- Utilities/for_trees/ColorByClade_v3.2.py | 271 +++++++++++++++++++++++ 1 file changed, 271 insertions(+) create mode 100644 Utilities/for_trees/ColorByClade_v3.2.py diff --git a/Utilities/for_trees/ColorByClade_v3.2.py b/Utilities/for_trees/ColorByClade_v3.2.py new file mode 100644 index 0000000..b3623a3 --- /dev/null +++ b/Utilities/for_trees/ColorByClade_v3.2.py @@ -0,0 +1,271 @@ +#Author, date: Auden Cote-L'Heureux & Godwin Ani, last updated Feb 21st 2024 +#Motivation: Visualize placement of taxa by taxonomic group in trees +#Intent: Color tip labels in trees by taxonomic group +#Dependencies: Python3, ete3 +#Inputs: A folder of trees +#Outputs: a folder of colored trees +#Example: python ColorByClade_v2.1.py -i /path/to/trees + + +import os, sys +import ete3 +import argparse + + +#Needed for communicating with Figtree program +figtree_format = '''begin figtree; + set appearance.backgroundColorAttribute="Default"; + set appearance.backgroundColour=#ffffff; + set appearance.branchColorAttribute="User selection"; + set appearance.branchColorGradient=false; + set appearance.branchLineWidth=1.0; + set appearance.branchMinLineWidth=0.0; + set appearance.branchWidthAttribute="Fixed"; + set appearance.foregroundColour=#000000; + set appearance.hilightingGradient=false; + set appearance.selectionColour=#2d3680; + set branchLabels.colorAttribute="User selection"; + set branchLabels.fontName="sansserif"; + set branchLabels.fontSize=8; + set branchLabels.fontStyle=0; + set branchLabels.isShown=false; + set branchLabels.significantDigits=4; + set layout.expansion=0; + set layout.layoutType="RECTILINEAR"; + set layout.zoom=0; + set legend.attribute=null; + set legend.fontSize=10.0; + set legend.isShown=false; + set legend.significantDigits=4; + set nodeBars.barWidth=4.0; + set nodeBars.displayAttribute=null; + set nodeBars.isShown=false; + set nodeLabels.colorAttribute="User selection"; + set nodeLabels.displayAttribute="Node ages"; + set nodeLabels.fontName="sansserif"; + set nodeLabels.fontSize=16; + set nodeLabels.fontStyle=0; + set nodeLabels.isShown=false; + set nodeLabels.significantDigits=4; + set nodeShape.colourAttribute="User selection"; + set nodeShape.isShown=false; + set nodeShape.minSize=10.0; + set nodeShape.scaleType=Width; + set nodeShape.shapeType=Circle; + set nodeShape.size=4.0; + set nodeShape.sizeAttribute="Fixed"; + set polarLayout.alignTipLabels=false; + set polarLayout.angularRange=0; + set polarLayout.rootAngle=0; + set polarLayout.rootLength=100; + set polarLayout.showRoot=true; + set radialLayout.spread=0.0; + set rectilinearLayout.alignTipLabels=false; + set rectilinearLayout.curvature=0; + set rectilinearLayout.rootLength=100; + set scale.offsetAge=0.0; + set scale.rootAge=1.0; + set scale.scaleFactor=1.0; + set scale.scaleRoot=false; + set scaleAxis.automaticScale=true; + set scaleAxis.fontSize=8.0; + set scaleAxis.isShown=false; + set scaleAxis.lineWidth=1.0; + set scaleAxis.majorTicks=1.0; + set scaleAxis.origin=0.0; + set scaleAxis.reverseAxis=false; + set scaleAxis.showGrid=true; + set scaleBar.automaticScale=true; + set scaleBar.fontSize=10.0; + set scaleBar.isShown=true; + set scaleBar.lineWidth=1.0; + set scaleBar.scaleRange=0.0; + set tipLabels.colorAttribute="User selection"; + set tipLabels.displayAttribute="Names"; + set tipLabels.fontName="sansserif"; + set tipLabels.fontSize=16; + set tipLabels.fontStyle=0; + set tipLabels.isShown=true; + set tipLabels.significantDigits=4; + set trees.order=false; + set trees.orderType="increasing"; + set trees.rooting=false; + set trees.rootingType="User Selection"; + set trees.transform=false; + set trees.transformType="cladogram"; +end;''' + + +def get_args(): + + parser = argparse.ArgumentParser( + prog = 'Tree-coloring script, Version 2.1', + description = "Updated Nov 27th, 2023" + ) + + parser.add_argument('-i', '--input', type = str, required = True, help = 'Path to a folder containing input trees (which must have the file extension .tre, .tree, .treefile, or .nex)') + parser.add_argument('-k', '--keyfile', type = str, help = 'Path to a text file with two tab-separated columns; the first a set of keys and the second a color for each key in hex-code format. Any sequence starting with a particular key will be assigned the color corresponding to that key in this file.') + + return parser.parse_args() + + +#Function to extract newick string from either newick or nexus file +def get_newick(fname): + + newick = '' + for line in open(fname): + line = line.split(' ')[-1] + if(line.startswith('(') or line.startswith('tree1=')): + newick = line.split('tree1=')[-1].replace("'", '').replace('\\', '') + + return newick + + +def reroot(tree): + + #This nested function returns the largest clade of a given taxonomic group + def get_best_clade(taxon): + + best_size = 0; best_clade = []; seen_leaves = [] + #Traverse all nodes + for node in tree.traverse('levelorder'): + #If the node is big enough and not subsumed by a node we've already accepted + if len(node) >= 3 and len(list(set(seen_leaves) & set([leaf.name for leaf in node]))) == 0: + leaves = [leaf.name for leaf in node] + + #Create a record of leaves that belong to the taxonomic group + target_leaves = set() + for leaf in leaves[::-1]: + if leaf[:2] in taxon: + target_leaves.add(leaf[:10]) + leaves.remove(leaf) + + #If this clade is better than any clade we've seen before, grab it + if len(target_leaves) > best_size and len(leaves) <= 2: + best_clade = node + best_size = len(target_leaves) + seen_leaves.extend([leaf.name for leaf in node]) + + return best_clade + + #Get the biggest clade for each taxonomic group (stops once it finds one) + for taxon in [('Ba', 'Za'), ('Op'), ('Pl'), ('Am'), ('Ex'), ('Sr')]: + clade = get_best_clade(taxon) + + if len([leaf for leaf in clade if leaf.name[:2] in taxon]) > 3: + + tree.set_outgroup( clade) + + break + + + + return tree + + +def fix_node_labels(newick): + + out_newick = '' + for chunk in newick.split(':'): + if ';' in chunk: + out_newick += chunk + elif ')' in chunk: + out_newick += chunk.split(')')[0] + ')[&NULL_LABEL=' + chunk.split(')')[-1] + ']:' + else: + out_newick += chunk + ':' + + return out_newick + + +def write_lines(o, newick, taxa_and_colors, tree_font_size): + ntax = str(len(taxa_and_colors)) + + newick = fix_node_labels(newick) + + #writes the header to the tree file + o.write('#NEXUS\n') + o.write('begin taxa;\n') + o.write('\tdimensions ntax=' + ntax + ';\n') + o.write('\ttaxlabels\n') + + #write out all taxa + for taxon in taxa_and_colors: + o.write('\t' + taxon + '\n') + + o.write(';\nend;\n\n') + + o.write('begin trees;\n') + o.write('\ttree tree_1 = [&R]\n') + o.write(newick) + o.write('end;\n\n') + + + for line in figtree_format: + if('.fontSize' in line): + o.write(line.replace('8', tree_font_size)) + else: + o.write(line) + + +def write_nexus(newick, leaf_colors, params): + + with open(out_path, 'w') as o: + write_lines(o, newick, taxa_and_colors, tree_font_size) + + + +def color(file, args): + + if args.keyfile != None: + if os.path.isfile(args.keyfile): + try: + colors = { line.split('\t')[0] : line.split('\t')[1].strip() for line in open(args.keyfile) if len(line.split('\t')) == 2 } + except: + print('\nERROR: your keyfile is incorrectly formatted\n') + exit() + else: + print('\nERROR: your input keyfile could not be found\n') + else: + colors = { 'Ba' : '[&!color=#000000]', 'Za' : '[&!color=#808080]', 'Sr' : '[&!color=#B4A26D]', 'Op' : '[&!color=#c07ba0]', 'Pl' : '[&!color=#026736]', 'Ex' : '[&!color=#E63B60]', 'EE' : '[&!color=#0343FC]', 'Am' : '[&!color=#29C5F6]', 'EE_cr' : '[&!color=#08B461]', 'EE_ha' : '[&!color=#03EA74]', 'Sr_ci' : '[&!color=#A97533]', 'Sr_ap' : '[&!color=#D4BA99]', 'Sr_rh' : '[&!color=#8A3324]', 'Sr_st' : '[&!color=#E97451]', 'Sr_di' : '[&!color=#492815]' } + #tree_formatting_wrapper + try: + newick = get_newick(file) + tree = ete3.Tree(newick) + + majs = list(dict.fromkeys([leaf.name[:2] for leaf in tree])) + #Only try to reroot trees with more than 2 major clades. This was added to fix the ETE3 "Cannot set myself as outgroup" error + if len(majs) > 2: + tree = reroot(tree) + tree.ladderize(direction = 1) + except Exception as e: + print(f" {file.split('/')[-1]} has {e} error ") + + leaf_colors = [] + for leaf in tree: + keys = sorted([key for key in colors if leaf.name.startswith(key)], key = lambda x : -len(x)) + + # the line below allows you to have keys anywhere within name and not just start of name.. to use, you have to # the line above + #keys = sorted([key for key in colors if key in leaf.name], key=lambda x: -len(x)) + + if len(keys) > 0: + if '[&!color=' in colors[keys[0]]: + leaf_colors.append(leaf.name + colors[keys[0]]) + else: + leaf_colors.append(leaf.name + '[&!color=' + colors[keys[0]] + ']') + else: + leaf_colors.append(leaf.name) + + with open('ColoredTrees/' + file.split('/')[-1].split('.tree')[0] + '_Colored.tree', 'w') as o: + write_lines(o, newick, leaf_colors, str(12))#change tree font size here (right now it is 12) + +if __name__ == '__main__': + + args = get_args() + + if not os.path.isdir('ColoredTrees'): + os.mkdir('ColoredTrees') + + for tree in os.listdir(args.input): + if tree.split('.')[-1] in ('tree', 'tre', 'treefile', 'nex'): + color(args.input + '/' + tree, args) +