From f77ece5a2a123d235aa34d5a78b001996b3d008b Mon Sep 17 00:00:00 2001 From: Auden Cote-L'Heureux <52716489+AudenCote@users.noreply.github.com> Date: Mon, 12 Jun 2023 13:25:29 -0400 Subject: [PATCH] Add files via upload --- PTL2/Scripts/color.py | 99 ++++++++++++++++++++++++ PTL2/Scripts/contamination.py | 55 +++++++++++++ PTL2/Scripts/guidance.py | 118 ++++++++++++++++++++++++++++ PTL2/Scripts/logger.py | 28 +++++++ PTL2/Scripts/phylotol.py | 34 +++++++++ PTL2/Scripts/preguidance.py | 120 +++++++++++++++++++++++++++++ PTL2/Scripts/trees.py | 84 ++++++++++++++++++++ PTL2/Scripts/utils.py | 140 ++++++++++++++++++++++++++++++++++ 8 files changed, 678 insertions(+) create mode 100644 PTL2/Scripts/color.py create mode 100644 PTL2/Scripts/contamination.py create mode 100644 PTL2/Scripts/guidance.py create mode 100644 PTL2/Scripts/logger.py create mode 100644 PTL2/Scripts/phylotol.py create mode 100644 PTL2/Scripts/preguidance.py create mode 100644 PTL2/Scripts/trees.py create mode 100644 PTL2/Scripts/utils.py diff --git a/PTL2/Scripts/color.py b/PTL2/Scripts/color.py new file mode 100644 index 0000000..93a9030 --- /dev/null +++ b/PTL2/Scripts/color.py @@ -0,0 +1,99 @@ +import os, sys +import ete3 + + +def get_newick(fname): + + newick = '' + for line in open(fname): + line = line.split(' ')[-1] + if(line.startswith('(') or line.startswith('tree1=')): + newick = line.split('tree1=')[-1].replace("'", '').replace('\\', '') + + return newick + + +def reroot(tree): + + #This nested function returns the largest clade of a given taxonomic group + def get_best_clade(taxon): + + best_size = 0; best_clade = []; seen_leaves = [] + #Traverse all nodes + for node in tree.traverse('levelorder'): + #If the node is big enough and not subsumed by a node we've already accepted + if len(node) >= 3 and len(list(set(seen_leaves) & set([leaf.name for leaf in node]))) == 0: + leaves = [leaf.name for leaf in node] + + #Create a record of leaves that belong to the taxonomic group + target_leaves = set() + for leaf in leaves[::-1]: + if leaf[:2] in taxon: + target_leaves.add(leaf[:10]) + leaves.remove(leaf) + + #If this clade is better than any clade we've seen before, grab it + if len(target_leaves) > best_size and len(leaves) <= 2: + best_clade = node + best_size = len(target_leaves) + seen_leaves.extend([leaf.name for leaf in node]) + + return best_clade + + #Get the biggest clade for each taxonomic group (stops once it finds one) + for taxon in [('Ba', 'Za'), ('Op'), ('Pl'), ('Am'), ('Ex'), ('Sr')]: + clade = get_best_clade(taxon) + if len([leaf for leaf in clade if leaf.name[:2] in taxon]) > 3: + tree.set_outgroup( clade) + + break + + return tree + + +def write_lines(o, newick, taxa_and_colors, tree_font_size): + ntax = str(len(taxa_and_colors)) + + o.write('#NEXUS\n') + o.write('begin taxa;\n') + o.write('\tdimensions ntax=' + ntax + ';\n') + o.write('\ttaxlabels\n') + + for taxon in taxa_and_colors: + o.write('\t' + taxon + '\n') + + o.write(';\nend;\n\n') + + o.write('begin trees;\n') + o.write('\ttree tree_1 = [&R]\n') + o.write(newick) + o.write('end;\n\n') + + with open('figtree_format.txt', 'r') as ff: + for line in ff: + if('.fontSize' in line): + o.write(line.replace('8', tree_font_size)) + else: + o.write(line) + + +def write_nexus(newick, leaf_colors, params): + + with open(out_path, 'w') as o: + write_lines(o, newick, taxa_and_colors, tree_font_size) + + +def color(file, params): + + colors = { 'Ba' : '[&!color=#000000]', 'Za' : '[&!color=#808080]', 'Sr' : '[&!color=#7b2516]', 'Op' : '[&!color=#12aaff]', 'Pl' : '[&!color=#006300]', 'Ex' : '[&!color=#ffa100]', 'EE' : '[&!color=#ff6288]', 'Am' : '[&!color=#aa00ff]' } + + newick = get_newick(file) + tree = ete3.Tree(newick) + tree = reroot(tree) + tree.ladderize() + + leaf_colors = [leaf + colors[leaf[:2]] for leaf in tree] + + with open(params.output + '/ColoredTrees/' + file.split('.tree')[0] + '_Colored.tree', 'w') as o: + write_lines(o, newick, leaf_colors, params.tree_font_size) + diff --git a/PTL2/Scripts/contamination.py b/PTL2/Scripts/contamination.py new file mode 100644 index 0000000..324ce54 --- /dev/null +++ b/PTL2/Scripts/contamination.py @@ -0,0 +1,55 @@ +import os, sys, re +import ete3 +from logger import Logger + + +def get_best_clade(params, tree): + + if params.target_taxa_file != None: + try: + target_taxa_list = [line.strip() for line in open(PathtoFiles + '/' + target_taxa_list)] + except (FileNotFoundError, TypeError): + Logger.Error('The --target_taxa_file could not be found or was incorrectly formatted.') + + if params.at_least_file != None: + try: + at_least_list = [line.strip() for line in open(PathtoFiles + '/' + at_least_file)] + except (FileNotFoundError, TypeError): + Logger.Error('The --at_least_file could not be found or was incorrectly formatted.') + else: + at_least_list = [] + + + ### FROM HERE BELOW IN THIS FUNCTION NEEDS REPLACING WITH ETE3 + + forbidden_nodes = [node for node in nodes_to_exclude] + for node in nodes_to_exclude: + for num in tree.getNodeNumsAbove(node): + forbidden_nodes.append(tree.node(num)) + + best_node = None + best_size = 0 + for node in tree.iterNodesNoRoot(): + if(node not in forbidden_nodes): + leaves = tree.getAllLeafNames(node) + + num = 0.0; dem = 0.0; + + non_minor = [] + for leaf in leaves: + if(leaf[:2] != target_minor and leaf[:4] != target_minor): + num += 1.0; + non_minor.append(leaf[:10]) + + if(target_taxa_list == 'na' or target_taxa_list == '' or target_taxa_list == 'NA'): + n_targets = len(list(dict.fromkeys([tip[:10] for tip in leaves if(tip[:2] == target_clade or tip[:3] == target_clade or tip[:4] == target_clade or tip[:5] == target_clade or tip[:7] == target_clade or tip[:8] == target_clade)]))) + else: + n_targets = len(list(dict.fromkeys([tip[:10] for tip in leaves if((tip[:2] == target_clade or tip[:3] == target_clade or tip[:4] == target_clade or tip[:5] == target_clade or tip[:7] == target_clade or tip[:8] == target_clade) and (tip[:10] in target_taxa_list or tip[:8] in target_taxa_list))]))) + + at_least_taxa = len(list(dict.fromkeys([leaf[:10] for leaf in tree.getAllLeafNames(node) if leaf[:10] in at_least_list]))) + + if(num <= cont_num_contams and n_targets > best_size and n_targets >= min_presence and at_least_taxa >= num_at_least): + best_node = node + best_size = n_targets + + return best_node \ No newline at end of file diff --git a/PTL2/Scripts/guidance.py b/PTL2/Scripts/guidance.py new file mode 100644 index 0000000..6b30c79 --- /dev/null +++ b/PTL2/Scripts/guidance.py @@ -0,0 +1,118 @@ +import os, sys, re +from Bio import SeqIO +from logger import Logger + +def run(params): + + if params.start == 'raw' or params.start == 'unaligned': + if params.start == 'raw': + preguidance_path = params.output + '/Output/Pre-Guidance' + else: + preguidance_path = params.data + + if not os.path.isdir(preguidance_path): + Logger.Error('The path ' + preguidance_path + ' could not be found when trying to locate pre-Guidance (unaligned) files. Make sure that the --start and --data parameters are correct and/or that the pre-Guidance step ran successfully.') + + if len([f for f in os.listdir(preguidance_path) if f.endswith('.fa') or f.endswith('.faa') or f.endswith('.fasta')]) == 0: + Logger.Error('No pre-Guidance (unaligned) files could be found at the path ' + preguidance_path + '. Make sure that the --start and --data parameters are correct, that the pre-Guidance step ran successfully, and that the unaligned files are formatted correctly (they must have the file extension .faa, .fa, or .fasta).') + + os.mkdir(params.output + '/Output/Temp/Guidance') + os.mkdir(params.output + '/Output/Temp/Guidance/Input') + os.mkdir(params.output + '/Output/Temp/Guidance/Output') + + guidance_input = params.output + '/Output/Temp/Guidance/Input/' + os.system('cp -r ' + preguidance_path + '/* ' + guidance_input) + + for file in [f for f in os.listdir(guidance_input) if f.endswith('.fa') or f.endswith('.faa') or f.endswith('.fasta')]: + tax_guidance_outdir = params.output + '/Output/Temp/Guidance/Output/' + file.split('.')[0].split('_preguidance')[0] + os.mkdir(tax_guidance_outdir) + + for i in range(params.guidance_iters): + n_recs = len([r for r in SeqIO.parse(guidance_input + '/' + file, 'fasta')]) + + if n_recs < 4: + Logger.Warning('Gene famiily ' + file.split('.')[0].split('_preguidance')[0] + ' contains fewer than 4 sequences after ' + str(i) + ' Guidance iterations, therefore no alignment will be produced for this gene family.') + os.system('rm -rf ' + tax_guidance_outdir) + break + + if n_recs < 200: + mafft_alg = 'genafpair' + else: + mafft_alg = 'auto' + + os.system('Scripts/guidance.v2.02/www/Guidance/guidance.pl --seqFile ' + guidance_input + '/' + file + ' --msaProgram MAFFT --seqType aa --outDir ' + tax_guidance_outdir + ' --seqCutoff ' + str(params.seq_cutoff) + ' --colCutoff ' + str(params.col_cutoff) + " --outOrder as_input --bootstraps 10 --MSA_Param '\\--" + mafft_alg + " --maxiterate 1000 --thread " + str(params.guidance_threads) + " --bl 62 --anysymbol' > " + params.output + '/Output/Temp/Guidance/Output/' + file[:10] + '/log.txt') + + seqs_below = len([line for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names').readlines()[1:-1] if float(line.split()[-1]) < params.seq_cutoff]) + + if n_recs - seqs_below < 4: + Logger.Warning('Gene famiily ' + file.split('.')[0].split('_preguidance')[0] + ' contains fewer than 4 sequences after ' + str(i + 1) + ' Guidance iterations, therefore no alignment will be produced for this gene family.') + os.system('rm -rf ' + tax_guidance_outdir) + break + + if seqs_below == 0 or i == params.guidance_iters - 1: + Logger.Message('Guidance complete after ' + str(i + 1) + ' iterations for gene family ' + file.split('.')[0].split('_preguidance')[0]) + break + + os.system('cp ' + tax_guidance_outdir + '/Seqs.Orig.fas.FIXED.Without_low_SP_Seq.With_Names ' + guidance_input + '/' + file) + + os.system('rm -r ' + tax_guidance_outdir + '/*') + + seqs2keep = [rec.description for rec in SeqIO.parse(tax_guidance_outdir + '/Seqs.Orig.fas.FIXED.Without_low_SP_Seq.With_Names', 'fasta')] + orig_seqs = [rec.description for rec in SeqIO.parse(tax_guidance_outdir + '/MSA.MAFFT.aln.With_Names', 'fasta')] + running_aln = { rec.description : str(rec.seq) for rec in SeqIO.parse(tax_guidance_outdir + '/MSA.MAFFT.aln.With_Names', 'fasta') if rec.description in seqs2keep } + + for site in [(int(line.split()[1]), int(line.split()[0]) - 1) for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr').readlines()[1:-1] if float(line.split(' ')[-1].strip()) < params.res_cutoff]: + if(orig_seqs[site[0]] in seqs2keep): + running_aln[orig_seqs[site[0]]][site[1]] = 'X' + + cols2remove = [int(line.split()[0]) - 1 for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_col.scr').readlines()[1:-1] if float(line.split(' ')[-1].strip()) < params.col_cutoff] + for seq in running_aln: + running_aln[seq] = ''.join([running_aln[seq][i] for i in range(len(running_aln[seq])) if i not in cols2remove]) + + with open(tax_guidance_outdir + '/postGuidance_preTrimAl.fasta', 'w') as o: + for seq in running_aln: + o.write('>' + seq + '\n' + str(running_aln[seq]) + '\n\n') + + os.system('Scripts/trimal-trimAl/source/trimal -in ' + tax_guidance_outdir + '/postGuidance_preTrimAl.fasta -out ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fas -gapthreshold 0.05 -fasta') + os.system('Scripts/trimal-trimAl/source/trimal -in ' + tax_guidance_outdir + '/postGuidance_preTrimAl.fasta -out ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.phy -gapthreshold 0.05 -phylip') + + os.system('cp ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fas ' + params.output + '/Output/Guidance/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fas') + + if not params.keep_temp: + for gdir_file in os.listdir(tax_guidance_outdir): + if gdir_file not in ('MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names', 'MSA.MAFFT.aln.With_Names', 'MSA.MAFFT.Guidance2_res_pair_col.scr', 'log'): + os.system('rm -r ' + tax_guidance_outdir + '/' + gdir_file) + else: + if gdir_file == 'MSA.MAFFT.aln.With_Names': + os.system('mv ' + tax_guidance_outdir + '/' + gdir_file + ' ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_' + gdir_file + '.aln') + else: + os.system('mv ' + tax_guidance_outdir + '/' + gdir_file + ' ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_' + gdir_file) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/PTL2/Scripts/logger.py b/PTL2/Scripts/logger.py new file mode 100644 index 0000000..5d280f1 --- /dev/null +++ b/PTL2/Scripts/logger.py @@ -0,0 +1,28 @@ + + +class Logger(): + + OKBLUE = '\033[94m' + OKCYAN = '\033[96m' + OKGREEN = '\033[92m' + YELLOW = '\033[93m' + RED = '\033[91m' + ENDC = '\033[0m' + BOLD = '\033[1m' + UNDERLINE = '\033[4m' + + @staticmethod + def Error(text, style = ''): + + print(Logger.RED + Logger.BOLD + "ERROR: " + Logger.ENDC + Logger.RED + style + text + Logger.ENDC + '\n') + quit() + + @staticmethod + def Warning(text, style = ''): + + print(Logger.YELLOW + Logger.BOLD + "WARNING: " + Logger.ENDC + Logger.YELLOW + style + text + Logger.ENDC + '\n') + + @staticmethod + def Message(text, style = ''): + + print(Logger.OKGREEN + style + text + Logger.ENDC + '\n') diff --git a/PTL2/Scripts/phylotol.py b/PTL2/Scripts/phylotol.py new file mode 100644 index 0000000..33f703d --- /dev/null +++ b/PTL2/Scripts/phylotol.py @@ -0,0 +1,34 @@ +#!/usr/bin/python3 +import os, sys, re + +import utils +import preguidance +import guidance +import trees +from logger import Logger + +if __name__ == '__main__': + + params = utils.get_params() + + Logger.Message('Cleaning up existing files and organizing output folder', Logger.BOLD) + utils.clean_up(params) + + if params.start == 'raw': + Logger.Message('Running preguidance', Logger.BOLD) + preguidance.run(params) + + if params.start in ('unaligned', 'raw') and params.end in ('aligned', 'trees'): + Logger.Message('Running guidance', Logger.BOLD) + guidance.run(params) + + if params.end == 'trees': + Logger.Message('Building trees', Logger.BOLD) + trees.run(params) + + if params.contamination_loop != None: + Logger.Message('Running contamination loop', Logger.BOLD) + + #if not params.keep_temp: + # os.system('rm -r ' + params.output + '/Output/Temp') + diff --git a/PTL2/Scripts/preguidance.py b/PTL2/Scripts/preguidance.py new file mode 100644 index 0000000..d57448a --- /dev/null +++ b/PTL2/Scripts/preguidance.py @@ -0,0 +1,120 @@ +import os, sys, re +from logger import Logger +from Bio import SeqIO + + + +def run(params): + + try: + ogs = list(dict.fromkeys([line.strip() for line in open(params.gf_list)])) + except (FileNotFoundError, TypeError) as e: + Logger.Error('Unable to read GF list file. Please make sure that the path is correct and that the file is formatted correctly.\n\n' + str(e)) + + try: + taxa = list(dict.fromkeys([line.strip() for line in open(params.taxon_list)])) + except (FileNotFoundError, TypeError) as e: + Logger.Error('Unable to read taxon list file. Please make sure that the path is correct and that the file is formatted correctly.\n\n' + str(e)) + + if not os.path.isdir(params.data): + Logger.Error(Logger.Error('Input amino-acid data files not found. Please make sure that the given path (--data) is correct.')) + + aa_files = [f for f in os.listdir(params.data) if f[:10] in taxa if f.endswith('.faa') or f.endswith('.fa') or f.endswith('.fasta')] + + missing_taxa = [tax for tax in taxa if tax not in [f[:10] for f in aa_files]] + if(len(missing_taxa) > 0): + Logger.Warning('The following taxa in the taxon list are missing amino-acid files in ' + params.data + ':\n' + '\n'.join(['\t' + t for t in missing_taxa])) + + os.mkdir(params.output + '/Output/Temp/OF-SF_Diamond') + + for og in ogs: + Logger.Message('Processing ' + og) + with open(params.output + '/Output/Pre-Guidance/' + og + '_preguidance.faa', 'w') as preguidance_file: + for taxon_file in aa_files: + recs = [] + for rec in sorted([rec for rec in SeqIO.parse(params.data + '/' + taxon_file, 'fasta') if rec.id[-10:] == og], key=lambda x: -len(x.seq)): + if(rec.id == rec.description): + recs.append(rec) + else: + Logger.Warning('\tThe sequence ID ' + rec.description + ' is invalid. Please make sure that sequence IDs contain no spaces, tabs, etc. This sequence is being excluded.') + + masters = []; removed = 0; flag = 0; cycle = 0 + if len(recs) > 1: + while flag == 0: + master_file_name = params.output + '/Output/Temp/OF-SF_Diamond/' + og + '_' + taxon_file[:10] + '_master_' + str(cycle) + query_file_name = params.output + '/Output/Temp/OF-SF_Diamond/' + og + '_' + taxon_file[:10] + '_queries_' + str(cycle) + '.faa' + diamond_out_name = params.output + '/Output/Temp/OF-SF_Diamond/' + og + '_' + taxon_file[:10] + '_diamond_results_' + str(cycle) + '.tsv' + + open(master_file_name + '.faa', 'w').write('>' + recs[0].id + '\n' + str(recs[0].seq) + '\n\n') + masters.append(recs[0]) + + with open(query_file_name, 'w') as queries: + for rec in recs[1:]: + queries.write('>' + rec.id + '\n' + str(rec.seq) + '\n\n') + + os.system('diamond makedb --in ' + master_file_name + '.faa -d ' + master_file_name) + os.system('diamond blastp -d ' + master_file_name + '.dmnd -q ' + query_file_name + ' --outfmt 6 -o ' + diamond_out_name) + + diamond_out = open(diamond_out_name).readlines() + recs_to_remove = [] + for line in diamond_out: + line = line.strip().split('\t') + alignment_length = int(line[3]); gaps = int(line[5]); seq = str(line[0]); identity = float(line[2]) + + if ((alignment_length - gaps) < params.overlap_cutoff * len(recs[0].seq) and cycle == 0) or identity > params.sim_cutoff: + recs_to_remove.append(seq); removed =+ 1 + + if len([rec for rec in recs[1:] if rec.id not in recs_to_remove]) < 2: + recs = [rec for rec in recs[1:] if rec.id not in recs_to_remove] + flag = 1 + else: + recs = [rec for rec in recs[1:] if rec.id not in recs_to_remove] + cycle += 1 + + Logger.Message('\t' + str(removed) + ' sequence(s) removed by the overlap/similarity filters (' + str(cycle + 1) + ' iterations) from ' + taxon_file[:10]) + for rec in recs + masters: + preguidance_file.write('>' + rec.id + '\n' + str(rec.seq) + '\n\n') + + if(not params.keep_temp): + os.system('rm -r ' + params.output + '/Output/Temp/OF-SF_Diamond') + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/PTL2/Scripts/trees.py b/PTL2/Scripts/trees.py new file mode 100644 index 0000000..6b9f8c2 --- /dev/null +++ b/PTL2/Scripts/trees.py @@ -0,0 +1,84 @@ +import os, sys, re +from Bio import SeqIO +from logger import Logger +from color import color + +def run(params): + + if params.start == 'aligned': + guidance_path = params.data + else: + guidance_path = params.output + '/Output/Guidance' + + if not os.path.isdir(guidance_path): + Logger.Error('The path ' + guidance_path + ' could not be found when trying to locate Guidance (aligned) files. Make sure that the --start and --data parameters are correct and/or that the Guidance step ran successfully.') + + if len([f for f in os.listdir(guidance_path) if f.endswith('.fa') or f.endswith('.faa') or f.endswith('.fasta') or f.endswith('.fas') or f.endswith('.aln')]) == 0: + Logger.Error('No Guidance (unaligned) files could be found at the path ' + guidance_path + '. Make sure that the --start and --data parameters are correct, that the Guidance step ran successfully, and that the aligned files are formatted correctly (they must have the file extension .faa, .fa, .aln, .fas, or .fasta).') + + for file in [f for f in os.listdir(guidance_path) if f.endswith('.fa') or f.endswith('.faa') or f.endswith('.fasta') or f.endswith('.fas') or f.endswith('.aln')]: + + if params.tree_method == 'iqtree': + if not os.path.isdir(params.output + '/Output/Temp/IQTree'): + os.mkdir(params.output + '/Output/Temp/IQTree') + + tax_iqtree_outdir = params.output + '/Output/Temp/IQTree/' + file.split('.')[0].split('_preguidance')[0] + os.mkdir(tax_iqtree_outdir) + + os.system('iqtree2 -s ' + guidance_path + '/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fas -m LG+G --prefix ' + tax_iqtree_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_IQTree') + + if os.path.isfile(tax_iqtree_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_IQTree.treefile'): + os.system('cp ' + tax_iqtree_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_IQTree.treefile ' + params.output + '/Output/Trees/' + file.split('.')[0].split('_preguidance')[0] + '_IQTree.tree') + color(params.output + '/Output/Trees/' + file.split('.')[0].split('_preguidance')[0] + '_IQTree.tree') + else: + Logger.Warning('No tree file created by IQ-Tree for OG ' + file[:10]) + + elif params.tree_method == 'raxml': + if not os.path.isdir(params.output + '/Output/Temp/RAxML'): + os.mkdir(params.output + '/Output/Temp/RAxML') + + tax_raxml_outdir = params.output + '/Output/Temp/RAxML/' + file.split('.')[0].split('_preguidance')[0] + os.mkdir(tax_raxml_outdir) + + os.sytem('raxmlHPC-PTHREADS-AVX2 -s ' + guidance_path + '/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.phy -m PROTGAMMALG -f d -p 12345 -# 1 -n ' + file.split('.')[0].split('_preguidance')[0] + '_RAxML -T ' + str(params.guidance_threads)) + + if os.path.isfile(tax_raxml_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_IQTree.treefile'): + os.system('cp ' + tax_raxml_outdir + '/RAxML_bestTree.' + file.split('.')[0].split('_preguidance')[0] + '_RAxML ' + params.output + '/Output/Trees/' + file.split('.')[0].split('_preguidance')[0] + '_RAxML.tree') + color(params.output + '/Output/Trees/' + file.split('.')[0].split('_preguidance')[0] + '_RAxML.tree') + else: + Logger.Warning('No tree file created by RAxML for OG ' + file[:10]) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/PTL2/Scripts/utils.py b/PTL2/Scripts/utils.py new file mode 100644 index 0000000..e83b349 --- /dev/null +++ b/PTL2/Scripts/utils.py @@ -0,0 +1,140 @@ +import os, sys, re +import argparse +from logger import Logger +import shutil + + +def get_params(): + + parser = argparse.ArgumentParser( + prog = 'PhyloToL v6.0', + description = "Updated December 9, 2022 by Auden Cote-L'Heureux. Link to GitHub: https://github.com/AudenCote/PhyloToL_v6.0" + ) + + common = parser.add_argument_group('Commonly adjusted parameters') + common.add_argument('--start', default = 'raw', choices = {'raw', 'unaligned', 'aligned', 'trees'}, help = 'Stage at which to start running PhyloToL.') + common.add_argument('--end', default = 'trees', choices = {'unaligned', 'aligned', 'trees'}, help = 'Stage until which to run PhyloToL. Options are "unaligned" (which will run up to but not including guidance), "aligned" (which will run up to but not including RAxML), and "trees" which will run through RAxML') + common.add_argument('--gf_list', default = None, help = 'Path to the file with the GFs of interest. Only required if starting from the raw dataset.') + common.add_argument('--taxon_list', default = None, help = 'Path to the file with the taxa (10-digit codes) to include in the output.') + common.add_argument('--data', help = 'Path to the input dataset. The format of this varies depending on your --start parameter') + common.add_argument('--output', default = '../', help = 'Directory where the output folder should be created. If not given, the folder will be created in the parent directory of the folder containing the scripts.') + common.add_argument('--force', action = 'store_true', help = 'Overwrite all existing files in the "Output" folder.') + common.add_argument('--tree_method', default = 'iqtree', choices = {'iqtree, raxml, all'}, help = 'Program to use for tree-building') + + core = parser.add_argument_group('Core parameters (rarely altered from the defaults)') + core.add_argument('--blast_cutoff', default = 1e-20, type = float, help = 'Blast e-value cutoff') + core.add_argument('--len_cutoff', default = 10, type = int, help = 'Amino acid length cutoff for removal of very short sequences after column removal in Guidance.') + core.add_argument('--sim_cutoff', default = 0.98, type = float, help = 'Sequences from the same taxa that are assigned to the same OG are removed if they are more similar than this cutoff') + core.add_argument('--overlap_cutoff', default = 0.35, type = float, help = 'A sequence is removed if its alignment length to the longest sequence in its OG & taxon is less than this proportion of the length of the longest sequence') + core.add_argument('--guidance_iters', default = 5, type = int, help = 'Number of Guidance iterations for sequence removal') + core.add_argument('--seq_cutoff', default = 0.3, type = float, help = 'During guidance, taxa are removed if their score is below this cutoff') + core.add_argument('--col_cutoff', default = 0.4, type = float, help = 'During guidance, columns are removed if their score is below this cutoff') + core.add_argument('--res_cutoff', default = 0.0, type = float, help = 'During guidance, residues are removed if their score is below this cutoff') + core.add_argument('--guidance_threads', default = 20, type = int, help = 'Number of threads to allocate to Guidance') + + CL = parser.add_argument_group('Contamination loop parameters') + CL.add_argument('--contamination_loop', default = 'clade', choices = {'seq', 'clade'}, help = 'Remove sequences by looking at the sisters of each sequence in a rules file or by picking the best clades') + CL.add_argument('--nloops', default = 5, type = int, help = 'The maximum number of contamination-removal loops') + + CL.add_argument('--sister_rules', default = None, help = 'Path to a file of rules, nly used in "seq" mode. Sequences in the rules file with specified contaminants will be removed if sister only to those contaminants') + CL.add_argument('--remove_nm_sister', action = 'store_true', help = 'Only used in "seq" mode. When set to true, sequences sister to a non-monophyletic group of taxa that all fall into categories designated as contaminants in the rules file will be removed') + + CL.add_argument('--target_clade', nargs = '+', default = None, help = 'Only used in "clade" mode. Selected clades can have no more than num_contams (below) sequences that are not of this clade (can be 2, 4, 5, 7, 8, or 10 digits)') + CL.add_argument('--num_contams', type = int, default = 2, help = 'Only used in "clade" mode. Selected clades can have no more than this number of sequences that are not of the target clade') + CL.add_argument('--min_target_presence', type = int, default = 8, help = 'Only used in "clade" mode. The minimum number of species belonging to a target clade allowed in a selected clade') + CL.add_argument('--target_taxa_file', default = None, help = 'Only used in "clade" mode. Path to a text file containing 2, 4, 5, 7, 8, or 10 digit codes that are your target taxa; you must have at least min_target_presence (above) of ten-digit codes that match these criteria in your selected clade') + CL.add_argument('--at_least_file', default = None, help = 'Only used in "clade" mode. A file containing 2, 4, 5, 7, 8, or 10 digit codes; any selected clade must have at least at_least_sisters_num of taxa that match these criteria; this is used to require the presence of certain sister lineages') + CL.add_argument('--at_least_sisters_num', type = int, default = 1, help = 'Only used in "clade" mode. See above.') + CL.add_argument('--coverage_diff_OM', type = int, default = None, help = 'The orders of magnitude by which the coverages of two sequences in the same clade can differ for the seq with lower coverage to be removed (default is to NOT filter for differential coverage)') + CL.add_argument('--coverage_diff_abs', type = int, default = None, help = 'The absolute number by which the coverages of two sequences in the same clade can differ for the seq with lower coverage to be removed (default is to NOT filter for differential coverage)') + + sisters = parser.add_argument_group('Parameters for the sister report') + CL.add_argument('--query_clades', nargs = '+', default = None, help = 'A list of 2, 4, 5, 7, 8, or 10 digit codes specifying which taxa for which to count sisters, separated by a comma. Alternatively, input a file containing a list of 10 digit codes of taxa for which to return sisters if there are a lot') + CL.add_argument('--sister_clades', nargs = '+', help = 'A list of 2, 4, 5, 7, 8, or 10 digit codes specifying which taxa which sisters to represent in the spreadsheet, separated by a comma. Alternatively, input a file containing a list of 10 digit codes of taxa for sisters to represent if there are a lot') + CL.add_argument('--break_up', nargs = '+', default = None, help = 'A list of major clades for which to break up the sister report into minor clades') + CL.add_argument('--branch_length_filter', default = 'avg', choices = {'avg', int, float}, help = 'Filter tips to represent by branch length') + CL.add_argument('--single_sister_only', action = 'store_true', help = 'Whether or not you only want to report sister relationships when there is only a single taxon sister to a sequence') + + + other = parser.add_argument_group('Other arguments') + other.add_argument('--concatenate', action = 'store_true', help = 'Remove paralogs and generate an alignment for concatenation') + other.add_argument('--tree_font_size', default = 12, help = "Change this if you're not quite happy with the font size in the output trees. If you want smaller font in your trees, you can lower this value; and if you want larger font in your trees, you can raise this value. Some common values are 8, 10, and 12. Size 16 font is pretty big, and size 4 font is probably too small for most purposes. Iconoclasts use size 9, 11, or 13 font.") + other.add_argument('--keep_temp', action = 'store_true', help = "Use this to keep ALL Guidance intermediate files") + + return parser.parse_args() + + +def clean_up(params): + + if not os.path.isdir(params.output + '/Output'): + os.mkdir(params.output + '/Output') + elif os.path.isdir(params.output + '/Output') and params.force == False: + Logger.Error('An "Output" folder already exists at the given path. Please delete or rename this folder and try again.') + elif params.force and len([d for d in os.listdir(params.output + '/Output') if d != '.DS_Store']) > 0: + Logger.Warning('An "Output" folder already exists at the given path, but all contents were deleted in --force mode.') + os.system('rm -r ' + params.output + '/Output/*') + + os.mkdir(params.output + '/Output/Temp') + + def copy_input(dirname): + if os.path.isdir(params.data): + input_files = [f for f in os.listdir(params.data) if f.endswith('.faa') or f.endswith('.fasta') or f.endswith('.fa')] + if len(input_files) > 0: + for f in input_files: + shutil.copyfile(params.data + '/' + f, params.output + '/Output/' + dirname + '/' + f) + else: + Logger.Warning('The given path to a folder of ' + params.start.strip('s') + ' files was located, but no ' + params.start.strip('s') + ' files were found. Make sure the file extensions are .fasta, .fa, or .faa.') + else: + Logger.Error('Input ' + params.start.strip('s') + ' data files not found. Please make sure that the given path (--data) is correct or set --start to "raw".') + + os.mkdir(params.output + '/Output/Pre-Guidance') + if params.start == 'unaligned': + copy_input('Pre-Guidance') + + if params.start in ('unaligned', 'aligned') or params.end in ('aligned', 'trees', None): + os.mkdir(params.output + '/Output/Guidance') + if(params.start == 'aligned'): + copy_input('Guidance') + + if params.end == 'trees' or params.contamination_loop != None: + os.mkdir(params.output + '/Output/Trees') + os.mkdir(params.output + '/Output/ColoredTrees') + if(params.start == 'trees'): + copy_input('Trees') + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +