mirror of
http://43.156.76.180:8026/YuuMJ/EukPhylo.git
synced 2025-12-27 03:40:24 +08:00
Fixing concat_target_taxa file read error in concatenation
parent 4b8a3fbe64
commit d7f02a243c
308  PTL2/Scripts/concatenate.py  Normal file
@@ -0,0 +1,308 @@
# Last updated Jan 2024
# Authors: Auden Cote-L'Heureux and Mario Ceron-Romero

# This script chooses orthologs from each gene family (OG) to concatenate. This can be done as part of an end-to-end PhyloToL run,
# or by inputting already complete alignments and gene trees and running only the concatenation step.
# Use the --concatenate flag to run this step, and optionally use the argument --concat_target_taxa to input
# a file containing a list of taxon codes to be included in the concatenated alignment. If a GF has more
# than one sequence from a taxon, a representative ortholog must be chosen to include in the concatenated alignment.
# To do this, for each taxon PhyloToL keeps only the sequences falling in the monophyletic clade in the tree
# that contains the greatest number of species of the taxon’s minor clade (or major clade, if the ‘target taxon list’
# uses major-clade codes). If multiple sequences from the taxon fall into this largest clade, then the sequence
# with the highest ‘score’ (defined as length times k-mer coverage for transcriptomic data with k-mer coverage
# in the sequence ID as formatted by rnaSPAdes, and otherwise just length) is kept for the concatenated alignment.
# If a taxon is not present in a GF, its missing data are filled in with gaps in the concatenated alignment.
# Along with the concatenated alignment, this part of the pipeline outputs individual alignments with orthologs
# selected (and re-aligned with MAFFT), in case a user wants to construct a model-partitioned or other specialized
# kind of species tree.

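# Illustrative example of the ‘score’ described above (hypothetical sequence ID; real IDs are
# expected to embed the k-mer coverage after 'Cov', as in rnaSPAdes-derived headers):
#   ID: Sr_ci_teth_Cov35.2_NODE_1234, ungapped length: 212 aa
#   score = 212 * 35.2 = 7462.4 (falls back to length alone, i.e. 212, if no 'Cov' field is present)
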
#Dependencies
import os, sys
from Bio import SeqIO
import ete3
import argparse
from tqdm import tqdm


#Small utility function to extract newick strings from nexus file
def get_newick(fname):

    newick = ''
    for line in open(fname):
        line = line.split(' ')[-1]
        if(line.startswith('(') or line.startswith('tree1=')):
            newick = line.split('tree1=')[-1].replace("'", '').replace('\\', '')

    return newick


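# For reference, get_newick() accepts either a plain newick file or a nexus-style file in which
# the tree sits on a line whose last whitespace-separated token looks like (hypothetical example):
#   tree tree1=(Op_me_hsap_seq1:0.1,(Pl_gr_atha_seq2:0.2,Am_tu_acas_seq3:0.3):0.1);
# Only the part after 'tree1=' (or a token starting with '(') is returned, with quotes stripped.
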
#This function reroots the tree on the largest Ba/Za clade. If there is no prokaryote clade,
#it roots on the largest Op clade, then Pl, then Am, then Ex, then Sr.
def reroot(tree):

    #This nested function returns the largest clade of a given taxonomic group
    def get_best_clade(taxon):

        best_size = 0; best_clade = []; seen_leaves = []
        #Traverse all nodes
        for node in tree.traverse('levelorder'):
            #If the node is big enough and not subsumed by a node we've already accepted
            if len(node) >= 3 and len(list(set(seen_leaves) & set([leaf.name for leaf in node]))) == 0:
                leaves = [leaf.name for leaf in node]

                #Create a record of leaves that belong to the taxonomic group
                target_leaves = set()
                for leaf in leaves[::-1]:
                    if leaf[:2] in taxon:
                        target_leaves.add(leaf[:10])
                        leaves.remove(leaf)

                #If this clade is better than any clade we've seen before, grab it
                if len(target_leaves) > best_size and len(leaves) <= 2:
                    best_clade = node
                    best_size = len(target_leaves)
                    seen_leaves.extend([leaf.name for leaf in node])

        return best_clade

    #Get the biggest clade for each taxonomic group (stops once it finds one)
    for taxon in [('Ba', 'Za'), ('Op',), ('Pl',), ('Am',), ('Ex',), ('Sr',)]:
        clade = get_best_clade(taxon)
        if len([leaf for leaf in clade if leaf.name[:2] in taxon]) > 3:
            tree.set_outgroup(clade)

            break

    return tree


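# Note on conventions assumed by reroot(): leaf names are expected to begin with the ten-digit
# PhyloToL taxon code (two-letter major clade, minor clade, then species, e.g. the hypothetical
# 'Op_me_hsap'), so leaf.name[:2] gives the major clade and leaf.name[:10] the taxon. The tree
# is re-rooted in place and also returned.
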
#Function to select sequences to use per tree
def remove_paralogs(params):

    seqs_per_og = { }
    for file in tqdm(os.listdir(params.output + '/Output/Guidance')):
        if file.split('.')[-1] in ('fasta', 'fas', 'faa'):

            prefix = '.'.join(file.split('.')[:-1])
            tre_f = [t for t in os.listdir(params.output + '/Output/Trees') if t.startswith(prefix)]
            if len(tre_f) == 0:
                tre_f = [t for t in os.listdir(params.output + '/Output/Trees') if t.startswith(prefix.split('.')[0])]

            if len(tre_f) == 0:
                tre_f = [t for t in os.listdir(params.output + '/Output/Trees') if t.startswith(file[:10])]

            if len(tre_f) == 0:
                print('\nNo tree file found for alignment ' + file + '. Skipping this gene family.\n')
                continue
            elif len(tre_f) > 1:
                print('\nMore than one tree file found matching the alignment file ' + file + '. Please make your file names unique: there should be one alignment file for every tree file, with a matching unique prefix (everything before the first "."). Skipping this gene family.\n')
                continue

            seqs_per_og.update({ file : [] })
            og_recs = { rec.id : rec for rec in SeqIO.parse(params.output + '/Output/Guidance/' + file, 'fasta')}

            newick = get_newick(params.output + '/Output/Trees/' + tre_f[0])
            tree = ete3.Tree(newick)

            try:
                tree = reroot(tree)
            except:
                print('\nUnable to re-root the tree ' + file + ' (maybe it had only 1 major clade, or an inconvenient polytomy). Skipping this step and continuing to try to grab robust clades from the tree.\n')

            #Getting a clean list of all target taxa
            if params.concat_target_taxa != None and os.path.isfile(params.concat_target_taxa):
                try:
                    target_codes = [l.strip() for l in open(params.concat_target_taxa).readlines() if l.strip() != '']
                except AttributeError:
                    print('\n\nError: invalid "concat_target_taxa" argument. This must be a comma-separated list of any number of digits/characters to describe focal taxa (e.g. Sr_ci_S OR Am_tu), or a file with the extension .txt containing a list of complete or partial taxon codes. All sequences containing the complete/partial code will be identified as belonging to target taxa.\n\n')
            elif params.concat_target_taxa != None:
                target_codes = [code.strip() for code in params.concat_target_taxa.split(',') if code.strip() != '']
            else:
                print('\nERROR: missing --concat_target_taxa argument. When concatenating, you need to give the taxonomic group (sequence prefix), groups, or a file containing a list of groups (multiple prefixes) for which to select sequences to construct a concatenated alignment\n')
                exit()

            monophyletic_clades = { }

            #Create list of relevant major/minor clades for clade grabbing
            for taxon in target_codes:
                if len(taxon) < 5 and taxon[:2] not in monophyletic_clades:
                    monophyletic_clades.update({ taxon : [] })
                elif len(taxon) >= 5 and taxon[:5] not in monophyletic_clades:
                    monophyletic_clades.update({ taxon[:5] : [] })

            #Grab best clades from all target groups
            seen_leaves = []
            for clade in monophyletic_clades:
                for node in tree.traverse('levelorder'):
                    #If the node is not subsumed by a node we've already accepted
                    if len(list(set(seen_leaves) & set([leaf.name for leaf in node]))) == 0:
                        leaves = [leaf.name for leaf in node]

                        #Create a record of leaves that belong to the taxonomic group
                        target_leaves = set()
                        for leaf in leaves[::-1]:
                            if leaf[:2] == clade or leaf[:5] == clade:
                                target_leaves.add(leaf[:10])
                                leaves.remove(leaf)

                        #If the clade is monophyletic
                        if len(leaves) == 0:
                            monophyletic_clades[clade].append(node)
                            seen_leaves.extend([leaf.name for leaf in node])

            #Get all target taxa in the alignment
            taxa = []
            for seq in tree:
                for code in target_codes:
                    if code in seq.name:
                        taxa.append(seq.name[:10])
                        break

            taxa = list(dict.fromkeys(taxa))

            #For each taxon, get its best sequence
            for tax in taxa:

                #Get all sequences belonging to the taxon
                taxseqs = [seq.name for seq in tree if seq.name[:10] == tax]

                score = False

                #If there's more than one sequence
                if len(taxseqs) > 1:

                    clades = { }

                    #Get the size of the clade in which each sequence falls (at minor clade level if available, otherwise major clade)
                    if tax[:5] in monophyletic_clades:
                        clades = { seq : len([leaf for clade in monophyletic_clades[tax[:5]] for leaf in clade if seq in [l.name for l in clade]]) for seq in taxseqs }
                    elif tax[:2] in monophyletic_clades:
                        clades = { seq : len([leaf for clade in monophyletic_clades[tax[:2]] for leaf in clade if seq in [l.name for l in clade]]) for seq in taxseqs }

                    #If there's more than one sequence that falls in a robust clade
                    if len(clades) > 0:

                        #Filter the list of sequences to those that fall in clades
                        taxseqs = [seq for seq in taxseqs if seq in clades]

                        #Get the largest clade in which a sequence from the taxon falls
                        best_size = max(list(clades.values()))

                        #Get a list of sequences in a clade of that size
                        best_seqs = [seq for seq in taxseqs if clades[seq] == best_size]

                        #If there is only one sequence in the best-sized clade, take it and finish
                        if len(best_seqs) == 1:
                            seqs_per_og[file].append(og_recs[best_seqs[0]])
                        #Otherwise, need to take the sequence with the best score that falls into a clade of that size
                        else:
                            taxseqs = best_seqs
                            score = True
                    #Otherwise, of all sequences that don't fall in any clade, take the one with the best score
                    else:
                        score = True
                #If there's only one sequence for the taxon, no problem
                elif len(taxseqs) == 1:
                    seqs_per_og[file].append(og_recs[taxseqs[0]])

                #If scoring is necessary, do it on the filtered set of sequences for the taxon and keep the best
                if score:
                    use_cov = True
                    for seq in taxseqs:
                        if 'Cov' not in seq[10:]:
                            use_cov = False
                            break

                    if use_cov:
                        taxseqs = sorted(taxseqs, key = lambda x : -len(str(og_recs[x].seq).replace('-', '')) * float(x.split('Cov')[-1].split('_')[0]))
                    else:
                        taxseqs = sorted(taxseqs, key = lambda x : -len(str(og_recs[x].seq).replace('-', '')))

                    seqs_per_og[file].append(og_recs[taxseqs[0]])

    return seqs_per_og


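# The structure returned by remove_paralogs() maps each alignment file name to the list of
# selected SeqRecords, one per target taxon found in that gene family, e.g. (schematic only,
# hypothetical names):
#   { 'OG6_100001.fasta' : [<SeqRecord Sr_ci_teth_...>, <SeqRecord Op_me_hsap_...>], ... }
# Taxa missing from a gene family simply have no record here and are padded with gaps in concat().
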
#Function to concatenate all the selected sequences into one alignment file
def concat(seqs_per_og, params):

    taxa = list(dict.fromkeys([rec.id[:10] for og in seqs_per_og for rec in seqs_per_og[og]]))

    seqs_per_og = { og : { rec.id : str(rec.seq).replace('-', '') for rec in seqs_per_og[og] } for og in seqs_per_og }

    if not os.path.isdir(params.output + '/Output/DataToConcatenate'):
        os.mkdir(params.output + '/Output/DataToConcatenate')
        os.mkdir(params.output + '/Output/DataToConcatenate/Unaligned')
        os.mkdir(params.output + '/Output/DataToConcatenate/Aligned')

    for og in seqs_per_og:
        with open(params.output + '/Output/DataToConcatenate/Unaligned/' + '.'.join(og.split('.')[:-1]) + '_TargetTaxaUnaligned.fasta', 'w') as o:
            for tax in seqs_per_og[og]:
                o.write('>' + tax + '\n' + seqs_per_og[og][tax] + '\n\n')

        os.system('mafft ' + params.output + '/Output/DataToConcatenate/Unaligned/' + '.'.join(og.split('.')[:-1]) + '_TargetTaxaUnaligned.fasta > ' + params.output + '/Output/DataToConcatenate/Aligned/' + '.'.join(og.split('.')[:-1]) + '_TargetTaxaAligned.fasta')

        seqs_per_og[og] = { rec.id[:10] : str(rec.seq) for rec in SeqIO.parse(params.output + '/Output/DataToConcatenate/Aligned/' + '.'.join(og.split('.')[:-1]) + '_TargetTaxaAligned.fasta', 'fasta') }

    concat_seqs_per_tax = { tax : '' for tax in taxa }
    for taxon in taxa:
        for og in seqs_per_og:
            if taxon in seqs_per_og[og]:
                concat_seqs_per_tax[taxon] += seqs_per_og[og][taxon]
            else:
                print(list(seqs_per_og[og].values()))
                print(og)
                concat_seqs_per_tax[taxon] += ''.join(['-' for i in range(len(list(seqs_per_og[og].values())[0]))])

    with open(params.output + '/Output/ConcatenatedAlignment.fasta', 'w') as o:
        for tax in concat_seqs_per_tax:
            o.write('>' + tax + '\n' + concat_seqs_per_tax[tax] + '\n\n')


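# Outputs written by concat(): per-GF fastas of the selected orthologs in
# Output/DataToConcatenate/Unaligned, their MAFFT re-alignments in
# Output/DataToConcatenate/Aligned, and the final Output/ConcatenatedAlignment.fasta,
# in which taxa absent from a gene family are padded with gap characters.
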
#wrapper
def run(params):

    if not os.path.isdir(params.output + '/Output/Guidance') or not os.path.isdir(params.output + '/Output/Trees'):
        print('\nERROR in concatenation: cannot find alignments and/or trees (looking in ' + params.output + '/Output/Guidance and ' + params.output + '/Output/Trees)')
        exit()
    else:
        seqs_per_og = remove_paralogs(params)

        concat(seqs_per_og, params)
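
# Illustrative sketch of how this module is expected to be driven (assumed wiring; the actual
# calls live in phylotol.py and may differ slightly):
#   import utils, concatenate
#   params = utils.get_params()
#   if params.concatenate:
#       concatenate.run(params)
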
163  PTL2/Scripts/utils.py  Normal file
@@ -0,0 +1,163 @@
# Last updated: Jan 2024
# Author: Auden Cote-L'Heureux

# This is a general utility script that does two main things. First, it has
# a function to read in all PhyloToL parameters, which is called in phylotol.py.
# It also has a function that checks for and cleans up existing PhyloToL part 2
# output files from previous runs, and creates a new, empty Output folder structure
# for the new run. This function is also called only in phylotol.py.

#Dependencies
import os, sys, re
import argparse
import shutil

#Reading in all parameters. This function is only called once, in phylotol.py
def get_params():

    parser = argparse.ArgumentParser(
        prog = 'PhyloToL v6.0',
        description = "Updated January, 2022 by Auden Cote-L'Heureux. Link to GitHub: https://github.com/AudenCote/PhyloToL_v6.0"
        )

    common = parser.add_argument_group('Commonly adjusted parameters')
    common.add_argument('--start', default = 'raw', choices = {'raw', 'unaligned', 'aligned', 'trees'}, help = 'Stage at which to start running PhyloToL.')
    common.add_argument('--end', default = 'trees', choices = {'unaligned', 'aligned', 'trees'}, help = 'Stage until which to run PhyloToL. Options are "unaligned" (which will run up to but not including Guidance), "aligned" (which will run up to but not including RAxML), and "trees" (which will run through RAxML)')
    common.add_argument('--gf_list', default = None, help = 'Path to the file with the GFs of interest. Only required if starting from the raw dataset.')
    common.add_argument('--taxon_list', default = None, help = 'Path to the file with the taxa (10-digit codes) to include in the output.')
    common.add_argument('--data', help = 'Path to the input dataset. The format of this varies depending on your --start parameter. If you are running the contamination loop starting with trees, this folder must include both trees AND a fasta file for each tree (with identical file names other than the extension) that includes an amino-acid sequence for each tip of the tree (with the sequence names matching exactly the tip names).')
    common.add_argument('--output', default = './', help = 'Directory where the output folder should be created. If not given, the folder will be created in the parent directory of the folder containing the scripts.')
    common.add_argument('--force', action = 'store_true', help = 'Overwrite all existing files in the "Output" folder.')
    common.add_argument('--tree_method', default = 'iqtree', choices = {'iqtree', 'raxml', 'all'}, help = 'Program to use for tree-building')
    common.add_argument('--blacklist', type = str, help = 'A text file with a list of sequence names not to consider')
    common.add_argument('--og_identifier', default = 'OG', choices = {'OG','OG6','OGA','OGG'}, help = 'Program to use for selecting seq by GC width')
    common.add_argument('--sim_taxa', default = None, help = 'Path to the file with the taxa (10-digit codes) to apply the similarity filter on.')

    core = parser.add_argument_group('Core parameters (rarely altered from the defaults)')
    core.add_argument('--blast_cutoff', default = 1e-20, type = float, help = 'Blast e-value cutoff')
    core.add_argument('--len_cutoff', default = 10, type = int, help = 'Amino acid length cutoff for removal of very short sequences after column removal in Guidance.')
    core.add_argument('--similarity_filter', action = 'store_true', help = 'Run the similarity filter in pre-Guidance')
    core.add_argument('--sim_cutoff', default = 1, type = float, help = 'Sequences from the same taxa that are assigned to the same OG are removed if they are more similar than this cutoff')
    core.add_argument('--guidance_iters', default = 5, type = int, help = 'Number of Guidance iterations for sequence removal')
    core.add_argument('--seq_cutoff', default = 0.3, type = float, help = 'During Guidance, taxa are removed if their score is below this cutoff')
    core.add_argument('--col_cutoff', default = 0.0, type = float, help = 'During Guidance, columns are removed if their score is below this cutoff')
    core.add_argument('--res_cutoff', default = 0.0, type = float, help = 'During Guidance, residues are removed if their score is below this cutoff')
    core.add_argument('--guidance_threads', default = 20, type = int, help = 'Number of threads to allocate to Guidance')

    CL = parser.add_argument_group('Contamination loop parameters')
    CL.add_argument('--contamination_loop', default = None, choices = {'seq', 'clade', 'both'}, help = 'Remove sequences by looking at the sisters of each sequence in a rules file or by picking the best clades')
    CL.add_argument('--nloops', default = 10, type = int, help = 'The maximum number of contamination-removal loops')
    CL.add_argument('--cl_tree_method', default = 'fasttree', choices = {'iqtree', 'raxml', 'fasttree', 'iqtree_fast'}, help = 'Tree-building method to use in each contamination loop iteration.')
    CL.add_argument('--cl_alignment_method', default = 'mafft_only', choices = {'mafft_only', 'guidance'}, help = 'Alignment method to use in each contamination loop iteration.')
    CL.add_argument('--cl_exclude_taxa', type = str, default = None, help = 'Path to a file containing taxon names present in input MSA/tree files but which should be removed in the first iteration of the contamination loop.')

    CL.add_argument('--sister_rules', default = None, help = 'Path to a file of rules, only used in "seq" mode. Sequences in the rules file with specified contaminants will be removed if sister only to those contaminants')
    CL.add_argument('--subsister_rules', default = None, help = 'Path to a file of rules, only used in "seq" mode. Sequences in the rules file with specified contaminants will be removed if nested in a clade of those contaminants')
    CL.add_argument('--cocontaminants', default = None, help = 'Path to a file of rules defining samples to be processed as a single taxon in "seq" mode. The cocontaminant identifier should match an identifier in the sister rules file')

    CL.add_argument('--clade_grabbing_rules_file', default = None, help = 'Path to a file of rules if clade grabbing on multiple taxonomic groups simultaneously, only used in "clade" mode. It should have 5 tab-separated columns without headers, corresponding to the --target_taxa (separate these by spaces if multiple), --num_contams, --min_target_presence, --required_taxa, and --required_taxa_num arguments. This file should NOT have column headers.')
    CL.add_argument('--target_taxa', type = str, default = None, help = 'Only used in "clade" mode. Selected clades can have no more than num_contams (below) sequences that are not of this clade (can be 2, 4, 5, 7, 8, or 10 digits). You may give either one taxon code or a path to a file with one taxon code per line.')
    CL.add_argument('--num_contams', type = int, default = 2, help = 'Only used in "clade" mode. Selected clades can have no more than this number of sequences that are not of the target clade')
    CL.add_argument('--min_target_presence', type = int, default = 8, help = 'Only used in "clade" mode. The minimum number of species belonging to a target clade required in a selected clade')
    CL.add_argument('--required_taxa', type = str, default = None, help = 'Only used in "clade" mode. A file containing 2, 4, 5, 7, 8, or 10 digit codes; any selected clade must have at least --required_taxa_num taxa that match these criteria; this is used to require the presence of certain sister lineages')
    CL.add_argument('--required_taxa_num', type = int, default = 0, help = 'Only used in "clade" mode. See above.')
    CL.add_argument('--clade_grabbing_exceptions', type = str, default = None, help = 'Path to a file containing identifiers for taxa that should count towards clades but should never be removed (e.g. photosynthetic orphan lineages if grabbing clades of photosynthetic taxa).')

    other = parser.add_argument_group('Other arguments')
    other.add_argument('--concatenate', action = 'store_true', help = 'Remove paralogs and generate an alignment for concatenation')
    other.add_argument('--concat_target_taxa', type = str, default = None, help = 'The taxonomic group (sequence prefix), groups, or a file containing a list of groups (multiple prefixes) for which to select sequences to construct a concatenated alignment')
    other.add_argument('--tree_font_size', default = 12, help = "Change this if you're not quite happy with the font size in the output trees. If you want smaller font in your trees, you can lower this value; and if you want larger font in your trees, you can raise this value. Some common values are 8, 10, and 12. Size 16 font is pretty big, and size 4 font is probably too small for most purposes. Iconoclasts use size 9, 11, or 13 font.")
    other.add_argument('--keep_temp', action = 'store_true', help = "Use this to keep ALL Guidance intermediate files")
    other.add_argument('--keep_iter', '-z', action = 'store_true', help = 'Keep all Guidance iterations (beware this will be very large)')


    return parser.parse_args()
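
# Example invocation combining these arguments (hypothetical paths and file names; a minimal
# sketch only, not the only valid combination):
#   python phylotol.py --start unaligned --end trees --data /path/to/preguidance_fastas \
#       --output /path/to/run1 --concatenate --concat_target_taxa target_taxa.txt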


#Cleaning up existing output and creating a new output folder structure. This function is only called once, in phylotol.py
def clean_up(params):

    #If an output folder doesn't exist, create one.
    if not os.path.isdir(params.output + '/Output'):
        os.mkdir(params.output + '/Output')
    #If an output folder already exists at the given path and not running in force mode, stop.
    elif os.path.isdir(params.output + '/Output') and params.force == False:
        print('\nAn "Output" folder already exists at the given path. Please delete or rename this folder and try again.\n')
        exit()
    #If running in force mode, delete any existing output.
    elif params.force and len([d for d in os.listdir(params.output + '/Output') if d != '.DS_Store']) > 0:
        print('\nAn "Output" folder already exists at the given path, but all contents were deleted in --force mode.\n')
        os.system('rm -r ' + params.output + '/Output/*')

    #Create a folder to hold intermediate files.
    os.mkdir(params.output + '/Output/Intermediate')

    #General function to copy over input data files into the appropriate folder (e.g. if one unaligned amino acid file
    #is input per OG, then these files will be put into the Output/Pre-Guidance folder).
    def copy_input(dirname):
        if os.path.isdir(params.data):
            input_files = [f for f in os.listdir(params.data) if f.endswith('.faa') or f.endswith('.fasta') or f.endswith('.fa')]
            if len(input_files) > 0:
                for f in input_files:
                    shutil.copyfile(params.data + '/' + f, params.output + '/Output/' + dirname + '/' + f)
            else:
                print('\nThe given path to a folder of ' + params.start.strip('s') + ' files was located, but no ' + params.start.strip('s') + ' files were found. Make sure the file extensions are .fasta, .fa, or .faa.\n')
        else:
            print('\nInput ' + params.start.strip('s') + ' data files not found. Please make sure that the given path (--data) is correct or set --start to "raw".\n')

    #Create the Pre-Guidance folder and copy over any input data files that are
    #formatted as Pre-Guidance files
    os.mkdir(params.output + '/Output/Pre-Guidance')
    if params.start == 'unaligned':
        copy_input('Pre-Guidance')

    #Do the same for aligned files
    if params.start in ('unaligned', 'aligned') or params.end in ('aligned', 'trees', None):
        os.mkdir(params.output + '/Output/Guidance')
        os.mkdir(params.output + '/Output/NotGapTrimmed')
        if params.start == 'aligned':
            copy_input('Guidance')

    #And for trees
    if params.end == 'trees' or params.contamination_loop != None:
        os.mkdir(params.output + '/Output/Trees')
        os.mkdir(params.output + '/Output/ColoredTrees')
        if params.start == 'trees':
            copy_input('Trees')
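
# For reference, after clean_up() the run directory contains an Output/ folder with (depending on
# --start, --end, and --contamination_loop) some or all of: Intermediate, Pre-Guidance, Guidance,
# NotGapTrimmed, Trees, and ColoredTrees, plus any input files copied in by copy_input().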