From cfd582f6ee06a1e4c737bb54c922a137ea0329ef Mon Sep 17 00:00:00 2001 From: Auden Cote-L'Heureux <52716489+AudenCote@users.noreply.github.com> Date: Thu, 6 Jul 2023 15:50:16 -0400 Subject: [PATCH] Minor changes to PTL2 --- PTL2/Scripts/guidance.py | 84 +++++++++++++++++++++++----------------- PTL2/Scripts/trees.py | 11 ++++-- PTL2/Scripts/utils.py | 5 ++- 3 files changed, 58 insertions(+), 42 deletions(-) diff --git a/PTL2/Scripts/guidance.py b/PTL2/Scripts/guidance.py index 6b30c79..f714ff1 100644 --- a/PTL2/Scripts/guidance.py +++ b/PTL2/Scripts/guidance.py @@ -27,12 +27,15 @@ def run(params): tax_guidance_outdir = params.output + '/Output/Temp/Guidance/Output/' + file.split('.')[0].split('_preguidance')[0] os.mkdir(tax_guidance_outdir) + fail = False for i in range(params.guidance_iters): n_recs = len([r for r in SeqIO.parse(guidance_input + '/' + file, 'fasta')]) if n_recs < 4: Logger.Warning('Gene famiily ' + file.split('.')[0].split('_preguidance')[0] + ' contains fewer than 4 sequences after ' + str(i) + ' Guidance iterations, therefore no alignment will be produced for this gene family.') os.system('rm -rf ' + tax_guidance_outdir) + if i == 0: + fail = True break if n_recs < 200: @@ -42,51 +45,60 @@ def run(params): os.system('Scripts/guidance.v2.02/www/Guidance/guidance.pl --seqFile ' + guidance_input + '/' + file + ' --msaProgram MAFFT --seqType aa --outDir ' + tax_guidance_outdir + ' --seqCutoff ' + str(params.seq_cutoff) + ' --colCutoff ' + str(params.col_cutoff) + " --outOrder as_input --bootstraps 10 --MSA_Param '\\--" + mafft_alg + " --maxiterate 1000 --thread " + str(params.guidance_threads) + " --bl 62 --anysymbol' > " + params.output + '/Output/Temp/Guidance/Output/' + file[:10] + '/log.txt') - seqs_below = len([line for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names').readlines()[1:-1] if float(line.split()[-1]) < params.seq_cutoff]) + if os.path.isfile(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names'): + seqs_below = len([line for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names').readlines()[1:-1] if float(line.split()[-1]) < params.seq_cutoff]) - if n_recs - seqs_below < 4: - Logger.Warning('Gene famiily ' + file.split('.')[0].split('_preguidance')[0] + ' contains fewer than 4 sequences after ' + str(i + 1) + ' Guidance iterations, therefore no alignment will be produced for this gene family.') - os.system('rm -rf ' + tax_guidance_outdir) + if n_recs - seqs_below < 4: + Logger.Warning('Gene famiily ' + file.split('.')[0].split('_preguidance')[0] + ' contains fewer than 4 sequences after ' + str(i + 1) + ' Guidance iterations, therefore no alignment will be produced for this gene family.') + os.system('rm -rf ' + tax_guidance_outdir) + break + + if seqs_below == 0 or i == params.guidance_iters - 1: + Logger.Message('Guidance complete after ' + str(i + 1) + ' iterations for gene family ' + file.split('.')[0].split('_preguidance')[0]) + break + + os.system('cp ' + tax_guidance_outdir + '/Seqs.Orig.fas.FIXED.Without_low_SP_Seq.With_Names ' + guidance_input + '/' + file) + + os.system('rm -r ' + tax_guidance_outdir + '/*') + else: + fail = True break - if seqs_below == 0 or i == params.guidance_iters - 1: - Logger.Message('Guidance complete after ' + str(i + 1) + ' iterations for gene family ' + file.split('.')[0].split('_preguidance')[0]) - break + if not fail: + seqs2keep = [rec.description for rec in SeqIO.parse(tax_guidance_outdir + '/Seqs.Orig.fas.FIXED.Without_low_SP_Seq.With_Names', 'fasta')] + orig_seqs = [rec.description for rec in SeqIO.parse(tax_guidance_outdir + '/MSA.MAFFT.aln.With_Names', 'fasta')] + running_aln = { rec.description : str(rec.seq) for rec in SeqIO.parse(tax_guidance_outdir + '/MSA.MAFFT.aln.With_Names', 'fasta') if rec.description in seqs2keep } - os.system('cp ' + tax_guidance_outdir + '/Seqs.Orig.fas.FIXED.Without_low_SP_Seq.With_Names ' + guidance_input + '/' + file) + for site in [(int(line.split()[1]), int(line.split()[0]) - 1) for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr').readlines()[1:-1] if float(line.split(' ')[-1].strip()) < params.res_cutoff]: + if(orig_seqs[site[0]] in seqs2keep): + running_aln[orig_seqs[site[0]]][site[1]] = 'X' - os.system('rm -r ' + tax_guidance_outdir + '/*') - - seqs2keep = [rec.description for rec in SeqIO.parse(tax_guidance_outdir + '/Seqs.Orig.fas.FIXED.Without_low_SP_Seq.With_Names', 'fasta')] - orig_seqs = [rec.description for rec in SeqIO.parse(tax_guidance_outdir + '/MSA.MAFFT.aln.With_Names', 'fasta')] - running_aln = { rec.description : str(rec.seq) for rec in SeqIO.parse(tax_guidance_outdir + '/MSA.MAFFT.aln.With_Names', 'fasta') if rec.description in seqs2keep } - - for site in [(int(line.split()[1]), int(line.split()[0]) - 1) for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr').readlines()[1:-1] if float(line.split(' ')[-1].strip()) < params.res_cutoff]: - if(orig_seqs[site[0]] in seqs2keep): - running_aln[orig_seqs[site[0]]][site[1]] = 'X' - - cols2remove = [int(line.split()[0]) - 1 for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_col.scr').readlines()[1:-1] if float(line.split(' ')[-1].strip()) < params.col_cutoff] - for seq in running_aln: - running_aln[seq] = ''.join([running_aln[seq][i] for i in range(len(running_aln[seq])) if i not in cols2remove]) - - with open(tax_guidance_outdir + '/postGuidance_preTrimAl.fasta', 'w') as o: + cols2remove = [int(line.split()[0]) - 1 for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_col.scr').readlines()[1:-1] if float(line.split(' ')[-1].strip()) < params.col_cutoff] for seq in running_aln: - o.write('>' + seq + '\n' + str(running_aln[seq]) + '\n\n') + running_aln[seq] = ''.join([running_aln[seq][i] for i in range(len(running_aln[seq])) if i not in cols2remove]) - os.system('Scripts/trimal-trimAl/source/trimal -in ' + tax_guidance_outdir + '/postGuidance_preTrimAl.fasta -out ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fas -gapthreshold 0.05 -fasta') - os.system('Scripts/trimal-trimAl/source/trimal -in ' + tax_guidance_outdir + '/postGuidance_preTrimAl.fasta -out ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.phy -gapthreshold 0.05 -phylip') + with open(tax_guidance_outdir + '/postGuidance_preTrimAl_unaligned.fasta', 'w') as o: + for seq in running_aln: + o.write('>' + seq + '\n' + str(running_aln[seq]).replace('-', '') + '\n\n') - os.system('cp ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fas ' + params.output + '/Output/Guidance/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fas') - - if not params.keep_temp: - for gdir_file in os.listdir(tax_guidance_outdir): - if gdir_file not in ('MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names', 'MSA.MAFFT.aln.With_Names', 'MSA.MAFFT.Guidance2_res_pair_col.scr', 'log'): - os.system('rm -r ' + tax_guidance_outdir + '/' + gdir_file) - else: - if gdir_file == 'MSA.MAFFT.aln.With_Names': - os.system('mv ' + tax_guidance_outdir + '/' + gdir_file + ' ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_' + gdir_file + '.aln') + print('mafft ' + tax_guidance_outdir + '/postGuidance_preTrimAl_unaligned.fasta > ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_postGuidance_preTrimAl_aligned.fasta') + os.system('mafft ' + tax_guidance_outdir + '/postGuidance_preTrimAl_unaligned.fasta > ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta') + + os.system('Scripts/trimal-trimAl/source/trimal -in ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta -out ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fasta -gapthreshold 0.05 -fasta') + + os.system('cp ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fasta ' + params.output + '/Output/Guidance/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fasta') + os.system('cp ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta ' + params.output + '/Output/NotGapTrimmed/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta') + + + if not params.keep_temp: + for gdir_file in os.listdir(tax_guidance_outdir): + if gdir_file not in ('MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names', 'MSA.MAFFT.aln.With_Names', 'MSA.MAFFT.Guidance2_res_pair_col.scr', 'log', 'postGuidance_preTrimAl_unaligned.fasta'): + os.system('rm -r ' + tax_guidance_outdir + '/' + gdir_file) else: - os.system('mv ' + tax_guidance_outdir + '/' + gdir_file + ' ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_' + gdir_file) + if gdir_file == 'MSA.MAFFT.aln.With_Names': + os.system('mv ' + tax_guidance_outdir + '/' + gdir_file + ' ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_' + gdir_file + '.aln') + else: + os.system('mv ' + tax_guidance_outdir + '/' + gdir_file + ' ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_' + gdir_file) diff --git a/PTL2/Scripts/trees.py b/PTL2/Scripts/trees.py index 6b9f8c2..97eb7d5 100644 --- a/PTL2/Scripts/trees.py +++ b/PTL2/Scripts/trees.py @@ -25,11 +25,11 @@ def run(params): tax_iqtree_outdir = params.output + '/Output/Temp/IQTree/' + file.split('.')[0].split('_preguidance')[0] os.mkdir(tax_iqtree_outdir) - os.system('iqtree2 -s ' + guidance_path + '/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fas -m LG+G --prefix ' + tax_iqtree_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_IQTree') + os.system('iqtree2 -s ' + guidance_path + '/' + file + ' -m LG+G --prefix ' + tax_iqtree_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_IQTree') if os.path.isfile(tax_iqtree_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_IQTree.treefile'): os.system('cp ' + tax_iqtree_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_IQTree.treefile ' + params.output + '/Output/Trees/' + file.split('.')[0].split('_preguidance')[0] + '_IQTree.tree') - color(params.output + '/Output/Trees/' + file.split('.')[0].split('_preguidance')[0] + '_IQTree.tree') + #color(params.output + '/Output/Trees/' + file.split('.')[0].split('_preguidance')[0] + '_IQTree.tree') else: Logger.Warning('No tree file created by IQ-Tree for OG ' + file[:10]) @@ -40,11 +40,14 @@ def run(params): tax_raxml_outdir = params.output + '/Output/Temp/RAxML/' + file.split('.')[0].split('_preguidance')[0] os.mkdir(tax_raxml_outdir) - os.sytem('raxmlHPC-PTHREADS-AVX2 -s ' + guidance_path + '/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.phy -m PROTGAMMALG -f d -p 12345 -# 1 -n ' + file.split('.')[0].split('_preguidance')[0] + '_RAxML -T ' + str(params.guidance_threads)) + os.system('./Scripts/trimal-trimAl/source/trimal -in ' + guidance_path + '/' + file + ' -phylip -out ' + tax_raxml_outdir + '/aligned.phy') + + print('raxmlHPC -s ' + tax_raxml_outdir + '/aligned.phy -m PROTGAMMALG -f d -p 12345 -# 10 -n ' + file.split('.')[0].split('_preguidance')[0] + '_RAxML -T ' + str(params.guidance_threads)) + os.system('raxmlHPC -s ' + tax_raxml_outdir + '/aligned.phy -m PROTGAMMALG -f d -p 12345 -# 10 -n ' + file.split('.')[0].split('_preguidance')[0] + '_RAxML -T ' + str(params.guidance_threads)) if os.path.isfile(tax_raxml_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_IQTree.treefile'): os.system('cp ' + tax_raxml_outdir + '/RAxML_bestTree.' + file.split('.')[0].split('_preguidance')[0] + '_RAxML ' + params.output + '/Output/Trees/' + file.split('.')[0].split('_preguidance')[0] + '_RAxML.tree') - color(params.output + '/Output/Trees/' + file.split('.')[0].split('_preguidance')[0] + '_RAxML.tree') + #color(params.output + '/Output/Trees/' + file.split('.')[0].split('_preguidance')[0] + '_RAxML.tree') else: Logger.Warning('No tree file created by RAxML for OG ' + file[:10]) diff --git a/PTL2/Scripts/utils.py b/PTL2/Scripts/utils.py index e83b349..c0af141 100644 --- a/PTL2/Scripts/utils.py +++ b/PTL2/Scripts/utils.py @@ -19,7 +19,7 @@ def get_params(): common.add_argument('--data', help = 'Path to the input dataset. The format of this varies depending on your --start parameter') common.add_argument('--output', default = '../', help = 'Directory where the output folder should be created. If not given, the folder will be created in the parent directory of the folder containing the scripts.') common.add_argument('--force', action = 'store_true', help = 'Overwrite all existing files in the "Output" folder.') - common.add_argument('--tree_method', default = 'iqtree', choices = {'iqtree, raxml, all'}, help = 'Program to use for tree-building') + common.add_argument('--tree_method', default = 'iqtree', choices = {'iqtree', 'raxml', 'all'}, help = 'Program to use for tree-building') core = parser.add_argument_group('Core parameters (rarely altered from the defaults)') core.add_argument('--blast_cutoff', default = 1e-20, type = float, help = 'Blast e-value cutoff') @@ -28,7 +28,7 @@ def get_params(): core.add_argument('--overlap_cutoff', default = 0.35, type = float, help = 'A sequence is removed if its alignment length to the longest sequence in its OG & taxon is less than this proportion of the length of the longest sequence') core.add_argument('--guidance_iters', default = 5, type = int, help = 'Number of Guidance iterations for sequence removal') core.add_argument('--seq_cutoff', default = 0.3, type = float, help = 'During guidance, taxa are removed if their score is below this cutoff') - core.add_argument('--col_cutoff', default = 0.4, type = float, help = 'During guidance, columns are removed if their score is below this cutoff') + core.add_argument('--col_cutoff', default = 0.0, type = float, help = 'During guidance, columns are removed if their score is below this cutoff') core.add_argument('--res_cutoff', default = 0.0, type = float, help = 'During guidance, residues are removed if their score is below this cutoff') core.add_argument('--guidance_threads', default = 20, type = int, help = 'Number of threads to allocate to Guidance') @@ -93,6 +93,7 @@ def clean_up(params): if params.start in ('unaligned', 'aligned') or params.end in ('aligned', 'trees', None): os.mkdir(params.output + '/Output/Guidance') + os.mkdir(params.output + '/Output/NotGapTrimmed') if(params.start == 'aligned'): copy_input('Guidance')