From 43a7439a02a09529a6ef123cdf88494e1c324d39 Mon Sep 17 00:00:00 2001 From: Auden Cote-L'Heureux <52716489+AudenCote@users.noreply.github.com> Date: Fri, 3 Nov 2023 11:05:46 -0400 Subject: [PATCH] Adjusting sim filter, adding score to guidance removed seqs output --- PTL2/Scripts/guidance.py | 10 +++++++--- PTL2/Scripts/preguidance.py | 4 ++-- PTL2/Scripts/utils.py | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/PTL2/Scripts/guidance.py b/PTL2/Scripts/guidance.py index f714ff1..898f604 100644 --- a/PTL2/Scripts/guidance.py +++ b/PTL2/Scripts/guidance.py @@ -23,6 +23,9 @@ def run(params): guidance_input = params.output + '/Output/Temp/Guidance/Input/' os.system('cp -r ' + preguidance_path + '/* ' + guidance_input) + guidance_removed_file = open(params.output + '/Output/GuidanceRemovedSeqs.txt', 'w') + guidance_removed_file.write('Sequence\tScore\n') + for file in [f for f in os.listdir(guidance_input) if f.endswith('.fa') or f.endswith('.faa') or f.endswith('.fasta')]: tax_guidance_outdir = params.output + '/Output/Temp/Guidance/Output/' + file.split('.')[0].split('_preguidance')[0] os.mkdir(tax_guidance_outdir) @@ -57,6 +60,9 @@ def run(params): Logger.Message('Guidance complete after ' + str(i + 1) + ' iterations for gene family ' + file.split('.')[0].split('_preguidance')[0]) break + for line in seqs_below: + guidance_removed_file.write(line) + os.system('cp ' + tax_guidance_outdir + '/Seqs.Orig.fas.FIXED.Without_low_SP_Seq.With_Names ' + guidance_input + '/' + file) os.system('rm -r ' + tax_guidance_outdir + '/*') @@ -100,9 +106,7 @@ def run(params): else: os.system('mv ' + tax_guidance_outdir + '/' + gdir_file + ' ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_' + gdir_file) - - - + guidance_removed_file.close() diff --git a/PTL2/Scripts/preguidance.py b/PTL2/Scripts/preguidance.py index 272daea..b0c756a 100644 --- a/PTL2/Scripts/preguidance.py +++ b/PTL2/Scripts/preguidance.py @@ -38,7 +38,7 @@ def run(params): Logger.Warning('\tThe sequence ID ' + rec.description + ' is invalid. Please make sure that sequence IDs contain no spaces, tabs, etc. This sequence is being excluded.') masters = []; removed = 0; flag = 0; cycle = 0 - if params.sim_cutoff < 1: + if params.similarity_filter: if len(recs) > 1: while flag == 0: master_file_name = params.output + '/Output/Temp/SF_Diamond/' + og + '_' + taxon_file[:10] + '_master_' + str(cycle) @@ -60,7 +60,7 @@ def run(params): for line in diamond_out: line = line.strip().split('\t') - if float(line[2])/100 > params.sim_cutoff: + if float(line[2])/100 >= params.sim_cutoff: recs_to_remove.append(seq); removed =+ 1 if len([rec for rec in recs[1:] if rec.id not in recs_to_remove]) < 2: diff --git a/PTL2/Scripts/utils.py b/PTL2/Scripts/utils.py index 83fe232..8b9845e 100644 --- a/PTL2/Scripts/utils.py +++ b/PTL2/Scripts/utils.py @@ -24,8 +24,8 @@ def get_params(): core = parser.add_argument_group('Core parameters (rarely altered from the defaults)') core.add_argument('--blast_cutoff', default = 1e-20, type = float, help = 'Blast e-value cutoff') core.add_argument('--len_cutoff', default = 10, type = int, help = 'Amino acid length cutoff for removal of very short sequences after column removal in Guidance.') + core.add_argument('--similarity_filter', action = 'store_true', help = 'Run the similarity filter in pre-Guidance') core.add_argument('--sim_cutoff', default = 1, type = float, help = 'Sequences from the same taxa that are assigned to the same OG are removed if they are more similar than this cutoff') - core.add_argument('--overlap_cutoff', default = 0.35, type = float, help = 'A sequence is removed if its alignment length to the longest sequence in its OG & taxon is greater than this proportion of the length of the longest sequence') core.add_argument('--guidance_iters', default = 5, type = int, help = 'Number of Guidance iterations for sequence removal') core.add_argument('--seq_cutoff', default = 0.3, type = float, help = 'During guidance, taxa are removed if their score is below this cutoff') core.add_argument('--col_cutoff', default = 0.0, type = float, help = 'During guidance, columns are removed if their score is below this cutoff')