Adjust similarity filter; add score column to Guidance removed-sequences output

This commit is contained in:
Auden Cote-L'Heureux 2023-11-03 11:05:46 -04:00 committed by GitHub
parent 72660b742b
commit 43a7439a02
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 10 additions and 6 deletions

View File

@ -23,6 +23,9 @@ def run(params):
guidance_input = params.output + '/Output/Temp/Guidance/Input/'
os.system('cp -r ' + preguidance_path + '/* ' + guidance_input)
guidance_removed_file = open(params.output + '/Output/GuidanceRemovedSeqs.txt', 'w')
guidance_removed_file.write('Sequence\tScore\n')
for file in [f for f in os.listdir(guidance_input) if f.endswith('.fa') or f.endswith('.faa') or f.endswith('.fasta')]:
tax_guidance_outdir = params.output + '/Output/Temp/Guidance/Output/' + file.split('.')[0].split('_preguidance')[0]
os.mkdir(tax_guidance_outdir)
@ -57,6 +60,9 @@ def run(params):
Logger.Message('Guidance complete after ' + str(i + 1) + ' iterations for gene family ' + file.split('.')[0].split('_preguidance')[0])
break
for line in seqs_below:
guidance_removed_file.write(line)
os.system('cp ' + tax_guidance_outdir + '/Seqs.Orig.fas.FIXED.Without_low_SP_Seq.With_Names ' + guidance_input + '/' + file)
os.system('rm -r ' + tax_guidance_outdir + '/*')
@ -100,9 +106,7 @@ def run(params):
else:
os.system('mv ' + tax_guidance_outdir + '/' + gdir_file + ' ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_' + gdir_file)
guidance_removed_file.close()

View File

@ -38,7 +38,7 @@ def run(params):
Logger.Warning('\tThe sequence ID ' + rec.description + ' is invalid. Please make sure that sequence IDs contain no spaces, tabs, etc. This sequence is being excluded.')
masters = []; removed = 0; flag = 0; cycle = 0
if params.sim_cutoff < 1:
if params.similarity_filter:
if len(recs) > 1:
while flag == 0:
master_file_name = params.output + '/Output/Temp/SF_Diamond/' + og + '_' + taxon_file[:10] + '_master_' + str(cycle)
@ -60,7 +60,7 @@ def run(params):
for line in diamond_out:
line = line.strip().split('\t')
if float(line[2])/100 > params.sim_cutoff:
if float(line[2])/100 >= params.sim_cutoff:
recs_to_remove.append(seq); removed =+ 1
if len([rec for rec in recs[1:] if rec.id not in recs_to_remove]) < 2:

View File

@ -24,8 +24,8 @@ def get_params():
core = parser.add_argument_group('Core parameters (rarely altered from the defaults)')
core.add_argument('--blast_cutoff', default = 1e-20, type = float, help = 'Blast e-value cutoff')
core.add_argument('--len_cutoff', default = 10, type = int, help = 'Amino acid length cutoff for removal of very short sequences after column removal in Guidance.')
core.add_argument('--similarity_filter', action = 'store_true', help = 'Run the similarity filter in pre-Guidance')
core.add_argument('--sim_cutoff', default = 1, type = float, help = 'Sequences from the same taxa that are assigned to the same OG are removed if they are more similar than this cutoff')
core.add_argument('--overlap_cutoff', default = 0.35, type = float, help = 'A sequence is removed if its alignment length to the longest sequence in its OG & taxon is greater than this proportion of the length of the longest sequence')
core.add_argument('--guidance_iters', default = 5, type = int, help = 'Number of Guidance iterations for sequence removal')
core.add_argument('--seq_cutoff', default = 0.3, type = float, help = 'During guidance, taxa are removed if their score is below this cutoff')
core.add_argument('--col_cutoff', default = 0.0, type = float, help = 'During guidance, columns are removed if their score is below this cutoff')