From 1b587bf2663e6242d687a2755a10d4da0ccb24f1 Mon Sep 17 00:00:00 2001 From: Auden Cote-L'Heureux <52716489+AudenCote@users.noreply.github.com> Date: Mon, 13 Nov 2023 12:09:57 -0500 Subject: [PATCH] Version of scripts used to successfully run conserved OGs on 11/13/23 Recent updates: Similarity filter adjusted to have on/off parameter Blacklist implemented & tested Output list of Guidance-removed sequences with seq scores Temp/intermediate file naming issue fixed --- PTL2/Scripts/guidance.py | 14 +++++++------- PTL2/Scripts/preguidance.py | 12 ++++++------ PTL2/Scripts/trees.py | 12 ++++++------ 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/PTL2/Scripts/guidance.py b/PTL2/Scripts/guidance.py index 898f604..d300be2 100644 --- a/PTL2/Scripts/guidance.py +++ b/PTL2/Scripts/guidance.py @@ -16,18 +16,18 @@ def run(params): if len([f for f in os.listdir(preguidance_path) if f.endswith('.fa') or f.endswith('.faa') or f.endswith('.fasta')]) == 0: Logger.Error('No pre-Guidance (unaligned) files could be found at the path ' + preguidance_path + '. Make sure that the --start and --data parameters are correct, that the pre-Guidance step ran successfully, and that the unaligned files are formatted correctly (they must have the file extension .faa, .fa, or .fasta).') - os.mkdir(params.output + '/Output/Temp/Guidance') - os.mkdir(params.output + '/Output/Temp/Guidance/Input') - os.mkdir(params.output + '/Output/Temp/Guidance/Output') + os.mkdir(params.output + '/Output/Intermediate/Guidance') + os.mkdir(params.output + '/Output/Intermediate/Guidance/Input') + os.mkdir(params.output + '/Output/Intermediate/Guidance/Output') - guidance_input = params.output + '/Output/Temp/Guidance/Input/' + guidance_input = params.output + '/Output/Intermediate/Guidance/Input/' os.system('cp -r ' + preguidance_path + '/* ' + guidance_input) guidance_removed_file = open(params.output + '/Output/GuidanceRemovedSeqs.txt', 'w') guidance_removed_file.write('Sequence\tScore\n') for file in [f for f in os.listdir(guidance_input) if f.endswith('.fa') or f.endswith('.faa') or f.endswith('.fasta')]: - tax_guidance_outdir = params.output + '/Output/Temp/Guidance/Output/' + file.split('.')[0].split('_preguidance')[0] + tax_guidance_outdir = params.output + '/Output/Intermediate/Guidance/Output/' + file.split('.')[0].split('_preguidance')[0] os.mkdir(tax_guidance_outdir) fail = False @@ -46,7 +46,7 @@ def run(params): else: mafft_alg = 'auto' - os.system('Scripts/guidance.v2.02/www/Guidance/guidance.pl --seqFile ' + guidance_input + '/' + file + ' --msaProgram MAFFT --seqType aa --outDir ' + tax_guidance_outdir + ' --seqCutoff ' + str(params.seq_cutoff) + ' --colCutoff ' + str(params.col_cutoff) + " --outOrder as_input --bootstraps 10 --MSA_Param '\\--" + mafft_alg + " --maxiterate 1000 --thread " + str(params.guidance_threads) + " --bl 62 --anysymbol' > " + params.output + '/Output/Temp/Guidance/Output/' + file[:10] + '/log.txt') + os.system('Scripts/guidance.v2.02/www/Guidance/guidance.pl --seqFile ' + guidance_input + '/' + file + ' --msaProgram MAFFT --seqType aa --outDir ' + tax_guidance_outdir + ' --seqCutoff ' + str(params.seq_cutoff) + ' --colCutoff ' + str(params.col_cutoff) + " --outOrder as_input --bootstraps 10 --MSA_Param '\\--" + mafft_alg + " --maxiterate 1000 --thread " + str(params.guidance_threads) + " --bl 62 --anysymbol' > " + params.output + '/Output/Intermediate/Guidance/Output/' + file[:10] + '/log.txt') if os.path.isfile(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names'): seqs_below = len([line for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names').readlines()[1:-1] if float(line.split()[-1]) < params.seq_cutoff]) @@ -60,7 +60,7 @@ def run(params): Logger.Message('Guidance complete after ' + str(i + 1) + ' iterations for gene family ' + file.split('.')[0].split('_preguidance')[0]) break - for line in seqs_below: + for line in [line for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names').readlines()[1:-1] if float(line.split()[-1]) < params.seq_cutoff]: guidance_removed_file.write(line) os.system('cp ' + tax_guidance_outdir + '/Seqs.Orig.fas.FIXED.Without_low_SP_Seq.With_Names ' + guidance_input + '/' + file) diff --git a/PTL2/Scripts/preguidance.py b/PTL2/Scripts/preguidance.py index b0c756a..b75b32d 100644 --- a/PTL2/Scripts/preguidance.py +++ b/PTL2/Scripts/preguidance.py @@ -24,7 +24,7 @@ def run(params): if(len(missing_taxa) > 0): Logger.Warning('The following taxa in the taxon list are missing amino-acid files in ' + params.data + ':\n' + '\n'.join(['\t' + t for t in missing_taxa])) - os.mkdir(params.output + '/Output/Temp/SF_Diamond') + os.mkdir(params.output + '/Output/Intermediate/SF_Diamond') for og in ogs: Logger.Message('Processing ' + og) @@ -41,9 +41,9 @@ def run(params): if params.similarity_filter: if len(recs) > 1: while flag == 0: - master_file_name = params.output + '/Output/Temp/SF_Diamond/' + og + '_' + taxon_file[:10] + '_master_' + str(cycle) - query_file_name = params.output + '/Output/Temp/SF_Diamond/' + og + '_' + taxon_file[:10] + '_queries_' + str(cycle) + '.faa' - diamond_out_name = params.output + '/Output/Temp/SF_Diamond/' + og + '_' + taxon_file[:10] + '_diamond_results_' + str(cycle) + '.tsv' + master_file_name = params.output + '/Output/Intermediate/SF_Diamond/' + og + '_' + taxon_file[:10] + '_master_' + str(cycle) + query_file_name = params.output + '/Output/Intermediate/SF_Diamond/' + og + '_' + taxon_file[:10] + '_queries_' + str(cycle) + '.faa' + diamond_out_name = params.output + '/Output/Intermediate/SF_Diamond/' + og + '_' + taxon_file[:10] + '_diamond_results_' + str(cycle) + '.tsv' open(master_file_name + '.faa', 'w').write('>' + recs[0].id + '\n' + str(recs[0].seq) + '\n\n') masters.append(recs[0]) @@ -61,7 +61,7 @@ def run(params): line = line.strip().split('\t') if float(line[2])/100 >= params.sim_cutoff: - recs_to_remove.append(seq); removed =+ 1 + recs_to_remove.append(line[0]); removed =+ 1 if len([rec for rec in recs[1:] if rec.id not in recs_to_remove]) < 2: recs = [rec for rec in recs[1:] if rec.id not in recs_to_remove] @@ -76,7 +76,7 @@ def run(params): preguidance_file.write('>' + rec.id + '\n' + str(rec.seq) + '\n\n') if(not params.keep_temp): - os.system('rm -r ' + params.output + '/Output/Temp/SF_Diamond') + os.system('rm -r ' + params.output + '/Output/Intermediate/SF_Diamond') diff --git a/PTL2/Scripts/trees.py b/PTL2/Scripts/trees.py index 97eb7d5..08d13ce 100644 --- a/PTL2/Scripts/trees.py +++ b/PTL2/Scripts/trees.py @@ -19,10 +19,10 @@ def run(params): for file in [f for f in os.listdir(guidance_path) if f.endswith('.fa') or f.endswith('.faa') or f.endswith('.fasta') or f.endswith('.fas') or f.endswith('.aln')]: if params.tree_method == 'iqtree': - if not os.path.isdir(params.output + '/Output/Temp/IQTree'): - os.mkdir(params.output + '/Output/Temp/IQTree') + if not os.path.isdir(params.output + '/Output/Intermediate/IQTree'): + os.mkdir(params.output + '/Output/Intermediate/IQTree') - tax_iqtree_outdir = params.output + '/Output/Temp/IQTree/' + file.split('.')[0].split('_preguidance')[0] + tax_iqtree_outdir = params.output + '/Output/Intermediate/IQTree/' + file.split('.')[0].split('_preguidance')[0] os.mkdir(tax_iqtree_outdir) os.system('iqtree2 -s ' + guidance_path + '/' + file + ' -m LG+G --prefix ' + tax_iqtree_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_IQTree') @@ -34,10 +34,10 @@ def run(params): Logger.Warning('No tree file created by IQ-Tree for OG ' + file[:10]) elif params.tree_method == 'raxml': - if not os.path.isdir(params.output + '/Output/Temp/RAxML'): - os.mkdir(params.output + '/Output/Temp/RAxML') + if not os.path.isdir(params.output + '/Output/Intermediate/RAxML'): + os.mkdir(params.output + '/Output/Intermediate/RAxML') - tax_raxml_outdir = params.output + '/Output/Temp/RAxML/' + file.split('.')[0].split('_preguidance')[0] + tax_raxml_outdir = params.output + '/Output/Intermediate/RAxML/' + file.split('.')[0].split('_preguidance')[0] os.mkdir(tax_raxml_outdir) os.system('./Scripts/trimal-trimAl/source/trimal -in ' + guidance_path + '/' + file + ' -phylip -out ' + tax_raxml_outdir + '/aligned.phy')