Version of scripts used to successfully run conserved OGs on 11/13/23

Recent updates:
Similarity filter adjusted to have on/off parameter
Blacklist implemented & tested
Output list of Guidance-removed sequences with seq scores
Temp/intermediate file naming issue fixed
This commit is contained in:
Auden Cote-L'Heureux 2023-11-13 12:09:57 -05:00 committed by GitHub
parent 842b229833
commit 1b587bf266
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 19 additions and 19 deletions

View File

@ -16,18 +16,18 @@ def run(params):
if len([f for f in os.listdir(preguidance_path) if f.endswith('.fa') or f.endswith('.faa') or f.endswith('.fasta')]) == 0:
Logger.Error('No pre-Guidance (unaligned) files could be found at the path ' + preguidance_path + '. Make sure that the --start and --data parameters are correct, that the pre-Guidance step ran successfully, and that the unaligned files are formatted correctly (they must have the file extension .faa, .fa, or .fasta).')
os.mkdir(params.output + '/Output/Temp/Guidance')
os.mkdir(params.output + '/Output/Temp/Guidance/Input')
os.mkdir(params.output + '/Output/Temp/Guidance/Output')
os.mkdir(params.output + '/Output/Intermediate/Guidance')
os.mkdir(params.output + '/Output/Intermediate/Guidance/Input')
os.mkdir(params.output + '/Output/Intermediate/Guidance/Output')
guidance_input = params.output + '/Output/Temp/Guidance/Input/'
guidance_input = params.output + '/Output/Intermediate/Guidance/Input/'
os.system('cp -r ' + preguidance_path + '/* ' + guidance_input)
guidance_removed_file = open(params.output + '/Output/GuidanceRemovedSeqs.txt', 'w')
guidance_removed_file.write('Sequence\tScore\n')
for file in [f for f in os.listdir(guidance_input) if f.endswith('.fa') or f.endswith('.faa') or f.endswith('.fasta')]:
tax_guidance_outdir = params.output + '/Output/Temp/Guidance/Output/' + file.split('.')[0].split('_preguidance')[0]
tax_guidance_outdir = params.output + '/Output/Intermediate/Guidance/Output/' + file.split('.')[0].split('_preguidance')[0]
os.mkdir(tax_guidance_outdir)
fail = False
@ -46,7 +46,7 @@ def run(params):
else:
mafft_alg = 'auto'
os.system('Scripts/guidance.v2.02/www/Guidance/guidance.pl --seqFile ' + guidance_input + '/' + file + ' --msaProgram MAFFT --seqType aa --outDir ' + tax_guidance_outdir + ' --seqCutoff ' + str(params.seq_cutoff) + ' --colCutoff ' + str(params.col_cutoff) + " --outOrder as_input --bootstraps 10 --MSA_Param '\\--" + mafft_alg + " --maxiterate 1000 --thread " + str(params.guidance_threads) + " --bl 62 --anysymbol' > " + params.output + '/Output/Temp/Guidance/Output/' + file[:10] + '/log.txt')
os.system('Scripts/guidance.v2.02/www/Guidance/guidance.pl --seqFile ' + guidance_input + '/' + file + ' --msaProgram MAFFT --seqType aa --outDir ' + tax_guidance_outdir + ' --seqCutoff ' + str(params.seq_cutoff) + ' --colCutoff ' + str(params.col_cutoff) + " --outOrder as_input --bootstraps 10 --MSA_Param '\\--" + mafft_alg + " --maxiterate 1000 --thread " + str(params.guidance_threads) + " --bl 62 --anysymbol' > " + params.output + '/Output/Intermediate/Guidance/Output/' + file[:10] + '/log.txt')
if os.path.isfile(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names'):
seqs_below = len([line for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names').readlines()[1:-1] if float(line.split()[-1]) < params.seq_cutoff])
@ -60,7 +60,7 @@ def run(params):
Logger.Message('Guidance complete after ' + str(i + 1) + ' iterations for gene family ' + file.split('.')[0].split('_preguidance')[0])
break
for line in seqs_below:
for line in [line for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names').readlines()[1:-1] if float(line.split()[-1]) < params.seq_cutoff]:
guidance_removed_file.write(line)
os.system('cp ' + tax_guidance_outdir + '/Seqs.Orig.fas.FIXED.Without_low_SP_Seq.With_Names ' + guidance_input + '/' + file)

View File

@ -24,7 +24,7 @@ def run(params):
if(len(missing_taxa) > 0):
Logger.Warning('The following taxa in the taxon list are missing amino-acid files in ' + params.data + ':\n' + '\n'.join(['\t' + t for t in missing_taxa]))
os.mkdir(params.output + '/Output/Temp/SF_Diamond')
os.mkdir(params.output + '/Output/Intermediate/SF_Diamond')
for og in ogs:
Logger.Message('Processing ' + og)
@ -41,9 +41,9 @@ def run(params):
if params.similarity_filter:
if len(recs) > 1:
while flag == 0:
master_file_name = params.output + '/Output/Temp/SF_Diamond/' + og + '_' + taxon_file[:10] + '_master_' + str(cycle)
query_file_name = params.output + '/Output/Temp/SF_Diamond/' + og + '_' + taxon_file[:10] + '_queries_' + str(cycle) + '.faa'
diamond_out_name = params.output + '/Output/Temp/SF_Diamond/' + og + '_' + taxon_file[:10] + '_diamond_results_' + str(cycle) + '.tsv'
master_file_name = params.output + '/Output/Intermediate/SF_Diamond/' + og + '_' + taxon_file[:10] + '_master_' + str(cycle)
query_file_name = params.output + '/Output/Intermediate/SF_Diamond/' + og + '_' + taxon_file[:10] + '_queries_' + str(cycle) + '.faa'
diamond_out_name = params.output + '/Output/Intermediate/SF_Diamond/' + og + '_' + taxon_file[:10] + '_diamond_results_' + str(cycle) + '.tsv'
open(master_file_name + '.faa', 'w').write('>' + recs[0].id + '\n' + str(recs[0].seq) + '\n\n')
masters.append(recs[0])
@ -61,7 +61,7 @@ def run(params):
line = line.strip().split('\t')
if float(line[2])/100 >= params.sim_cutoff:
recs_to_remove.append(seq); removed =+ 1
recs_to_remove.append(line[0]); removed =+ 1
if len([rec for rec in recs[1:] if rec.id not in recs_to_remove]) < 2:
recs = [rec for rec in recs[1:] if rec.id not in recs_to_remove]
@ -76,7 +76,7 @@ def run(params):
preguidance_file.write('>' + rec.id + '\n' + str(rec.seq) + '\n\n')
if(not params.keep_temp):
os.system('rm -r ' + params.output + '/Output/Temp/SF_Diamond')
os.system('rm -r ' + params.output + '/Output/Intermediate/SF_Diamond')

View File

@ -19,10 +19,10 @@ def run(params):
for file in [f for f in os.listdir(guidance_path) if f.endswith('.fa') or f.endswith('.faa') or f.endswith('.fasta') or f.endswith('.fas') or f.endswith('.aln')]:
if params.tree_method == 'iqtree':
if not os.path.isdir(params.output + '/Output/Temp/IQTree'):
os.mkdir(params.output + '/Output/Temp/IQTree')
if not os.path.isdir(params.output + '/Output/Intermediate/IQTree'):
os.mkdir(params.output + '/Output/Intermediate/IQTree')
tax_iqtree_outdir = params.output + '/Output/Temp/IQTree/' + file.split('.')[0].split('_preguidance')[0]
tax_iqtree_outdir = params.output + '/Output/Intermediate/IQTree/' + file.split('.')[0].split('_preguidance')[0]
os.mkdir(tax_iqtree_outdir)
os.system('iqtree2 -s ' + guidance_path + '/' + file + ' -m LG+G --prefix ' + tax_iqtree_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_IQTree')
@ -34,10 +34,10 @@ def run(params):
Logger.Warning('No tree file created by IQ-Tree for OG ' + file[:10])
elif params.tree_method == 'raxml':
if not os.path.isdir(params.output + '/Output/Temp/RAxML'):
os.mkdir(params.output + '/Output/Temp/RAxML')
if not os.path.isdir(params.output + '/Output/Intermediate/RAxML'):
os.mkdir(params.output + '/Output/Intermediate/RAxML')
tax_raxml_outdir = params.output + '/Output/Temp/RAxML/' + file.split('.')[0].split('_preguidance')[0]
tax_raxml_outdir = params.output + '/Output/Intermediate/RAxML/' + file.split('.')[0].split('_preguidance')[0]
os.mkdir(tax_raxml_outdir)
os.system('./Scripts/trimal-trimAl/source/trimal -in ' + guidance_path + '/' + file + ' -phylip -out ' + tax_raxml_outdir + '/aligned.phy')