Add files via upload

This commit is contained in:
Godwin Ani 2024-04-11 14:43:53 -04:00 committed by GitHub
parent 8b35b8a64d
commit 2f67243515
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,175 +1,195 @@
# Last updated Sept 2023 # Last updated Apr 2 2024
# Authors: Auden Cote-L'Heureux and Mario Ceron-Romero # Authors: Auden Cote-L'Heureux and Mario Ceron-Romero
# This script runs Guidance in an iterative fashion for more both MSA construction # This script runs Guidance in an iterative fashion for more both MSA construction
# and more rigorous homology assessment than what is offered in PhyloToL 6 part 1. # and more rigorous homology assessment than what is offered in PhyloToL 6 part 1.
# Guidance runs until the input number of iterations (--guidance_iters, default = 5) # Guidance runs until the input number of iterations (--guidance_iters, default = 5)
# has been reached, or until there are no sequences below the sequence score cutoff. # has been reached, or until there are no sequences below the sequence score cutoff.
# All sequences below the score cutoff (--seq_cutoff, default = 0.3) are removed at # All sequences below the score cutoff (--seq_cutoff, default = 0.3) are removed at
# each iteration. By default, PhyloToL does not remove residues that fall below the # each iteration. By default, PhyloToL does not remove residues that fall below the
# given residue cutoff (--res_cutoff) and columns that fall below the given column # given residue cutoff (--res_cutoff) and columns that fall below the given column
# cutoff (--col_cutoff, defaults are 0), though this can be turned on by adjusting # cutoff (--col_cutoff, defaults are 0), though this can be turned on by adjusting
# these parameters. Outputs at this point are found in the “Guidance_NotGapTrimmed” # these parameters. Outputs at this point are found in the “Guidance_NotGapTrimmed”
# output folder. We then run MSAs through TrimAl to remove all sites in the alignment # output folder. We then run MSAs through TrimAl to remove all sites in the alignment
# that are at least 95% gaps (or --gap_trim_cutoff) generating files in the “Guidance” # that are at least 95% gaps (or --gap_trim_cutoff) generating files in the “Guidance”
# output folder. # output folder.
# This step is either intended to be run starting with --start = unaligned (but not raw) # This step is either intended to be run starting with --start = unaligned (but not raw)
# inputs, meaning one amino acid alignment per OG. It can also be run directly after the # inputs, meaning one amino acid alignment per OG. It can also be run directly after the
# preguidance step. The run() function is called in two places: in phylotol.py generally, # preguidance step. The run() function is called in two places: in phylotol.py generally,
# and in contamination.py if the contamination loop is using Guidance as the re-alignment # and in contamination.py if the contamination loop is using Guidance as the re-alignment
# method. # method.
#Dependencies #Dependencies
import os, sys, re import os, sys, re
from Bio import SeqIO from Bio import SeqIO
#Called in phylotol.py and contamination.py #Called in phylotol.py and contamination.py
def run(params): def run(params):
if params.start == 'raw' or params.start == 'unaligned': if params.start == 'raw' or params.start == 'unaligned':
#Checking that pre-Guidance has been run or that unaligned files per OG are provided. #Checking that pre-Guidance has been run or that unaligned files per OG are provided.
if params.start == 'raw': if params.start == 'raw':
preguidance_path = params.output + '/Output/Pre-Guidance' preguidance_path = params.output + '/Output/Pre-Guidance'
else: else:
preguidance_path = params.data preguidance_path = params.data
if not os.path.isdir(preguidance_path): if not os.path.isdir(preguidance_path):
print('\nERROR: The path ' + preguidance_path + ' could not be found when trying to locate pre-Guidance (unaligned) files. Make sure that the --start and --data parameters are correct and/or that the pre-Guidance step ran successfully.\n') print('\nERROR: The path ' + preguidance_path + ' could not be found when trying to locate pre-Guidance (unaligned) files. Make sure that the --start and --data parameters are correct and/or that the pre-Guidance step ran successfully.\n')
exit() exit()
if len([f for f in os.listdir(preguidance_path) if f.endswith('.fa') or f.endswith('.faa') or f.endswith('.fasta')]) == 0: if len([f for f in os.listdir(preguidance_path) if f.endswith('.fa') or f.endswith('.faa') or f.endswith('.fasta')]) == 0:
print('\nERROR: No pre-Guidance (unaligned) files could be found at the path ' + preguidance_path + '. Make sure that the --start and --data parameters are correct, that the pre-Guidance step ran successfully, and that the unaligned files are formatted correctly (they must have the file extension .faa, .fa, or .fasta).\n') print('\nERROR: No pre-Guidance (unaligned) files could be found at the path ' + preguidance_path + '. Make sure that the --start and --data parameters are correct, that the pre-Guidance step ran successfully, and that the unaligned files are formatted correctly (they must have the file extension .faa, .fa, or .fasta).\n')
exit() exit()
#Creating intermedate folders that will later be deleted unless running with --keep_temp #Creating intermedate folders that will later be deleted unless running with --keep_temp
os.mkdir(params.output + '/Output/Intermediate/Guidance') os.mkdir(params.output + '/Output/Intermediate/Guidance')
os.mkdir(params.output + '/Output/Intermediate/Guidance/Input') os.mkdir(params.output + '/Output/Intermediate/Guidance/Input')
os.mkdir(params.output + '/Output/Intermediate/Guidance/Output') os.mkdir(params.output + '/Output/Intermediate/Guidance/Output')
guidance_input = params.output + '/Output/Intermediate/Guidance/Input/' guidance_input = params.output + '/Output/Intermediate/Guidance/Input/'
os.system('cp -r ' + preguidance_path + '/* ' + guidance_input) os.system('cp -r ' + preguidance_path + '/* ' + guidance_input)
guidance_removed_file = open(params.output + '/Output/GuidanceRemovedSeqs.txt', 'w') guidance_removed_file = open(params.output + '/Output/GuidanceRemovedSeqs.txt', 'w')
guidance_removed_file.write('Sequence\tScore\n') guidance_removed_file.write('Sequence\tScore\n')
#For each unaligned AA fasta file #For each unaligned AA fasta file
for file in [f for f in os.listdir(guidance_input) if f.endswith('.fa') or f.endswith('.faa') or f.endswith('.fasta')]: for file in [f for f in os.listdir(guidance_input) if f.endswith('.fa') or f.endswith('.faa') or f.endswith('.fasta')]:
tax_guidance_outdir = params.output + '/Output/Intermediate/Guidance/Output/' + file.split('.')[0].split('_preguidance')[0] tax_guidance_outdir = params.output + '/Output/Intermediate/Guidance/Output/' + file.split('.')[0].split('_preguidance')[0]
os.mkdir(tax_guidance_outdir) os.mkdir(tax_guidance_outdir)
fail = False fail = False
#For each iteration #For each iteration
for i in range(params.guidance_iters): for i in range(params.guidance_iters):
n_recs = len([r for r in SeqIO.parse(guidance_input + '/' + file, 'fasta')]) n_recs = len([r for r in SeqIO.parse(guidance_input + '/' + file, 'fasta')])
#Guidance can't handle inputs with fewer than 4 sequences #Guidance can't handle inputs with fewer than 4 sequences
if n_recs < 4: if n_recs < 4:
print('\nWARNING: Gene famiily ' + file.split('.')[0].split('_preguidance')[0] + ' contains fewer than 4 sequences after ' + str(i) + ' Guidance iterations, therefore no alignment will be produced for this gene family.\n') print('\nWARNING: Gene famiily ' + file.split('.')[0].split('_preguidance')[0] + ' contains fewer than 4 sequences after ' + str(i) + ' Guidance iterations, therefore no alignment will be produced for this gene family.\n')
os.system('rm -rf ' + tax_guidance_outdir) os.system('rm -rf ' + tax_guidance_outdir)
if i == 0: if i == 0:
fail = True fail = True
break break
#Determining MAFFT algorithm based on the number of input sequences #Determining MAFFT algorithm based on the number of input sequences
if n_recs < 200: if n_recs < 200:
mafft_alg = 'genafpair' mafft_alg = 'genafpair'
else: else:
mafft_alg = 'auto' mafft_alg = 'auto'
#Running Guidance (one per OG per iteration) #Running Guidance (one per OG per iteration)
os.system('Scripts/guidance.v2.02/www/Guidance/guidance.pl --seqFile ' + guidance_input + '/' + file + ' --msaProgram MAFFT --seqType aa --outDir ' + tax_guidance_outdir + ' --seqCutoff ' + str(params.seq_cutoff) + ' --colCutoff ' + str(params.col_cutoff) + " --outOrder as_input --bootstraps 10 --MSA_Param '\\--" + mafft_alg + " --maxiterate 1000 --thread " + str(params.guidance_threads) + " --bl 62 --anysymbol' > " + params.output + '/Output/Intermediate/Guidance/Output/' + file[:10] + '/log.txt') os.system('Scripts/guidance.v2.02/www/Guidance/guidance.pl --seqFile ' + guidance_input + '/' + file + ' --msaProgram MAFFT --seqType aa --outDir ' + tax_guidance_outdir + ' --seqCutoff ' + str(params.seq_cutoff) + ' --colCutoff ' + str(params.col_cutoff) + " --outOrder as_input --bootstraps 10 --MSA_Param '\\--" + mafft_alg + " --maxiterate 1000 --thread " + str(params.guidance_threads) + " --bl 62 --anysymbol' > " + params.output + '/Output/Intermediate/Guidance/Output/' + file[:10] + '/log.txt')
#Checking for a sequence score file; if not available, Guidance failed. #Checking for a sequence score file; if not available, Guidance failed.
if os.path.isfile(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names'): if os.path.isfile(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names'):
#All sequences below score cutoff #All sequences below score cutoff
seqs_below = len([line for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names').readlines()[1:-1] if float(line.split()[-1]) < params.seq_cutoff]) seqs_below = len([line for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names').readlines()[1:-1] if float(line.split()[-1]) < params.seq_cutoff])
#If fewer than four were above the cutoff, this OG is done iterating. #If fewer than four were above the cutoff, this OG is done iterating.
if n_recs - seqs_below < 4: if n_recs - seqs_below < 4:
print('\nWARNING: Gene famiily ' + file.split('.')[0].split('_preguidance')[0] + ' contains fewer than 4 sequences after ' + str(i + 1) + ' Guidance iterations, therefore no alignment will be produced for this gene family.\n') print('\nWARNING: Gene famiily ' + file.split('.')[0].split('_preguidance')[0] + ' contains fewer than 4 sequences after ' + str(i + 1) + ' Guidance iterations, therefore no alignment will be produced for this gene family.\n')
os.system('rm -rf ' + tax_guidance_outdir) os.system('rm -rf ' + tax_guidance_outdir)
break break
#If all sequences were above the cutoff, this OG is done iterating. #If all sequences were above the cutoff, this OG is done iterating.
if seqs_below == 0 or i == params.guidance_iters - 1: if seqs_below == 0 or i == params.guidance_iters - 1:
print('\nGuidance complete after ' + str(i + 1) + ' iterations for gene family ' + file.split('.')[0].split('_preguidance')[0] + '\n') print('\nGuidance complete after ' + str(i + 1) + ' iterations for gene family ' + file.split('.')[0].split('_preguidance')[0] + '\n')
break break
#Recording list of sequences removed by Guidance. #Recording list of sequences removed by Guidance.
for line in [line for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names').readlines()[1:-1] if float(line.split()[-1]) < params.seq_cutoff]: for line in [line for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names').readlines()[1:-1] if float(line.split()[-1]) < params.seq_cutoff]:
guidance_removed_file.write(line) guidance_removed_file.write(line)
#Copying over the old file with the new results #Copying over the old file with the new results
os.system('cp ' + tax_guidance_outdir + '/Seqs.Orig.fas.FIXED.Without_low_SP_Seq.With_Names ' + guidance_input + '/' + file) os.system('cp ' + tax_guidance_outdir + '/Seqs.Orig.fas.FIXED.Without_low_SP_Seq.With_Names ' + guidance_input + '/' + file)
#Cleaning up the intermediate files for the iteration.
os.system('rm -r ' + tax_guidance_outdir + '/*') #Handling intermediate files for each iteration.
else: if params.keep_iter:
fail = True if i +1 < params.guidance_iters:
break os.makedirs(params.output + '/Output/Intermediate/Guidance/Iterations/', exist_ok = True)
os.makedirs(params.output + '/Output/Intermediate/Guidance/Iterations/' + str(i+1)+'/', exist_ok = True)
#After all iterations, THEN apply residue and column cutoffs os.makedirs(params.output + '/Output/Intermediate/Guidance/Iterations/' + str(i+1) + '/' + file.split('.')[0].split('_preguidance')[0], exist_ok = True)
if not fail: iteration_folder = params.output + '/Output/Intermediate/Guidance/Iterations/' + str(i +1) + '/' + file.split('.')[0].split('_preguidance')[0]
#Getting a list of sequences to keep os.system('cp -r ' + tax_guidance_outdir + '/* ' + iteration_folder)
seqs2keep = [rec.description for rec in SeqIO.parse(tax_guidance_outdir + '/Seqs.Orig.fas.FIXED.Without_low_SP_Seq.With_Names', 'fasta')]
orig_seqs = [rec.description for rec in SeqIO.parse(tax_guidance_outdir + '/MSA.MAFFT.aln.With_Names', 'fasta')]
running_aln = { rec.description : str(rec.seq) for rec in SeqIO.parse(tax_guidance_outdir + '/MSA.MAFFT.aln.With_Names', 'fasta') if rec.description in seqs2keep } if not params.keep_temp:
for gdir_file in os.listdir(iteration_folder):
#Residues that fall below the confidence cutoff (--res_cutoff) are replaced with 'X' if gdir_file not in ('MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names', 'MSA.MAFFT.aln.With_Names', 'MSA.MAFFT.Guidance2_res_pair_col.scr', 'log', 'postGuidance_preTrimAl_unaligned.fasta'):
for site in [(int(line.split()[1]), int(line.split()[0]) - 1) for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr').readlines()[1:-1] if float(line.split(' ')[-1].strip()) < params.res_cutoff]: os.system('rm -r ' + iteration_folder + '/' + gdir_file)
if(orig_seqs[site[0]] in seqs2keep): else:
running_aln[orig_seqs[site[0]]][site[1]] = 'X' if gdir_file == 'MSA.MAFFT.aln.With_Names':
os.system('mv ' + iteration_folder + '/' + gdir_file + ' ' + iteration_folder + '/' + file.split('.')[0].split('_preguidance')[0] + '_' + gdir_file + '.aln')
#Removing columns below the --col_cutoff else:
cols2remove = [int(line.split()[0]) - 1 for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_col.scr').readlines()[1:-1] if float(line.split(' ')[-1].strip()) < params.col_cutoff] os.system('mv ' + iteration_folder + '/' + gdir_file + ' ' + iteration_folder + '/' + file.split('.')[0].split('_preguidance')[0] + '_' + gdir_file)
for seq in running_aln:
running_aln[seq] = ''.join([running_aln[seq][i] for i in range(len(running_aln[seq])) if i not in cols2remove]) os.system('rm -r ' + tax_guidance_outdir + '/*')
else:
with open(tax_guidance_outdir + '/postGuidance_preTrimAl_unaligned.fasta', 'w') as o: fail = True
for seq in running_aln: break
o.write('>' + seq + '\n' + str(running_aln[seq]).replace('-', '') + '\n\n')
#After all iterations, THEN apply residue and column cutoffs
#Aligning one last time after removing the final set of sequences and applying the res and col cutoffs if not fail:
print('mafft ' + tax_guidance_outdir + '/postGuidance_preTrimAl_unaligned.fasta > ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_postGuidance_preTrimAl_aligned.fasta') #Getting a list of sequences to keep
os.system('mafft ' + tax_guidance_outdir + '/postGuidance_preTrimAl_unaligned.fasta > ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta') seqs2keep = [rec.description for rec in SeqIO.parse(tax_guidance_outdir + '/Seqs.Orig.fas.FIXED.Without_low_SP_Seq.With_Names', 'fasta')]
orig_seqs = [rec.description for rec in SeqIO.parse(tax_guidance_outdir + '/MSA.MAFFT.aln.With_Names', 'fasta')]
#Gap trimming running_aln = { rec.description : str(rec.seq) for rec in SeqIO.parse(tax_guidance_outdir + '/MSA.MAFFT.aln.With_Names', 'fasta') if rec.description in seqs2keep }
os.system('Scripts/trimal-trimAl/source/trimal -in ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta -out ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fasta -gapthreshold 0.05 -fasta')
#Residues that fall below the confidence cutoff (--res_cutoff) are replaced with 'X'
#Copying over final aligments (pre and post gap trimming) into output folder. for site in [(int(line.split()[1]), int(line.split()[0]) - 1) for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr').readlines()[1:-1] if float(line.split(' ')[-1].strip()) < params.res_cutoff]:
os.system('cp ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fasta ' + params.output + '/Output/Guidance/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fasta') if(orig_seqs[site[0]] in seqs2keep):
os.system('cp ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta ' + params.output + '/Output/NotGapTrimmed/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta') running_aln[orig_seqs[site[0]]][site[1]] = 'X'
#Removing intermediate files if not --keep_temp #Removing columns below the --col_cutoff
if not params.keep_temp: cols2remove = [int(line.split()[0]) - 1 for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_col.scr').readlines()[1:-1] if float(line.split(' ')[-1].strip()) < params.col_cutoff]
for gdir_file in os.listdir(tax_guidance_outdir): for seq in running_aln:
if gdir_file not in ('MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names', 'MSA.MAFFT.aln.With_Names', 'MSA.MAFFT.Guidance2_res_pair_col.scr', 'log', 'postGuidance_preTrimAl_unaligned.fasta'): running_aln[seq] = ''.join([running_aln[seq][i] for i in range(len(running_aln[seq])) if i not in cols2remove])
os.system('rm -r ' + tax_guidance_outdir + '/' + gdir_file)
else: with open(tax_guidance_outdir + '/postGuidance_preTrimAl_unaligned.fasta', 'w') as o:
if gdir_file == 'MSA.MAFFT.aln.With_Names': for seq in running_aln:
os.system('mv ' + tax_guidance_outdir + '/' + gdir_file + ' ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_' + gdir_file + '.aln') o.write('>' + seq + '\n' + str(running_aln[seq]).replace('-', '') + '\n\n')
else:
os.system('mv ' + tax_guidance_outdir + '/' + gdir_file + ' ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_' + gdir_file) #Aligning one last time after removing the final set of sequences and applying the res and col cutoffs
print('mafft ' + tax_guidance_outdir + '/postGuidance_preTrimAl_unaligned.fasta > ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_postGuidance_preTrimAl_aligned.fasta')
guidance_removed_file.close() os.system('mafft ' + tax_guidance_outdir + '/postGuidance_preTrimAl_unaligned.fasta > ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta')
#Gap trimming
os.system('Scripts/trimal-trimAl/source/trimal -in ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta -out ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fasta -gapthreshold 0.05 -fasta')
#Copying over final aligments (pre and post gap trimming) into output folder.
os.system('cp ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fasta ' + params.output + '/Output/Guidance/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fasta')
os.system('cp ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta ' + params.output + '/Output/NotGapTrimmed/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta')
#Removing intermediate files if not --keep_temp
if not params.keep_temp:
for gdir_file in os.listdir(tax_guidance_outdir):
if gdir_file not in ('MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names', 'MSA.MAFFT.aln.With_Names', 'MSA.MAFFT.Guidance2_res_pair_col.scr', 'log', 'postGuidance_preTrimAl_unaligned.fasta'):
os.system('rm -r ' + tax_guidance_outdir + '/' + gdir_file)
else:
if gdir_file == 'MSA.MAFFT.aln.With_Names':
os.system('mv ' + tax_guidance_outdir + '/' + gdir_file + ' ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_' + gdir_file + '.aln')
else:
os.system('mv ' + tax_guidance_outdir + '/' + gdir_file + ' ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_' + gdir_file)
guidance_removed_file.close()