Iterations parameter added

This commit is contained in:
Godwin Ani 2024-03-19 10:29:41 -04:00 committed by GitHub
parent b8a68098f3
commit b91cf0fb63
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,14 +1,10 @@
#Author, date: Auden Cote-L'Heureux, July 17 2023 # Last updated Sept 2023
#Motivation: Run Guidance on multiple files # Authors: Auden Cote-L'Heureux and Mario Ceron-Romero
#Intent: To be used in place of PhyloToL part 2 if there is a good reason not to use the guidance-only mode of PhyloToL
#Dependencies: Python3, BioPython
#Inputs: A folder of unaligned fasta files
#Outputs: Folder for each input file with selected Guidance output files, renamed with the original file name at the front
#Example: python3 GuidanceWrapper_v2.1.py -i FastaFiles
########### new code
#Dependencies #Dependencies
import os, sys import os, sys, re
import argparse import argparse
from Bio import SeqIO from Bio import SeqIO
@ -18,7 +14,7 @@ parser = argparse.ArgumentParser(
description = "Updated July 21, 2023 by Auden Cote-L'Heureux" description = "Updated July 21, 2023 by Auden Cote-L'Heureux"
) )
parser.add_argument('--input', '-i', required = True, type = str, help = 'Path to folder of unaligned amino acid fasta files to align. File extensions must be fasta, fa, fas, or faa. Try using the absolute rather than relative path if working on the Grid and having trouble') parser.add_argument('--input', '-i', required = True, type = str, help = 'Path to folder of unaligned amino acid fasta files to align. File extensions must be fasta, fa, fas, or faa. Try using the absolute rather than relative path if working on the Grid and having trouble')
parser.add_argument('--output', '-o', default = '.', type = str, help = 'Path to folder where a folder named "GuidanceOutput" will be created') parser.add_argument('--output', '-o', default = '.', type = str, help = 'Path to folder where files will be created')
parser.add_argument('--codon', action = 'store_true', help = 'Run on nucleotide files by translating codons') parser.add_argument('--codon', action = 'store_true', help = 'Run on nucleotide files by translating codons')
parser.add_argument('--iterations', '-n', default = 5, type = int, help = 'Number of Guidance iterations (default = 5)') parser.add_argument('--iterations', '-n', default = 5, type = int, help = 'Number of Guidance iterations (default = 5)')
parser.add_argument('--guidance_path', '-p', default = 'guidance.v2.02', type = str, help = 'Path to the guidance_v2.02 folder') parser.add_argument('--guidance_path', '-p', default = 'guidance.v2.02', type = str, help = 'Path to the guidance_v2.02 folder')
@ -28,145 +24,140 @@ parser.add_argument('--res_cutoff', '-r', default = 0.0, type = float, help = 'D
parser.add_argument('--force', '-f', action = 'store_true', help = 'Delete existing output folder at given output path') parser.add_argument('--force', '-f', action = 'store_true', help = 'Delete existing output folder at given output path')
parser.add_argument('--keep_temp', '-k', action = 'store_true', help = 'Keep all Guidance intermediate files (beware, some can be very large)') parser.add_argument('--keep_temp', '-k', action = 'store_true', help = 'Keep all Guidance intermediate files (beware, some can be very large)')
parser.add_argument('--guidance_threads', '-t', default = 20, type = int, help = 'Number of threads to allocate to Guidance') parser.add_argument('--guidance_threads', '-t', default = 20, type = int, help = 'Number of threads to allocate to Guidance')
parser.add_argument('--keep_iter', '-z', action = 'store_true', help = 'Keep all Guidance intermediate files (beware, some can be very large)')
args = parser.parse_args() args = parser.parse_args()
#Creating an output folder #Creating intermedate folders that will later be deleted unless running with --keep_temp
if not os.path.isdir(args.output + '/GuidanceOutput'):
os.mkdir(args.output + '/GuidanceOutput')
elif not args.force:
print('\nERROR: An output folder already exists at the given path. Delete it or run in --force mode\n')
quit()
#For each file in the input folder os.makedirs(args.output + '/Output/Intermediate/Guidance')
for file in os.listdir(args.input): os.mkdir(args.output + '/Output/Intermediate/Guidance/Input')
#If it has an appropriate extension for an amino acid fasta file os.mkdir(args.output + '/Output/Intermediate/Guidance/Output')
if file.split('.')[-1] in ('fasta', 'fa', 'fas', 'faa'): #new
#Create the output folder os.mkdir(args.output + '/Output/Guidance')
tax_guidance_outdir = args.output + '/GuidanceOutput/' + file.split('.')[0] os.mkdir(args.output + '/Output/NotGapTrimmed')
if(not os.path.isdir(tax_guidance_outdir)): #new
os.mkdir(tax_guidance_outdir) guidance_input = args.output + 'Output/Intermediate/Guidance/Input/'
os.system('cp -r ' + args.input + '/* ' + guidance_input)
#Read in the sequence data and replace any * and U characters with 'X' guidance_removed_file = open(args.output + '/Output/GuidanceRemovedSeqs.txt', 'w')
recs = [r for r in SeqIO.parse(args.input + '/' + file, 'fasta')] guidance_removed_file.write('Sequence\tScore\n')
with open(args.input + '/' + file, 'w') as o:
for rec in recs:
o.write('>' + rec.description + '\n' + str(rec.seq).replace('*', 'X').replace('U', 'X') + '\n\n')
fail = False #For each unaligned AA fasta file
#For each Guidance iteration (default = 5) for file in [f for f in os.listdir(guidance_input) if f.endswith('.fa') or f.endswith('.faa') or f.endswith('.fasta')]:
for i in range(args.iterations): tax_guidance_outdir = args.output + '/Output/Intermediate/Guidance/Output/' + file.split('.')[0].split('_preguidance')[0]
#Get the number sequences in the unaligned file os.mkdir(tax_guidance_outdir)
n_recs = len([r for r in SeqIO.parse(args.input + '/' + file, 'fasta')]) fail = False
#For each iteration
for i in range(args.iterations):
n_recs = len([r for r in SeqIO.parse(guidance_input + '/' + file, 'fasta')])
#Guidance can't handle inputs with fewer than 4 sequences
if n_recs < 4:
print('\nWARNING: Gene famiily ' + file.split('.')[0].split('_preguidance')[0] + ' contains fewer than 4 sequences after ' + str(i) + ' Guidance iterations, therefore no alignment will be produced for this gene family.\n')
os.system('rm -rf ' + tax_guidance_outdir)
if i == 0:
fail = True
break
#Cancel the Guidance run if there are fewer than 4 sequences #Determining MAFFT algorithm based on the number of input sequences
if n_recs < 4: if n_recs < 200:
print('\nGene famiily ' + file.split('.')[0].split('_preguidance')[0] + ' contains fewer than 4 sequences after ' + str(i) + ' Guidance iterations, therefore no alignment will be produced for this gene family\n') mafft_alg = 'genafpair'
os.system('rm -rf ' + tax_guidance_outdir) else:
if i == 0: mafft_alg = 'auto'
fail = True #Determining if its nucleotide or AA
break if args.codon:
seqtype = 'codon'
else:
seqtype = 'aa'
#MAFFT parameters depend on sequence count #Run Guidance - new
if n_recs < 200: os.system(args.guidance_path + '/www/Guidance/guidance.pl --seqFile ' + guidance_input + '/' + file + ' --msaProgram MAFFT --seqType ' + seqtype + ' --outDir ' + tax_guidance_outdir + ' --seqCutoff ' + str(args.seq_cutoff) + ' --colCutoff ' + str(args.col_cutoff) + " --outOrder as_input --bootstraps 10 --MSA_Param '\\--" + mafft_alg + " --maxiterate 1000 --thread " + str(args.guidance_threads) + " --bl 62 --anysymbol' > " + args.output + '/Output/Intermediate/Guidance/Output/' + file[:10] + '/log.txt')
mafft_alg = 'genafpair'
else:
mafft_alg = 'auto' #Checking for a sequence score file; if not available, Guidance failed.
if os.path.isfile(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names'):
if args.codon: #All sequences below score cutoff
seqtype = 'codon' seqs_below = len([line for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names').readlines()[1:-1] if float(line.split()[-1]) < args.seq_cutoff])
else: #If fewer than four were above the cutoff, this OG is done iterating.
seqtype = 'aa' if n_recs - seqs_below < 4:
print('\nWARNING: Gene famiily ' + file.split('.')[0].split('_preguidance')[0] + ' contains fewer than 4 sequences after ' + str(i + 1) + ' Guidance iterations, therefore no alignment will be produced for this gene family.\n')
#Run Guidance os.system('rm -rf ' + tax_guidance_outdir)
os.system(args.guidance_path + '/www/Guidance/guidance.pl --seqFile ' + args.input + '/' + file + ' --msaProgram MAFFT --seqType ' + seqtype + ' --outDir ' + tax_guidance_outdir + ' --seqCutoff ' + str(args.seq_cutoff) + ' --colCutoff ' + str(args.col_cutoff) + " --outOrder as_input --bootstraps 10 --MSA_Param '\\--" + mafft_alg + " --maxiterate 1000 --thread " + str(args.guidance_threads) + " --bl 62 --anysymbol' > " + tax_guidance_outdir + '/log.txt') break
#If all sequences were above the cutoff, this OG is done iterating.
#If it ran successfully if seqs_below == 0 or i == args.iterations - 1:
if os.path.isfile(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names'): print('\nGuidance complete after ' + str(i + 1) + ' iterations for gene family ' + file.split('.')[0].split('_preguidance')[0] + '\n')
sep = ' ' break
if '\t' in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names').readlines()[1]: #Recording list of sequences removed by Guidance.
sep = '\t' for line in [line for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names').readlines()[1:-1] if float(line.split()[-1]) < args.seq_cutoff]:
guidance_removed_file.write(line)
#Create a record of sequences below the sequence score cutoff #Copying over the old file with the new results
seqs_below = len([line for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names').readlines()[1:-1] if float(line.split(sep)[-1]) < args.seq_cutoff]) os.system('cp ' + tax_guidance_outdir + '/Seqs.Orig.fas.FIXED.Without_low_SP_Seq.With_Names ' + guidance_input + '/' + file)
#new
#If the number of remaining sequences is less than 4, then stop iterating #Handling intermediate files for each iteration.
if n_recs - seqs_below < 4: if args.keep_iter:
print('\nGene famiily ' + file.split('.')[0].split('_preguidance')[0] + ' contains fewer than 4 sequences after ' + str(i + 1) + ' Guidance iterations, therefore no alignment will be produced for this gene family.\n') if i +1 < args.iterations:
os.system('rm -rf ' + tax_guidance_outdir) os.makedirs(args.output + '/Output/iterations/', exist_ok = True)
break os.makedirs(args.output + '/Output/iterations/' + str(i+1)+'/', exist_ok = True)
os.makedirs(args.output + '/Output/iterations/' + str(i+1) + '/' + file.split('.')[0].split('_preguidance')[0], exist_ok = True)
#If there are no sequences below the cutoff or the number of Guidance iterations has been reached, then stop iterating iteration_folder = args.output + '/Output/iterations/'+ str(i +1) + '/' + file.split('.')[0].split('_preguidance')[0]
if seqs_below == 0 or i == args.iterations - 1: os.system('cp -r ' + tax_guidance_outdir + '/* ' + iteration_folder)
print('\nGuidance complete after ' + str(i + 1) + ' iterations for gene family ' + file.split('.')[0].split('_preguidance')[0] + '\n')
break
#If another iteration is needed, then replace the original input file with the sequence-filtered run to input to Guidance for the next iteration
os.system('cp ' + tax_guidance_outdir + '/Seqs.Orig.fas.FIXED.Without_low_SP_Seq.With_Names ' + args.input + '/' + file)
#If another iteration is needed, remove the Guidance output from the last iteration
os.system('rm -r ' + tax_guidance_outdir + '/*') os.system('rm -r ' + tax_guidance_outdir + '/*')
else: if not args.keep_temp:
fail = True for gdir_file in os.listdir(iteration_folder):
break if gdir_file not in ('MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names', 'MSA.MAFFT.aln.With_Names', 'MSA.MAFFT.Guidance2_res_pair_col.scr', 'log', 'postGuidance_preTrimAl_unaligned.fasta'):
os.system('rm -r ' + iteration_folder + '/' + gdir_file)
#If Guidance ran successfully
if not fail:
#Create a record of sequences above the sequence score cutoff
seqs2keep = [rec.description for rec in SeqIO.parse(tax_guidance_outdir + '/Seqs.Orig.fas.FIXED.Without_low_SP_Seq.With_Names', 'fasta')]
orig_seqs = [rec.description for rec in SeqIO.parse(tax_guidance_outdir + '/MSA.MAFFT.aln.With_Names', 'fasta')]
#Put this sequence information in a dictionary and write out the surviving sequences to an unaligned .fasta file
running_aln = { rec.description : str(rec.seq) for rec in SeqIO.parse(tax_guidance_outdir + '/MSA.MAFFT.aln.With_Names', 'fasta') if rec.description in seqs2keep }
with open(tax_guidance_outdir + '/postGuidance_preMAFFT_unaligned.fasta', 'w') as o:
for seq in running_aln:
o.write('>' + seq + '\n' + str(running_aln[seq]).replace('-', '') + '\n\n')
#Align the file with MAFFT
os.system('mafft ' + tax_guidance_outdir + '/postGuidance_preMAFFT_unaligned.fasta > ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_MAFFT_realigned.fasta')
#Read in the MAFFT alignment
running_aln = { rec.description : str(rec.seq) for rec in SeqIO.parse(tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_MAFFT_realigned.fasta', 'fasta') }
sep = ' '
if '\t' in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr').readlines()[1]:
sep = '\t'
#Apply residue cutoff per site per sequence
for site in [(int(line.split(sep)[1]), int(line.split(sep)[0]) - 1) for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr').readlines()[1:-1] if float(line.split(sep)[-1].strip()) < args.res_cutoff]:
if(orig_seqs[site[0]] in seqs2keep):
running_aln[orig_seqs[site[0]]][site[1]] = 'X'
sep = ' '
if '\t' in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_col.scr').readlines()[1]:
sep = '\t'
#Apply column cutoff per column
cols2remove = [int(line.split(sep)[0]) - 1 for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_col.scr').readlines()[1:-1] if float(line.split(sep)[-1].strip()) < args.col_cutoff]
for seq in running_aln:
running_aln[seq] = ''.join([running_aln[seq][i] for i in range(len(running_aln[seq])) if i not in cols2remove])
#Write out the final filtered alignment
with open(tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_MAFFT_realigned.fasta', 'w') as o:
for seq in running_aln:
o.write('>' + seq + '\n' + str(running_aln[seq]) + '\n\n')
#If the user hasn't specified to keep all intermediate files, then delete the extraneous ones and rename the rest per the input file name
if not args.keep_temp:
for gdir_file in os.listdir(tax_guidance_outdir):
if gdir_file not in ('MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names', 'MSA.MAFFT.aln.With_Names', 'MSA.MAFFT.Guidance2_res_pair_col.scr', 'log', file.split('.')[0].split('_preguidance')[0] + '.postGuidance_MAFFT_realigned.fasta'):
os.system('rm -r ' + tax_guidance_outdir + '/' + gdir_file)
else:
if gdir_file == 'MSA.MAFFT.aln.With_Names':
os.system('mv ' + tax_guidance_outdir + '/' + gdir_file + ' ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_' + gdir_file + '.aln')
else: else:
os.system('mv ' + tax_guidance_outdir + '/' + gdir_file + ' ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_' + gdir_file) if gdir_file == 'MSA.MAFFT.aln.With_Names':
os.system('mv ' + iteration_folder + '/' + gdir_file + ' ' + iteration_folder + '/' + file.split('.')[0].split('_preguidance')[0] + '_' + gdir_file + '.aln')
else:
os.system('mv ' + iteration_folder + '/' + gdir_file + ' ' + iteration_folder + '/' + file.split('.')[0].split('_preguidance')[0] + '_' + gdir_file)
#new
else:
fail = True
break
#After all iterations, THEN apply residue and column cutoffs
if not fail:
#Getting a list of sequences to keep
seqs2keep = [rec.description for rec in SeqIO.parse(tax_guidance_outdir + '/Seqs.Orig.fas.FIXED.Without_low_SP_Seq.With_Names', 'fasta')]
orig_seqs = [rec.description for rec in SeqIO.parse(tax_guidance_outdir + '/MSA.MAFFT.aln.With_Names', 'fasta')]
running_aln = { rec.description : str(rec.seq) for rec in SeqIO.parse(tax_guidance_outdir + '/MSA.MAFFT.aln.With_Names', 'fasta') if rec.description in seqs2keep }
#Residues that fall below the confidence cutoff (--res_cutoff) are replaced with 'X'
for site in [(int(line.split()[1]), int(line.split()[0]) - 1) for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_seq.scr').readlines()[1:-1] if float(line.split(' ')[-1].strip()) < args.res_cutoff]:
if(orig_seqs[site[0]] in seqs2keep):
running_aln[orig_seqs[site[0]]][site[1]] = 'X'
#Removing columns below the --col_cutoff
cols2remove = [int(line.split()[0]) - 1 for line in open(tax_guidance_outdir + '/MSA.MAFFT.Guidance2_res_pair_col.scr').readlines()[1:-1] if float(line.split(' ')[-1].strip()) < args.col_cutoff]
for seq in running_aln:
running_aln[seq] = ''.join([running_aln[seq][i] for i in range(len(running_aln[seq])) if i not in cols2remove])
with open(tax_guidance_outdir + '/postGuidance_preTrimAl_unaligned.fasta', 'w') as o:
for seq in running_aln:
o.write('>' + seq + '\n' + str(running_aln[seq]).replace('-', '') + '\n\n')
#Aligning one last time after removing the final set of sequences and applying the res and col cutoffs
print('mafft ' + tax_guidance_outdir + '/postGuidance_preTrimAl_unaligned.fasta > ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_postGuidance_preTrimAl_aligned.fasta')
os.system('mafft ' + tax_guidance_outdir + '/postGuidance_preTrimAl_unaligned.fasta > ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta')
#Gap trimming
os.system('Scripts/trimal-trimAl/source/trimal -in ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta -out ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fasta -gapthreshold 0.05 -fasta')
#Copying over final aligments (pre and post gap trimming) into output folder.
os.system('cp ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fasta ' + args.output + '/Output/Guidance/' + file.split('.')[0].split('_preguidance')[0] + '.95gapTrimmed.fasta')
os.system('cp ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta ' + args.output + '/Output/NotGapTrimmed/' + file.split('.')[0].split('_preguidance')[0] + '.postGuidance_preTrimAl_aligned.fasta')
#Removing intermediate files if not --keep_temp
if not args.keep_temp:
for gdir_file in os.listdir(tax_guidance_outdir):
if gdir_file not in ('MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names', 'MSA.MAFFT.aln.With_Names', 'MSA.MAFFT.Guidance2_res_pair_col.scr', 'log', 'postGuidance_preTrimAl_unaligned.fasta'):
os.system('rm -r ' + tax_guidance_outdir + '/' + gdir_file)
else:
if gdir_file == 'MSA.MAFFT.aln.With_Names':
os.system('mv ' + tax_guidance_outdir + '/' + gdir_file + ' ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_' + gdir_file + '.aln')
else:
os.system('mv ' + tax_guidance_outdir + '/' + gdir_file + ' ' + tax_guidance_outdir + '/' + file.split('.')[0].split('_preguidance')[0] + '_' + gdir_file)
guidance_removed_file.close()