From 33b4462bc21c027b0efbeb6c065a8058a754129b Mon Sep 17 00:00:00 2001 From: MCLeleu <123706003+MCLeleu@users.noreply.github.com> Date: Fri, 3 Mar 2023 16:55:53 -0500 Subject: [PATCH] Add files via upload --- Utilities/for_fastas/GuidanceWrapper.py | 50 +++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 Utilities/for_fastas/GuidanceWrapper.py diff --git a/Utilities/for_fastas/GuidanceWrapper.py b/Utilities/for_fastas/GuidanceWrapper.py new file mode 100644 index 0000000..61c9507 --- /dev/null +++ b/Utilities/for_fastas/GuidanceWrapper.py @@ -0,0 +1,50 @@ +#Author: Auden Cote-L'Heureux +#Last update: 03/01/2023; added replacement of non-AA characters '*' and 'U' by 'X' after discussion with LAK and ML on 02/28/2023 + + +import os, sys +from Bio import SeqIO + +batchdir = sys.argv[1] + +seq_cutoff = 0.3 +col_cutoff = 0.4 + +for file in os.listdir(batchdir): + if(not os.path.isdir('/beegfs/fast/katzlab/acl/Hook-DEV/Guidance_G1NS/GuidanceOutput/' + file.split('.')[0])): + os.mkdir('/beegfs/fast/katzlab/acl/Hook-DEV/Guidance_G1NS/GuidanceOutput/' + file.split('.')[0]) + + recs = [r for r in SeqIO.parse(batchdir + '/' + file, 'fasta')] + with open(batchdir + '/' + file, 'w') as o: + for rec in recs: + o.write('>' + rec.description + '\n' + str(rec.seq).replace('*', 'X').replace('U', 'X') + '\n\n') + + n_recs = len(recs) + + for i in range(5): + + if n_recs < 4: + print('Fewer than 4 sequences in gene family ' + file.split('.')[0]) + #os.system('rm -rf ' + tax_guidance_outdir) + break + + if n_recs < 200: + mafft_alg = 'genafpair' + else: + mafft_alg = 'auto' + + os.system('/beegfs/fast/katzlab/acl/Hook-DEV/Guidance_G1NS/guidance.v2.02/www/Guidance/guidance.pl --seqFile ' + batchdir + '/' + file + ' --msaProgram MAFFT --seqType aa --outDir /beegfs/fast/katzlab/acl/Hook-DEV/Guidance_G1NS/GuidanceOutput/' + file.split('.')[0] + ' --seqCutoff ' + str(seq_cutoff) + ' --colCutoff ' + str(col_cutoff) + " --outOrder as_input --bootstraps 10 --MSA_Param '\\--" + mafft_alg + " --maxiterate 1000 --thread 20 --bl 62 --anysymbol'") + + seqs_below = len([line for line in open('GuidanceOutput/' + file.split('.')[0] + '/MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names').readlines()[1:-1] if float(line.split()[-1]) < seq_cutoff]) + + if n_recs - seqs_below < 4: + print('Fewer than 4 sequences left after ' + str(i + 1) + ' iterations in gene family ' + file.split('.')[0]) + #os.system('rm -rf GuidanceOutput/' + file.split('.')[0]) + break + + if seqs_below == 0 or i == 4: + print('Guidance complete after ' + str(i + 1) + ' iterations for gene family ' + file.split('.')[0]) + break + + os.system('cp GuidanceOutput/' + file.split('.')[0] + '/Seqs.Orig.fas.FIXED.Without_low_SP_Seq.With_Names ' + batchdir + '/' + file) + os.system('rm -r GuidanceOutput/' + file.split('.')[0]) \ No newline at end of file