Add files via upload

This commit is contained in:
MCLeleu 2023-03-03 16:55:53 -05:00 committed by GitHub
parent d59c79b149
commit 33b4462bc2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -0,0 +1,50 @@
#Author: Auden Cote-L'Heureux
#Last update: 03/01/2023; added replacement of non-AA characters '*' and 'U' by 'X' after discussion with LAK and ML on 02/28/2023
import os, sys
from Bio import SeqIO
batchdir = sys.argv[1]
seq_cutoff = 0.3
col_cutoff = 0.4
for file in os.listdir(batchdir):
if(not os.path.isdir('/beegfs/fast/katzlab/acl/Hook-DEV/Guidance_G1NS/GuidanceOutput/' + file.split('.')[0])):
os.mkdir('/beegfs/fast/katzlab/acl/Hook-DEV/Guidance_G1NS/GuidanceOutput/' + file.split('.')[0])
recs = [r for r in SeqIO.parse(batchdir + '/' + file, 'fasta')]
with open(batchdir + '/' + file, 'w') as o:
for rec in recs:
o.write('>' + rec.description + '\n' + str(rec.seq).replace('*', 'X').replace('U', 'X') + '\n\n')
n_recs = len(recs)
for i in range(5):
if n_recs < 4:
print('Fewer than 4 sequences in gene family ' + file.split('.')[0])
#os.system('rm -rf ' + tax_guidance_outdir)
break
if n_recs < 200:
mafft_alg = 'genafpair'
else:
mafft_alg = 'auto'
os.system('/beegfs/fast/katzlab/acl/Hook-DEV/Guidance_G1NS/guidance.v2.02/www/Guidance/guidance.pl --seqFile ' + batchdir + '/' + file + ' --msaProgram MAFFT --seqType aa --outDir /beegfs/fast/katzlab/acl/Hook-DEV/Guidance_G1NS/GuidanceOutput/' + file.split('.')[0] + ' --seqCutoff ' + str(seq_cutoff) + ' --colCutoff ' + str(col_cutoff) + " --outOrder as_input --bootstraps 10 --MSA_Param '\\--" + mafft_alg + " --maxiterate 1000 --thread 20 --bl 62 --anysymbol'")
seqs_below = len([line for line in open('GuidanceOutput/' + file.split('.')[0] + '/MSA.MAFFT.Guidance2_res_pair_seq.scr_with_Names').readlines()[1:-1] if float(line.split()[-1]) < seq_cutoff])
if n_recs - seqs_below < 4:
print('Fewer than 4 sequences left after ' + str(i + 1) + ' iterations in gene family ' + file.split('.')[0])
#os.system('rm -rf GuidanceOutput/' + file.split('.')[0])
break
if seqs_below == 0 or i == 4:
print('Guidance complete after ' + str(i + 1) + ' iterations for gene family ' + file.split('.')[0])
break
os.system('cp GuidanceOutput/' + file.split('.')[0] + '/Seqs.Orig.fas.FIXED.Without_low_SP_Seq.With_Names ' + batchdir + '/' + file)
os.system('rm -r GuidanceOutput/' + file.split('.')[0])