mirror of
http://43.156.76.180:8026/YuuMJ/EukPhylo.git
synced 2025-12-27 05:50:24 +08:00
Delete PTL1/Transcriptomes/Scripts directory
This commit is contained in:
parent
540af11c9a
commit
f4b7839ea9
@ -1,267 +0,0 @@
|
||||
#!/usr/bin/env python3.6
|
||||
|
||||
##__Updated__: 01_04_2023
|
||||
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
|
||||
##__Usage__: python 1_ContigFiltStats.py
|
||||
##__Options__: python 1_ContigFiltStats.py --help
|
||||
|
||||
##########################################################################################
|
||||
## This script is intended to remove small transcripts or small contigs below a given ##
|
||||
## minimum size from a transcriptome assembly. ##
|
||||
## ##
|
||||
## Prior to running this script, ensure the following: ##
|
||||
## 1. You have assembled your transcriptome and COPIED the 'assembly' file ##
|
||||
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ##
|
||||
## ##
|
||||
## COMMAND Example Below ##
|
||||
## ##
|
||||
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
|
||||
## ##
|
||||
## Next Script(s) to Run: ##
|
||||
## AutoBactVsEuk.py (removes SSU then Bact) or 2a_removeSSU.py then 2b_removeBact.py ##
|
||||
## ##
|
||||
##########################################################################################
|
||||
|
||||
|
||||
import argparse, os, sys
|
||||
from argparse import RawTextHelpFormatter,SUPPRESS
|
||||
from Bio import SeqIO
|
||||
from Bio.SeqUtils import GC
|
||||
|
||||
|
||||
#----------------------------- Colors For Print Statements ------------------------------#
|
||||
|
||||
class color:
|
||||
PURPLE = '\033[95m'
|
||||
CYAN = '\033[96m'
|
||||
DARKCYAN = '\033[36m'
|
||||
ORANGE = '\033[38;5;214m'
|
||||
BLUE = '\033[94m'
|
||||
GREEN = '\033[92m'
|
||||
YELLOW = '\033[93m'
|
||||
RED = '\033[91m'
|
||||
BOLD = '\033[1m'
|
||||
UNDERLINE = '\033[4m'
|
||||
END = '\033[0m'
|
||||
|
||||
|
||||
#------------------------------- Main Functions of Script --------------------------------#
|
||||
|
||||
###########################################################################################
|
||||
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
|
||||
###########################################################################################
|
||||
|
||||
def check_args():
|
||||
|
||||
parser = argparse.ArgumentParser(description=
|
||||
color.BOLD+'\nThis script will remove Contigs (and provide a summary of statistics)'\
|
||||
+'\nfrom your Assembly that are shorter than a given length.'+color.ORANGE+\
|
||||
'\n\nA good minimum length to start with is 200bp.'+color.END+color.BOLD+\
|
||||
'\n\nThe minimum length value should be adjusted for your data sets.\n'+color.END+usage_msg(),
|
||||
usage=SUPPRESS,formatter_class=RawTextHelpFormatter)
|
||||
|
||||
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
|
||||
|
||||
required_arg_group.add_argument('--input_file','-in', action='store',
|
||||
help=color.BOLD+color.GREEN+" Fasta file of Protein/Nucleotide sequences\n"+color.END)
|
||||
|
||||
required_arg_group.add_argument('--output_file','-out',
|
||||
help=color.BOLD+color.GREEN+" Desired Output Name\n\n"+color.END)
|
||||
|
||||
required_arg_group.add_argument('--minLen','-len', default=200, type=int,
|
||||
help=color.BOLD+color.GREEN+" Minimum number of base pairs for contigs\n (default = 200)"+color.END)
|
||||
|
||||
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
|
||||
|
||||
optional_arg_group.add_argument('--spades','-spades', action='store_true',
|
||||
help=color.BOLD+color.GREEN+'rnaSPAdes transcriptome assembly\n'+color.END)
|
||||
|
||||
optional_arg_group.add_argument('--genbank','-gb', action='store_true',
|
||||
help=color.BOLD+color.GREEN+'Assembly from Genbank\n (Will include Accession Number in'\
|
||||
' contig name)\n'+color.END)
|
||||
|
||||
optional_arg_group.add_argument('-author', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Print author contact information\n'+color.END)
|
||||
|
||||
|
||||
if len(sys.argv[1:]) == 0:
|
||||
print (parser.description)
|
||||
print ('\n')
|
||||
sys.exit()
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
quit_eval = return_more_info(args)
|
||||
if quit_eval > 0:
|
||||
sys.exit()
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
return args
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###------------------------------- Script Usage Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def usage_msg():
|
||||
return color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 1_ContigFiltStats.py'\
|
||||
' --input_file ../Op_me_Xxma_rnaSPAdes_scaffolds_15_05.fasta --output_file '\
|
||||
'Op_me_Xxma --minLen 200 --spades'+color.END
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
|
||||
##########################################################################################
|
||||
|
||||
def return_more_info(args):
|
||||
|
||||
valid_arg = 0
|
||||
|
||||
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
|
||||
' maurerax@gmail.com\n\n'+color.END)
|
||||
|
||||
if args.author == True:
|
||||
print (author)
|
||||
valid_arg += 1
|
||||
|
||||
if args.input_file != None:
|
||||
if os.path.isfile(args.input_file) != False:
|
||||
if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
else:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
if args.output_file == None:
|
||||
valid_arg += 1
|
||||
|
||||
return valid_arg
|
||||
|
||||
###########################################################################################
|
||||
###--------------------------- Does the Inital Folder Prep -----------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def prep_folders(args):
|
||||
Home_folder_name = args.output_file
|
||||
|
||||
if os.path.isdir(args.output_file) != True:
|
||||
os.system('mkdir ' + args.output_file)
|
||||
|
||||
if os.path.isdir(args.output_file + '/OriginalFasta/') != True:
|
||||
os.system('mkdir ' + args.output_file +'/OriginalFasta/')
|
||||
|
||||
if os.path.isdir(args.output_file + '/SizeFiltered/') != True:
|
||||
os.system('mkdir ' + args.output_file +'/SizeFiltered/')
|
||||
|
||||
if os.path.isdir('/'.join(args.output_file.split('/')[:-1]) + '/XlaneBleeding/') != True:
|
||||
os.system('mkdir ' + '/'.join(args.output_file.split('/')[:-1]) + '/XlaneBleeding/')
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###---------- Renames the Contigs, Writes them out, and Calculates Basic Info ----------###
|
||||
###########################################################################################
|
||||
|
||||
def rename_Transcriptome(args):
|
||||
|
||||
home_folder = args.output_file + '/SizeFiltered/'
|
||||
|
||||
print (color.BOLD+'\n\nPrepping '+color.GREEN+args.input_file.split('/')[-1]+color.END)
|
||||
|
||||
inFasta = [i for i in SeqIO.parse(args.input_file,'fasta') if len(i.seq) >= args.minLen]
|
||||
inFasta.sort(key=lambda seq_rec: -len(seq_rec.seq))
|
||||
|
||||
renamed_seqs = []
|
||||
seq_code_dict = {}
|
||||
|
||||
count = 1
|
||||
|
||||
seq_name_start = 'Contig'
|
||||
|
||||
if args.genbank == True:
|
||||
for seq_rec in inFasta:
|
||||
seq_code_dict.setdefault(seq_rec.id,[]).append(seq_rec.id.split('_')[-1].split('.')[0]+'_Contig_'+str(count)+'_Len'+str(len(seq_rec.seq)))
|
||||
seq_code_dict.setdefault(seq_rec.id,[]).append(str(seq_rec.seq).upper())
|
||||
renamed_seqs.append('>'+seq_rec.id.split('_')[-1].split('.')[0]+'_Contig_'+str(count)+'_Len'+str(len(seq_rec.seq))+'\n'+str(seq_rec.seq).upper())
|
||||
count += 1
|
||||
elif args.spades == True:
|
||||
for seq_rec in inFasta:
|
||||
seq_code_dict.setdefault(seq_rec.description,[]).append(seq_name_start+'_'+str(count)+'_Len'+str(len(seq_rec.seq))+'_Cov'+str(int(round(float(seq_rec.description.split('_')[-3])))))
|
||||
seq_code_dict.setdefault(seq_rec.description,[]).append(seq_rec.description.split('_')[5])
|
||||
seq_code_dict.setdefault(seq_rec.description,[]).append(str(seq_rec.seq).upper())
|
||||
renamed_seqs.append('>'+seq_name_start+'_'+str(count)+'_Len'+str(len(seq_rec.seq))+'_Cov'+str(int(round(float(seq_rec.description.split('_')[-3]))))+'\n'+str(seq_rec.seq).upper())
|
||||
count += 1
|
||||
else:
|
||||
for seq_rec in inFasta:
|
||||
seq_code_dict.setdefault(seq_rec.description,[]).append(seq_name_start+'_'+str(count)+'_Len'+str(len(seq_rec.seq)))
|
||||
seq_code_dict.setdefault(seq_rec.description,[]).append(str(seq_rec.seq).upper())
|
||||
renamed_seqs.append('>'+seq_name_start+'_'+str(count)+'_Len'+str(len(seq_rec.seq))+'\n'+str(seq_rec.seq).upper())
|
||||
count += 1
|
||||
|
||||
print (color.BOLD+'\n\nThere are '+color.RED+str(len(renamed_seqs))+' contigs > '+str(args.minLen)\
|
||||
+color.END+color.BOLD+' in '+color.DARKCYAN+args.input_file.split('/')[-1]+color.END)
|
||||
|
||||
with open(home_folder + args.output_file.split('/')[-1] + '.' + str(args.minLen)+'bp.fasta','w+') as w:
|
||||
for seq in renamed_seqs:
|
||||
w.write(seq+'\n')
|
||||
|
||||
if args.spades != True:
|
||||
with open(home_folder + args.output_file.split('/')[-1] + '.' + str(args.minLen) + 'bp.SeqCodes.tsv','w+') as w:
|
||||
w.write('Original Name\tNew Name\tSeq Length\t Seq GC\n')
|
||||
for k, v in seq_code_dict.items():
|
||||
w.write(k+'\t'+v[0]+'\t'+str(len(v[1]))+'\t'+str(GC(v[1]))+'\n')
|
||||
else:
|
||||
with open(home_folder + args.output_file.split('/')[-1] + '.' + str(args.minLen) + 'bp.SeqCodes.tsv','w+') as w:
|
||||
w.write('Original Name\tNew Name\tSeq Length\tSeq GC\tSeq Coverage\n')
|
||||
for k, v in seq_code_dict.items():
|
||||
w.write(k+'\t'+v[0]+'\t'+str(len(v[2]))+'\t'+str(GC(v[2]))+'\t'+str(v[1])+'\n')
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------------------------- Cleans Up the PostAssembly Folder ------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def clean_up(args):
|
||||
|
||||
os.system('cp ' + args.input_file + ' ' + args.output_file + '/OriginalFasta/' + args.input_file.split('/')[-1].replace('.fasta', '.Original.fasta'))
|
||||
|
||||
os.system('cp ' + args.output_file + '/SizeFiltered/' + args.output_file.split('/')[-1] + '.' + str(args.minLen)+'bp.fasta ' + '/'.join(args.output_file.split('/')[:-1]) + '/XlaneBleeding/')
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------------------------------- Next Script Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def next_script(args):
|
||||
|
||||
print (color.BOLD+'\n\nLook for '+color.DARKCYAN+args.output_file+'.'+str(args.minLen)+\
|
||||
'bp.fasta'+color.END+color.BOLD+'\n\n')
|
||||
|
||||
print ('Next Script is: '+color.GREEN+'2_Auto_rRNA_BvE.py'+color.END\
|
||||
+color.BOLD+'\n(Alternatively'+color.GREEN+' 2a_remove_rRNA.py followed by 2b_remove_Bact.py'\
|
||||
+color.END+color.BOLD+')\n\n'+ color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
|
||||
##########################################################################################
|
||||
|
||||
def main():
|
||||
|
||||
args = check_args()
|
||||
|
||||
prep_folders(args)
|
||||
|
||||
temp = rename_Transcriptome(args)
|
||||
|
||||
clean_up(args)
|
||||
|
||||
next_script(args)
|
||||
|
||||
main()
|
||||
|
||||
@ -1,145 +0,0 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
__author__ = 'Jean-David Grattepanche'
|
||||
__version__ = 'ACL fixed sequence naming issue Feb 23, 2022'
|
||||
__email__ = 'jeandavid.grattepanche@gmail.com'
|
||||
|
||||
|
||||
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import string
|
||||
import os.path
|
||||
from Bio import SeqIO
|
||||
from sys import argv
|
||||
listtaxa=[]
|
||||
toosim = 0.99
|
||||
seqcoverage = 0.7
|
||||
|
||||
def merge_files(folder, minlen):
|
||||
mergefile = open('/'.join(folder.split('/')[:-1]) + '/forclustering.fasta','w+')
|
||||
print("MERGE following files")
|
||||
for taxafile in os.listdir(folder):
|
||||
if taxafile[0] != ".":
|
||||
listtaxa.append(taxafile.split('.' + str(minlen) + 'bp')[0])
|
||||
|
||||
for line2 in SeqIO.parse(folder+'/'+taxafile, 'fasta'):
|
||||
if int(len(str(line2.seq))) >= int(minlen):
|
||||
mergefile.write('>'+taxafile.split('.' + str(minlen) + 'bp')[0] + '_' + line2.description + '\n' + str(line2.seq) + '\n')
|
||||
else:
|
||||
print(line2, " is too short")
|
||||
mergefile.close()
|
||||
|
||||
sort_cluster(folder, listtaxa, minlen)
|
||||
|
||||
|
||||
def sort_cluster(folder, listtaxa, minlen):
|
||||
if not os.path.exists('/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/'):
|
||||
os.makedirs('/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/')
|
||||
|
||||
fastalist = []; fastadict= {}
|
||||
|
||||
print('CREATE a dictionnary of sequences')
|
||||
for record in SeqIO.parse(open('/'.join(folder.split('/')[:-1]) + '/forclustering.fasta','r'),'fasta'):
|
||||
IDL = record.description, int(record.description.split('_Cov')[1].replace('\n',''))
|
||||
fastalist.append(IDL)
|
||||
fastadict[record.description] = record.seq
|
||||
|
||||
print("CLUSTER sequences that overlap at least 70%")
|
||||
os.system('vsearch --cluster_fast ' + '/'.join(folder.split('/')[:-1]) + '/forclustering.fasta --strand both --query_cov '+str(seqcoverage)+' --id '+str(toosim) +' --uc ' + '/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/results_forclustering.uc --threads 60' )
|
||||
|
||||
input2 = open('/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/results_forclustering.uc','r')
|
||||
out2 = open('/'.join(folder.split('/')[:-1]) + '/fastatokeep.fas','w+')
|
||||
out3 = open('/'.join(folder.split('/')[:-1]) + '/fastatoremoved.fas','w+')
|
||||
out4 = open('/'.join(folder.split('/')[:-1]) + '/fastatoremoved.uc','w+')
|
||||
print("CREATE a dictionary with clustering results")
|
||||
clustdict= {}; clustlist = []; allseq = []; clustline = {}; list= []; i=0; j=0
|
||||
for row2 in input2:
|
||||
if row2.split('\t')[0] == 'C' and int(row2.split('\t')[2]) < 2: # keep all unique sequences
|
||||
out2.write('>'+row2.split('\t')[8] + '\n' + str(fastadict[row2.split('\t')[8]])+ '\n')
|
||||
if row2.split('\t')[0] == 'C' and int(row2.split('\t')[2]) > 1: # create another dictionary
|
||||
# print("create dico: ", row2.split('\t')[8])
|
||||
clustdict.setdefault(row2.split('\t')[8], [row2.split('\t')[8]])
|
||||
clustlist.append(row2.split('\t')[8])
|
||||
|
||||
for row3 in open('/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/results_forclustering.uc','r'):
|
||||
if row3.split('\t')[0] == 'H':
|
||||
# print("add dico: ", row3.split('\t')[9], row3.split('\t')[8])
|
||||
clustdict[row3.split('\t')[9].replace('\n','')].append(row3.split('\t')[8].replace('\n',''))
|
||||
clustline[row3.split('\t')[8].replace('\n','')] = row3.replace('\n','')
|
||||
clustline[row3.split('\t')[9].replace('\n','')] = row3.replace('\n','')
|
||||
|
||||
|
||||
print("PARSE the clusters: keep seed sequences (highest coverage) for each cluster")
|
||||
for clust in clustlist:
|
||||
list = sorted(clustdict[clust], reverse = True, key=lambda x: int(x.split('_Cov')[1]))
|
||||
master = list[0]
|
||||
Covmaster = int(list[0].split('_Cov')[1])
|
||||
master8dig = ('_').join(list[0].split('_')[0:3])[:-2]
|
||||
for seq in list:
|
||||
clustered = seq.replace('\n','')
|
||||
Covclustered = int(clustered.split('_Cov')[1])
|
||||
clustered8dig = ('_').join(clustered.split('_')[0:3])[:-2]
|
||||
# print(master8dig, Covmaster, '//', clustered8dig, Covclustered)
|
||||
if float(Covmaster/Covclustered) < 10:
|
||||
out2.write('>'+clustered + '\n' + str(fastadict[clustered])+ '\n')
|
||||
i +=1
|
||||
elif master8dig == clustered8dig:
|
||||
out2.write('>'+clustered + '\n' + str(fastadict[clustered])+ '\n')
|
||||
i +=1
|
||||
elif Covclustered >= 50:
|
||||
out2.write('>'+clustered + '\n' + str(fastadict[clustered])+ '\n')
|
||||
i +=1
|
||||
else:
|
||||
j +=1
|
||||
out4 = open('/'.join(folder.split('/')[:-1]) + '/fastatoremoved.uc','a')
|
||||
out3.write('>'+clustered + '\n' + str(fastadict[clustered])+ '\n')
|
||||
print(clustline[clustered],'\t' , master )
|
||||
out4.write(clustline[clustered]+ '\t' + master + '\n')
|
||||
out4.close()
|
||||
print('there are ', str(i),' sequences kept and ',str(j),' sequences removed')
|
||||
|
||||
out2.close()
|
||||
out3.close()
|
||||
|
||||
splittaxa(folder, listtaxa, minlen)
|
||||
|
||||
def splittaxa(folder, listtaxa, minlen):
|
||||
for taxa in listtaxa:
|
||||
tax_sf_path = '/'.join(folder.split('/')[:-1]) + '/' + taxa + '/SizeFiltered/'
|
||||
os.system('mv ' + tax_sf_path + taxa + '.' + str(minlen) + 'bp.fasta' + ' ' + tax_sf_path + taxa + '.' + str(minlen) + 'bp.preXPlate.fasta')
|
||||
|
||||
with open(tax_sf_path + taxa + '.' + str(minlen) + 'bp.fasta','w') as o:
|
||||
for kept in SeqIO.parse('/'.join(folder.split('/')[:-1]) + '/fastatokeep.fas','fasta'):
|
||||
if taxa in kept.description:
|
||||
o.write('>' + kept.description.replace(taxa + '_', '') + '\n' + str(kept.seq) + '\n')
|
||||
|
||||
os.system('mv ' + '/'.join(folder.split('/')[:-1]) + '/fastatokeep.fas ' + '/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/')
|
||||
os.system('mv ' + '/'.join(folder.split('/')[:-1]) + '/fastatoremoved.fas ' + '/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/')
|
||||
os.system('mv ' + '/'.join(folder.split('/')[:-1]) + '/fastatoremoved.uc ' + '/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/')
|
||||
os.system('mv ' + '/'.join(folder.split('/')[:-1]) + '/forclustering.fasta ' + '/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/')
|
||||
|
||||
def main():
|
||||
|
||||
script, folder, minlen = argv
|
||||
merge_files(folder, minlen)
|
||||
|
||||
main()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@ -1,285 +0,0 @@
|
||||
#!/usr/bin/env python3.5
|
||||
|
||||
##__Updated__: 18_08_2017
|
||||
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
|
||||
##__Usage__: python 2a_remove_rDNA.py --help
|
||||
|
||||
##########################################################################################
|
||||
## This script is intended to identify and isolate SSU/LSU sequences ##
|
||||
## Prior to running this script, ensure the following: ##
|
||||
## ##
|
||||
## 1. You have assembled your transcriptome and COPIED the 'assembly' file ##
|
||||
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ##
|
||||
## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py ##
|
||||
## 3. Have the Databases set up correctly (e.g. with BLAST or Diamond) and in their ##
|
||||
## respective folders! See the manual if you need help ##
|
||||
## ##
|
||||
## COMMAND Example Below ##
|
||||
## ##
|
||||
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
|
||||
## ##
|
||||
## Next Script(s) to Run: ##
|
||||
## 2b_removeBact.py ##
|
||||
## ##
|
||||
##########################################################################################
|
||||
|
||||
|
||||
import argparse, os, sys
|
||||
from argparse import RawTextHelpFormatter,SUPPRESS
|
||||
from Bio import SeqIO
|
||||
|
||||
|
||||
#------------------------------ Colors For Print Statements ------------------------------#
|
||||
|
||||
class color:
|
||||
PURPLE = '\033[95m'
|
||||
CYAN = '\033[96m'
|
||||
DARKCYAN = '\033[36m'
|
||||
ORANGE = '\033[38;5;214m'
|
||||
BLUE = '\033[94m'
|
||||
GREEN = '\033[92m'
|
||||
YELLOW = '\033[93m'
|
||||
RED = '\033[91m'
|
||||
BOLD = '\033[1m'
|
||||
UNDERLINE = '\033[4m'
|
||||
END = '\033[0m'
|
||||
|
||||
|
||||
#------------------------------- Main Functions of Script --------------------------------#
|
||||
|
||||
###########################################################################################
|
||||
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
|
||||
###########################################################################################
|
||||
|
||||
def check_args():
|
||||
|
||||
parser = argparse.ArgumentParser(description=
|
||||
color.BOLD+'\nThis script will remove '+color.RED+'rDNA contigs (both SSU and LSU)'+color.END\
|
||||
+color.BOLD+'\nfrom your Assembly using a set of '+color.RED+'SSU/LSU rDNAs '+color.END\
|
||||
+color.BOLD+'from diverse\n'+color.ORANGE+'Eukaryotes, Bacteria and Archaea'+color.END\
|
||||
+color.BOLD+'.'+color.END+usage_msg(), usage=SUPPRESS,formatter_class=RawTextHelpFormatter)
|
||||
|
||||
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
|
||||
|
||||
required_arg_group.add_argument('--input_file','-in', action='store',
|
||||
help=color.BOLD+color.GREEN+"Fasta file of Nucleotide sequences"+color.END)
|
||||
required_arg_group.add_argument('--databases','-d', action='store',
|
||||
help=color.BOLD+color.GREEN+"Path to databases"+color.END)
|
||||
|
||||
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
|
||||
optional_arg_group.add_argument('--threads','-t', default='2',
|
||||
help=color.BOLD+color.GREEN+' Number of threads to use for BLAST\n (default = 2)\n'+color.END)
|
||||
optional_arg_group.add_argument('-author', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Print author contact information\n'+color.END)
|
||||
|
||||
if len(sys.argv[1:]) == 0:
|
||||
print (parser.description)
|
||||
print ('\n')
|
||||
sys.exit()
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
quit_eval = return_more_info(args)
|
||||
if quit_eval > 0:
|
||||
sys.exit()
|
||||
|
||||
return args
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###------------------------------- Script Usage Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def usage_msg():
|
||||
return color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 2a_remove_rRNA.py --input_file ../Op_me_Xxma_rna.200bp.fasta'+color.END
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
|
||||
##########################################################################################
|
||||
|
||||
def return_more_info(args):
|
||||
|
||||
valid_arg = 0
|
||||
|
||||
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
|
||||
' maurerax@gmail.com\n\n'+color.END)
|
||||
|
||||
if args.author == True:
|
||||
print (author)
|
||||
valid_arg += 1
|
||||
|
||||
if args.input_file != None:
|
||||
if os.path.isfile(args.input_file) != False:
|
||||
if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
elif args.input_file.endswith('bp.fasta') != True:
|
||||
print (color.BOLD + '\n\nCheck that you are giving an appropriately Named/Processed'\
|
||||
'Fasta file(s) to this script\n\nNOTE that this script CURRENTLY expects your'\
|
||||
' Fasta files to contain '+color.RED+ '"rna"'+color.END+color.BOLD+' in \nthe Fasta File'\
|
||||
' Name and must end with ' + color.RED + '"bp.fasta"\n\n' + color.END)
|
||||
valid_arg += 1
|
||||
else:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
if os.path.isdir(args.databases + '/db_BvsE') != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
|
||||
+color.ORANGE+'db_BvsE Folder!\n\n'+color.END+color.BOLD+'Ensure that this folder '\
|
||||
'can be found in the main '+color.ORANGE+'Databases Folder'+color.END+color.BOLD\
|
||||
+'\n\nThen try once again.')
|
||||
valid_arg += 1
|
||||
elif os.path.isfile(args.databases + '/db_BvsE/SSULSUdb.nhr') != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
|
||||
'BLAST+ formatted '+color.ORANGE+'SSU-LSU databases!\n\n'+color.END+color.BOLD+\
|
||||
'Ensure that they can be found in the '+color.ORANGE+'db_BvsE folder'+color.END+\
|
||||
color.BOLD+',\nwhich can be found in the main '+color.ORANGE+'Databases Folder'+\
|
||||
color.END+color.BOLD+'\n\nThen try once again.')
|
||||
valid_arg += 1
|
||||
|
||||
return valid_arg
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------------- Does the Inital Folder Prep -----------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def prep_folders(args):
|
||||
code = args.input_file.split('/')[-1][:10]
|
||||
rRNA_folder = args.input_file.split('SizeFiltered')[0] + '/rRNA_Removal/'
|
||||
|
||||
if os.path.isdir(rRNA_folder) != True:
|
||||
os.system('mkdir '+rRNA_folder)
|
||||
|
||||
return code, rRNA_folder
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###---------------------- Uses BLAST to identify SSU/LSU Sequences ---------------------###
|
||||
###########################################################################################
|
||||
|
||||
def remove_rDNA(args, rRNA_folder):
|
||||
|
||||
blast_output = rRNA_folder + args.input_file.split('/')[-1].split('.200bp.fasta')[0]+'_allSSULSUresults.tsv'
|
||||
|
||||
BLASTN_cmd = 'blastn -query ' + args.input_file + ' -evalue 1e-10 -max_target_seqs 1 -outfmt'\
|
||||
' 6 -db ' + args.databases + '/db_BvsE/SSULSUdb -num_threads 2 -out ' + blast_output
|
||||
|
||||
print (color.BOLD+'\n\nBLASTing '+color.DARKCYAN+args.input_file.split('/')[-1]+color.END\
|
||||
+color.BOLD+ ' against the rDNA database\n\n' + color.END)
|
||||
|
||||
os.system(BLASTN_cmd)
|
||||
|
||||
rDNA_Hits = list(set([i.split('\t')[0] for i in open(blast_output).readlines()]))
|
||||
|
||||
print (color.BOLD+'Binning Sequences from '+color.DARKCYAN+args.input_file.split('/')[-1]\
|
||||
+color.END+color.BOLD+'\nas rDNA OR Potentially Protein-Coding\n\n'+color.END)
|
||||
|
||||
no_SSULSU = 0
|
||||
with_SSULSU = 0
|
||||
|
||||
inFasta = [seq_rec for seq_rec in SeqIO.parse(args.input_file,'fasta')]
|
||||
|
||||
with open(rRNA_folder + args.input_file.split('/')[-1].split('.200bp.fasta')[0]+'_rRNAseqs.fasta','w+') as HasSSU:
|
||||
for seq_rec in inFasta:
|
||||
if seq_rec.description in rDNA_Hits:
|
||||
HasSSU.write('>'+seq_rec.description+'\n'+str(seq_rec.seq)+'\n')
|
||||
with_SSULSU += 1
|
||||
|
||||
with open(rRNA_folder + args.input_file.split('/')[-1].split('.200bp.fasta')[0] + '_NorRNAseqs.fasta','w+') as NoSSU:
|
||||
for seq_rec in inFasta:
|
||||
if seq_rec.description not in rDNA_Hits:
|
||||
NoSSU.write('>'+seq_rec.description+'\n'+str(seq_rec.seq)+'\n')
|
||||
no_SSULSU += 1
|
||||
|
||||
return str(with_SSULSU), str(no_SSULSU)
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------------- Updates Log of SSU/LSU Removal --------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def update_log(args, with_SSU, no_SSU):
|
||||
|
||||
if os.path.isdir('../PostAssembly_Logs/') != True:
|
||||
os.system('mkdir ../PostAssembly_Logs/')
|
||||
|
||||
print (color.BOLD+'There are '+color.RED+with_SSU+' rRNA contigs'+color.END+color.BOLD\
|
||||
+' and '+color.PURPLE+no_SSU+' Putative Protein-coding contigs'+color.END+color.BOLD\
|
||||
+'\nin '+color.DARKCYAN+args.input_file.split('/')[1]+'\n' + color.END)
|
||||
|
||||
with open('../PostAssembly_Logs/'+args.input_file.split('/')[1].split('.fas')[0]+'.Log.txt','a') as LogFile:
|
||||
LogFile.write('rDNA Contigs\t'+with_SSU+'\tn/a\tn/a\n')
|
||||
LogFile.write('Non-rDNA Contigs\t'+no_SSU+'\tn/a\tn/a\n')
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------------------------------- Next Script Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def next_script(args):
|
||||
|
||||
print (color.BOLD+'\nLook for '+color.ORANGE+args.input_file.split('/')[1].split('_rna')[0]\
|
||||
+ '_NorRNAseqs.fasta'+color.END+color.BOLD+'\nin the '+args.input_file.split('/')[1].split('_rna')[0]\
|
||||
+' Folder\n\n' + color.END)
|
||||
print (color.BOLD + 'Next Script is: ' + color.GREEN + '2b_remove_Bact.py\n\n'+ color.END)
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------------------------- Cleans Up the PostAssembly Folder ------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def clean_up(args):
|
||||
|
||||
home_folder = args.input_file.split('SizeFiltered')[0]
|
||||
|
||||
os.system('cp ' + home_folder + 'rRNA_Removal/*NorRNA*.fasta ' + home_folder)
|
||||
|
||||
##########################################################################################
|
||||
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
|
||||
##########################################################################################
|
||||
|
||||
def main():
|
||||
|
||||
args = check_args()
|
||||
|
||||
code, rRNA_folder = prep_folders(args)
|
||||
|
||||
with_SSULSU, no_SSULSU = remove_rDNA(args, rRNA_folder)
|
||||
|
||||
#update_log(args, with_SSULSU, no_SSULSU)
|
||||
|
||||
clean_up(args)
|
||||
|
||||
next_script(args)
|
||||
|
||||
main()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@ -1,410 +0,0 @@
|
||||
#!/usr/bin/env python3.5
|
||||
|
||||
##__Updated__: 18_08_2017
|
||||
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
|
||||
##__Usage__: python 2b_remove_Bact.py --help
|
||||
|
||||
##########################################################################################
|
||||
## This script is intended to identify and isolate SSU/LSU sequences ##
|
||||
## Prior to running this script, ensure the following: ##
|
||||
## ##
|
||||
## 1. You have assembled your transcriptome and COPIED the 'assembly' file ##
|
||||
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ##
|
||||
## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py ##
|
||||
## 3. Have the Databases set up correctly (e.g. with BLAST or Diamond) and in their ##
|
||||
## respective folders! See the manual if you need help ##
|
||||
## 4. Run removeSSU.py on your Fasta file ##
|
||||
## ##
|
||||
## COMMAND Example Below ##
|
||||
## ##
|
||||
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
|
||||
## ##
|
||||
## Next Script(s) to Run: ##
|
||||
## 3_CountOGsDiamond.py ##
|
||||
## ##
|
||||
##########################################################################################
|
||||
|
||||
|
||||
import argparse, os, sys
|
||||
from argparse import RawTextHelpFormatter,SUPPRESS
|
||||
from distutils import spawn
|
||||
from Bio import SeqIO
|
||||
|
||||
|
||||
#------------------------------ Colors For Print Statements ------------------------------#
|
||||
class color:
|
||||
PURPLE = '\033[95m'
|
||||
CYAN = '\033[96m'
|
||||
DARKCYAN = '\033[36m'
|
||||
ORANGE = '\033[38;5;214m'
|
||||
BLUE = '\033[94m'
|
||||
GREEN = '\033[92m'
|
||||
YELLOW = '\033[93m'
|
||||
RED = '\033[91m'
|
||||
BOLD = '\033[1m'
|
||||
UNDERLINE = '\033[4m'
|
||||
END = '\033[0m'
|
||||
|
||||
|
||||
#------------------------------- Main Functions of Script --------------------------------#
|
||||
|
||||
###########################################################################################
|
||||
###---------------------------- UPDATE DIAMOND PATH BELOW! -----------------------------###
|
||||
###########################################################################################
|
||||
## IF Diamond is IN YOUR PATH then no updating is needed...
|
||||
|
||||
def check_diamond_path():
|
||||
|
||||
diamond_path = ''
|
||||
|
||||
if diamond_path == '':
|
||||
diamond_path = spawn.find_executable("diamond")
|
||||
#diamond_path = '/path/to/diamond'
|
||||
else:
|
||||
pass
|
||||
|
||||
if diamond_path == None:
|
||||
print (color.BOLD + '\n\nPlease open this script and check that you have included'\
|
||||
+' the PATH to the'+color.BLUE+' "Diamond" '+color.END+color.BOLD+'executable.\n\n'+color.END)
|
||||
print (color.BOLD+color.BLUE+'LOOK FOR:\n\n'+color.RED\
|
||||
+'#------------------------------ UPDATE DIAMOND PATH BELOW! -------------------------------#'\
|
||||
+color.BLUE+'\n\nThis is somewhere around lines 50 - 80...\n\n'+color.END)
|
||||
|
||||
sys.exit()
|
||||
else:
|
||||
pass
|
||||
|
||||
return diamond_path
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
|
||||
###########################################################################################
|
||||
|
||||
def check_args():
|
||||
|
||||
parser = argparse.ArgumentParser(description=
|
||||
color.BOLD + '\nThis script will categorize Contigs as'+color.ORANGE+' STRONGLY '+color.END\
|
||||
+color.BOLD+color.RED+'Eukaryotic \nOR Prokaryotic'+color.END+color.BOLD+' using a set of Proteins'\
|
||||
' from diverse\n'+color.ORANGE+'Eukaryotes, Bacteria and Archaea'+color.END\
|
||||
+color.BOLD+'.'+color.END+usage_msg(), usage=SUPPRESS,formatter_class=RawTextHelpFormatter)
|
||||
|
||||
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
|
||||
|
||||
required_arg_group.add_argument('--input_file','-in', action='store',
|
||||
help=color.BOLD+color.GREEN+'Fasta file of Nucleotide sequences (with rRNAs removed)'+color.END)
|
||||
required_arg_group.add_argument('--databases','-d', action='store',
|
||||
help=color.BOLD+color.GREEN+"Path to databases"+color.END)
|
||||
|
||||
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
|
||||
optional_arg_group.add_argument('-author', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Print author contact information\n'+color.END)
|
||||
|
||||
if len(sys.argv[1:]) == 0:
|
||||
print (parser.description)
|
||||
print ('\n')
|
||||
sys.exit()
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
quit_eval = return_more_info(args)
|
||||
if quit_eval > 0:
|
||||
sys.exit()
|
||||
|
||||
return args
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###------------------------------- Script Usage Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def usage_msg():
|
||||
return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 2b_remove_Bact.py --input_file'\
|
||||
' ../Op_me_Xxma/Op_me_Xxma_NorRNAseqs.fasta'+color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
|
||||
##########################################################################################
|
||||
|
||||
def return_more_info(args):
|
||||
|
||||
valid_arg = 0
|
||||
|
||||
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
|
||||
' maurerax@gmail.com\n\n'+color.END)
|
||||
|
||||
if args.author == True:
|
||||
print (author)
|
||||
valid_arg += 1
|
||||
|
||||
print(args.input_file)
|
||||
|
||||
if args.input_file != None:
|
||||
if os.path.isfile(args.input_file) != False:
|
||||
if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
elif args.input_file.endswith('NorRNAseqs.fasta') != True:
|
||||
print (color.BOLD+'\n\nInvalid Fasta File! Only Fasta Files that were processed'\
|
||||
' with '+color.GREEN+'2a_remove_rRNA.py '+color.END+color.BOLD+'are valid\n\n'\
|
||||
'However, to bypass that issue, Fasta Files MUST end with '+color.CYAN+\
|
||||
'"NorRNAseqs.fas"\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
else:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
if os.path.isdir(args.databases + '/db_BvsE') != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
|
||||
+color.ORANGE+'db_BvsE Folder!\n\n'+color.END+color.BOLD+'Ensure that this folder '\
|
||||
'can be found in the main '+color.ORANGE+'Databases Folder'+color.END+color.BOLD\
|
||||
+'\n\nThen try once again.')
|
||||
valid_arg += 1
|
||||
elif os.path.isfile(args.databases + '/db_BvsE/eukout.dmnd') != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
|
||||
'Diamond formatted '+color.ORANGE+'Eukaryotic Protein database!\n\n'+color.END+color.BOLD+\
|
||||
'Ensure that it can be found in the '+color.ORANGE+'db_BvsE folder'+color.END+\
|
||||
color.BOLD+',\nwhich can be found in the main '+color.ORANGE+'Databases Folder'+\
|
||||
color.END+color.BOLD+'\n\nThen try once again.'+color.END)
|
||||
elif os.path.isfile(args.databases + '/db_BvsE/micout.dmnd') != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
|
||||
'Diamond formatted '+color.ORANGE+'Bacterial/Archaeal Protein database!\n\n'+color.END+color.BOLD+\
|
||||
'Ensure that it can be found in the '+color.ORANGE+'db_BvsE folder'+color.END+\
|
||||
color.BOLD+',\nwhich can be found in the main '+color.ORANGE+'Databases Folder'+\
|
||||
color.END+color.BOLD+'\n\nThen try once again.'+color.END)
|
||||
|
||||
valid_arg += 1
|
||||
|
||||
return valid_arg
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------------- Does the Inital Folder Prep -----------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def prep_folders(args):
|
||||
|
||||
BvE_folder = '/'.join(args.input_file.split('/')[:-1]) + '/BvE/'
|
||||
|
||||
if os.path.isdir(BvE_folder) != True:
|
||||
os.system('mkdir '+BvE_folder)
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###---------------- Runs Diamond on Bact and Euk small RefSeq Databases ----------------###
|
||||
###########################################################################################
|
||||
|
||||
def ublast_BvE(args, diamond_path):
|
||||
|
||||
BvE_folder = '/'.join(args.input_file.split('/')[:-1]) + '/BvE/'
|
||||
mic_output = args.input_file.split('/')[-1]+'micresults.'
|
||||
euk_output = args.input_file.split('/')[-1]+'eukresults.'
|
||||
|
||||
print(color.BOLD+'\n\n"BLAST"-ing against PROK database using DIAMOND: ' + color.DARKCYAN + 'micout.dmnd' + color.END + '\n\n')
|
||||
|
||||
Prok_diamond_cmd = diamond_path + ' blastx -q ' + args.input_file + ' --max-target-seqs 1 -d ' + args.databases + '/db_BvsE/micout.dmnd --evalue 1e-5 --threads 60 --outfmt 6 -o ' + BvE_folder + 'allmicresults.tsv'
|
||||
|
||||
os.system(Prok_diamond_cmd)
|
||||
|
||||
print(color.BOLD+'\n\n"BLAST"-ing against EUK database using DIAMOND: ' + color.DARKCYAN + 'eukout.dmnd' + color.END + '\n\n')
|
||||
|
||||
Euk_diamond_cmd = diamond_path + ' blastx -q ' + args.input_file + ' --max-target-seqs 1 -d ' + args.databases + '/db_BvsE/eukout.dmnd --evalue 1e-5 --threads 60 --outfmt 6 -o ' + BvE_folder + 'alleukresults.tsv'
|
||||
|
||||
os.system(Euk_diamond_cmd)
|
||||
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###---------------- Compares Bacterial and Euk Hits for Classification -----------------###
|
||||
###########################################################################################
|
||||
|
||||
def compare_hits(args):
|
||||
|
||||
BvE_folder = '/'.join(args.input_file.split('/')[:-1]) + '/BvE/'
|
||||
|
||||
EukDict = {}
|
||||
ProkDict = {}
|
||||
CompDict = {}
|
||||
|
||||
inFasta = [seq_rec for seq_rec in SeqIO.parse(args.input_file,'fasta')]
|
||||
|
||||
for seq_rec in inFasta:
|
||||
EukDict[seq_rec.description] = ''
|
||||
ProkDict[seq_rec.description] = ''
|
||||
CompDict[seq_rec.description] = []
|
||||
|
||||
inEukHits = [i for i in open(BvE_folder + 'alleukresults.tsv').readlines()]
|
||||
inEukHits.sort(key=lambda x: (float(x.split('\t')[-2]), -int(x.split('\t')[3])))
|
||||
|
||||
inProkHits = [i for i in open(BvE_folder + 'allmicresults.tsv').readlines()]
|
||||
inProkHits.sort(key=lambda x: (float(x.split('\t')[-2]), -int(x.split('\t')[3])))
|
||||
|
||||
for i in inEukHits:
|
||||
if EukDict[i.split('\t')[0]] == '':
|
||||
EukDict[i.split('\t')[0]] = float(i.split('\t')[-2])
|
||||
|
||||
for i in inProkHits:
|
||||
if ProkDict[i.split('\t')[0]] == '':
|
||||
ProkDict[i.split('\t')[0]] = float(i.split('\t')[-2])
|
||||
|
||||
for k in CompDict.keys():
|
||||
if EukDict[k] != '':
|
||||
CompDict[k].append(EukDict[k])
|
||||
else:
|
||||
CompDict[k].append('no hit')
|
||||
if ProkDict[k] != '':
|
||||
CompDict[k].append(ProkDict[k])
|
||||
else:
|
||||
CompDict[k].append('no hit')
|
||||
|
||||
for k, v in CompDict.items():
|
||||
|
||||
### Contigs lacking STRONG Eukaryotic OR Prokaryotic Hits
|
||||
if v[0] == 'no hit' and v[1] == 'no hit':
|
||||
CompDict[k].append('UNDETERMINED')
|
||||
|
||||
### Contigs lacking STRONG Eukaryotic with a Prokaryotic Hit
|
||||
elif v[0] != 'no hit' and v[1] == 'no hit':
|
||||
CompDict[k].append('EUKARYOTIC')
|
||||
|
||||
### Contigs with a Eukaryotic but without a Prokaryotic Hit
|
||||
elif v[0] == 'no hit' and v[1] != 'no hit':
|
||||
CompDict[k].append('PROKARYOTIC')
|
||||
|
||||
### Uses Basic math to determine if contigs with are MORE Eukaryotic than Prokaryotic
|
||||
else:
|
||||
try:
|
||||
prok_euk_ratio = float(v[1])/float(v[0])
|
||||
euk_prok_ratio = float(v[0])/float(v[1])
|
||||
|
||||
if prok_euk_ratio >= 100:
|
||||
CompDict[k].append('EUKARYOTIC')
|
||||
|
||||
elif euk_prok_ratio >= 1000:
|
||||
CompDict[k].append('PROKARYOTIC')
|
||||
|
||||
else:
|
||||
CompDict[k].append('UNDETERMINED')
|
||||
|
||||
except:
|
||||
CompDict[k].append('divide by zero')
|
||||
|
||||
with open(BvE_folder + 'comparisons.txt','w+') as w:
|
||||
for k, v in CompDict.items():
|
||||
w.write(k+':'+':'.join([str(i) for i in v])+'\n')
|
||||
|
||||
BvE_folder = '/'.join(args.input_file.split('/')[:-1]) + '/BvE/'
|
||||
BvE_output_base = BvE_folder+args.input_file.split('/')[-1].split('.fas')[0]
|
||||
|
||||
### Gathers the sequences and categorizes them
|
||||
Euk_Fasta = sorted((seq_rec for seq_rec in inFasta if CompDict[seq_rec.description][-1] == 'EUKARYOTIC'), key=lambda x: -int(len(x.seq)))
|
||||
Prok_Fasta = sorted((seq_rec for seq_rec in inFasta if CompDict[seq_rec.description][-1] == 'PROKARYOTIC'), key=lambda x: -int(len(x.seq)))
|
||||
Und_Fasta = sorted((seq_rec for seq_rec in inFasta if CompDict[seq_rec.description][-1] == 'UNDETERMINED'), key=lambda x: -int(len(x.seq)))
|
||||
Zero_Fasta = sorted((seq_rec for seq_rec in inFasta if CompDict[seq_rec.description][-1] == 'divide by zero'), key=lambda x: -int(len(x.seq)))
|
||||
|
||||
|
||||
### Writes out all of the categorized sequences
|
||||
with open(args.input_file.split('NorRNA')[0] + 'WTA_EPU.fasta', 'w') as epu:
|
||||
with open(BvE_output_base+'.Not_Bact.fasta','w+') as nb:
|
||||
for euk_seq in Euk_Fasta:
|
||||
nb.write('>' + euk_seq.description + '\n' + str(euk_seq.seq) + '\n')
|
||||
epu.write('>' + euk_seq.description + '_E' + '\n' + str(euk_seq.seq) + '\n')
|
||||
|
||||
|
||||
with open(BvE_output_base+'.Bact_Hit.fasta','w+') as pr:
|
||||
for prok_seq in Prok_Fasta:
|
||||
pr.write('>' + prok_seq.description + '\n' + str(prok_seq.seq) + '\n')
|
||||
epu.write('>' + prok_seq.description + '_P' + '\n' + str(prok_seq.seq) + '\n')
|
||||
|
||||
with open(BvE_output_base+'.Undetermined.fasta','w+') as und:
|
||||
for und_seq in Und_Fasta:
|
||||
und.write('>' + und_seq.description + '\n' + str(und_seq.seq) + '\n')
|
||||
epu.write('>' + und_seq.description + '_U' + '\n' + str(und_seq.seq) + '\n')
|
||||
|
||||
if len(Zero_Fasta) != 0:
|
||||
with open(BvE_output_base+'.DivideByZero.fasta','w+') as w:
|
||||
for zero_seq in Zero_Fasta:
|
||||
w.write('>' + zero_seq.description + '\n' + str(zero_seq.seq) + '\n')
|
||||
epu.write('>' + zero_seq.description + '_U' + '\n' + str(zero_seq.seq) + '\n')
|
||||
else:
|
||||
pass
|
||||
|
||||
return str(len(Euk_Fasta)), str(len(Prok_Fasta)), str(len(Und_Fasta))
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###----------------------- Updates Log of Prok vs Euk Comparisons ----------------------###
|
||||
###########################################################################################
|
||||
|
||||
def update_log(args, Euk_Contigs, Prok_Contigs, Und_Contigs):
|
||||
|
||||
if os.path.isdir('../PostAssembly_Logs/') != True:
|
||||
os.system('mkdir ../PostAssembly_Logs/')
|
||||
else:
|
||||
pass
|
||||
|
||||
print (color.BOLD +'\n\nThere are '+color.RED+Prok_Contigs+' Strongly Prokaryotic contigs'+color.END\
|
||||
+color.BOLD+',\n'+color.ORANGE+Euk_Contigs+' Strongly Eukaryotic contigs'+color.END\
|
||||
+color.BOLD+',\nand '+color.PURPLE+Und_Contigs+' Undetermined Contigs\n'+color.END\
|
||||
+color.BOLD+'in '+args.input_file.split('/')[-1]+color.END)
|
||||
|
||||
for Logname in os.listdir(os.curdir+'./PostAssembly_Logs/'):
|
||||
if Logname.startswith(args.input_file.split('/')[-1].split('_No')[0]) and Logname.endswith('Log.txt'): # ACL - ???
|
||||
with open('../PostAssembly_Logs/'+Logname,'a') as Logfilename:
|
||||
Logfilename.write('Prokaryotic Contigs\t'+Prok_Contigs+'\tn/a\tn/a\n')
|
||||
Logfilename.write('Eukaryotic Contigs\t'+Euk_Contigs+'\tn/a\tn/a\n')
|
||||
Logfilename.write('Undetermined Contigs\t'+Und_Contigs+'\tn/a\tn/a\n')
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------------------------------- Next Script Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def next_script(args):
|
||||
|
||||
print (color.BOLD+'\nLook for '+color.DARKCYAN+args.input_file.split('/')[-1]\
|
||||
.split('NorRNA')[0]+'WTA_EPU.fasta'+color.END+color.BOLD+' in the '\
|
||||
+args.input_file.split('/')[1]+' Folder\n\n' + color.END)
|
||||
print (color.BOLD + 'Next Script is: ' + color.GREEN + '3_CountOGsDiamond.py\n\n'+ color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###--------------------- Cleans up the Folder and Moves Final Files -------------------###
|
||||
##########################################################################################
|
||||
|
||||
def clean_up(args):
|
||||
|
||||
home_folder = '/'.join(args.input_file.split('/')[:-1])
|
||||
|
||||
os.system('cp '+home_folder+'/*WTA_EPU.fasta '+home_folder+'/BvE/')
|
||||
os.system('mv '+home_folder+'/*NorRNA*fasta '+home_folder+'/rRNA_Removal/')
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
|
||||
##########################################################################################
|
||||
|
||||
def main():
|
||||
|
||||
usearch_path = check_diamond_path()
|
||||
|
||||
args = check_args()
|
||||
|
||||
prep_folders(args)
|
||||
|
||||
ublast_BvE(args, usearch_path)
|
||||
|
||||
Euk_Contigs, Prok_Contigs, Und_Contigs = compare_hits(args)
|
||||
|
||||
#update_log(args, Euk_Contigs, Prok_Contigs, Und_Contigs)
|
||||
|
||||
clean_up(args)
|
||||
|
||||
next_script(args)
|
||||
|
||||
main()
|
||||
@ -1,354 +0,0 @@
|
||||
#!/usr/bin/env python3.5
|
||||
|
||||
##__Updated__: 16_10_2017
|
||||
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
|
||||
##__Usage__: python 3_CountOGsDiamond.py --help
|
||||
|
||||
|
||||
##########################################################################################
|
||||
## This script is intended to classify the STRONGLY Eukaryotic and UNDETERMINED/UNKNOWN ##
|
||||
## contigs into different OGs (e.g. orthologous gene-families) ##
|
||||
## ##
|
||||
## For more info about the OGs, check out: OrthoMCL.org ##
|
||||
## ##
|
||||
## Prior to running this script, ensure the following: ##
|
||||
## ##
|
||||
## 1. You have assembled your transcriptome and COPIED the 'assembly' file ##
|
||||
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ##
|
||||
## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py ##
|
||||
## 3. Removed SSU/LSU sequences from your Fasta File ##
|
||||
## ##
|
||||
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
|
||||
## ##
|
||||
## Next Script(s) to Run: ##
|
||||
## 4_StopFrequency.py ##
|
||||
## ##
|
||||
##########################################################################################
|
||||
|
||||
import argparse, os, sys
|
||||
from argparse import RawTextHelpFormatter,SUPPRESS
|
||||
from distutils import spawn
|
||||
from Bio import SeqIO
|
||||
|
||||
|
||||
#------------------------------ Colors For Print Statements ------------------------------#
|
||||
class color:
|
||||
PURPLE = '\033[95m'
|
||||
CYAN = '\033[96m'
|
||||
DARKCYAN = '\033[36m'
|
||||
ORANGE = '\033[38;5;214m'
|
||||
BLUE = '\033[94m'
|
||||
GREEN = '\033[92m'
|
||||
YELLOW = '\033[93m'
|
||||
RED = '\033[91m'
|
||||
BOLD = '\033[1m'
|
||||
UNDERLINE = '\033[4m'
|
||||
END = '\033[0m'
|
||||
|
||||
#------------------------------- Main Functions of Script --------------------------------#
|
||||
|
||||
###########################################################################################
|
||||
###---------------------------- UPDATE DIAMOND PATH BELOW! -----------------------------###
|
||||
###########################################################################################
|
||||
## IF Diamond is IN YOUR PATH then no updating is needed...
|
||||
|
||||
def check_diamond_path():
|
||||
|
||||
diamond_path = ''
|
||||
|
||||
if diamond_path == '':
|
||||
diamond_path = spawn.find_executable("diamond")
|
||||
#diamond_path = '/path/to/diamond'
|
||||
else:
|
||||
pass
|
||||
|
||||
if diamond_path == None:
|
||||
print (color.BOLD + '\n\nPlease open this script and check that you have included'\
|
||||
+' the PATH to the'+color.BLUE+' "usearch" '+color.END+color.BOLD+'executable.\n\n'+color.END)
|
||||
print (color.BOLD+color.BLUE+'LOOK FOR:\n\n'+color.RED\
|
||||
+'#------------------------------ UPDATE USEARCH PATH BELOW! -------------------------------#'\
|
||||
+color.BLUE+'\n\nThis is somewhere around lines 50 - 80...\n\n'+color.END)
|
||||
|
||||
sys.exit()
|
||||
else:
|
||||
pass
|
||||
|
||||
return diamond_path
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
|
||||
###########################################################################################
|
||||
|
||||
def check_args():
|
||||
|
||||
parser = argparse.ArgumentParser(description=
|
||||
color.BOLD + '\n\nThis script will categorize Contigs into'+color.ORANGE+' "Homologous" '\
|
||||
+color.END+color.BOLD+'Gene Families (OGs)\nbased on '+color.RED+'OrthoMCL'+color.END\
|
||||
+color.BOLD+"'s Gene Family Grouping\n\n\nNotes on this script and "+color.GREEN+\
|
||||
'OrthoMCL Families'+color.END+color.BOLD+' can be found\nat the bottom of '+color.GREEN\
|
||||
+'THIS script (3_CountOGsDiamond.py)\n'+color.END+usage_msg(), usage=SUPPRESS,
|
||||
formatter_class=RawTextHelpFormatter)
|
||||
|
||||
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
|
||||
|
||||
required_arg_group.add_argument('--input_file','-in', action='store',
|
||||
help=color.BOLD+color.GREEN+'Fasta file of Nucleotide sequences enriched \nwith'\
|
||||
' Eukaryotic protein coding transcripts'+color.END)
|
||||
required_arg_group.add_argument('--databases','-d', action='store',
|
||||
help=color.BOLD+color.GREEN+"Path to databases"+color.END)
|
||||
|
||||
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
|
||||
optional_arg_group.add_argument('--threads','-t', default='2',
|
||||
help=color.BOLD+color.GREEN+' Number of threads to use for BLAST\n (default = 2)\n'+color.END)
|
||||
optional_arg_group.add_argument('--evalue','-e', default=1e-5, type = float,
|
||||
help=color.BOLD+color.GREEN+' Maximum e-value for OG assignment\n (default = 1e-5)\n'+color.END)
|
||||
optional_arg_group.add_argument('-author', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
|
||||
|
||||
if len(sys.argv[1:]) == 0:
|
||||
print (parser.description)
|
||||
print ('\n')
|
||||
sys.exit()
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
quit_eval = return_more_info(args)
|
||||
if quit_eval > 0:
|
||||
sys.exit()
|
||||
|
||||
return args
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###------------------------------- Script Usage Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def usage_msg():
|
||||
return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 3_CountOGsDiamond.py'\
|
||||
' --input_file ../Op_me_Xxma/Op_me_Xxma_WTA_NBU.fasta'+color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
|
||||
##########################################################################################
|
||||
|
||||
def return_more_info(args):
|
||||
|
||||
valid_arg = 0
|
||||
|
||||
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
|
||||
' maurerax@gmail.com\n\n'+color.END)
|
||||
|
||||
if args.author == True:
|
||||
print (author)
|
||||
valid_arg += 1
|
||||
|
||||
if args.input_file != None:
|
||||
if os.path.isfile(args.input_file) != False:
|
||||
if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
elif args.input_file.endswith('WTA_EPU.fasta') != True:
|
||||
print (color.BOLD+'\n\nInvalid Fasta File! Only Fasta Files that were processed'\
|
||||
' with '+color.GREEN+'2b_remove_Bact.py '+color.END+color.BOLD+'are valid\n\n'\
|
||||
'However, to bypass that issue, Fasta Files MUST end with '+color.CYAN+\
|
||||
'"WTA_NBU.fasta"\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
else:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
if os.path.isdir(args.databases + '/db_OG') != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
|
||||
+color.ORANGE+'db_OG Folder!\n\n'+color.END+color.BOLD+'Ensure that this folder '\
|
||||
'can be found in the main '+color.ORANGE+'Databases Folder'+color.END+color.BOLD\
|
||||
+'\n\nThen try once again.\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
elif os.path.isfile(args.databases + '/db_OG/OGSout.dmnd') != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
|
||||
'Diamond formatted '+color.ORANGE+'Gene Family database!\n\n'+color.END+color.BOLD+\
|
||||
'Ensure that it can be found in the '+color.ORANGE+'db_OG folder'+color.END+\
|
||||
color.BOLD+',\nwhich can be found in the main '+color.ORANGE+'Databases Folder'+\
|
||||
color.END+color.BOLD+'\n\nThen try once again.\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
return valid_arg
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------------- Does the Inital Folder Prep -----------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def prep_folders(args):
|
||||
|
||||
OG_folder = '/'.join(args.input_file.split('/')[:-1]) + '/DiamondOG/'
|
||||
|
||||
if os.path.isdir(OG_folder) != True:
|
||||
os.system('mkdir '+OG_folder)
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------- Runs Diamond on Split OrthoMCL Databases ----------------------###
|
||||
###########################################################################################
|
||||
|
||||
def OG_diamond(args, diamond_path):
|
||||
|
||||
print (color.BOLD+'\nStarting to "BLAST" against OG databases'+color.END)
|
||||
|
||||
OG_folder = '/'.join(args.input_file.split('/')[:-1]) + '/DiamondOG/'
|
||||
|
||||
print (color.BOLD + '\n\n"BLAST"-ing against OG database using DIAMOND: ' + color.DARKCYAN + 'OGSout.dmnd' + color.END + '\n\n')
|
||||
|
||||
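# Notes on the DIAMOND call below: blastx translates the nucleotide contigs on the fly
# and searches them against the OG protein database; --subject-cover 0.35 keeps only
# alignments covering at least 35% of the reference protein, and --outfmt 6 writes the
# tabular report that keep_best() parses afterwards.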
OG_diamond_cmd = diamond_path + ' blastx -q ' + args.input_file + ' -d ' + args.databases + '/db_OG/OGSout.dmnd --evalue ' + str(args.evalue) + ' --threads ' + str(args.threads) + ' --subject-cover 0.35 --outfmt 6 -o ' + OG_folder + 'allOGresults.tsv'
|
||||
|
||||
os.system(OG_diamond_cmd)
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------- Keeps the Single BEST Hit (HSP-score) Per Transcript ----------------###
|
||||
###########################################################################################
|
||||
|
||||
def keep_best(args):
|
||||
|
||||
print (color.BOLD+color.PURPLE+'\n\nProcessing OG-database results to keep only the BEST match for each transcript\n\n'+color.END)
|
||||
|
||||
OG_folder = '/'.join(args.input_file.split('/')[:-1]) + '/DiamondOG/'
|
||||
|
||||
inTSV = [i for i in open(OG_folder + 'allOGresults.tsv').readlines()]
|
||||
|
||||
inTSV.sort(key = lambda x: -float(x.split('\t')[-1]))
|
||||
|
||||
keep = []
|
||||
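# inTSV was sorted above by the last outfmt-6 column (bitscore), highest first, so keeping
# only the first row seen for each query ID retains the single best-scoring hit per transcript.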
for i in inTSV:
|
||||
if any(i.split('\t')[0] in j for j in keep) != True:
|
||||
keep.append(i)
|
||||
|
||||
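# Tag each retained query name with the trailing OG identifier of its best hit (the last two
# '_'-separated fields of the subject name, e.g. OG5_123456) so the OG assignment travels with
# the sequence name in the renamed TSV.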
updated_lines = list(set([line.split('\t')[0]+'_'+'_'.join(line.split('\t')[1].split('_')[-2:])+'\t'+'\t'.join(line.split('\t')[1:]) for line in keep]))
|
||||
|
||||
with open(args.input_file.replace('.fasta','.Renamed_allOGCleanresults.tsv'), 'w+') as w:
|
||||
for i in updated_lines:
|
||||
w.write(i)
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------- Copies and Updates Names of Transcripts With OG Hits to New Fasta ----------###
|
||||
###########################################################################################
|
||||
|
||||
def update_fasta(args):
|
||||
|
||||
print (color.BOLD+color.PURPLE+'Updating Fasta File Sequence Names with their BEST OG hits\n\n'+color.END)
|
||||
|
||||
Renamed_TSV = args.input_file.replace('.fasta','.Renamed_allOGCleanresults.tsv')
|
||||
|
||||
keep = [i for i in open(Renamed_TSV).readlines() if i != '\n']
|
||||
|
||||
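# Map each original sequence name (anything before a prior '_OG5' tag) to the same name with
# '_OG5_<family number>' from its best hit appended; sequences without an OG5 hit are written
# unchanged to the LackOG fasta below.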
keep_dict = { line.split('\t')[0].split('_OG5')[0] : line.split('\t')[0].split('_OG5')[0] + '_OG5_' + line.split('\t')[1].split('_')[-1] for line in keep if 'OG5' in line.split('\t')[1] }
|
||||
|
||||
inFasta = [i for i in SeqIO.parse(args.input_file,'fasta')]
|
||||
|
||||
updated_seq_name = ['>'+keep_dict[i.description]+'\n'+str(i.seq)+'\n' for i in inFasta if i.description in keep_dict.keys()]
|
||||
|
||||
seqs_without_OG = ['>'+i.description+'\n'+str(i.seq)+'\n' for i in inFasta if i.description not in keep_dict.keys()]
|
||||
|
||||
with open(args.input_file.replace('.fasta','.Renamed.fasta'),'w+') as w:
|
||||
for i in updated_seq_name:
|
||||
w.write(i)
|
||||
|
||||
with open(args.input_file.replace('.fasta','.LackOG.fasta'),'w+') as x:
|
||||
for i in seqs_without_OG:
|
||||
x.write(i)
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------------------- Updates Log With OG Assignment Information ---------------------###
|
||||
###########################################################################################
|
||||
|
||||
def update_log(args):
|
||||
|
||||
if os.path.isdir('../PostAssembly_Logs/') != True:
|
||||
os.system('mkdir ../PostAssembly_Logs/')
|
||||
else:
|
||||
pass
|
||||
|
||||
home_folder = '/'.join(args.input_file.split('/')[:-1]) + '/'
|
||||
|
||||
Renamed_TSV = home_folder+args.input_file.split('/')[-1].replace('.fasta','.Renamed_allOGCleanresults.tsv')
|
||||
|
||||
keep = [line for line in open(Renamed_TSV).readlines()]
|
||||
all_ogs = [line.split('\t')[1].split('_')[-1] for line in keep if 'OG5' in line.split('\t')[1]]
|
||||
|
||||
total_with_ogs = str(len(all_ogs))
|
||||
unique_ogs = str(len(set(all_ogs)))
|
||||
|
||||
print (color.BOLD +'There are '+color.BLUE +total_with_ogs+' Contigs'+color.END\
|
||||
+color.BOLD+' that hit '+color.DARKCYAN+unique_ogs+' Unique OGs\n'+color.END)
|
||||
|
||||
|
||||
for Logname in os.listdir(os.curdir+'./PostAssembly_Logs/'):
|
||||
if Logname.startswith(args.input_file.split('/')[2].split('_WTA')[0]) and Logname.endswith('Log.txt'):
|
||||
with open('../PostAssembly_Logs/'+Logname,'a') as LogFile:
|
||||
LogFile.write('Contigs With OG\t'+total_with_ogs+'\tn/a\tn/a\n')
|
||||
LogFile.write('Unique OGs\t'+unique_ogs+'\tn/a\tn/a\n')
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###--------------------- Cleans up the Folder and Moves Final Files -------------------###
|
||||
##########################################################################################
|
||||
|
||||
def clean_up(args):
|
||||
|
||||
OG_folder = '/'.join(args.input_file.split('/')[:-1]) + '/DiamondOG/'
|
||||
|
||||
os.system('rm ' + args.input_file)
|
||||
|
||||
os.system('cp ' + args.input_file.replace('.fasta','.Renamed.fasta') + ' ' + OG_folder)
|
||||
|
||||
os.system('cp ' + args.input_file.replace('.fasta','.Renamed_allOGCleanresults.tsv') + ' ' + OG_folder)
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------------------------------- Next Script Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def next_script(args):
|
||||
|
||||
home_folder = '/'.join(args.input_file.split('/')[:-1]) + '/'
|
||||
|
||||
print (color.BOLD+'\nLook for '+color.DARKCYAN+args.input_file.split('/')[-1]\
|
||||
.replace('.fasta','.Renamed.fasta')+color.END+color.BOLD+' in the '+home_folder\
|
||||
+' Folder\n\n' + color.END)
|
||||
|
||||
print (color.BOLD+'Next Script is: '+color.GREEN+'4_InFrameStopFreq.py\n\n'+ color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
|
||||
##########################################################################################
|
||||
|
||||
def main():
|
||||
|
||||
diamond_path = check_diamond_path()
|
||||
|
||||
args = check_args()
|
||||
|
||||
prep_folders(args)
|
||||
|
||||
OG_diamond(args, diamond_path)
|
||||
|
||||
keep_best(args)
|
||||
|
||||
update_fasta(args)
|
||||
|
||||
#update_log(args)
|
||||
|
||||
clean_up(args)
|
||||
|
||||
next_script(args)
|
||||
|
||||
main()
|
||||
@ -1,790 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
##__Updated__: 18_08_2017
|
||||
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
|
||||
##__Usage__: python 4_InFrameStopFreq.py --help
|
||||
|
||||
|
||||
##########################################################################################
|
||||
## This script is intended to aid in identifying the genetic code of the data given ##
|
||||
## ##
|
||||
## Prior to running this script, ensure the following: ##
|
||||
## ##
|
||||
## 1. You have assembled your transcriptome and COPIED the 'assembly' file ##
|
||||
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ##
|
||||
## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py ##
|
||||
## 3. Removed SSU/LSU sequences from your Fasta File ##
|
||||
## 4. Classified your sequences as Strongly Prokaryotic/Eukaryotic or Undetermined ##
|
||||
## 5. Classified the Non-Strongly Prokaryotic sequences into OGs ##
|
||||
## ##
|
||||
## COMMAND Example Below ##
|
||||
## Extra Notes at Bottom of Script ##
|
||||
## ##
|
||||
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
|
||||
## ##
|
||||
## Next Script(s) to Run: ##
|
||||
## 5_GCodeTranslate.py ##
|
||||
## ##
|
||||
##########################################################################################
|
||||
|
||||
|
||||
import argparse, os, sys
|
||||
from argparse import RawTextHelpFormatter,SUPPRESS
|
||||
from distutils import spawn
|
||||
|
||||
from Bio import SeqIO
|
||||
from Bio.Seq import Seq
|
||||
from Bio.Data.CodonTable import CodonTable
|
||||
|
||||
|
||||
#-------------------------- Set-up Codon Tables (Genetic Codes) --------------------------#
|
||||
|
||||
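# The two custom tables below each leave a single canonical stop in place and read the
# reassigned stops as glutamine ('Q'): tag_table keeps only TAG as a stop (TAA/TGA -> Q),
# while c_uncinata_table keeps only TAA (TAG/TGA -> Q). The TGA-only translation pass
# further down reuses Biopython's built-in table 6 (ciliate-type code) instead.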
tag_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Q',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGA': 'Q', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = ['TAG'])
|
||||
|
||||
c_uncinata_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y', 'TAG': 'Q',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGA': 'Q', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = ['TAA'])
|
||||
|
||||
#------------------------------ Colors For Print Statements ------------------------------#
|
||||
class color:
|
||||
PURPLE = '\033[95m'
|
||||
CYAN = '\033[96m'
|
||||
DARKCYAN = '\033[36m'
|
||||
ORANGE = '\033[38;5;214m'
|
||||
BLUE = '\033[94m'
|
||||
GREEN = '\033[92m'
|
||||
YELLOW = '\033[93m'
|
||||
RED = '\033[91m'
|
||||
BOLD = '\033[1m'
|
||||
UNDERLINE = '\033[4m'
|
||||
END = '\033[0m'
|
||||
|
||||
#------------------------------- Main Functions of Script --------------------------------#
|
||||
|
||||
###########################################################################################
|
||||
###---------------------------- UPDATE DIAMOND PATH BELOW! -----------------------------###
|
||||
###########################################################################################
|
||||
## IF Diamond is IN YOUR PATH then no updating is needed...
|
||||
|
||||
def check_diamond_path():
|
||||
|
||||
diamond_path = ''
|
||||
|
||||
if diamond_path == '':
|
||||
diamond_path = spawn.find_executable("diamond")
|
||||
#diamond_path = '/path/to/diamond'
|
||||
else:
|
||||
pass
|
||||
|
||||
if diamond_path == None:
|
||||
print (color.BOLD + '\n\nPlease open this script and check that you have included'\
|
||||
+' the PATH to the'+color.BLUE+' "diamond" '+color.END+color.BOLD+'executable.\n\n'+color.END)
|
||||
print (color.BOLD+color.BLUE+'LOOK FOR:\n\n'+color.RED\
|
||||
+'#------------------------------ UPDATE DIAMOND PATH BELOW! -------------------------------#'\
|
||||
+color.BLUE+'\n\nThis is somewhere around lines 50 - 80...\n\n'+color.END)
|
||||
|
||||
sys.exit()
|
||||
else:
|
||||
pass
|
||||
|
||||
return diamond_path
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
|
||||
###########################################################################################
|
||||
|
||||
def check_args():
|
||||
|
||||
parser = argparse.ArgumentParser(description=
|
||||
color.BOLD+'\n\nThis script is intended to '+color.RED+'AID You '+color.END+color.BOLD\
|
||||
+'in determining the '+color.RED+'\nLikely Genetic Code'+color.END+color.BOLD+' of a'\
|
||||
' given Fasta File of transcripts\n\nInterpretation of the output (StopFreq.tsv) is difficult \nand so '+color.ORANGE\
|
||||
+'TWO EXAMPLES'+color.END+color.BOLD+' can be found in the '+color.CYAN+'NOTES Section'\
|
||||
+color.END+color.BOLD+' of\nTHIS Script '+color.GREEN+'(4_InFrameStopFreq.py)\n'+color.END\
|
||||
+usage_msg(), usage=SUPPRESS,formatter_class=RawTextHelpFormatter)
|
||||
|
||||
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
|
||||
|
||||
required_arg_group.add_argument('--input_file','-in', action='store', required=True,
|
||||
help=color.BOLD+color.GREEN+'Fasta file of Nucleotide sequences enriched \nwith'\
|
||||
' Eukaryotic protein coding transcripts'+color.END)
|
||||
required_arg_group.add_argument('--databases','-d', action='store',
|
||||
help=color.BOLD+color.GREEN+"Path to databases"+color.END)
|
||||
|
||||
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
|
||||
optional_arg_group.add_argument('-author', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
|
||||
|
||||
if len(sys.argv[1:]) == 0:
|
||||
print (parser.description)
|
||||
print ('\n')
|
||||
sys.exit()
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
quit_eval = return_more_info(args)
|
||||
if quit_eval > 0:
|
||||
sys.exit()
|
||||
|
||||
return args
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###------------------------------- Script Usage Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def usage_msg():
|
||||
return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 4_InFrameStopFreq.py'\
|
||||
' --input_file ../Op_me_Xxma/Op_me_Xxma_WTA_EPU.Renamed.fasta'+color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
|
||||
##########################################################################################
|
||||
|
||||
def return_more_info(args):
|
||||
|
||||
valid_arg = 0
|
||||
|
||||
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
|
||||
' maurerax@gmail.com\n\n'+color.END)
|
||||
|
||||
if args.author == True:
|
||||
print (author)
|
||||
valid_arg += 1
|
||||
|
||||
if args.input_file != None:
|
||||
if os.path.isfile(args.input_file) != False:
|
||||
if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
elif args.input_file.endswith('WTA_EPU.Renamed.fasta') != True:
|
||||
print (color.BOLD+'\n\nInvalid Fasta File! Only Fasta Files that were processed'\
|
||||
' with '+color.GREEN+'3_CountOGsDiamond.py '+color.END+color.BOLD+'are valid\n\n'\
|
||||
'However, to bypass that issue, Fasta Files MUST end with '+color.CYAN+\
|
||||
'"WTA_NBU.Renamed.fasta"\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
else:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
if os.path.isdir(args.databases + '/db_StopFreq') != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
|
||||
+color.ORANGE+'db_StopFreq Folder!\n\n'+color.END+color.BOLD+'Ensure that this folder '\
|
||||
'can be found in the main '+color.ORANGE+'Databases Folder'+color.END+color.BOLD\
|
||||
+'\n\nThen try once again.\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
elif os.path.isfile(args.databases + '/db_StopFreq/RepEukProts.dmnd') != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
|
||||
'Diamond formatted '+color.ORANGE+'Representative Eukaryotic Protein Database!\n\n'+color.END+color.BOLD+\
|
||||
'Ensure that it can be found in the '+color.ORANGE+'db_StopFreq folder'+color.END+\
|
||||
color.BOLD+',\nwhich can be found in the main '+color.ORANGE+'Databases Folder'+\
|
||||
color.END+color.BOLD+'\n\nThen try once again.\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
return valid_arg
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------------- Does the Inital Folder Prep -----------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def prep_folders(args):
|
||||
|
||||
Stop_folder = '../'+args.input_file.split('/')[1]+'/StopCodonFreq/'
|
||||
|
||||
if os.path.isdir(Stop_folder) != True:
|
||||
os.system('mkdir '+Stop_folder)
|
||||
|
||||
if os.path.isdir(Stop_folder+'StopCodonFastas') != True:
|
||||
os.system('mkdir '+Stop_folder+'StopCodonFastas')
|
||||
|
||||
if os.path.isdir(Stop_folder+'SpreadSheets') != True:
|
||||
os.system('mkdir '+Stop_folder+'SpreadSheets')
|
||||
|
||||
return Stop_folder+'StopCodonFastas/'
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------- Translates Sequences with Each Stop Codon ---------------------###
|
||||
###########################################################################################
|
||||
|
||||
def prep_translations(args):
|
||||
print (color.BOLD+'\nIdentifying ORFs in the Fasta file based on the output of 3_CountOGsDiamond.py\n'+color.END)
|
||||
|
||||
intsv = [i for i in open(args.input_file.replace('.fasta','_allOGCleanresults.tsv')).readlines() if i != '\n']
|
||||
|
||||
inFasta = [i for i in SeqIO.parse(args.input_file,'fasta')]
|
||||
|
||||
prot_dict = {}
|
||||
|
||||
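# Columns 7 and 8 of each DIAMOND hit are the query start/end coordinates of the best OG
# alignment: qstart < qend marks a forward-frame ORF ('F'), qstart > qend a hit on the
# reverse strand ('RC'). The small +3/-4 adjustments appear intended to extend the stored
# slice far enough to reach the downstream stop codon.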
for i in intsv:
|
||||
# print i
|
||||
prot_dict.setdefault(i.split('\t')[0],[])
|
||||
if int(i.split('\t')[6]) < int(i.split('\t')[7]):
|
||||
prot_dict[i.split('\t')[0]].append('F')
|
||||
if (int(i.split('\t')[6])) < 5:
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[6])-1)
|
||||
else:
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[6])-1)
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[7])+3)
|
||||
if int(i.split('\t')[7]) < int(i.split('\t')[6]):
|
||||
prot_dict[i.split('\t')[0]].append('RC')
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[6]))
|
||||
if (int(i.split('\t')[7])-4) < 5:
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[7]))
|
||||
else:
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[7])-4)
|
||||
|
||||
|
||||
#------------- Prep translation with 'TAA' as the only Stop -------------#
|
||||
|
||||
print (color.BOLD+'\n\nTranslating DNA using'+color.RED+' TAA'+color.END\
|
||||
+color.BOLD+' as the sole STOP codon\n'+color.END)
|
||||
|
||||
for key, value in prot_dict.items():
|
||||
for seq_rec in inFasta:
|
||||
if key in seq_rec.description:
|
||||
stop_pos = 0
|
||||
if prot_dict[key][0] == 'F':
|
||||
temp = seq_rec.seq[prot_dict[key][1]:]
|
||||
temp_prot = str(temp.translate(table=c_uncinata_table))
|
||||
if '*' in temp_prot:
|
||||
stop_pos = (temp_prot.index('*')+1)*3
|
||||
prot_dict[key].append(temp[:stop_pos])
|
||||
else:
|
||||
prot_dict[key].append(seq_rec.seq[prot_dict[key][1]:prot_dict[key][2]])
|
||||
if prot_dict[key][0] == 'RC':
|
||||
temp = seq_rec.seq[:prot_dict[key][1]].reverse_complement()
|
||||
temp_prot = str(temp.translate(table=c_uncinata_table))
|
||||
if '*' in temp_prot:
|
||||
stop_pos = (temp_prot.index('*')+1)*3
|
||||
prot_dict[key].append(temp[:stop_pos])
|
||||
else:
|
||||
prot_dict[key].append(seq_rec.seq[prot_dict[key][2]:prot_dict[key][1]].reverse_complement())
|
||||
|
||||
|
||||
#------------- Prep translation with 'TGA' as the only Stop -------------#
|
||||
print (color.BOLD+'\n\nTranslating DNA using'+color.RED+' TGA'+color.END\
|
||||
+color.BOLD+' as the sole STOP codon\n'+color.END)
|
||||
|
||||
for key, value in prot_dict.items():
|
||||
for seq_rec in inFasta:
|
||||
if key in seq_rec.description:
|
||||
stop_pos = 0
|
||||
if prot_dict[key][0] == 'F':
|
||||
temp = seq_rec.seq[prot_dict[key][1]:]
|
||||
temp_prot = str(temp.translate(table=6))
|
||||
if '*' in temp_prot:
|
||||
stop_pos = (temp_prot.index('*')+1)*3
|
||||
prot_dict[key].append(temp[:stop_pos])
|
||||
else:
|
||||
prot_dict[key].append(seq_rec.seq[prot_dict[key][1]:prot_dict[key][2]])
|
||||
if prot_dict[key][0] == 'RC':
|
||||
temp = seq_rec.seq[:prot_dict[key][1]].reverse_complement()
|
||||
temp_prot = str(temp.translate(table=6))
|
||||
if '*' in temp_prot:
|
||||
stop_pos = (temp_prot.index('*')+1)*3
|
||||
prot_dict[key].append(temp[:stop_pos])
|
||||
else:
|
||||
prot_dict[key].append(seq_rec.seq[prot_dict[key][2]:prot_dict[key][1]].reverse_complement())
|
||||
|
||||
|
||||
#------------- Prep translation with 'TAG' as the only Stop -------------#
|
||||
print (color.BOLD+'\n\nTranslating DNA using'+color.RED+' TAG'+color.END\
|
||||
+color.BOLD+' as the sole STOP codon\n'+color.END)
|
||||
|
||||
|
||||
for key, value in prot_dict.items():
|
||||
for seq_rec in inFasta:
|
||||
if key in seq_rec.description:
|
||||
stop_pos = 0
|
||||
if prot_dict[key][0] == 'F':
|
||||
temp = seq_rec.seq[prot_dict[key][1]:]
|
||||
temp_prot = str(temp.translate(table=tag_table))
|
||||
if '*' in temp_prot:
|
||||
stop_pos = (temp_prot.index('*')+1)*3
|
||||
prot_dict[key].append(temp[:stop_pos])
|
||||
else:
|
||||
prot_dict[key].append(seq_rec.seq[prot_dict[key][1]:prot_dict[key][2]])
|
||||
if prot_dict[key][0] == 'RC':
|
||||
temp = seq_rec.seq[:prot_dict[key][1]].reverse_complement()
|
||||
temp_prot = str(temp.translate(table=tag_table))
|
||||
if '*' in temp_prot:
|
||||
stop_pos = (temp_prot.index('*')+1)*3
|
||||
prot_dict[key].append(temp[:stop_pos])
|
||||
else:
|
||||
prot_dict[key].append(seq_rec.seq[prot_dict[key][2]:prot_dict[key][1]].reverse_complement())
|
||||
|
||||
#------------ Parsing through data to maintain OG assignments ------------#
|
||||
inOGs = intsv
|
||||
inOGs = [i.split('\t')[0]+';'+i.split('\t')[1][-10:] for i in inOGs]
|
||||
inOGs2 = []
|
||||
for i in inOGs:
|
||||
if 'no_group' not in i.split(';')[1]:
|
||||
inOGs2.append(i)
|
||||
else:
|
||||
inOGs2.append(i.split(';')[0]+';no_group')
|
||||
inOGs2 = list(set(inOGs2))
|
||||
|
||||
#---------------- Write file with 'TAA' is the only Stop ----------------#
|
||||
|
||||
with open(args.input_file.split('.fas')[0]+'_taa_ORF.fasta','w+') as w:
|
||||
print (color.BOLD+'\n\nWriting FASTA files with ORF and Protein sequences with'+color.RED\
|
||||
+' TAA '+color.END+color.BOLD+'as only STOP codon\n'+color.END)
|
||||
|
||||
for key, value in prot_dict.items():
|
||||
for j in inOGs2:
|
||||
if key == j.split(';')[0]:
|
||||
if len(prot_dict[key]) < 4:
|
||||
pass
|
||||
else:
|
||||
w.write('>'+key+'_'+j.split(';')[1]+'\n'+str(value[-3]).upper()+'\n')
|
||||
|
||||
with open(args.input_file.split('.fas')[0]+'_taa_ORF.aa.fasta','w+') as w:
|
||||
for key, value in prot_dict.items():
|
||||
for j in inOGs2:
|
||||
if key == j.split(';')[0]:
|
||||
if len(prot_dict[key]) < 4:
|
||||
pass
|
||||
else:
|
||||
w.write('>'+key+'_'+j.split(';')[1]+'\n'+str(Seq(str(value[-3])).translate(table=c_uncinata_table)).upper()+'\n')
|
||||
|
||||
#---------------- Write file with 'TGA' is the only Stop ----------------#
|
||||
|
||||
with open(args.input_file.split('.fas')[0]+'_tga_ORF.fasta','w+') as w:
|
||||
print (color.BOLD+'\n\nWriting FASTA files with ORF and Protein sequences with'+color.RED\
|
||||
+' TGA '+color.END+color.BOLD+'as only STOP codon\n'+color.END)
|
||||
|
||||
for key, value in prot_dict.items():
|
||||
for j in inOGs2:
|
||||
if key == j.split(';')[0]:
|
||||
if len(prot_dict[key]) < 4:
|
||||
pass
|
||||
else:
|
||||
w.write('>'+key+'_'+j.split(';')[1]+'\n'+str(value[-2]).upper()+'\n')
|
||||
|
||||
with open(args.input_file.split('.fas')[0]+'_tga_ORF.aa.fasta','w+') as w:
|
||||
for key, value in prot_dict.items():
|
||||
for j in inOGs2:
|
||||
if key == j.split(';')[0]:
|
||||
if len(prot_dict[key]) < 4:
|
||||
pass
|
||||
else:
|
||||
w.write('>'+key+'_'+j.split(';')[1]+'\n'+str(Seq(str(value[-2])).translate(table=6)).upper()+'\n')
|
||||
|
||||
#---------------- Write file with 'TAG' is the only Stop ----------------#
|
||||
|
||||
with open(args.input_file.split('.fas')[0]+'_tag_ORF.fasta','w+') as w:
|
||||
print (color.BOLD+'\n\nWriting FASTA files with ORF and Protein sequences with'+color.RED\
|
||||
+' TAG '+color.END+color.BOLD+'as only STOP codon\n'+color.END)
|
||||
|
||||
for key, value in prot_dict.items():
|
||||
for j in inOGs2:
|
||||
if key == j.split(';')[0]:
|
||||
if len(prot_dict[key]) < 4:
|
||||
pass
|
||||
else:
|
||||
w.write('>'+key+'_'+j.split(';')[1]+'\n'+str(value[-1]).upper()+'\n')
|
||||
|
||||
with open(args.input_file.split('.fas')[0]+'_tag_ORF.aa.fasta','w+') as w:
|
||||
for key, value in prot_dict.items():
|
||||
for j in inOGs2:
|
||||
if key == j.split(';')[0]:
|
||||
if len(prot_dict[key]) < 4:
|
||||
pass
|
||||
else:
|
||||
w.write('>'+key+'_'+j.split(';')[1]+'\n'+str(Seq(str(value[-1])).translate(table=tag_table)).upper()+'\n')
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###---------- Diamonds the Translations Against a SMALL Euk Protein Database ----------###
|
||||
###########################################################################################
|
||||
|
||||
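# Each of the three candidate ORF translations (TAA-, TGA- and TAG-only) is searched with
# blastp against the small representative eukaryotic protein set (RepEukProts.dmnd); the
# query start/end columns of those reports delimit the region that hunt_for_stops() scans
# for in-frame stop codons.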
def diamond_ProtDB(args, diamond_path):
|
||||
os.system(diamond_path + ' blastp -q ' + args.input_file.split('.fas')[0] + '_tag_ORF.aa.fasta -d ' + args.databases + '/db_StopFreq/RepEukProts.dmnd --evalue 1e-5 --max-target-seqs 1 --threads 60 --outfmt 6 -o ' + args.input_file.split('.fas')[0] + '_tag_ORF.RepEukProts.tsv')
|
||||
|
||||
os.system(diamond_path + ' blastp -q ' + args.input_file.split('.fas')[0] + '_tga_ORF.aa.fasta -d ' + args.databases + '/db_StopFreq/RepEukProts.dmnd --evalue 1e-5 --max-target-seqs 1 --threads 60 --outfmt 6 -o ' + args.input_file.split('.fas')[0] + '_tga_ORF.RepEukProts.tsv')
|
||||
|
||||
os.system(diamond_path + ' blastp -q ' + args.input_file.split('.fas')[0] + '_taa_ORF.aa.fasta -d ' + args.databases + '/db_StopFreq/RepEukProts.dmnd --evalue 1e-5 --max-target-seqs 1 --threads 60 --outfmt 6 -o ' + args.input_file.split('.fas')[0] + '_taa_ORF.RepEukProts.tsv')
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------------------- Manages the search for In-Frame Stop Codons --------------------###
|
||||
###########################################################################################
|
||||
|
||||
|
||||
def hunt_for_stops(args):
|
||||
|
||||
#------------------------ Open Fasta Files ------------------------#
|
||||
try:
|
||||
TAGinFasta = [i for i in SeqIO.parse(args.input_file.split('.fas')[0]+'_tag_ORF.fasta','fasta') if str(i.seq).endswith('TAG')]
|
||||
print (color.BOLD+'\n\nGathering Sequence information from FASTA and TSV files\n'+color.END)
|
||||
|
||||
except:
|
||||
print (color.BOLD+color.RED+'\n\nMissing Necessary Inputs: Open Script for Usage'\
|
||||
' Information\n\n'+color.END)
|
||||
sys.exit()
|
||||
|
||||
TGAinFasta = [i for i in SeqIO.parse(args.input_file.split('.fas')[0]+'_tga_ORF.fasta','fasta') if str(i.seq).endswith('TGA')]
|
||||
|
||||
TAAinFasta = [i for i in SeqIO.parse(args.input_file.split('.fas')[0]+'_taa_ORF.fasta','fasta') if str(i.seq).endswith('TAA')]
|
||||
|
||||
## This section originally ONLY considered sequences WITH OG assignments:
|
||||
## TAAinFasta = [i for i in TAAinFasta if 'no_group' not in i.description and str(i.seq).endswith('TAA')]
|
||||
## This has been taken out for now
|
||||
|
||||
#----------------------- Open BLAST Reports -----------------------#
|
||||
|
||||
TAGinTSV = [i for i in open(args.input_file.split('.fas')[0]+'_tag_ORF.RepEukProts.tsv').read().split('\n') if i != '']
|
||||
|
||||
TGAinTSV = [i for i in open(args.input_file.split('.fas')[0]+'_tga_ORF.RepEukProts.tsv').read().split('\n') if i != '']
|
||||
|
||||
TAAinTSV = [i for i in open(args.input_file.split('.fas')[0]+'_taa_ORF.RepEukProts.tsv').read().split('\n') if i != '']
|
||||
|
||||
## This section originally ONLY considered sequences WITH OG assignments:
|
||||
## TAAinTSV = i for i in TAAinTSV if i != ''and 'no_group' not in i.split('\t')[0]]
|
||||
## This has been taken out for now
|
||||
|
||||
|
||||
#------------ Set-up Genetic Code Specific Dictionaries ------------#
|
||||
|
||||
tag_dict = {}
|
||||
for i in TAGinTSV:
|
||||
tag_dict.setdefault(i.split('\t')[0].replace('_TAG',''),[]).append(int(i.split('\t')[-6]))
|
||||
tag_dict.setdefault(i.split('\t')[0].replace('_TAG',''),[]).append(int(i.split('\t')[-5]))
|
||||
|
||||
tga_dict = {}
|
||||
for i in TGAinTSV:
|
||||
tga_dict.setdefault(i.split('\t')[0].replace('_Ciliate',''),[]).append(int(i.split('\t')[-6]))
|
||||
tga_dict.setdefault(i.split('\t')[0].replace('_Ciliate',''),[]).append(int(i.split('\t')[-5]))
|
||||
|
||||
taa_dict = {}
|
||||
for i in TAAinTSV:
|
||||
taa_dict.setdefault(i.split('\t')[0].replace('_Chilo',''),[]).append(int(i.split('\t')[-6]))
|
||||
taa_dict.setdefault(i.split('\t')[0].replace('_Chilo',''),[]).append(int(i.split('\t')[-5]))
|
||||
|
||||
#-------------- Preparing In-Frame Stop Codon Counts --------------#
|
||||
|
||||
# All the data when TGA is the sole stop codon
|
||||
tga_codons = 0
|
||||
tga_data_tag = 0
|
||||
tga_data_tga = 0
|
||||
tga_data_taa = 0
|
||||
tga_seq_count = 0
|
||||
|
||||
# All the data when TAG is the sole stop codon
|
||||
tag_codons = 0
|
||||
tag_data_tag = 0
|
||||
tag_data_tga = 0
|
||||
tag_data_taa = 0
|
||||
tag_seq_count = 0
|
||||
|
||||
# All the data when TAA is the sole stop codon
|
||||
taa_codons = 0
|
||||
taa_data_tag = 0
|
||||
taa_data_tga = 0
|
||||
taa_data_taa = 0
|
||||
taa_seq_count = 0
|
||||
|
||||
# All the data for each stop codon combined
|
||||
tga_inframe = 0
|
||||
tag_inframe = 0
|
||||
taa_inframe = 0
|
||||
total_codons = 0
|
||||
total_seq_counts = len(open(args.input_file).read().split('>'))-1
|
||||
|
||||
|
||||
#-------- Gathering In-frame Stop Codon Density Information --------#
|
||||
|
||||
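# For each single-stop translation, walk the aligned region codon by codon (steps of 3
# within the blastp query coordinates) and tally in-frame occurrences of the three canonical
# stops; these counts, normalised per 1000 codons, become the densities in StopCodonStats.tsv.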
### Collect in-frame stop information for "TAA" and "TAG" when TGA is the ONLY stop
|
||||
print (color.BOLD+'\nCollecting in-frame stop codon information when'+color.RED\
|
||||
+' TGA'+color.END+color.BOLD+' is the only STOP\n'+color.END)
|
||||
|
||||
for i in TGAinFasta:
|
||||
try:
|
||||
if tga_dict[i.description][0] == 1:
|
||||
for n in range((tga_dict[i.description][0]-1),((tga_dict[i.description][1])*3)-3,3):
|
||||
if str(i.seq).upper()[n:n+3] == 'TAG':
|
||||
tga_data_tag += 1
|
||||
tag_inframe += 1
|
||||
if str(i.seq).upper()[n:n+3].upper() == 'TAA':
|
||||
tga_data_taa += 1
|
||||
taa_inframe += 1
|
||||
if str(i.seq).upper()[n:n+3].upper() == 'TGA':
|
||||
tga_data_tga += 1
|
||||
tga_inframe += 1
|
||||
tga_codons += 1
|
||||
total_codons += 1
|
||||
tga_seq_count += 1
|
||||
|
||||
else:
|
||||
for n in range(((tga_dict[i.description][0]-1)*3),((tga_dict[i.description][1])*3)-3,3):
|
||||
if str(i.seq).upper()[n:n+3] == 'TAG':
|
||||
tga_data_tag += 1
|
||||
tag_inframe += 1
|
||||
if str(i.seq).upper()[n:n+3].upper() == 'TAA':
|
||||
tga_data_taa += 1
|
||||
taa_inframe += 1
|
||||
if str(i.seq).upper()[n:n+3].upper() == 'TGA':
|
||||
tga_data_tga += 1
|
||||
tga_inframe += 1
|
||||
tga_codons += 1
|
||||
total_codons += 1
|
||||
tga_seq_count += 1
|
||||
except:
|
||||
pass
|
||||
|
||||
### Collect in-frame stop information for "TAA" and "TGA" when TAG is the ONLY stop
|
||||
print (color.BOLD+'\nCollecting in-frame stop codon information when'+color.RED\
|
||||
+' TAG'+color.END+color.BOLD+' is the only STOP\n'+color.END)
|
||||
|
||||
for i in TAGinFasta:
|
||||
try:
|
||||
if tag_dict[i.description][0] == 1:
|
||||
for n in range((tag_dict[i.description][0]-1),((tag_dict[i.description][1])*3)-3,3):
|
||||
if str(i.seq).upper()[n:n+3] == 'TAG':
|
||||
tag_data_tag += 1
|
||||
tag_inframe += 1
|
||||
if str(i.seq).upper()[n:n+3].upper() == 'TAA':
|
||||
tag_data_taa += 1
|
||||
taa_inframe += 1
|
||||
if str(i.seq).upper()[n:n+3].upper() == 'TGA':
|
||||
tag_data_tga += 1
|
||||
tga_inframe += 1
|
||||
tag_codons += 1
|
||||
total_codons += 1
|
||||
tag_seq_count += 1
|
||||
|
||||
else:
|
||||
for n in range(((tag_dict[i.description][0]-1)*3),(tag_dict[i.description][1]*3)-3,3):
|
||||
if str(i.seq).upper()[n:n+3] == 'TAG':
|
||||
tag_data_tag += 1
|
||||
tag_inframe += 1
|
||||
if str(i.seq).upper()[n:n+3].upper() == 'TAA':
|
||||
tag_data_taa += 1
|
||||
taa_inframe += 1
|
||||
if str(i.seq).upper()[n:n+3].upper() == 'TGA':
|
||||
tag_data_tga += 1
|
||||
tga_inframe += 1
|
||||
tag_codons += 1
|
||||
total_codons += 1
|
||||
tag_seq_count += 1
|
||||
except:
|
||||
pass
|
||||
|
||||
|
||||
### Collect in-frame stop information for "TGA" and "TAG" when TAA is the ONLY stop
|
||||
print (color.BOLD+'\nCollecting in-frame stop codon information when'+color.RED\
|
||||
+' TAA'+color.END+color.BOLD+' is the only STOP\n'+color.END)
|
||||
|
||||
for i in TAAinFasta:
|
||||
try:
|
||||
if taa_dict[i.description][0] == 1:
|
||||
for n in range((taa_dict[i.description][0]-1),((taa_dict[i.description][1])*3)-3,3):
|
||||
if str(i.seq).upper()[n:n+3] == 'TAG':
|
||||
taa_data_tag += 1
|
||||
tag_inframe += 1
|
||||
if str(i.seq).upper()[n:n+3].upper() == 'TAA':
|
||||
taa_data_taa += 1
|
||||
taa_inframe += 1
|
||||
if str(i.seq).upper()[n:n+3].upper() == 'TGA':
|
||||
taa_data_tga += 1
|
||||
tga_inframe += 1
|
||||
taa_codons += 1
|
||||
total_codons += 1
|
||||
taa_seq_count += 1
|
||||
|
||||
else:
|
||||
for n in range(((taa_dict[i.description][0]-1)*3),(taa_dict[i.description][1]*3)-3,3):
|
||||
if str(i.seq).upper()[n:n+3] == 'TAG':
|
||||
taa_data_tag += 1
|
||||
tag_inframe += 1
|
||||
if str(i.seq).upper()[n:n+3].upper() == 'TAA':
|
||||
taa_data_taa += 1
|
||||
taa_inframe += 1
|
||||
if str(i.seq).upper()[n:n+3].upper() == 'TGA':
|
||||
taa_data_tga += 1
|
||||
tga_inframe += 1
|
||||
taa_codons += 1
|
||||
total_codons += 1
|
||||
taa_seq_count += 1
|
||||
except:
|
||||
pass
|
||||
|
||||
#-------------- Writing Data Out and Print Statement --------------#
|
||||
|
||||
with open(args.input_file.split('.fas')[0]+'_StopCodonStats.tsv','w+') as w:
|
||||
w.write('Stop Codon\tNumber of Seqs Analyzed\tIn-frame TAG\tIn-frame TGA\tIn-frame TAA\tTotal Codons\tIn-frame TAG density\tIn-frame TGA density\tIn-frame TAA density\n')
|
||||
if tga_codons != 0:
|
||||
w.write('TGA\t'+str(tga_seq_count)+'\t'+str(tga_data_tag)+'\t'+str(tga_data_tga)+'\t'+str(tga_data_taa)+'\t'+str(tga_codons)\
|
||||
+'\t'+"%.2f" % ((float(tga_data_tag)*1000)/float(tga_codons))+'\t'+"%.2f" % ((float(tga_data_tga)*1000)/float(tga_codons))+'\t'\
|
||||
+"%.2f" % ((float(tga_data_taa)*1000)/float(tga_codons))+'\n')
|
||||
else:
|
||||
w.write('TGA\t0\t0\t0\t0\t0\t0\t0\t0\n')
|
||||
|
||||
if tag_codons != 0:
|
||||
w.write('TAG\t'+str(tag_seq_count)+'\t'+str(tag_data_tag)+'\t'+str(tag_data_tga)+'\t'+str(tag_data_taa)+'\t'+str(tag_codons)\
|
||||
+'\t'+"%.2f" % ((float(tag_data_tag)*1000)/float(tag_codons))+'\t'+"%.2f" % ((float(tag_data_tga)*1000)/float(tag_codons))+'\t'\
|
||||
+"%.2f" % ((float(tag_data_taa)*1000)/float(tag_codons))+'\n')
|
||||
else:
|
||||
w.write('TAG\t0\t0\t0\t0\t0\t0\t0\t0\n')
|
||||
if taa_codons != 0:
|
||||
w.write('TAA\t'+str(taa_seq_count)+'\t'+str(taa_data_tag)+'\t'+str(taa_data_tga)+'\t'+str(taa_data_taa)+'\t'+str(taa_codons)\
|
||||
+'\t'+"%.2f" % ((float(taa_data_tag)*1000)/float(taa_codons))+'\t'+"%.2f" % ((float(taa_data_tga)*1000)/float(taa_codons))+'\t'\
|
||||
+"%.2f" % ((float(taa_data_taa)*1000)/float(taa_codons))+'\n')
|
||||
else:
|
||||
w.write('TAA\t0\t0\t0\t0\t0\t0\t0\t0\n')
|
||||
|
||||
w.write('\n \n')
|
||||
w.write('Summary\t'+str(tga_seq_count+tag_seq_count+taa_seq_count)+'\t'+str(tag_inframe)+'\t'+str(tga_inframe)+'\t'+str(taa_inframe)\
|
||||
+'\t'+str(total_codons)+'\t'+"%.2f" % ((float(tag_inframe)*1000)/float(total_codons))+'\t'+"%.2f" % ((float(tga_inframe)*1000)/float(total_codons))\
|
||||
+'\t'+"%.2f" % ((float(taa_inframe)*1000)/float(total_codons))+'\n')
|
||||
w.write('\nTotal Seqs in Fasta\t'+str(total_seq_counts))
|
||||
|
||||
# print color.BOLD + color.BLUE + '\nSummary\t'+str(tag_inframe)+'\t'+str(tga_inframe)+'\t'+str(taa_inframe)+'\t'+str(total_codons)+'\t'+"%.2f" % ((float(tag_inframe)*1000)/float(total_codons))+'\t'\
|
||||
# +"%.2f" % ((float(tga_inframe)*1000)/float(total_codons))+'\t'+"%.2f" % ((float(taa_inframe)*1000)/float(total_codons))+'\n\n'\
|
||||
# + str(tag_seq_count) + '\t' + str(tga_seq_count) + '\t' + str(taa_seq_count) + color.END
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###--------------------- Cleans up the Folder and Moves Final Files -------------------###
|
||||
##########################################################################################
|
||||
|
||||
def clean_up(args):
|
||||
if os.path.isdir('/'.join(args.input_file.split('/')[:-1]) + '/StopCodonFreq') != True:
|
||||
os.system('mkdir ' + '/'.join(args.input_file.split('/')[:-1]) + '/StopCodonFreq/')
|
||||
else:
|
||||
pass
|
||||
|
||||
os.system('mkdir ' + '/'.join(args.input_file.split('/')[:-1]) + '/StopCodonFreq/StopCodonFastas/')
|
||||
os.system('mkdir ' + '/'.join(args.input_file.split('/')[:-1]) + '/StopCodonFreq/SpreadSheets/')
|
||||
os.system('mv ' + args.input_file.split('.fas')[0]+'_t*_ORF.*fasta ' + '/'.join(args.input_file.split('/')[:-1]) + '/StopCodonFreq/StopCodonFastas/')
|
||||
os.system('mv ' + args.input_file.split('.fas')[0]+'_t*Prots.tsv ' + '/'.join(args.input_file.split('/')[:-1]) + '/StopCodonFreq/SpreadSheets/')
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------------------------------- Next Script Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def next_script(args):
|
||||
|
||||
home_folder = '/'.join(args.input_file.split('/')[:-1])
|
||||
|
||||
print (color.BOLD+'\nLook for '+color.DARKCYAN+args.input_file.split('/')[-1]\
|
||||
.replace('.fasta','_StopCodonStats.tsv')+color.END+color.BOLD+' in the '+home_folder\
|
||||
+' Folder\n\n' + color.END)
|
||||
|
||||
print (color.BOLD+'Next Script is: '+color.GREEN+'5_GCodeTranslate.py\n\n'+ color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
|
||||
##########################################################################################
|
||||
|
||||
def main():
|
||||
|
||||
diamond_path = check_diamond_path()
|
||||
|
||||
args = check_args()
|
||||
|
||||
prep_translations(args)
|
||||
|
||||
diamond_ProtDB(args, diamond_path)
|
||||
|
||||
hunt_for_stops(args)
|
||||
|
||||
clean_up(args)
|
||||
|
||||
next_script(args)
|
||||
|
||||
main()
|
||||
|
||||
|
||||
#----------------------------------------- NOTES -----------------------------------------#
|
||||
#
|
||||
# This script is designed to HELP you make an informed decision about the genetic code being
|
||||
# used by your particular organism. Be aware that it will be limited by the quality of the
|
||||
# data given to it!
|
||||
#
|
||||
# You will need:
|
||||
#
|
||||
# Diamond, BioPython, AND the output from '3_CountOGSDiamond.py'
|
||||
#
|
||||
# If you are not using the Author's database, update your database name(s) in lines: 345-360
|
||||
#
|
||||
# katzlab$ python 4_InFrameStopFreq.py --input_file YourFastaFile.fasta --databases <path_to_Databases>
|
||||
#
|
||||
#
|
||||
#------------------------------- Interpretation of Results -------------------------------#
|
||||
#
|
||||
# FORMATTED BELOW WITH TEXTWRANGLER...
|
||||
#
|
||||
# Example output using CILIATE (TGA) genetic Code (NOTE THE In-Frame Densities):
|
||||
#
|
||||
# Stop Codon Number_of_Seqs_Analyzed In-frame TAG In-frame TGA In-frame TAA Total Codons In-frame TAG density In-frame TGA density In-frame TAA density
|
||||
# TGA 341 14 0 22 113156 1.2 0 0.92
|
||||
# TAG 424 0 0 34 140085 0 0 0.78
|
||||
# TAA 205 14 0 0 16714 0.84 0 0
|
||||
# Summary 970 28 0 56 269955 2.04 0 1.7
|
||||
#
|
||||
# VALUES in summary line (OR SUM of Density) that are > 1.5 likely indicate that the STOP
|
||||
# codon has been reassigned... in the case above, TAG and TAA look like they have been
|
||||
# reassigned.
|
||||
#
|
||||
#
|
||||
# Example output using UNIVERSAL genetic Code (NOTE THE In-Frame Densities):
|
||||
#
|
||||
# Stop Codon Number_of_Seqs_Analyzed In-frame TAG In-frame TGA In-frame TAA Total Codons In-frame TAG density In-frame TGA density In-frame TAA density
|
||||
# TGA 341 1 0 2 113156 0.2 0 0.05
|
||||
# TAG 424 0 2 4 140085 0 0 0.08
|
||||
# TAA 205 1 0 0 16714 0.04 0 0
|
||||
# Summary 970 2 2 6 269955 0.15 0 0.06
|
||||
#
|
||||
# VALUES in summary line (OR SUM of Density) that are < 0.5 likely indicate that the STOP
|
||||
# codon still acts as STOP... in the case above, TAG, TGA and TAA look like they still behave
|
||||
# as a stop codon.
|
||||
#
|
||||
# THIS IS A ROUGH GUIDE FOR INTERPRETING THE RESULTS!!!! BE VERY VERY WARY! NUMBER OF TOTAL
|
||||
# SEQUENCES AND TOTAL CODONS OBSERVED ARE IMPORTANT (TOO FEW AND ANY INTERPRETATION IS DEVOID
|
||||
# OF ANY MEANING).
|
||||
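#
# For reference, each "density" column above is simply (in-frame count * 1000) / (total codons
# examined), i.e. in-frame stops per 1000 codons. A minimal illustration (numbers made up, not
# from a real run):
#
#   in_frame_tag, codons_examined = 14, 10000
#   tag_density = (float(in_frame_tag) * 1000) / float(codons_examined)   # = 1.40 per 1000 codons
#
# By the rough thresholds above, summed densities well above ~1.5 point toward reassignment of
# that codon, while values near zero suggest it still terminates translation.
#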
@ -1,770 +0,0 @@
|
||||
#!/usr/bin/env python3.5
|
||||
|
||||
##__Updated__: 20_09_2017
|
||||
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
|
||||
##__Usage__: python 5_GCodeTranslate.py --help
|
||||
|
||||
|
||||
##########################################################################################
|
||||
## This script is intended to aid in identifying the genetic code of the data given ##
|
||||
## ##
|
||||
## Prior to running this script, ensure the following: ##
|
||||
## ##
|
||||
## 1. You have assembled your transcriptome and COPIED the 'assembly' file ##
|
||||
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ##
|
||||
## 2. Removed small sequences (usually sequences < 300bp) with 1_ContigFiltStats.py ##
|
||||
## 3. Removed SSU/LSU sequences from your Fasta File ##
|
||||
## 4. Classified your sequences as Strongly Prokaryotic/Eukaryotic or Undetermined ##
|
||||
## 5. Classified the Non-Strongly Prokaryotic sequences into OGs ##
|
||||
## 6. You either know (or have inferred) the genetic code of the organism ##
|
||||
## ##
|
||||
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
|
||||
## ##
|
||||
## Next Script(s) to Run: ##
|
||||
## 6_FilterPatials.py (in FinalizeTranscripts Folder) ##
|
||||
## ##
|
||||
##########################################################################################
|
||||
|
||||
import argparse, os, re, sys
|
||||
from argparse import RawTextHelpFormatter,SUPPRESS
|
||||
|
||||
from Bio import SeqIO
|
||||
from Bio.Seq import Seq
|
||||
from Bio.Data.CodonTable import CodonTable
|
||||
|
||||
|
||||
#-------------------------- Set-up Codon Tables (Genetic Codes) --------------------------#
|
||||
|
||||
blepharisma_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = ['TAA','TAG'])
|
||||
|
||||
condylostoma_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Q', 'TAG': 'Q',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = [''])
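# NOTE: an empty stop_codons list (here and in no_stop_table below) is used for codes in which
# all three canonical stops are reassigned, so translation is never terminated by a stop codon.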
|
||||
|
||||
c_uncinata_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y', 'TAG': 'Q',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGA': 'Q', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = ['TAA'])
|
||||
|
||||
euplotes_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGA': 'C', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = ['TAA','TAG'])
|
||||
|
||||
myrionecta_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Y', 'TAG': 'Y',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = ['TGA'])
|
||||
|
||||
no_stop_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'X', 'TAG': 'X',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGA': 'X', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = [''])
|
||||
|
||||
peritrich_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'E', 'TAG': 'E',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = ['TGA'])
|
||||
|
||||
tag_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Q',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGA': 'Q', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = ['TAG'])
|
||||
|
||||
|
||||
#------------------------------ Colors For Print Statements ------------------------------#
|
||||
class color:
|
||||
PURPLE = '\033[95m'
|
||||
CYAN = '\033[96m'
|
||||
DARKCYAN = '\033[36m'
|
||||
ORANGE = '\033[38;5;214m'
|
||||
	BLUE = '\033[94m'
|
||||
GREEN = '\033[92m'
|
||||
YELLOW = '\033[93m'
|
||||
RED = '\033[91m'
|
||||
BOLD = '\033[1m'
|
||||
UNDERLINE = '\033[4m'
|
||||
END = '\033[0m'
|
||||
|
||||
|
||||
#------------------------------- Main Functions of Script --------------------------------#
|
||||
|
||||
###########################################################################################
|
||||
###------------------------- Checks the Command Line Arguments -------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def check_args():
|
||||
|
||||
parser = argparse.ArgumentParser(description=
|
||||
color.BOLD + '\n\nThis script will '+color.RED+'Translate '+color.END+color.BOLD+'a '\
|
||||
'given Fasta file of CDS\nsequences using a given'+color.PURPLE+' Genetic Code.'+color.END+\
|
||||
color.BOLD+usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter)
|
||||
|
||||
|
||||
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
|
||||
|
||||
required_arg_group.add_argument('--input_file','-in', action='store',
|
||||
help=color.BOLD+color.GREEN+' Fasta file with CDSs\n'+color.END)
|
||||
|
||||
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
|
||||
|
||||
optional_arg_group.add_argument('--genetic_code','-g', action='store', default='universal',
|
||||
help=color.BOLD+color.GREEN+' Genetic code to use for translation\n (default = '\
|
||||
'"universal")\n'+color.END)
|
||||
|
||||
optional_arg_group.add_argument('--no_RP','-no_RP', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Allows files to "skip" the removal\n of Partial Transcripts\n'\
|
||||
+color.END)
|
||||
|
||||
optional_arg_group.add_argument('--list_codes','-codes', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Lists supported genetic codes\n'+color.END)
|
||||
|
||||
optional_arg_group.add_argument('-author', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
|
||||
|
||||
|
||||
if len(sys.argv[1:]) == 0:
|
||||
print (parser.description)
|
||||
print ('\n')
|
||||
sys.exit()
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
quit_eval = return_more_info(args)
|
||||
if quit_eval > 0:
|
||||
sys.exit()
|
||||
|
||||
### Adding in names to 'arg' class for more easy use throughout the script
|
||||
args.ntd_out = args.input_file.split('.fas')[0]+'_'+args.genetic_code.title()+'_NTD.ORF.fasta'
|
||||
args.aa_out = args.input_file.split('.fas')[0]+'_'+args.genetic_code.title()+'_AA.ORF.fasta'
|
||||
args.tsv_out = args.input_file.split('.fas')[0]+'_'+args.genetic_code.title()+'_allOGCleanresults.tsv'
|
||||
|
||||
args.home_folder = '/'.join(args.input_file.split('/')[:-1])
|
||||
args.Diamond_Folder = args.home_folder+'/DiamondOG'
|
||||
args.StopFreq = args.home_folder+'/StopCodonFreq'
|
||||
args.all_output_folder = '/'.join(args.input_file.split('/')[:-2]) + '/'
|
||||
|
||||
args.tsv_file = args.input_file.split('.fas')[0]+ '_allOGCleanresults.tsv'
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
return args
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###------------------------------- Script Usage Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def usage_msg():
|
||||
	return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 5_GCodeTranslate.py'\
|
||||
' --input_file ../Stentor_coeruleus.WGS.CDS.Prep/Stentor_coeruleus.WGS.CDS.Renamed.fasta'\
|
||||
' --genetic_code Universal'+color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
|
||||
##########################################################################################
|
||||
|
||||
def return_more_info(args):
|
||||
|
||||
valid_arg = 0
|
||||
|
||||
supported_gcodes_names = ['bleph','blepharisma','chilo','chilodonella','condy',\
|
||||
'condylostoma','none','eup','euplotes','peritrich','vorticella','ciliate','universal',\
|
||||
'taa','tag','tga','mesodinium']
|
||||
|
||||
supported_gcodes_list = ['Blepharisma\t(TGA = W)','Chilodonella\t(TAG/TGA = Q)','Ciliate\t\t(TAR = Q)',\
|
||||
'Condylostoma\t(TAR = Q, TGA = W)','Euplotes\t(TGA = C)','Peritrich\t(TAR = E)','None\t\t(TGA/TAG/TAA = X)',\
|
||||
'Universal\t(TGA/TAG/TAA = STOP)','TAA\t\t(TAG/TGA = Q)', 'TAG\t\t(TRA = Q)', 'TGA\t\t(TAR = Q)']
|
||||
|
||||
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
|
||||
' maurerax@gmail.com\n\n'+color.END)
|
||||
|
||||
|
||||
if args.genetic_code != None and args.genetic_code.lower() not in supported_gcodes_names:
|
||||
print (color.BOLD+color.RED+'\nProvided genetic code is currently unsupported.\n\n'\
|
||||
'If you have a new genetic code, please contact the author (with some evidence).\n\n'\
|
||||
'Otherwise, use one of the currently supported genetic codes.\n'+color.END)
|
||||
print (color.BOLD+color.ORANGE+'\n'.join(supported_gcodes_list)+'\n\n'+color.END)
|
||||
print (author)
|
||||
valid_arg += 1
|
||||
else:
|
||||
if args.list_codes == True:
|
||||
print (color.BOLD+color.RED+'\nThese are the currently supported genetic codes.\n'+color.END)
|
||||
print (color.BOLD+color.ORANGE+'\n'.join(supported_gcodes_list)+'\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
if args.author == True:
|
||||
print (author)
|
||||
valid_arg += 1
|
||||
|
||||
if args.input_file != None:
|
||||
if os.path.isfile(args.input_file) != False:
|
||||
if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
elif args.input_file.endswith('WTA_EPU.Renamed.fasta') != True:
|
||||
print (color.BOLD+'\n\nInvalid Fasta File! Only Fasta Files that were processed'\
|
||||
' with '+color.GREEN+'3_CountOGsDiamond.py '+color.END+color.BOLD+'are valid\n\n'\
|
||||
'However, to bypass that issue, Fasta Files MUST end with '+color.CYAN+\
|
||||
'"WTA_EPU.Renamed.fasta"\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
else:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
return valid_arg
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------------- Does the Initial Folder Prep ----------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def prep_folders(args):
|
||||
|
||||
OG_folder = '/'.join(args.input_file.split('/')[:-1]) + '/DiamondOG/'
|
||||
|
||||
if os.path.isdir(OG_folder) != True:
|
||||
os.system('mkdir '+OG_folder)
|
||||
|
||||
if os.path.isdir(args.all_output_folder + 'TranslatedTranscriptomes') != True:
|
||||
os.system('mkdir ' + args.all_output_folder + 'TranslatedTranscriptomes')
|
||||
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###---------------- Scans 5-Prime End of Transcript for In-Frame "ATG" ----------------###
|
||||
##########################################################################################
|
||||
|
||||
def check_new_start_new(some_seq, low_lim, upper_lim, old_start, codon_table):
|
||||
|
||||
## Looks for in-frame STOP codons in the UTR of the transcript
|
||||
prime5 = str(Seq(some_seq[low_lim:upper_lim]).translate(table=codon_table)).replace('*','x')
|
||||
in_frame_stops = [stops.start() for stops in re.finditer('x',prime5)]
|
||||
|
||||
## Looks for in-frame START codons in the UTR of the transcript
|
||||
in_frame_starts = [starts.start() for starts in re.finditer('M',prime5)]
|
||||
|
||||
## Checks that there are NO in-frame STOP codons between the possible "new" START codon
|
||||
## and the aligned portion of the transcript -- THIS is double checked!
|
||||
if len(in_frame_starts) != 0:
|
||||
if len(in_frame_stops) != 0:
|
||||
if in_frame_stops[-1] < in_frame_starts[-1]:
|
||||
new_start = low_lim+in_frame_starts[-1]*3
|
||||
else:
|
||||
new_start = old_start
|
||||
else:
|
||||
new_start = low_lim+in_frame_starts[-1]*3
|
||||
else:
|
||||
new_start = old_start
|
||||
|
||||
## Skips the double-checking if there are no GOOD potential START codons
|
||||
if new_start == old_start:
|
||||
updated_start = old_start
|
||||
|
||||
else:
|
||||
## Double checks that there are NO IN-FRAME stop codons between the NEW-SUGGESTED Start
|
||||
## position and the OLD-SUPPORTED stop position!
|
||||
between_new_old_start = str(Seq(some_seq[new_start:old_start]).translate(table=1)).replace('*','x')
|
||||
in_frame_stops_check = [stops.start() for stops in re.finditer('x',between_new_old_start)]
|
||||
in_frame_starts_check = [starts.start() for starts in re.finditer('M',between_new_old_start)]
|
||||
if len(in_frame_starts_check) != 0:
|
||||
if len(in_frame_stops_check) != 0:
|
||||
if in_frame_stops_check[-1] < in_frame_starts_check[-1]:
|
||||
updated_start = new_start+in_frame_starts_check[-1]*3
|
||||
else:
|
||||
updated_start = old_start
|
||||
else:
|
||||
updated_start = new_start
|
||||
else:
|
||||
updated_start = new_start
|
||||
|
||||
return updated_start
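
# Worked example, added for clarity (hypothetical numbers): if the BLAST-supported start
# (old_start) sits at nucleotide 300 and the protein alignment began at residue 10, the
# caller scans roughly 15-60 nt upstream of old_start. If that window translates to
# '...xM...' (a stop followed by a Met), the new ATG is accepted; '...Mx...' (a stop
# after the last Met) falls back to old_start, since an in-frame stop between the
# candidate ATG and the aligned region rules it out.
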
##########################################################################################
|
||||
###--------------- Extracts the ORF from the Fasta File and SpreadSheet ---------------###
|
||||
##########################################################################################
|
||||
|
||||
def extract_ORF(prot_dict, codon_table, args):
|
||||
|
||||
print (color.BOLD+'\n\nExtracting '+color.PURPLE+'ORFs'+color.END+color.BOLD+' from'\
|
||||
' the transcriptomic data-set\n\n'+color.END)
|
||||
|
||||
for k, v in prot_dict.items():
|
||||
|
||||
## Attempting to find the most-likely START (ATG) position in the transcript (tricky)
|
||||
## Skips this if the initial Methionine (ATG) is likely present
|
||||
## (e.g. the alignment position of the protein = '1')
|
||||
prot_start = int(v[3].split('..')[0])
|
||||
old_start = v[1]
|
||||
if prot_start != 1:
|
||||
min_dist, max_dist = round_down_three(prot_start)
|
||||
min_start = old_start-min_dist
|
||||
max_start = old_start-max_dist
|
||||
if min_start < 0:
|
||||
min_start = old_start
|
||||
if max_start < 0:
|
||||
max_start = min_start%3
|
||||
# print k+'\tOld_start\t'+str(old_start)+'\tMin_Dist/Start\t'+str(min_dist)+'/'+str(min_start)+'\tMax_Dist/Start\t'+str(max_dist)+'/'+str(max_start)+'\n'
|
||||
updated_start = check_new_start_new(v[-1], max_start, min_start, old_start, codon_table)
|
||||
else:
|
||||
updated_start = old_start
|
||||
temp = prot_dict[k][-1][updated_start:]
|
||||
|
||||
## Uses the given genetic code to identify the stop position of the ORF
|
||||
temp_prot = str(Seq(temp).translate(table=codon_table))
|
||||
if '*' in temp_prot:
|
||||
stop_pos = (temp_prot.index('*')+1)*3
|
||||
prot_dict[k].append(temp[:stop_pos])
|
||||
else:
|
||||
stop_pos = prot_dict[k][2] - prot_dict[k][1]
|
||||
prot_dict[k].append(temp[:stop_pos])
|
||||
|
||||
## Awkward_list is populated with unexpectedly SHORT ORFs!
|
||||
## Reasons for being short include:
|
||||
# An error Xyrus introduced
|
||||
# Not as great genetic code decision (in-frame stop)
|
||||
# Crummy sequence/assembly quality (false in-frame stop codons)
|
||||
|
||||
awkward_list = []
|
||||
look_good = []
|
||||
|
||||
for k, v in prot_dict.items():
|
||||
expected_min = len(v[-2][v[1]:v[2]])-1
|
||||
if len(v[-1]) < expected_min:
|
||||
awkward_list.append(k)
|
||||
else:
|
||||
look_good.append(k)
|
||||
|
||||
if len(awkward_list) != 0:
|
||||
		with open('UnexpectedShortStuffBlameXyrus.txt','w+') as x:
|
||||
for entry in awkward_list:
|
||||
x.write(entry+'\n')
|
||||
else:
|
||||
pass
|
||||
|
||||
	print (color.BOLD+'\n\nTranslating '+color.PURPLE+'ORFs'+color.END+color.BOLD\
	+' using the '+color.DARKCYAN+args.genetic_code.title()+' genetic code'+color.END)
|
||||
|
||||
for k, v in prot_dict.items():
|
||||
prot_dict[k].append(str(Seq(v[-1]).translate(table=codon_table)).rstrip('*'))
|
||||
|
||||
return prot_dict
|
||||
|
||||
##########################################################################################
|
||||
###------------ Grabs the Coding Coordinates from the OG-BLAST SpreadSheet ------------###
|
||||
##########################################################################################
|
||||
|
||||
def prep_translations(args):
|
||||
|
||||
print (color.BOLD+'\n\nGrabbing useful info from the '+color.ORANGE+args.input_file\
|
||||
.split('/')[-1]+color.END+color.BOLD+' Fasta File\nand from the '+color.ORANGE+args.tsv_file\
|
||||
.split('/')[-1]+color.END+color.BOLD+' OG-Assignment Spreadsheet'+color.END)
|
||||
|
||||
inTSV = ['\t'.join(i.rstrip('\n').split('\t')[:-1]) for i in open(args.tsv_file).readlines() if i != '\n']
|
||||
inFasta = [i for i in SeqIO.parse(args.input_file,'fasta')]
|
||||
|
||||
# ORF identification step here, uses the 'allOGCleanresults.tsv file to identify the ORF
|
||||
prot_dict = {}
|
||||
|
||||
# Special scenario! Only for when the genetic code is not particularly useful ...
|
||||
if args.genetic_code.lower() == 'none' or args.genetic_code.lower() == 'condylostoma' or args.genetic_code.lower() == 'condy':
|
||||
for i in inTSV:
|
||||
prot_dict.setdefault(i.split('\t')[0],[])
|
||||
if int(i.split('\t')[6]) < int(i.split('\t')[7]):
|
||||
## Saves the Transcript Orientation (Coding vs. Template Strand)
|
||||
prot_dict[i.split('\t')[0]].append('F')
|
||||
## Collects initial Start and Stop positions from the BLAST alignment
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[6])-1)
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[7])+3)
|
||||
## Implied Amino Acid alignment positions (e.g. does the alignment start at the 1st Methionine?)
|
||||
prot_dict[i.split('\t')[0]].append('..'.join(i.split('\t')[-4:-2]))
|
||||
|
||||
if int(i.split('\t')[7]) < int(i.split('\t')[6]):
|
||||
## Saves the Transcript Orientation (Coding vs. Template Strand)
|
||||
prot_dict[i.split('\t')[0]].append('RC')
|
||||
## Collects initial Start and Stop positions from the BLAST alignment
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('_Len')[1].split('_')[0])-int(i.split('\t')[6]))
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('_Len')[1].split('_')[0])-int(i.split('\t')[7])+1)
|
||||
## Implied Amino Acid alignment positions (e.g. does the alignment start at the 1st Methionine?)
|
||||
prot_dict[i.split('\t')[0]].append('..'.join(i.split('\t')[-4:-2]))
|
||||
|
||||
## Makes sure that the dictionary has the transcript in the correct orientation
|
||||
for i in inFasta:
|
||||
if i.description in prot_dict.keys():
|
||||
if 'RC' == prot_dict[i.description][0]:
|
||||
prot_dict[i.description].append(str(i.seq.reverse_complement()))
|
||||
else:
|
||||
prot_dict[i.description].append(str(i.seq))
|
||||
|
||||
else:
|
||||
for i in inTSV:
|
||||
prot_dict.setdefault(i.split('\t')[0],[])
|
||||
if int(i.split('\t')[6]) < int(i.split('\t')[7]):
|
||||
## Saves the Transcript Orientation (Coding vs. Template Strand)
|
||||
prot_dict[i.split('\t')[0]].append('F')
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[6])-1)
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[7])+3)
|
||||
## Implied Amino Acid alignment positions (e.g. does the alignment start at the 1st Methionine?)
|
||||
prot_dict[i.split('\t')[0]].append('..'.join(i.split('\t')[-4:-2]))
|
||||
if int(i.split('\t')[7]) < int(i.split('\t')[6]):
|
||||
## Saves the Transcript Orientation (Coding vs. Template Strand)
|
||||
prot_dict[i.split('\t')[0]].append('RC')
|
||||
## Collects initial Start and Stop positions from the BLAST alignment (but in the "correct" orientation)
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('_Len')[1].split('_')[0])-int(i.split('\t')[6]))
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('_Len')[1].split('_')[0])-int(i.split('\t')[7])+1)
|
||||
## Implied Amino Acid alignment positions (e.g. does the alignment start at the 1st Methionine?)
|
||||
prot_dict[i.split('\t')[0]].append('..'.join(i.split('\t')[-4:-2]))
|
||||
|
||||
## Makes sure that the dictionary has the transcript in the correct orientation
|
||||
for i in inFasta:
|
||||
if i.description in prot_dict.keys():
|
||||
if 'RC' == prot_dict[i.description][0]:
|
||||
prot_dict[i.description].append(str(i.seq.reverse_complement()))
|
||||
else:
|
||||
prot_dict[i.description].append(str(i.seq))
|
||||
|
||||
return prot_dict
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###------------------------ Rounds Down Values to Nearest "3" -------------------------###
|
||||
##########################################################################################
|
||||
|
||||
def round_down_three(num):
	min_val = int(num*3*.5)-int(num*3*.5)%3
	max_val = int(num*6)-int(num*6)%3
	return min_val, max_val
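
# Worked example, added for clarity: round_down_three(10) returns (15, 60), i.e. for a
# protein hit starting at residue 10 the upstream scan spans 15-60 nt before the
# BLAST-supported start, with both bounds rounded down to multiples of 3 to stay in frame.
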
##########################################################################################
|
||||
###--------------------- Makes Translation Steps (Later) Easier -----------------------###
|
||||
##########################################################################################
|
||||
|
||||
def standardize_gcode(given_code):
|
||||
if given_code == 'ciliate' or given_code == 'tga':
|
||||
codon_table = 6
|
||||
elif given_code == 'chilodonella' or given_code == 'chilo' or given_code == 'taa':
|
||||
codon_table = c_uncinata_table
|
||||
elif given_code == 'blepharisma' or given_code == 'bleph':
|
||||
codon_table = blepharisma_table
|
||||
elif given_code == 'euplotes' or given_code == 'eup':
|
||||
codon_table = euplotes_table
|
||||
elif given_code == 'myrionecta' or given_code == 'mesodinium':
|
||||
codon_table = myrionecta_table
|
||||
elif given_code == 'peritrich' or given_code == 'vorticella':
|
||||
codon_table = peritrich_table
|
||||
elif given_code == 'none':
|
||||
codon_table = no_stop_table
|
||||
elif given_code == 'condylostoma' or given_code == 'condy':
|
||||
codon_table = condylostoma_table
|
||||
elif given_code == 'tag':
|
||||
codon_table = tag_table
|
||||
elif given_code == 'universal':
|
||||
codon_table = 1
|
||||
else:
|
||||
print (color.BOLD+color.RED+'\n\nNo valid genetic code provided!\n\n'+color.END+\
|
||||
color.BOLD+'Using the "Universal" genetic code (by default)\n\nPlease check that the'\
|
||||
' code you wish to use is supported:'+color.CYAN+'\n\npython 5_GCodeTranslate.py'\
|
||||
' -list_codes\n\n'+color.END)
|
||||
codon_table = 1
|
||||
|
||||
return codon_table
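
# Quick reference, added for clarity (values follow the branches above):
#   standardize_gcode('universal') -> 1 (NCBI translation table 1)
#   standardize_gcode('ciliate')   -> 6 (NCBI translation table 6, TAR = Q)
#   standardize_gcode('tag')       -> tag_table (custom table defined earlier in this script)
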
###########################################################################################
|
||||
###------------------ Updates Spreadsheet with Updated Contig Names --------------------###
|
||||
###########################################################################################
|
||||
|
||||
def update_spreadsheet(args, updated_spreadsheet_dict):
|
||||
if os.path.isdir(args.home_folder + '/DiamondOG/') != True:
|
||||
		os.system('mkdir ' + args.home_folder + '/DiamondOG/')
|
||||
else:
|
||||
pass
|
||||
|
||||
inTSV = [line.rstrip('\n') for line in open(args.tsv_file).readlines() if line != '\n' and line.split('\t')[0] in updated_spreadsheet_dict.keys()]
|
||||
|
||||
updatedTSV = [updated_spreadsheet_dict[line.split('\t')[0]]+'\t'+'\t'.join(line.split('\t')[1:]) for line in inTSV]
|
||||
|
||||
with open(args.tsv_out,'w+') as w:
|
||||
w.write('\n'.join(updatedTSV))
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------------------- Updates Log With OG Assignment Information ---------------------###
|
||||
###########################################################################################
|
||||
|
||||
def update_log(filename, gcode):
|
||||
|
||||
if os.path.isdir('../PostAssembly_Logs/') != True:
|
||||
os.system('mkdir ../PostAssembly_Logs/')
|
||||
else:
|
||||
pass
|
||||
|
||||
	ntd_ORF = [i for i in SeqIO.parse(filename.split('.fas')[0]+'_'+gcode.title()+'_NTD.ORF.fasta','fasta')]
	aa_ORF = [i for i in SeqIO.parse(filename.split('.fas')[0]+'_'+gcode.title()+'_AA.ORF.fasta','fasta')]
|
||||
|
||||
min_ntd_ORF = str(min([len(i.seq) for i in ntd_ORF]))
|
||||
max_ntd_ORF = str(max([len(i.seq) for i in ntd_ORF]))
|
||||
avg_ntd_ORF = '%.2f' % (sum([len(i.seq) for i in ntd_ORF])/float(len(ntd_ORF)))
|
||||
|
||||
min_aa_ORF = str(min([len(i.seq) for i in aa_ORF]))
|
||||
max_aa_ORF = str(max([len(i.seq) for i in aa_ORF]))
|
||||
avg_aa_ORF = '%.2f' % (sum([len(i.seq) for i in aa_ORF])/float(len(aa_ORF)))
|
||||
|
||||
	for Logname in os.listdir('../PostAssembly_Logs/'):
|
||||
if Logname.startswith(filename.split('/')[2].split('_WTA')[0]) and Logname.endswith('Log.txt'):
|
||||
with open('../PostAssembly_Logs/'+Logname,'a') as LogFile:
|
||||
LogFile.write('Nucleotide ORFs\t'+str(len(ntd_ORF))+'\tn/a\tn/a\n')
|
||||
LogFile.write('Nucleotide ORF Lengths\t'+avg_ntd_ORF+'\t'+min_ntd_ORF+'\t'+max_ntd_ORF+'\n')
|
||||
LogFile.write('Protein ORFs\t'+str(len(aa_ORF))+'\tn/a\tn/a\n')
|
||||
LogFile.write('Protein ORF Lengths\t'+avg_aa_ORF+'\t'+min_aa_ORF+'\t'+max_aa_ORF+'\n')
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###----------------------- Write File with Provided Genetic Code ----------------------###
|
||||
##########################################################################################
|
||||
|
||||
def write_data_out(prot_dict, codon_table, args):
|
||||
|
||||
update_spreadsheet_dict = {}
|
||||
|
||||
#The code below only works if rnaspades was used; constrained by addition of script 6b
|
||||
for k, v in prot_dict.items():
|
||||
#if 'Cov' in k:
|
||||
new_name = k.split('_Len')[0]+'_Len'+str(len(v[-2]))+'_'+'_'.join(k.split('_')[-3:])
|
||||
#update_spreadsheet_dict[k] = new_name
|
||||
update_spreadsheet_dict[k] = k
|
||||
#else:
|
||||
#new_name = k.split('_Len')[0]+'_Len'+str(len(v[-2]))+'_'+'_'.join(k.split('_')[-2:])
|
||||
#update_spreadsheet_dict[k] = new_name
|
||||
#update_spreadsheet_dict[k] = k
|
||||
|
||||
with open(args.ntd_out,'w+') as w:
|
||||
print (color.BOLD+'\n\nWriting FASTA file with '+color.PURPLE+'ORF'+color.END+color.BOLD\
|
||||
+' sequences using the '+color.DARKCYAN+args.genetic_code.title()+' genetic code'+color.END)
|
||||
|
||||
for k, v in prot_dict.items():
|
||||
w.write('>'+update_spreadsheet_dict[k]+'\n'+str(v[-2])+'\n')
|
||||
|
||||
with open(args.aa_out, 'w+') as w:
|
||||
print (color.BOLD+'\n\nWriting FASTA file with '+color.PURPLE+'Translated ORF'+color.END+color.BOLD\
|
||||
+' sequences using the '+color.DARKCYAN+args.genetic_code.title()+' genetic code'+color.END)
|
||||
|
||||
for k, v in prot_dict.items():
|
||||
w.write('>'+update_spreadsheet_dict[k]+'\n'+str(v[-1])+'\n')
|
||||
|
||||
return update_spreadsheet_dict
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###--------------------- Cleans up the Folder and Moves Final Files -------------------###
|
||||
##########################################################################################
|
||||
|
||||
def clean_up(args):
|
||||
|
||||
if args.input_file.split('.fas')[0].split('/')[-1] + '_StopCodonStats.tsv' in os.listdir(args.home_folder):
|
||||
os.system('mv ' + args.input_file.split('.fas')[0] + '_StopCodonStats.tsv ' + args.StopFreq)
|
||||
|
||||
os.system('mv '+args.tsv_file+' '+args.Diamond_Folder)
|
||||
os.system('mv '+args.input_file+' '+args.Diamond_Folder)
|
||||
|
||||
if args.no_RP == True:
|
||||
if os.path.isdir(args.all_output_folder + 'ToRename/') != True:
|
||||
os.system('mkdir ' + args.all_output_folder + 'ToRename/')
|
||||
|
||||
os.system('cp ' + args.ntd_out + ' ' + args.all_output_folder + 'ToRename/')
|
||||
os.system('cp ' + args.aa_out + ' ' + args.all_output_folder + 'ToRename/')
|
||||
os.system('cp ' + args.tsv_out + ' ' + args.all_output_folder + 'ToRename/')
|
||||
|
||||
else:
|
||||
os.system('cp ' + args.tsv_out + ' ' + args.all_output_folder)
|
||||
os.system('cp ' + args.ntd_out + ' ' + args.all_output_folder)
|
||||
os.system('cp ' + args.aa_out + ' ' + args.all_output_folder)
|
||||
|
||||
os.system('mv ' + args.home_folder + ' ' + args.all_output_folder + 'TranslatedTranscriptomes')
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------------------------------- Next Script Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def next_script(args):
|
||||
|
||||
print (color.BOLD+'\n\nLook for '+color.DARKCYAN+args.ntd_out.split('/')[-1]+color.END+\
|
||||
color.BOLD+',\n'+color.DARKCYAN+args.aa_out.split('/')[-1]+color.END+color.BOLD+', and\n'\
|
||||
+color.DARKCYAN+args.tsv_out.split('/')[-1]+color.END+color.BOLD+',\nwhich are in the '+\
|
||||
color.ORANGE+args.home_folder.split('/')[-1]+' Folder'+color.END)
|
||||
|
||||
if args.no_RP == True:
|
||||
		print(color.BOLD+'\n\nNext Script is: '+color.GREEN+'7_FinalizeName.py'+color.END+color.BOLD+\
|
||||
' in the '+color.PURPLE+'RemovePartials Folder'+color.END+color.BOLD+'\nwith a copy of'\
|
||||
' the outputs of this script!'+color.END)
|
||||
print(color.BOLD+'\n\nRemember that you have chosen '+color.RED+'NOT '+color.END+color.BOLD+\
|
||||
'to remove partials\nand are skipping to the renaming step!\n\n'+color.END)
|
||||
|
||||
else:
|
||||
print(color.BOLD+'\n\nNext Script is: '+color.GREEN+'6_FilterPartials.py'+color.END+color.BOLD+\
|
||||
' in the '+color.PURPLE+'FinalizeTranscripts Folder'+color.END+color.BOLD+'\nwith a copy of'\
|
||||
' the outputs of this script!\n\n'+color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
|
||||
##########################################################################################
|
||||
|
||||
def main():
|
||||
|
||||
args = check_args()
|
||||
|
||||
prep_folders(args)
|
||||
|
||||
codon_table = standardize_gcode(args.genetic_code.lower())
|
||||
|
||||
prot_dict_Prepped = prep_translations(args)
|
||||
|
||||
prot_dict_Final = extract_ORF(prot_dict_Prepped, codon_table, args)
|
||||
|
||||
new_spreadsheet_names = write_data_out(prot_dict_Final, codon_table, args)
|
||||
|
||||
update_spreadsheet(args, new_spreadsheet_names)
|
||||
|
||||
# update_log(fasta_file, gcode)
|
||||
|
||||
clean_up(args)
|
||||
|
||||
next_script(args)
|
||||
|
||||
main()
|
||||
File diff suppressed because one or more lines are too long
@ -1,81 +0,0 @@
|
||||
#!/usr/bin/python
|
||||
from __future__ import print_function
|
||||
|
||||
__author__ = "Jean-David Grattepanche"
|
||||
__version__ = "2, August 28, 2017"
|
||||
__email__ = "jeandavid.grattepanche@gmail.com"
|
||||
|
||||
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import string
|
||||
import os.path
|
||||
from Bio import SeqIO
|
||||
from sys import argv
|
||||
|
||||
|
||||
def Addcoverage(code):
|
||||
seqfolder = code
|
||||
all_output_folder = '/'.join(code.split('/')[:-1])
|
||||
code = code.split('/')[-1]
|
||||
|
||||
covupd = {}
|
||||
for seqcoll in open(seqfolder + '/' + code + '_SeqPairsAbove98.txt','r'):
|
||||
CL = 0
|
||||
for transc in seqcoll.split('\t'):
|
||||
if CL == 0:
|
||||
reftrans = ('_').join(transc.split('_')[1:])
|
||||
coverage = int(transc.split('Cov')[1].split('_')[0])
|
||||
Length = int(transc.split('Len')[1].split('_')[0])
|
||||
CL += coverage * Length
|
||||
covupd[reftrans] = CL
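
	# Worked example, added for clarity (hypothetical values): if the reference transcript
	# is Cov10_Len1000 and it clusters with one Cov4_Len500 copy, CL = 10*1000 + 4*500 = 12000,
	# and the updated coverage written out below becomes 12000 / 1000 = 12.
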
if os.path.isdir(seqfolder + '/Updated_Coverage/') != True:
|
||||
os.system('mkdir ' + seqfolder + '/Updated_Coverage/')
|
||||
if os.path.isdir(seqfolder + '/Updated_Coverage/SpreadSheets/') != True:
|
||||
os.system('mkdir ' + seqfolder + '/Updated_Coverage/SpreadSheets/')
|
||||
|
||||
for spreadsh in os.listdir(seqfolder + '/Processed/SpreadSheets/'):
|
||||
if spreadsh.endswith('.tsv'):
|
||||
outtsvtokeep = open(seqfolder + '/Updated_Coverage/SpreadSheets/' + spreadsh.split('Final')[0] + 'UC.Final' + spreadsh.split('Final')[1],'w+')
|
||||
for row in open(seqfolder + '/Processed/SpreadSheets/'+ spreadsh, 'r'):
|
||||
if row.split('_Trans')[0] in covupd:
|
||||
newcov2= covupd[row.split('_Trans')[0]] / int(row.split('_Len')[1].split('_')[0])
|
||||
outtsvtokeep.write(row.split('Cov')[0]+'Cov'+str(newcov2)+'_OG5' +row.split('OG5')[1].split('_Trans')[0] +'\t' +('\t').join(row.split('\t')[1:]))
|
||||
else:
|
||||
if 'Trans' in row:
|
||||
outtsvtokeep.write(row.split('_Trans')[0]+ '\t' +('\t').join(row.split('\t')[1:]))
|
||||
else:
|
||||
outtsvtokeep.write(row)
|
||||
outtsvtokeep.close()
|
||||
|
||||
for seqfile in os.listdir(seqfolder + '/Processed'):
|
||||
if seqfile.endswith('.fasta'):
|
||||
outseqtokeep = open(seqfolder + '/Updated_Coverage/' + seqfile.split('Final')[0] + 'UC.Final' + seqfile.split('Final')[1],'w+')
|
||||
for Seq in SeqIO.parse(seqfolder + '/Processed/' + seqfile ,'fasta'):
|
||||
if Seq.description.split('_Trans')[0] not in covupd:
|
||||
outseqtokeep.write('>'+Seq.description.split('_Trans')[0]+ '\n'+str(Seq.seq) +'\n')
|
||||
else:
|
||||
newcov= covupd[Seq.description.split('_Trans')[0]] / int(Seq.description.split('_Len')[1].split('_')[0])
|
||||
outseqtokeep.write('>'+Seq.description.split('Cov')[0]+'Cov'+str(newcov)+'_OG5' +Seq.description.split('OG5')[1].split('_Trans')[0]+ '\n'+str(Seq.seq) +'\n')
|
||||
outseqtokeep.close()
|
||||
|
||||
if os.path.isdir(all_output_folder + '/ToRename') != True:
|
||||
os.system('mkdir ' + all_output_folder + '/ToRename')
|
||||
|
||||
os.system('cp ' + seqfolder + '/Updated_Coverage/*fasta ' + all_output_folder + '/ToRename/')
|
||||
os.system('cp ' + seqfolder + '/Updated_Coverage/SpreadSheets/*tsv ' + all_output_folder + '/ToRename/')
|
||||
|
||||
|
||||
def main():
|
||||
script, code = argv
|
||||
Addcoverage(code)
|
||||
main()
@ -1,398 +0,0 @@
|
||||
#!/usr/bin/env python3.5
|
||||
|
||||
##__Updated__: 31_08_2017
|
||||
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
|
||||
##__Usage__: python 6_FilterPartials.py --help
|
||||
|
||||
##################################################################################################
|
||||
## This script is intended to rename the outputs of the FilterPartials script ##
|
||||
## to a given 10-character code that is used in the Katz lab Phylogenomic Tree building methods ##
|
||||
## ##
|
||||
## Prior to running this script, ensure the following:                                          ##
|
||||
## ##
|
||||
## 1. You have assembled your transcriptome and COPIED the 'assembly' file ##
|
||||
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ##
|
||||
## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py ##
|
||||
## 3. Removed SSU/LSU sequences from your Fasta File ##
|
||||
## 4. Classified your sequences as Strongly Prokaryotic/Eukaryotic or Undetermined ##
|
||||
## 5. Classified the Non-Strongly Prokaryotic sequences into OGs ##
|
||||
## 6. You either know (or have inferred) the genetic code of the organism ##
|
||||
## 7. You have translated the sequences and checked for the data in the RemovePartials folder ##
|
||||
## 8. Partial sequences have been removed from the transcriptomic data sets ##
|
||||
## ##
|
||||
## COMMAND Example Below ##
|
||||
## Extra Notes at Bottom of Script ##
|
||||
## ##
|
||||
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
|
||||
## ##
|
||||
## Next Script(s) to Run: ##
|
||||
## NONE! You're FINISHED! :D ##
|
||||
## ##
|
||||
##################################################################################################
|
||||
|
||||
import argparse, os, sys
|
||||
from argparse import RawTextHelpFormatter,SUPPRESS
|
||||
|
||||
#----------------------- Solely to Make Print Statements Colorful -----------------------#
|
||||
|
||||
class color:
|
||||
PURPLE = '\033[95m'
|
||||
CYAN = '\033[96m'
|
||||
DARKCYAN = '\033[36m'
|
||||
ORANGE = '\033[38;5;214m'
|
||||
BLUE = '\033[94m'
|
||||
GREEN = '\033[92m'
|
||||
YELLOW = '\033[93m'
|
||||
RED = '\033[91m'
|
||||
BOLD = '\033[1m'
|
||||
UNDERLINE = '\033[4m'
|
||||
END = '\033[0m'
|
||||
|
||||
|
||||
#------------------------------- Main Functions of Script --------------------------------#
|
||||
|
||||
###########################################################################################
|
||||
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
|
||||
###########################################################################################
|
||||
|
||||
def check_args():
|
||||
|
||||
parser = argparse.ArgumentParser(description=
|
||||
color.BOLD + '\n\nThis script is intended to '+color.RED+'Rename '+color.END\
|
||||
+color.BOLD+'the core set of '+color.PURPLE+'ORFS\n'+color.END+color.BOLD+'with a valid '\
|
||||
+color.RED+'10-character code'+color.END+color.BOLD+' for use in the KatzLab\nPhylogenomic Pipeline'\
|
||||
+usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter)
|
||||
|
||||
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
|
||||
|
||||
required_arg_group.add_argument('--input_file','-in', action='store',
|
||||
help=color.BOLD+color.GREEN+' One of the Fasta files that is to be renamed\n'+color.END)
|
||||
required_arg_group.add_argument('--name','-n', action='store',
|
||||
help=color.BOLD+color.GREEN+' A valid 10-Character code for updating the data\n'+color.END)
|
||||
|
||||
|
||||
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
|
||||
|
||||
optional_arg_group.add_argument('-author', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
|
||||
|
||||
if len(sys.argv[1:]) == 0:
|
||||
print (parser.description)
|
||||
print ('\n')
|
||||
sys.exit()
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
quit_eval = return_more_info(args)
|
||||
if quit_eval > 0:
|
||||
print ('\n')
|
||||
sys.exit()
|
||||
|
||||
args.all_output_folder = '/'.join(args.input_file.split('/')[:-2])
|
||||
|
||||
if '.allOGCleanresults' in args.input_TSV:
|
||||
args.out_XML = args.name+'_XX_'+args.input_TSV.split('/')[-1].replace('.allOGCleanresults.','.AA.ORF.')\
|
||||
.replace('.tsv','.fasta')+'_1e-10keepall_BlastOutall.oneHit'
|
||||
else:
|
||||
args.out_XML = args.name+'_XX_'+args.input_TSV.split('/')[-1].replace('_allOGCleanresults.','_AA.ORF.')\
|
||||
.replace('.tsv','.fasta')+'_1e-10keepall_BlastOutall.oneHit'
|
||||
|
||||
args.file_prefix = args.input_file.split('/')[-1].split('_Filtered.Final')[0]
|
||||
if 'fasta' in args.file_prefix:
|
||||
args.file_prefix = args.name
|
||||
|
||||
args.r2g_aa = args.all_output_folder + '/ReadyToGo/ReadyToGo_AA/'
|
||||
args.r2g_ntd = args.all_output_folder + '/ReadyToGo/ReadyToGo_NTD/'
|
||||
args.r2g_tsv = args.all_output_folder + '/ReadyToGo/ReadyToGo_TSV/'
|
||||
args.r2g_xml = args.all_output_folder + '/ReadyToGo/ReadyToGo_XML/'
|
||||
|
||||
|
||||
return args
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###------------------------------- Script Usage Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def usage_msg():
|
||||
return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 7_FinalizeName.py'\
|
||||
' --input_file ../ToRename/Op_me_Xxma_Filtered.Final.AA.ORF.fasta --name Op_me_Xxma'+color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
|
||||
##########################################################################################
|
||||
|
||||
def return_more_info(args):
|
||||
|
||||
valid_args = 0
|
||||
|
||||
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
|
||||
' maurerax@gmail.com\n\n'+color.END)
|
||||
|
||||
if args.author == True:
|
||||
print (author)
|
||||
valid_args += 1
|
||||
|
||||
if args.input_file.endswith('AA.ORF.fasta'):
|
||||
args.input_NTD = args.input_file.replace('AA.ORF.fasta','NTD.ORF.fasta')
|
||||
args.input_AA = args.input_file
|
||||
# args.input_TSV = ('/').join(args.input_file.split('/')[:-1])+'/SpreadSheets/'+args.input_file.split('/')[-1].replace('AA.ORF.fasta','allOGCleanresults.tsv')
|
||||
args.input_TSV = args.input_file.replace('AA.ORF.fasta','allOGCleanresults.tsv')
|
||||
|
||||
elif args.input_file.endswith('NTD.ORF.fasta'):
|
||||
args.input_NTD = args.input_file
|
||||
args.input_AA = args.input_file.replace('NTD.ORF.fasta','AA.ORF.fasta')
|
||||
# args.input_TSV = ('/').join(args.input_file.split('/')[:-1])+'/SpreadSheets/'+args.input_file.split('/')[-1].replace('NTD.ORF.fasta','allOGCleanresults.tsv')
|
||||
		args.input_TSV = args.input_file.replace('NTD.ORF.fasta','allOGCleanresults.tsv')
|
||||
print(args.input_TSV)
|
||||
|
||||
if os.path.isfile(args.input_NTD) != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Nucleotide '\
|
||||
'Fasta file ('+color.DARKCYAN+args.input_NTD.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_args += 1
|
||||
|
||||
if os.path.isfile(args.input_AA) != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Protein '\
|
||||
'Fasta file ('+color.DARKCYAN+args.input_AA.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_args += 1
|
||||
|
||||
if os.path.isfile(args.input_TSV) != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided TSV '\
|
||||
' file ('+color.DARKCYAN+args.input_TSV.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_args += 1
|
||||
|
||||
return valid_args
|
||||
|
||||
###########################################################################################
|
||||
###-------------------- Double Checks Format for 10-Character Code ---------------------###
|
||||
###########################################################################################
|
||||
|
||||
def check_code(args):
|
||||
|
||||
check_name = args.name.split('_')
|
||||
|
||||
if len(args.name) != 10:
|
||||
print (color.BOLD+'\n\nNew Species Prefix is not 10 characters long\n\n')
|
||||
print ('Three examples below:\n'+color.CYAN+'\n\tSr_ci_Cunc\n\n\tOp_me_Hsap\n\n\t'\
|
||||
'Am_ar_Ehis\n\n'+color.END)
|
||||
sys.exit()
|
||||
|
||||
elif args.name.count('_') != 2:
|
||||
print (color.BOLD+'\n\nCheck the format of your Species Prefix!\n\n')
|
||||
print ('Three examples below:\n'+color.CYAN+'\n\tSr_ci_Cunc\n\n\tOp_me_Hsap\n\n\t'\
|
||||
'Am_ar_Ehis\n\n'+color.END)
|
||||
|
||||
sys.exit()
|
||||
|
||||
if len(check_name[0]) == 2 and len(check_name[1]) == 2 and len(check_name[2]) == 4:
|
||||
print (color.BOLD+"\n\nRenaming "+color.ORANGE+args.input_file.split('/')[-1]\
|
||||
.split('_Filtered')[0]+color.END+color.BOLD+"'s files with the following 10-character\n"\
|
||||
"code: "+color.CYAN+args.name+color.END+'\n')
|
||||
else:
|
||||
print (color.BOLD+'\n\nCheck the format of your Species Prefix!\n\n')
|
||||
print ('Three examples below:\n'+color.CYAN+'\n\tSr_ci_Cunc\n\n\tOp_me_Hsap\n\n\t'\
|
||||
'Am_ar_Ehis\n\n'+color.END)
|
||||
sys.exit()
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###------------------------- Creates Folders For Storing Data -------------------------###
|
||||
##########################################################################################
|
||||
|
||||
def prep_folders(args):
|
||||
|
||||
|
||||
if os.path.isdir(args.all_output_folder + '/ReadyToGo/') != True:
|
||||
os.system('mkdir ' + args.all_output_folder + '/ReadyToGo')
|
||||
|
||||
|
||||
if os.path.isdir(args.r2g_ntd) != True:
|
||||
os.system('mkdir ' + args.r2g_ntd)
|
||||
if os.path.isdir(args.r2g_aa) != True:
|
||||
os.system('mkdir ' + args.r2g_aa)
|
||||
if os.path.isdir(args.r2g_tsv) != True:
|
||||
os.system('mkdir ' + args.r2g_tsv)
|
||||
if os.path.isdir(args.r2g_xml) != True:
|
||||
os.system('mkdir ' + args.r2g_xml)
|
||||
|
||||
if os.path.isdir(args.all_output_folder + '/' + args.file_prefix + '/Renamed') != True:
|
||||
os.system('mkdir ' + args.all_output_folder + '/' + args.file_prefix + '/Renamed')
|
||||
|
||||
###########################################################################################
|
||||
###----------- Renames the NTD and AA CDSs with the Given 10-Character Code ------------###
|
||||
###########################################################################################
|
||||
|
||||
def rename_paralogs(args):
|
||||
|
||||
home_folder = args.all_output_folder + '/' + args.file_prefix + '/Renamed/'
|
||||
|
||||
print (color.BOLD+'\nRenaming Translated (Protein) '+color.PURPLE+'ORFs\n'+color.END)
|
||||
renamed_Final_Prots = open(args.input_AA).read().replace('>','>'+args.name+'_XX_')
|
||||
|
||||
print (color.BOLD+'\nRenaming Nucleotide '+color.PURPLE+'ORFs\n'+color.END)
|
||||
renamed_Final_Nucs = open(args.input_NTD).read().replace('>','>'+args.name+'_XX_')
|
||||
|
||||
|
||||
print (color.BOLD+'\nUpdating CDS Names in the Spreadsheet'+color.END)
|
||||
if '\n\n' in open(args.input_TSV).read():
|
||||
renamed_Final_tsv = args.name+'_XX_'+open(args.input_TSV).read().rstrip('\n')\
|
||||
.replace('\n\n','\n'+args.name+'_XX_')
|
||||
else:
|
||||
renamed_Final_tsv = args.name+'_XX_'+open(args.input_TSV).read().rstrip('\n')\
|
||||
.replace('\n','\n'+args.name+'_XX_')
|
||||
|
||||
with open(home_folder+args.name+'_XX_'+args.input_AA.split('/')[-1],'w+') as w:
|
||||
w.write(renamed_Final_Prots)
|
||||
|
||||
with open(home_folder+args.name+'_XX_'+args.input_NTD.split('/')[-1],'w+') as x:
|
||||
x.write(renamed_Final_Nucs)
|
||||
|
||||
|
||||
with open(home_folder+args.name+'_XX_'+args.input_TSV.split('/')[-1],'w+') as y:
|
||||
y.write(renamed_Final_tsv)
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------------------- Header/Tail Lines ---------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def header_tail():
|
||||
header = '<?xml version="1.0"?>\n<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">\n'\
|
||||
'<BlastOutput>\n <BlastOutput_program>blastp</BlastOutput_program>\n <BlastOutput_version>BLASTP 2.2.29+</BlastOutput_version>\n'\
|
||||
' <BlastOutput_reference>Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), "Gapped BLAST and PSI-BLAST: a new generation of protein database search programs", Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>\n'\
|
||||
' <BlastOutput_db>../OGBlastDB/renamed_aa_seqs_OrthoMCL-5_12653.fasta</BlastOutput_db>\n <BlastOutput_query-ID>Query_1</BlastOutput_query-ID>\n'
|
||||
|
||||
tail = '</BlastOutput_iterations>\n</BlastOutput>'
|
||||
return header, tail
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###------------------------------- TSV to XML Conversion -------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def convert_TSV_data(args):
|
||||
|
||||
home_folder = args.all_output_folder + '/' + args.file_prefix + '/Renamed/'
|
||||
|
||||
TSVforConvert = home_folder+args.name+'_XX_'+args.input_TSV.split('/')[-1]
|
||||
|
||||
inTSV = [line.rstrip('\n') for line in open(TSVforConvert).readlines() if line != '\n']
|
||||
|
||||
iterations = []
|
||||
|
||||
for n in range(len(inTSV)):
|
||||
if n == 0:
|
||||
iterations.append(' <BlastOutput_query-def>'+inTSV[n].split('\t')[0]+'</BlastOutput_query-def>\n <BlastOutput_query-len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'</BlastOutput_query-len>\n'\
|
||||
' <BlastOutput_param>\n <Parameters>\n <Parameters_matrix>BLOSUM62</Parameters_matrix>\n <Parameters_expect>1e-10</Parameters_expect>\n'\
|
||||
' <Parameters_gap-open>11</Parameters_gap-open>\n <Parameters_gap-extend>1</Parameters_gap-extend>\n <Parameters_filter>F</Parameters_filter>\n'\
|
||||
' </Parameters>\n </BlastOutput_param>\n<BlastOutput_iterations>\n<Iteration>\n <Iteration_iter-num>1</Iteration_iter-num>\n <Iteration_query-ID>Query_1</Iteration_query-ID>\n'\
|
||||
' <Iteration_query-def>'+inTSV[n].split('\t')[0]+'</Iteration_query-def>\n <Iteration_query-len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'</Iteration_query-len>\n'\
|
||||
'<Iteration_hits>\n<Hit>\n <Hit_num>1</Hit_num>\n <Hit_id>Fake_Entry</Hit_id>\n <Hit_def>'+inTSV[n].split('\t')[1]+'</Hit_def>\n <Hit_accession>Fake_Accession</Hit_accession>\n'\
|
||||
' <Hit_len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'</Hit_len>\n <Hit_hsps>\n <Hsp>\n <Hsp_num>1</Hsp_num>\n <Hsp_bit-score>1234</Hsp_bit-score>\n'\
|
||||
' <Hsp_score>'+inTSV[n].split('\t')[-1]+'</Hsp_score>\n <Hsp_evalue>'+inTSV[n].split('\t')[-2]+'</Hsp_evalue>\n <Hsp_query-from>'+inTSV[n].split('\t')[-4]+'</Hsp_query-from>\n'\
|
||||
' <Hsp_query-to>'+inTSV[n].split('\t')[-3]+'</Hsp_query-to>\n <Hsp_hit-from>'+inTSV[n].split('\t')[-4]+'</Hsp_hit-from>\n <Hsp_hit-to>'+inTSV[n].split('\t')[-3]+'</Hsp_hit-to>\n'\
|
||||
' <Hsp_query-frame>0</Hsp_query-frame>\n <Hsp_hit-frame>0</Hsp_hit-frame>\n <Hsp_identity>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'</Hsp_identity>\n'\
|
||||
' <Hsp_positive>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'</Hsp_positive>\n <Hsp_gaps>0</Hsp_gaps>\n <Hsp_align-len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'</Hsp_align-len>\n'\
|
||||
' <Hsp_qseq></Hsp_qseq>\n <Hsp_hseq></Hsp_hseq>\n <Hsp_midline></Hsp_midline>\n </Hsp>\n </Hit_hsps>\n</Hit>\n'\
|
||||
'\n</Iteration_hits>\n <Iteration_stat>\n <Statistics>\n <Statistics_db-num>379660</Statistics_db-num>\n <Statistics_db-len>197499634</Statistics_db-len>\n'\
|
||||
' <Statistics_hsp-len>123</Statistics_hsp-len>\n <Statistics_eff-space>184705217500</Statistics_eff-space>\n <Statistics_kappa>0.041</Statistics_kappa>\n'\
|
||||
' <Statistics_lambda>0.267</Statistics_lambda>\n <Statistics_entropy>0.14</Statistics_entropy>\n </Statistics>\n </Iteration_stat>\n</Iteration>\n')
|
||||
else:
|
||||
iterations.append('<Iteration>\n <Iteration_iter-num>'+str(n+1)+'</Iteration_iter-num>\n <Iteration_query-ID>Query_'+str(n+1)+'</Iteration_query-ID>\n'\
|
||||
' <Iteration_query-def>'+inTSV[n].split('\t')[0]+'</Iteration_query-def>\n <Iteration_query-len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'</Iteration_query-len>\n'\
|
||||
'<Iteration_hits>\n<Hit>\n <Hit_num>1</Hit_num>\n <Hit_id>Fake_Entry</Hit_id>\n <Hit_def>'+inTSV[n].split('\t')[1]+'</Hit_def>\n <Hit_accession>Fake_Accession</Hit_accession>\n'\
|
||||
' <Hit_len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'</Hit_len>\n <Hit_hsps>\n <Hsp>\n <Hsp_num>1</Hsp_num>\n <Hsp_bit-score>1234</Hsp_bit-score>\n'\
|
||||
' <Hsp_score>'+inTSV[n].split('\t')[-1]+'</Hsp_score>\n <Hsp_evalue>'+inTSV[n].split('\t')[-2]+'</Hsp_evalue>\n <Hsp_query-from>'+inTSV[n].split('\t')[-4]+'</Hsp_query-from>\n'\
|
||||
' <Hsp_query-to>'+inTSV[n].split('\t')[-3]+'</Hsp_query-to>\n <Hsp_hit-from>'+inTSV[n].split('\t')[-4]+'</Hsp_hit-from>\n <Hsp_hit-to>'+inTSV[n].split('\t')[-3]+'</Hsp_hit-to>\n'\
|
||||
' <Hsp_query-frame>0</Hsp_query-frame>\n <Hsp_hit-frame>0</Hsp_hit-frame>\n <Hsp_identity>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'</Hsp_identity>\n'\
|
||||
' <Hsp_positive>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'</Hsp_positive>\n <Hsp_gaps>0</Hsp_gaps>\n <Hsp_align-len>'+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'</Hsp_align-len>\n'\
|
||||
' <Hsp_qseq></Hsp_qseq>\n <Hsp_hseq></Hsp_hseq>\n <Hsp_midline></Hsp_midline>\n </Hsp>\n </Hit_hsps>\n</Hit>\n'\
|
||||
'\n</Iteration_hits>\n <Iteration_stat>\n <Statistics>\n <Statistics_db-num>379660</Statistics_db-num>\n <Statistics_db-len>197499634</Statistics_db-len>\n'\
|
||||
' <Statistics_hsp-len>123</Statistics_hsp-len>\n <Statistics_eff-space>184705217500</Statistics_eff-space>\n <Statistics_kappa>0.041</Statistics_kappa>\n'\
|
||||
' <Statistics_lambda>0.267</Statistics_lambda>\n <Statistics_entropy>0.14</Statistics_entropy>\n </Statistics>\n </Iteration_stat>\n</Iteration>\n')
|
||||
|
||||
return iterations
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------------- Writes Out the Fake XML File ----------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def write_Fake_XML(args):
|
||||
|
||||
home_folder = args.all_output_folder + '/' + args.file_prefix + '/'
|
||||
|
||||
print (color.BOLD+'\n\nConverting '+color.ORANGE+args.input_file.split('/')[-1]+color.END\
|
||||
+color.BOLD+' to XML format\n'+color.END)
|
||||
|
||||
header, tail = header_tail()
|
||||
|
||||
iterations = convert_TSV_data(args)
|
||||
|
||||
with open(home_folder+args.out_XML,'w+') as w:
|
||||
w.write(header)
|
||||
w.write(''.join(iterations))
|
||||
w.write(tail)
|
||||
|
||||
##########################################################################################
|
||||
###-------------------- Cleans up the Folder and Moves Final Files --------------------###
|
||||
##########################################################################################
|
||||
def clean_up(args):
|
||||
|
||||
home_folder = args.all_output_folder + '/' + args.file_prefix + '/Renamed/'
|
||||
|
||||
os.system('cp ' + args.all_output_folder + '/' + args.file_prefix+'/'+args.out_XML+' '+args.r2g_xml)
|
||||
|
||||
os.system('cp '+home_folder+'*tsv '+args.r2g_tsv)
|
||||
|
||||
os.system('cp '+home_folder+'*_XX_*AA.ORF.fasta '+args.r2g_aa)
|
||||
os.system('cp '+home_folder+'*_XX_*NTD.ORF.fasta '+args.r2g_ntd)
|
||||
|
||||
os.system('cp '+home_folder+'*_XX_*tsv ' + args.all_output_folder + '/' + args.file_prefix)
|
||||
os.system('cp '+home_folder+'*_XX_*AA.ORF.fasta ' + args.all_output_folder + '/' + args.file_prefix)
|
||||
os.system('cp '+home_folder+'*_XX_*NTD.ORF.fasta ' + args.all_output_folder + '/' + args.file_prefix)
|
||||
|
||||
os.system('rm ' + args.all_output_folder + '/ToRename/*'+args.file_prefix+'*')
|
||||
|
||||
if os.path.isdir(args.all_output_folder + '/Finished/') != True:
|
||||
os.system('mkdir ' + args.all_output_folder + '/Finished')
|
||||
|
||||
os.system('mv ' + args.all_output_folder + '/' + args.file_prefix + ' ' + args.all_output_folder + '/Finished')
|
||||
|
||||
###########################################################################################
|
||||
###-------------------------------- Next Script Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def next_script(args):
|
||||
|
||||
print (color.BOLD+'\nThere is no next script! The final '+color.ORANGE+args.out_XML\
|
||||
.split('_XX')[0]+color.END+color.BOLD+' files can be\nfound in the '+color.RED+\
|
||||
args.out_XML.split('_XX_')[-1].split('_Filtered')[0]+color.END+color.BOLD+' and '\
|
||||
+color.RED+'ReadyToGo folders'+color.END+color.BOLD+' and are ready\n'\
|
||||
'for the KatzLab Phylogenomic Tree-Building Steps!\n\n'+color.END)
|
||||
|
||||
##########################################################################################
|
||||
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
|
||||
##########################################################################################
|
||||
|
||||
def main():
|
||||
|
||||
args = check_args()
|
||||
|
||||
check_code(args)
|
||||
|
||||
prep_folders(args)
|
||||
|
||||
rename_paralogs(args)
|
||||
|
||||
write_Fake_XML(args)
|
||||
|
||||
clean_up(args)
|
||||
|
||||
next_script(args)
|
||||
|
||||
main()
|
||||
@ -1,215 +0,0 @@
|
||||
#Dependencies
|
||||
import os, sys, re
|
||||
import shutil
|
||||
import argparse
|
||||
|
||||
|
||||
def get_args():
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
prog = 'PhyloToL v6.0 Part 1 for Transcriptomes',
|
||||
description = "Updated January 19th, 2023 by Auden Cote-L'Heureux. Link to GitHub: https://github.com/AudenCote/PhyloToL_v6.0"
|
||||
)
|
||||
|
||||
parser.add_argument('-s', '--script', default = -1, type = int, choices = { 1, 2, 3, 4, 5, 6, 7 }, help = 'Script to run if you are only running one script')
|
||||
parser.add_argument('-1', '--first_script', default = -1, type = int, choices = { 1, 2, 3, 4, 5, 6 }, help = 'First script to run')
|
||||
	parser.add_argument('-2', '--last_script', default = -1, type = int, choices = { 2, 3, 4, 5, 6, 7 }, help = 'Last script to run')
|
||||
parser.add_argument('-a', '--assembled_transcripts', type = str, help = 'Path to a folder of assembled transcripts, assembled by rnaSPAdes. Each assembled transcript file name should start with a unique 10 digit code, and end in "_assembledTranscripts.fasta", E.g. Op_me_hsap_assembledTranscripts.fasta')
|
||||
parser.add_argument('-o', '--output', default = '../', type = str, help = 'An "Output" folder will be created at this directory to contain all output files. By default this folder will be created at the parent directory of the Scripts folder')
|
||||
parser.add_argument('-x', '--xplate_contam', action = 'store_true', help = 'Run cross-plate contamination removal (includes all files)')
|
||||
parser.add_argument('-g', '--genetic_code', type = str, help = 'If all of your taxa use the same genetic code, you may enter it here (to be used in script 5). Otherwise, stop at script 4 and fill in "gcode_output.tsv" before running script 5')
|
||||
parser.add_argument('-l', '--minlen', type = int, default = 200, help = 'Minimum transcript length')
|
||||
parser.add_argument('-d', '--databases', type = str, default = '../Databases', help = 'Path to databases folder')
|
||||
|
||||
return parser.parse_args()
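
# Example invocation, added for clarity (the wrapper's own filename is assumed here,
# and paths are hypothetical):
#   python wrapper.py -1 1 -2 5 -a ../AssembledTranscripts -o ../ -g universal -l 200 -d ../Databases
# runs scripts 1 through 5 on every *_assembledTranscripts.fasta file whose 10-digit
# code appears in ten_digit_codes, translating with the universal genetic code.
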
#running the first script on all the bare files
|
||||
def script_one(args, ten_digit_codes):
|
||||
for file in os.listdir(args.assembled_transcripts):
|
||||
if file[10:] == '_assembledTranscripts.fasta' and file[:10] in ten_digit_codes:
|
||||
os.system('python 1a_ContigFiltStats.py --input_file ' + args.assembled_transcripts + '/' + file + ' --output_file ' + args.output + '/Output/' + file[:10] + ' --minLen ' + str(args.minlen) + ' --spades') #SPADES ARGUMENT??
|
||||
|
||||
if args.xplate_contam:
|
||||
os.system('python 1b_XSpeciesContaminationAgnes.py ' + args.output + '/Output/XlaneBleeding ' + str(args.minlen))
|
||||
|
||||
|
||||
def script_two(args):
|
||||
|
||||
for folder in os.listdir(args.output + '/Output/'):
|
||||
if os.path.isfile(args.output + '/Output/' + folder + '/SizeFiltered/' + folder + '.' + str(args.minlen) + 'bp.fasta'):
|
||||
os.system('python 2a_remove_rRNA.py --input_file ' + args.output + '/Output/' + folder + '/SizeFiltered/' + folder + '.' + str(args.minlen) + 'bp.fasta --databases ' + args.databases)
|
||||
|
||||
fasta_withBact = args.output + '/Output/' + folder + '/' + folder + '_NorRNAseqs.fasta'
|
||||
os.system('python 2b_remove_Bact.py --input_file ' + fasta_withBact + ' --databases ' + args.databases)
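
    # Expected per-taxon layout at this point (inferred from the calls above; a sketch, not a guarantee):
    #   <output>/Output/<code>/SizeFiltered/<code>.<minlen>bp.fasta   size-filtered transcripts from script 1
    #   <output>/Output/<code>/<code>_NorRNAseqs.fasta                rRNA-free sequences from script 2a, input to 2b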


#NEED TO SORT OUT FILE NAMES ETC. BELOW HERE


#running the third script
def script_three(args):

    for folder in os.listdir(args.output + '/Output'):
        if os.path.isfile(args.output + '/Output/' + folder + '/' + folder + '_WTA_EPU.fasta'):
            os.system('python 3_CountOGsDiamond.py --input_file ' + args.output + '/Output/' + folder + '/' + folder + '_WTA_EPU.fasta --evalue 1e-15 --databases ' + args.databases)


#running the fourth script
def script_four(args):

    for folder in os.listdir(args.output + '/Output'):
        if os.path.isfile(args.output + '/Output/' + folder + '/' + folder + '_WTA_EPU.Renamed.fasta'):
            os.system('python 4_InFrameStopFreq.py --input_file ' + args.output + '/Output/' + folder + '/' + folder + '_WTA_EPU.Renamed.fasta --databases ' + args.databases)

    #putting all of the gcode summaries produced by the fourth script into a spreadsheet
    gcode_info = []
    for folder in os.listdir(args.output + '/Output'):
        if os.path.isfile(args.output + '/Output/' + folder + '/' + folder + '_WTA_EPU.Renamed_StopCodonStats.tsv'):
            with open(args.output + '/Output/' + folder + '/' + folder + '_WTA_EPU.Renamed_StopCodonStats.tsv') as f:
                for line in f:
                    line_sep = line.split('\t')
                    if line_sep[0] == 'Summary':
                        gcode_info.append([folder, line_sep[6], line_sep[7], line_sep[8][:-1]])

    with open(args.output + '/Output/gcode_output.tsv', 'w') as g:
        g.write('10 Digit Code\tIn-frame TAG Density\tIn-frame TGA Density\tIn-frame TAA Density\tGenetic Code\n')
        for row in gcode_info:
            g.write(row[0] + '\t' + row[1] + '\t' + row[2] + '\t' + row[3] + '\n')
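
    # A hypothetical example of the resulting gcode_output.tsv (the density values are made up):
    #
    #   10 Digit Code	In-frame TAG Density	In-frame TGA Density	In-frame TAA Density	Genetic Code
    #   Op_me_hsap	0.0012	0.0034	0.0041
    #
    # Only the first four columns are written here; the 'Genetic Code' column is filled in by
    # the user before script 5 unless the --genetic_code argument was given.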


#fifth script - to be run after the spreadsheet of genetic code types (template with codon counts generated by script 4) is filled out
def script_five(args):

    valid_codes = ['universal', 'blepharisma', 'chilodonella', 'condylostoma', 'euplotes', 'peritrich', 'vorticella', 'mesodinium', 'tag', 'tga', 'taa', 'none']

    if args.genetic_code != None and args.genetic_code.lower() in valid_codes:
        for folder in os.listdir(args.output + '/Output'):
            if os.path.isfile(args.output + '/Output/' + folder + '/' + folder + '_WTA_EPU.Renamed.fasta'):
                os.system('python 5_GCodeTranslate.py --input_file ' + args.output + '/Output/' + folder + '/' + folder + '_WTA_EPU.Renamed.fasta --genetic_code ' + args.genetic_code.lower())
    else:
        #one row per taxon, as written by script_four (and completed by the user)
        lines = [line.strip().split('\t') for line in open(args.output + '/Output/gcode_output.tsv', 'r')]
        for folder in os.listdir(args.output + '/Output'):
            if os.path.isfile(args.output + '/Output/' + folder + '/' + folder + '_WTA_EPU.Renamed.fasta'):
                for line in lines:
                    if line[0] == folder and line[-1].lower() in valid_codes:
                        os.system('python 5_GCodeTranslate.py --input_file ' + args.output + '/Output/' + folder + '/' + folder + '_WTA_EPU.Renamed.fasta --genetic_code ' + line[-1])
                    elif line[0] == folder and line[-1].lower() not in valid_codes and line[-1] != 'Genetic Code':
                        print('\n' + line[-1] + ' is not a valid genetic code. Skipping taxon ' + folder + '.\n')


#running the sixth script
def script_six(args):

    prefixes = []
    for file in os.listdir(args.output + '/Output'):
        if file[10:] in ('_WTA_EPU.Renamed_Universal_AA.ORF.fasta', '_WTA_EPU.Renamed_Universal_allOGCleanresults.tsv', '_WTA_EPU.Renamed_Universal_NTD.ORF.fasta'):
            prefixes.append(file[:10])

    unique_prefixes = list(dict.fromkeys(prefixes))

    for prefix in unique_prefixes:
        os.system('python 6_FilterPartials.py --file_prefix ' + args.output + '/Output/' + prefix)

    for prefix in unique_prefixes:
        os.system('python 6b_update_cov_post_removepartials.py ' + args.output + '/Output/' + prefix)


#running the seventh script
def script_seven(args):

    for file in os.listdir(args.output + '/Output/ToRename'):
        if '.AA.ORF.fasta' in file:
            os.system('python 7_FinalizeName.py --input_file ' + args.output + '/Output/ToRename/' + file + ' --name ' + file[:10])

    os.mkdir(args.output + '/Output/Intermediate')

    #everything except the ReadyToGo and Intermediate folders is moved into Intermediate
    for file in os.listdir(args.output + '/Output'):
        if file != 'ReadyToGo' and file != 'Intermediate':
            os.system('mv ' + args.output + '/Output/' + file + ' ' + args.output + '/Output/Intermediate')


if __name__ == "__main__":

    args = get_args()

    if (args.first_script == 1 or args.script == 1) and not os.path.isdir(args.assembled_transcripts):
        print('\nERROR: If starting at the first script, a valid path to a folder of assembled transcript files (each named <10-digit code>_assembledTranscripts.fasta) must be given with the --assembled_transcripts argument\n')
        quit()

    if args.genetic_code == None and args.script == -1:
        if args.first_script < 5 and args.last_script >= 5:
            print('\nERROR: You cannot run script 5 without giving a genetic code! If all of the taxa in the run use the same genetic code, then use the --genetic_code argument (e.g. -g Universal). Otherwise, stop after script 4, fill out the spreadsheet called "gcode_output.tsv", and then run scripts 5-7. If this does not make sense, please ask for help.\n')
            quit()

    ten_digit_codes = []
    if args.first_script == 1 or args.script == 1:
        for file in os.listdir(args.assembled_transcripts):
            if file[10:] == '_assembledTranscripts.fasta':
                ten_digit_codes.append(file[:10])
    else:
        if not os.path.isdir(args.output + '/Output'):
            print('\nERROR: A folder called "Output" was not found at the given output path. Enter the correct path for --output or start from script 1.\n')
            quit()

    if len(ten_digit_codes) > len(list(dict.fromkeys(ten_digit_codes))):
        print('\nERROR: Duplicate 10-digit codes are not allowed.\n')
        quit()

    for code in ten_digit_codes:
        for c, char in enumerate(code):
            if (c != 2 and c != 5 and char not in 'qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM1234567890') or ((c == 2 or c == 5) and char != '_'):
                print('\nERROR: ' + code + ' is an invalid 10-digit code sample identifier. It must be of the format Op_me_hsap (Homo sapiens, for example). Please ask for help if this does not make sense.\n')
                quit()
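
    # For example, 'Op_me_hsap' (the identifier from the --assembled_transcripts help text) passes
    # this check, while a made-up code like 'Opmehsap12' fails because characters at 0-indexed
    # positions 2 and 5 must be underscores.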

    if os.path.isdir(args.output + '/Output') and (args.first_script == 1 or args.script == 1):
        print('\nERROR: An "Output" folder already exists at the given path. Please delete or rename this folder and try again.\n')
        quit()
    elif not os.path.isdir(args.output + '/Output'):
        os.mkdir(args.output + '/Output')

    #index 0 is a placeholder so that script numbers 1-7 map directly to list indices
    scripts = [0, script_one, script_two, script_three, script_four, script_five, script_six, script_seven]

    if args.script == -1:
        if args.first_script < args.last_script:
            for i in range(1 + args.last_script - args.first_script):
                print('\nRunning script ' + str(i + args.first_script) + '...\n')
                if i + args.first_script == 1:
                    if len(ten_digit_codes) == 0:
                        print('\nNo properly named assembled transcript files found.\n')
                        quit()
                    else:
                        scripts[i + args.first_script](args, ten_digit_codes)
                else:
                    scripts[i + args.first_script](args)
        else:
            print('\nERROR: Invalid script combination: the first script must be less than the last script. If you want to run only one script, use the --script argument.\n')
            quit()
    else:
        if args.script == 1:
            if len(ten_digit_codes) == 0:
                print('\nNo properly named assembled transcript files found.\n')
                quit()
            else:
                scripts[args.script](args, ten_digit_codes)
        else:
            scripts[args.script](args)

@ -1,23 +0,0 @@
#!/bin/bash
#
#SBATCH --job-name=PTL1
#SBATCH --output=PTL1.%j.out		# Stdout (%j expands to jobId)
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --ntasks-per-node=64		##change to the number of srun calls when running multiple instances
#SBATCH --mem=160G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=pleasedontemailauden@smith.edu

module purge				#Cleans up any loaded modules
module use /gridapps/modules/all	#make sure the module locations are loaded

module load slurm
module load Biopython/1.75-foss-2019b-Python-3.7.4
module load BLAST+
module load DIAMOND/0.9.30-GCC-8.3.0

export PATH=$PATH:/Users/katzlab/scratch/katzlab/grid_phylotol_setup/programs/standard-RAxML-master
export PATH=$PATH:/Users/katzlab/scratch/katzlab/grid_vsearch_setup/vsearch-2.15.1-linux-x86_64/bin

python wrapper.py --first_script 1 --last_script 7 -a ../TestData --xplate_contam --genetic_code Universal --databases ../Databases
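
# To re-run a single step later (a hypothetical example; it assumes the "Output" folder from a
# previous run already exists at the default --output location):
#	python wrapper.py --script 3 --databases ../Databases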