mirror of
http://43.156.76.180:8026/YuuMJ/EukPhylo.git
synced 2025-12-27 06:50:24 +08:00
791 lines
33 KiB
Python
791 lines
33 KiB
Python
#!/usr/bin/env python
|
|
|
|
##__Updated__: 18_08_2017
|
|
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
|
|
##__Usage__: python 4_InFrameStopFreq.py --help
|
|
|
|
|
|
##########################################################################################
|
|
## This script is intended to aid in identifying the genetic code of the data given ##
|
|
## ##
|
|
## Prior to running this script, ensure the following: ##
|
|
## ##
|
|
## 1. You have assembled your transcriptome and COPIED the 'assembly' file ##
|
|
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ##
|
|
## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py ##
|
|
## 3. Removed SSU/LSU sequences from your Fasta File ##
|
|
## 4. Classified your sequences as Strongly Prokaryotic/Eukaryotic or Undetermined ##
|
|
## 5. Classified the Non-Strongly Prokaryotic sequences into OGs ##
|
|
## ##
|
|
## COMMAND Example Below ##
|
|
## Extra Notes at Bottom of Script ##
|
|
## ##
|
|
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
|
|
## ##
|
|
## Next Script(s) to Run: ##
|
|
## 5_GCodeTranslate.py ##
|
|
## ##
|
|
##########################################################################################
|
|
|
|
|
|
import argparse, os, sys
|
|
from argparse import RawTextHelpFormatter,SUPPRESS
|
|
from distutils import spawn
|
|
|
|
from Bio import SeqIO
|
|
from Bio.Seq import Seq
|
|
from Bio.Data.CodonTable import CodonTable
|
|
|
|
|
|
#-------------------------- Set-up Codon Tables (Genetic Codes) --------------------------#
|
|
|
|
tag_table = CodonTable(forward_table={
|
|
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
|
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
|
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Q',
|
|
'TGT': 'C', 'TGC': 'C', 'TGA': 'Q', 'TGG': 'W',
|
|
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
|
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
|
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
|
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
|
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
|
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
|
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
|
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
|
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
|
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
|
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
|
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
|
start_codons = [ 'ATG'],
|
|
stop_codons = ['TAG'])
|
|
|
|
c_uncinata_table = CodonTable(forward_table={
|
|
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
|
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
|
'TAT': 'Y', 'TAC': 'Y', 'TAG': 'Q',
|
|
'TGT': 'C', 'TGC': 'C', 'TGA': 'Q', 'TGG': 'W',
|
|
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
|
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
|
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
|
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
|
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
|
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
|
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
|
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
|
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
|
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
|
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
|
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
|
start_codons = [ 'ATG'],
|
|
stop_codons = ['TAA'])
|
|
|
|
#------------------------------ Colors For Print Statements ------------------------------#
|
|
class color:
|
|
PURPLE = '\033[95m'
|
|
CYAN = '\033[96m'
|
|
DARKCYAN = '\033[36m'
|
|
ORANGE = '\033[38;5;214m'
|
|
BLUE = '\033[94m'
|
|
GREEN = '\033[92m'
|
|
YELLOW = '\033[93m'
|
|
RED = '\033[91m'
|
|
BOLD = '\033[1m'
|
|
UNDERLINE = '\033[4m'
|
|
END = '\033[0m'
|
|
|
|
#------------------------------- Main Functions of Script --------------------------------#
|
|
|
|
###########################################################################################
|
|
###---------------------------- UPDATE DIAMOND PATH BELOW! -----------------------------###
|
|
###########################################################################################
|
|
## IF Diamond is IN YOUR PATH then no updating is needed...
|
|
|
|
def check_diamond_path():
|
|
|
|
diamond_path = ''
|
|
|
|
if diamond_path == '':
|
|
diamond_path = spawn.find_executable("diamond")
|
|
#diamond_path = '/path/to/diamond'
|
|
else:
|
|
pass
|
|
|
|
if diamond_path == None:
|
|
print (color.BOLD + '\n\nPlease open this script and check that you have included'\
|
|
+' the PATH to the'+color.BLUE+' "diamond" '+color.END+color.BOLD+'executable.\n\n'+color.END)
|
|
print (color.BOLD+color.BLUE+'LOOK FOR:\n\n'+color.RED\
|
|
+'#------------------------------ UPDATE DIAMOND PATH BELOW! -------------------------------#'\
|
|
+color.BLUE+'\n\nThis is somewhere around lines 50 - 80...\n\n'+color.END)
|
|
|
|
sys.exit()
|
|
else:
|
|
pass
|
|
|
|
return diamond_path
|
|
|
|
|
|
###########################################################################################
|
|
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
|
|
###########################################################################################
|
|
|
|
def check_args():
|
|
|
|
parser = argparse.ArgumentParser(description=
|
|
color.BOLD+'\n\nThis script is intended to '+color.RED+'AID You '+color.END+color.BOLD\
|
|
+'in determining the '+color.RED+'\nLikely Genetic Code'+color.END+color.BOLD+' of a'\
|
|
' given Fasta File of transcripts\n\nInterpretation of the output (StopFreq.tsv) is difficult \nand so '+color.ORANGE\
|
|
+'TWO EXAMPLES'+color.END+color.BOLD+' can be found in the '+color.CYAN+'NOTES Section'\
|
|
+color.END+color.BOLD+' of\nTHIS Script '+color.GREEN+'(4_InFrameStopFreq.py)\n'+color.END\
|
|
+usage_msg(), usage=SUPPRESS,formatter_class=RawTextHelpFormatter)
|
|
|
|
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
|
|
|
|
required_arg_group.add_argument('--input_file','-in', action='store', required=True,
|
|
help=color.BOLD+color.GREEN+'Fasta file of Nucleotide sequences enriched \nwith'\
|
|
' Eukaryotic protein coding transcripts'+color.END)
|
|
required_arg_group.add_argument('--databases','-d', action='store',
|
|
help=color.BOLD+color.GREEN+"Path to databases"+color.END)
|
|
|
|
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
|
|
optional_arg_group.add_argument('-author', action='store_true',
|
|
help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
|
|
|
|
if len(sys.argv[1:]) == 0:
|
|
print (parser.description)
|
|
print ('\n')
|
|
sys.exit()
|
|
|
|
args = parser.parse_args()
|
|
|
|
quit_eval = return_more_info(args)
|
|
if quit_eval > 0:
|
|
sys.exit()
|
|
|
|
return args
|
|
|
|
|
|
###########################################################################################
|
|
###------------------------------- Script Usage Message --------------------------------###
|
|
###########################################################################################
|
|
|
|
def usage_msg():
|
|
return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 4_InFrameStopFreq.py'\
|
|
' --input_file ../Op_me_Xxma/Op_me_Xxma_WTA_EPU.Renamed.fasta'+color.END)
|
|
|
|
|
|
##########################################################################################
|
|
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
|
|
##########################################################################################
|
|
|
|
def return_more_info(args):
|
|
|
|
valid_arg = 0
|
|
|
|
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
|
|
' maurerax@gmail.com\n\n'+color.END)
|
|
|
|
if args.author == True:
|
|
print (author)
|
|
valid_arg += 1
|
|
|
|
if args.input_file != None:
|
|
if os.path.isfile(args.input_file) != False:
|
|
if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
|
|
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
|
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
|
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
|
valid_arg += 1
|
|
elif args.input_file.endswith('WTA_EPU.Renamed.fasta') != True:
|
|
print (color.BOLD+'\n\nInvalid Fasta File! Only Fasta Files that were processed'\
|
|
' with '+color.GREEN+'3_CountOGsUsearcy.py '+color.END+color.BOLD+'are valid\n\n'\
|
|
'However, to bypass that issue, Fasta Files MUST end with '+color.CYAN+\
|
|
'"WTA_NBU.Renamed.fasta"\n\n'+color.END)
|
|
valid_arg += 1
|
|
else:
|
|
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
|
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
|
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
|
valid_arg += 1
|
|
|
|
if os.path.isdir(args.databases + '/db_StopFreq') != True:
|
|
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
|
|
+color.ORANGE+'db_StopFreq Folder!\n\n'+color.END+color.BOLD+'Ensure that this folder '\
|
|
'can be found in the main '+color.ORANGE+'Databases Folder'+color.END+color.BOLD\
|
|
+'\n\nThen try once again\n\n.'+color.END)
|
|
valid_arg += 1
|
|
|
|
elif os.path.isfile(args.databases + '/db_StopFreq/RepEukProts.dmnd') != True:
|
|
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
|
|
'Diamond formatted '+color.ORANGE+'Representative Eukaryotic Protein Database!\n\n'+color.END+color.BOLD+\
|
|
'Ensure that it can be found in the '+color.ORANGE+'db_StopFreq folder'+color.END+\
|
|
color.BOLD+',\nwhich can be found in the main '+color.ORANGE+'Databases Folder'+\
|
|
color.END+color.BOLD+'\n\nThen try once again.\n\n'+color.END)
|
|
valid_arg += 1
|
|
|
|
return valid_arg
|
|
|
|
|
|
###########################################################################################
|
|
###--------------------------- Does the Inital Folder Prep -----------------------------###
|
|
###########################################################################################
|
|
|
|
def prep_folders(args):
|
|
|
|
Stop_folder = '../'+args.input_file.split('/')[1]+'/StopCodonFreq/'
|
|
|
|
if os.path.isdir(Stop_folder) != True:
|
|
os.system('mkdir '+Stop_folder)
|
|
|
|
if os.path.isdir(Stop_folder+'StopCodonFastas') != True:
|
|
os.system('mkdir '+Stop_folder+'StopCodonFastas')
|
|
|
|
if os.path.isdir(Stop_folder+'SpreadSheets') != True:
|
|
os.system('mkdir '+Stop_folder+'SpreadSheets')
|
|
|
|
return Stop_folder+'StopCodonFastas/'
|
|
|
|
|
|
###########################################################################################
|
|
###--------------------- Translates Sequences with Each Stop Codon ---------------------###
|
|
###########################################################################################
|
|
|
|
def prep_translations(args):
|
|
print (color.BOLD+'\nIdentifying ORFs in the Fasta file based on the output of 3_CountOGsDiamond.py\n'+color.END)
|
|
|
|
intsv = [i for i in open(args.input_file.replace('.fasta','_allOGCleanresults.tsv')).readlines() if i != '\n']
|
|
|
|
inFasta = [i for i in SeqIO.parse(args.input_file,'fasta')]
|
|
|
|
prot_dict = {}
|
|
|
|
for i in intsv:
|
|
# print i
|
|
prot_dict.setdefault(i.split('\t')[0],[])
|
|
if int(i.split('\t')[6]) < int(i.split('\t')[7]):
|
|
prot_dict[i.split('\t')[0]].append('F')
|
|
if (int(i.split('\t')[6])) < 5:
|
|
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[6])-1)
|
|
else:
|
|
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[6])-1)
|
|
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[7])+3)
|
|
if int(i.split('\t')[7]) < int(i.split('\t')[6]):
|
|
prot_dict[i.split('\t')[0]].append('RC')
|
|
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[6]))
|
|
if (int(i.split('\t')[7])-4) < 5:
|
|
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[7]))
|
|
else:
|
|
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[7])-4)
|
|
|
|
|
|
#------------- Prep translation with 'TAA' as the only Stop -------------#
|
|
|
|
print (color.BOLD+'\n\nTranslating DNA using'+color.RED+' TAA'+color.END\
|
|
+color.BOLD+' as the sole STOP codon\n'+color.END)
|
|
|
|
for key, value in prot_dict.items():
|
|
for seq_rec in inFasta:
|
|
if key in seq_rec.description:
|
|
stop_pos = 0
|
|
if prot_dict[key][0] == 'F':
|
|
temp = seq_rec.seq[prot_dict[key][1]:]
|
|
temp_prot = str(temp.translate(table=c_uncinata_table))
|
|
if '*' in temp_prot:
|
|
stop_pos = (temp_prot.index('*')+1)*3
|
|
prot_dict[key].append(temp[:stop_pos])
|
|
else:
|
|
prot_dict[key].append(seq_rec.seq[prot_dict[key][1]:prot_dict[key][2]])
|
|
if prot_dict[key][0] == 'RC':
|
|
temp = seq_rec.seq[:prot_dict[key][1]].reverse_complement()
|
|
temp_prot = str(temp.translate(table=c_uncinata_table))
|
|
if '*' in temp_prot:
|
|
stop_pos = (temp_prot.index('*')+1)*3
|
|
prot_dict[key].append(temp[:stop_pos])
|
|
else:
|
|
prot_dict[key].append(seq_rec.seq[prot_dict[key][2]:prot_dict[key][1]].reverse_complement())
|
|
|
|
|
|
#------------- Prep translation with 'TGA' as the only Stop -------------#
|
|
print (color.BOLD+'\n\nTranslating DNA using'+color.RED+' TGA'+color.END\
|
|
+color.BOLD+' as the sole STOP codon\n'+color.END)
|
|
|
|
for key, value in prot_dict.items():
|
|
for seq_rec in inFasta:
|
|
if key in seq_rec.description:
|
|
stop_pos = 0
|
|
if prot_dict[key][0] == 'F':
|
|
temp = seq_rec.seq[prot_dict[key][1]:]
|
|
temp_prot = str(temp.translate(table=6))
|
|
if '*' in temp_prot:
|
|
stop_pos = (temp_prot.index('*')+1)*3
|
|
prot_dict[key].append(temp[:stop_pos])
|
|
else:
|
|
prot_dict[key].append(seq_rec.seq[prot_dict[key][1]:prot_dict[key][2]])
|
|
if prot_dict[key][0] == 'RC':
|
|
temp = seq_rec.seq[:prot_dict[key][1]].reverse_complement()
|
|
temp_prot = str(temp.translate(table=6))
|
|
if '*' in temp_prot:
|
|
stop_pos = (temp_prot.index('*')+1)*3
|
|
prot_dict[key].append(temp[:stop_pos])
|
|
else:
|
|
prot_dict[key].append(seq_rec.seq[prot_dict[key][2]:prot_dict[key][1]].reverse_complement())
|
|
|
|
|
|
#------------- Prep translation with 'TAG' as the only Stop -------------#
|
|
print (color.BOLD+'\n\nTranslating DNA using'+color.RED+' TAG'+color.END\
|
|
+color.BOLD+' as the sole STOP codon\n'+color.END)
|
|
|
|
|
|
for key, value in prot_dict.items():
|
|
for seq_rec in inFasta:
|
|
if key in seq_rec.description:
|
|
stop_pos = 0
|
|
if prot_dict[key][0] == 'F':
|
|
temp = seq_rec.seq[prot_dict[key][1]:]
|
|
temp_prot = str(temp.translate(table=tag_table))
|
|
if '*' in temp_prot:
|
|
stop_pos = (temp_prot.index('*')+1)*3
|
|
prot_dict[key].append(temp[:stop_pos])
|
|
else:
|
|
prot_dict[key].append(seq_rec.seq[prot_dict[key][1]:prot_dict[key][2]])
|
|
if prot_dict[key][0] == 'RC':
|
|
temp = seq_rec.seq[:prot_dict[key][1]].reverse_complement()
|
|
temp_prot = str(temp.translate(table=tag_table))
|
|
if '*' in temp_prot:
|
|
stop_pos = (temp_prot.index('*')+1)*3
|
|
prot_dict[key].append(temp[:stop_pos])
|
|
else:
|
|
prot_dict[key].append(seq_rec.seq[prot_dict[key][2]:prot_dict[key][1]].reverse_complement())
|
|
|
|
#------------ Parsing through data to maintain OG assignments ------------#
|
|
inOGs = intsv
|
|
inOGs = [i.split('\t')[0]+';'+i.split('\t')[1][-10:] for i in inOGs]
|
|
inOGs2 = []
|
|
for i in inOGs:
|
|
if 'no_group' not in i.split(';')[1]:
|
|
inOGs2.append(i)
|
|
else:
|
|
inOGs2.append(i.split(';')[0]+';no_group')
|
|
inOGs2 = list(set(inOGs2))
|
|
|
|
#---------------- Write file with 'TAA' is the only Stop ----------------#
|
|
|
|
with open(args.input_file.split('.fas')[0]+'_taa_ORF.fasta','w+') as w:
|
|
print (color.BOLD+'\n\nWriting FASTA files with ORF and Protein sequences with'+color.RED\
|
|
+' TAA '+color.END+color.BOLD+'as only STOP codon\n'+color.END)
|
|
|
|
for key, value in prot_dict.items():
|
|
for j in inOGs2:
|
|
if key == j.split(';')[0]:
|
|
if len(prot_dict[key]) < 4:
|
|
pass
|
|
else:
|
|
w.write('>'+key+'_'+j.split(';')[1]+'\n'+str(value[-3]).upper()+'\n')
|
|
|
|
with open(args.input_file.split('.fas')[0]+'_taa_ORF.aa.fasta','w+') as w:
|
|
for key, value in prot_dict.items():
|
|
for j in inOGs2:
|
|
if key == j.split(';')[0]:
|
|
if len(prot_dict[key]) < 4:
|
|
pass
|
|
else:
|
|
w.write('>'+key+'_'+j.split(';')[1]+'\n'+str(Seq(str(value[-3])).translate(table=c_uncinata_table)).upper()+'\n')
|
|
|
|
#---------------- Write file with 'TGA' is the only Stop ----------------#
|
|
|
|
with open(args.input_file.split('.fas')[0]+'_tga_ORF.fasta','w+') as w:
|
|
print (color.BOLD+'\n\nWriting FASTA files with ORF and Protein sequences with'+color.RED\
|
|
+' TGA '+color.END+color.BOLD+'as only STOP codon\n'+color.END)
|
|
|
|
for key, value in prot_dict.items():
|
|
for j in inOGs2:
|
|
if key == j.split(';')[0]:
|
|
if len(prot_dict[key]) < 4:
|
|
pass
|
|
else:
|
|
w.write('>'+key+'_'+j.split(';')[1]+'\n'+str(value[-2]).upper()+'\n')
|
|
|
|
with open(args.input_file.split('.fas')[0]+'_tga_ORF.aa.fasta','w+') as w:
|
|
for key, value in prot_dict.items():
|
|
for j in inOGs2:
|
|
if key == j.split(';')[0]:
|
|
if len(prot_dict[key]) < 4:
|
|
pass
|
|
else:
|
|
w.write('>'+key+'_'+j.split(';')[1]+'\n'+str(Seq(str(value[-2])).translate(table=6)).upper()+'\n')
|
|
|
|
#---------------- Write file with 'TAG' is the only Stop ----------------#
|
|
|
|
with open(args.input_file.split('.fas')[0]+'_tag_ORF.fasta','w+') as w:
|
|
print (color.BOLD+'\n\nWriting FASTA files with ORF and Protein sequences with'+color.RED\
|
|
+' TAG '+color.END+color.BOLD+'as only STOP codon\n'+color.END)
|
|
|
|
for key, value in prot_dict.items():
|
|
for j in inOGs2:
|
|
if key == j.split(';')[0]:
|
|
if len(prot_dict[key]) < 4:
|
|
pass
|
|
else:
|
|
w.write('>'+key+'_'+j.split(';')[1]+'\n'+str(value[-1]).upper()+'\n')
|
|
|
|
with open(args.input_file.split('.fas')[0]+'_tag_ORF.aa.fasta','w+') as w:
|
|
for key, value in prot_dict.items():
|
|
for j in inOGs2:
|
|
if key == j.split(';')[0]:
|
|
if len(prot_dict[key]) < 4:
|
|
pass
|
|
else:
|
|
w.write('>'+key+'_'+j.split(';')[1]+'\n'+str(Seq(str(value[-1])).translate(table=tag_table)).upper()+'\n')
|
|
|
|
|
|
###########################################################################################
|
|
###---------- Diamonds the Translations Against a SMALL Euk Protein Database ----------###
|
|
###########################################################################################
|
|
|
|
def diamond_ProtDB(args, diamond_path):
|
|
os.system(diamond_path + ' blastp -q ' + args.input_file.split('.fas')[0] + '_tag_ORF.aa.fasta -d ' + args.databases + '/db_StopFreq/RepEukProts.dmnd --evalue 1e-5 --max-target-seqs 1 --threads 60 --outfmt 6 -o ' + args.input_file.split('.fas')[0] + '_tag_ORF.RepEukProts.tsv')
|
|
|
|
os.system(diamond_path + ' blastp -q ' + args.input_file.split('.fas')[0] + '_tga_ORF.aa.fasta -d ' + args.databases + '/db_StopFreq/RepEukProts.dmnd --evalue 1e-5 --max-target-seqs 1 --threads 60 --outfmt 6 -o ' + args.input_file.split('.fas')[0] + '_tga_ORF.RepEukProts.tsv')
|
|
|
|
os.system(diamond_path + ' blastp -q ' + args.input_file.split('.fas')[0] + '_taa_ORF.aa.fasta -d ' + args.databases + '/db_StopFreq/RepEukProts.dmnd --evalue 1e-5 --max-target-seqs 1 --threads 60 --outfmt 6 -o ' + args.input_file.split('.fas')[0] + '_taa_ORF.RepEukProts.tsv')
|
|
|
|
|
|
###########################################################################################
|
|
###-------------------- Manages the search for In-Frame Stop Codons --------------------###
|
|
###########################################################################################
|
|
|
|
|
|
def hunt_for_stops(args):
|
|
|
|
#------------------------ Open Fasta Files ------------------------#
|
|
try:
|
|
TAGinFasta = [i for i in SeqIO.parse(args.input_file.split('.fas')[0]+'_tag_ORF.fasta','fasta') if str(i.seq).endswith('TAG')]
|
|
print (color.BOLD+'\n\nGathering Sequence information from FASTA and TSV files\n'+color.END)
|
|
|
|
except:
|
|
print (color.BOLD+color.RED+'\n\nMissing Necessary Inputs: Open Script for Usage'\
|
|
' Information\n\n'+color.END)
|
|
sys.exit()
|
|
|
|
TGAinFasta = [i for i in SeqIO.parse(args.input_file.split('.fas')[0]+'_tga_ORF.fasta','fasta') if str(i.seq).endswith('TGA')]
|
|
|
|
TAAinFasta = [i for i in SeqIO.parse(args.input_file.split('.fas')[0]+'_taa_ORF.fasta','fasta') if str(i.seq).endswith('TAA')]
|
|
|
|
## This section originally ONLY considered sequences WITH OG assignments:
|
|
## TAAinFasta = [i for i in TAAinFasta if 'no_group' not in i.description and str(i.seq).endswith('TAA')]
|
|
## This has been taken out for now
|
|
|
|
#----------------------- Open BLAST Reports -----------------------#
|
|
|
|
TAGinTSV = [i for i in open(args.input_file.split('.fas')[0]+'_tag_ORF.RepEukProts.tsv').read().split('\n') if i != '']
|
|
|
|
TGAinTSV = [i for i in open(args.input_file.split('.fas')[0]+'_tga_ORF.RepEukProts.tsv').read().split('\n') if i != '']
|
|
|
|
TAAinTSV = [i for i in open(args.input_file.split('.fas')[0]+'_taa_ORF.RepEukProts.tsv').read().split('\n') if i != '']
|
|
|
|
## This section originally ONLY considered sequences WITH OG assignments:
|
|
## TAAinTSV = i for i in TAAinTSV if i != ''and 'no_group' not in i.split('\t')[0]]
|
|
## This has been taken out for now
|
|
|
|
|
|
#------------ Set-up Genetic Code Specific Dictionaries ------------#
|
|
|
|
tag_dict = {}
|
|
for i in TAGinTSV:
|
|
tag_dict.setdefault(i.split('\t')[0].replace('_TAG',''),[]).append(int(i.split('\t')[-6]))
|
|
tag_dict.setdefault(i.split('\t')[0].replace('_TAG',''),[]).append(int(i.split('\t')[-5]))
|
|
|
|
tga_dict = {}
|
|
for i in TGAinTSV:
|
|
tga_dict.setdefault(i.split('\t')[0].replace('_Ciliate',''),[]).append(int(i.split('\t')[-6]))
|
|
tga_dict.setdefault(i.split('\t')[0].replace('_Ciliate',''),[]).append(int(i.split('\t')[-5]))
|
|
|
|
taa_dict = {}
|
|
for i in TAAinTSV:
|
|
taa_dict.setdefault(i.split('\t')[0].replace('_Chilo',''),[]).append(int(i.split('\t')[-6]))
|
|
taa_dict.setdefault(i.split('\t')[0].replace('_Chilo',''),[]).append(int(i.split('\t')[-5]))
|
|
|
|
#-------------- Preparing In-Frame Stop Codon Counts --------------#
|
|
|
|
# All the data when TGA is the sole stop codon
|
|
tga_codons = 0
|
|
tga_data_tag = 0
|
|
tga_data_tga = 0
|
|
tga_data_taa = 0
|
|
tga_seq_count = 0
|
|
|
|
# All the data when TAG is the sole stop codon
|
|
tag_codons = 0
|
|
tag_data_tag = 0
|
|
tag_data_tga = 0
|
|
tag_data_taa = 0
|
|
tag_seq_count = 0
|
|
|
|
# All the data when TAA is the sole stop codon
|
|
taa_codons = 0
|
|
taa_data_tag = 0
|
|
taa_data_tga = 0
|
|
taa_data_taa = 0
|
|
taa_seq_count = 0
|
|
|
|
# All the data for each stop codon combined
|
|
tga_inframe = 0
|
|
tag_inframe = 0
|
|
taa_inframe = 0
|
|
total_codons = 0
|
|
total_seq_counts = len(open(args.input_file).read().split('>'))-1
|
|
|
|
|
|
#-------- Gathering In-frame Stop Codon Density Information --------#
|
|
|
|
### Collect in-frame stop information for "TAA" and "TAG" when TGA is the ONLY stop
|
|
print (color.BOLD+'\nCollecting in-frame stop codon information when'+color.RED\
|
|
+' TGA'+color.END+color.BOLD+' is the only STOP\n'+color.END)
|
|
|
|
for i in TGAinFasta:
|
|
try:
|
|
if tga_dict[i.description][0] == 1:
|
|
for n in range((tga_dict[i.description][0]-1),((tga_dict[i.description][1])*3)-3,3):
|
|
if str(i.seq).upper()[n:n+3] == 'TAG':
|
|
tga_data_tag += 1
|
|
tag_inframe += 1
|
|
if str(i.seq).upper()[n:n+3].upper() == 'TAA':
|
|
tga_data_taa += 1
|
|
taa_inframe += 1
|
|
if str(i.seq).upper()[n:n+3].upper() == 'TGA':
|
|
tga_data_tga += 1
|
|
tga_inframe += 1
|
|
tga_codons += 1
|
|
total_codons += 1
|
|
tga_seq_count += 1
|
|
|
|
else:
|
|
for n in range(((tga_dict[i.description][0]-1)*3),((tga_dict[i.description][1])*3)-3,3):
|
|
if str(i.seq).upper()[n:n+3] == 'TAG':
|
|
tga_data_tag += 1
|
|
tag_inframe += 1
|
|
if str(i.seq).upper()[n:n+3].upper() == 'TAA':
|
|
tga_data_taa += 1
|
|
taa_inframe += 1
|
|
if str(i.seq).upper()[n:n+3].upper() == 'TGA':
|
|
tga_data_tga += 1
|
|
tga_inframe += 1
|
|
tga_codons += 1
|
|
total_codons += 1
|
|
tga_seq_count += 1
|
|
except:
|
|
pass
|
|
|
|
### Collect in-frame stop information for "TAA" and "TGA" when TAG is the ONLY stop
|
|
print (color.BOLD+'\nCollecting in-frame stop codon information when'+color.RED\
|
|
+' TAG'+color.END+color.BOLD+' is the only STOP\n'+color.END)
|
|
|
|
for i in TAGinFasta:
|
|
try:
|
|
if tag_dict[i.description][0] == 1:
|
|
for n in range((tag_dict[i.description][0]-1),((tag_dict[i.description][1])*3)-3,3):
|
|
if str(i.seq).upper()[n:n+3] == 'TAG':
|
|
tag_data_tag += 1
|
|
tag_inframe += 1
|
|
if str(i.seq).upper()[n:n+3].upper() == 'TAA':
|
|
tag_data_taa += 1
|
|
taa_inframe += 1
|
|
if str(i.seq).upper()[n:n+3].upper() == 'TGA':
|
|
tag_data_tga += 1
|
|
tga_inframe += 1
|
|
tag_codons += 1
|
|
total_codons += 1
|
|
tag_seq_count += 1
|
|
|
|
else:
|
|
for n in range(((tag_dict[i.description][0]-1)*3),(tag_dict[i.description][1]*3)-3,3):
|
|
if str(i.seq).upper()[n:n+3] == 'TAG':
|
|
tag_data_tag += 1
|
|
tag_inframe += 1
|
|
if str(i.seq).upper()[n:n+3].upper() == 'TAA':
|
|
tag_data_taa += 1
|
|
taa_inframe += 1
|
|
if str(i.seq).upper()[n:n+3].upper() == 'TGA':
|
|
tag_data_tga += 1
|
|
tga_inframe += 1
|
|
tag_codons += 1
|
|
total_codons += 1
|
|
tag_seq_count += 1
|
|
except:
|
|
pass
|
|
|
|
|
|
### Collect in-frame stop information for "TGA" and "TAG" when TAA is the ONLY stop
|
|
print (color.BOLD+'\nCollecting in-frame stop codon information when'+color.RED\
|
|
+' TAA'+color.END+color.BOLD+' is the only STOP\n'+color.END)
|
|
|
|
for i in TAAinFasta:
|
|
try:
|
|
if taa_dict[i.description][0] == 1:
|
|
for n in range((taa_dict[i.description][0]-1),((taa_dict[i.description][1])*3)-3,3):
|
|
if str(i.seq).upper()[n:n+3] == 'TAG':
|
|
taa_data_tag += 1
|
|
tag_inframe += 1
|
|
if str(i.seq).upper()[n:n+3].upper() == 'TAA':
|
|
taa_data_taa += 1
|
|
taa_inframe += 1
|
|
if str(i.seq).upper()[n:n+3].upper() == 'TGA':
|
|
taa_data_tga += 1
|
|
tga_inframe += 1
|
|
taa_codons += 1
|
|
total_codons += 1
|
|
taa_seq_count += 1
|
|
|
|
else:
|
|
for n in range(((taa_dict[i.description][0]-1)*3),(taa_dict[i.description][1]*3)-3,3):
|
|
if str(i.seq).upper()[n:n+3] == 'TAG':
|
|
taa_data_tag += 1
|
|
tag_inframe += 1
|
|
if str(i.seq).upper()[n:n+3].upper() == 'TAA':
|
|
taa_data_taa += 1
|
|
taa_inframe += 1
|
|
if str(i.seq).upper()[n:n+3].upper() == 'TGA':
|
|
taa_data_tga += 1
|
|
tga_inframe += 1
|
|
tag_codons += 1
|
|
total_codons += 1
|
|
taa_seq_count += 1
|
|
except:
|
|
pass
|
|
|
|
#-------------- Writing Data Out and Print Statement --------------#
|
|
|
|
with open(args.input_file.split('.fas')[0]+'_StopCodonStats.tsv','w+') as w:
|
|
w.write('Stop Codon\tNumber of Seqs Analyzed\tIn-frame TAG\tIn-frame TGA\tIn-frame TAA\tTotal Codons\tIn-frame TAG density\tIn-frame TGA density\tIn-frame TAA density\n')
|
|
if tga_codons != 0:
|
|
w.write('TGA\t'+str(tga_seq_count)+'\t'+str(tga_data_tag)+'\t'+str(tga_data_tga)+'\t'+str(tga_data_taa)+'\t'+str(tga_codons)\
|
|
+'\t'+"%.2f" % ((float(tga_data_tag)*1000)/float(tga_codons))+'\t'+"%.2f" % ((float(tga_data_tga)*1000)/float(tga_codons))+'\t'\
|
|
+"%.2f" % ((float(tga_data_taa)*1000)/float(tga_codons))+'\n')
|
|
else:
|
|
w.write('TGA\t0\t0\t0\t0\t0\t0\t0\t0\n')
|
|
|
|
if tag_codons != 0:
|
|
w.write('TAG\t'+str(tag_seq_count)+'\t'+str(tag_data_tag)+'\t'+str(tag_data_tga)+'\t'+str(tag_data_taa)+'\t'+str(tag_codons)\
|
|
+'\t'+"%.2f" % ((float(tag_data_tag)*1000)/float(tag_codons))+'\t'+"%.2f" % ((float(tag_data_tga)*1000)/float(tag_codons))+'\t'\
|
|
+"%.2f" % ((float(tag_data_taa)*1000)/float(tag_codons))+'\n')
|
|
else:
|
|
w.write('TAG\t0\t0\t0\t0\t0\t0\t0\t0\n')
|
|
if taa_codons != 0:
|
|
w.write('TAA\t'+str(taa_seq_count)+'\t'+str(taa_data_tag)+'\t'+str(taa_data_tga)+'\t'+str(taa_data_taa)+'\t'+str(taa_codons)\
|
|
+'\t'+"%.2f" % ((float(taa_data_tag)*1000)/float(taa_codons))+'\t'+"%.2f" % ((float(taa_data_tga)*1000)/float(taa_codons))+'\t'\
|
|
+"%.2f" % ((float(taa_data_taa)*1000)/float(taa_codons))+'\n')
|
|
else:
|
|
w.write('TAA\t0\t0\t0\t0\t0\t0\t0\t0\n')
|
|
|
|
w.write('\n \n')
|
|
w.write('Summary\t'+str(tga_seq_count+tag_seq_count+taa_seq_count)+'\t'+str(tag_inframe)+'\t'+str(tga_inframe)+'\t'+str(taa_inframe)\
|
|
+'\t'+str(total_codons)+'\t'+"%.2f" % ((float(tag_inframe)*1000)/float(total_codons))+'\t'+"%.2f" % ((float(tga_inframe)*1000)/float(total_codons))\
|
|
+'\t'+"%.2f" % ((float(taa_inframe)*1000)/float(total_codons))+'\n')
|
|
w.write('\nTotal Seqs in Fasta\t'+str(total_seq_counts))
|
|
|
|
# print color.BOLD + color.BLUE + '\nSummary\t'+str(tag_inframe)+'\t'+str(tga_inframe)+'\t'+str(taa_inframe)+'\t'+str(total_codons)+'\t'+"%.2f" % ((float(tag_inframe)*1000)/float(total_codons))+'\t'\
|
|
# +"%.2f" % ((float(tga_inframe)*1000)/float(total_codons))+'\t'+"%.2f" % ((float(taa_inframe)*1000)/float(total_codons))+'\n\n'\
|
|
# + str(tag_seq_count) + '\t' + str(tga_seq_count) + '\t' + str(taa_seq_count) + color.END
|
|
|
|
|
|
##########################################################################################
|
|
###--------------------- Cleans up the Folder and Moves Final Files -------------------###
|
|
##########################################################################################
|
|
|
|
def clean_up(args):
|
|
if os.path.isdir('/'.join(args.input_file.split('/')[:-1]) + '/StopCodonFreq') != True:
|
|
os.system('mkdir ' + '/'.join(args.input_file.split('/')[:-1]) + '/StopCodonFreq/')
|
|
else:
|
|
pass
|
|
|
|
os.system('mkdir ' + '/'.join(args.input_file.split('/')[:-1]) + '/StopCodonFreq/StopCodonFastas/')
|
|
os.system('mkdir ' + '/'.join(args.input_file.split('/')[:-1]) + '/StopCodonFreq/SpreadSheets/')
|
|
os.system('mv ' + args.input_file.split('.fas')[0]+'_t*_ORF.*fasta ' + '/'.join(args.input_file.split('/')[:-1]) + '/StopCodonFreq/StopCodonFastas/')
|
|
os.system('mv ' + args.input_file.split('.fas')[0]+'_t*Prots.tsv ' + '/'.join(args.input_file.split('/')[:-1]) + '/StopCodonFreq/SpreadSheets/')
|
|
|
|
|
|
###########################################################################################
|
|
###-------------------------------- Next Script Message --------------------------------###
|
|
###########################################################################################
|
|
|
|
def next_script(args):
|
|
|
|
home_folder = '/'.join(args.input_file.split('/')[:-1])
|
|
|
|
print (color.BOLD+'\nLook for '+color.DARKCYAN+args.input_file.split('/')[-1]\
|
|
.replace('.fasta','_StopCodonStats.tsv')+color.END+color.BOLD+' in the '+home_folder\
|
|
+' Folder\n\n' + color.END)
|
|
|
|
print (color.BOLD+'Next Script is: '+color.GREEN+'5_GCodeTranslate.py\n\n'+ color.END)
|
|
|
|
|
|
##########################################################################################
|
|
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
|
|
##########################################################################################
|
|
|
|
def main():
|
|
|
|
diamond_path = check_diamond_path()
|
|
|
|
args = check_args()
|
|
|
|
prep_translations(args)
|
|
|
|
diamond_ProtDB(args, diamond_path)
|
|
|
|
hunt_for_stops(args)
|
|
|
|
clean_up(args)
|
|
|
|
next_script(args)
|
|
|
|
main()
|
|
|
|
|
|
#----------------------------------------- NOTES -----------------------------------------#
|
|
#
|
|
# This script is designed to HELP you make an informed decision about the genetic code being
|
|
# used by your particular organism. Be aware that it will be limited by the quality of the
|
|
# data given to it!
|
|
#
|
|
# You will need:
|
|
#
|
|
# Diamond, BioPython, AND the output from '3_CountOGSDiamond.py'
|
|
#
|
|
# If you are not using the Author's database, update your database name(s) in lines: 345-360
|
|
#
|
|
# katzlab$ python StopFrequency.py YourFastaFile.fasta
|
|
#
|
|
#
|
|
#------------------------------- Interpretation of Results -------------------------------#
|
|
#
|
|
# FORMATTED BELOW WITH TEXTWRANGLER...
|
|
#
|
|
# Example output using CILIATE (TGA) genetic Code (NOTE THE In-Frame Densities):
|
|
#
|
|
# Stop Codon Number_of_Seqs_Analyzed In-frame TAG In-frame TGA In-frame TAA Total Codons In-frame TAG density In-frame TGA density In-frame TAA density
|
|
# TGA 341 14 0 22 113156 1.2 0 0.92
|
|
# TAG 424 0 0 34 140085 0 0 0.78
|
|
# TAA 205 14 0 0 16714 0.84 0 0
|
|
# Summary 970 28 0 56 269955 2.04 0 1.7
|
|
#
|
|
# VALUES in summary line (OR SUM of Density) that are > 1.5 likely indicate that the STOP
|
|
# codon has been reassigned... in the case above, TAG and TAA look like they have been
|
|
# reassigned.
|
|
#
|
|
#
|
|
# Example output using UNIVERSAL genetic Code (NOTE THE In-Frame Densities):
|
|
#
|
|
# Stop Codon Number_of_Seqs_Analyzed In-frame TAG In-frame TGA In-frame TAA Total Codons In-frame TAG density In-frame TGA density In-frame TAA density
|
|
# TGA 341 1 0 2 113156 0.2 0 0.05
|
|
# TAG 424 0 2 4 140085 0 0 0.08
|
|
# TAA 205 1 0 0 16714 0.04 0 0
|
|
# Summary 970 2 2 6 269955 0.15 0 0.06
|
|
#
|
|
# VALUES in summary line (OR SUM of Density) that are > 0.5 likely indicate that the STOP
|
|
# codon still acts as STOP... in the case above, TAG, TGA and TAA look like they still behave
|
|
# as a stop codon.
|
|
#
|
|
# THIS IS A ROUGH GUIDE FOR INTERPRETING THE RESULTS!!!! BE VERY VERY WARY! NUMBER OF TOTAL
|
|
# SEQUENCES AND TOTAL CODONS OBSERVED ARE IMPORTANT (TOO FEW AND ANY INTERPRETATION IS DEVOID
|
|
# OF ANY MEANING).
|