mirror of
http://43.156.76.180:8026/YuuMJ/EukPhylo.git
synced 2025-12-27 07:10:25 +08:00
Updates to script 6 filters & script names
This commit is contained in:
parent
f9d747ce38
commit
fa294909c5
268
PTL1/Transcriptomes/Scripts/1a_TranscriptLengthFilter.py
Normal file
268
PTL1/Transcriptomes/Scripts/1a_TranscriptLengthFilter.py
Normal file
@ -0,0 +1,268 @@
|
||||
#!/usr/bin/env python3.6
|
||||
|
||||
##__Updated__: 01_04_2023 by Auden Cote-L'Heureux
|
||||
##__Authors__: Xyrus Maurer-Alcala
|
||||
##__Usage__: python 1_ContigFiltStats.py
|
||||
##__Options__: python 1_ContigFiltStats.py --help
|
||||
|
||||
########################################################################################
|
||||
## This script is intended to remove transcripts below or above a given
|
||||
## size range from a transcriptome assembly.
|
||||
##
|
||||
## Prior to running this script, ensure the following:
|
||||
## 1. You have assembled your transcriptome and COPIED the 'assembly' file
|
||||
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder
|
||||
##
|
||||
## COMMAND Example Below
|
||||
##
|
||||
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com
|
||||
##
|
||||
## Next Script(s) to Run:
|
||||
## AutoBactVsEuk.py (removes SSU then Bact) or 2a_removeSSU.py then 2b_removeBact.py
|
||||
##
|
||||
########################################################################################
|
||||
|
||||
|
||||
import argparse, os, sys
|
||||
from argparse import RawTextHelpFormatter,SUPPRESS
|
||||
from Bio import SeqIO
|
||||
from Bio.SeqUtils import GC
|
||||
|
||||
|
||||
#----------------------------- Colors For Print Statements ------------------------------#
|
||||
|
||||
class color:
|
||||
PURPLE = '\033[95m'
|
||||
CYAN = '\033[96m'
|
||||
DARKCYAN = '\033[36m'
|
||||
ORANGE = '\033[38;5;214m'
|
||||
BLUE = '\033[94m'
|
||||
GREEN = '\033[92m'
|
||||
YELLOW = '\033[93m'
|
||||
RED = '\033[91m'
|
||||
BOLD = '\033[1m'
|
||||
UNDERLINE = '\033[4m'
|
||||
END = '\033[0m'
|
||||
|
||||
|
||||
#------------------------------- Main Functions of Script --------------------------------#
|
||||
|
||||
###########################################################################################
|
||||
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
|
||||
###########################################################################################
|
||||
|
||||
def check_args():
|
||||
|
||||
parser = argparse.ArgumentParser(description=
|
||||
color.BOLD+'\nThis script will remove Contigs (and provide a summary of statistics)'\
|
||||
+'\nfrom your Assembly that are shorter than a given length.'+color.ORANGE+\
|
||||
'\n\nA good minimum length to start with is 200bp.'+color.END+color.BOLD+\
|
||||
'\n\nThe minimum length value should be adjusted for your data sets.\n'+color.END+usage_msg(),
|
||||
usage=SUPPRESS,formatter_class=RawTextHelpFormatter)
|
||||
|
||||
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
|
||||
|
||||
required_arg_group.add_argument('--input_file','-in', action='store',
|
||||
help=color.BOLD+color.GREEN+" Fasta file of Protein/Nucleotide sequences\n"+color.END)
|
||||
|
||||
required_arg_group.add_argument('--output_file','-out',
|
||||
help=color.BOLD+color.GREEN+" Desired Output Name\n\n"+color.END)
|
||||
|
||||
required_arg_group.add_argument('--minLen','-min', default=200, type=int,
|
||||
help=color.BOLD+color.GREEN+" Minimum number of base pairs for contigs\n (default = 200)"+color.END)
|
||||
required_arg_group.add_argument('--maxLen','-max', default=15000, type=int,
|
||||
help=color.BOLD+color.GREEN+" Minimum number of base pairs for contigs\n (default = 15000)"+color.END)
|
||||
|
||||
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
|
||||
|
||||
optional_arg_group.add_argument('--spades','-spades', action='store_true',
|
||||
help=color.BOLD+color.GREEN+'rnaSPAdes transcriptome assembly\n'+color.END)
|
||||
|
||||
optional_arg_group.add_argument('--genbank','-gb', action='store_true',
|
||||
help=color.BOLD+color.GREEN+'Assembly from Genbank\n (Will include Accession Number in'\
|
||||
' contig name)\n'+color.END)
|
||||
|
||||
optional_arg_group.add_argument('-author', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Print author contact information\n'+color.END)
|
||||
|
||||
|
||||
if len(sys.argv[1:]) == 0:
|
||||
print (parser.description)
|
||||
print ('\n')
|
||||
sys.exit()
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
quit_eval = return_more_info(args)
|
||||
if quit_eval > 0:
|
||||
sys.exit()
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
return args
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###------------------------------- Script Usage Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def usage_msg():
|
||||
return color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 1_ContigFiltStats.py'\
|
||||
' --input_file ../Op_me_Xxma_rnaSPAdes_scaffolds_15_05.fasta --output_file '\
|
||||
'Op_me_Xxma --minLen 200 --spades'+color.END
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
|
||||
##########################################################################################
|
||||
|
||||
def return_more_info(args):
|
||||
|
||||
valid_arg = 0
|
||||
|
||||
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
|
||||
' maurerax@gmail.com\n\n'+color.END)
|
||||
|
||||
if args.author == True:
|
||||
print (author)
|
||||
valid_arg += 1
|
||||
|
||||
if args.input_file != None:
|
||||
if os.path.isfile(args.input_file) != False:
|
||||
if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
else:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
if args.output_file == None:
|
||||
valid_arg += 1
|
||||
|
||||
return valid_arg
|
||||
|
||||
###########################################################################################
|
||||
###--------------------------- Does the Inital Folder Prep -----------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def prep_folders(args):
|
||||
Home_folder_name = args.output_file
|
||||
|
||||
if os.path.isdir(args.output_file) != True:
|
||||
os.system('mkdir ' + args.output_file)
|
||||
|
||||
if os.path.isdir(args.output_file + '/OriginalFasta/') != True:
|
||||
os.system('mkdir ' + args.output_file +'/OriginalFasta/')
|
||||
|
||||
if os.path.isdir(args.output_file + '/SizeFiltered/') != True:
|
||||
os.system('mkdir ' + args.output_file +'/SizeFiltered/')
|
||||
|
||||
if os.path.isdir('/'.join(args.output_file.split('/')[:-1]) + '/XlaneBleeding/') != True:
|
||||
os.system('mkdir ' + '/'.join(args.output_file.split('/')[:-1]) + '/XlaneBleeding/')
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###---------- Renames the Contigs, Writes them out, and Calculates Basic Info ----------###
|
||||
###########################################################################################
|
||||
|
||||
def rename_Transcriptome(args):
|
||||
|
||||
home_folder = args.output_file + '/SizeFiltered/'
|
||||
|
||||
print (color.BOLD+'\n\nPrepping '+color.GREEN+args.input_file.split('/')[-1]+color.END)
|
||||
|
||||
inFasta = [i for i in SeqIO.parse(args.input_file,'fasta') if len(i.seq) >= args.minLen and len(i.seq) <= args.maxLen]
|
||||
inFasta.sort(key=lambda seq_rec: -len(seq_rec.seq))
|
||||
|
||||
renamed_seqs = []
|
||||
seq_code_dict = {}
|
||||
|
||||
count = 1
|
||||
|
||||
seq_name_start = 'Contig'
|
||||
|
||||
if args.genbank == True:
|
||||
for seq_rec in inFasta:
|
||||
seq_code_dict.setdefault(seq_rec.id,[]).append(seq_rec.id.split('_')[-1].split('.')[0]+'_Contig_'+str(count)+'_Len'+str(len(seq_rec.seq)))
|
||||
seq_code_dict.setdefault(seq_rec.id,[]).append(str(seq_rec.seq).upper())
|
||||
renamed_seqs.append('>'+seq_rec.id.split('_')[-1].split('.')[0]+'_Contig_'+str(count)+'_Len'+str(len(seq_rec.seq))+'\n'+str(seq_rec.seq).upper())
|
||||
count += 1
|
||||
elif args.spades == True:
|
||||
for seq_rec in inFasta:
|
||||
seq_code_dict.setdefault(seq_rec.description,[]).append(seq_name_start+'_'+str(count)+'_Len'+str(len(seq_rec.seq))+'_Cov'+str(int(round(float(seq_rec.description.split('_')[-3])))))
|
||||
seq_code_dict.setdefault(seq_rec.description,[]).append(seq_rec.description.split('_')[5])
|
||||
seq_code_dict.setdefault(seq_rec.description,[]).append(str(seq_rec.seq).upper())
|
||||
renamed_seqs.append('>'+seq_name_start+'_'+str(count)+'_Len'+str(len(seq_rec.seq))+'_Cov'+str(int(round(float(seq_rec.description.split('_')[-3]))))+'\n'+str(seq_rec.seq).upper())
|
||||
count += 1
|
||||
else:
|
||||
for seq_rec in inFasta:
|
||||
seq_code_dict.setdefault(seq_rec.description,[]).append(seq_name_start+'_'+str(count)+'_Len'+str(len(seq_rec.seq)))
|
||||
seq_code_dict.setdefault(seq_rec.description,[]).append(str(seq_rec.seq).upper())
|
||||
renamed_seqs.append('>'+seq_name_start+'_'+str(count)+'_Len'+str(len(seq_rec.seq))+'\n'+str(seq_rec.seq).upper())
|
||||
count += 1
|
||||
|
||||
print (color.BOLD+'\n\nThere are '+color.RED+str(len(renamed_seqs))+' contigs > '+str(args.minLen)\
|
||||
+color.END+color.BOLD+' in '+color.DARKCYAN+args.input_file.split('/')[-1]+color.END)
|
||||
|
||||
with open(home_folder + args.output_file.split('/')[-1] + '.' + str(args.minLen)+'bp.fasta','w+') as w:
|
||||
for seq in renamed_seqs:
|
||||
w.write(seq+'\n')
|
||||
|
||||
if args.spades != True:
|
||||
with open(home_folder + args.output_file.split('/')[-1] + '.' + str(args.minLen) + 'bp.SeqCodes.tsv','w+') as w:
|
||||
w.write('Original Name\tNew Name\tSeq Length\t Seq GC\n')
|
||||
for k, v in seq_code_dict.items():
|
||||
w.write(k+'\t'+v[0]+'\t'+str(len(v[1]))+'\t'+str(GC(v[1]))+'\n')
|
||||
else:
|
||||
with open(home_folder + args.output_file.split('/')[-1] + '.' + str(args.minLen) + 'bp.SeqCodes.tsv','w+') as w:
|
||||
w.write('Original Name\tNew Name\tSeq Length\tSeq GC\tSeq Coverage\n')
|
||||
for k, v in seq_code_dict.items():
|
||||
w.write(k+'\t'+v[0]+'\t'+str(len(v[2]))+'\t'+str(GC(v[2]))+'\t'+str(v[1])+'\n')
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------------------------- Cleans Up the PostAssembly Folder ------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def clean_up(args):
|
||||
|
||||
os.system('cp ' + args.input_file + ' ' + args.output_file + '/OriginalFasta/' + args.input_file.split('/')[-1].replace('.fasta', '.Original.fasta'))
|
||||
|
||||
os.system('cp ' + args.output_file + '/SizeFiltered/' + args.output_file.split('/')[-1] + '.' + str(args.minLen)+'bp.fasta ' + '/'.join(args.output_file.split('/')[:-1]) + '/XlaneBleeding/')
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------------------------------- Next Script Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def next_script(args):
|
||||
|
||||
print (color.BOLD+'\n\nLook for '+color.DARKCYAN+args.output_file+'.'+str(args.minLen)+\
|
||||
'bp.fasta'+color.END+color.BOLD+'\n\n')
|
||||
|
||||
print ('Next Script is: 'color.GREEN+' 2a_Idenfify_rRNA.py followed by 2b_Identify_Bact.py'\
|
||||
+color.END+color.BOLD+')\n\n'+ color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
|
||||
##########################################################################################
|
||||
|
||||
def main():
|
||||
|
||||
args = check_args()
|
||||
|
||||
prep_folders(args)
|
||||
|
||||
temp = rename_Transcriptome(args)
|
||||
|
||||
clean_up(args)
|
||||
|
||||
next_script(args)
|
||||
|
||||
main()
|
||||
|
||||
153
PTL1/Transcriptomes/Scripts/1b_CrossPlateContamination.py
Normal file
153
PTL1/Transcriptomes/Scripts/1b_CrossPlateContamination.py
Normal file
@ -0,0 +1,153 @@
|
||||
#!/usr/bin/python3
|
||||
|
||||
__author__ = 'Jean-David Grattepanche'
|
||||
__version__ = 'ACL fixed sequence naming issue Feb 23, 2022'
|
||||
__email__ = 'jeandavid.grattepanche@gmail.com'
|
||||
|
||||
|
||||
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
import string
|
||||
import os.path
|
||||
from Bio import SeqIO
|
||||
from sys import argv
|
||||
listtaxa=[]
|
||||
toosim = 0.99
|
||||
seqcoverage = 0.7
|
||||
|
||||
def merge_files(folder, minlen, conspecific_names):
|
||||
mergefile = open('/'.join(folder.split('/')[:-1]) + '/forclustering.fasta','w+')
|
||||
print("MERGE following files")
|
||||
for taxafile in os.listdir(folder):
|
||||
if taxafile[0] != ".":
|
||||
listtaxa.append(taxafile.split('.' + str(minlen) + 'bp')[0])
|
||||
|
||||
for line2 in SeqIO.parse(folder+'/'+taxafile, 'fasta'):
|
||||
if int(len(str(line2.seq))) >= int(minlen):
|
||||
mergefile.write('>'+taxafile.split('.' + str(minlen) + 'bp')[0] + '_' + line2.description + '\n' + str(line2.seq) + '\n')
|
||||
else:
|
||||
print(line2, " is too short")
|
||||
mergefile.close()
|
||||
|
||||
sort_cluster(folder, listtaxa, minlen, conspecific_names)
|
||||
|
||||
|
||||
def sort_cluster(folder, listtaxa, minlen, conspecific_names):
|
||||
if not os.path.exists('/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/'):
|
||||
os.makedirs('/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/')
|
||||
|
||||
fastalist = []; fastadict= {}
|
||||
conspecific_names_dict = { line.split('\t')[0] : line.split('\t')[1].strip() for line in open(conspecific_names) }
|
||||
|
||||
print('CREATE a dictionnary of sequences')
|
||||
for record in SeqIO.parse(open('/'.join(folder.split('/')[:-1]) + '/forclustering.fasta','r'),'fasta'):
|
||||
if record.id[:10] not in conspecific_names_dict:
|
||||
print('\nError in cross-plate contamination assessment: the ten-digit code ' + record.id[:10] + ' is not found in the conspecific names file. Please check that this file is correct and try again.\n')
|
||||
quit()
|
||||
|
||||
IDL = record.description, int(record.description.split('_Cov')[1].replace('\n',''))
|
||||
fastalist.append(IDL)
|
||||
fastadict[record.description] = record.seq
|
||||
|
||||
print("CLUSTER sequences that overlap at least 70%")
|
||||
os.system('vsearch --cluster_fast ' + '/'.join(folder.split('/')[:-1]) + '/forclustering.fasta --strand both --query_cov '+str(seqcoverage)+' --id '+str(toosim) +' --uc ' + '/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/results_forclustering.uc --threads 60' )
|
||||
|
||||
#input2 = open('/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/results_forclustering.uc','r')
|
||||
#input2 = open('/Output_PostClusterBackup/clusteringresults_vsearch/results_forclustering.uc','r')
|
||||
cluster_output = '/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/results_forclustering.uc'
|
||||
out2 = open('/'.join(folder.split('/')[:-1]) + '/fastatokeep.fas','w+')
|
||||
out3 = open('/'.join(folder.split('/')[:-1]) + '/fastatoremoved.fas','w+')
|
||||
out4 = open('/'.join(folder.split('/')[:-1]) + '/fastatoremoved.uc','w+')
|
||||
print("CREATE a dictionary with clustering results")
|
||||
clustdict= {}; clustlist = []; allseq = []; clustline = {}; list= []; i=0; j=0
|
||||
for row2 in open(cluster_output, 'r'):
|
||||
if row2.split('\t')[0] == 'C' and int(row2.split('\t')[2]) < 2: # keep all unique sequences
|
||||
out2.write('>'+row2.split('\t')[8] + '\n' + str(fastadict[row2.split('\t')[8]])+ '\n')
|
||||
if row2.split('\t')[0] == 'C' and int(row2.split('\t')[2]) > 1: # create another dictionary
|
||||
# print("create dico: ", row2.split('\t')[8])
|
||||
clustdict.setdefault(row2.split('\t')[8], [row2.split('\t')[8]])
|
||||
clustlist.append(row2.split('\t')[8])
|
||||
|
||||
for row3 in open(cluster_output, 'r'):
|
||||
if row3.split('\t')[0] == 'H':
|
||||
# print("add dico: ", row3.split('\t')[9], row3.split('\t')[8])
|
||||
clustdict[row3.split('\t')[9].replace('\n','')].append(row3.split('\t')[8].replace('\n',''))
|
||||
clustline[row3.split('\t')[8].replace('\n','')] = row3.replace('\n','')
|
||||
clustline[row3.split('\t')[9].replace('\n','')] = row3.replace('\n','')
|
||||
|
||||
print("PARSE the clusters: keep seed sequences (highest coverage) for each cluster")
|
||||
for clust in clustlist:
|
||||
list = sorted(clustdict[clust], reverse = True, key=lambda x: int(x.split('_Cov')[1]))
|
||||
master = list[0]
|
||||
Covmaster = int(list[0].split('_Cov')[1])
|
||||
master8dig = ('_').join(list[0].split('_')[0:3])[:-2]
|
||||
for seq in list:
|
||||
clustered = seq.replace('\n','')
|
||||
Covclustered = int(clustered.split('_Cov')[1])
|
||||
clustered8dig = ('_').join(clustered.split('_')[0:3])[:-2]
|
||||
# print(master8dig, Covmaster, '//', clustered8dig, Covclustered)
|
||||
if float(Covmaster/Covclustered) < 10:
|
||||
out2.write('>'+clustered + '\n' + str(fastadict[clustered])+ '\n')
|
||||
i +=1
|
||||
elif conspecific_names_dict[master[:10]] == conspecific_names_dict[clustered[:10]]:
|
||||
out2.write('>'+clustered + '\n' + str(fastadict[clustered])+ '\n')
|
||||
i +=1
|
||||
elif Covclustered >= 50:
|
||||
out2.write('>'+clustered + '\n' + str(fastadict[clustered])+ '\n')
|
||||
i +=1
|
||||
else:
|
||||
j +=1
|
||||
out4 = open('/'.join(folder.split('/')[:-1]) + '/fastatoremoved.uc','a')
|
||||
out3.write('>'+clustered + '\n' + str(fastadict[clustered])+ '\n')
|
||||
print(clustline[clustered],'\t' , master )
|
||||
out4.write(clustline[clustered]+ '\t' + master + '\n')
|
||||
out4.close()
|
||||
|
||||
|
||||
print('there are ', str(i),' sequences kept and ',str(j),' sequences removed')
|
||||
|
||||
out2.close()
|
||||
out3.close()
|
||||
|
||||
splittaxa(folder, listtaxa, minlen)
|
||||
|
||||
def splittaxa(folder, listtaxa, minlen):
|
||||
for taxa in listtaxa:
|
||||
tax_sf_path = '/'.join(folder.split('/')[:-1]) + '/' + taxa + '/SizeFiltered/'
|
||||
os.system('mv ' + tax_sf_path + taxa + '.' + str(minlen) + 'bp.fasta' + ' ' + tax_sf_path + taxa + '.' + str(minlen) + 'bp.preXPlate.fasta')
|
||||
|
||||
with open(tax_sf_path + taxa + '.' + str(minlen) + 'bp.fasta','w') as o:
|
||||
for kept in SeqIO.parse('/'.join(folder.split('/')[:-1]) + '/fastatokeep.fas','fasta'):
|
||||
if taxa in kept.description:
|
||||
o.write('>' + kept.description.replace(taxa + '_', '') + '\n' + str(kept.seq) + '\n')
|
||||
|
||||
os.system('mv ' + '/'.join(folder.split('/')[:-1]) + '/fastatokeep.fas ' + '/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/')
|
||||
os.system('mv ' + '/'.join(folder.split('/')[:-1]) + '/fastatoremoved.fas ' + '/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/')
|
||||
os.system('mv ' + '/'.join(folder.split('/')[:-1]) + '/fastatoremoved.uc ' + '/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/')
|
||||
os.system('mv ' + '/'.join(folder.split('/')[:-1]) + '/forclustering.fasta ' + '/'.join(folder.split('/')[:-1]) + '/clusteringresults_vsearch/')
|
||||
|
||||
def main():
|
||||
|
||||
script, folder, minlen, conspecific_names = argv
|
||||
merge_files(folder, minlen, conspecific_names)
|
||||
|
||||
main()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
285
PTL1/Transcriptomes/Scripts/2a_Identify_rRNA.py
Normal file
285
PTL1/Transcriptomes/Scripts/2a_Identify_rRNA.py
Normal file
@ -0,0 +1,285 @@
|
||||
#!/usr/bin/env python3.5
|
||||
|
||||
##__Updated__: 18_08_2017
|
||||
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
|
||||
##__Usage__: python 2a_remove_rDNA.py --help
|
||||
|
||||
##########################################################################################
|
||||
## This script is intended to identify and isolate SSU/LSU sequences ##
|
||||
## Prior to running this script, ensure the following: ##
|
||||
## ##
|
||||
## 1. You have assembled your transcriptome and COPIED the 'assembly' file ##
|
||||
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ##
|
||||
## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py ##
|
||||
## 3. Have the Databases set up correctly (e.g. with BLAST or Diamond) and in their ##
|
||||
## respective folders! See the manual if you need help ##
|
||||
## ##
|
||||
## COMMAND Example Below ##
|
||||
## ##
|
||||
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
|
||||
## ##
|
||||
## Next Script(s) to Run: ##
|
||||
## 2b_removeBact.py ##
|
||||
## ##
|
||||
##########################################################################################
|
||||
|
||||
|
||||
import argparse, os, sys
|
||||
from argparse import RawTextHelpFormatter,SUPPRESS
|
||||
from Bio import SeqIO
|
||||
|
||||
|
||||
#------------------------------ Colors For Print Statements ------------------------------#
|
||||
|
||||
class color:
|
||||
PURPLE = '\033[95m'
|
||||
CYAN = '\033[96m'
|
||||
DARKCYAN = '\033[36m'
|
||||
ORANGE = '\033[38;5;214m'
|
||||
BLUE = '\033[94m'
|
||||
GREEN = '\033[92m'
|
||||
YELLOW = '\033[93m'
|
||||
RED = '\033[91m'
|
||||
BOLD = '\033[1m'
|
||||
UNDERLINE = '\033[4m'
|
||||
END = '\033[0m'
|
||||
|
||||
|
||||
#------------------------------- Main Functions of Script --------------------------------#
|
||||
|
||||
###########################################################################################
|
||||
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
|
||||
###########################################################################################
|
||||
|
||||
def check_args():
|
||||
|
||||
parser = argparse.ArgumentParser(description=
|
||||
color.BOLD+'\nThis script will remove '+color.RED+'rDNA contigs (both SSU and LSU)'+color.END\
|
||||
+color.BOLD+'\nfrom your Assembly using a set of '+color.RED+'SSU/LSU rDNAs '+color.END\
|
||||
+color.BOLD+'from diverse\n'+color.ORANGE+'Eukaryotes, Bacteria and Archaea'+color.END\
|
||||
+color.BOLD+'.'+color.END+usage_msg(), usage=SUPPRESS,formatter_class=RawTextHelpFormatter)
|
||||
|
||||
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
|
||||
|
||||
required_arg_group.add_argument('--input_file','-in', action='store',
|
||||
help=color.BOLD+color.GREEN+"Fasta file of Nucleotide sequences"+color.END)
|
||||
required_arg_group.add_argument('--databases','-d', action='store',
|
||||
help=color.BOLD+color.GREEN+"Path to databases"+color.END)
|
||||
|
||||
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
|
||||
optional_arg_group.add_argument('--threads','-t', default='2',
|
||||
help=color.BOLD+color.GREEN+' Number of threads to use for BLAST\n (default = 2)\n'+color.END)
|
||||
optional_arg_group.add_argument('-author', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Print author contact information\n'+color.END)
|
||||
|
||||
if len(sys.argv[1:]) == 0:
|
||||
print (parser.description)
|
||||
print ('\n')
|
||||
sys.exit()
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
quit_eval = return_more_info(args)
|
||||
if quit_eval > 0:
|
||||
sys.exit()
|
||||
|
||||
return args
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###------------------------------- Script Usage Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def usage_msg():
|
||||
return color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 2a_remove_rRNA.py --input_file ../Op_me_Xxma_rna.200bp.fasta'+color.END
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
|
||||
##########################################################################################
|
||||
|
||||
def return_more_info(args):
|
||||
|
||||
valid_arg = 0
|
||||
|
||||
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
|
||||
' maurerax@gmail.com\n\n'+color.END)
|
||||
|
||||
if args.author == True:
|
||||
print (author)
|
||||
valid_arg += 1
|
||||
|
||||
if args.input_file != None:
|
||||
if os.path.isfile(args.input_file) != False:
|
||||
if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
elif args.input_file.endswith('bp.fasta') != True:
|
||||
print (color.BOLD + '\n\nCheck that you are giving an appropriately Named/Processed'\
|
||||
'Fasta file(s) to this script\n\nNOTE that this script CURRENTLY expects your'\
|
||||
' Fasta files to contain '+color.RED+ '"rna"'+color.END+color.BOLD+' in \nthe Fasta File'\
|
||||
' Name and must end with ' + color.RED + '"bp.fasta"\n\n' + color.END)
|
||||
valid_arg += 1
|
||||
else:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
if os.path.isdir(args.databases + '/db_BvsE') != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
|
||||
+color.ORANGE+'db_BvsE Folder!\n\n'+color.END+color.BOLD+'Ensure that this folder '\
|
||||
'can be found in the main '+color.ORANGE+'Databases Folder'+color.END+color.BOLD\
|
||||
+'\n\nThen try once again.')
|
||||
valid_arg += 1
|
||||
elif os.path.isfile(args.databases + '/db_BvsE/SSULSUdb.nhr') != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
|
||||
'BLAST+ formatted '+color.ORANGE+'SSU-LSU databases!\n\n'+color.END+color.BOLD+\
|
||||
'Ensure that they can be found in the '+color.ORANGE+'db_BvsE folder'+color.END+\
|
||||
color.BOLD+',\nwhich can be found in the main '+color.ORANGE+'Databases Folder'+\
|
||||
color.END+color.BOLD+'\n\nThen try once again.')
|
||||
valid_arg += 1
|
||||
|
||||
return valid_arg
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------------- Does the Inital Folder Prep -----------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def prep_folders(args):
|
||||
code = args.input_file.split('/')[-1][:10]
|
||||
rRNA_folder = args.input_file.split('SizeFiltered')[0] + '/rRNA_Removal/'
|
||||
|
||||
if os.path.isdir(rRNA_folder) != True:
|
||||
os.system('mkdir '+rRNA_folder)
|
||||
|
||||
return code, rRNA_folder
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###---------------------- Uses BLAST to identify SSU/LSU Sequences ---------------------###
|
||||
###########################################################################################
|
||||
|
||||
def remove_rDNA(args, rRNA_folder):
|
||||
|
||||
blast_output = rRNA_folder + args.input_file.split('/')[-1].split('.200bp.fasta')[0]+'_allSSULSUresults.tsv'
|
||||
|
||||
BLASTN_cmd = 'blastn -query ' + args.input_file + ' -evalue 1e-10 -max_target_seqs 1 -outfmt'\
|
||||
' 6 -db ' + args.databases + '/db_BvsE/SSULSUdb -num_threads 2 -out ' + blast_output
|
||||
|
||||
print (color.BOLD+'\n\nBLASTing '+color.DARKCYAN+args.input_file.split('/')[-1]+color.END\
|
||||
+color.BOLD+ ' against the rDNA database\n\n' + color.END)
|
||||
|
||||
os.system(BLASTN_cmd)
|
||||
|
||||
rDNA_Hits = list(set([i.split('\t')[0] for i in open(blast_output).readlines()]))
|
||||
|
||||
print (color.BOLD+'Binning Sequences from '+color.DARKCYAN+args.input_file.split('/')[-1]\
|
||||
+color.END+color.BOLD+'\nas rDNA OR Potentially Protein-Coding\n\n'+color.END)
|
||||
|
||||
no_SSULSU = 0
|
||||
with_SSULSU = 0
|
||||
|
||||
inFasta = [seq_rec for seq_rec in SeqIO.parse(args.input_file,'fasta')]
|
||||
|
||||
with open(rRNA_folder + args.input_file.split('/')[-1].split('.200bp.fasta')[0]+'_rRNAseqs.fasta','w+') as HasSSU:
|
||||
for seq_rec in inFasta:
|
||||
if seq_rec.description in rDNA_Hits:
|
||||
HasSSU.write('>'+seq_rec.description+'\n'+str(seq_rec.seq)+'\n')
|
||||
with_SSULSU += 1
|
||||
|
||||
with open(rRNA_folder + args.input_file.split('/')[-1].split('.200bp.fasta')[0] + '_NorRNAseqs.fasta','w+') as NoSSU:
|
||||
for seq_rec in inFasta:
|
||||
if seq_rec.description not in rDNA_Hits:
|
||||
NoSSU.write('>'+seq_rec.description+'\n'+str(seq_rec.seq)+'\n')
|
||||
no_SSULSU += 1
|
||||
|
||||
return str(with_SSULSU), str(no_SSULSU)
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------------- Updates Log of SSU/LSU Removal --------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def update_log(args, with_SSU, no_SSU):
|
||||
|
||||
if os.path.isdir('../PostAssembly_Logs/') != True:
|
||||
os.system('mkdir ../PostAssembly_Logs/')
|
||||
|
||||
print (color.BOLD+'There are '+color.RED+with_SSU+' rRNA contigs'+color.END+color.BOLD\
|
||||
+' and '+color.PURPLE+no_SSU+' Putative Protein-coding contigs'+color.END+color.BOLD\
|
||||
+'\nin '+color.DARKCYAN+args.input_file.split('/')[1]+'\n' + color.END)
|
||||
|
||||
with open('../PostAssembly_Logs/'+args.input_file.split('/')[1].split('.fas')[0]+'.Log.txt','a') as LogFile:
|
||||
LogFile.write('rDNA Contigs\t'+with_SSU+'\tn/a\tn/a\n')
|
||||
LogFile.write('Non-rDNA Contigs\t'+no_SSU+'\tn/a\tn/a\n')
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------------------------------- Next Script Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def next_script(args):
|
||||
|
||||
print (color.BOLD+'\nLook for '+color.ORANGE+args.input_file.split('/')[1].split('_rna')[0]\
|
||||
+ '_NorRNAseqs.fasta'+color.END+color.BOLD+'\nin the '+args.input_file.split('/')[1].split('_rna')[0]\
|
||||
+' Folder\n\n' + color.END)
|
||||
print (color.BOLD + 'Next Script is: ' + color.GREEN + '2b_remove_Bact.py\n\n'+ color.END)
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------------------------- Cleans Up the PostAssembly Folder ------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def clean_up(args):
|
||||
|
||||
home_folder = args.input_file.split('SizeFiltered')[0]
|
||||
|
||||
os.system('cp ' + home_folder + 'rRNA_Removal/*NorRNA*.fasta ' + home_folder)
|
||||
|
||||
##########################################################################################
|
||||
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
|
||||
##########################################################################################
|
||||
|
||||
def main():
|
||||
|
||||
args = check_args()
|
||||
|
||||
code, rRNA_folder = prep_folders(args)
|
||||
|
||||
with_SSULSU, no_SSULSU = remove_rDNA(args, rRNA_folder)
|
||||
|
||||
#update_log(args, with_SSULSU, no_SSULSU)
|
||||
|
||||
clean_up(args)
|
||||
|
||||
next_script(args)
|
||||
|
||||
main()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
410
PTL1/Transcriptomes/Scripts/2b_Identify_Proks.py
Normal file
410
PTL1/Transcriptomes/Scripts/2b_Identify_Proks.py
Normal file
@ -0,0 +1,410 @@
|
||||
#!/usr/bin/env python3.5
|
||||
|
||||
##__Updated__: 18_08_2017
|
||||
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
|
||||
##__Usage__: python 2b_remove_Bact.py --help
|
||||
|
||||
##########################################################################################
|
||||
## This script is intended to identify and isolate SSU/LSU sequences ##
|
||||
## Prior to running this script, ensure the following: ##
|
||||
## ##
|
||||
## 1. You have assembled your transcriptome and COPIED the 'assembly' file ##
|
||||
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ##
|
||||
## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py ##
|
||||
## 3. Have the Databases set up correctly (e.g. with BLAST or Diamond) and in their ##
|
||||
## respective folders! See the manual if you need help ##
|
||||
## 4. Run removeSSU.py on your Fasta file ##
|
||||
## ##
|
||||
## COMMAND Example Below ##
|
||||
## ##
|
||||
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
|
||||
## ##
|
||||
## Next Script(s) to Run: ##
|
||||
## 3_CountOGsDiamond.py ##
|
||||
## ##
|
||||
##########################################################################################
|
||||
|
||||
|
||||
import argparse, os, sys
|
||||
from argparse import RawTextHelpFormatter,SUPPRESS
|
||||
from distutils import spawn
|
||||
from Bio import SeqIO
|
||||
|
||||
|
||||
#------------------------------ Colors For Print Statements ------------------------------#
|
||||
class color:
|
||||
PURPLE = '\033[95m'
|
||||
CYAN = '\033[96m'
|
||||
DARKCYAN = '\033[36m'
|
||||
ORANGE = '\033[38;5;214m'
|
||||
BLUE = '\033[94m'
|
||||
GREEN = '\033[92m'
|
||||
YELLOW = '\033[93m'
|
||||
RED = '\033[91m'
|
||||
BOLD = '\033[1m'
|
||||
UNDERLINE = '\033[4m'
|
||||
END = '\033[0m'
|
||||
|
||||
|
||||
#------------------------------- Main Functions of Script --------------------------------#
|
||||
|
||||
###########################################################################################
|
||||
###---------------------------- UPDATE DIAMOND PATH BELOW! -----------------------------###
|
||||
###########################################################################################
|
||||
## IF Diamond is IN YOUR PATH then no updating is needed...
|
||||
|
||||
def check_diamond_path():
|
||||
|
||||
diamond_path = ''
|
||||
|
||||
if diamond_path == '':
|
||||
diamond_path = spawn.find_executable("diamond")
|
||||
#diamond_path = '/path/to/diamond'
|
||||
else:
|
||||
pass
|
||||
|
||||
if diamond_path == None:
|
||||
print (color.BOLD + '\n\nPlease open this script and check that you have included'\
|
||||
+' the PATH to the'+color.BLUE+' "Diamond" '+color.END+color.BOLD+'executable.\n\n'+color.END)
|
||||
print (color.BOLD+color.BLUE+'LOOK FOR:\n\n'+color.RED\
|
||||
+'#------------------------------ UPDATE DIAMOND PATH BELOW! -------------------------------#'\
|
||||
+color.BLUE+'\n\nThis is somewhere around lines 50 - 80...\n\n'+color.END)
|
||||
|
||||
sys.exit()
|
||||
else:
|
||||
pass
|
||||
|
||||
return diamond_path
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
|
||||
###########################################################################################
|
||||
|
||||
def check_args():
|
||||
|
||||
parser = argparse.ArgumentParser(description=
|
||||
color.BOLD + '\nThis script will categorize Contigs as'+color.ORANGE+' STRONGLY '+color.END\
|
||||
+color.BOLD+color.RED+'Eukaryotic \nOR Prokaryotic'+color.END+color.BOLD+' using a set of Proteins'\
|
||||
' from diverse\n'+color.ORANGE+'Eukaryotes, Bacteria and Archaea'+color.END\
|
||||
+color.BOLD+'.'+color.END+usage_msg(), usage=SUPPRESS,formatter_class=RawTextHelpFormatter)
|
||||
|
||||
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
|
||||
|
||||
required_arg_group.add_argument('--input_file','-in', action='store',
|
||||
help=color.BOLD+color.GREEN+'Fasta file of Nucleotide sequences (with rRNAs removed)'+color.END)
|
||||
required_arg_group.add_argument('--databases','-d', action='store',
|
||||
help=color.BOLD+color.GREEN+"Path to databases"+color.END)
|
||||
|
||||
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
|
||||
optional_arg_group.add_argument('-author', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Print author contact information\n'+color.END)
|
||||
|
||||
if len(sys.argv[1:]) == 0:
|
||||
print (parser.description)
|
||||
print ('\n')
|
||||
sys.exit()
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
quit_eval = return_more_info(args)
|
||||
if quit_eval > 0:
|
||||
sys.exit()
|
||||
|
||||
return args
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###------------------------------- Script Usage Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def usage_msg():
|
||||
return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 2b_remove_Bact.py --input_file'\
|
||||
' ../Op_me_Xxma/Op_me_Xxma_NorRNAseqs.fasta'+color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
|
||||
##########################################################################################
|
||||
|
||||
def return_more_info(args):
|
||||
|
||||
valid_arg = 0
|
||||
|
||||
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
|
||||
' maurerax@gmail.com\n\n'+color.END)
|
||||
|
||||
if args.author == True:
|
||||
print (author)
|
||||
valid_arg += 1
|
||||
|
||||
print(args.input_file)
|
||||
|
||||
if args.input_file != None:
|
||||
if os.path.isfile(args.input_file) != False:
|
||||
if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
elif args.input_file.endswith('NorRNAseqs.fasta') != True:
|
||||
print (color.BOLD+'\n\nInvalid Fasta File! Only Fasta Files that were processed'\
|
||||
' with '+color.GREEN+'2a_remove_rRNA.py '+color.END+color.BOLD+'are valid\n\n'\
|
||||
'However, to bypass that issue, Fasta Files MUST end with '+color.CYAN+\
|
||||
'"NorRNAseqs.fas"\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
else:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
if os.path.isdir(args.databases + '/db_BvsE') != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
|
||||
+color.ORANGE+'db_BvsE Folder!\n\n'+color.END+color.BOLD+'Ensure that this folder '\
|
||||
'can be found in the main '+color.ORANGE+'Databases Folder'+color.END+color.BOLD\
|
||||
+'\n\nThen try once again.')
|
||||
valid_arg += 1
|
||||
elif os.path.isfile(args.databases + '/db_BvsE/eukout.dmnd') != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
|
||||
'Diamond formatted '+color.ORANGE+'Eukaryotic Protein database!\n\n'+color.END+color.BOLD+\
|
||||
'Ensure that it can be found in the '+color.ORANGE+'db_BvsE folder'+color.END+\
|
||||
color.BOLD+',\nwhich can be found in the main '+color.ORANGE+'Databases Folder'+\
|
||||
color.END+color.BOLD+'\n\nThen try once again.'+color.END)
|
||||
elif os.path.isfile(args.databases + '/db_BvsE/micout.dmnd') != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
|
||||
'Diamond formatted '+color.ORANGE+'Bacterial/Archaeal Protein database!\n\n'+color.END+color.BOLD+\
|
||||
'Ensure that it can be found in the '+color.ORANGE+'db_BvsE folder'+color.END+\
|
||||
color.BOLD+',\nwhich can be found in the main '+color.ORANGE+'Databases Folder'+\
|
||||
color.END+color.BOLD+'\n\nThen try once again.'+color.END)
|
||||
|
||||
valid_arg += 1
|
||||
|
||||
return valid_arg
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------------- Does the Inital Folder Prep -----------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def prep_folders(args):
|
||||
|
||||
BvE_folder = '/'.join(args.input_file.split('/')[:-1]) + '/BvE/'
|
||||
|
||||
if os.path.isdir(BvE_folder) != True:
|
||||
os.system('mkdir '+BvE_folder)
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###---------------- Runs Diamond on Bact and Euk small RefSeq Databases ----------------###
|
||||
###########################################################################################
|
||||
|
||||
def ublast_BvE(args, diamond_path):
|
||||
|
||||
BvE_folder = '/'.join(args.input_file.split('/')[:-1]) + '/BvE/'
|
||||
mic_output = args.input_file.split('/')[-1]+'micresults.'
|
||||
euk_output = args.input_file.split('/')[-1]+'eukresults.'
|
||||
|
||||
print(color.BOLD+'\n\n"BLAST"-ing against PROK database using DIAMOND: ' + color.DARKCYAN + 'micout.dmnd' + color.END + '\n\n')
|
||||
|
||||
Prok_diamond_cmd = diamond_path + ' blastx -q ' + args.input_file + ' --max-target-seqs 1 -d ' + args.databases + '/db_BvsE/micout.dmnd --evalue 1e-5 --threads 60 --outfmt 6 -o ' + BvE_folder + 'allmicresults.tsv'
|
||||
|
||||
os.system(Prok_diamond_cmd)
|
||||
|
||||
print(color.BOLD+'\n\n"BLAST"-ing against EUK database using DIAMOND: ' + color.DARKCYAN + 'eukout.dmnd' + color.END + '\n\n')
|
||||
|
||||
Euk_diamond_cmd = diamond_path + ' blastx -q ' + args.input_file + ' --max-target-seqs 1 -d ' + args.databases + '/db_BvsE/eukout.dmnd --evalue 1e-5 --threads 60 --outfmt 6 -o ' + BvE_folder + 'alleukresults.tsv'
|
||||
|
||||
os.system(Euk_diamond_cmd)
|
||||
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###---------------- Compares Bacterial and Euk Hits for Classification -----------------###
|
||||
###########################################################################################
|
||||
|
||||
def compare_hits(args):
|
||||
|
||||
BvE_folder = '/'.join(args.input_file.split('/')[:-1]) + '/BvE/'
|
||||
|
||||
EukDict = {}
|
||||
ProkDict = {}
|
||||
CompDict = {}
|
||||
|
||||
inFasta = [seq_rec for seq_rec in SeqIO.parse(args.input_file,'fasta')]
|
||||
|
||||
for seq_rec in inFasta:
|
||||
EukDict[seq_rec.description] = ''
|
||||
ProkDict[seq_rec.description] = ''
|
||||
CompDict[seq_rec.description] = []
|
||||
|
||||
inEukHits = [i for i in open(BvE_folder + 'alleukresults.tsv').readlines()]
|
||||
inEukHits.sort(key=lambda x: (float(x.split('\t')[-2]), -int(x.split('\t')[3])))
|
||||
|
||||
inProkHits = [i for i in open(BvE_folder + 'allmicresults.tsv').readlines()]
|
||||
inProkHits.sort(key=lambda x: (float(x.split('\t')[-2]), -int(x.split('\t')[3])))
|
||||
|
||||
for i in inEukHits:
|
||||
if EukDict[i.split('\t')[0]] == '':
|
||||
EukDict[i.split('\t')[0]] = float(i.split('\t')[-2])
|
||||
|
||||
for i in inProkHits:
|
||||
if ProkDict[i.split('\t')[0]] == '':
|
||||
ProkDict[i.split('\t')[0]] = float(i.split('\t')[-2])
|
||||
|
||||
for k in CompDict.keys():
|
||||
if EukDict[k] != '':
|
||||
CompDict[k].append(EukDict[k])
|
||||
else:
|
||||
CompDict[k].append('no hit')
|
||||
if ProkDict[k] != '':
|
||||
CompDict[k].append(ProkDict[k])
|
||||
else:
|
||||
CompDict[k].append('no hit')
|
||||
|
||||
for k, v in CompDict.items():
|
||||
|
||||
### Contigs lacking STRONG Eukaryotic OR Prokaryotic Hits
|
||||
if v[0] == 'no hit' and v[1] == 'no hit':
|
||||
CompDict[k].append('UNDETERMINED')
|
||||
|
||||
### Contigs lacking STRONG Eukaryotic with a Prokaryotic Hit
|
||||
elif v[0] != 'no hit' and v[1] == 'no hit':
|
||||
CompDict[k].append('EUKARYOTIC')
|
||||
|
||||
### Contigs with a Eukaryotic but without a Prokaryotic Hit
|
||||
elif v[0] == 'no hit' and v[1] != 'no hit':
|
||||
CompDict[k].append('PROKARYOTIC')
|
||||
|
||||
### Uses Basic math to determine if contigs with are MORE Eukaryotic than Prokaryotic
|
||||
else:
|
||||
try:
|
||||
prok_euk_ratio = float(v[1])/float(v[0])
|
||||
euk_prok_ratio = float(v[0])/float(v[1])
|
||||
|
||||
if prok_euk_ratio >= 100:
|
||||
CompDict[k].append('EUKARYOTIC')
|
||||
|
||||
elif euk_prok_ratio >= 1000:
|
||||
CompDict[k].append('PROKARYOTIC')
|
||||
|
||||
else:
|
||||
CompDict[k].append('UNDETERMINED')
|
||||
|
||||
except:
|
||||
CompDict[k].append('divide by zero')
|
||||
|
||||
with open(BvE_folder + 'comparisons.txt','w+') as w:
|
||||
for k, v in CompDict.items():
|
||||
w.write(k+':'+':'.join([str(i) for i in v])+'\n')
|
||||
|
||||
BvE_folder = '/'.join(args.input_file.split('/')[:-1]) + '/BvE/'
|
||||
BvE_output_base = BvE_folder+args.input_file.split('/')[-1].split('.fas')[0]
|
||||
|
||||
### Gathers the sequences and categorizes them
|
||||
Euk_Fasta = sorted((seq_rec for seq_rec in inFasta if CompDict[seq_rec.description][-1] == 'EUKARYOTIC'), key=lambda x: -int(len(x.seq)))
|
||||
Prok_Fasta = sorted((seq_rec for seq_rec in inFasta if CompDict[seq_rec.description][-1] == 'PROKARYOTIC'), key=lambda x: -int(len(x.seq)))
|
||||
Und_Fasta = sorted((seq_rec for seq_rec in inFasta if CompDict[seq_rec.description][-1] == 'UNDETERMINED'), key=lambda x: -int(len(x.seq)))
|
||||
Zero_Fasta = sorted((seq_rec for seq_rec in inFasta if CompDict[seq_rec.description][-1] == 'divide by zero'), key=lambda x: -int(len(x.seq)))
|
||||
|
||||
|
||||
### Writes out all of the categorized sequences
|
||||
with open(args.input_file.split('NorRNA')[0] + 'WTA_EPU.fasta', 'w') as epu:
|
||||
with open(BvE_output_base+'.Not_Bact.fasta','w+') as nb:
|
||||
for euk_seq in Euk_Fasta:
|
||||
nb.write('>' + euk_seq.description + '\n' + str(euk_seq.seq) + '\n')
|
||||
epu.write('>' + euk_seq.description + '_E' + '\n' + str(euk_seq.seq) + '\n')
|
||||
|
||||
|
||||
with open(BvE_output_base+'.Bact_Hit.fasta','w+') as pr:
|
||||
for prok_seq in Prok_Fasta:
|
||||
pr.write('>' + prok_seq.description + '\n' + str(prok_seq.seq) + '\n')
|
||||
epu.write('>' + prok_seq.description + '_P' + '\n' + str(prok_seq.seq) + '\n')
|
||||
|
||||
with open(BvE_output_base+'.Undetermined.fasta','w+') as und:
|
||||
for und_seq in Und_Fasta:
|
||||
und.write('>' + und_seq.description + '\n' + str(und_seq.seq) + '\n')
|
||||
epu.write('>' + und_seq.description + '_U' + '\n' + str(und_seq.seq) + '\n')
|
||||
|
||||
if len(Zero_Fasta) != 0:
|
||||
with open(BvE_output_base+'.DivideByZero.fasta','w+') as w:
|
||||
for zero_seq in Zero_Fasta:
|
||||
w.write('>' + zero_seq.description + '\n' + str(zero_seq.seq) + '\n')
|
||||
epu.write('>' + zero_seq.description + '_U' + '\n' + str(zero_seq.seq) + '\n')
|
||||
else:
|
||||
pass
|
||||
|
||||
return str(len(Euk_Fasta)), str(len(Prok_Fasta)), str(len(Und_Fasta))
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###----------------------- Updates Log of Prok vs Euk Comparisons ----------------------###
|
||||
###########################################################################################
|
||||
|
||||
def update_log(args, Euk_Contigs, Prok_Contigs, Und_Contigs):
|
||||
|
||||
if os.path.isdir('../PostAssembly_Logs/') != True:
|
||||
os.system('mkdir ../PostAssembly_Logs/')
|
||||
else:
|
||||
pass
|
||||
|
||||
print (color.BOLD +'\n\nThere are '+color.RED+Prok_Contigs+' Strongly Prokaryotic contigs'+color.END\
|
||||
+color.BOLD+',\n'+color.ORANGE+Euk_Contigs+' Strongly Eukaryotic contigs'+color.END\
|
||||
+color.BOLD+',\nand '+color.PURPLE+Und_Contigs+' Undetermined Contigs\n'+color.END\
|
||||
+color.BOLD+'in '+args.input_file.split('/')[-1]+color.END)
|
||||
|
||||
for Logname in os.listdir(os.curdir+'./PostAssembly_Logs/'):
|
||||
if Logname.startswith(args.input_file.split('/')[-1].split('_No')[0]) and Logname.endswith('Log.txt'): # ACL - ???
|
||||
with open('../PostAssembly_Logs/'+Logname,'a') as Logfilename:
|
||||
Logfilename.write('Prokaryotic Contigs\t'+Prok_Contigs+'\tn/a\tn/a\n')
|
||||
Logfilename.write('Eukaryotic Contigs\t'+Euk_Contigs+'\tn/a\tn/a\n')
|
||||
Logfilename.write('Undetermined Contigs\t'+Und_Contigs+'\tn/a\tn/a\n')
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------------------------------- Next Script Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def next_script(args):
|
||||
|
||||
print (color.BOLD+'\nLook for '+color.DARKCYAN+args.input_file.split('/')[-1]\
|
||||
.split('NorRNA')[0]+'WTA_EPU.fasta'+color.END+color.BOLD+' in the '\
|
||||
+args.input_file.split('/')[1]+' Folder\n\n' + color.END)
|
||||
print (color.BOLD + 'Next Script is: ' + color.GREEN + '3_CountOGsDiamond.py\n\n'+ color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###--------------------- Cleans up the Folder and Moves Final Files -------------------###
|
||||
##########################################################################################
|
||||
|
||||
def clean_up(args):
|
||||
|
||||
home_folder = '/'.join(args.input_file.split('/')[:-1])
|
||||
|
||||
os.system('cp '+home_folder+'/*WTA_EPU.fasta '+home_folder+'/BvE/')
|
||||
os.system('mv '+home_folder+'/*NorRNA*fasta '+home_folder+'/rRNA_Removal/')
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
|
||||
##########################################################################################
|
||||
|
||||
def main():
|
||||
|
||||
usearch_path = check_diamond_path()
|
||||
|
||||
args = check_args()
|
||||
|
||||
prep_folders(args)
|
||||
|
||||
ublast_BvE(args, usearch_path)
|
||||
|
||||
Euk_Contigs, Prok_Contigs, Und_Contigs = compare_hits(args)
|
||||
|
||||
#update_log(args, Euk_Contigs, Prok_Contigs, Und_Contigs)
|
||||
|
||||
clean_up(args)
|
||||
|
||||
next_script(args)
|
||||
|
||||
main()
|
||||
372
PTL1/Transcriptomes/Scripts/3_AssignOGs.py
Normal file
372
PTL1/Transcriptomes/Scripts/3_AssignOGs.py
Normal file
@ -0,0 +1,372 @@
|
||||
#!/usr/bin/env python3.5
|
||||
|
||||
##__Updated__: 16_10_2017
|
||||
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
|
||||
##__Usage__: python 3_CountOGsDiamond.py --help
|
||||
|
||||
|
||||
##########################################################################################
|
||||
## This script is intended to classify the STRONGLY Eukaryotic and UNDETERMINED/UNKNOWN ##
|
||||
## contigs into different OGs (e.g. orthologous gene-families) ##
|
||||
## ##
|
||||
## For more info about the OGs, check out: OrthoMCL.org ##
|
||||
## ##
|
||||
## Prior to running this script, ensure the following: ##
|
||||
## ##
|
||||
## 1. You have assembled your transcriptome and COPIED the 'assembly' file ##
|
||||
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ##
|
||||
## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py ##
|
||||
## 3. Removed SSU/LSU sequences from your Fasta File ##
|
||||
## ##
|
||||
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
|
||||
## ##
|
||||
## Next Script(s) to Run: ##
|
||||
## 4_StopFrequency.py ##
|
||||
## ##
|
||||
##########################################################################################
|
||||
|
||||
import argparse, os, sys, re
|
||||
from argparse import RawTextHelpFormatter,SUPPRESS
|
||||
from distutils import spawn
|
||||
from Bio import SeqIO
|
||||
|
||||
|
||||
#------------------------------ Colors For Print Statements ------------------------------#
|
||||
class color:
|
||||
PURPLE = '\033[95m'
|
||||
CYAN = '\033[96m'
|
||||
DARKCYAN = '\033[36m'
|
||||
ORANGE = '\033[38;5;214m'
|
||||
BLUE = '\033[94m'
|
||||
GREEN = '\033[92m'
|
||||
YELLOW = '\033[93m'
|
||||
RED = '\033[91m'
|
||||
BOLD = '\033[1m'
|
||||
UNDERLINE = '\033[4m'
|
||||
END = '\033[0m'
|
||||
|
||||
#------------------------------- Main Functions of Script --------------------------------#
|
||||
|
||||
###########################################################################################
|
||||
###---------------------------- UPDATE DIAMOND PATH BELOW! -----------------------------###
|
||||
###########################################################################################
|
||||
## IF Diamond is IN YOUR PATH then no updating is needed...
|
||||
|
||||
def check_diamond_path():
|
||||
|
||||
diamond_path = ''
|
||||
|
||||
if diamond_path == '':
|
||||
diamond_path = spawn.find_executable("diamond")
|
||||
#diamond_path = '/path/to/diamond'
|
||||
else:
|
||||
pass
|
||||
|
||||
if diamond_path == None:
|
||||
print (color.BOLD + '\n\nPlease open this script and check that you have included'\
|
||||
+' the PATH to the'+color.BLUE+' "usearch" '+color.END+color.BOLD+'executable.\n\n'+color.END)
|
||||
print (color.BOLD+color.BLUE+'LOOK FOR:\n\n'+color.RED\
|
||||
+'#------------------------------ UPDATE USEARCH PATH BELOW! -------------------------------#'\
|
||||
+color.BLUE+'\n\nThis is somewhere around lines 50 - 80...\n\n'+color.END)
|
||||
|
||||
sys.exit()
|
||||
else:
|
||||
pass
|
||||
|
||||
return diamond_path
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
|
||||
###########################################################################################
|
||||
|
||||
def check_args():
|
||||
|
||||
parser = argparse.ArgumentParser(description=
|
||||
color.BOLD + '\n\nThis script will categorize Contigs into'+color.ORANGE+' "Homologous" '\
|
||||
+color.END+color.BOLD+'Gene Families (OGs)\nbased on '+color.RED+'OrthoMCL'+color.END\
|
||||
+color.BOLD+"'s Gene Family Grouping\n\n\nNotes on this script and "+color.GREEN+\
|
||||
'OrthoMCL Families'+color.END+color.BOLD+' can be found\nat the bottom of '+color.GREEN\
|
||||
+'THIS script (3_CountOGsDiamond.py)\n'+color.END+usage_msg(), usage=SUPPRESS,
|
||||
formatter_class=RawTextHelpFormatter)
|
||||
|
||||
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
|
||||
|
||||
required_arg_group.add_argument('--input_file','-in', action='store',
|
||||
help=color.BOLD+color.GREEN+'Fasta file of Nucleotide sequences enriched \nwith'\
|
||||
' Eukaryotic protein coding transcripts'+color.END)
|
||||
required_arg_group.add_argument('--databases','-g', action='store',
|
||||
help=color.BOLD+color.GREEN+"Path to fasta file with Hook sequences"+color.END)
|
||||
|
||||
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
|
||||
optional_arg_group.add_argument('--threads','-t', default='2',
|
||||
help=color.BOLD+color.GREEN+' Number of threads to use for BLAST\n (default = 2)\n'+color.END)
|
||||
optional_arg_group.add_argument('--evalue','-e', default=1e-5, type = float,
|
||||
help=color.BOLD+color.GREEN+' Maximum e-value for OG assignment\n (default = 1e-5)\n'+color.END)
|
||||
optional_arg_group.add_argument('-author', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
|
||||
|
||||
if len(sys.argv[1:]) == 0:
|
||||
print (parser.description)
|
||||
print ('\n')
|
||||
sys.exit()
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
quit_eval = return_more_info(args)
|
||||
if quit_eval > 0:
|
||||
sys.exit()
|
||||
|
||||
return args
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###------------------------------- Script Usage Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def usage_msg():
|
||||
return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 3_CountOGsDiamond.py'\
|
||||
' --input_file ../Op_me_Xxma/Op_me_Xxma_WTA_NBU.fasta'+color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
|
||||
##########################################################################################
|
||||
|
||||
def return_more_info(args):
|
||||
|
||||
valid_arg = 0
|
||||
|
||||
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
|
||||
' maurerax@gmail.com\n\n'+color.END)
|
||||
|
||||
if args.author == True:
|
||||
print (author)
|
||||
valid_arg += 1
|
||||
|
||||
if args.input_file != None:
|
||||
if os.path.isfile(args.input_file) != False:
|
||||
if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
elif args.input_file.endswith('WTA_EPU.fasta') != True:
|
||||
print (color.BOLD+'\n\nInvalid Fasta File! Only Fasta Files that were processed'\
|
||||
' with '+color.GREEN+'2b_remove_Bact.py '+color.END+color.BOLD+'are valid\n\n'\
|
||||
'However, to bypass that issue, Fasta Files MUST end with '+color.CYAN+\
|
||||
'"WTA_NBU.fasta"\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
else:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
if os.path.isdir(args.databases + '/db_OG') != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
|
||||
+color.ORANGE+'db_OG Folder!\n\n'+color.END+color.BOLD+'Ensure that this folder '\
|
||||
'can be found in the main '+color.ORANGE+'Databases Folder'+color.END+color.BOLD\
|
||||
+'\n\nThen try once again\n\n.'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
ogdb_count = 0
|
||||
for file in os.listdir(args.databases + '/db_OG'):
|
||||
if file.endswith('.dmnd'):
|
||||
ogdb_count += 1
|
||||
|
||||
if ogdb_count == 0:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
|
||||
'Diamond formatted '+color.ORANGE+'Gene Family databases!\n\n'+color.END+color.BOLD+\
|
||||
'Ensure that they can be found in the '+color.ORANGE+'db_OG folder'+color.END+\
|
||||
color.BOLD+',\nwhich can be found in the main '+color.ORANGE+'Databases Folder'+\
|
||||
color.END+color.BOLD+'\n\nThen try once again.\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
elif ogdb_count > 1:
|
||||
print('\nMultiple OG databases found. Please only provide 1 database in the db_OG folder.\n')
|
||||
valid_arg += 1
|
||||
|
||||
return valid_arg
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------------- Does the Inital Folder Prep -----------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def prep_folders(args):
|
||||
|
||||
OG_folder = '/'.join(args.input_file.split('/')[:-1]) + '/DiamondOG/'
|
||||
|
||||
if os.path.isdir(OG_folder) != True:
|
||||
os.system('mkdir '+OG_folder)
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------- Runs Diamond on Split OrthoMCL Databases ----------------------###
|
||||
###########################################################################################
|
||||
|
||||
def OG_diamond(args, diamond_path):
|
||||
|
||||
print (color.BOLD+'\nStarting to "BLAST" against OG databases'+color.END)
|
||||
|
||||
OG_folder = '/'.join(args.input_file.split('/')[:-1]) + '/DiamondOG/'
|
||||
db = [file for file in os.listdir(args.databases + '/db_OG') if file.endswith('.dmnd')][0]
|
||||
|
||||
print (color.BOLD + '\n\n"BLAST"-ing against OG database using DIAMOND: ' + color.DARKCYAN + db + color.END + '\n\n')
|
||||
|
||||
OG_diamond_cmd = diamond_path + ' blastx -q ' + args.input_file + ' -d ' + args.databases + '/db_OG/' + db + ' --evalue ' + str(args.evalue) + ' --threads 60 --subject-cover 0.35 --outfmt 6 -o ' + OG_folder + 'allOGresults.tsv'
|
||||
|
||||
os.system(OG_diamond_cmd)
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------- Keeps the Single BEST Hit (HSP-score) Per Transcript ----------------###
|
||||
###########################################################################################
|
||||
|
||||
def keep_best(args):
|
||||
|
||||
print (color.BOLD+color.PURPLE+'\n\nProcessing OG-database results to keep only the BEST match for each transcript\n\n'+color.END)
|
||||
|
||||
OG_folder = '/'.join(args.input_file.split('/')[:-1]) + '/DiamondOG/'
|
||||
|
||||
inTSV = [i for i in open(OG_folder + 'allOGresults.tsv').readlines()]
|
||||
|
||||
inTSV.sort(key = lambda x: -float(x.split('\t')[-1]))
|
||||
|
||||
keep = []
|
||||
for i in inTSV:
|
||||
if any(i.split('\t')[0] in j for j in keep) != True:
|
||||
keep.append(i)
|
||||
|
||||
updated_lines = list(set([line.split('\t')[0]+'_'+'_'.join(line.split('\t')[1].split('_')[-2:])+'\t'+'\t'.join(line.split('\t')[1:]) for line in keep]))
|
||||
|
||||
with open(args.input_file.replace('.fasta','.Renamed_allOGCleanresults.tsv'), 'w+') as w:
|
||||
for i in updated_lines:
|
||||
w.write(i)
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------- Copies and Updates Names of Transcripts With OG Hits to New Fasta ----------###
|
||||
###########################################################################################
|
||||
|
||||
def update_fasta(args):
|
||||
|
||||
print (color.BOLD+color.PURPLE+'Updating Fasta File Sequence Names with their BEST OG hits\n\n'+color.END)
|
||||
|
||||
Renamed_TSV = args.input_file.replace('.fasta','.Renamed_allOGCleanresults.tsv')
|
||||
|
||||
keep = [i for i in open(Renamed_TSV).readlines() if i != '\n']
|
||||
|
||||
keep_dict = { }
|
||||
for line in keep:
|
||||
try:
|
||||
og_number = re.split('OG.{1}_', line.split('\t')[1])[1][:6]
|
||||
og_prefix = line.split('\t')[1].split(og_number)[0][-4:]
|
||||
og = og_prefix + og_number
|
||||
|
||||
keep_dict.update({ re.split('_OG.{1}_', line.split('\t')[0])[0] : re.split('_OG.{1}_', line.split('\t')[0])[0] + '_' + og_prefix + line.split('\t')[1].split('_')[-1] })
|
||||
except IndexError:
|
||||
pass
|
||||
|
||||
inFasta = [i for i in SeqIO.parse(args.input_file,'fasta')]
|
||||
|
||||
updated_seq_name = ['>'+keep_dict[i.description]+'\n'+str(i.seq)+'\n' for i in inFasta if i.description in keep_dict.keys()]
|
||||
|
||||
seqs_without_OG = ['>'+i.description+'\n'+str(i.seq)+'\n' for i in inFasta if i.description not in keep_dict.keys()]
|
||||
|
||||
with open(args.input_file.replace('.fasta','.Renamed.fasta'),'w+') as w:
|
||||
for i in updated_seq_name:
|
||||
w.write(i)
|
||||
|
||||
with open(args.input_file.replace('.fasta','.LackOG.fasta'),'w+') as x:
|
||||
for i in seqs_without_OG:
|
||||
x.write(i)
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------------------- Updates Log With OG Assignment Information ---------------------###
|
||||
###########################################################################################
|
||||
|
||||
def update_log(args):
|
||||
|
||||
if os.path.isdir('../PostAssembly_Logs/') != True:
|
||||
os.system('mkdir ../PostAssembly_Logs/')
|
||||
else:
|
||||
pass
|
||||
|
||||
home_folder = '/'.join(args.input_file.split('/')[:-1]) + '/'
|
||||
|
||||
Renamed_TSV = home_folder+args.input_file.split('/')[-1].replace('.fasta','.Renamed_allOGCleanresults.tsv')
|
||||
|
||||
keep = [line for line in open(Renamed_TSV).readlines()]
|
||||
all_ogs = [line.split('\t')[1].split('_')[-1] for line in keep if len(re.split('_OG.{1}_', line.split('\t')[1])) > 1]
|
||||
|
||||
total_with_ogs = str(len(all_ogs))
|
||||
unique_ogs = str(len(set(all_ogs)))
|
||||
|
||||
print (color.BOLD +'There are '+color.BLUE +total_with_ogs+' Contigs'+color.END\
|
||||
+color.BOLD+' that hit '+color.DARKCYAN+unique_ogs+' Unique OGs\n'+color.END)
|
||||
|
||||
|
||||
for Logname in os.listdir(os.curdir+'./PostAssembly_Logs/'):
|
||||
if Logname.startswith(args.input_file.split('/')[2].split('_WTA')[0]) and Logname.endswith('Log.txt'):
|
||||
with open('../PostAssembly_Logs/'+Logname,'a') as LogFile:
|
||||
LogFile.write('Contigs With OG\t'+total_with_ogs+'\tn/a\tn/a\n')
|
||||
LogFile.write('Unique OGs\t'+unique_ogs+'\tn/a\tn/a\n')
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###--------------------- Cleans up the Folder and Moves Final Files -------------------###
|
||||
##########################################################################################
|
||||
|
||||
def clean_up(args):
|
||||
|
||||
OG_folder = '/'.join(args.input_file.split('/')[:-1]) + '/DiamondOG/'
|
||||
|
||||
os.system('rm ' + args.input_file)
|
||||
|
||||
os.system('cp ' + args.input_file.replace('.fasta','.Renamed.fasta') + ' ' + OG_folder)
|
||||
|
||||
os.system('cp ' + args.input_file.replace('.fasta','.Renamed_allOGCleanresults.tsv') + ' ' + OG_folder)
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------------------------------- Next Script Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def next_script(args):
|
||||
|
||||
home_folder = '../'+args.input_file.split('/')[1]+'/'
|
||||
|
||||
print (color.BOLD+'\nLook for '+color.DARKCYAN+args.input_file.split('/')[-1]\
|
||||
.replace('.fasta','WTA_EPU.fasta')+color.END+color.BOLD+' in the '+home_folder\
|
||||
+' Folder\n\n' + color.END)
|
||||
|
||||
print (color.BOLD+'Next Script is: '+color.GREEN+'4_InFrameStopFreq.py\n\n'+ color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
|
||||
##########################################################################################
|
||||
|
||||
def main():
|
||||
|
||||
usearch_path = check_diamond_path()
|
||||
|
||||
args = check_args()
|
||||
|
||||
prep_folders(args)
|
||||
|
||||
OG_diamond(args, usearch_path)
|
||||
|
||||
keep_best(args)
|
||||
|
||||
update_fasta(args)
|
||||
|
||||
#update_log(args)
|
||||
|
||||
clean_up(args)
|
||||
|
||||
next_script(args)
|
||||
|
||||
main()
|
||||
790
PTL1/Transcriptomes/Scripts/4_InFrameStopCodonEsimator.py
Normal file
790
PTL1/Transcriptomes/Scripts/4_InFrameStopCodonEsimator.py
Normal file
@ -0,0 +1,790 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
##__Updated__: 18_08_2017
|
||||
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
|
||||
##__Usage__: python 4_InFrameStopFreq.py --help
|
||||
|
||||
|
||||
##########################################################################################
|
||||
## This script is intended to aid in identifying the genetic code of the data given ##
|
||||
## ##
|
||||
## Prior to running this script, ensure the following: ##
|
||||
## ##
|
||||
## 1. You have assembled your transcriptome and COPIED the 'assembly' file ##
|
||||
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ##
|
||||
## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py ##
|
||||
## 3. Removed SSU/LSU sequences from your Fasta File ##
|
||||
## 4. Classified your sequences as Strongly Prokaryotic/Eukaryotic or Undetermined ##
|
||||
## 5. Classified the Non-Strongly Prokaryotic sequences into OGs ##
|
||||
## ##
|
||||
## COMMAND Example Below ##
|
||||
## Extra Notes at Bottom of Script ##
|
||||
## ##
|
||||
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
|
||||
## ##
|
||||
## Next Script(s) to Run: ##
|
||||
## 5_GCodeTranslate.py ##
|
||||
## ##
|
||||
##########################################################################################
|
||||
|
||||
|
||||
import argparse, os, sys
|
||||
from argparse import RawTextHelpFormatter,SUPPRESS
|
||||
from distutils import spawn
|
||||
|
||||
from Bio import SeqIO
|
||||
from Bio.Seq import Seq
|
||||
from Bio.Data.CodonTable import CodonTable
|
||||
|
||||
|
||||
#-------------------------- Set-up Codon Tables (Genetic Codes) --------------------------#
|
||||
|
||||
tag_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Q',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGA': 'Q', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = ['TAG'])
|
||||
|
||||
c_uncinata_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y', 'TAG': 'Q',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGA': 'Q', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = ['TAA'])
|
||||
|
||||
#------------------------------ Colors For Print Statements ------------------------------#
|
||||
class color:
|
||||
PURPLE = '\033[95m'
|
||||
CYAN = '\033[96m'
|
||||
DARKCYAN = '\033[36m'
|
||||
ORANGE = '\033[38;5;214m'
|
||||
BLUE = '\033[94m'
|
||||
GREEN = '\033[92m'
|
||||
YELLOW = '\033[93m'
|
||||
RED = '\033[91m'
|
||||
BOLD = '\033[1m'
|
||||
UNDERLINE = '\033[4m'
|
||||
END = '\033[0m'
|
||||
|
||||
#------------------------------- Main Functions of Script --------------------------------#
|
||||
|
||||
###########################################################################################
|
||||
###---------------------------- UPDATE DIAMOND PATH BELOW! -----------------------------###
|
||||
###########################################################################################
|
||||
## IF Diamond is IN YOUR PATH then no updating is needed...
|
||||
|
||||
def check_diamond_path():
|
||||
|
||||
diamond_path = ''
|
||||
|
||||
if diamond_path == '':
|
||||
diamond_path = spawn.find_executable("diamond")
|
||||
#diamond_path = '/path/to/diamond'
|
||||
else:
|
||||
pass
|
||||
|
||||
if diamond_path == None:
|
||||
print (color.BOLD + '\n\nPlease open this script and check that you have included'\
|
||||
+' the PATH to the'+color.BLUE+' "diamond" '+color.END+color.BOLD+'executable.\n\n'+color.END)
|
||||
print (color.BOLD+color.BLUE+'LOOK FOR:\n\n'+color.RED\
|
||||
+'#------------------------------ UPDATE DIAMOND PATH BELOW! -------------------------------#'\
|
||||
+color.BLUE+'\n\nThis is somewhere around lines 50 - 80...\n\n'+color.END)
|
||||
|
||||
sys.exit()
|
||||
else:
|
||||
pass
|
||||
|
||||
return diamond_path
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
|
||||
###########################################################################################
|
||||
|
||||
def check_args():
|
||||
|
||||
parser = argparse.ArgumentParser(description=
|
||||
color.BOLD+'\n\nThis script is intended to '+color.RED+'AID You '+color.END+color.BOLD\
|
||||
+'in determining the '+color.RED+'\nLikely Genetic Code'+color.END+color.BOLD+' of a'\
|
||||
' given Fasta File of transcripts\n\nInterpretation of the output (StopFreq.tsv) is difficult \nand so '+color.ORANGE\
|
||||
+'TWO EXAMPLES'+color.END+color.BOLD+' can be found in the '+color.CYAN+'NOTES Section'\
|
||||
+color.END+color.BOLD+' of\nTHIS Script '+color.GREEN+'(4_InFrameStopFreq.py)\n'+color.END\
|
||||
+usage_msg(), usage=SUPPRESS,formatter_class=RawTextHelpFormatter)
|
||||
|
||||
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
|
||||
|
||||
required_arg_group.add_argument('--input_file','-in', action='store', required=True,
|
||||
help=color.BOLD+color.GREEN+'Fasta file of Nucleotide sequences enriched \nwith'\
|
||||
' Eukaryotic protein coding transcripts'+color.END)
|
||||
required_arg_group.add_argument('--databases','-d', action='store',
|
||||
help=color.BOLD+color.GREEN+"Path to databases"+color.END)
|
||||
|
||||
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
|
||||
optional_arg_group.add_argument('-author', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
|
||||
|
||||
if len(sys.argv[1:]) == 0:
|
||||
print (parser.description)
|
||||
print ('\n')
|
||||
sys.exit()
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
quit_eval = return_more_info(args)
|
||||
if quit_eval > 0:
|
||||
sys.exit()
|
||||
|
||||
return args
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###------------------------------- Script Usage Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def usage_msg():
|
||||
return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 4_InFrameStopFreq.py'\
|
||||
' --input_file ../Op_me_Xxma/Op_me_Xxma_WTA_EPU.Renamed.fasta'+color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
|
||||
##########################################################################################
|
||||
|
||||
def return_more_info(args):
|
||||
|
||||
valid_arg = 0
|
||||
|
||||
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
|
||||
' maurerax@gmail.com\n\n'+color.END)
|
||||
|
||||
if args.author == True:
|
||||
print (author)
|
||||
valid_arg += 1
|
||||
|
||||
if args.input_file != None:
|
||||
if os.path.isfile(args.input_file) != False:
|
||||
if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
elif args.input_file.endswith('WTA_EPU.Renamed.fasta') != True:
|
||||
print (color.BOLD+'\n\nInvalid Fasta File! Only Fasta Files that were processed'\
|
||||
' with '+color.GREEN+'3_CountOGsUsearcy.py '+color.END+color.BOLD+'are valid\n\n'\
|
||||
'However, to bypass that issue, Fasta Files MUST end with '+color.CYAN+\
|
||||
'"WTA_NBU.Renamed.fasta"\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
else:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
if os.path.isdir(args.databases + '/db_StopFreq') != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
|
||||
+color.ORANGE+'db_StopFreq Folder!\n\n'+color.END+color.BOLD+'Ensure that this folder '\
|
||||
'can be found in the main '+color.ORANGE+'Databases Folder'+color.END+color.BOLD\
|
||||
+'\n\nThen try once again\n\n.'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
elif os.path.isfile(args.databases + '/db_StopFreq/RepEukProts.dmnd') != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
|
||||
'Diamond formatted '+color.ORANGE+'Representative Eukaryotic Protein Database!\n\n'+color.END+color.BOLD+\
|
||||
'Ensure that it can be found in the '+color.ORANGE+'db_StopFreq folder'+color.END+\
|
||||
color.BOLD+',\nwhich can be found in the main '+color.ORANGE+'Databases Folder'+\
|
||||
color.END+color.BOLD+'\n\nThen try once again.\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
return valid_arg
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------------- Does the Inital Folder Prep -----------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def prep_folders(args):
|
||||
|
||||
Stop_folder = '../'+args.input_file.split('/')[1]+'/StopCodonFreq/'
|
||||
|
||||
if os.path.isdir(Stop_folder) != True:
|
||||
os.system('mkdir '+Stop_folder)
|
||||
|
||||
if os.path.isdir(Stop_folder+'StopCodonFastas') != True:
|
||||
os.system('mkdir '+Stop_folder+'StopCodonFastas')
|
||||
|
||||
if os.path.isdir(Stop_folder+'SpreadSheets') != True:
|
||||
os.system('mkdir '+Stop_folder+'SpreadSheets')
|
||||
|
||||
return Stop_folder+'StopCodonFastas/'
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------- Translates Sequences with Each Stop Codon ---------------------###
|
||||
###########################################################################################
|
||||
|
||||
def prep_translations(args):
|
||||
print (color.BOLD+'\nIdentifying ORFs in the Fasta file based on the output of 3_CountOGsDiamond.py\n'+color.END)
|
||||
|
||||
intsv = [i for i in open(args.input_file.replace('.fasta','_allOGCleanresults.tsv')).readlines() if i != '\n']
|
||||
|
||||
inFasta = [i for i in SeqIO.parse(args.input_file,'fasta')]
|
||||
|
||||
prot_dict = {}
|
||||
|
||||
for i in intsv:
|
||||
# print i
|
||||
prot_dict.setdefault(i.split('\t')[0],[])
|
||||
if int(i.split('\t')[6]) < int(i.split('\t')[7]):
|
||||
prot_dict[i.split('\t')[0]].append('F')
|
||||
if (int(i.split('\t')[6])) < 5:
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[6])-1)
|
||||
else:
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[6])-1)
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[7])+3)
|
||||
if int(i.split('\t')[7]) < int(i.split('\t')[6]):
|
||||
prot_dict[i.split('\t')[0]].append('RC')
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[6]))
|
||||
if (int(i.split('\t')[7])-4) < 5:
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[7]))
|
||||
else:
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[7])-4)
|
||||
|
||||
|
||||
#------------- Prep translation with 'TAA' as the only Stop -------------#
|
||||
|
||||
print (color.BOLD+'\n\nTranslating DNA using'+color.RED+' TAA'+color.END\
|
||||
+color.BOLD+' as the sole STOP codon\n'+color.END)
|
||||
|
||||
for key, value in prot_dict.items():
|
||||
for seq_rec in inFasta:
|
||||
if key in seq_rec.description:
|
||||
stop_pos = 0
|
||||
if prot_dict[key][0] == 'F':
|
||||
temp = seq_rec.seq[prot_dict[key][1]:]
|
||||
temp_prot = str(temp.translate(table=c_uncinata_table))
|
||||
if '*' in temp_prot:
|
||||
stop_pos = (temp_prot.index('*')+1)*3
|
||||
prot_dict[key].append(temp[:stop_pos])
|
||||
else:
|
||||
prot_dict[key].append(seq_rec.seq[prot_dict[key][1]:prot_dict[key][2]])
|
||||
if prot_dict[key][0] == 'RC':
|
||||
temp = seq_rec.seq[:prot_dict[key][1]].reverse_complement()
|
||||
temp_prot = str(temp.translate(table=c_uncinata_table))
|
||||
if '*' in temp_prot:
|
||||
stop_pos = (temp_prot.index('*')+1)*3
|
||||
prot_dict[key].append(temp[:stop_pos])
|
||||
else:
|
||||
prot_dict[key].append(seq_rec.seq[prot_dict[key][2]:prot_dict[key][1]].reverse_complement())
|
||||
|
||||
|
||||
#------------- Prep translation with 'TGA' as the only Stop -------------#
|
||||
print (color.BOLD+'\n\nTranslating DNA using'+color.RED+' TGA'+color.END\
|
||||
+color.BOLD+' as the sole STOP codon\n'+color.END)
|
||||
|
||||
for key, value in prot_dict.items():
|
||||
for seq_rec in inFasta:
|
||||
if key in seq_rec.description:
|
||||
stop_pos = 0
|
||||
if prot_dict[key][0] == 'F':
|
||||
temp = seq_rec.seq[prot_dict[key][1]:]
|
||||
temp_prot = str(temp.translate(table=6))
|
||||
if '*' in temp_prot:
|
||||
stop_pos = (temp_prot.index('*')+1)*3
|
||||
prot_dict[key].append(temp[:stop_pos])
|
||||
else:
|
||||
prot_dict[key].append(seq_rec.seq[prot_dict[key][1]:prot_dict[key][2]])
|
||||
if prot_dict[key][0] == 'RC':
|
||||
temp = seq_rec.seq[:prot_dict[key][1]].reverse_complement()
|
||||
temp_prot = str(temp.translate(table=6))
|
||||
if '*' in temp_prot:
|
||||
stop_pos = (temp_prot.index('*')+1)*3
|
||||
prot_dict[key].append(temp[:stop_pos])
|
||||
else:
|
||||
prot_dict[key].append(seq_rec.seq[prot_dict[key][2]:prot_dict[key][1]].reverse_complement())
|
||||
|
||||
|
||||
#------------- Prep translation with 'TAG' as the only Stop -------------#
|
||||
print (color.BOLD+'\n\nTranslating DNA using'+color.RED+' TAG'+color.END\
|
||||
+color.BOLD+' as the sole STOP codon\n'+color.END)
|
||||
|
||||
|
||||
for key, value in prot_dict.items():
|
||||
for seq_rec in inFasta:
|
||||
if key in seq_rec.description:
|
||||
stop_pos = 0
|
||||
if prot_dict[key][0] == 'F':
|
||||
temp = seq_rec.seq[prot_dict[key][1]:]
|
||||
temp_prot = str(temp.translate(table=tag_table))
|
||||
if '*' in temp_prot:
|
||||
stop_pos = (temp_prot.index('*')+1)*3
|
||||
prot_dict[key].append(temp[:stop_pos])
|
||||
else:
|
||||
prot_dict[key].append(seq_rec.seq[prot_dict[key][1]:prot_dict[key][2]])
|
||||
if prot_dict[key][0] == 'RC':
|
||||
temp = seq_rec.seq[:prot_dict[key][1]].reverse_complement()
|
||||
temp_prot = str(temp.translate(table=tag_table))
|
||||
if '*' in temp_prot:
|
||||
stop_pos = (temp_prot.index('*')+1)*3
|
||||
prot_dict[key].append(temp[:stop_pos])
|
||||
else:
|
||||
prot_dict[key].append(seq_rec.seq[prot_dict[key][2]:prot_dict[key][1]].reverse_complement())
|
||||
|
||||
#------------ Parsing through data to maintain OG assignments ------------#
|
||||
inOGs = intsv
|
||||
inOGs = [i.split('\t')[0]+';'+i.split('\t')[1][-10:] for i in inOGs]
|
||||
inOGs2 = []
|
||||
for i in inOGs:
|
||||
if 'no_group' not in i.split(';')[1]:
|
||||
inOGs2.append(i)
|
||||
else:
|
||||
inOGs2.append(i.split(';')[0]+';no_group')
|
||||
inOGs2 = list(set(inOGs2))
|
||||
|
||||
#---------------- Write file with 'TAA' is the only Stop ----------------#
|
||||
|
||||
with open(args.input_file.split('.fas')[0]+'_taa_ORF.fasta','w+') as w:
|
||||
print (color.BOLD+'\n\nWriting FASTA files with ORF and Protein sequences with'+color.RED\
|
||||
+' TAA '+color.END+color.BOLD+'as only STOP codon\n'+color.END)
|
||||
|
||||
for key, value in prot_dict.items():
|
||||
for j in inOGs2:
|
||||
if key == j.split(';')[0]:
|
||||
if len(prot_dict[key]) < 4:
|
||||
pass
|
||||
else:
|
||||
w.write('>'+key+'_'+j.split(';')[1]+'\n'+str(value[-3]).upper()+'\n')
|
||||
|
||||
with open(args.input_file.split('.fas')[0]+'_taa_ORF.aa.fasta','w+') as w:
|
||||
for key, value in prot_dict.items():
|
||||
for j in inOGs2:
|
||||
if key == j.split(';')[0]:
|
||||
if len(prot_dict[key]) < 4:
|
||||
pass
|
||||
else:
|
||||
w.write('>'+key+'_'+j.split(';')[1]+'\n'+str(Seq(str(value[-3])).translate(table=c_uncinata_table)).upper()+'\n')
|
||||
|
||||
#---------------- Write file with 'TGA' is the only Stop ----------------#
|
||||
|
||||
with open(args.input_file.split('.fas')[0]+'_tga_ORF.fasta','w+') as w:
|
||||
print (color.BOLD+'\n\nWriting FASTA files with ORF and Protein sequences with'+color.RED\
|
||||
+' TGA '+color.END+color.BOLD+'as only STOP codon\n'+color.END)
|
||||
|
||||
for key, value in prot_dict.items():
|
||||
for j in inOGs2:
|
||||
if key == j.split(';')[0]:
|
||||
if len(prot_dict[key]) < 4:
|
||||
pass
|
||||
else:
|
||||
w.write('>'+key+'_'+j.split(';')[1]+'\n'+str(value[-2]).upper()+'\n')
|
||||
|
||||
with open(args.input_file.split('.fas')[0]+'_tga_ORF.aa.fasta','w+') as w:
|
||||
for key, value in prot_dict.items():
|
||||
for j in inOGs2:
|
||||
if key == j.split(';')[0]:
|
||||
if len(prot_dict[key]) < 4:
|
||||
pass
|
||||
else:
|
||||
w.write('>'+key+'_'+j.split(';')[1]+'\n'+str(Seq(str(value[-2])).translate(table=6)).upper()+'\n')
|
||||
|
||||
#---------------- Write file with 'TAG' is the only Stop ----------------#
|
||||
|
||||
with open(args.input_file.split('.fas')[0]+'_tag_ORF.fasta','w+') as w:
|
||||
print (color.BOLD+'\n\nWriting FASTA files with ORF and Protein sequences with'+color.RED\
|
||||
+' TAG '+color.END+color.BOLD+'as only STOP codon\n'+color.END)
|
||||
|
||||
for key, value in prot_dict.items():
|
||||
for j in inOGs2:
|
||||
if key == j.split(';')[0]:
|
||||
if len(prot_dict[key]) < 4:
|
||||
pass
|
||||
else:
|
||||
w.write('>'+key+'_'+j.split(';')[1]+'\n'+str(value[-1]).upper()+'\n')
|
||||
|
||||
with open(args.input_file.split('.fas')[0]+'_tag_ORF.aa.fasta','w+') as w:
|
||||
for key, value in prot_dict.items():
|
||||
for j in inOGs2:
|
||||
if key == j.split(';')[0]:
|
||||
if len(prot_dict[key]) < 4:
|
||||
pass
|
||||
else:
|
||||
w.write('>'+key+'_'+j.split(';')[1]+'\n'+str(Seq(str(value[-1])).translate(table=tag_table)).upper()+'\n')
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###---------- Diamonds the Translations Against a SMALL Euk Protein Database ----------###
|
||||
###########################################################################################
|
||||
|
||||
def diamond_ProtDB(args, diamond_path):
|
||||
os.system(diamond_path + ' blastp -q ' + args.input_file.split('.fas')[0] + '_tag_ORF.aa.fasta -d ' + args.databases + '/db_StopFreq/RepEukProts.dmnd --evalue 1e-5 --max-target-seqs 1 --threads 60 --outfmt 6 -o ' + args.input_file.split('.fas')[0] + '_tag_ORF.RepEukProts.tsv')
|
||||
|
||||
os.system(diamond_path + ' blastp -q ' + args.input_file.split('.fas')[0] + '_tga_ORF.aa.fasta -d ' + args.databases + '/db_StopFreq/RepEukProts.dmnd --evalue 1e-5 --max-target-seqs 1 --threads 60 --outfmt 6 -o ' + args.input_file.split('.fas')[0] + '_tga_ORF.RepEukProts.tsv')
|
||||
|
||||
os.system(diamond_path + ' blastp -q ' + args.input_file.split('.fas')[0] + '_taa_ORF.aa.fasta -d ' + args.databases + '/db_StopFreq/RepEukProts.dmnd --evalue 1e-5 --max-target-seqs 1 --threads 60 --outfmt 6 -o ' + args.input_file.split('.fas')[0] + '_taa_ORF.RepEukProts.tsv')
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------------------- Manages the search for In-Frame Stop Codons --------------------###
|
||||
###########################################################################################
|
||||
|
||||
|
||||
def hunt_for_stops(args):
|
||||
|
||||
#------------------------ Open Fasta Files ------------------------#
|
||||
try:
|
||||
TAGinFasta = [i for i in SeqIO.parse(args.input_file.split('.fas')[0]+'_tag_ORF.fasta','fasta') if str(i.seq).endswith('TAG')]
|
||||
print (color.BOLD+'\n\nGathering Sequence information from FASTA and TSV files\n'+color.END)
|
||||
|
||||
except:
|
||||
print (color.BOLD+color.RED+'\n\nMissing Necessary Inputs: Open Script for Usage'\
|
||||
' Information\n\n'+color.END)
|
||||
sys.exit()
|
||||
|
||||
TGAinFasta = [i for i in SeqIO.parse(args.input_file.split('.fas')[0]+'_tga_ORF.fasta','fasta') if str(i.seq).endswith('TGA')]
|
||||
|
||||
TAAinFasta = [i for i in SeqIO.parse(args.input_file.split('.fas')[0]+'_taa_ORF.fasta','fasta') if str(i.seq).endswith('TAA')]
|
||||
|
||||
## This section originally ONLY considered sequences WITH OG assignments:
|
||||
## TAAinFasta = [i for i in TAAinFasta if 'no_group' not in i.description and str(i.seq).endswith('TAA')]
|
||||
## This has been taken out for now
|
||||
|
||||
#----------------------- Open BLAST Reports -----------------------#
|
||||
|
||||
TAGinTSV = [i for i in open(args.input_file.split('.fas')[0]+'_tag_ORF.RepEukProts.tsv').read().split('\n') if i != '']
|
||||
|
||||
TGAinTSV = [i for i in open(args.input_file.split('.fas')[0]+'_tga_ORF.RepEukProts.tsv').read().split('\n') if i != '']
|
||||
|
||||
TAAinTSV = [i for i in open(args.input_file.split('.fas')[0]+'_taa_ORF.RepEukProts.tsv').read().split('\n') if i != '']
|
||||
|
||||
## This section originally ONLY considered sequences WITH OG assignments:
|
||||
## TAAinTSV = i for i in TAAinTSV if i != ''and 'no_group' not in i.split('\t')[0]]
|
||||
## This has been taken out for now
|
||||
|
||||
|
||||
#------------ Set-up Genetic Code Specific Dictionaries ------------#
|
||||
|
||||
tag_dict = {}
|
||||
for i in TAGinTSV:
|
||||
tag_dict.setdefault(i.split('\t')[0].replace('_TAG',''),[]).append(int(i.split('\t')[-6]))
|
||||
tag_dict.setdefault(i.split('\t')[0].replace('_TAG',''),[]).append(int(i.split('\t')[-5]))
|
||||
|
||||
tga_dict = {}
|
||||
for i in TGAinTSV:
|
||||
tga_dict.setdefault(i.split('\t')[0].replace('_Ciliate',''),[]).append(int(i.split('\t')[-6]))
|
||||
tga_dict.setdefault(i.split('\t')[0].replace('_Ciliate',''),[]).append(int(i.split('\t')[-5]))
|
||||
|
||||
taa_dict = {}
|
||||
for i in TAAinTSV:
|
||||
taa_dict.setdefault(i.split('\t')[0].replace('_Chilo',''),[]).append(int(i.split('\t')[-6]))
|
||||
taa_dict.setdefault(i.split('\t')[0].replace('_Chilo',''),[]).append(int(i.split('\t')[-5]))
|
||||
|
||||
#-------------- Preparing In-Frame Stop Codon Counts --------------#
|
||||
|
||||
# All the data when TGA is the sole stop codon
|
||||
tga_codons = 0
|
||||
tga_data_tag = 0
|
||||
tga_data_tga = 0
|
||||
tga_data_taa = 0
|
||||
tga_seq_count = 0
|
||||
|
||||
# All the data when TAG is the sole stop codon
|
||||
tag_codons = 0
|
||||
tag_data_tag = 0
|
||||
tag_data_tga = 0
|
||||
tag_data_taa = 0
|
||||
tag_seq_count = 0
|
||||
|
||||
# All the data when TAA is the sole stop codon
|
||||
taa_codons = 0
|
||||
taa_data_tag = 0
|
||||
taa_data_tga = 0
|
||||
taa_data_taa = 0
|
||||
taa_seq_count = 0
|
||||
|
||||
# All the data for each stop codon combined
|
||||
tga_inframe = 0
|
||||
tag_inframe = 0
|
||||
taa_inframe = 0
|
||||
total_codons = 0
|
||||
total_seq_counts = len(open(args.input_file).read().split('>'))-1
|
||||
|
||||
|
||||
#-------- Gathering In-frame Stop Codon Density Information --------#
|
||||
|
||||
### Collect in-frame stop information for "TAA" and "TAG" when TGA is the ONLY stop
|
||||
print (color.BOLD+'\nCollecting in-frame stop codon information when'+color.RED\
|
||||
+' TGA'+color.END+color.BOLD+' is the only STOP\n'+color.END)
|
||||
|
||||
for i in TGAinFasta:
|
||||
try:
|
||||
if tga_dict[i.description][0] == 1:
|
||||
for n in range((tga_dict[i.description][0]-1),((tga_dict[i.description][1])*3)-3,3):
|
||||
if str(i.seq).upper()[n:n+3] == 'TAG':
|
||||
tga_data_tag += 1
|
||||
tag_inframe += 1
|
||||
if str(i.seq).upper()[n:n+3].upper() == 'TAA':
|
||||
tga_data_taa += 1
|
||||
taa_inframe += 1
|
||||
if str(i.seq).upper()[n:n+3].upper() == 'TGA':
|
||||
tga_data_tga += 1
|
||||
tga_inframe += 1
|
||||
tga_codons += 1
|
||||
total_codons += 1
|
||||
tga_seq_count += 1
|
||||
|
||||
else:
|
||||
for n in range(((tga_dict[i.description][0]-1)*3),((tga_dict[i.description][1])*3)-3,3):
|
||||
if str(i.seq).upper()[n:n+3] == 'TAG':
|
||||
tga_data_tag += 1
|
||||
tag_inframe += 1
|
||||
if str(i.seq).upper()[n:n+3].upper() == 'TAA':
|
||||
tga_data_taa += 1
|
||||
taa_inframe += 1
|
||||
if str(i.seq).upper()[n:n+3].upper() == 'TGA':
|
||||
tga_data_tga += 1
|
||||
tga_inframe += 1
|
||||
tga_codons += 1
|
||||
total_codons += 1
|
||||
tga_seq_count += 1
|
||||
except:
|
||||
pass
|
||||
|
||||
### Collect in-frame stop information for "TAA" and "TGA" when TAG is the ONLY stop
|
||||
print (color.BOLD+'\nCollecting in-frame stop codon information when'+color.RED\
|
||||
+' TAG'+color.END+color.BOLD+' is the only STOP\n'+color.END)
|
||||
|
||||
for i in TAGinFasta:
|
||||
try:
|
||||
if tag_dict[i.description][0] == 1:
|
||||
for n in range((tag_dict[i.description][0]-1),((tag_dict[i.description][1])*3)-3,3):
|
||||
if str(i.seq).upper()[n:n+3] == 'TAG':
|
||||
tag_data_tag += 1
|
||||
tag_inframe += 1
|
||||
if str(i.seq).upper()[n:n+3].upper() == 'TAA':
|
||||
tag_data_taa += 1
|
||||
taa_inframe += 1
|
||||
if str(i.seq).upper()[n:n+3].upper() == 'TGA':
|
||||
tag_data_tga += 1
|
||||
tga_inframe += 1
|
||||
tag_codons += 1
|
||||
total_codons += 1
|
||||
tag_seq_count += 1
|
||||
|
||||
else:
|
||||
for n in range(((tag_dict[i.description][0]-1)*3),(tag_dict[i.description][1]*3)-3,3):
|
||||
if str(i.seq).upper()[n:n+3] == 'TAG':
|
||||
tag_data_tag += 1
|
||||
tag_inframe += 1
|
||||
if str(i.seq).upper()[n:n+3].upper() == 'TAA':
|
||||
tag_data_taa += 1
|
||||
taa_inframe += 1
|
||||
if str(i.seq).upper()[n:n+3].upper() == 'TGA':
|
||||
tag_data_tga += 1
|
||||
tga_inframe += 1
|
||||
tag_codons += 1
|
||||
total_codons += 1
|
||||
tag_seq_count += 1
|
||||
except:
|
||||
pass
|
||||
|
||||
|
||||
### Collect in-frame stop information for "TGA" and "TAG" when TAA is the ONLY stop
|
||||
print (color.BOLD+'\nCollecting in-frame stop codon information when'+color.RED\
|
||||
+' TAA'+color.END+color.BOLD+' is the only STOP\n'+color.END)
|
||||
|
||||
for i in TAAinFasta:
|
||||
try:
|
||||
if taa_dict[i.description][0] == 1:
|
||||
for n in range((taa_dict[i.description][0]-1),((taa_dict[i.description][1])*3)-3,3):
|
||||
if str(i.seq).upper()[n:n+3] == 'TAG':
|
||||
taa_data_tag += 1
|
||||
tag_inframe += 1
|
||||
if str(i.seq).upper()[n:n+3].upper() == 'TAA':
|
||||
taa_data_taa += 1
|
||||
taa_inframe += 1
|
||||
if str(i.seq).upper()[n:n+3].upper() == 'TGA':
|
||||
taa_data_tga += 1
|
||||
tga_inframe += 1
|
||||
taa_codons += 1
|
||||
total_codons += 1
|
||||
taa_seq_count += 1
|
||||
|
||||
else:
|
||||
for n in range(((taa_dict[i.description][0]-1)*3),(taa_dict[i.description][1]*3)-3,3):
|
||||
if str(i.seq).upper()[n:n+3] == 'TAG':
|
||||
taa_data_tag += 1
|
||||
tag_inframe += 1
|
||||
if str(i.seq).upper()[n:n+3].upper() == 'TAA':
|
||||
taa_data_taa += 1
|
||||
taa_inframe += 1
|
||||
if str(i.seq).upper()[n:n+3].upper() == 'TGA':
|
||||
taa_data_tga += 1
|
||||
tga_inframe += 1
|
||||
tag_codons += 1
|
||||
total_codons += 1
|
||||
taa_seq_count += 1
|
||||
except:
|
||||
pass
|
||||
|
||||
#-------------- Writing Data Out and Print Statement --------------#
|
||||
|
||||
with open(args.input_file.split('.fas')[0]+'_StopCodonStats.tsv','w+') as w:
|
||||
w.write('Stop Codon\tNumber of Seqs Analyzed\tIn-frame TAG\tIn-frame TGA\tIn-frame TAA\tTotal Codons\tIn-frame TAG density\tIn-frame TGA density\tIn-frame TAA density\n')
|
||||
if tga_codons != 0:
|
||||
w.write('TGA\t'+str(tga_seq_count)+'\t'+str(tga_data_tag)+'\t'+str(tga_data_tga)+'\t'+str(tga_data_taa)+'\t'+str(tga_codons)\
|
||||
+'\t'+"%.2f" % ((float(tga_data_tag)*1000)/float(tga_codons))+'\t'+"%.2f" % ((float(tga_data_tga)*1000)/float(tga_codons))+'\t'\
|
||||
+"%.2f" % ((float(tga_data_taa)*1000)/float(tga_codons))+'\n')
|
||||
else:
|
||||
w.write('TGA\t0\t0\t0\t0\t0\t0\t0\t0\n')
|
||||
|
||||
if tag_codons != 0:
|
||||
w.write('TAG\t'+str(tag_seq_count)+'\t'+str(tag_data_tag)+'\t'+str(tag_data_tga)+'\t'+str(tag_data_taa)+'\t'+str(tag_codons)\
|
||||
+'\t'+"%.2f" % ((float(tag_data_tag)*1000)/float(tag_codons))+'\t'+"%.2f" % ((float(tag_data_tga)*1000)/float(tag_codons))+'\t'\
|
||||
+"%.2f" % ((float(tag_data_taa)*1000)/float(tag_codons))+'\n')
|
||||
else:
|
||||
w.write('TAG\t0\t0\t0\t0\t0\t0\t0\t0\n')
|
||||
if taa_codons != 0:
|
||||
w.write('TAA\t'+str(taa_seq_count)+'\t'+str(taa_data_tag)+'\t'+str(taa_data_tga)+'\t'+str(taa_data_taa)+'\t'+str(taa_codons)\
|
||||
+'\t'+"%.2f" % ((float(taa_data_tag)*1000)/float(taa_codons))+'\t'+"%.2f" % ((float(taa_data_tga)*1000)/float(taa_codons))+'\t'\
|
||||
+"%.2f" % ((float(taa_data_taa)*1000)/float(taa_codons))+'\n')
|
||||
else:
|
||||
w.write('TAA\t0\t0\t0\t0\t0\t0\t0\t0\n')
|
||||
|
||||
w.write('\n \n')
|
||||
w.write('Summary\t'+str(tga_seq_count+tag_seq_count+taa_seq_count)+'\t'+str(tag_inframe)+'\t'+str(tga_inframe)+'\t'+str(taa_inframe)\
|
||||
+'\t'+str(total_codons)+'\t'+"%.2f" % ((float(tag_inframe)*1000)/float(total_codons))+'\t'+"%.2f" % ((float(tga_inframe)*1000)/float(total_codons))\
|
||||
+'\t'+"%.2f" % ((float(taa_inframe)*1000)/float(total_codons))+'\n')
|
||||
w.write('\nTotal Seqs in Fasta\t'+str(total_seq_counts))
|
||||
|
||||
# print color.BOLD + color.BLUE + '\nSummary\t'+str(tag_inframe)+'\t'+str(tga_inframe)+'\t'+str(taa_inframe)+'\t'+str(total_codons)+'\t'+"%.2f" % ((float(tag_inframe)*1000)/float(total_codons))+'\t'\
|
||||
# +"%.2f" % ((float(tga_inframe)*1000)/float(total_codons))+'\t'+"%.2f" % ((float(taa_inframe)*1000)/float(total_codons))+'\n\n'\
|
||||
# + str(tag_seq_count) + '\t' + str(tga_seq_count) + '\t' + str(taa_seq_count) + color.END
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###--------------------- Cleans up the Folder and Moves Final Files -------------------###
|
||||
##########################################################################################
|
||||
|
||||
def clean_up(args):
|
||||
if os.path.isdir('/'.join(args.input_file.split('/')[:-1]) + '/StopCodonFreq') != True:
|
||||
os.system('mkdir ' + '/'.join(args.input_file.split('/')[:-1]) + '/StopCodonFreq/')
|
||||
else:
|
||||
pass
|
||||
|
||||
os.system('mkdir ' + '/'.join(args.input_file.split('/')[:-1]) + '/StopCodonFreq/StopCodonFastas/')
|
||||
os.system('mkdir ' + '/'.join(args.input_file.split('/')[:-1]) + '/StopCodonFreq/SpreadSheets/')
|
||||
os.system('mv ' + args.input_file.split('.fas')[0]+'_t*_ORF.*fasta ' + '/'.join(args.input_file.split('/')[:-1]) + '/StopCodonFreq/StopCodonFastas/')
|
||||
os.system('mv ' + args.input_file.split('.fas')[0]+'_t*Prots.tsv ' + '/'.join(args.input_file.split('/')[:-1]) + '/StopCodonFreq/SpreadSheets/')
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------------------------------- Next Script Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def next_script(args):
|
||||
|
||||
home_folder = '/'.join(args.input_file.split('/')[:-1])
|
||||
|
||||
print (color.BOLD+'\nLook for '+color.DARKCYAN+args.input_file.split('/')[-1]\
|
||||
.replace('.fasta','_StopCodonStats.tsv')+color.END+color.BOLD+' in the '+home_folder\
|
||||
+' Folder\n\n' + color.END)
|
||||
|
||||
print (color.BOLD+'Next Script is: '+color.GREEN+'5_GCodeTranslate.py\n\n'+ color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
|
||||
##########################################################################################
|
||||
|
||||
def main():
|
||||
|
||||
diamond_path = check_diamond_path()
|
||||
|
||||
args = check_args()
|
||||
|
||||
prep_translations(args)
|
||||
|
||||
diamond_ProtDB(args, diamond_path)
|
||||
|
||||
hunt_for_stops(args)
|
||||
|
||||
clean_up(args)
|
||||
|
||||
next_script(args)
|
||||
|
||||
main()
|
||||
|
||||
|
||||
#----------------------------------------- NOTES -----------------------------------------#
|
||||
#
|
||||
# This script is designed to HELP you make an informed decision about the genetic code being
|
||||
# used by your particular organism. Be aware that it will be limited by the quality of the
|
||||
# data given to it!
|
||||
#
|
||||
# You will need:
|
||||
#
|
||||
# Diamond, BioPython, AND the output from '3_CountOGSDiamond.py'
|
||||
#
|
||||
# If you are not using the Author's database, update your database name(s) in lines: 345-360
|
||||
#
|
||||
# katzlab$ python StopFrequency.py YourFastaFile.fasta
|
||||
#
|
||||
#
|
||||
#------------------------------- Interpretation of Results -------------------------------#
|
||||
#
|
||||
# FORMATTED BELOW WITH TEXTWRANGLER...
|
||||
#
|
||||
# Example output using CILIATE (TGA) genetic Code (NOTE THE In-Frame Densities):
|
||||
#
|
||||
# Stop Codon Number_of_Seqs_Analyzed In-frame TAG In-frame TGA In-frame TAA Total Codons In-frame TAG density In-frame TGA density In-frame TAA density
|
||||
# TGA 341 14 0 22 113156 1.2 0 0.92
|
||||
# TAG 424 0 0 34 140085 0 0 0.78
|
||||
# TAA 205 14 0 0 16714 0.84 0 0
|
||||
# Summary 970 28 0 56 269955 2.04 0 1.7
|
||||
#
|
||||
# VALUES in summary line (OR SUM of Density) that are > 1.5 likely indicate that the STOP
|
||||
# codon has been reassigned... in the case above, TAG and TAA look like they have been
|
||||
# reassigned.
|
||||
#
|
||||
#
|
||||
# Example output using UNIVERSAL genetic Code (NOTE THE In-Frame Densities):
|
||||
#
|
||||
# Stop Codon Number_of_Seqs_Analyzed In-frame TAG In-frame TGA In-frame TAA Total Codons In-frame TAG density In-frame TGA density In-frame TAA density
|
||||
# TGA 341 1 0 2 113156 0.2 0 0.05
|
||||
# TAG 424 0 2 4 140085 0 0 0.08
|
||||
# TAA 205 1 0 0 16714 0.04 0 0
|
||||
# Summary 970 2 2 6 269955 0.15 0 0.06
|
||||
#
|
||||
# VALUES in summary line (OR SUM of Density) that are > 0.5 likely indicate that the STOP
|
||||
# codon still acts as STOP... in the case above, TAG, TGA and TAA look like they still behave
|
||||
# as a stop codon.
|
||||
#
|
||||
# THIS IS A ROUGH GUIDE FOR INTERPRETING THE RESULTS!!!! BE VERY VERY WARY! NUMBER OF TOTAL
|
||||
# SEQUENCES AND TOTAL CODONS OBSERVED ARE IMPORTANT (TOO FEW AND ANY INTERPRETATION IS DEVOID
|
||||
# OF ANY MEANING).
|
||||
766
PTL1/Transcriptomes/Scripts/5_GCodeTranslate.py
Normal file
766
PTL1/Transcriptomes/Scripts/5_GCodeTranslate.py
Normal file
@ -0,0 +1,766 @@
|
||||
#!/usr/bin/env python3.5
|
||||
|
||||
##__Updated__: 20_09_2017
|
||||
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
|
||||
##__Usage__: python 5_GCodeTranslate.py --help
|
||||
|
||||
|
||||
##########################################################################################
|
||||
## This script is intended to aid in identifying the genetic code of the data given ##
|
||||
## ##
|
||||
## Prior to running this script, ensure the following: ##
|
||||
## ##
|
||||
## 1. You have assembled your transcriptome and COPIED the 'assembly' file ##
|
||||
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ##
|
||||
## 2. Removed small sequences (usually sequences < 300bp) with 1_ContigFiltStats.py ##
|
||||
## 3. Removed SSU/LSU sequences from your Fasta File ##
|
||||
## 4. Classified your sequences as Strongly Prokaryotic/Eukaryotic or Undetermined ##
|
||||
## 5. Classified the Non-Strongly Prokaryotic sequences into OGs ##
|
||||
## 6. You either know (or have inferred) the genetic code of the organism ##
|
||||
## ##
|
||||
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
|
||||
## ##
|
||||
## Next Script(s) to Run: ##
|
||||
## 6_FilterPatials.py (in FinalizeTranscripts Folder) ##
|
||||
## ##
|
||||
##########################################################################################
|
||||
|
||||
import argparse, os, re, sys
|
||||
from argparse import RawTextHelpFormatter,SUPPRESS
|
||||
|
||||
from Bio import SeqIO
|
||||
from Bio.Seq import Seq
|
||||
from Bio.Data.CodonTable import CodonTable
|
||||
|
||||
|
||||
#-------------------------- Set-up Codon Tables (Genetic Codes) --------------------------#
|
||||
|
||||
blepharisma_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = ['TAA','TAG'])
|
||||
|
||||
condylostoma_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Q', 'TAG': 'Q',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = [''])
|
||||
|
||||
c_uncinata_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y', 'TAG': 'Q',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGA': 'Q', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = ['TAA'])
|
||||
|
||||
euplotes_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGA': 'C', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = ['TAA','TAG'])
|
||||
|
||||
myrionecta_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Y', 'TAG': 'Y',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = ['TGA'])
|
||||
|
||||
no_stop_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'X', 'TAG': 'X',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGA': 'X', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = [''])
|
||||
|
||||
peritrich_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'E', 'TAG': 'E',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = ['TGA'])
|
||||
|
||||
tag_table = CodonTable(forward_table={
|
||||
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
|
||||
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
|
||||
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Q',
|
||||
'TGT': 'C', 'TGC': 'C', 'TGA': 'Q', 'TGG': 'W',
|
||||
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
|
||||
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
|
||||
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
|
||||
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
|
||||
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
|
||||
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
|
||||
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
|
||||
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
|
||||
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
|
||||
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
|
||||
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
|
||||
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
|
||||
start_codons = [ 'ATG'],
|
||||
stop_codons = ['TAG'])
|
||||
|
||||
|
||||
#------------------------------ Colors For Print Statements ------------------------------#
|
||||
class color:
|
||||
PURPLE = '\033[95m'
|
||||
CYAN = '\033[96m'
|
||||
DARKCYAN = '\033[36m'
|
||||
ORANGE = '\033[38;5;214m'
|
||||
PURPLE = '\033[94m'
|
||||
GREEN = '\033[92m'
|
||||
YELLOW = '\033[93m'
|
||||
RED = '\033[91m'
|
||||
BOLD = '\033[1m'
|
||||
UNDERLINE = '\033[4m'
|
||||
END = '\033[0m'
|
||||
|
||||
|
||||
#------------------------------- Main Functions of Script --------------------------------#
|
||||
|
||||
###########################################################################################
|
||||
###------------------------- Checks the Command Line Arguments -------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def check_args():
|
||||
|
||||
parser = argparse.ArgumentParser(description=
|
||||
color.BOLD + '\n\nThis script will '+color.RED+'Translate '+color.END+color.BOLD+'a '\
|
||||
'given Fasta file of CDS\nsequences using a given'+color.PURPLE+' Genetic Code.'+color.END+\
|
||||
color.BOLD+usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter)
|
||||
|
||||
|
||||
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
|
||||
|
||||
required_arg_group.add_argument('--input_file','-in', action='store',
|
||||
help=color.BOLD+color.GREEN+' Fasta file with CDSs\n'+color.END)
|
||||
|
||||
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
|
||||
|
||||
optional_arg_group.add_argument('--genetic_code','-g', action='store', default='universal',
|
||||
help=color.BOLD+color.GREEN+' Genetic code to use for translation\n (default = '\
|
||||
'"universal")\n'+color.END)
|
||||
|
||||
optional_arg_group.add_argument('--no_RP','-no_RP', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Allows files to "skip" the removal\n of Partial Transcripts\n'\
|
||||
+color.END)
|
||||
|
||||
optional_arg_group.add_argument('--list_codes','-codes', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Lists supported genetic codes\n'+color.END)
|
||||
|
||||
optional_arg_group.add_argument('-author', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
|
||||
|
||||
|
||||
if len(sys.argv[1:]) == 0:
|
||||
print (parser.description)
|
||||
print ('\n')
|
||||
sys.exit()
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
quit_eval = return_more_info(args)
|
||||
if quit_eval > 0:
|
||||
sys.exit()
|
||||
|
||||
### Adding in names to 'arg' class for more easy use throughout the script
|
||||
args.ntd_out = args.input_file.split('.fas')[0]+'_'+args.genetic_code.title()+'_NTD.ORF.fasta'
|
||||
args.aa_out = args.input_file.split('.fas')[0]+'_'+args.genetic_code.title()+'_AA.ORF.fasta'
|
||||
args.tsv_out = args.input_file.split('.fas')[0]+'_'+args.genetic_code.title()+'_allOGCleanresults.tsv'
|
||||
|
||||
args.home_folder = '/'.join(args.input_file.split('/')[:-1])
|
||||
args.Diamond_Folder = args.home_folder+'/DiamondOG'
|
||||
args.StopFreq = args.home_folder+'/StopCodonFreq'
|
||||
args.all_output_folder = '/'.join(args.input_file.split('/')[:-2]) + '/'
|
||||
|
||||
args.tsv_file = args.input_file.split('.fas')[0]+ '_allOGCleanresults.tsv'
|
||||
|
||||
|
||||
return args
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###------------------------------- Script Usage Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def usage_msg():
|
||||
return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 5g_GCodeTranslate.py'\
|
||||
' --input_file ../Stentor_coeruleus.WGS.CDS.Prep/Stentor_coeruleus.WGS.CDS.Renamed.fasta'\
|
||||
' --genetic_code Universal'+color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
|
||||
##########################################################################################
|
||||
|
||||
def return_more_info(args):
|
||||
|
||||
valid_arg = 0
|
||||
|
||||
supported_gcodes_names = ['bleph','blepharisma','chilo','chilodonella','condy',\
|
||||
'condylostoma','none','eup','euplotes','peritrich','vorticella','ciliate','universal',\
|
||||
'taa','tag','tga','mesodinium']
|
||||
|
||||
supported_gcodes_list = ['Blepharisma\t(TGA = W)','Chilodonella\t(TAG/TGA = Q)','Ciliate\t\t(TAR = Q)',\
|
||||
'Condylostoma\t(TAR = Q, TGA = W)','Euplotes\t(TGA = C)','Peritrich\t(TAR = E)','None\t\t(TGA/TAG/TAA = X)',\
|
||||
'Universal\t(TGA/TAG/TAA = STOP)','TAA\t\t(TAG/TGA = Q)', 'TAG\t\t(TRA = Q)', 'TGA\t\t(TAR = Q)']
|
||||
|
||||
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
|
||||
' maurerax@gmail.com\n\n'+color.END)
|
||||
|
||||
|
||||
if args.genetic_code != None and args.genetic_code.lower() not in supported_gcodes_names:
|
||||
print (color.BOLD+color.RED+'\nProvided genetic code is currently unsupported.\n\n'\
|
||||
'If you have a new genetic code, please contact the author (with some evidence).\n\n'\
|
||||
'Otherwise, use one of the currently supported genetic codes.\n'+color.END)
|
||||
print (color.BOLD+color.ORANGE+'\n'.join(supported_gcodes_list)+'\n\n'+color.END)
|
||||
print (author)
|
||||
valid_arg += 1
|
||||
else:
|
||||
if args.list_codes == True:
|
||||
print (color.BOLD+color.RED+'\nThese are the currently supported genetic codes.\n'+color.END)
|
||||
print (color.BOLD+color.ORANGE+'\n'.join(supported_gcodes_list)+'\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
if args.author == True:
|
||||
print (author)
|
||||
valid_arg += 1
|
||||
|
||||
if args.input_file != None:
|
||||
if os.path.isfile(args.input_file) != False:
|
||||
if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
elif args.input_file.endswith('WTA_EPU.Renamed.fasta') != True:
|
||||
print (color.BOLD+'\n\nInvalid Fasta File! Only Fasta Files that were processed'\
|
||||
' with '+color.GREEN+'3_CountOGsDiamond.py '+color.END+color.BOLD+'are valid\n\n'\
|
||||
'However, to bypass that issue, Fasta Files MUST end with '+color.CYAN+\
|
||||
'"WTA_EPU.Renamed.fasta"\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
else:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
||||
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
return valid_arg
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###--------------------------- Does the Inital Folder Prep -----------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def prep_folders(args):
|
||||
|
||||
OG_folder = '/'.join(args.input_file.split('/')[:-1]) + '/DiamondOG/'
|
||||
|
||||
if os.path.isdir(OG_folder) != True:
|
||||
os.system('mkdir '+OG_folder)
|
||||
|
||||
if os.path.isdir(args.all_output_folder + 'TranslatedTranscriptomes') != True:
|
||||
os.system('mkdir ' + args.all_output_folder + 'TranslatedTranscriptomes')
|
||||
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###---------------- Scans 5-Prime End of Transcript for In-Frame "ATG" ----------------###
|
||||
##########################################################################################
|
||||
|
||||
def check_new_start_new(some_seq, low_lim, upper_lim, old_start, codon_table):
|
||||
|
||||
## Looks for in-frame STOP codons in the UTR of the transcript
|
||||
prime5 = str(Seq(some_seq[low_lim:upper_lim]).translate(table=codon_table)).replace('*','x')
|
||||
in_frame_stops = [stops.start() for stops in re.finditer('x',prime5)]
|
||||
|
||||
## Looks for in-frame START codons in the UTR of the transcript
|
||||
in_frame_starts = [starts.start() for starts in re.finditer('M',prime5)]
|
||||
|
||||
## Checks that there are NO in-frame STOP codons between the possible "new" START codon
|
||||
## and the aligned portion of the transcript -- THIS is double checked!
|
||||
if len(in_frame_starts) != 0:
|
||||
if len(in_frame_stops) != 0:
|
||||
if in_frame_stops[-1] < in_frame_starts[-1]:
|
||||
new_start = low_lim+in_frame_starts[-1]*3
|
||||
else:
|
||||
new_start = old_start
|
||||
else:
|
||||
new_start = low_lim+in_frame_starts[-1]*3
|
||||
else:
|
||||
new_start = old_start
|
||||
|
||||
## Skips the double-checking if there are no GOOD potential START codons
|
||||
if new_start == old_start:
|
||||
updated_start = old_start
|
||||
|
||||
else:
|
||||
## Double checks that there are NO IN-FRAME stop codons between the NEW-SUGGESTED Start
|
||||
## position and the OLD-SUPPORTED stop position!
|
||||
between_new_old_start = str(Seq(some_seq[new_start:old_start]).translate(table=1)).replace('*','x')
|
||||
in_frame_stops_check = [stops.start() for stops in re.finditer('x',between_new_old_start)]
|
||||
in_frame_starts_check = [starts.start() for starts in re.finditer('M',between_new_old_start)]
|
||||
if len(in_frame_starts_check) != 0:
|
||||
if len(in_frame_stops_check) != 0:
|
||||
if in_frame_stops_check[-1] < in_frame_starts_check[-1]:
|
||||
updated_start = new_start+in_frame_starts_check[-1]*3
|
||||
else:
|
||||
updated_start = old_start
|
||||
else:
|
||||
updated_start = new_start
|
||||
else:
|
||||
updated_start = new_start
|
||||
|
||||
return updated_start
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###--------------- Extracts the ORF from the Fasta File and SpreadSheet ---------------###
|
||||
##########################################################################################
|
||||
|
||||
def extract_ORF(prot_dict, codon_table, args):
|
||||
|
||||
print (color.BOLD+'\n\nExtracting '+color.PURPLE+'ORFs'+color.END+color.BOLD+' from'\
|
||||
' the transcriptomic data-set\n\n'+color.END)
|
||||
|
||||
for k, v in prot_dict.items():
|
||||
|
||||
## Attempting to find the most-likely START (ATG) position in the transcript (tricky)
|
||||
## Skips this if the initial Methionine (ATG) is likely present
|
||||
## (e.g. the alignment position of the protein = '1')
|
||||
prot_start = int(v[3].split('..')[0])
|
||||
old_start = v[1]
|
||||
if prot_start != 1:
|
||||
min_dist, max_dist = round_down_three(prot_start)
|
||||
min_start = old_start-min_dist
|
||||
max_start = old_start-max_dist
|
||||
if min_start < 0:
|
||||
min_start = old_start
|
||||
if max_start < 0:
|
||||
max_start = min_start%3
|
||||
# print k+'\tOld_start\t'+str(old_start)+'\tMin_Dist/Start\t'+str(min_dist)+'/'+str(min_start)+'\tMax_Dist/Start\t'+str(max_dist)+'/'+str(max_start)+'\n'
|
||||
updated_start = check_new_start_new(v[-1], max_start, min_start, old_start, codon_table)
|
||||
else:
|
||||
updated_start = old_start
|
||||
temp = prot_dict[k][-1][updated_start:]
|
||||
|
||||
## Uses the given genetic code to identify the stop position of the ORF
|
||||
temp_prot = str(Seq(temp).translate(table=codon_table))
|
||||
if '*' in temp_prot:
|
||||
stop_pos = (temp_prot.index('*')+1)*3
|
||||
prot_dict[k].append(temp[:stop_pos])
|
||||
else:
|
||||
stop_pos = prot_dict[k][2] - prot_dict[k][1]
|
||||
prot_dict[k].append(temp[:stop_pos])
|
||||
|
||||
## Awkward_list is populated with unexpectedly SHORT ORFs!
|
||||
## Reasons for being short include:
|
||||
# An error Xyrus introduced
|
||||
# Not as great genetic code decision (in-frame stop)
|
||||
# Crummy sequence/assembly quality (false in-frame stop codons)
|
||||
|
||||
awkward_list = []
|
||||
look_good = []
|
||||
|
||||
for k, v in prot_dict.items():
|
||||
expected_min = len(v[-2][v[1]:v[2]])-1
|
||||
if len(v[-1]) < expected_min:
|
||||
awkward_list.append(k)
|
||||
else:
|
||||
look_good.append(k)
|
||||
|
||||
if len(awkward_list) != 0:
|
||||
with open('ShortTranscripts_FromTranslation.txt','w+') as x:
|
||||
for entry in awkward_list:
|
||||
x.write(entry+'\n')
|
||||
else:
|
||||
pass
|
||||
|
||||
print (color.BOLD+'\n\nTranslating '+color.PURPLE+'ORFs'+color.END+color.BOLD+' from'\
|
||||
' using the '+color.DARKCYAN+args.genetic_code.title()+' genetic code'+color.END)
|
||||
|
||||
for k, v in prot_dict.items():
|
||||
prot_dict[k].append(str(Seq(v[-1]).translate(table=codon_table)).rstrip('*'))
|
||||
|
||||
return prot_dict
|
||||
|
||||
##########################################################################################
|
||||
###------------ Grabs the Coding Coordinates from the OG-BLAST SpreadSheet ------------###
|
||||
##########################################################################################
|
||||
|
||||
def prep_translations(args):
|
||||
|
||||
print (color.BOLD+'\n\nGrabbing useful info from the '+color.ORANGE+args.input_file\
|
||||
.split('/')[-1]+color.END+color.BOLD+' Fasta File\nand from the '+color.ORANGE+args.tsv_file\
|
||||
.split('/')[-1]+color.END+color.BOLD+' OG-Assignment Spreadsheet'+color.END)
|
||||
|
||||
inTSV = ['\t'.join(i.rstrip('\n').split('\t')[:-1]) for i in open(args.tsv_file).readlines() if i != '\n']
|
||||
inFasta = [i for i in SeqIO.parse(args.input_file,'fasta')]
|
||||
|
||||
# ORF identification step here, uses the 'allOGCleanresults.tsv file to identify the ORF
|
||||
prot_dict = {}
|
||||
|
||||
# Special scenario! Only for when the genetic code is not particularly useful ...
|
||||
if args.genetic_code.lower() == 'none' or args.genetic_code.lower() == 'condylostoma' or args.genetic_code.lower() == 'condy':
|
||||
for i in inTSV:
|
||||
prot_dict.setdefault(i.split('\t')[0],[])
|
||||
if int(i.split('\t')[6]) < int(i.split('\t')[7]):
|
||||
## Saves the Transcript Orientation (Coding vs. Template Strand)
|
||||
prot_dict[i.split('\t')[0]].append('F')
|
||||
## Collects initial Start and Stop positions from the BLAST alignment
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[6])-1)
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[7])+3)
|
||||
## Implied Amino Acid alignment positions (e.g. does the alignment start at the 1st Methionine?)
|
||||
prot_dict[i.split('\t')[0]].append('..'.join(i.split('\t')[-4:-2]))
|
||||
|
||||
if int(i.split('\t')[7]) < int(i.split('\t')[6]):
|
||||
## Saves the Transcript Orientation (Coding vs. Template Strand)
|
||||
prot_dict[i.split('\t')[0]].append('RC')
|
||||
## Collects initial Start and Stop positions from the BLAST alignment
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('_Len')[1].split('_')[0])-int(i.split('\t')[6]))
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('_Len')[1].split('_')[0])-int(i.split('\t')[7])+1)
|
||||
## Implied Amino Acid alignment positions (e.g. does the alignment start at the 1st Methionine?)
|
||||
prot_dict[i.split('\t')[0]].append('..'.join(i.split('\t')[-4:-2]))
|
||||
|
||||
## Makes sure that the dictionary has the transcript in the correct orientation
|
||||
for i in inFasta:
|
||||
if i.description in prot_dict.keys():
|
||||
if 'RC' == prot_dict[i.description][0]:
|
||||
prot_dict[i.description].append(str(i.seq.reverse_complement()))
|
||||
else:
|
||||
prot_dict[i.description].append(str(i.seq))
|
||||
|
||||
else:
|
||||
for i in inTSV:
|
||||
prot_dict.setdefault(i.split('\t')[0],[])
|
||||
if int(i.split('\t')[6]) < int(i.split('\t')[7]):
|
||||
## Saves the Transcript Orientation (Coding vs. Template Strand)
|
||||
prot_dict[i.split('\t')[0]].append('F')
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[6])-1)
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('\t')[7])+3)
|
||||
## Implied Amino Acid alignment positions (e.g. does the alignment start at the 1st Methionine?)
|
||||
prot_dict[i.split('\t')[0]].append('..'.join(i.split('\t')[-4:-2]))
|
||||
if int(i.split('\t')[7]) < int(i.split('\t')[6]):
|
||||
## Saves the Transcript Orientation (Coding vs. Template Strand)
|
||||
prot_dict[i.split('\t')[0]].append('RC')
|
||||
## Collects initial Start and Stop positions from the BLAST alignment (but in the "correct" orientation)
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('_Len')[1].split('_')[0])-int(i.split('\t')[6]))
|
||||
prot_dict[i.split('\t')[0]].append(int(i.split('_Len')[1].split('_')[0])-int(i.split('\t')[7])+1)
|
||||
## Implied Amino Acid alignment positions (e.g. does the alignment start at the 1st Methionine?)
|
||||
prot_dict[i.split('\t')[0]].append('..'.join(i.split('\t')[-4:-2]))
|
||||
|
||||
## Makes sure that the dictionary has the transcript in the correct orientation
|
||||
for i in inFasta:
|
||||
if i.description in prot_dict.keys():
|
||||
if 'RC' == prot_dict[i.description][0]:
|
||||
prot_dict[i.description].append(str(i.seq.reverse_complement()))
|
||||
else:
|
||||
prot_dict[i.description].append(str(i.seq))
|
||||
|
||||
return prot_dict
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###------------------------ Rounds Down Values to Nearest "3" -------------------------###
|
||||
##########################################################################################
|
||||
|
||||
def round_down_three(num):
|
||||
min_val = int(num*3*.5)-int(num*3*.5)%3
|
||||
max_val = int(num*6)-int(num*6)%3
|
||||
return min_val, max_val
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###--------------------- Makes Translation Steps (Later) Easier -----------------------###
|
||||
##########################################################################################
|
||||
|
||||
def standardize_gcode(given_code):
|
||||
if given_code == 'ciliate' or given_code == 'tga':
|
||||
codon_table = 6
|
||||
elif given_code == 'chilodonella' or given_code == 'chilo' or given_code == 'taa':
|
||||
codon_table = c_uncinata_table
|
||||
elif given_code == 'blepharisma' or given_code == 'bleph':
|
||||
codon_table = blepharisma_table
|
||||
elif given_code == 'euplotes' or given_code == 'eup':
|
||||
codon_table = euplotes_table
|
||||
elif given_code == 'myrionecta' or given_code == 'mesodinium':
|
||||
codon_table = myrionecta_table
|
||||
elif given_code == 'peritrich' or given_code == 'vorticella':
|
||||
codon_table = peritrich_table
|
||||
elif given_code == 'none':
|
||||
codon_table = no_stop_table
|
||||
elif given_code == 'condylostoma' or given_code == 'condy':
|
||||
codon_table = condylostoma_table
|
||||
elif given_code == 'tag':
|
||||
codon_table = tag_table
|
||||
elif given_code == 'universal':
|
||||
codon_table = 1
|
||||
else:
|
||||
print (color.BOLD+color.RED+'\n\nNo valid genetic code provided!\n\n'+color.END+\
|
||||
color.BOLD+'Using the "Universal" genetic code (by default)\n\nPlease check that the'\
|
||||
' code you wish to use is supported:'+color.CYAN+'\n\npython 5_GCodeTranslate.py'\
|
||||
' -list_codes\n\n'+color.END)
|
||||
codon_table = 1
|
||||
|
||||
return codon_table
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###------------------ Updates Spreadsheet with Updated Contig Names --------------------###
|
||||
###########################################################################################
|
||||
|
||||
def update_spreadsheet(args, updated_spreadsheet_dict):
|
||||
if os.path.isdir(args.home_folder + '/DiamondOG/') != True:
|
||||
os.system(args.home_folder + '/DiamondOG/')
|
||||
else:
|
||||
pass
|
||||
|
||||
inTSV = [line.rstrip('\n') for line in open(args.tsv_file).readlines() if line != '\n' and line.split('\t')[0] in updated_spreadsheet_dict.keys()]
|
||||
|
||||
updatedTSV = [updated_spreadsheet_dict[line.split('\t')[0]]+'\t'+'\t'.join(line.split('\t')[1:]) for line in inTSV]
|
||||
|
||||
with open(args.tsv_out,'w+') as w:
|
||||
w.write('\n'.join(updatedTSV))
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------------------- Updates Log With OG Assignment Information ---------------------###
|
||||
###########################################################################################
|
||||
|
||||
def update_log(filename, codon_table):
|
||||
|
||||
if os.path.isdir('../PostAssembly_Logs/') != True:
|
||||
os.system('mkdir ../PostAssembly_Logs/')
|
||||
else:
|
||||
pass
|
||||
|
||||
ntd_ORF = [i for i in SeqIO.parse(filename.split('.fas')[0]+'_'+gcode.title()+'_ORF.fasta','fasta')]
|
||||
aa_ORF = [i for i in SeqIO.parse(filename.split('.fas')[0]+'_'+gcode.title()+'_ORF.aa.fasta','fasta')]
|
||||
|
||||
min_ntd_ORF = str(min([len(i.seq) for i in ntd_ORF]))
|
||||
max_ntd_ORF = str(max([len(i.seq) for i in ntd_ORF]))
|
||||
avg_ntd_ORF = '%.2f' % (sum([len(i.seq) for i in ntd_ORF])/float(len(ntd_ORF)))
|
||||
|
||||
min_aa_ORF = str(min([len(i.seq) for i in aa_ORF]))
|
||||
max_aa_ORF = str(max([len(i.seq) for i in aa_ORF]))
|
||||
avg_aa_ORF = '%.2f' % (sum([len(i.seq) for i in aa_ORF])/float(len(aa_ORF)))
|
||||
|
||||
for Logname in os.listdir(os.curdir+'./PostAssembly_Logs/'):
|
||||
if Logname.startswith(filename.split('/')[2].split('_WTA')[0]) and Logname.endswith('Log.txt'):
|
||||
with open('../PostAssembly_Logs/'+Logname,'a') as LogFile:
|
||||
LogFile.write('Nucleotide ORFs\t'+str(len(ntd_ORF))+'\tn/a\tn/a\n')
|
||||
LogFile.write('Nucleotide ORF Lengths\t'+avg_ntd_ORF+'\t'+min_ntd_ORF+'\t'+max_ntd_ORF+'\n')
|
||||
LogFile.write('Protein ORFs\t'+str(len(aa_ORF))+'\tn/a\tn/a\n')
|
||||
LogFile.write('Protein ORF Lengths\t'+avg_aa_ORF+'\t'+min_aa_ORF+'\t'+max_aa_ORF+'\n')
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###----------------------- Write File with Provided Genetic Code ----------------------###
|
||||
##########################################################################################
|
||||
|
||||
def write_data_out(prot_dict, codon_table, args):
|
||||
|
||||
update_spreadsheet_dict = {}
|
||||
|
||||
#The code below only works if rnaspades was used; constrained by addition of script 6b
|
||||
for k, v in prot_dict.items():
|
||||
#if 'Cov' in k:
|
||||
new_name = k.split('_Len')[0]+'_Len'+str(len(v[-2]))+'_'+'_'.join(k.split('_')[-3:])
|
||||
#update_spreadsheet_dict[k] = new_name
|
||||
update_spreadsheet_dict[k] = k
|
||||
#else:
|
||||
#new_name = k.split('_Len')[0]+'_Len'+str(len(v[-2]))+'_'+'_'.join(k.split('_')[-2:])
|
||||
#update_spreadsheet_dict[k] = new_name
|
||||
#update_spreadsheet_dict[k] = k
|
||||
|
||||
with open(args.ntd_out,'w+') as w:
|
||||
print (color.BOLD+'\n\nWriting FASTA file with '+color.PURPLE+'ORF'+color.END+color.BOLD\
|
||||
+' sequences using the '+color.DARKCYAN+args.genetic_code.title()+' genetic code'+color.END)
|
||||
|
||||
for k, v in prot_dict.items():
|
||||
w.write('>'+update_spreadsheet_dict[k]+'\n'+str(v[-2])+'\n')
|
||||
|
||||
with open(args.aa_out, 'w+') as w:
|
||||
print (color.BOLD+'\n\nWriting FASTA file with '+color.PURPLE+'Translated ORF'+color.END+color.BOLD\
|
||||
+' sequences using the '+color.DARKCYAN+args.genetic_code.title()+' genetic code'+color.END)
|
||||
|
||||
for k, v in prot_dict.items():
|
||||
w.write('>'+update_spreadsheet_dict[k]+'\n'+str(v[-1])+'\n')
|
||||
|
||||
return update_spreadsheet_dict
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###--------------------- Cleans up the Folder and Moves Final Files -------------------###
|
||||
##########################################################################################
|
||||
|
||||
def clean_up(args):
|
||||
|
||||
if args.input_file.split('.fas')[0].split('/')[-1] + '_StopCodonStats.tsv' in os.listdir(args.home_folder):
|
||||
os.system('mv ' + args.input_file.split('.fas')[0] + '_StopCodonStats.tsv ' + args.StopFreq)
|
||||
|
||||
os.system('mv '+args.tsv_file+' '+args.Diamond_Folder)
|
||||
os.system('mv '+args.input_file+' '+args.Diamond_Folder)
|
||||
|
||||
if args.no_RP == True:
|
||||
if os.path.isdir(args.all_output_folder + 'ToRename/') != True:
|
||||
os.system('mkdir ' + args.all_output_folder + 'ToRename/')
|
||||
|
||||
os.system('cp ' + args.ntd_out + ' ' + args.all_output_folder + 'ToRename/')
|
||||
os.system('cp ' + args.aa_out + ' ' + args.all_output_folder + 'ToRename/')
|
||||
os.system('cp ' + args.tsv_out + ' ' + args.all_output_folder + 'ToRename/')
|
||||
|
||||
else:
|
||||
os.system('cp ' + args.tsv_out + ' ' + args.all_output_folder)
|
||||
os.system('cp ' + args.ntd_out + ' ' + args.all_output_folder)
|
||||
os.system('cp ' + args.aa_out + ' ' + args.all_output_folder)
|
||||
|
||||
os.system('mv ' + args.home_folder + ' ' + args.all_output_folder + 'TranslatedTranscriptomes')
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------------------------------- Next Script Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def next_script(args):
|
||||
|
||||
print (color.BOLD+'\n\nLook for '+color.DARKCYAN+args.ntd_out.split('/')[-1]+color.END+\
|
||||
color.BOLD+',\n'+color.DARKCYAN+args.aa_out.split('/')[-1]+color.END+color.BOLD+', and\n'\
|
||||
+color.DARKCYAN+args.tsv_out.split('/')[-1]+color.END+color.BOLD+',\nwhich are in the '+\
|
||||
color.ORANGE+args.home_folder.split('/')[-1]+' Folder'+color.END)
|
||||
|
||||
if args.no_RP == True:
|
||||
print(color.BOLD+'\n\nNext Script is: '+color.GREEN+'7_FinalRename.py'+color.END+color.BOLD+\
|
||||
' in the '+color.PURPLE+'RemovePartials Folder'+color.END+color.BOLD+'\nwith a copy of'\
|
||||
' the outputs of this script!'+color.END)
|
||||
print(color.BOLD+'\n\nRemember that you have chosen '+color.RED+'NOT '+color.END+color.BOLD+\
|
||||
'to remove partials\nand are skipping to the renaming step!\n\n'+color.END)
|
||||
|
||||
else:
|
||||
print(color.BOLD+'\n\nNext Script is: '+color.GREEN+'6_FilterPartials.py'+color.END+color.BOLD+\
|
||||
' in the '+color.PURPLE+'FinalizeTranscripts Folder'+color.END+color.BOLD+'\nwith a copy of'\
|
||||
' the outputs of this script!\n\n'+color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
|
||||
##########################################################################################
|
||||
|
||||
def main():
|
||||
|
||||
args = check_args()
|
||||
|
||||
prep_folders(args)
|
||||
|
||||
codon_table = standardize_gcode(args.genetic_code.lower())
|
||||
|
||||
prot_dict_Prepped = prep_translations(args)
|
||||
|
||||
prot_dict_Final = extract_ORF(prot_dict_Prepped, codon_table, args)
|
||||
|
||||
new_spreadsheet_names = write_data_out(prot_dict_Final, codon_table, args)
|
||||
|
||||
update_spreadsheet(args, new_spreadsheet_names)
|
||||
|
||||
# update_log(fasta_file, gcode)
|
||||
|
||||
clean_up(args)
|
||||
|
||||
next_script(args)
|
||||
|
||||
main()
|
||||
452
PTL1/Transcriptomes/Scripts/6_FilterPartials.py
Normal file
452
PTL1/Transcriptomes/Scripts/6_FilterPartials.py
Normal file
@ -0,0 +1,452 @@
|
||||
#!/usr/bin/env python3.5
|
||||
|
||||
##__Updated__: 2023-09-27 by Auden Cote-L'Heureux
|
||||
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com; xyrus.maurer-alcala@izb.unibe.ch
|
||||
##__Usage__: python 6_FilterPartials.py --help
|
||||
|
||||
|
||||
##################################################################################################
|
||||
## This script is intended to remove incomplete transcripts that have a more complete mate ##
|
||||
## ##
|
||||
## Prior to running this script, ensure the following: ##
|
||||
## ##
|
||||
## 1. You have assembled your transcriptome and COPIED the 'assembly' file ##
|
||||
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ##
|
||||
## 2. Removed small sequences (usually sequences < 200bp) ##
|
||||
## 3. Removed SSU/LSU sequences from your Fasta File ##
|
||||
## 4. Classified your sequences as Strongly Prokaryotic/Eukaryotic or Undetermined ##
|
||||
## 5. Classified sequences into OGs ##
|
||||
## 6. You either know (or have inferred) the genetic code of the organism ##
|
||||
## 7. You have translated the sequences and checked for the data in the RemovePartials folder ##
|
||||
## ##
|
||||
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
|
||||
## ##
|
||||
## Next Script(s) to Run: ##
|
||||
## 7_FinalRename.py ##
|
||||
## ##
|
||||
##################################################################################################
|
||||
|
||||
from Bio import SeqIO
|
||||
from Bio.Seq import Seq
|
||||
from statistics import mean
|
||||
|
||||
from distutils import spawn
|
||||
import argparse, os, sys, time, re
|
||||
from argparse import RawTextHelpFormatter,SUPPRESS
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
#------------------------------ Colors For Print Statements ------------------------------#
|
||||
class color:
|
||||
PURPLE = '\033[95m'
|
||||
CYAN = '\033[96m'
|
||||
DARKCYAN = '\033[36m'
|
||||
ORANGE = '\033[38;5;214m'
|
||||
BLUE = '\033[94m'
|
||||
GREEN = '\033[92m'
|
||||
YELLOW = '\033[93m'
|
||||
RED = '\033[91m'
|
||||
BOLD = '\033[1m'
|
||||
UNDERLINE = '\033[4m'
|
||||
END = '\033[0m'
|
||||
|
||||
|
||||
#------------------------------- Main Functions of Script --------------------------------#
|
||||
|
||||
###########################################################################################
|
||||
###---------------------------- UPDATE DIAMOND PATH BELOW! -----------------------------###
|
||||
###########################################################################################
|
||||
## IF Diamond is IN YOUR PATH then no updating is needed...
|
||||
|
||||
def check_diamond_path():
|
||||
|
||||
diamond_path = ''
|
||||
|
||||
if diamond_path == '':
|
||||
diamond_path = spawn.find_executable("diamond")
|
||||
#diamond_path = '/path/to/diamond'
|
||||
else:
|
||||
pass
|
||||
|
||||
if diamond_path == None:
|
||||
print (color.BOLD + '\n\nPlease open this script and check that you have included'\
|
||||
+' the PATH to the'+color.BLUE+' "diamond" '+color.END+color.BOLD+'executable.\n\n'+color.END)
|
||||
print (color.BOLD+color.BLUE+'LOOK FOR:\n\n'+color.RED\
|
||||
+'#------------------------------ UPDATE DIAMOND PATH BELOW! -------------------------------#'\
|
||||
+color.BLUE+'\n\nThis is somewhere around lines 50 - 80...\n\n'+color.END)
|
||||
|
||||
sys.exit()
|
||||
else:
|
||||
pass
|
||||
|
||||
return diamond_path
|
||||
|
||||
###########################################################################################
|
||||
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
|
||||
###########################################################################################
|
||||
|
||||
def check_args():
|
||||
|
||||
parser = argparse.ArgumentParser(description=
|
||||
color.BOLD + '\n\nThis script is intended to '+color.RED+'Identify and Collapse '+color.END\
|
||||
+color.BOLD+'partial '+color.PURPLE+'ORFS\n'+color.END+color.BOLD+'present within a '\
|
||||
+color.RED+'Given'+color.END+color.BOLD+' transcriptome (or replicate) transcriptome(s)'\
|
||||
+usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter)
|
||||
|
||||
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
|
||||
|
||||
required_arg_group.add_argument('--file_prefix','-fp', action='store',
|
||||
help=color.BOLD+color.GREEN+' File prefix that is unique (or common)\n to the files '\
|
||||
'to be processed\n'+color.END)
|
||||
|
||||
|
||||
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
|
||||
|
||||
optional_arg_group.add_argument('--identity','-id', type=float, action='store', default=0.98,
|
||||
help=color.BOLD+color.GREEN+' Identity threshold for identifying \n "partials" to larger'\
|
||||
' contigs\n (default = 0.98)\n'+color.END)
|
||||
optional_arg_group.add_argument('-author', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
|
||||
optional_arg_group.add_argument('--hook_fasta','-f', help='Path to the fasta file of the Hook DB in the Databases/db_OG folder')
|
||||
|
||||
if len(sys.argv[1:]) == 0:
|
||||
print (parser.description)
|
||||
print ('\n')
|
||||
sys.exit()
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
args.id_print = str(int(float(args.identity)*100))
|
||||
|
||||
args.all_output_folder = '/'.join(args.file_prefix.split('/')[:-1]) + '/'
|
||||
args.file_prefix = args.file_prefix.split('/')[-1]
|
||||
|
||||
args.file_listNTD = [args.all_output_folder + i for i in os.listdir(args.all_output_folder) if args.file_prefix in i and i.endswith('NTD.ORF.fasta')]
|
||||
|
||||
args.file_listAA = [args.all_output_folder + i for i in os.listdir(args.all_output_folder) if args.file_prefix in i and i.endswith('AA.ORF.fasta')]
|
||||
|
||||
args.file_listTSV = [args.all_output_folder + i for i in os.listdir(args.all_output_folder) if args.file_prefix in i and i.endswith('results.tsv')]
|
||||
|
||||
quit_eval = return_more_info(args)
|
||||
if quit_eval > 0:
|
||||
print ('\n')
|
||||
sys.exit()
|
||||
|
||||
return args
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###------------------------------- Script Usage Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def usage_msg():
|
||||
return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 6_RemovePartials.py'\
|
||||
' --file_prefix Op_me_Xxma'+color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
|
||||
##########################################################################################
|
||||
|
||||
def return_more_info(args):
|
||||
|
||||
valid_arg = 0
|
||||
|
||||
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
|
||||
' maurerax@gmail.com\n\n'+color.END)
|
||||
|
||||
if args.author == True:
|
||||
print (author)
|
||||
valid_arg += 1
|
||||
|
||||
if args.file_listNTD == []:
|
||||
print (color.BOLD+'\n\nNo '+color.ORANGE+'Nucleotide Fasta Files'+color.END+color.BOLD+\
|
||||
' found!\n\nCheck that your'+color.GREEN+' File Prefix'+color.END+color.BOLD+\
|
||||
'is present in\nthe files of interest')
|
||||
valid_arg += 1
|
||||
|
||||
if args.file_listAA == []:
|
||||
print (color.BOLD+'\n\nNo '+color.ORANGE+'Protein Fasta Files'+color.END+color.BOLD+\
|
||||
' found!\n\nCheck that your'+color.GREEN+' File Prefix'+color.END+color.BOLD+\
|
||||
'is present in\nthe files of interest')
|
||||
valid_arg += 1
|
||||
|
||||
if args.file_listTSV == []:
|
||||
print (color.BOLD+'\n\nNo '+color.ORANGE+'OG-Assignment Spreadsheets'+color.END+color.BOLD+\
|
||||
' found!\n\nCheck that your'+color.GREEN+' File Prefix'+color.END+color.BOLD+\
|
||||
'is present in\nthe files of interest')
|
||||
valid_arg += 1
|
||||
|
||||
if len(args.file_listNTD) == len(args.file_listAA) == len(args.file_listTSV):
|
||||
pass
|
||||
else:
|
||||
print (color.BOLD+color.RED+'\n\nError:'+color.END+color.BOLD+' Unequal numbers of'\
|
||||
' input files found.\n\nDouble-check that there are:'+color.CYAN+'SINGLE'+color.END\
|
||||
+color.BOLD+' Nucleotide and Protein fasta files and OG-assignment Spreadsheet for'\
|
||||
' each transcriptome\n\nThen try once again.'+color.END)
|
||||
valid_arg += 1
|
||||
|
||||
return valid_arg
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###------------------------- Creates Folders For Storing Data -------------------------###
|
||||
##########################################################################################
|
||||
|
||||
def prep_folders(args):
|
||||
|
||||
if os.path.isdir(args.all_output_folder + 'ToRename') != True:
|
||||
os.system('mkdir ' + args.all_output_folder + 'ToRename')
|
||||
|
||||
if os.path.isdir(args.all_output_folder + args.file_prefix) != True:
|
||||
os.system('mkdir ' + args.all_output_folder + args.file_prefix)
|
||||
|
||||
if os.path.isdir(args.all_output_folder + args.file_prefix + '/Original') != True:
|
||||
os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Original')
|
||||
os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Original/SpreadSheets')
|
||||
os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Original/Concatenated/')
|
||||
os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Original/Concatenated/SpreadSheets')
|
||||
|
||||
if os.path.isdir(args.all_output_folder + args.file_prefix + '/Processed') != True:
|
||||
os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Processed')
|
||||
os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Processed/SpreadSheets')
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###-------------------- Merges Fasta Files When Replicates Present --------------------###
|
||||
##########################################################################################
|
||||
|
||||
def merge_fasta_replicates(args, type):
|
||||
|
||||
cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/'
|
||||
|
||||
count = 0
|
||||
fasta_to_merge = []
|
||||
|
||||
if type == 'NTD':
|
||||
fasta_list = args.file_listNTD
|
||||
else:
|
||||
fasta_list = args.file_listAA
|
||||
|
||||
for file in fasta_list:
|
||||
fasta_to_merge += ['>'+str(count)+'_'+i for i in open(file).read().split('>') if i != '']
|
||||
count += 1
|
||||
|
||||
with open(cat_folder+args.file_prefix+'.'+type+'.Concatenated.fasta','w+') as w:
|
||||
w.write(''.join(fasta_to_merge))
|
||||
|
||||
time.sleep(.75)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###--------------------- Merges TSV Files When Replicates Present ---------------------###
|
||||
##########################################################################################
|
||||
|
||||
def merge_tsv_replicates(args):
|
||||
|
||||
cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/SpreadSheets/'
|
||||
|
||||
count = 0
|
||||
tsv_to_merge = []
|
||||
|
||||
for file in args.file_listTSV:
|
||||
tsv_to_merge += [str(count)+'_'+i for i in open(file).read().split('\n') if i != '']
|
||||
count += 1
|
||||
|
||||
with open(cat_folder+args.file_prefix+'_Concatenated.allOGCleanresults.tsv','w+') as w:
|
||||
w.write('\n'.join(tsv_to_merge))
|
||||
|
||||
time.sleep(.75)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###------------------ Calls on the other Merge Functions by Data Type -----------------###
|
||||
##########################################################################################
|
||||
|
||||
def merge_relevant_data(args):
|
||||
|
||||
print (color.BOLD+'\n\nMerging Transcriptome data together.'+color.END)
|
||||
|
||||
merge_fasta_replicates(args, 'NTD')
|
||||
merge_fasta_replicates(args, 'AA')
|
||||
merge_tsv_replicates(args)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###-------------------- Removes Nearly Identical ORFs from Data Set -------------------###
|
||||
##########################################################################################
|
||||
|
||||
def filter_NTD_data(args, OGLenDB):
|
||||
|
||||
cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/'
|
||||
proc_folder = args.all_output_folder + args.file_prefix + '/Processed/'
|
||||
|
||||
print (color.BOLD+'\n\nRemoving Partial '+color.PURPLE+'ORFs'+color.END+color.BOLD+\
|
||||
' with >'+args.id_print+'% Nucleotide Identity over >33% of\ntheir length when '\
|
||||
'compared to more complete '+color.PURPLE+'ORFs '+color.END+color.BOLD+'from: '\
|
||||
+color.CYAN+args.file_prefix+'\n\n'+color.END)
|
||||
|
||||
#Read in the NTD and AA sequences output by script 5 and concatenated across files for the taxon above
|
||||
starting_NTD_seqs = { rec.id : str(rec.seq) for rec in SeqIO.parse(cat_folder+args.file_prefix+'.NTD.Concatenated.fasta', 'fasta') }
|
||||
starting_AA_seqs = { rec.id : str(rec.seq) for rec in SeqIO.parse(cat_folder+args.file_prefix+'.AA.Concatenated.fasta', 'fasta') }
|
||||
|
||||
#Creating a record of short sequences left over from translation to be removed
|
||||
short_from_translation = []
|
||||
if os.path.isfile('ShortTranscripts_FromTranslation.txt'):
|
||||
for line in open('ShortTranscripts_FromTranslation.txt'):
|
||||
short_from_translation.append(line.strip())
|
||||
|
||||
#Remove sequences <33% or >150% the average length of their OG in the Hook database
|
||||
good_NTD_seqs = []; good_AA_seqs = []
|
||||
for rec in starting_NTD_seqs:
|
||||
og_number = re.split('OG.{1}_', rec)[-1][:6]
|
||||
og_prefix = rec.split(og_number)[0][-4:]
|
||||
og = og_prefix + og_number
|
||||
|
||||
if len(starting_AA_seqs[rec]) <= 1.5*OGLenDB[og] and len(starting_AA_seqs[rec]) >= 0.33*OGLenDB[og] and '-'.join(rec.split('_')[1:]) not in short_from_translation:
|
||||
good_NTD_seqs.append((rec, starting_NTD_seqs[rec]))
|
||||
good_AA_seqs.append((rec, starting_AA_seqs[rec]))
|
||||
|
||||
#Write out all sequences after removal of ORFs based on comparison to mean Hook OG length
|
||||
with open(proc_folder + args.file_prefix + '.NTD.HookLenFiltered_NotPartialFiltered.fasta', 'w') as o:
|
||||
for seq in good_NTD_seqs:
|
||||
o.write('>' + seq[0] + '\n' + seq[1] + '\n\n')
|
||||
|
||||
with open(proc_folder + args.file_prefix + '.AA.HookLenFiltered_NotPartialFiltered.fasta', 'w') as o:
|
||||
for seq in good_AA_seqs:
|
||||
o.write('>' + seq[0] + '\n' + seq[1] + '\n\n')
|
||||
|
||||
#BLAST-ing all nucleotide sequences that survived the Hook-relative length filter against each other
|
||||
db_cmd = 'makeblastdb -in ' + proc_folder + args.file_prefix + '.NTD.HookLenFiltered_NotPartialFiltered.fasta -dbtype nucl -parse_seqids -out ' + proc_folder + args.file_prefix + '.NTD.HookLenFiltered_NotPartialFiltered'
|
||||
blastn_cmd = 'blastn -query ' + proc_folder + args.file_prefix + '.NTD.HookLenFiltered_NotPartialFiltered.fasta -db ' + proc_folder + args.file_prefix + '.NTD.HookLenFiltered_NotPartialFiltered -perc_identity 98 -outfmt "6 std qcovs" -out ' + proc_folder + '/SpreadSheets/All_NTD_SelfBLAST_Results.tsv'
|
||||
|
||||
os.system(db_cmd)
|
||||
os.system(blastn_cmd)
|
||||
|
||||
#Creating a record of query, subject pairs for each OG
|
||||
pairs_per_og = { }
|
||||
for line in open(proc_folder + '/SpreadSheets/All_NTD_SelfBLAST_Results.tsv'):
|
||||
|
||||
#IMPORTANT: This line is where the query coverage threshold is determined, and it might be helpful to adjust this when trying to optimally filter chimeras. By default it is set to 33%
|
||||
if line.split('\t')[0] != line.split('\t')[1] and line.split('\t')[0][-10:] == line.split('\t')[1][-10:] and int(line.split('\t')[-1].strip()) > 33:
|
||||
if line.split('\t')[0][-10:] not in pairs_per_og:
|
||||
pairs_per_og.update({ line.split('\t')[0][-10:] : [] })
|
||||
|
||||
pairs_per_og[line.split('\t')[0][-10:]].append([line.split('\t')[0], line.split('\t')[1]])
|
||||
|
||||
#For each OG, iterating through master sequences by decreasing score (cov*len), removing sequences that hit each master
|
||||
partials_to_remove = []; pairs_with_query_removed = []
|
||||
for og in pairs_per_og:
|
||||
|
||||
#Sorting the sequences by score (cov*len)
|
||||
seqs = sorted(list(dict.fromkeys([seq for pair in pairs_per_og[og] for seq in pair])), key = lambda x : -(int(x.split('Len')[-1].split('_')[0]) * int(x.split('Cov')[-1].split('_')[0])))
|
||||
|
||||
#Iterating through masters and determining which sequences to remove
|
||||
for master in seqs:
|
||||
if master not in partials_to_remove:
|
||||
for pair in pairs_per_og[og]:
|
||||
if pair[1] == master:
|
||||
partials_to_remove.append(pair[0])
|
||||
pairs_with_query_removed.append(pair)
|
||||
|
||||
#Writing out a record of the BLAST hits of all relevant pairs (i.e. when removed sequences hit a master sequence)
|
||||
with open(args.all_output_folder + args.file_prefix + '/'+args.file_prefix+'_SeqPairsAbove98.txt','w') as w:
|
||||
for line in open(proc_folder + '/SpreadSheets/All_NTD_SelfBLAST_Results.tsv'):
|
||||
if [line.split('\t')[0], line.split('\t')[1]] in pairs_with_query_removed:
|
||||
w.write(line)
|
||||
|
||||
####################################################################
|
||||
## Finalized Outputs are Summarized and Written Out to New Fastas ##
|
||||
####################################################################
|
||||
|
||||
# print (color.BOLD+'There were '+color.CYAN+str(len(inFasta_NTD_rawLen))+color.END+color.BOLD\
|
||||
# +color.PURPLE+' ORFs '+color.END+color.BOLD+'originally, with '+color.ORANGE+\
|
||||
# str(nuc_tsv_100)+color.END+color.BOLD+' Partial '+color.PURPLE+'ORFs'+color.END+\
|
||||
# color.BOLD+' that\nwere '+color.RED+'100% Identical'+color.END+color.BOLD+' to larger'\
|
||||
# +color.PURPLE+' ORFs.\n\n'+color.END)
|
||||
|
||||
# print(color.BOLD+'Of the '+color.CYAN+str(len(inFasta_NTD_rawLen))+color.END+color.BOLD\
|
||||
# +' original'+color.PURPLE+' ORFs'+color.END+color.BOLD+', '+color.ORANGE+str(len(set(seqs_to_toss)))+\
|
||||
# color.END+color.BOLD+' are '+color.PURPLE+'Partial ORFs '+color.END+color.BOLD+'(e.g. '+\
|
||||
# color.RED+'> '+args.id_print+'%'+color.END+color.BOLD+'\nNUCLEOTIDE identity) to larger'\
|
||||
# +color.PURPLE+' ORFs'+color.END+color.BOLD+' with '+color.ORANGE+str(too_short+too_long)\
|
||||
# +color.END+color.BOLD+' additional'+color.PURPLE+' ORFs\n'+color.END+color.BOLD+'that were either '+\
|
||||
# color.RED+'TOO LONG or SHORT.\n\n'+color.END)
|
||||
|
||||
# print (color.BOLD+'Overall, there are '+color.GREEN+str(len(good_NTD_seqs))+' Unique ORFs'\
|
||||
# +color.END+color.BOLD+' for '+color.CYAN+args.file_prefix+'\n'+color.END)
|
||||
|
||||
with open(proc_folder+args.file_prefix+'_Filtered.Final.NTD.ORF.fasta','w+') as w:
|
||||
for i in good_NTD_seqs:
|
||||
if i[0] not in partials_to_remove:
|
||||
w.write('>' + i[0] + '\n' + str(i[1]) + '\n')
|
||||
|
||||
with open(proc_folder+args.file_prefix+'_Filtered.Final.AA.ORF.fasta','w+') as x:
|
||||
for i in good_AA_seqs:
|
||||
if i[0] not in partials_to_remove:
|
||||
x.write('>' + i[0] + '\n' + str(i[1]) + '\n')
|
||||
|
||||
good_seq_names = [i[0] for i in good_NTD_seqs]
|
||||
|
||||
with open(proc_folder + '/SpreadSheets/' + args.file_prefix + '_Filtered.Final.allOGCleanresults.tsv', 'w') as t:
|
||||
for line in open(cat_folder + '/SpreadSheets/' + args.file_prefix + '_Concatenated.allOGCleanresults.tsv'):
|
||||
if line.split('\t')[0] in good_seq_names and line.split('\t')[0] not in partials_to_remove:
|
||||
t.write(line)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###--------------------- Cleans up the Folder and Moves Final Files -------------------###
|
||||
##########################################################################################
|
||||
|
||||
def clean_up(args):
|
||||
|
||||
for i in args.file_listNTD:
|
||||
os.system('mv ' + i + ' ' + args.all_output_folder + args.file_prefix + '/Original/')
|
||||
os.system('mv ' + i.replace('NTD.ORF.fasta','AA.ORF.fasta') + ' ' + args.all_output_folder + args.file_prefix + '/Original/')
|
||||
os.system('mv ' + i.split('named')[0]+'named*allOGCleanresults.tsv ' + args.all_output_folder + args.file_prefix + '/Original/SpreadSheets/')
|
||||
|
||||
os.system('cp ' + args.all_output_folder + args.file_prefix + '/Processed/*ORF.fasta ' + args.all_output_folder + '/ToRename/')
|
||||
os.system('cp ' + args.all_output_folder + args.file_prefix + '/Processed/SpreadSheets/*allOGCleanresults.tsv ' + args.all_output_folder + '/ToRename/')
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###-------------------------------- Next Script Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def next_script():
|
||||
|
||||
print(color.BOLD+'\nNext Script is: '+color.GREEN+'7_FinalizeName.py\n\n'+color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###------------------- Checks Command Line Arguments and Calls Steps ------------------###
|
||||
##########################################################################################
|
||||
|
||||
def main():
|
||||
|
||||
diamond_path = check_diamond_path()
|
||||
|
||||
args = check_args()
|
||||
|
||||
prep_folders(args)
|
||||
|
||||
merge_relevant_data(args)
|
||||
|
||||
OGLenDB = {}
|
||||
for rec in SeqIO.parse(args.hook_fasta, 'fasta'):
|
||||
if rec.id[-10:] not in OGLenDB:
|
||||
OGLenDB.update({ rec.id[-10:] : [] })
|
||||
|
||||
OGLenDB[rec.id[-10:]].append(len(str(rec.seq)))
|
||||
|
||||
for og in OGLenDB:
|
||||
OGLenDB[og] = mean(OGLenDB[og])
|
||||
|
||||
filter_NTD_data(args, OGLenDB)
|
||||
|
||||
clean_up(args)
|
||||
|
||||
next_script()
|
||||
|
||||
main()
|
||||
300
PTL1/Transcriptomes/Scripts/7_FinalizeName.py
Normal file
300
PTL1/Transcriptomes/Scripts/7_FinalizeName.py
Normal file
@ -0,0 +1,300 @@
|
||||
#!/usr/bin/env python3.5
|
||||
|
||||
##__Updated__: 9_29_2023 by Auden Cote-L'Heureux
|
||||
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
|
||||
##__Usage__: python 6_FilterPartials.py --help
|
||||
|
||||
##################################################################################################
|
||||
## This script is intended to rename the outputs of the FilterPartials script ##
|
||||
## to a given 10-character that is used in the Katz lab Phylogenomic Tree building methods ##
|
||||
## ##
|
||||
## Prior to r`ning this script, ensure the following: ##
|
||||
## ##
|
||||
## 1. You have assembled your transcriptome and COPIED the 'assembly' file ##
|
||||
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ##
|
||||
## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py ##
|
||||
## 3. Removed SSU/LSU sequences from your Fasta File ##
|
||||
## 4. Classified your sequences as Strongly Prokaryotic/Eukaryotic or Undetermined ##
|
||||
## 5. Classified the Non-Strongly Prokaryotic sequences into OGs ##
|
||||
## 6. You either know (or have inferred) the genetic code of the organism ##
|
||||
## 7. You have translated the sequences and checked for the data in the RemovePartials folder ##
|
||||
## 8. Partial sequences have been removed from the transcriptomic data sets ##
|
||||
## ##
|
||||
## COMMAND Example Below ##
|
||||
## Extra Notes at Bottom of Script ##
|
||||
## ##
|
||||
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
|
||||
## ##
|
||||
## Next Script(s) to Run: ##
|
||||
## NONE! You're FINISHED! :D ##
|
||||
## ##
|
||||
##################################################################################################
|
||||
|
||||
import argparse, os, sys
|
||||
from argparse import RawTextHelpFormatter,SUPPRESS
|
||||
|
||||
#----------------------- Solely to Make Print Statements Colorful -----------------------#
|
||||
|
||||
class color:
|
||||
PURPLE = '\033[95m'
|
||||
CYAN = '\033[96m'
|
||||
DARKCYAN = '\033[36m'
|
||||
ORANGE = '\033[38;5;214m'
|
||||
BLUE = '\033[94m'
|
||||
GREEN = '\033[92m'
|
||||
YELLOW = '\033[93m'
|
||||
RED = '\033[91m'
|
||||
BOLD = '\033[1m'
|
||||
UNDERLINE = '\033[4m'
|
||||
END = '\033[0m'
|
||||
|
||||
|
||||
#------------------------------- Main Functions of Script --------------------------------#
|
||||
|
||||
###########################################################################################
|
||||
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
|
||||
###########################################################################################
|
||||
|
||||
def check_args():
|
||||
|
||||
parser = argparse.ArgumentParser(description=
|
||||
color.BOLD + '\n\nThis script is intended to '+color.RED+'Rename '+color.END\
|
||||
+color.BOLD+'the core set of '+color.PURPLE+'ORFS\n'+color.END+color.BOLD+'with a valid '\
|
||||
+color.RED+'10-character code'+color.END+color.BOLD+' for use in the KatzLab\nPhylogenomic Pipeline'\
|
||||
+usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter)
|
||||
|
||||
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
|
||||
|
||||
required_arg_group.add_argument('--input_file','-in', action='store',
|
||||
help=color.BOLD+color.GREEN+' One of the Fasta files that is to be renamed\n'+color.END)
|
||||
required_arg_group.add_argument('--name','-n', action='store',
|
||||
help=color.BOLD+color.GREEN+' A valid 10-Character code for updating the data\n'+color.END)
|
||||
|
||||
|
||||
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
|
||||
|
||||
optional_arg_group.add_argument('-author', action='store_true',
|
||||
help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
|
||||
|
||||
if len(sys.argv[1:]) == 0:
|
||||
print (parser.description)
|
||||
print ('\n')
|
||||
sys.exit()
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
quit_eval = return_more_info(args)
|
||||
if quit_eval > 0:
|
||||
print ('\n')
|
||||
sys.exit()
|
||||
|
||||
args.all_output_folder = '/'.join(args.input_file.split('/')[:-2])
|
||||
|
||||
args.file_prefix = args.input_file.split('/')[-1].split('_Filtered.Final')[0]
|
||||
if 'fasta' in args.file_prefix:
|
||||
args.file_prefix = args.name
|
||||
|
||||
args.r2g_aa = args.all_output_folder + '/ReadyToGo/ReadyToGo_AA/'
|
||||
args.r2g_ntd = args.all_output_folder + '/ReadyToGo/ReadyToGo_NTD/'
|
||||
args.r2g_tsv = args.all_output_folder + '/ReadyToGo/ReadyToGo_TSV/'
|
||||
|
||||
|
||||
return args
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###------------------------------- Script Usage Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def usage_msg():
|
||||
return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 7_FinalizeName.py'\
|
||||
' --input_file ../ToRename/Op_me_Xxma_Filtered.Final.AA.ORF.fasta --name Op_me_Xxma'+color.END)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
|
||||
##########################################################################################
|
||||
|
||||
def return_more_info(args):
|
||||
|
||||
valid_args = 0
|
||||
|
||||
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
|
||||
' maurerax@gmail.com\n\n'+color.END)
|
||||
|
||||
if args.author == True:
|
||||
print (author)
|
||||
valid_args += 1
|
||||
|
||||
print(args.input_file)
|
||||
|
||||
if args.input_file.endswith('AA.ORF.fasta'):
|
||||
args.input_NTD = args.input_file.replace('AA.ORF.fasta','NTD.ORF.fasta')
|
||||
args.input_AA = args.input_file
|
||||
# args.input_TSV = ('/').join(args.input_file.split('/')[:-1])+'/SpreadSheets/'+args.input_file.split('/')[-1].replace('AA.ORF.fasta','allOGCleanresults.tsv')
|
||||
args.input_TSV = args.input_file.replace('AA.ORF.fasta','allOGCleanresults.tsv')
|
||||
|
||||
elif args.input_file.endswith('NTD.ORF.fasta'):
|
||||
args.input_NTD = args.input_file
|
||||
args.input_AA = args.input_file.replace('NTD.ORF.fasta','AA.ORF.fasta')
|
||||
# args.input_TSV = ('/').join(args.input_file.split('/')[:-1])+'/SpreadSheets/'+args.input_file.split('/')[-1].replace('NTD.ORF.fasta','allOGCleanresults.tsv')
|
||||
args.input_TSV = args.input_file.replace('AA.ORF.fasta','allOGCleanresults.tsv')
|
||||
print(args.input_TSV)
|
||||
|
||||
if os.path.isfile(args.input_NTD) != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Nucleotide '\
|
||||
'Fasta file ('+color.DARKCYAN+args.input_NTD.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_args += 1
|
||||
|
||||
if os.path.isfile(args.input_AA) != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Protein '\
|
||||
'Fasta file ('+color.DARKCYAN+args.input_AA.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_args += 1
|
||||
|
||||
if os.path.isfile(args.input_TSV) != True:
|
||||
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided TSV '\
|
||||
' file ('+color.DARKCYAN+args.input_TSV.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
||||
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
||||
valid_args += 1
|
||||
|
||||
return valid_args
|
||||
|
||||
###########################################################################################
|
||||
###-------------------- Double Checks Format for 10-Character Code ---------------------###
|
||||
###########################################################################################
|
||||
|
||||
def check_code(args):
|
||||
|
||||
check_name = args.name.split('_')
|
||||
|
||||
if len(args.name) != 10:
|
||||
print (color.BOLD+'\n\nNew Species Prefix is not 10 characters long\n\n')
|
||||
print ('Three examples below:\n'+color.CYAN+'\n\tSr_ci_Cunc\n\n\tOp_me_Hsap\n\n\t'\
|
||||
'Am_ar_Ehis\n\n'+color.END)
|
||||
sys.exit()
|
||||
|
||||
elif args.name.count('_') != 2:
|
||||
print (color.BOLD+'\n\nCheck the format of your Species Prefix!\n\n')
|
||||
print ('Three examples below:\n'+color.CYAN+'\n\tSr_ci_Cunc\n\n\tOp_me_Hsap\n\n\t'\
|
||||
'Am_ar_Ehis\n\n'+color.END)
|
||||
|
||||
sys.exit()
|
||||
|
||||
if len(check_name[0]) == 2 and len(check_name[1]) == 2 and len(check_name[2]) == 4:
|
||||
print (color.BOLD+"\n\nRenaming "+color.ORANGE+args.input_file.split('/')[-1]\
|
||||
.split('_Filtered')[0]+color.END+color.BOLD+"'s files with the following 10-character\n"\
|
||||
"code: "+color.CYAN+args.name+color.END+'\n')
|
||||
else:
|
||||
print (color.BOLD+'\n\nCheck the format of your Species Prefix!\n\n')
|
||||
print ('Three examples below:\n'+color.CYAN+'\n\tSr_ci_Cunc\n\n\tOp_me_Hsap\n\n\t'\
|
||||
'Am_ar_Ehis\n\n'+color.END)
|
||||
sys.exit()
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###------------------------- Creates Folders For Storing Data -------------------------###
|
||||
##########################################################################################
|
||||
|
||||
def prep_folders(args):
|
||||
|
||||
|
||||
if os.path.isdir(args.all_output_folder + '/ReadyToGo/') != True:
|
||||
os.system('mkdir ' + args.all_output_folder + '/ReadyToGo')
|
||||
|
||||
|
||||
if os.path.isdir(args.r2g_ntd) != True:
|
||||
os.system('mkdir ' + args.r2g_ntd)
|
||||
if os.path.isdir(args.r2g_aa) != True:
|
||||
os.system('mkdir ' + args.r2g_aa)
|
||||
if os.path.isdir(args.r2g_tsv) != True:
|
||||
os.system('mkdir ' + args.r2g_tsv)
|
||||
|
||||
if os.path.isdir(args.all_output_folder + '/' + args.file_prefix + '/Renamed') != True:
|
||||
os.system('mkdir ' + args.all_output_folder + '/' + args.file_prefix + '/Renamed')
|
||||
|
||||
###########################################################################################
|
||||
###----------- Renames the NTD and AA CDSs with the Given 10-Character Code ------------###
|
||||
###########################################################################################
|
||||
|
||||
def rename_paralogs(args):
|
||||
|
||||
home_folder = args.all_output_folder + '/' + args.file_prefix + '/Renamed/'
|
||||
|
||||
print (color.BOLD+'\nRenaming Translated (Protein) '+color.PURPLE+'ORFs\n'+color.END)
|
||||
renamed_Final_Prots = open(args.input_AA).read().replace('>','>'+args.name+'_XX_')
|
||||
|
||||
print (color.BOLD+'\nRenaming Nucleotide '+color.PURPLE+'ORFs\n'+color.END)
|
||||
renamed_Final_Nucs = open(args.input_NTD).read().replace('>','>'+args.name+'_XX_')
|
||||
|
||||
|
||||
print (color.BOLD+'\nUpdating CDS Names in the Spreadsheet'+color.END)
|
||||
if '\n\n' in open(args.input_TSV).read():
|
||||
renamed_Final_tsv = args.name+'_XX_'+open(args.input_TSV).read().rstrip('\n')\
|
||||
.replace('\n\n','\n'+args.name+'_XX_')
|
||||
else:
|
||||
renamed_Final_tsv = args.name+'_XX_'+open(args.input_TSV).read().rstrip('\n')\
|
||||
.replace('\n','\n'+args.name+'_XX_')
|
||||
|
||||
with open(home_folder+args.name+'_XX_'+args.input_AA.split('/')[-1],'w+') as w:
|
||||
w.write(renamed_Final_Prots)
|
||||
|
||||
with open(home_folder+args.name+'_XX_'+args.input_NTD.split('/')[-1],'w+') as x:
|
||||
x.write(renamed_Final_Nucs)
|
||||
|
||||
|
||||
with open(home_folder+args.name+'_XX_'+args.input_TSV.split('/')[-1],'w+') as y:
|
||||
y.write(renamed_Final_tsv)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###-------------------- Cleans up the Folder and Moves Final Files --------------------###
|
||||
##########################################################################################
|
||||
def clean_up(args):
|
||||
|
||||
home_folder = args.all_output_folder + '/' + args.file_prefix + '/Renamed/'
|
||||
|
||||
os.system('cp '+home_folder+'*tsv '+args.r2g_tsv)
|
||||
|
||||
os.system('cp '+home_folder+'*_XX_*AA.ORF.fasta '+args.r2g_aa)
|
||||
os.system('cp '+home_folder+'*_XX_*NTD.ORF.fasta '+args.r2g_ntd)
|
||||
|
||||
os.system('cp '+home_folder+'*_XX_*tsv ' + args.all_output_folder + '/' + args.file_prefix)
|
||||
os.system('cp '+home_folder+'*_XX_*AA.ORF.fasta ' + args.all_output_folder + '/' + args.file_prefix)
|
||||
os.system('cp '+home_folder+'*_XX_*NTD.ORF.fasta ' + args.all_output_folder + '/' + args.file_prefix)
|
||||
|
||||
os.system('rm ' + args.all_output_folder + '/ToRename/*'+args.file_prefix+'*')
|
||||
|
||||
if os.path.isdir(args.all_output_folder + '/Finished/') != True:
|
||||
os.system('mkdir ' + args.all_output_folder + '/Finished')
|
||||
|
||||
os.system('mv ' + args.all_output_folder + '/' + args.file_prefix + ' ' + args.all_output_folder + '/Finished')
|
||||
|
||||
###########################################################################################
|
||||
###-------------------------------- Next Script Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def next_script(args):
|
||||
|
||||
print (color.BOLD+'\nThere is no next script!\n\n'+color.END)
|
||||
|
||||
##########################################################################################
|
||||
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
|
||||
##########################################################################################
|
||||
|
||||
def main():
|
||||
|
||||
args = check_args()
|
||||
|
||||
check_code(args)
|
||||
|
||||
prep_folders(args)
|
||||
|
||||
rename_paralogs(args)
|
||||
|
||||
clean_up(args)
|
||||
|
||||
next_script(args)
|
||||
|
||||
main()
|
||||
Loading…
x
Reference in New Issue
Block a user