Add files via upload

This commit is contained in:
Auden Cote-L'Heureux 2023-06-12 13:31:51 -04:00 committed by GitHub
parent f4b7839ea9
commit 6dc90e0d4c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
20 changed files with 37278 additions and 0 deletions

View File

@ -0,0 +1,269 @@
#!/usr/bin/env python3.6
##__Updated__: 01_04_2023
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
##__Usage__: python 1_ContigFiltStats.py
##__Options__: python 1_ContigFiltStats.py --help
##########################################################################################
## This script is intended to remove small transcripts or small contigs below a given ##
## minimum size from a transcriptome assembly. ##
## ##
## Prior to running this script, ensure the following: ##
## 1. You have assembled your transcriptome and COPIED the 'assembly' file ##
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ##
## ##
## COMMAND Example Below ##
## ##
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
## ##
## Next Script(s) to Run: ##
## AutoBactVsEuk.py (removes SSU then Bact) or 2a_removeSSU.py then 2b_removeBact.py ##
## ##
##########################################################################################
import argparse, os, sys
from argparse import RawTextHelpFormatter,SUPPRESS
from Bio import SeqIO
from Bio.SeqUtils import GC
#----------------------------- Colors For Print Statements ------------------------------#
class color:
    """ANSI escape sequences used to colour and style the terminal messages."""

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    # Resets every colour/attribute that was switched on before it.
    END = '\033[0m'

    # Foreground colours.
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    ORANGE = '\033[38;5;214m'  # 256-colour palette entry 214
#------------------------------- Main Functions of Script --------------------------------#
###########################################################################################
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
###########################################################################################
def check_args():
    """Build the command-line parser, parse ``sys.argv`` and validate the result.

    When the script is started without any argument the description (which
    ends with the example from ``usage_msg()``) is printed and the process
    exits. The process also exits when ``return_more_info()`` reports at
    least one problem with the parsed arguments.

    Returns:
        argparse.Namespace: the parsed arguments (``input_file``,
        ``output_file``, ``minLen``, ``maxLen``, ``spades``, ``genbank``,
        ``author``).
    """
    description = (
        color.BOLD+'\nThis script will remove Contigs (and provide a summary of statistics)'
        +'\nfrom your Assembly that are shorter than a given length.'+color.ORANGE
        +'\n\nA good minimum length to start with is 200bp.'+color.END+color.BOLD
        +'\n\nThe minimum length value should be adjusted for your data sets.\n'+color.END
        +usage_msg())
    parser = argparse.ArgumentParser(description=description, usage=SUPPRESS,
        formatter_class=RawTextHelpFormatter)

    required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
    required_arg_group.add_argument('--input_file','-in', action='store',
        help=color.BOLD+color.GREEN+" Fasta file of Protein/Nucleotide sequences\n"+color.END)
    required_arg_group.add_argument('--output_file','-out',
        help=color.BOLD+color.GREEN+" Desired Output Name\n\n"+color.END)
    required_arg_group.add_argument('--minLen','-min', default=200, type=int,
        help=color.BOLD+color.GREEN+" Minimum number of base pairs for contigs\n (default = 200)"+color.END)
    # The help text used to say "Minimum" here although this is the upper bound.
    required_arg_group.add_argument('--maxLen','-max', default=15000, type=int,
        help=color.BOLD+color.GREEN+" Maximum number of base pairs for contigs\n (default = 15000)"+color.END)

    optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
    optional_arg_group.add_argument('--spades','-spades', action='store_true',
        help=color.BOLD+color.GREEN+'rnaSPAdes transcriptome assembly\n'+color.END)
    optional_arg_group.add_argument('--genbank','-gb', action='store_true',
        help=color.BOLD+color.GREEN+'Assembly from Genbank\n (Will include Accession Number in'
        ' contig name)\n'+color.END)
    optional_arg_group.add_argument('-author', action='store_true',
        help=color.BOLD+color.GREEN+' Print author contact information\n'+color.END)

    # No arguments at all: show the description instead of failing later on.
    if not sys.argv[1:]:
        print (parser.description)
        print ('\n')
        sys.exit()

    # The command line is parsed once; the original parsed it a second time
    # after validation, which produced the very same namespace.
    args = parser.parse_args()
    if return_more_info(args) > 0:
        sys.exit()
    return args
###########################################################################################
###------------------------------- Script Usage Message --------------------------------###
###########################################################################################
def usage_msg():
    """Return the coloured "Example usage" line appended to the script description."""
    example_command = (
        ' python 1_ContigFiltStats.py'
        ' --input_file ../Op_me_Xxma_rnaSPAdes_scaffolds_15_05.fasta --output_file '
        'Op_me_Xxma --minLen 200 --spades')
    pieces = [color.BOLD, color.RED, '\n\nExample usage:', color.CYAN, example_command, color.END]
    return ''.join(pieces)
##########################################################################################
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
##########################################################################################
def return_more_info(args):
    """Validate the parsed arguments and print the matching messages.

    Args:
        args: namespace produced by ``check_args()``; ``author``,
            ``input_file`` and ``output_file`` are read.

    Returns:
        int: number of reasons to stop (0 means the run may proceed). The
        author flag counts as a reason, so ``-author`` prints the contact
        details and the caller exits.
    """
    valid_arg = 0
    author_requested = args.author == True

    if author_requested:
        print (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'
            ' maurerax@gmail.com\n\n'+color.END)
        valid_arg += 1

    if args.input_file is None:
        # Previously not flagged at all: the script then crashed later on
        # ``None.split``. Stay quiet when only the author info was requested.
        if not author_requested:
            print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' No input Fasta file'
                ' was provided (--input_file).\n\n'+color.END)
        valid_arg += 1
    else:
        fasta_name = os.path.basename(args.input_file)
        # A bare file name has no directory part; the original passed '' to
        # os.listdir(), which raises. Fall back to the current directory.
        fasta_dir = os.path.dirname(args.input_file) or '.'
        # The directory listing is compared in addition to isfile() so that
        # the exact spelling of the name has to match an entry of the folder.
        if not os.path.isfile(args.input_file) or fasta_name not in os.listdir(fasta_dir):
            print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '
                '('+color.DARKCYAN+fasta_name+color.END+color.BOLD+')\ndoes not'
                ' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
            valid_arg += 1

    if args.output_file is None:
        if not author_requested:
            print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' No output name'
                ' was provided (--output_file).\n\n'+color.END)
        valid_arg += 1

    return valid_arg
###########################################################################################
###--------------------------- Does the Initial Folder Prep ----------------------------###
###########################################################################################
def prep_folders(args):
    """Create the output folders used by the rest of the script.

    Creates ``<output_file>/``, ``<output_file>/OriginalFasta/``,
    ``<output_file>/SizeFiltered/`` and an ``XlaneBleeding/`` folder next to
    ``<output_file>``. Folders that already exist are left untouched.

    Args:
        args: parsed arguments; only ``output_file`` is read.
    """
    taxon_dir = args.output_file
    # For a bare name such as "Op_me_Xxma" there is no parent component; the
    # original then built "/XlaneBleeding/" at the filesystem root. Use the
    # current directory instead.
    parent_dir = os.path.dirname(taxon_dir) or '.'

    wanted = (
        taxon_dir,
        os.path.join(taxon_dir, 'OriginalFasta'),
        os.path.join(taxon_dir, 'SizeFiltered'),
        os.path.join(parent_dir, 'XlaneBleeding'),
    )
    for folder in wanted:
        # os.makedirs instead of a shell "mkdir" string: works for paths that
        # contain spaces or shell metacharacters and is a no-op when present.
        os.makedirs(folder, exist_ok=True)
###########################################################################################
###---------- Renames the Contigs, Writes them out, and Calculates Basic Info ----------###
###########################################################################################
def rename_Transcriptome(args):
    """Size-filter the assembly, rename its contigs and write the results.

    Contigs whose length lies within ``[args.minLen, args.maxLen]`` are
    sorted from longest to shortest and renamed:

    * ``--genbank``: ``<accession>_Contig_<n>_Len<length>`` where the
      accession is the text after the last ``_`` of the record id, cut at
      the first ``.``.
    * ``--spades``: ``Contig_<n>_Len<length>_Cov<coverage>`` where the
      coverage is the third-from-last ``_``-separated field of the header,
      rounded to an integer.
    * otherwise: ``Contig_<n>_Len<length>``.

    Two files are written to ``<output_file>/SizeFiltered/``: the renamed
    FASTA (``<name>.<minLen>bp.fasta``) and a table mapping original to new
    names (``<name>.<minLen>bp.SeqCodes.tsv``).

    Args:
        args: parsed arguments (``input_file``, ``output_file``, ``minLen``,
            ``maxLen``, ``genbank``, ``spades`` are read).
    """
    input_name = args.input_file.split('/')[-1]
    out_prefix = (args.output_file + '/SizeFiltered/'
        + args.output_file.split('/')[-1] + '.' + str(args.minLen))

    # --genbank takes precedence over --spades. The table layout below has to
    # follow the naming scheme actually used; the original chose the columns
    # from args.spades alone and raised IndexError when both flags were given.
    use_genbank = args.genbank == True
    use_spades = args.spades == True and not use_genbank

    print (color.BOLD+'\n\nPrepping '+color.GREEN+input_name+color.END)

    inFasta = [seq_rec for seq_rec in SeqIO.parse(args.input_file,'fasta')
        if args.minLen <= len(seq_rec.seq) <= args.maxLen]
    # Longest first; the sort is stable so equal lengths keep the file order.
    inFasta.sort(key=lambda seq_rec: -len(seq_rec.seq))

    renamed_seqs = []
    seq_code_dict = {}
    for count, seq_rec in enumerate(inFasta, 1):
        sequence = str(seq_rec.seq).upper()
        length_tag = '_Len' + str(len(seq_rec.seq))
        if use_genbank:
            original_name = seq_rec.id
            new_name = seq_rec.id.split('_')[-1].split('.')[0] + '_Contig_' + str(count) + length_tag
            details = [new_name, sequence]
        elif use_spades:
            original_name = seq_rec.description
            header_fields = seq_rec.description.split('_')
            coverage = int(round(float(header_fields[-3])))
            new_name = 'Contig_' + str(count) + length_tag + '_Cov' + str(coverage)
            # The un-rounded coverage (sixth header field) goes into the table.
            details = [new_name, header_fields[5], sequence]
        else:
            original_name = seq_rec.description
            new_name = 'Contig_' + str(count) + length_tag
            details = [new_name, sequence]
        seq_code_dict.setdefault(original_name, []).extend(details)
        renamed_seqs.append('>' + new_name + '\n' + sequence)

    print (color.BOLD+'\n\nThere are '+color.RED+str(len(renamed_seqs))+' contigs > '+str(args.minLen)
        +color.END+color.BOLD+' in '+color.DARKCYAN+input_name+color.END)

    with open(out_prefix + 'bp.fasta','w+') as w:
        for seq in renamed_seqs:
            w.write(seq+'\n')

    with open(out_prefix + 'bp.SeqCodes.tsv','w+') as w:
        if use_spades:
            w.write('Original Name\tNew Name\tSeq Length\tSeq GC\tSeq Coverage\n')
            for k, v in seq_code_dict.items():
                w.write(k+'\t'+v[0]+'\t'+str(len(v[2]))+'\t'+str(GC(v[2]))+'\t'+str(v[1])+'\n')
        else:
            w.write('Original Name\tNew Name\tSeq Length\t Seq GC\n')
            for k, v in seq_code_dict.items():
                w.write(k+'\t'+v[0]+'\t'+str(len(v[1]))+'\t'+str(GC(v[1]))+'\n')
###########################################################################################
###-------------------------- Cleans Up the PostAssembly Folder ------------------------###
###########################################################################################
def clean_up(args):
    """Copy the original and the size-filtered assemblies to their folders.

    * The input FASTA is copied to ``<output_file>/OriginalFasta/`` with
      ``.fasta`` replaced by ``.Original.fasta`` in its name.
    * The size-filtered FASTA is copied to the ``XlaneBleeding/`` folder that
      sits next to ``<output_file>``.

    Args:
        args: parsed arguments (``input_file``, ``output_file``, ``minLen``).

    Raises:
        OSError: if a source file is missing or a copy fails (the shell
            ``cp`` used before only printed a message and carried on).
    """
    # Local import: shutil is not among this file's top-level imports.
    import shutil

    input_name = args.input_file.split('/')[-1]
    original_copy = (args.output_file + '/OriginalFasta/'
        + input_name.replace('.fasta', '.Original.fasta'))
    # shutil instead of a shell "cp" string so that paths containing spaces
    # or shell metacharacters are copied correctly.
    shutil.copy(args.input_file, original_copy)

    filtered_fasta = (args.output_file + '/SizeFiltered/'
        + args.output_file.split('/')[-1] + '.' + str(args.minLen) + 'bp.fasta')
    # A bare output name has no parent component; the original then targeted
    # "/XlaneBleeding/" at the filesystem root.
    xlane_dir = os.path.join(os.path.dirname(args.output_file) or '.', 'XlaneBleeding')
    os.makedirs(xlane_dir, exist_ok=True)
    shutil.copy(filtered_fasta, xlane_dir)
###########################################################################################
###-------------------------------- Next Script Message --------------------------------###
###########################################################################################
def next_script(args):
    """Print the name of the size-filtered FASTA and the script(s) to run next."""
    filtered_name = args.output_file + '.' + str(args.minLen) + 'bp.fasta'
    look_for = ''.join([
        color.BOLD, '\n\nLook for ', color.DARKCYAN, filtered_name,
        color.END, color.BOLD, '\n\n',
    ])
    print (look_for)

    whats_next = ''.join([
        'Next Script is: ', color.GREEN, '2_Auto_rRNA_BvE.py', color.END,
        color.BOLD, '\n(Alternatively', color.GREEN,
        ' 2a_remove_rRNA.py followed by 2b_remove_Bact.py',
        color.END, color.BOLD, ')\n\n', color.END,
    ])
    print (whats_next)
##########################################################################################
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
##########################################################################################
def main():
    """Parse the arguments, then run every stage of the script in order."""
    args = check_args()
    # Each stage takes the parsed arguments and its return value is not used.
    for stage in (prep_folders, rename_Transcriptome, clean_up, next_script):
        stage(args)


main()

View File

@ -0,0 +1,153 @@
#!/usr/bin/python3
__author__ = 'Jean-David Grattepanche'
__version__ = 'ACL fixed sequence naming issue Feb 23, 2022'
__email__ = 'jeandavid.grattepanche@gmail.com'
import sys
import os
import re
import time
import string
import os.path
from Bio import SeqIO
from sys import argv
# Taxon names gathered by merge_files(): one entry per non-hidden file of the
# input folder (the file name cut at ".<minlen>bp").
listtaxa=[]
# Identity threshold passed to vsearch as "--id" in sort_cluster().
toosim = 0.99
# Query coverage threshold passed to vsearch as "--query_cov" in sort_cluster().
seqcoverage = 0.7
def merge_files(folder, minlen, conspecific_names):
    """Merge every FASTA file of ``folder`` into one file and start clustering.

    Every sequence of at least ``minlen`` bases is written to
    ``forclustering.fasta`` in the parent directory of ``folder``, with the
    taxon name (file name cut at ``.<minlen>bp``) and ``_`` prefixed to its
    header. Shorter records are reported on stdout. The taxon names are
    appended to the module-level ``listtaxa`` and ``sort_cluster()`` is then
    called.

    Args:
        folder: directory holding one size-filtered FASTA per taxon.
        minlen: minimum sequence length (string or int).
        conspecific_names: path of the tab-separated conspecific-names file,
            handed on to ``sort_cluster()``.
    """
    parent = '/'.join(folder.split('/')[:-1])
    name_suffix = '.' + str(minlen) + 'bp'
    shortest_allowed = int(minlen)

    print("MERGE following files")
    # "with" guarantees the merged file is closed even if a FASTA file cannot
    # be parsed; the original left the handle open in that case.
    with open(parent + '/forclustering.fasta','w+') as mergefile:
        for taxafile in os.listdir(folder):
            if taxafile[0] == ".":
                continue  # hidden files
            taxon = taxafile.split(name_suffix)[0]
            listtaxa.append(taxon)
            for line2 in SeqIO.parse(folder+'/'+taxafile, 'fasta'):
                if len(line2.seq) >= shortest_allowed:
                    mergefile.write('>' + taxon + '_' + line2.description + '\n' + str(line2.seq) + '\n')
                else:
                    print(line2, " is too short")

    sort_cluster(folder, listtaxa, minlen, conspecific_names)
def sort_cluster(folder, listtaxa, minlen, conspecific_names):
    """Cluster the merged contigs with vsearch and drop likely contaminants.

    Steps:

    1. Load the conspecific-names table (first column: ten-character taxon
       code, second column: name) and the merged FASTA; stop if a sequence's
       ten-character prefix is missing from the table.
    2. Run ``vsearch --cluster_fast`` on ``forclustering.fasta``.
    3. Keep every singleton cluster. In larger clusters the member with the
       highest ``_Cov`` value is the master; a member is kept when the
       master's coverage is less than 10x its own, when it shares the
       master's conspecific name, or when its own coverage is at least 50.
       Everything else is written to ``fastatoremoved.fas`` and logged in
       ``fastatoremoved.uc``.
    4. Call ``splittaxa()`` to write the kept sequences back per taxon.

    Args:
        folder: directory holding the per-taxon FASTA files; all outputs go
            to its parent directory.
        listtaxa: taxon names, passed on to ``splittaxa()``.
        minlen: minimum length used in the file names, passed on as well.
        conspecific_names: path of the tab-separated conspecific-names file.
    """
    parent = '/'.join(folder.split('/')[:-1])
    results_dir = parent + '/clusteringresults_vsearch/'
    merged_fasta = parent + '/forclustering.fasta'
    cluster_output = results_dir + 'results_forclustering.uc'

    def coverage_of(name):
        # Sequence names end in "_Cov<integer>".
        return int(name.split('_Cov')[1])

    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    conspecific_names_dict = {}
    with open(conspecific_names) as names_file:
        for line in names_file:
            if not line.strip():
                continue  # a blank line used to raise IndexError
            columns = line.split('\t')
            conspecific_names_dict[columns[0]] = columns[1].strip()

    print('CREATE a dictionnary of sequences')
    fastadict = {}
    with open(merged_fasta, 'r') as merged:
        for record in SeqIO.parse(merged, 'fasta'):
            if record.id[:10] not in conspecific_names_dict:
                print('\nError in cross-plate contamination assessment: the ten-digit code ' + record.id[:10] + ' is not found in the conspecific names file. Please check that this file is correct and try again.\n')
                sys.exit()
            # Fail before clustering if a header lacks a numeric "_Cov" tag.
            coverage_of(record.description)
            fastadict[record.description] = record.seq

    print("CLUSTER sequences that overlap at least 70%")
    os.system('vsearch --cluster_fast ' + merged_fasta + ' --strand both --query_cov '+str(seqcoverage)+' --id '+str(toosim) +' --uc ' + cluster_output + ' --threads 60' )

    # Read the .uc table once; "C" rows describe clusters, "H" rows the hits.
    with open(cluster_output, 'r') as uc_file:
        uc_rows = [row.replace('\n','').split('\t') for row in uc_file]

    print("CREATE a dictionary with clustering results")
    clustdict = {}
    clustlist = []
    clustline = {}
    i = 0
    j = 0
    # All three outputs are opened once and closed by the "with" block; the
    # original leaked one handle and reopened the .uc log for every removal.
    with open(parent + '/fastatokeep.fas','w+') as out2, \
            open(parent + '/fastatoremoved.fas','w+') as out3, \
            open(parent + '/fastatoremoved.uc','w+') as out4:
        for fields in uc_rows:
            if fields[0] != 'C':
                continue
            if int(fields[2]) < 2:  # keep all unique sequences
                out2.write('>'+fields[8] + '\n' + str(fastadict[fields[8]])+ '\n')
            else:  # clusters with several members are judged below
                clustdict.setdefault(fields[8], [fields[8]])
                clustlist.append(fields[8])

        for fields in uc_rows:
            if fields[0] == 'H':
                # fields[8] is the member, fields[9] the sequence it hit.
                clustdict[fields[9]].append(fields[8])
                whole_row = '\t'.join(fields)
                clustline[fields[8]] = whole_row
                clustline[fields[9]] = whole_row

        print("PARSE the clusters: keep seed sequences (highest coverage) for each cluster")
        for clust in clustlist:
            members = sorted(clustdict[clust], reverse = True, key=coverage_of)
            master = members[0]
            Covmaster = coverage_of(master)
            for clustered in members:
                Covclustered = coverage_of(clustered)
                # "Covmaster < 10 * Covclustered" is the division-free form of
                # "Covmaster / Covclustered < 10": a coverage of 0 no longer
                # raises ZeroDivisionError.
                keep = (Covmaster < 10 * Covclustered
                    or conspecific_names_dict[master[:10]] == conspecific_names_dict[clustered[:10]]
                    or Covclustered >= 50)
                if keep:
                    out2.write('>'+clustered + '\n' + str(fastadict[clustered])+ '\n')
                    i += 1
                else:
                    j += 1
                    out3.write('>'+clustered + '\n' + str(fastadict[clustered])+ '\n')
                    print(clustline[clustered],'\t' , master )
                    out4.write(clustline[clustered]+ '\t' + master + '\n')

    print('there are ', str(i),' sequences kept and ',str(j),' sequences removed')
    splittaxa(folder, listtaxa, minlen)
def splittaxa(folder, listtaxa, minlen):
    """Write the kept sequences back to each taxon's SizeFiltered folder.

    For every taxon the current ``<taxon>.<minlen>bp.fasta`` is renamed to
    ``<taxon>.<minlen>bp.preXPlate.fasta`` and a new file is written holding
    the sequences of ``fastatokeep.fas`` that belong to the taxon, with the
    ``<taxon>_`` prefix removed from their headers. The intermediate files
    are then moved into ``clusteringresults_vsearch/``.

    Args:
        folder: directory holding the per-taxon FASTA files; everything else
            is located relative to its parent directory.
        listtaxa: taxon names.
        minlen: minimum length used in the file names.
    """
    parent = '/'.join(folder.split('/')[:-1])

    # Parse the kept sequences once instead of once per taxon.
    kept_records = [(kept.description, str(kept.seq))
        for kept in SeqIO.parse(parent + '/fastatokeep.fas','fasta')]

    for taxa in listtaxa:
        prefix = taxa + '_'
        tax_sf_path = parent + '/' + taxa + '/SizeFiltered/'
        filtered_fasta = tax_sf_path + taxa + '.' + str(minlen) + 'bp.fasta'
        os.system('mv ' + filtered_fasta + ' ' + tax_sf_path + taxa + '.' + str(minlen) + 'bp.preXPlate.fasta')
        with open(filtered_fasta,'w') as o:
            for description, sequence in kept_records:
                # Match the prefix added by merge_files() rather than any
                # substring, so a taxon name contained in another header is
                # not picked up, and strip that prefix only.
                if description.startswith(prefix):
                    o.write('>' + description[len(prefix):] + '\n' + sequence + '\n')

    for leftover in ('fastatokeep.fas', 'fastatoremoved.fas', 'fastatoremoved.uc', 'forclustering.fasta'):
        os.system('mv ' + parent + '/' + leftover + ' ' + parent + '/clusteringresults_vsearch/')
def main():
    """Read the three positional arguments and start the merge/cluster run.

    Expects exactly: the folder of per-taxon FASTA files, the minimum
    length and the conspecific-names file.
    """
    _script_name, folder, minlen, conspecific_names = argv
    merge_files(folder, minlen, conspecific_names)


main()

View File

@ -0,0 +1,285 @@
#!/usr/bin/env python3.5
##__Updated__: 18_08_2017
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
##__Usage__: python 2a_remove_rDNA.py --help
##########################################################################################
## This script is intended to identify and isolate SSU/LSU sequences ##
## Prior to running this script, ensure the following: ##
## ##
## 1. You have assembled your transcriptome and COPIED the 'assembly' file ##
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ##
## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py ##
## 3. Have the Databases set up correctly (e.g. with BLAST or Diamond) and in their ##
## respective folders! See the manual if you need help ##
## ##
## COMMAND Example Below ##
## ##
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
## ##
## Next Script(s) to Run: ##
## 2b_removeBact.py ##
## ##
##########################################################################################
import argparse, os, sys
from argparse import RawTextHelpFormatter,SUPPRESS
from Bio import SeqIO
#------------------------------ Colors For Print Statements ------------------------------#
class color:
    """ANSI escape sequences used to colour and style the terminal messages."""

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    # Resets every colour/attribute that was switched on before it.
    END = '\033[0m'

    # Foreground colours.
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    ORANGE = '\033[38;5;214m'  # 256-colour palette entry 214
#------------------------------- Main Functions of Script --------------------------------#
###########################################################################################
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
###########################################################################################
def check_args():
    """Build the argument parser, parse the command line and validate it.

    Prints the description and exits when no argument was given; exits as
    well when ``return_more_info()`` reports a problem. Otherwise returns the
    parsed ``argparse.Namespace``.
    """
    def green(text):
        # Every help text is shown in bold green.
        return color.BOLD + color.GREEN + text + color.END

    def heading(text):
        # Argument-group titles are shown in bold orange.
        return color.ORANGE + color.BOLD + text + color.END

    description = ''.join([
        color.BOLD, '\nThis script will remove ', color.RED,
        'rDNA contigs (both SSU and LSU)', color.END,
        color.BOLD, '\nfrom your Assembly using a set of ', color.RED,
        'SSU/LSU rDNAs ', color.END,
        color.BOLD, 'from diverse\n', color.ORANGE,
        'Eukaryotes, Bacteria and Archaea', color.END,
        color.BOLD, '.', color.END, usage_msg(),
    ])
    parser = argparse.ArgumentParser(
        description=description, usage=SUPPRESS, formatter_class=RawTextHelpFormatter)

    required = parser.add_argument_group(heading('Required Options'))
    required.add_argument('--input_file', '-in', action='store',
        help=green("Fasta file of Nucleotide sequences"))
    required.add_argument('--databases', '-d', action='store',
        help=green("Path to databases"))

    optional = parser.add_argument_group(heading('Options'))
    optional.add_argument('--threads', '-t', default='2',
        help=green(' Number of threads to use for BLAST\n (default = 2)\n'))
    optional.add_argument('-author', action='store_true',
        help=green(' Print author contact information\n'))

    if not sys.argv[1:]:
        print (parser.description)
        print ('\n')
        sys.exit()

    parsed = parser.parse_args()
    if return_more_info(parsed) > 0:
        sys.exit()
    return parsed
###########################################################################################
###------------------------------- Script Usage Message --------------------------------###
###########################################################################################
def usage_msg():
    """Return the coloured "Example usage" line appended to the script description."""
    example_command = ' python 2a_remove_rRNA.py --input_file ../Op_me_Xxma_rna.200bp.fasta'
    pieces = [color.BOLD, color.RED, '\n\nExample usage:', color.CYAN, example_command, color.END]
    return ''.join(pieces)
##########################################################################################
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
##########################################################################################
def return_more_info(args):
    """Validate the parsed arguments and print the matching messages.

    Checks that the input FASTA exists and ends with ``bp.fasta`` and that
    the databases folder holds ``db_BvsE/SSULSUdb.nhr``.

    Args:
        args: namespace produced by ``check_args()``; ``author``,
            ``input_file`` and ``databases`` are read.

    Returns:
        int: number of reasons to stop (0 means the run may proceed). The
        author flag counts as a reason, so ``-author`` prints the contact
        details and the caller exits.
    """
    valid_arg = 0
    author_requested = args.author == True

    if author_requested:
        print (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'
            ' maurerax@gmail.com\n\n'+color.END)
        valid_arg += 1

    if args.input_file is None:
        # Previously not flagged: the script crashed later on ``None.split``.
        # Stay quiet when only the author info was requested.
        if not author_requested:
            print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' No input Fasta file'
                ' was provided (--input_file).\n\n'+color.END)
        valid_arg += 1
    else:
        fasta_name = os.path.basename(args.input_file)
        # A bare file name has no directory part; the original passed '' to
        # os.listdir(), which raises. Fall back to the current directory.
        fasta_dir = os.path.dirname(args.input_file) or '.'
        if not os.path.isfile(args.input_file) or fasta_name not in os.listdir(fasta_dir):
            print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '
                '('+color.DARKCYAN+fasta_name+color.END+color.BOLD+')\ndoes not'
                ' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
            valid_arg += 1
        elif not args.input_file.endswith('bp.fasta'):
            print (color.BOLD + '\n\nCheck that you are giving an appropriately Named/Processed'
                'Fasta file(s) to this script\n\nNOTE that this script CURRENTLY expects your'
                ' Fasta files to contain '+color.RED+ '"rna"'+color.END+color.BOLD+' in \nthe Fasta File'
                ' Name and must end with ' + color.RED + '"bp.fasta"\n\n' + color.END)
            valid_arg += 1

    if args.databases is None:
        # The original concatenated None with a string here and died with a
        # TypeError (for instance when only "-author" was given).
        if not author_requested:
            print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' No path to the databases'
                ' was provided (--databases).\n\n'+color.END)
        valid_arg += 1
    elif not os.path.isdir(args.databases + '/db_BvsE'):
        print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '
            +color.ORANGE+'db_BvsE Folder!\n\n'+color.END+color.BOLD+'Ensure that this folder '
            'can be found in the main '+color.ORANGE+'Databases Folder'+color.END+color.BOLD
            +'\n\nThen try once again.')
        valid_arg += 1
    elif not os.path.isfile(args.databases + '/db_BvsE/SSULSUdb.nhr'):
        print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '
            'BLAST+ formatted '+color.ORANGE+'SSU-LSU databases!\n\n'+color.END+color.BOLD+
            'Ensure that they can be found in the '+color.ORANGE+'db_BvsE folder'+color.END+
            color.BOLD+',\nwhich can be found in the main '+color.ORANGE+'Databases Folder'+
            color.END+color.BOLD+'\n\nThen try once again.')
        valid_arg += 1

    return valid_arg
###########################################################################################
###--------------------------- Does the Initial Folder Prep ----------------------------###
###########################################################################################
def prep_folders(args):
    """Create the ``rRNA_Removal`` folder and return the taxon code and its path.

    The folder is created inside the part of ``args.input_file`` that
    precedes ``SizeFiltered``.

    Args:
        args: parsed arguments; only ``input_file`` is read.

    Returns:
        tuple: ``(code, rRNA_folder)`` where ``code`` is the first ten
        characters of the input file name and ``rRNA_folder`` is the folder
        path ending in ``/``.
    """
    code = args.input_file.split('/')[-1][:10]
    home_folder = args.input_file.split('SizeFiltered')[0]
    # An input path that starts with "SizeFiltered" leaves an empty prefix;
    # the original then built "/rRNA_Removal/" at the filesystem root. Use
    # the current directory, which is also where clean_up() looks.
    rRNA_folder = os.path.join(home_folder or '.', 'rRNA_Removal') + '/'
    # os.makedirs instead of a shell "mkdir" string: works for paths with
    # spaces and is a no-op when the folder already exists.
    os.makedirs(rRNA_folder, exist_ok=True)
    return code, rRNA_folder
###########################################################################################
###---------------------- Uses BLAST to identify SSU/LSU Sequences ---------------------###
###########################################################################################
def remove_rDNA(args, rRNA_folder):
    """BLAST the contigs against the SSU/LSU database and split them in two files.

    Contigs with a hit are written to ``<name>_rRNAseqs.fasta``, all others
    to ``<name>_NorRNAseqs.fasta`` (both inside ``rRNA_folder``); ``<name>``
    is the input file name with ``.200bp.fasta`` removed. The raw BLAST table
    is stored as ``<name>_allSSULSUresults.tsv``.

    Args:
        args: parsed arguments (``input_file``, ``databases`` and, when
            present, ``threads`` are read).
        rRNA_folder: output folder path ending in ``/``.

    Returns:
        tuple: ``(with_rDNA, without_rDNA)`` contig counts, both as strings.
    """
    input_name = args.input_file.split('/')[-1]
    out_prefix = rRNA_folder + input_name.split('.200bp.fasta')[0]
    blast_output = out_prefix + '_allSSULSUresults.tsv'

    # Honour --threads (parsed by check_args but ignored so far, the command
    # always used 2); fall back to 2 when the option is absent.
    threads = str(getattr(args, 'threads', None) or 2)
    BLASTN_cmd = ('blastn -query ' + args.input_file + ' -evalue 1e-10 -max_target_seqs 1 -outfmt'
        ' 6 -db ' + args.databases + '/db_BvsE/SSULSUdb -num_threads ' + threads
        + ' -out ' + blast_output)

    print (color.BOLD+'\n\nBLASTing '+color.DARKCYAN+input_name+color.END
        +color.BOLD+ ' against the rDNA database\n\n' + color.END)
    os.system(BLASTN_cmd)

    # First column of the tabular output is the query name. A set makes the
    # membership tests below constant-time, and "with" closes the file.
    with open(blast_output) as blast_table:
        rDNA_Hits = {row.split('\t')[0] for row in blast_table}

    print (color.BOLD+'Binning Sequences from '+color.DARKCYAN+input_name
        +color.END+color.BOLD+'\nas rDNA OR Potentially Protein-Coding\n\n'+color.END)

    no_SSULSU = 0
    with_SSULSU = 0
    with open(out_prefix + '_rRNAseqs.fasta','w+') as HasSSU, \
            open(out_prefix + '_NorRNAseqs.fasta','w+') as NoSSU:
        for seq_rec in SeqIO.parse(args.input_file,'fasta'):
            entry = '>'+seq_rec.description+'\n'+str(seq_rec.seq)+'\n'
            if seq_rec.description in rDNA_Hits:
                HasSSU.write(entry)
                with_SSULSU += 1
            else:
                NoSSU.write(entry)
                no_SSULSU += 1

    return str(with_SSULSU), str(no_SSULSU)
###########################################################################################
###--------------------------- Updates Log of SSU/LSU Removal --------------------------###
###########################################################################################
def update_log(args, with_SSU, no_SSU):
    """Report the rDNA / non-rDNA contig counts and append them to the log.

    The log lives in ``../PostAssembly_Logs/`` and is named after the second
    ``/``-separated component of ``args.input_file`` (cut at ``.fas``).

    Args:
        args: parsed arguments; only ``input_file`` is read.
        with_SSU: number of rDNA contigs, as a string.
        no_SSU: number of remaining contigs, as a string.
    """
    log_dir = '../PostAssembly_Logs/'
    if not os.path.isdir(log_dir):
        os.system('mkdir ../PostAssembly_Logs/')

    # Second path component, i.e. the file name for inputs such as "../x.fasta".
    second_component = args.input_file.split('/')[1]
    summary = ''.join([
        color.BOLD, 'There are ', color.RED, with_SSU, ' rRNA contigs', color.END, color.BOLD,
        ' and ', color.PURPLE, no_SSU, ' Putative Protein-coding contigs', color.END, color.BOLD,
        '\nin ', color.DARKCYAN, second_component, '\n', color.END,
    ])
    print (summary)

    log_path = log_dir + second_component.split('.fas')[0] + '.Log.txt'
    with open(log_path, 'a') as LogFile:
        for label, amount in (('rDNA Contigs', with_SSU), ('Non-rDNA Contigs', no_SSU)):
            LogFile.write(label + '\t' + amount + '\tn/a\tn/a\n')
###########################################################################################
###-------------------------------- Next Script Message --------------------------------###
###########################################################################################
def next_script(args):
    """Print where the rDNA-free FASTA was written and which script comes next.

    The file name mirrors the one built by ``remove_rDNA()`` (input file name
    with ``.200bp.fasta`` removed plus ``_NorRNAseqs.fasta``) and the folder
    is the one ``clean_up()`` copies it to (the part of the input path that
    precedes ``SizeFiltered``).

    Args:
        args: parsed arguments; only ``input_file`` is read.
    """
    # The original took the *second* path component (split('/')[1]); that
    # raised IndexError for a path without "/" and named the wrong file and
    # folder for inputs inside "<taxon>/SizeFiltered/".
    input_name = args.input_file.split('/')[-1]
    output_name = input_name.split('.200bp.fasta')[0] + '_NorRNAseqs.fasta'
    home_folder = args.input_file.split('SizeFiltered')[0] or './'

    print (color.BOLD+'\nLook for '+color.ORANGE+output_name
        +color.END+color.BOLD+'\nin the '+home_folder
        +' Folder\n\n' + color.END)
    print (color.BOLD + 'Next Script is: ' + color.GREEN + '2b_remove_Bact.py\n\n'+ color.END)
###########################################################################################
###-------------------------- Cleans Up the PostAssembly Folder ------------------------###
###########################################################################################
def clean_up(args):
    """Copy the rDNA-free FASTA file(s) out of ``rRNA_Removal/``.

    Every ``*NorRNA*.fasta`` file in ``<home>/rRNA_Removal/`` is copied to
    ``<home>``, where ``<home>`` is the part of ``args.input_file`` that
    precedes ``SizeFiltered``. Nothing happens when no file matches.

    Args:
        args: parsed arguments; only ``input_file`` is read.
    """
    # Local imports: neither module is among this file's top-level imports.
    import glob
    import shutil

    home_folder = args.input_file.split('SizeFiltered')[0]
    # glob + shutil instead of a shell "cp" string so that folders whose names
    # contain spaces or shell metacharacters work; glob.escape keeps characters
    # such as "[" in the folder name from being read as wildcards.
    pattern = os.path.join(glob.escape(home_folder), 'rRNA_Removal', '*NorRNA*.fasta')
    for fasta in sorted(glob.glob(pattern)):
        shutil.copy(fasta, home_folder or '.')
##########################################################################################
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
##########################################################################################
def main():
    """Parse the arguments, remove the rDNA contigs and tidy up."""
    args = check_args()
    # prep_folders() also returns the ten-character taxon code, unused here.
    rRNA_folder = prep_folders(args)[1]
    # The returned contig counts are only needed by update_log(), whose call
    # is currently disabled:
    #update_log(args, with_SSULSU, no_SSULSU)
    remove_rDNA(args, rRNA_folder)
    for final_step in (clean_up, next_script):
        final_step(args)


main()

View File

@ -0,0 +1,410 @@
#!/usr/bin/env python3.5
##__Updated__: 18_08_2017
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
##__Usage__: python 2b_remove_Bact.py --help
##########################################################################################
## This script is intended to categorize contigs as strongly Eukaryotic or Prokaryotic ##
## Prior to running this script, ensure the following: ##
## ##
## 1. You have assembled your transcriptome and COPIED the 'assembly' file ##
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ##
## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py ##
## 3. Have the Databases set up correctly (e.g. with BLAST or Diamond) and in their ##
## respective folders! See the manual if you need help ##
## 4. Run removeSSU.py on your Fasta file ##
## ##
## COMMAND Example Below ##
## ##
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
## ##
## Next Script(s) to Run: ##
## 3_CountOGsDiamond.py ##
## ##
##########################################################################################
import argparse, os, sys
from argparse import RawTextHelpFormatter,SUPPRESS
from distutils import spawn
from Bio import SeqIO
#------------------------------ Colors For Print Statements ------------------------------#
class color:
    """ANSI escape sequences used to colour and style the terminal messages."""

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    # Resets every colour/attribute that was switched on before it.
    END = '\033[0m'

    # Foreground colours.
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    ORANGE = '\033[38;5;214m'  # 256-colour palette entry 214
#------------------------------- Main Functions of Script --------------------------------#
###########################################################################################
###---------------------------- UPDATE DIAMOND PATH BELOW! -----------------------------###
###########################################################################################
## IF Diamond is IN YOUR PATH then no updating is needed...
def check_diamond_path():
    """Return the path of the ``diamond`` executable or exit with instructions.

    The executable is searched on ``PATH`` unless ``diamond_path`` below has
    been edited to hold an explicit location.

    Returns:
        str: path of the Diamond executable.

    Raises:
        SystemExit: when Diamond cannot be found; a message telling the user
            where to set the path is printed first.
    """
    # Local import: shutil.which replaces distutils.spawn.find_executable,
    # because distutils is deprecated and no longer part of Python 3.12+.
    import shutil

    # Set this to e.g. '/path/to/diamond' when Diamond is not on your PATH.
    diamond_path = ''
    if diamond_path == '':
        diamond_path = shutil.which("diamond")

    if diamond_path is None:
        print (color.BOLD + '\n\nPlease open this script and check that you have included'
            +' the PATH to the'+color.BLUE+' "Diamond" '+color.END+color.BOLD+'executable.\n\n'+color.END)
        print (color.BOLD+color.BLUE+'LOOK FOR:\n\n'+color.RED
            +'#------------------------------ UPDATE DIAMOND PATH BELOW! -------------------------------#'
            +color.BLUE+'\n\nThis is somewhere around lines 50 - 80...\n\n'+color.END)
        sys.exit()

    return diamond_path
###########################################################################################
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
###########################################################################################
def check_args():
    """Parse the command line and stop early when the arguments are unusable.

    With no arguments at all the script description is printed and the
    script exits. Otherwise the parsed arguments are handed to
    return_more_info(); any reported problem ends the script.

    Returns:
        argparse.Namespace: parsed ``input_file``, ``databases`` and ``author``.
    """
    summary = (color.BOLD + '\nThis script will categorize Contigs as' + color.ORANGE
               + ' STRONGLY ' + color.END + color.BOLD + color.RED
               + 'Eukaryotic \nOR Prokaryotic' + color.END + color.BOLD
               + ' using a set of Proteins from diverse\n' + color.ORANGE
               + 'Eukaryotes, Bacteria and Archaea' + color.END + color.BOLD + '.'
               + color.END + usage_msg())

    cli = argparse.ArgumentParser(description=summary, usage=SUPPRESS,
                                  formatter_class=RawTextHelpFormatter)

    needed = cli.add_argument_group(color.ORANGE + color.BOLD + 'Required Options' + color.END)
    needed.add_argument(
        '--input_file', '-in', action='store',
        help=color.BOLD + color.GREEN
        + 'Fasta file of Nucleotide sequences (with rRNAs removed)' + color.END)
    needed.add_argument(
        '--databases', '-d', action='store',
        help=color.BOLD + color.GREEN + "Path to databases" + color.END)

    extras = cli.add_argument_group(color.ORANGE + color.BOLD + 'Options' + color.END)
    extras.add_argument(
        '-author', action='store_true',
        help=color.BOLD + color.GREEN + ' Print author contact information\n' + color.END)

    # Called without any argument: show the description instead of failing.
    if not sys.argv[1:]:
        print(cli.description)
        print('\n')
        sys.exit()

    args = cli.parse_args()

    if return_more_info(args) > 0:
        sys.exit()

    return args
###########################################################################################
###------------------------------- Script Usage Message --------------------------------###
###########################################################################################
def usage_msg():
    """Return the coloured example command line shown in the script description.

    The example includes ``--databases`` because the argument checks need the
    databases folder as well as the input Fasta file; '/path/to/Databases' is
    a placeholder for the user's own Databases folder.
    """
    return (color.BOLD + color.RED + '\n\nExample usage:' + color.CYAN
            + ' python 2b_remove_Bact.py --input_file'
            ' ../Op_me_Xxma/Op_me_Xxma_NorRNAseqs.fasta'
            ' --databases /path/to/Databases' + color.END)
##########################################################################################
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
##########################################################################################
def return_more_info(args):
    """Check the parsed arguments and print one message per problem found.

    Args:
        args: argparse.Namespace providing ``author``, ``input_file`` and
            ``databases``.

    Returns:
        int: number of reasons to stop; 0 means the arguments are usable.
    """
    valid_arg = 0

    author = (color.BOLD + color.ORANGE + '\n\n\tQuestions/Comments? Email Xyrus (author) at'
              ' maurerax@gmail.com\n\n' + color.END)

    if args.author:
        # The author flag always ends the run, so skip the remaining checks.
        print(author)
        return valid_arg + 1

    if args.input_file is None:
        print(color.BOLD + color.RED + '\nError:' + color.END + color.BOLD
              + ' No Fasta file was provided (use ' + color.DARKCYAN + '--input_file'
              + color.END + color.BOLD + ').\n\n' + color.END)
        valid_arg += 1
    else:
        fasta_folder, fasta_name = os.path.split(args.input_file)
        missing_fasta = (color.BOLD + color.RED + '\nError:' + color.END + color.BOLD
                         + ' The provided Fasta file (' + color.DARKCYAN + fasta_name
                         + color.END + color.BOLD + ')\ndoes not exist or is incorrectly'
                         ' formatted.\n\nDouble-check then try again!\n\n' + color.END)

        if not os.path.isfile(args.input_file):
            print(missing_fasta)
            valid_arg += 1
        # A bare filename has an empty folder part; list the current folder then.
        elif fasta_name not in os.listdir(fasta_folder or '.'):
            print(missing_fasta)
            valid_arg += 1
        elif not args.input_file.endswith('NorRNAseqs.fasta'):
            print(color.BOLD + '\n\nInvalid Fasta File! Only Fasta Files that were processed'
                  ' with ' + color.GREEN + '2a_remove_rRNA.py ' + color.END + color.BOLD
                  + 'are valid\n\nHowever, to bypass that issue, Fasta Files MUST end with '
                  + color.CYAN + '"NorRNAseqs.fasta"\n\n' + color.END)
            valid_arg += 1

    if args.databases is None:
        print(color.BOLD + color.RED + '\nError:' + color.END + color.BOLD
              + ' No databases folder was provided (use ' + color.DARKCYAN + '--databases'
              + color.END + color.BOLD + ').\n\n' + color.END)
        valid_arg += 1
        return valid_arg

    bve_folder = os.path.join(args.databases, 'db_BvsE')

    if not os.path.isdir(bve_folder):
        print(color.BOLD + color.RED + '\nError:' + color.END + color.BOLD + ' Cannot find the '
              + color.ORANGE + 'db_BvsE Folder!\n\n' + color.END + color.BOLD
              + 'Ensure that this folder can be found in the main ' + color.ORANGE
              + 'Databases Folder' + color.END + color.BOLD
              + '\n\nThen try once again.' + color.END)
        valid_arg += 1

    elif not os.path.isfile(os.path.join(bve_folder, 'eukout.dmnd')):
        print(color.BOLD + color.RED + '\nError:' + color.END + color.BOLD + ' Cannot find the '
              'Diamond formatted ' + color.ORANGE + 'Eukaryotic Protein database!\n\n'
              + color.END + color.BOLD + 'Ensure that it can be found in the ' + color.ORANGE
              + 'db_BvsE folder' + color.END + color.BOLD
              + ',\nwhich can be found in the main ' + color.ORANGE + 'Databases Folder'
              + color.END + color.BOLD + '\n\nThen try once again.' + color.END)
        # Count this as an error too, otherwise the run continues without the database.
        valid_arg += 1

    elif not os.path.isfile(os.path.join(bve_folder, 'micout.dmnd')):
        print(color.BOLD + color.RED + '\nError:' + color.END + color.BOLD + ' Cannot find the '
              'Diamond formatted ' + color.ORANGE + 'Bacterial/Archaeal Protein database!\n\n'
              + color.END + color.BOLD + 'Ensure that it can be found in the ' + color.ORANGE
              + 'db_BvsE folder' + color.END + color.BOLD
              + ',\nwhich can be found in the main ' + color.ORANGE + 'Databases Folder'
              + color.END + color.BOLD + '\n\nThen try once again.' + color.END)
        valid_arg += 1

    return valid_arg
###########################################################################################
###--------------------------- Does the Initial Folder Prep ----------------------------###
###########################################################################################
def prep_folders(args):
    """Create the 'BvE' folder beside the input Fasta file when it is missing.

    Args:
        args: parsed arguments; only ``input_file`` is read.
    """
    BvE_folder = os.path.join(os.path.dirname(args.input_file) or '.', 'BvE')

    # os.makedirs avoids a shell, so folder names containing spaces work.
    os.makedirs(BvE_folder, exist_ok=True)
###########################################################################################
###---------------- Runs Diamond on Bact and Euk small RefSeq Databases ----------------###
###########################################################################################
def ublast_BvE(args, diamond_path):
    """Run DIAMOND blastx against the prokaryotic and eukaryotic databases.

    Results are written to 'allmicresults.tsv' and 'alleukresults.tsv' in the
    'BvE' folder beside the input Fasta file.

    Args:
        args: parsed arguments; ``input_file`` and ``databases`` are read.
        diamond_path: location of the DIAMOND executable.

    Exits with status 1 when DIAMOND reports a failure, because the
    comparison step cannot work without both result tables.
    """
    # Function-scope import keeps this block self-contained.
    import subprocess

    BvE_folder = os.path.join(os.path.dirname(args.input_file) or '.', 'BvE')
    db_folder = os.path.join(args.databases, 'db_BvsE')

    searches = (
        ('PROK', 'micout.dmnd', 'allmicresults.tsv'),
        ('EUK', 'eukout.dmnd', 'alleukresults.tsv'),
    )

    for label, db_name, result_name in searches:
        print(color.BOLD + '\n\n"BLAST"-ing against ' + label + ' database using DIAMOND: '
              + color.DARKCYAN + db_name + color.END + '\n\n')

        # An argument list (no shell) keeps paths with spaces intact.
        diamond_cmd = [
            diamond_path, 'blastx',
            '-q', args.input_file,
            '--max-target-seqs', '1',
            '-d', os.path.join(db_folder, db_name),
            '--evalue', '1e-5',
            '--threads', '60',
            '--outfmt', '6',
            '-o', os.path.join(BvE_folder, result_name),
        ]

        if subprocess.call(diamond_cmd) != 0:
            print(color.BOLD + color.RED + '\nError:' + color.END + color.BOLD
                  + ' DIAMOND failed while searching ' + color.DARKCYAN + db_name
                  + color.END + color.BOLD + '.\n\n' + color.END)
            sys.exit(1)
###########################################################################################
###---------------- Compares Bacterial and Euk Hits for Classification -----------------###
###########################################################################################
def _best_evalue_per_query(tsv_path):
    """Return {query id: lowest e-value} read from a DIAMOND tabular (outfmt 6) file."""
    best = {}
    with open(tsv_path) as hits:
        for line in hits:
            if not line.strip():
                continue
            fields = line.split('\t')
            # The e-value is the second-to-last column of the table.
            query, evalue = fields[0], float(fields[-2])
            if query not in best or evalue < best[query]:
                best[query] = evalue
    return best


def _classify_contig(euk_evalue, prok_evalue):
    """Return the category for one contig from its best e-values ('no hit' when absent)."""
    ### Contigs lacking BOTH a Eukaryotic and a Prokaryotic Hit
    if euk_evalue == 'no hit' and prok_evalue == 'no hit':
        return 'UNDETERMINED'
    ### Contigs with a Eukaryotic but without a Prokaryotic Hit
    if prok_evalue == 'no hit':
        return 'EUKARYOTIC'
    ### Contigs with a Prokaryotic but without a Eukaryotic Hit
    if euk_evalue == 'no hit':
        return 'PROKARYOTIC'
    ### Both hit: compare the e-values (an e-value of 0 cannot be used in a ratio)
    try:
        prok_euk_ratio = prok_evalue / euk_evalue
        euk_prok_ratio = euk_evalue / prok_evalue
    except ZeroDivisionError:
        return 'divide by zero'
    if prok_euk_ratio >= 100:
        return 'EUKARYOTIC'
    if euk_prok_ratio >= 1000:
        return 'PROKARYOTIC'
    return 'UNDETERMINED'


def compare_hits(args):
    """Classify every contig from its best eukaryotic and prokaryotic hits.

    Reads 'alleukresults.tsv' and 'allmicresults.tsv' from the 'BvE' folder,
    writes 'comparisons.txt' plus one Fasta file per category into that
    folder, and writes the combined '...WTA_EPU.fasta' (names suffixed with
    _E, _P or _U) beside the input file. Contigs in the 'divide by zero'
    category are written with the _U suffix.

    Args:
        args: parsed arguments; only ``input_file`` is read.

    Returns:
        tuple[str, str, str]: counts of eukaryotic, prokaryotic and
        undetermined contigs, as strings.
    """
    BvE_folder = os.path.join(os.path.dirname(args.input_file) or '.', 'BvE')

    inFasta = list(SeqIO.parse(args.input_file, 'fasta'))

    euk_best = _best_evalue_per_query(os.path.join(BvE_folder, 'alleukresults.tsv'))
    prok_best = _best_evalue_per_query(os.path.join(BvE_folder, 'allmicresults.tsv'))

    CompDict = {}
    for seq_rec in inFasta:
        # DIAMOND reports the query id (the header up to the first whitespace),
        # so hits are looked up by id while results stay keyed by description.
        euk_evalue = euk_best.get(seq_rec.id, 'no hit')
        prok_evalue = prok_best.get(seq_rec.id, 'no hit')
        CompDict[seq_rec.description] = [euk_evalue, prok_evalue,
                                         _classify_contig(euk_evalue, prok_evalue)]

    with open(os.path.join(BvE_folder, 'comparisons.txt'), 'w') as w:
        for k, v in CompDict.items():
            w.write(k + ':' + ':'.join([str(i) for i in v]) + '\n')

    BvE_output_base = os.path.join(BvE_folder,
                                   os.path.basename(args.input_file).split('.fas')[0])

    ### Gathers the sequences and categorizes them (longest first)
    def longest_first(category):
        return sorted((seq_rec for seq_rec in inFasta
                       if CompDict[seq_rec.description][-1] == category),
                      key=lambda x: -len(x.seq))

    Euk_Fasta = longest_first('EUKARYOTIC')
    Prok_Fasta = longest_first('PROKARYOTIC')
    Und_Fasta = longest_first('UNDETERMINED')
    Zero_Fasta = longest_first('divide by zero')

    outputs = [
        (Euk_Fasta, '.Not_Bact.fasta', '_E'),
        (Prok_Fasta, '.Bact_Hit.fasta', '_P'),
        (Und_Fasta, '.Undetermined.fasta', '_U'),
    ]
    # The divide-by-zero file is only produced when it would not be empty.
    if Zero_Fasta:
        outputs.append((Zero_Fasta, '.DivideByZero.fasta', '_U'))

    ### Writes out all of the categorized sequences
    with open(args.input_file.split('NorRNA')[0] + 'WTA_EPU.fasta', 'w') as epu:
        for records, file_suffix, name_suffix in outputs:
            with open(BvE_output_base + file_suffix, 'w') as out:
                for seq_rec in records:
                    out.write('>' + seq_rec.description + '\n' + str(seq_rec.seq) + '\n')
                    epu.write('>' + seq_rec.description + name_suffix + '\n'
                              + str(seq_rec.seq) + '\n')

    return str(len(Euk_Fasta)), str(len(Prok_Fasta)), str(len(Und_Fasta))
###########################################################################################
###----------------------- Updates Log of Prok vs Euk Comparisons ----------------------###
###########################################################################################
def update_log(args, Euk_Contigs, Prok_Contigs, Und_Contigs):
    """Print the contig counts and append them to the matching log files.

    Args:
        args: parsed arguments; only ``input_file`` is read.
        Euk_Contigs: number of eukaryotic contigs, as a string.
        Prok_Contigs: number of prokaryotic contigs, as a string.
        Und_Contigs: number of undetermined contigs, as a string.
    """
    if not os.path.isdir('../PostAssembly_Logs/'):
        os.system('mkdir ../PostAssembly_Logs/')

    fasta_name = args.input_file.split('/')[-1]

    summary = ''.join([
        color.BOLD, '\n\nThere are ', color.RED, Prok_Contigs,
        ' Strongly Prokaryotic contigs', color.END,
        color.BOLD, ',\n', color.ORANGE, Euk_Contigs,
        ' Strongly Eukaryotic contigs', color.END,
        color.BOLD, ',\nand ', color.PURPLE, Und_Contigs,
        ' Undetermined Contigs\n', color.END,
        color.BOLD, 'in ', fasta_name, color.END,
    ])
    print(summary)

    log_rows = (
        ('Prokaryotic Contigs', Prok_Contigs),
        ('Eukaryotic Contigs', Euk_Contigs),
        ('Undetermined Contigs', Und_Contigs),
    )
    # Log files are matched on the part of the Fasta name before '_No'.
    log_prefix = fasta_name.split('_No')[0]

    # os.curdir is '.', so this lists '../PostAssembly_Logs/'.
    for Logname in os.listdir(os.curdir+'./PostAssembly_Logs/'):
        if not (Logname.startswith(log_prefix) and Logname.endswith('Log.txt')):
            continue
        with open('../PostAssembly_Logs/'+Logname,'a') as Logfilename:
            for label, count in log_rows:
                Logfilename.write(label + '\t' + count + '\tn/a\tn/a\n')
###########################################################################################
###-------------------------------- Next Script Message --------------------------------###
###########################################################################################
def next_script(args):
    """Tell the user where the combined Fasta file is and which script comes next.

    Args:
        args: parsed arguments; only ``input_file`` is read.
    """
    fasta_name = os.path.basename(args.input_file)
    # Name of the folder holding the input file, whatever the shape of the path.
    taxon_folder = os.path.basename(os.path.dirname(os.path.abspath(args.input_file)))

    print(color.BOLD + '\nLook for ' + color.DARKCYAN + fasta_name.split('NorRNA')[0]
          + 'WTA_EPU.fasta' + color.END + color.BOLD + ' in the '
          + taxon_folder + ' Folder\n\n' + color.END)

    print(color.BOLD + 'Next Script is: ' + color.GREEN + '3_CountOGsDiamond.py\n\n' + color.END)
##########################################################################################
###--------------------- Cleans up the Folder and Moves Final Files -------------------###
##########################################################################################
def clean_up(args):
    """Copy the combined Fasta into 'BvE' and move the rRNA-free Fasta files away.

    Every '*WTA_EPU.fasta' file beside the input is copied into the 'BvE'
    folder, and every '*NorRNA*fasta' file is moved into 'rRNA_Removal'.
    Both folders are created when missing.

    Args:
        args: parsed arguments; only ``input_file`` is read.
    """
    # Function-scope imports keep this block self-contained.
    import glob
    import shutil

    home_folder = os.path.dirname(args.input_file) or '.'
    BvE_folder = os.path.join(home_folder, 'BvE')
    rRNA_folder = os.path.join(home_folder, 'rRNA_Removal')
    # Escape the folder so only the file-name part acts as a wildcard pattern.
    search_root = glob.escape(home_folder)

    os.makedirs(BvE_folder, exist_ok=True)
    for fasta in glob.glob(os.path.join(search_root, '*WTA_EPU.fasta')):
        if os.path.isfile(fasta):
            shutil.copy(fasta, BvE_folder)

    os.makedirs(rRNA_folder, exist_ok=True)
    for fasta in glob.glob(os.path.join(search_root, '*NorRNA*fasta')):
        # Give the full target path so an existing file is replaced, as 'mv' does.
        shutil.move(fasta, os.path.join(rRNA_folder, os.path.basename(fasta)))
##########################################################################################
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
##########################################################################################
def main():
    """Run every step of the script in order.

    Locates DIAMOND, checks the arguments, prepares the output folder, runs
    both DIAMOND searches, classifies the contigs, tidies up the files and
    prints the closing message.
    """
    diamond_path = check_diamond_path()

    args = check_args()

    prep_folders(args)

    ublast_BvE(args, diamond_path)

    Euk_Contigs, Prok_Contigs, Und_Contigs = compare_hits(args)

    # The log update is currently disabled.
    #update_log(args, Euk_Contigs, Prok_Contigs, Und_Contigs)

    clean_up(args)

    next_script(args)


# Only run the workflow when this file is executed as a script.
if __name__ == '__main__':
    main()

View File

@ -0,0 +1,372 @@
#!/usr/bin/env python3.5
##__Updated__: 16_10_2017
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
##__Usage__: python 3_CountOGsDiamond.py --help
##########################################################################################
## This script is intended to classify the STRONGLY Eukaryotic and UNDETERMINED/UNKNOWN ##
## contigs into different OGs (e.g. orthologous gene-families) ##
## ##
## For more info about the OGs, check out: OrthoMCL.org ##
## ##
## Prior to running this script, ensure the following: ##
## ##
## 1. You have assembled your transcriptome and COPIED the 'assembly' file ##
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ##
## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py ##
## 3. Removed SSU/LSU sequences from your Fasta File ##
## ##
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
## ##
## Next Script(s) to Run: ##
## 4_InFrameStopFreq.py ##
## ##
##########################################################################################
import argparse, os, sys, re
from argparse import RawTextHelpFormatter,SUPPRESS
from distutils import spawn
from Bio import SeqIO
#------------------------------ Colors For Print Statements ------------------------------#
class color:
    """ANSI escape sequences used to colour and format terminal messages.

    Prefix text with one or more of these codes and finish with ``END``.
    """
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    ORANGE = '\033[38;5;214m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    # Resets every attribute set by the codes above.
    END = '\033[0m'
#------------------------------- Main Functions of Script --------------------------------#
###########################################################################################
###---------------------------- UPDATE DIAMOND PATH BELOW! -----------------------------###
###########################################################################################
## IF Diamond is IN YOUR PATH then no updating is needed...
def check_diamond_path():
    """Return the path of the DIAMOND executable, exiting when none is found.

    Leave ``diamond_path`` empty to look DIAMOND up on the PATH, or set it to
    an explicit location (for example '/path/to/diamond').

    Returns:
        str: location of the DIAMOND executable.

    Calls ``sys.exit()`` after printing instructions when no executable can
    be located.
    """
    # Function-scope import: shutil.which is the standard-library replacement
    # for distutils.spawn.find_executable (distutils is deprecated).
    from shutil import which

    diamond_path = ''

    if diamond_path == '':
        diamond_path = which("diamond")
        #diamond_path = '/path/to/diamond'

    if diamond_path is None:
        # The messages name Diamond (not usearch) so they match the banner
        # the user is told to look for in this script.
        print(color.BOLD + '\n\nPlease open this script and check that you have included'
              + ' the PATH to the' + color.BLUE + ' "Diamond" ' + color.END + color.BOLD
              + 'executable.\n\n' + color.END)
        print(color.BOLD + color.BLUE + 'LOOK FOR:\n\n' + color.RED
              + '#------------------------------ UPDATE DIAMOND PATH BELOW! -------------------------------#'
              + color.BLUE + '\n\nThis is somewhere around lines 50 - 80...\n\n' + color.END)
        sys.exit()

    return diamond_path
###########################################################################################
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
###########################################################################################
def check_args():
    """Parse the command line and stop early when the arguments are unusable.

    With no arguments at all the script description is printed and the
    script exits. Otherwise the parsed arguments are handed to
    return_more_info(); any reported problem ends the script.

    Returns:
        argparse.Namespace: parsed ``input_file``, ``databases``, ``threads``,
        ``evalue`` and ``author``.
    """
    parser = argparse.ArgumentParser(
        description=color.BOLD + '\n\nThis script will categorize Contigs into' + color.ORANGE
        + ' "Homologous" ' + color.END + color.BOLD + 'Gene Families (OGs)\nbased on '
        + color.RED + 'OrthoMCL' + color.END + color.BOLD
        + "'s Gene Family Grouping\n\n\nNotes on this script and " + color.GREEN
        + 'OrthoMCL Families' + color.END + color.BOLD + ' can be found\nat the bottom of '
        + color.GREEN + 'THIS script (3_CountOGsDiamond.py)\n' + color.END + usage_msg(),
        usage=SUPPRESS, formatter_class=RawTextHelpFormatter)

    required_arg_group = parser.add_argument_group(
        color.ORANGE + color.BOLD + 'Required Options' + color.END)

    required_arg_group.add_argument(
        '--input_file', '-in', action='store',
        help=color.BOLD + color.GREEN + 'Fasta file of Nucleotide sequences enriched \nwith'
        ' Eukaryotic protein coding transcripts' + color.END)

    # '-d' matches the short flag of the sibling scripts; '-g' is kept so
    # existing command lines continue to work.
    required_arg_group.add_argument(
        '--databases', '-d', '-g', action='store',
        help=color.BOLD + color.GREEN + "Path to databases" + color.END)

    optional_arg_group = parser.add_argument_group(
        color.ORANGE + color.BOLD + 'Options' + color.END)

    optional_arg_group.add_argument(
        '--threads', '-t', default='2',
        help=color.BOLD + color.GREEN
        + ' Number of threads to use for BLAST\n (default = 2)\n' + color.END)

    optional_arg_group.add_argument(
        '--evalue', '-e', default=1e-5, type=float,
        help=color.BOLD + color.GREEN
        + ' Maximum e-value for OG assignment\n (default = 1e-5)\n' + color.END)

    optional_arg_group.add_argument(
        '-author', action='store_true',
        help=color.BOLD + color.GREEN + ' Prints author contact information\n' + color.END)

    # Called without any argument: show the description instead of failing.
    if len(sys.argv[1:]) == 0:
        print(parser.description)
        print('\n')
        sys.exit()

    args = parser.parse_args()

    quit_eval = return_more_info(args)
    if quit_eval > 0:
        sys.exit()

    return args
###########################################################################################
###------------------------------- Script Usage Message --------------------------------###
###########################################################################################
def usage_msg():
    """Return the coloured example command line shown in the script description.

    The example file ends with 'WTA_EPU.fasta', the only ending accepted by
    the argument checks, and includes ``--databases`` because those checks
    need the databases folder; '/path/to/Databases' is a placeholder for the
    user's own Databases folder.
    """
    return (color.BOLD + color.RED + '\n\nExample usage:' + color.CYAN
            + ' python 3_CountOGsDiamond.py'
            ' --input_file ../Op_me_Xxma/Op_me_Xxma_WTA_EPU.fasta'
            ' --databases /path/to/Databases' + color.END)
##########################################################################################
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
##########################################################################################
def return_more_info(args):
    """Check the parsed arguments and print one message per problem found.

    Args:
        args: argparse.Namespace providing ``author``, ``input_file`` and
            ``databases``.

    Returns:
        int: number of reasons to stop; 0 means the arguments are usable.
    """
    valid_arg = 0

    author = (color.BOLD + color.ORANGE + '\n\n\tQuestions/Comments? Email Xyrus (author) at'
              ' maurerax@gmail.com\n\n' + color.END)

    if args.author:
        # The author flag always ends the run, so skip the remaining checks.
        print(author)
        return valid_arg + 1

    if args.input_file is None:
        print(color.BOLD + color.RED + '\nError:' + color.END + color.BOLD
              + ' No Fasta file was provided (use ' + color.DARKCYAN + '--input_file'
              + color.END + color.BOLD + ').\n\n' + color.END)
        valid_arg += 1
    else:
        fasta_folder, fasta_name = os.path.split(args.input_file)
        missing_fasta = (color.BOLD + color.RED + '\nError:' + color.END + color.BOLD
                         + ' The provided Fasta file (' + color.DARKCYAN + fasta_name
                         + color.END + color.BOLD + ')\ndoes not exist or is incorrectly'
                         ' formatted.\n\nDouble-check then try again!\n\n' + color.END)

        if not os.path.isfile(args.input_file):
            print(missing_fasta)
            valid_arg += 1
        # A bare filename has an empty folder part; list the current folder then.
        elif fasta_name not in os.listdir(fasta_folder or '.'):
            print(missing_fasta)
            valid_arg += 1
        elif not args.input_file.endswith('WTA_EPU.fasta'):
            print(color.BOLD + '\n\nInvalid Fasta File! Only Fasta Files that were processed'
                  ' with ' + color.GREEN + '2b_remove_Bact.py ' + color.END + color.BOLD
                  + 'are valid\n\nHowever, to bypass that issue, Fasta Files MUST end with '
                  + color.CYAN + '"WTA_EPU.fasta"\n\n' + color.END)
            valid_arg += 1

    if args.databases is None:
        print(color.BOLD + color.RED + '\nError:' + color.END + color.BOLD
              + ' No databases folder was provided (use ' + color.DARKCYAN + '--databases'
              + color.END + color.BOLD + ').\n\n' + color.END)
        valid_arg += 1
        return valid_arg

    og_db_folder = os.path.join(args.databases, 'db_OG')

    if not os.path.isdir(og_db_folder):
        print(color.BOLD + color.RED + '\nError:' + color.END + color.BOLD + ' Cannot find the '
              + color.ORANGE + 'db_OG Folder!\n\n' + color.END + color.BOLD
              + 'Ensure that this folder can be found in the main ' + color.ORANGE
              + 'Databases Folder' + color.END + color.BOLD
              + '\n\nThen try once again\n\n.' + color.END)
        valid_arg += 1
    else:
        # Only count the databases once the folder is known to exist.
        ogdb_count = sum(1 for name in os.listdir(og_db_folder) if name.endswith('.dmnd'))

        if ogdb_count == 0:
            print(color.BOLD + color.RED + '\nError:' + color.END + color.BOLD
                  + ' Cannot find the Diamond formatted ' + color.ORANGE
                  + 'Gene Family databases!\n\n' + color.END + color.BOLD
                  + 'Ensure that they can be found in the ' + color.ORANGE + 'db_OG folder'
                  + color.END + color.BOLD + ',\nwhich can be found in the main '
                  + color.ORANGE + 'Databases Folder' + color.END + color.BOLD
                  + '\n\nThen try once again.\n\n' + color.END)
            valid_arg += 1
        elif ogdb_count > 1:
            print('\nMultiple OG databases found. Please only provide 1 database in the db_OG folder.\n')
            valid_arg += 1

    return valid_arg
###########################################################################################
###--------------------------- Does the Initial Folder Prep ----------------------------###
###########################################################################################
def prep_folders(args):
    """Create the 'DiamondOG' folder beside the input Fasta file when it is missing.

    Args:
        args: parsed arguments; only ``input_file`` is read.
    """
    OG_folder = os.path.join(os.path.dirname(args.input_file) or '.', 'DiamondOG')

    # os.makedirs avoids a shell, so folder names containing spaces work.
    os.makedirs(OG_folder, exist_ok=True)
###########################################################################################
###--------------------- Runs Diamond on Split OrthoMCL Databases ----------------------###
###########################################################################################
def OG_diamond(args, diamond_path):
    """Run DIAMOND blastx of the input Fasta against the OG database.

    The first '.dmnd' file found in the 'db_OG' folder is used and the hits
    are written to 'DiamondOG/allOGresults.tsv' beside the input file.

    Args:
        args: parsed arguments; ``input_file``, ``databases``, ``evalue`` and
            ``threads`` are read.
        diamond_path: location of the DIAMOND executable.

    Exits with status 1 when DIAMOND reports a failure, because the later
    steps cannot work without the result table.
    """
    # Function-scope import keeps this block self-contained.
    import subprocess

    print(color.BOLD + '\nStarting to "BLAST" against OG databases' + color.END)

    OG_folder = os.path.join(os.path.dirname(args.input_file) or '.', 'DiamondOG')
    db_folder = os.path.join(args.databases, 'db_OG')

    db = [name for name in os.listdir(db_folder) if name.endswith('.dmnd')][0]

    print(color.BOLD + '\n\n"BLAST"-ing against OG database using DIAMOND: '
          + color.DARKCYAN + db + color.END + '\n\n')

    # An argument list (no shell) keeps paths with spaces intact.
    OG_diamond_cmd = [
        diamond_path, 'blastx',
        '-q', args.input_file,
        '-d', os.path.join(db_folder, db),
        '--evalue', str(args.evalue),
        # Honour the --threads option rather than a fixed thread count.
        '--threads', str(args.threads),
        # NOTE(review): DIAMOND describes --subject-cover as a percentage;
        # confirm that 0.35 (and not 35) is the intended value.
        '--subject-cover', '0.35',
        '--outfmt', '6',
        '-o', os.path.join(OG_folder, 'allOGresults.tsv'),
    ]

    if subprocess.call(OG_diamond_cmd) != 0:
        print(color.BOLD + color.RED + '\nError:' + color.END + color.BOLD
              + ' DIAMOND failed while searching ' + color.DARKCYAN + db
              + color.END + color.BOLD + '.\n\n' + color.END)
        sys.exit(1)
###########################################################################################
###--------------- Keeps the Single BEST Hit (HSP-score) Per Transcript ----------------###
###########################################################################################
def keep_best(args):
    """Keep the highest-scoring hit of every transcript and write a renamed table.

    Reads 'DiamondOG/allOGresults.tsv', keeps for each query the line with
    the largest value in the last column, and writes the kept lines, best
    score first, to '<input>.Renamed_allOGCleanresults.tsv'. In the output
    the first column is the query name followed by '_' and the last two
    '_'-separated parts of the subject name.

    Args:
        args: parsed arguments; only ``input_file`` is read.
    """
    print(color.BOLD + color.PURPLE + '\n\nProcessing OG-database results to keep only the'
          ' BEST match for each transcript\n\n' + color.END)

    OG_folder = os.path.join(os.path.dirname(args.input_file) or '.', 'DiamondOG')

    with open(os.path.join(OG_folder, 'allOGresults.tsv')) as tsv:
        inTSV = [line for line in tsv if line.strip()]

    inTSV.sort(key=lambda x: -float(x.split('\t')[-1]))

    # Exact query names are tracked in a set: a substring test against whole
    # lines is slow and treats a name contained in another line as a repeat.
    seen_queries = set()
    keep = []
    for line in inTSV:
        query = line.split('\t')[0]
        if query not in seen_queries:
            seen_queries.add(query)
            keep.append(line)

    with open(args.input_file.replace('.fasta', '.Renamed_allOGCleanresults.tsv'), 'w') as w:
        for line in keep:
            fields = line.split('\t')
            og_tag = '_'.join(fields[1].split('_')[-2:])
            w.write(fields[0] + '_' + og_tag + '\t' + '\t'.join(fields[1:]))
###########################################################################################
###-------- Copies and Updates Names of Transcripts With OG Hits to New Fasta ----------###
###########################################################################################
def update_fasta(args):
    """Write the sequences with an OG hit under new names and the others unchanged.

    Reads '<input>.Renamed_allOGCleanresults.tsv' and writes two Fasta files:
    '<input>.Renamed.fasta' holds every sequence found in the table, renamed
    with its OG appended, and '<input>.LackOG.fasta' holds the remaining
    sequences under their original names.

    Args:
        args: parsed arguments; only ``input_file`` is read.
    """
    print(color.BOLD + color.PURPLE + 'Updating Fasta File Sequence Names with their BEST'
          ' OG hits\n\n' + color.END)

    Renamed_TSV = args.input_file.replace('.fasta', '.Renamed_allOGCleanresults.tsv')

    with open(Renamed_TSV) as tsv:
        keep = [line for line in tsv if line != '\n']

    keep_dict = {}
    for line in keep:
        fields = line.split('\t')
        try:
            # Lines without a second column or without an OG name raise
            # IndexError here and are skipped.
            og_number = re.split('OG.{1}_', fields[1])[1][:6]
            og_prefix = fields[1].split(og_number)[0][-4:]
            old_name = re.split('_OG.{1}_', fields[0])[0]
            keep_dict[old_name] = old_name + '_' + og_prefix + fields[1].split('_')[-1]
        except IndexError:
            pass

    inFasta = list(SeqIO.parse(args.input_file, 'fasta'))

    updated_seq_name = ['>' + keep_dict[i.description] + '\n' + str(i.seq) + '\n'
                        for i in inFasta if i.description in keep_dict]
    seqs_without_OG = ['>' + i.description + '\n' + str(i.seq) + '\n'
                       for i in inFasta if i.description not in keep_dict]

    with open(args.input_file.replace('.fasta', '.Renamed.fasta'), 'w') as w:
        w.writelines(updated_seq_name)

    with open(args.input_file.replace('.fasta', '.LackOG.fasta'), 'w') as x:
        x.writelines(seqs_without_OG)
###########################################################################################
###-------------------- Updates Log With OG Assignment Information ---------------------###
###########################################################################################
def update_log(args):
    """Print the OG assignment counts and append them to the matching log files.

    Counts are taken from '<input>.Renamed_allOGCleanresults.tsv'. Log files
    in '../PostAssembly_Logs/' are matched on the part of the Fasta file name
    before '_WTA' and must end with 'Log.txt'.

    Args:
        args: parsed arguments; only ``input_file`` is read.
    """
    os.makedirs('../PostAssembly_Logs/', exist_ok=True)

    home_folder = os.path.dirname(args.input_file) or '.'
    fasta_name = os.path.basename(args.input_file)

    Renamed_TSV = os.path.join(
        home_folder, fasta_name.replace('.fasta', '.Renamed_allOGCleanresults.tsv'))

    with open(Renamed_TSV) as tsv:
        # Lines without a second column carry no subject name and are skipped.
        subjects = [line.split('\t')[1] for line in tsv if '\t' in line]

    all_ogs = [subject.split('_')[-1] for subject in subjects
               if len(re.split('_OG.{1}_', subject)) > 1]

    total_with_ogs = str(len(all_ogs))
    unique_ogs = str(len(set(all_ogs)))

    print(color.BOLD + 'There are ' + color.BLUE + total_with_ogs + ' Contigs' + color.END
          + color.BOLD + ' that hit ' + color.DARKCYAN + unique_ogs + ' Unique OGs\n'
          + color.END)

    # The file name is used (not a fixed path position) so the match does not
    # depend on how many folders the given path contains.
    log_prefix = fasta_name.split('_WTA')[0]

    for Logname in os.listdir('../PostAssembly_Logs/'):
        if Logname.startswith(log_prefix) and Logname.endswith('Log.txt'):
            with open('../PostAssembly_Logs/' + Logname, 'a') as LogFile:
                LogFile.write('Contigs With OG\t' + total_with_ogs + '\tn/a\tn/a\n')
                LogFile.write('Unique OGs\t' + unique_ogs + '\tn/a\tn/a\n')
##########################################################################################
###--------------------- Cleans up the Folder and Moves Final Files -------------------###
##########################################################################################
def clean_up(args):
    """Delete the input Fasta file and copy the renamed outputs into 'DiamondOG'.

    The '<input>.Renamed.fasta' and '<input>.Renamed_allOGCleanresults.tsv'
    files are copied; the originals stay beside the (deleted) input file.

    Args:
        args: parsed arguments; only ``input_file`` is read.
    """
    # Function-scope import keeps this block self-contained.
    import shutil

    OG_folder = os.path.join(os.path.dirname(args.input_file) or '.', 'DiamondOG')
    # Make sure the target is a folder so copies land inside it.
    os.makedirs(OG_folder, exist_ok=True)

    if os.path.isfile(args.input_file):
        os.remove(args.input_file)

    for suffix in ('.Renamed.fasta', '.Renamed_allOGCleanresults.tsv'):
        product = args.input_file.replace('.fasta', suffix)
        if os.path.isfile(product):
            shutil.copy(product, OG_folder)
        else:
            print('Warning: ' + product + ' was not found, so it was not copied.')
###########################################################################################
###-------------------------------- Next Script Message --------------------------------###
###########################################################################################
def next_script(args):
    """Tell the user which file to look for and which script comes next.

    The file named is '<input>.Renamed.fasta', the renamed Fasta file this
    script writes beside the input file.

    Args:
        args: parsed arguments; only ``input_file`` is read.
    """
    # Folder of the input file, whatever the shape of the given path.
    home_folder = (os.path.dirname(args.input_file) or '.') + '/'
    renamed_fasta = os.path.basename(args.input_file).replace('.fasta', '.Renamed.fasta')

    print(color.BOLD + '\nLook for ' + color.DARKCYAN + renamed_fasta + color.END
          + color.BOLD + ' in the ' + home_folder + ' Folder\n\n' + color.END)

    print(color.BOLD + 'Next Script is: ' + color.GREEN + '4_InFrameStopFreq.py\n\n' + color.END)
##########################################################################################
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
##########################################################################################
def main():
    """Run every step of the script in order.

    Locates DIAMOND, checks the arguments, prepares the output folder, runs
    the DIAMOND search, keeps the best hit per transcript, renames the
    sequences, tidies up the files and prints the closing message.
    """
    diamond_path = check_diamond_path()

    args = check_args()

    prep_folders(args)

    OG_diamond(args, diamond_path)

    keep_best(args)

    update_fasta(args)

    # The log update is currently disabled.
    #update_log(args)

    clean_up(args)

    next_script(args)


# Only run the workflow when this file is executed as a script.
if __name__ == '__main__':
    main()

View File

@ -0,0 +1,790 @@
#!/usr/bin/env python
##__Updated__: 18_08_2017
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
##__Usage__: python 4_InFrameStopFreq.py --help
##########################################################################################
## This script is intended to aid in identifying the genetic code of the data given ##
## ##
## Prior to running this script, ensure the following: ##
## ##
## 1. You have assembled your transcriptome and COPIED the 'assembly' file ##
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ##
## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py ##
## 3. Removed SSU/LSU sequences from your Fasta File ##
## 4. Classified your sequences as Strongly Prokaryotic/Eukaryotic or Undetermined ##
## 5. Classified the Non-Strongly Prokaryotic sequences into OGs ##
## ##
## COMMAND Example Below ##
## Extra Notes at Bottom of Script ##
## ##
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
## ##
## Next Script(s) to Run: ##
## 5_GCodeTranslate.py ##
## ##
##########################################################################################
import argparse, os, sys
from argparse import RawTextHelpFormatter,SUPPRESS
from distutils import spawn
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Data.CodonTable import CodonTable
#-------------------------- Set-up Codon Tables (Genetic Codes) --------------------------#
# Genetic code in which TAG is the only stop codon: the forward table assigns
# both TAA and TGA to 'Q', and ATG is the only start codon.
tag_table = CodonTable(forward_table={
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Q',
'TGT': 'C', 'TGC': 'C', 'TGA': 'Q', 'TGG': 'W',
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
start_codons = [ 'ATG'],
stop_codons = ['TAG'])
# Genetic code in which TAA is the only stop codon: the forward table assigns
# both TAG and TGA to 'Q', and ATG is the only start codon.
# NOTE(review): presumably named after the organism it was built for - confirm.
c_uncinata_table = CodonTable(forward_table={
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
'TAT': 'Y', 'TAC': 'Y', 'TAG': 'Q',
'TGT': 'C', 'TGC': 'C', 'TGA': 'Q', 'TGG': 'W',
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
start_codons = [ 'ATG'],
stop_codons = ['TAA'])
#------------------------------ Colors For Print Statements ------------------------------#
class color:
    """ANSI escape sequences used to colour and format terminal messages.

    Prefix text with one or more of these codes and finish with ``END``.
    """
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    ORANGE = '\033[38;5;214m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    # Resets every attribute set by the codes above.
    END = '\033[0m'
#------------------------------- Main Functions of Script --------------------------------#
###########################################################################################
###---------------------------- UPDATE DIAMOND PATH BELOW! -----------------------------###
###########################################################################################
## IF Diamond is IN YOUR PATH then no updating is needed...
def check_diamond_path():
    """Return the path of the DIAMOND executable, exiting when none is found.

    Leave ``diamond_path`` empty to look DIAMOND up on the PATH, or set it to
    an explicit location (for example '/path/to/diamond').

    Returns:
        str: location of the DIAMOND executable.

    Calls ``sys.exit()`` after printing instructions when no executable can
    be located.
    """
    # Function-scope import: shutil.which is the standard-library replacement
    # for distutils.spawn.find_executable (distutils is deprecated).
    from shutil import which

    diamond_path = ''

    if diamond_path == '':
        diamond_path = which("diamond")
        #diamond_path = '/path/to/diamond'

    if diamond_path is None:
        print(color.BOLD + '\n\nPlease open this script and check that you have included'
              + ' the PATH to the' + color.BLUE + ' "diamond" ' + color.END + color.BOLD
              + 'executable.\n\n' + color.END)
        print(color.BOLD + color.BLUE + 'LOOK FOR:\n\n' + color.RED
              + '#------------------------------ UPDATE DIAMOND PATH BELOW! -------------------------------#'
              + color.BLUE + '\n\nThis is somewhere around lines 50 - 80...\n\n' + color.END)
        sys.exit()

    return diamond_path
###########################################################################################
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
###########################################################################################
def check_args():
    """Parse and validate the command-line arguments.

    Prints the description and exits when the script is run without any
    argument, and exits when ``return_more_info`` reports at least one
    problem (or an informational flag such as ``-author``).

    Returns:
        argparse.Namespace: with ``input_file``, ``databases`` and ``author``.
    """
    description = (
        color.BOLD+'\n\nThis script is intended to '+color.RED+'AID You '+color.END+color.BOLD
        +'in determining the '+color.RED+'\nLikely Genetic Code'+color.END+color.BOLD+' of a'
        ' given Fasta File of transcripts\n\nInterpretation of the output (StopFreq.tsv) is difficult \nand so '+color.ORANGE
        +'TWO EXAMPLES'+color.END+color.BOLD+' can be found in the '+color.CYAN+'NOTES Section'
        +color.END+color.BOLD+' of\nTHIS Script '+color.GREEN+'(4_InFrameStopFreq.py)\n'+color.END
        +usage_msg())
    parser = argparse.ArgumentParser(description=description, usage=SUPPRESS,
        formatter_class=RawTextHelpFormatter)

    required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
    required_arg_group.add_argument('--input_file','-in', action='store', required=True,
        help=color.BOLD+color.GREEN+'Fasta file of Nucleotide sequences enriched \nwith'\
        ' Eukaryotic protein coding transcripts'+color.END)
    # The databases folder is dereferenced unconditionally during validation
    # and by the Diamond step, so it really is required: let argparse report
    # its absence instead of crashing later on ``None + str``.
    required_arg_group.add_argument('--databases','-d', action='store', required=True,
        help=color.BOLD+color.GREEN+"Path to databases"+color.END)

    optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
    optional_arg_group.add_argument('-author', action='store_true',
        help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)

    # No arguments at all: show the description rather than an argparse error.
    if len(sys.argv[1:]) == 0:
        print (parser.description)
        print ('\n')
        sys.exit()

    args = parser.parse_args()
    quit_eval = return_more_info(args)
    if quit_eval > 0:
        sys.exit()
    return args
###########################################################################################
###------------------------------- Script Usage Message --------------------------------###
###########################################################################################
def usage_msg():
    """Return the coloured "Example usage" text appended to the parser description."""
    example_command = (' python 4_InFrameStopFreq.py'
        ' --input_file ../Op_me_Xxma/Op_me_Xxma_WTA_EPU.Renamed.fasta')
    pieces = [color.BOLD, color.RED, '\n\nExample usage:', color.CYAN, example_command, color.END]
    return ''.join(pieces)
##########################################################################################
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
##########################################################################################
def return_more_info(args):
    """Validate the parsed arguments and print guidance for every problem found.

    Args:
        args: namespace with ``author``, ``input_file`` and ``databases``.

    Returns:
        int: number of flagged conditions (``-author`` requested, unusable
        Fasta file, missing database folder/file).  ``check_args`` exits when
        this is greater than zero.
    """
    valid_arg = 0

    author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
        ' maurerax@gmail.com\n\n'+color.END)

    if args.author == True:
        print (author)
        valid_arg += 1

    if args.input_file is not None:
        fasta_name = args.input_file.split('/')[-1]
        missing_msg = (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
            '('+color.DARKCYAN+fasta_name+color.END+color.BOLD+')\ndoes not'\
            ' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
        if os.path.isfile(args.input_file):
            # A bare file name has no directory part; list the current
            # directory in that case instead of calling os.listdir('').
            fasta_folder = '/'.join(args.input_file.split('/')[:-1]) or '.'
            if fasta_name not in os.listdir(fasta_folder):
                print (missing_msg)
                valid_arg += 1
            elif not args.input_file.endswith('WTA_EPU.Renamed.fasta'):
                # The message names the suffix that is actually enforced above.
                print (color.BOLD+'\n\nInvalid Fasta File! Only Fasta Files that were processed'\
                    ' with '+color.GREEN+'3_CountOGsDiamond.py '+color.END+color.BOLD+'are valid\n\n'\
                    'However, to bypass that issue, Fasta Files MUST end with '+color.CYAN+\
                    '"WTA_EPU.Renamed.fasta"\n\n'+color.END)
                valid_arg += 1
        else:
            print (missing_msg)
            valid_arg += 1

    # A missing --databases value is reported like a missing db_StopFreq
    # folder rather than raising TypeError on ``None + str``.
    if args.databases is None or not os.path.isdir(args.databases + '/db_StopFreq'):
        print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
            +color.ORANGE+'db_StopFreq Folder!\n\n'+color.END+color.BOLD+'Ensure that this folder '\
            'can be found in the main '+color.ORANGE+'Databases Folder'+color.END+color.BOLD\
            +'\n\nThen try once again.\n\n'+color.END)
        valid_arg += 1
    elif not os.path.isfile(args.databases + '/db_StopFreq/RepEukProts.dmnd'):
        print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\
            'Diamond formatted '+color.ORANGE+'Representative Eukaryotic Protein Database!\n\n'+color.END+color.BOLD+\
            'Ensure that it can be found in the '+color.ORANGE+'db_StopFreq folder'+color.END+\
            color.BOLD+',\nwhich can be found in the main '+color.ORANGE+'Databases Folder'+\
            color.END+color.BOLD+'\n\nThen try once again.\n\n'+color.END)
        valid_arg += 1

    return valid_arg
###########################################################################################
###--------------------------- Does the Inital Folder Prep -----------------------------###
###########################################################################################
def prep_folders(args):
    """Create the StopCodonFreq output folders next to the sample folder.

    The sample folder name is taken from the second '/'-separated component
    of ``args.input_file`` (i.e. the path is expected to look like
    ``../<Sample>/<file>.fasta``), and the folders are created relative to
    the current working directory under ``../<Sample>/StopCodonFreq/``.

    Args:
        args: namespace with an ``input_file`` attribute.

    Returns:
        str: ``'../<Sample>/StopCodonFreq/StopCodonFastas/'``.
    """
    sample_folder = args.input_file.split('/')[1]
    Stop_folder = '../' + sample_folder + '/StopCodonFreq/'

    # os.makedirs copes with spaces/shell metacharacters in the path and with
    # folders that already exist, unlike shelling out to ``mkdir``.
    for sub_folder in ('StopCodonFastas', 'SpreadSheets'):
        os.makedirs(Stop_folder + sub_folder, exist_ok=True)

    return Stop_folder + 'StopCodonFastas/'
###########################################################################################
###--------------------- Translates Sequences with Each Stop Codon ---------------------###
###########################################################################################
def _orf_slice(seq_rec, entry, table):
    """Cut the putative ORF out of ``seq_rec`` for one ``prot_dict`` entry.

    ``entry`` starts with a strand flag ('F' or 'RC') followed by the two
    slice coordinates stored by ``prep_translations``.  The sequence from the
    hit start onward (reverse-complemented for 'RC') is translated with
    ``table``; when the translation contains a stop ('*') the ORF runs up to
    and including that stop codon, otherwise the slice between the two stored
    coordinates is returned.

    Returns None when ``entry`` holds no strand flag (a hit whose start and
    end columns were equal), so such hits are skipped instead of raising
    IndexError.
    """
    if not entry:
        return None
    strand = entry[0]
    if strand == 'F':
        downstream = seq_rec.seq[entry[1]:]
    elif strand == 'RC':
        downstream = seq_rec.seq[:entry[1]].reverse_complement()
    else:
        return None

    protein = str(downstream.translate(table=table))
    if '*' in protein:
        return downstream[:(protein.index('*')+1)*3]
    if strand == 'F':
        return seq_rec.seq[entry[1]:entry[2]]
    return seq_rec.seq[entry[2]:entry[1]].reverse_complement()


def _write_orf_fastas(prefix, stop_codon, table, position, prot_dict, labels_by_key):
    """Write the nucleotide and protein ORF Fasta files for one stop codon.

    ``position`` is the (negative) index of this stop codon's ORF inside
    each ``prot_dict`` value; entries with fewer than four items carry no
    ORF and are skipped.  One record is written per OG label of each key.
    """
    print (color.BOLD+'\n\nWriting FASTA files with ORF and Protein sequences with'+color.RED\
        +' '+stop_codon+' '+color.END+color.BOLD+'as only STOP codon\n'+color.END)
    file_tag = stop_codon.lower()
    with open(prefix+'_'+file_tag+'_ORF.fasta','w+') as nuc_out, \
            open(prefix+'_'+file_tag+'_ORF.aa.fasta','w+') as aa_out:
        for key, value in prot_dict.items():
            if len(value) < 4:
                continue
            orf = value[position]
            for label in labels_by_key.get(key, ()):
                header = '>'+key+'_'+label+'\n'
                nuc_out.write(header+str(orf).upper()+'\n')
                aa_out.write(header+str(Seq(str(orf)).translate(table=table)).upper()+'\n')


def prep_translations(args):
    """Extract ORFs under each single-stop-codon hypothesis and write them out.

    Reads ``<input>_allOGCleanresults.tsv`` (``.fasta`` replaced in
    ``args.input_file``) and the input Fasta, derives an ORF for every hit
    three times -- with TAA, TGA and TAG in turn as the only stop codon --
    and writes ``<prefix>_{taa,tga,tag}_ORF.fasta`` plus the matching
    ``.aa.fasta`` translations, where ``<prefix>`` is the input path up to
    the first ``.fas``.

    Args:
        args: namespace with an ``input_file`` attribute.
    """
    print (color.BOLD+'\nIdentifying ORFs in the Fasta file based on the output of 3_CountOGsDiamond.py\n'+color.END)

    with open(args.input_file.replace('.fasta','_allOGCleanresults.tsv')) as tsv_handle:
        intsv = [line for line in tsv_handle.readlines() if line != '\n']
    inFasta = [rec for rec in SeqIO.parse(args.input_file,'fasta')]

    # prot_dict[query] = [strand, coord, coord, (ORF per stop codon)...]
    # Columns 7 and 8 of the TSV are read as the hit's start/end on the
    # query; start < end is treated as forward, start > end as reverse.
    prot_dict = {}
    for line in intsv:
        fields = line.split('\t')
        entry = prot_dict.setdefault(fields[0], [])
        q_start, q_end = int(fields[6]), int(fields[7])
        if q_start < q_end:
            entry.extend(['F', q_start-1, q_end+3])
        elif q_end < q_start:
            entry.extend(['RC', q_start, q_end if (q_end-4) < 5 else q_end-4])

    # One pass per hypothesis, in this order, so that the last three items of
    # every entry are its TAA, TGA and TAG ORFs respectively.
    # Table 6 is Biopython's built-in table used here for "TGA only".
    passes = (('TAA', c_uncinata_table), ('TGA', 6), ('TAG', tag_table))
    for stop_codon, table in passes:
        print (color.BOLD+'\n\nTranslating DNA using'+color.RED+' '+stop_codon+color.END\
            +color.BOLD+' as the sole STOP codon\n'+color.END)
        for key, entry in prot_dict.items():
            for seq_rec in inFasta:
                # NOTE(review): substring match, so a key that is a prefix of
                # another record's name matches both records -- confirm that
                # names are unambiguous before tightening this to equality.
                if key in seq_rec.description:
                    orf = _orf_slice(seq_rec, entry, table)
                    if orf is not None:
                        entry.append(orf)

    #------------ Parsing through data to maintain OG assignments ------------#
    tagged_ids = set()
    for line in intsv:
        fields = line.split('\t')
        tagged = fields[0]+';'+fields[1][-10:]
        if 'no_group' not in tagged.split(';')[1]:
            tagged_ids.add(tagged)
        else:
            tagged_ids.add(tagged.split(';')[0]+';no_group')

    # Index the labels by sequence name once instead of rescanning the whole
    # label list for every sequence in each of the six output files.
    labels_by_key = {}
    for tagged in tagged_ids:
        parts = tagged.split(';')
        labels_by_key.setdefault(parts[0], []).append(parts[1])

    prefix = args.input_file.split('.fas')[0]
    for position, (stop_codon, table) in zip((-3, -2, -1), passes):
        _write_orf_fastas(prefix, stop_codon, table, position, prot_dict, labels_by_key)
###########################################################################################
###---------- Diamonds the Translations Against a SMALL Euk Protein Database ----------###
###########################################################################################
def diamond_ProtDB(args, diamond_path):
    """Run ``diamond blastp`` for each translated ORF file against RepEukProts.

    For the TAG, TGA and TAA translations in turn,
    ``<prefix>_<code>_ORF.aa.fasta`` is searched against
    ``<databases>/db_StopFreq/RepEukProts.dmnd`` and the tabular report is
    written to ``<prefix>_<code>_ORF.RepEukProts.tsv``, where ``<prefix>`` is
    ``args.input_file`` up to the first ``.fas``.

    Args:
        args: namespace with ``input_file`` and ``databases`` attributes.
        diamond_path: path to the ``diamond`` executable.

    Raises:
        OSError: if ``diamond_path`` cannot be executed.
    """
    # Local import: the argument-list form avoids building a shell command by
    # string concatenation, so paths containing spaces are passed intact.
    import subprocess

    executable = os.path.expanduser(diamond_path)
    prefix = args.input_file.split('.fas')[0]
    database = args.databases + '/db_StopFreq/RepEukProts.dmnd'

    for code in ('tag', 'tga', 'taa'):
        # The exit status is ignored, as it was with os.system.
        subprocess.call([
            executable, 'blastp',
            '-q', prefix + '_' + code + '_ORF.aa.fasta',
            '-d', database,
            '--evalue', '1e-5',
            '--max-target-seqs', '1',
            '--threads', '60',
            '--outfmt', '6',
            '-o', prefix + '_' + code + '_ORF.RepEukProts.tsv',
        ])
###########################################################################################
###-------------------- Manages the search for In-Frame Stop Codons --------------------###
###########################################################################################
def _load_hit_coords(tsv_path, legacy_suffix):
    """Map each query ID in a Diamond report to the coordinates of its hits.

    For every non-empty row the 6th- and 5th-from-last columns are appended
    (as ints) to the list stored under the row's first column, with
    ``legacy_suffix`` removed from the ID.  Presumably these columns are
    qstart/qend of Diamond's 12-column tabular output -- TODO confirm.
    """
    coords = {}
    with open(tsv_path) as handle:
        for row in handle.read().split('\n'):
            if row != '':
                fields = row.split('\t')
                coords.setdefault(fields[0].replace(legacy_suffix, ''), []).extend(
                    [int(fields[-6]), int(fields[-5])])
    return coords


def _count_inframe_stops(records, coords):
    """Count in-frame TAG/TGA/TAA codons inside the aligned part of each ORF.

    Only records whose description is a key of ``coords`` are analysed.  The
    first two stored coordinates are used as 1-based protein positions; the
    codons from the first aligned residue up to, but excluding, the last one
    are inspected.

    Returns:
        tuple: (dict of counts keyed by codon, codons inspected, sequences analysed).
    """
    counts = {'TAG': 0, 'TGA': 0, 'TAA': 0}
    codons = 0
    seqs = 0
    for rec in records:
        hit = coords.get(rec.description)
        if hit is None:
            continue
        nucleotides = str(rec.seq).upper()
        for n in range((hit[0]-1)*3, (hit[1]*3)-3, 3):
            codon = nucleotides[n:n+3]
            if codon in counts:
                counts[codon] += 1
            codons += 1
        seqs += 1
    return counts, codons, seqs


def _per_thousand(count, codons):
    """Format ``count`` per 1000 codons with two decimals ('0' when there are no codons)."""
    if codons == 0:
        return '0'
    return "%.2f" % ((float(count)*1000)/float(codons))


def hunt_for_stops(args):
    """Measure in-frame stop codon densities and write ``<prefix>_StopCodonStats.tsv``.

    For each single-stop hypothesis (TGA, TAG, TAA) the ORFs ending in that
    stop codon are read from ``<prefix>_<code>_ORF.fasta`` and, for those with
    a hit in ``<prefix>_<code>_ORF.RepEukProts.tsv``, every in-frame TAG, TGA
    and TAA inside the aligned region is counted.  Counts are kept per
    hypothesis (each row counts its own codons) and summed for the Summary
    row, which is written even when no codon was analysed.

    Sequences with and without OG assignments are both considered.

    Args:
        args: namespace with an ``input_file`` attribute.

    Exits the interpreter if the ORF Fasta files cannot be read.
    """
    prefix = args.input_file.split('.fas')[0]

    #------------------------ Open Fasta Files ------------------------#
    records = {}
    try:
        for label in ('TAG', 'TGA', 'TAA'):
            fasta_path = prefix+'_'+label.lower()+'_ORF.fasta'
            records[label] = [rec for rec in SeqIO.parse(fasta_path,'fasta')
                if str(rec.seq).endswith(label)]
    except (OSError, ValueError):
        print (color.BOLD+color.RED+'\n\nMissing Necessary Inputs: Open Script for Usage'\
            ' Information\n\n'+color.END)
        sys.exit()
    print (color.BOLD+'\n\nGathering Sequence information from FASTA and TSV files\n'+color.END)

    #----------------------- Open BLAST Reports -----------------------#
    # The suffixes are stripped from query IDs before they are used as keys.
    coords = {
        'TAG': _load_hit_coords(prefix+'_tag_ORF.RepEukProts.tsv', '_TAG'),
        'TGA': _load_hit_coords(prefix+'_tga_ORF.RepEukProts.tsv', '_Ciliate'),
        'TAA': _load_hit_coords(prefix+'_taa_ORF.RepEukProts.tsv', '_Chilo'),
    }

    # Number of '>' characters in the input Fasta.
    with open(args.input_file) as fasta_handle:
        total_seq_counts = fasta_handle.read().count('>')

    #-------- Gathering In-frame Stop Codon Density Information --------#
    report_order = ('TGA', 'TAG', 'TAA')
    stats = {}
    for label in report_order:
        print (color.BOLD+'\nCollecting in-frame stop codon information when'+color.RED\
            +' '+label+color.END+color.BOLD+' is the only STOP\n'+color.END)
        stats[label] = _count_inframe_stops(records[label], coords[label])

    tag_inframe = sum(stats[label][0]['TAG'] for label in report_order)
    tga_inframe = sum(stats[label][0]['TGA'] for label in report_order)
    taa_inframe = sum(stats[label][0]['TAA'] for label in report_order)
    total_codons = sum(stats[label][1] for label in report_order)
    total_analyzed = sum(stats[label][2] for label in report_order)

    #-------------- Writing Data Out and Print Statement --------------#
    with open(prefix+'_StopCodonStats.tsv','w+') as w:
        w.write('Stop Codon\tNumber of Seqs Analyzed\tIn-frame TAG\tIn-frame TGA\tIn-frame TAA\tTotal Codons\tIn-frame TAG density\tIn-frame TGA density\tIn-frame TAA density\n')
        for label in report_order:
            counts, codons, seqs = stats[label]
            if codons != 0:
                w.write(label+'\t'+str(seqs)+'\t'+str(counts['TAG'])+'\t'+str(counts['TGA'])+'\t'+str(counts['TAA'])+'\t'+str(codons)\
                    +'\t'+_per_thousand(counts['TAG'], codons)+'\t'+_per_thousand(counts['TGA'], codons)+'\t'\
                    +_per_thousand(counts['TAA'], codons)+'\n')
            else:
                w.write(label+'\t0\t0\t0\t0\t0\t0\t0\t0\n')
        w.write('\n \n')
        w.write('Summary\t'+str(total_analyzed)+'\t'+str(tag_inframe)+'\t'+str(tga_inframe)+'\t'+str(taa_inframe)\
            +'\t'+str(total_codons)+'\t'+_per_thousand(tag_inframe, total_codons)+'\t'+_per_thousand(tga_inframe, total_codons)\
            +'\t'+_per_thousand(taa_inframe, total_codons)+'\n')
        w.write('\nTotal Seqs in Fasta\t'+str(total_seq_counts))
##########################################################################################
###--------------------- Cleans up the Folder and Moves Final Files -------------------###
##########################################################################################
def clean_up(args):
    """Move the intermediate ORF Fastas and Diamond reports into StopCodonFreq/.

    Inside the folder that holds ``args.input_file`` (the current directory
    for a bare file name) this creates ``StopCodonFreq/StopCodonFastas`` and
    ``StopCodonFreq/SpreadSheets`` if needed, then moves
    ``<prefix>_t*_ORF.*fasta`` into the former and ``<prefix>_t*Prots.tsv``
    into the latter, where ``<prefix>`` is the input path up to the first
    ``.fas``.  Files already present at the destination are replaced.

    Args:
        args: namespace with an ``input_file`` attribute.
    """
    # Local imports keep the block self-contained; doing the work in Python
    # rather than via ``mkdir``/``mv`` keeps paths with spaces intact.
    import glob
    import shutil

    # ``or '.'`` stops a bare file name from resolving to '/StopCodonFreq'.
    home_folder = os.path.dirname(args.input_file) or '.'
    fasta_folder = os.path.join(home_folder, 'StopCodonFreq', 'StopCodonFastas')
    sheet_folder = os.path.join(home_folder, 'StopCodonFreq', 'SpreadSheets')
    os.makedirs(fasta_folder, exist_ok=True)
    os.makedirs(sheet_folder, exist_ok=True)

    # Escape the prefix so only the patterns' own '*' act as wildcards.
    prefix = glob.escape(args.input_file.split('.fas')[0])
    moves = (
        (prefix + '_t*_ORF.*fasta', fasta_folder),
        (prefix + '_t*Prots.tsv', sheet_folder),
    )
    for pattern, destination in moves:
        for path in glob.glob(pattern):
            # Naming the target file (not just the folder) lets an existing
            # file from a previous run be replaced, as ``mv`` did.
            shutil.move(path, os.path.join(destination, os.path.basename(path)))
###########################################################################################
###-------------------------------- Next Script Message --------------------------------###
###########################################################################################
def next_script(args):
    """Tell the user where the stats spreadsheet is and which script to run next."""
    path_parts = args.input_file.split('/')
    home_folder = '/'.join(path_parts[:-1])
    stats_name = path_parts[-1].replace('.fasta','_StopCodonStats.tsv')

    where_to_look = ''.join([
        color.BOLD, '\nLook for ', color.DARKCYAN, stats_name, color.END,
        color.BOLD, ' in the ', home_folder, ' Folder\n\n', color.END])
    print (where_to_look)

    what_is_next = ''.join([
        color.BOLD, 'Next Script is: ', color.GREEN, '5_GCodeTranslate.py\n\n', color.END])
    print (what_is_next)
##########################################################################################
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
##########################################################################################
def main():
    """Run the whole in-frame stop codon frequency workflow.

    Order matters: Diamond must be located and the arguments validated first;
    the ORF files written by ``prep_translations`` feed ``diamond_ProtDB``,
    whose reports are read by ``hunt_for_stops`` before ``clean_up`` moves
    the intermediate files away.
    """
    diamond_path = check_diamond_path()
    args = check_args()
    prep_translations(args)
    diamond_ProtDB(args, diamond_path)
    hunt_for_stops(args)
    clean_up(args)
    next_script(args)


# Only run the workflow when executed as a script, not when imported.
if __name__ == '__main__':
    main()
#----------------------------------------- NOTES -----------------------------------------#
#
# This script is designed to HELP you make an informed decision about the genetic code being
# used by your particular organism. Be aware that it will be limited by the quality of the
# data given to it!
#
# You will need:
#
# Diamond, BioPython, AND the output from '3_CountOGsDiamond.py'
#
# If you are not using the Author's database, update the database paths that are checked
# in return_more_info() and passed to Diamond in diamond_ProtDB()
#
# katzlab$ python 4_InFrameStopFreq.py --input_file YourFastaFile.fasta --databases /path/to/Databases
#
#
#------------------------------- Interpretation of Results -------------------------------#
#
# FORMATTED BELOW WITH TEXTWRANGLER...
#
# Example output using CILIATE (TGA) genetic Code (NOTE THE In-Frame Densities):
#
# Stop Codon Number_of_Seqs_Analyzed In-frame TAG In-frame TGA In-frame TAA Total Codons In-frame TAG density In-frame TGA density In-frame TAA density
# TGA 341 14 0 22 113156 1.2 0 0.92
# TAG 424 0 0 34 140085 0 0 0.78
# TAA 205 14 0 0 16714 0.84 0 0
# Summary 970 28 0 56 269955 2.04 0 1.7
#
# VALUES in summary line (OR SUM of Density) that are > 1.5 likely indicate that the STOP
# codon has been reassigned... in the case above, TAG and TAA look like they have been
# reassigned.
#
#
# Example output using UNIVERSAL genetic Code (NOTE THE In-Frame Densities):
#
# Stop Codon Number_of_Seqs_Analyzed In-frame TAG In-frame TGA In-frame TAA Total Codons In-frame TAG density In-frame TGA density In-frame TAA density
# TGA 341 1 0 2 113156 0.2 0 0.05
# TAG 424 0 2 4 140085 0 0 0.08
# TAA 205 1 0 0 16714 0.04 0 0
# Summary 970 2 2 6 269955 0.15 0 0.06
#
# VALUES in summary line (OR SUM of Density) that are < 0.5 likely indicate that the STOP
# codon still acts as STOP... in the case above, TAG, TGA and TAA look like they still behave
# as stop codons.
#
# THIS IS A ROUGH GUIDE FOR INTERPRETING THE RESULTS!!!! BE VERY VERY WARY! NUMBER OF TOTAL
# SEQUENCES AND TOTAL CODONS OBSERVED ARE IMPORTANT (TOO FEW AND ANY INTERPRETATION IS DEVOID
# OF ANY MEANING).

View File

@ -0,0 +1,770 @@
#!/usr/bin/env python3.5
##__Updated__: 20_09_2017
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
##__Usage__: python 5_GCodeTranslate.py --help
##########################################################################################
## This script is intended to aid in identifying the genetic code of the data given ##
## ##
## Prior to running this script, ensure the following: ##
## ##
## 1. You have assembled your transcriptome and COPIED the 'assembly' file ##
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ##
## 2. Removed small sequences (usually sequences < 300bp) with 1_ContigFiltStats.py ##
## 3. Removed SSU/LSU sequences from your Fasta File ##
## 4. Classified your sequences as Strongly Prokaryotic/Eukaryotic or Undetermined ##
## 5. Classified the Non-Strongly Prokaryotic sequences into OGs ##
## 6. You either know (or have inferred) the genetic code of the organism ##
## ##
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
## ##
## Next Script(s) to Run: ##
## 6_FilterPatials.py (in FinalizeTranscripts Folder) ##
## ##
##########################################################################################
import argparse, os, re, sys
from argparse import RawTextHelpFormatter,SUPPRESS
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Data.CodonTable import CodonTable
#-------------------------- Set-up Codon Tables (Genetic Codes) --------------------------#
# Codon table in which TGA is translated as tryptophan (W); TAA and TAG are
# absent from the forward table and are the stop codons.
blepharisma_table = CodonTable(forward_table={
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
'TAT': 'Y', 'TAC': 'Y',
'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
start_codons = [ 'ATG'],
stop_codons = ['TAA','TAG'])
# Codon table in which all three standard stops are sense codons: TAA and TAG
# are translated as glutamine (Q) and TGA as tryptophan (W).  stop_codons
# holds only an empty string, so no real codon is listed as a stop.
condylostoma_table = CodonTable(forward_table={
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Q', 'TAG': 'Q',
'TGT': 'C', 'TGC': 'C', 'TGA': 'W', 'TGG': 'W',
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
start_codons = [ 'ATG'],
stop_codons = [''])
# Codon table in which TAA is the only stop codon: TAG and TGA are both
# translated as glutamine (Q).
c_uncinata_table = CodonTable(forward_table={
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
'TAT': 'Y', 'TAC': 'Y', 'TAG': 'Q',
'TGT': 'C', 'TGC': 'C', 'TGA': 'Q', 'TGG': 'W',
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
start_codons = [ 'ATG'],
stop_codons = ['TAA'])
# Codon table in which TGA is translated as cysteine (C); TAA and TAG are
# absent from the forward table and are the stop codons.
euplotes_table = CodonTable(forward_table={
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
'TAT': 'Y', 'TAC': 'Y',
'TGT': 'C', 'TGC': 'C', 'TGA': 'C', 'TGG': 'W',
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
start_codons = [ 'ATG'],
stop_codons = ['TAA','TAG'])
# Codon table in which TAA and TAG are translated as tyrosine (Y); TGA is
# absent from the forward table and is the only stop codon.
myrionecta_table = CodonTable(forward_table={
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Y', 'TAG': 'Y',
'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
start_codons = [ 'ATG'],
stop_codons = ['TGA'])
# Codon table with no stop codon: TAA, TAG and TGA are all translated as 'X'
# so they remain visible in the protein sequence.  stop_codons holds only an
# empty string, so no real codon is listed as a stop.
no_stop_table = CodonTable(forward_table={
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'X', 'TAG': 'X',
'TGT': 'C', 'TGC': 'C', 'TGA': 'X', 'TGG': 'W',
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
start_codons = [ 'ATG'],
stop_codons = [''])
# Codon table in which TAA and TAG are translated as glutamate (E); TGA is
# absent from the forward table and is the only stop codon.
peritrich_table = CodonTable(forward_table={
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'E', 'TAG': 'E',
'TGT': 'C', 'TGC': 'C', 'TGG': 'W',
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
start_codons = [ 'ATG'],
stop_codons = ['TGA'])
# Codon table in which TAG is the only stop codon: TAA and TGA are both
# translated as glutamine (Q).
tag_table = CodonTable(forward_table={
'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L',
'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S',
'TAT': 'Y', 'TAC': 'Y', 'TAA': 'Q',
'TGT': 'C', 'TGC': 'C', 'TGA': 'Q', 'TGG': 'W',
'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L',
'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P',
'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q',
'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R',
'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M',
'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T',
'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K',
'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R',
'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V',
'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A',
'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E',
'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'},
start_codons = [ 'ATG'],
stop_codons = ['TAG'])
#------------------------------ Colors For Print Statements ------------------------------#
class color:
    """ANSI escape sequences used to colorize the script's terminal messages."""
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    ORANGE = '\033[38;5;214m'
    # This entry used to be a second 'PURPLE' assignment, which silently
    # replaced the real purple code above with the blue one and left the
    # class without a BLUE attribute.
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    # Resets every colour/attribute set by the codes above.
    END = '\033[0m'
#------------------------------- Main Functions of Script --------------------------------#
###########################################################################################
###------------------------- Checks the Command Line Arguments -------------------------###
###########################################################################################
def check_args():
    """Parse the command line, validate it and derive all output paths.

    Prints the description and exits when called without arguments, exits
    when ``return_more_info`` reports a problem (or only an informational
    flag was requested), and aborts with an argparse error when no input
    file was supplied.

    Returns:
        The argparse namespace, extended with the derived attributes
        ``ntd_out``, ``aa_out``, ``tsv_out``, ``tsv_file``, ``home_folder``,
        ``Diamond_Folder``, ``StopFreq`` and ``all_output_folder``.
    """
    description = (
        color.BOLD + '\n\nThis script will '+color.RED+'Translate '+color.END+color.BOLD+'a '
        'given Fasta file of CDS\nsequences using a given'+color.PURPLE+' Genetic Code.'+color.END+
        color.BOLD+usage_msg())
    parser = argparse.ArgumentParser(description=description, usage=SUPPRESS,
        formatter_class=RawTextHelpFormatter)

    required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
    required_arg_group.add_argument('--input_file','-in', action='store',
        help=color.BOLD+color.GREEN+' Fasta file with CDSs\n'+color.END)

    optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
    optional_arg_group.add_argument('--genetic_code','-g', action='store', default='universal',
        help=color.BOLD+color.GREEN+' Genetic code to use for translation\n (default = '
        '"universal")\n'+color.END)
    optional_arg_group.add_argument('--no_RP','-no_RP', action='store_true',
        help=color.BOLD+color.GREEN+' Allows files to "skip" the removal\n of Partial Transcripts\n'
        +color.END)
    optional_arg_group.add_argument('--list_codes','-codes', action='store_true',
        help=color.BOLD+color.GREEN+' Lists supported genetic codes\n'+color.END)
    optional_arg_group.add_argument('-author', action='store_true',
        help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)

    if len(sys.argv[1:]) == 0:
        print (parser.description)
        print ('\n')
        sys.exit()

    args = parser.parse_args()

    # Every folder below is derived by splitting the input path on '/'.
    # With a bare file name or a one-level relative path those splits yield
    # '' or '/', i.e. the file-system root. Working on the absolute path
    # keeps the same split logic but always leaves real parent folders.
    if args.input_file is not None:
        args.input_file = os.path.abspath(args.input_file)

    quit_eval = return_more_info(args)
    if quit_eval > 0:
        sys.exit()

    # --input_file cannot be marked "required" in argparse because -author
    # and --list_codes are valid on their own; enforce it here instead of
    # failing later with an AttributeError on None.
    if args.input_file is None:
        parser.error('--input_file/-in is required')

    ### Adding in names to 'arg' class for more easy use throughout the script
    stem = args.input_file.split('.fas')[0]
    code_label = args.genetic_code.title()
    args.ntd_out = stem+'_'+code_label+'_NTD.ORF.fasta'
    args.aa_out = stem+'_'+code_label+'_AA.ORF.fasta'
    args.tsv_out = stem+'_'+code_label+'_allOGCleanresults.tsv'
    args.home_folder = '/'.join(args.input_file.split('/')[:-1])
    args.Diamond_Folder = args.home_folder+'/DiamondOG'
    args.StopFreq = args.home_folder+'/StopCodonFreq'
    # Parent of the home folder; always ends with '/'.
    args.all_output_folder = '/'.join(args.input_file.split('/')[:-2]) + '/'
    args.tsv_file = stem+ '_allOGCleanresults.tsv'
    return args
###########################################################################################
###------------------------------- Script Usage Message --------------------------------###
###########################################################################################
def usage_msg():
    """Return the colorized example command shown in the script description."""
    # The script name now matches the one printed by standardize_gcode()
    # ('5_GCodeTranslate.py'); the example previously said '5g_GCodeTranslate.py'.
    return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 5_GCodeTranslate.py'
        ' --input_file ../Stentor_coeruleus.WGS.CDS.Prep/Stentor_coeruleus.WGS.CDS.Renamed.fasta'
        ' --genetic_code Universal'+color.END)
##########################################################################################
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
##########################################################################################
def return_more_info(args):
    """Validate the parsed arguments and print any requested information.

    Args:
        args: argparse namespace providing ``genetic_code``, ``list_codes``,
            ``author`` and ``input_file``.

    Returns:
        Number of conditions that should stop the script (unsupported
        genetic code, informational flags, missing or mis-named input
        file). ``0`` means the run may continue.
    """
    valid_arg = 0

    # 'myrionecta' is accepted because standardize_gcode() already maps it
    # to the same table as 'mesodinium'.
    supported_gcodes_names = ['bleph','blepharisma','chilo','chilodonella','condy',
        'condylostoma','none','eup','euplotes','peritrich','vorticella','ciliate','universal',
        'taa','tag','tga','mesodinium','myrionecta']

    supported_gcodes_list = ['Blepharisma\t(TGA = W)','Chilodonella\t(TAG/TGA = Q)','Ciliate\t\t(TAR = Q)',
        'Condylostoma\t(TAR = Q, TGA = W)','Euplotes\t(TGA = C)','Peritrich\t(TAR = E)','None\t\t(TGA/TAG/TAA = X)',
        'Universal\t(TGA/TAG/TAA = STOP)','TAA\t\t(TAG/TGA = Q)', 'TAG\t\t(TRA = Q)', 'TGA\t\t(TAR = Q)']

    author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'
        ' maurerax@gmail.com\n\n'+color.END)

    if args.genetic_code is not None and args.genetic_code.lower() not in supported_gcodes_names:
        print (color.BOLD+color.RED+'\nProvided genetic code is currently unsupported.\n\n'
            'If you have a new genetic code, please contact the author (with some evidence).\n\n'
            'Otherwise, use one of the currently supported genetic codes.\n'+color.END)
        print (color.BOLD+color.ORANGE+'\n'.join(supported_gcodes_list)+'\n\n'+color.END)
        print (author)
        valid_arg += 1
        return valid_arg

    if args.list_codes:
        print (color.BOLD+color.RED+'\nThese are the currently supported genetic codes.\n'+color.END)
        print (color.BOLD+color.ORANGE+'\n'.join(supported_gcodes_list)+'\n\n'+color.END)
        valid_arg += 1

    if args.author:
        print (author)
        valid_arg += 1

    if args.input_file is not None:
        # os.path.isfile() is the only existence test needed. The former
        # extra os.listdir() lookup could never disagree with it, and it
        # raised FileNotFoundError for a bare file name (directory part '').
        if not os.path.isfile(args.input_file):
            print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '
                '('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'
                ' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
            valid_arg += 1
        elif not args.input_file.endswith('WTA_EPU.Renamed.fasta'):
            print (color.BOLD+'\n\nInvalid Fasta File! Only Fasta Files that were processed'
                ' with '+color.GREEN+'3_CountOGsDiamond.py '+color.END+color.BOLD+'are valid\n\n'
                'However, to bypass that issue, Fasta Files MUST end with '+color.CYAN+
                '"WTA_EPU.Renamed.fasta"\n\n'+color.END)
            valid_arg += 1

    return valid_arg
###########################################################################################
###-------------------------- Does the Initial Folder Prep -----------------------------###
###########################################################################################
def prep_folders(args):
    """Create the folders this script writes into, if they are missing.

    Args:
        args: namespace providing ``input_file`` (path of the input Fasta)
            and ``all_output_folder`` (folder path ending with '/').

    Creates ``DiamondOG`` next to the input file and
    ``TranslatedTranscriptomes`` inside ``all_output_folder``.
    """
    # os.makedirs replaces shell 'mkdir' strings: paths containing spaces or
    # shell metacharacters are handled correctly and existing folders are
    # simply left alone.
    og_folder = os.path.join(os.path.dirname(args.input_file), 'DiamondOG')
    os.makedirs(og_folder, exist_ok=True)
    os.makedirs(args.all_output_folder + 'TranslatedTranscriptomes', exist_ok=True)
##########################################################################################
###---------------- Scans 5-Prime End of Transcript for In-Frame "ATG" ----------------###
##########################################################################################
def check_new_start_new(some_seq, low_lim, upper_lim, old_start, codon_table):
    """Look upstream of an alignment for a better in-frame start codon.

    Args:
        some_seq: transcript sequence (string) in coding orientation.
        low_lim: first position of the upstream window to scan.
        upper_lim: end (exclusive) of the upstream window to scan.
        old_start: start position supported by the alignment.
        codon_table: table passed to ``Seq.translate`` for the first scan.

    Returns:
        The position to use as ORF start: an upstream 'M' when one is found
        with no in-frame stop between it and ``old_start``, otherwise
        ``old_start``.
    """
    ## First pass: translate the upstream window and locate the stops ('*',
    ## rewritten as 'x') and methionines in it.
    window_aa = str(Seq(some_seq[low_lim:upper_lim]).translate(table=codon_table)).replace('*','x')
    stop_idx = [pos for pos, residue in enumerate(window_aa) if residue == 'x']
    met_idx = [pos for pos, residue in enumerate(window_aa) if residue == 'M']

    ## The last 'M' is only a candidate when no stop lies after it.
    if met_idx and (not stop_idx or stop_idx[-1] < met_idx[-1]):
        candidate = low_lim + met_idx[-1]*3
    else:
        candidate = old_start

    ## Nothing better found: no double-check needed.
    if candidate == old_start:
        return old_start

    ## Second pass: re-translate only the stretch between the candidate and
    ## the alignment-supported start.
    ## NOTE(review): this pass uses table=1 rather than codon_table -- confirm
    ## that is intended.
    gap_aa = str(Seq(some_seq[candidate:old_start]).translate(table=1)).replace('*','x')
    gap_stops = [pos for pos, residue in enumerate(gap_aa) if residue == 'x']
    gap_mets = [pos for pos, residue in enumerate(gap_aa) if residue == 'M']

    if not (gap_mets and gap_stops):
        return candidate
    if gap_stops[-1] < gap_mets[-1]:
        return candidate + gap_mets[-1]*3
    return old_start
##########################################################################################
###--------------- Extracts the ORF from the Fasta File and SpreadSheet ---------------###
##########################################################################################
def extract_ORF(prot_dict, codon_table, args):
    """Cut the ORF out of every transcript and translate it.

    Args:
        prot_dict: mapping of transcript name to the list built by
            ``prep_translations``: ``[orientation, start, end, aa_span,
            transcript]``.
        codon_table: table passed to ``Seq.translate``.
        args: namespace providing ``genetic_code`` (used in a message).

    Returns:
        The same ``prot_dict``; every list gains two trailing items, the
        nucleotide ORF and its translation (trailing '*' removed).

    Side effects:
        Writes ``UnexpexctedShortStuffBlameXyrus.txt`` in the current
        directory when some ORFs are shorter than their aligned region.
    """
    print (color.BOLD+'\n\nExtracting '+color.PURPLE+'ORFs'+color.END+color.BOLD+' from'
        ' the transcriptomic data-set\n\n'+color.END)

    for k, v in prot_dict.items():
        ## Attempting to find the most-likely START (ATG) position in the transcript (tricky)
        ## Skips this if the initial Methionine (ATG) is likely present
        ## (e.g. the alignment position of the protein = '1')
        prot_start = int(v[3].split('..')[0])
        old_start = v[1]
        if prot_start != 1:
            min_dist, max_dist = round_down_three(prot_start)
            min_start = old_start - min_dist
            max_start = old_start - max_dist
            # Windows that would run off the 5' end are clamped; the modulo
            # keeps the scan in the same reading frame.
            if min_start < 0:
                min_start = old_start
            if max_start < 0:
                max_start = min_start % 3
            updated_start = check_new_start_new(v[-1], max_start, min_start, old_start, codon_table)
        else:
            updated_start = old_start

        temp = v[-1][updated_start:]
        ## Uses the given genetic code to identify the stop position of the ORF
        temp_prot = str(Seq(temp).translate(table=codon_table))
        if '*' in temp_prot:
            stop_pos = (temp_prot.index('*') + 1)*3
        else:
            # No stop codon: keep everything up to the alignment end. The
            # length must be measured from updated_start (where 'temp'
            # begins); measuring it from the old start truncated the 3' end
            # whenever the start had been moved upstream.
            stop_pos = v[2] - updated_start
        v.append(temp[:stop_pos])

    ## Awkward_list is populated with unexpectedly SHORT ORFs!
    ## Reasons for being short include:
    # An error Xyrus introduced
    # Not as great genetic code decision (in-frame stop)
    # Crummy sequence/assembly quality (false in-frame stop codons)
    awkward_list = [k for k, v in prot_dict.items()
        if len(v[-1]) < len(v[-2][v[1]:v[2]]) - 1]

    if awkward_list:
        with open('UnexpexctedShortStuffBlameXyrus.txt','w+') as x:
            for entry in awkward_list:
                x.write(entry+'\n')

    print (color.BOLD+'\n\nTranslating '+color.PURPLE+'ORFs'+color.END+color.BOLD+' from'
        ' using the '+color.DARKCYAN+args.genetic_code.title()+' genetic code'+color.END)

    for v in prot_dict.values():
        v.append(str(Seq(v[-1]).translate(table=codon_table)).rstrip('*'))

    return prot_dict
##########################################################################################
###------------ Grabs the Coding Coordinates from the OG-BLAST SpreadSheet ------------###
##########################################################################################
def prep_translations(args):
    """Collect alignment coordinates and oriented sequences per transcript.

    Args:
        args: namespace providing ``input_file`` (Fasta), ``tsv_file``
            (OG-assignment spreadsheet) and ``genetic_code``.

    Returns:
        Dict mapping each transcript name found in the spreadsheet to
        ``[orientation, start, end, aa_span, sequence]`` where orientation
        is ``'F'`` or ``'RC'``, start/end are nucleotide positions in the
        coding orientation, ``aa_span`` is ``'<from>..<to>'`` and sequence
        is reverse-complemented for ``'RC'`` entries. The sequence is only
        present for names that also occur in the Fasta file.
    """
    print (color.BOLD+'\n\nGrabbing useful info from the '+color.ORANGE+args.input_file
        .split('/')[-1]+color.END+color.BOLD+' Fasta File\nand from the '+color.ORANGE+args.tsv_file
        .split('/')[-1]+color.END+color.BOLD+' OG-Assignment Spreadsheet'+color.END)

    # The last column of every row is dropped before any field is read, so
    # the negative indexes used below are relative to the trimmed row.
    with open(args.tsv_file) as tsv_handle:
        inTSV = ['\t'.join(line.rstrip('\n').split('\t')[:-1]) for line in tsv_handle if line != '\n']

    # ORF identification step here, uses the 'allOGCleanresults.tsv file to identify the ORF
    # The former special case for the 'none'/'condylostoma' genetic codes ran
    # exactly the same statements as the general case, so a single code path
    # now serves every genetic code.
    prot_dict = {}
    for row in inTSV:
        fields = row.split('\t')
        entry = prot_dict.setdefault(fields[0], [])
        q_start, q_end = int(fields[6]), int(fields[7])
        ## Implied Amino Acid alignment positions (e.g. does the alignment start at the 1st Methionine?)
        aa_span = '..'.join(fields[-4:-2])
        if q_start < q_end:
            ## Coding strand: initial Start and Stop positions from the alignment
            entry.extend(['F', q_start - 1, q_end + 3, aa_span])
        elif q_end < q_start:
            ## Template strand: positions converted to the "correct" orientation
            ## using the length that follows the first '_Len' in the row
            contig_len = int(row.split('_Len')[1].split('_')[0])
            entry.extend(['RC', contig_len - q_start, contig_len - q_end + 1, aa_span])

    ## Makes sure that the dictionary has the transcript in the correct orientation
    for record in SeqIO.parse(args.input_file,'fasta'):
        if record.description in prot_dict:
            entry = prot_dict[record.description]
            if 'RC' == entry[0]:
                entry.append(str(record.seq.reverse_complement()))
            else:
                entry.append(str(record.seq))

    return prot_dict
##########################################################################################
###------------------------ Rounds Down Values to Nearest "3" -------------------------###
##########################################################################################
def round_down_three(num):
    """Return (1.5 * num, 6 * num), each truncated and rounded down to a multiple of 3."""
    near = int(num*3*.5)
    far = int(num*6)
    # x // 3 * 3 drops the remainder, same as x - x % 3.
    return near // 3 * 3, far // 3 * 3
##########################################################################################
###--------------------- Makes Translation Steps (Later) Easier -----------------------###
##########################################################################################
def standardize_gcode(given_code):
    """Map a lower-cased genetic-code name to the table used for translation.

    Returns either an integer table id (1 or 6) or one of the custom codon
    tables defined in this file. Unknown names print a warning and fall
    back to table 1.
    """
    if given_code in ('ciliate', 'tga'):
        return 6
    if given_code in ('chilodonella', 'chilo', 'taa'):
        return c_uncinata_table
    if given_code in ('blepharisma', 'bleph'):
        return blepharisma_table
    if given_code in ('euplotes', 'eup'):
        return euplotes_table
    if given_code in ('myrionecta', 'mesodinium'):
        return myrionecta_table
    if given_code in ('peritrich', 'vorticella'):
        return peritrich_table
    if given_code == 'none':
        return no_stop_table
    if given_code in ('condylostoma', 'condy'):
        return condylostoma_table
    if given_code == 'tag':
        return tag_table
    if given_code == 'universal':
        return 1

    # Anything else: warn and use the universal code.
    print (color.BOLD+color.RED+'\n\nNo valid genetic code provided!\n\n'+color.END+
        color.BOLD+'Using the "Universal" genetic code (by default)\n\nPlease check that the'
        ' code you wish to use is supported:'+color.CYAN+'\n\npython 5_GCodeTranslate.py'
        ' -list_codes\n\n'+color.END)
    return 1
###########################################################################################
###------------------ Updates Spreadsheet with Updated Contig Names --------------------###
###########################################################################################
def update_spreadsheet(args, updated_spreadsheet_dict):
    """Write a copy of the OG spreadsheet restricted to the kept transcripts.

    Args:
        args: namespace providing ``home_folder``, ``tsv_file`` (input
            spreadsheet) and ``tsv_out`` (output spreadsheet).
        updated_spreadsheet_dict: mapping of original transcript name to
            the name to write in the first column.

    Rows whose first column is not a key of the mapping are dropped. The
    output rows are joined with '\\n' and no trailing newline is written.
    """
    # The original call passed the bare folder path to os.system (the
    # 'mkdir' command was missing), so the folder was never created.
    os.makedirs(os.path.join(args.home_folder, 'DiamondOG'), exist_ok=True)

    updatedTSV = []
    with open(args.tsv_file) as tsv_handle:
        for line in tsv_handle:
            # The membership test runs on the raw first column, before the
            # newline is stripped, exactly as before.
            if line == '\n' or line.split('\t')[0] not in updated_spreadsheet_dict:
                continue
            fields = line.rstrip('\n').split('\t')
            updatedTSV.append(updated_spreadsheet_dict[fields[0]]+'\t'+'\t'.join(fields[1:]))

    with open(args.tsv_out,'w+') as w:
        w.write('\n'.join(updatedTSV))
###########################################################################################
###-------------------- Updates Log With OG Assignment Information ---------------------###
###########################################################################################
def update_log(filename, codon_table):
    """Append ORF counts and length statistics to the matching log file.

    Args:
        filename: path of the input Fasta; its third '/'-separated component
            (up to '_WTA') is used to pick the log file.
        codon_table: genetic-code label; its title-cased string form is part
            of the ORF file names that are read.

    NOTE(review): main() currently has the call to this function commented
    out, and the '_ORF.fasta' / '_ORF.aa.fasta' names read here differ from
    the '_NTD.ORF.fasta' / '_AA.ORF.fasta' names set up in check_args() --
    confirm the expected file names before re-enabling it.
    """
    # The body used an undefined name 'gcode' (NameError on every call); the
    # second parameter is what carries the genetic-code label.
    gcode = str(codon_table)
    log_dir = '../PostAssembly_Logs/'
    os.makedirs(log_dir, exist_ok=True)

    orf_prefix = filename.split('.fas')[0]+'_'+gcode.title()
    ntd_ORF = [i for i in SeqIO.parse(orf_prefix+'_ORF.fasta','fasta')]
    aa_ORF = [i for i in SeqIO.parse(orf_prefix+'_ORF.aa.fasta','fasta')]

    ntd_lengths = [len(i.seq) for i in ntd_ORF]
    aa_lengths = [len(i.seq) for i in aa_ORF]
    min_ntd_ORF = str(min(ntd_lengths))
    max_ntd_ORF = str(max(ntd_lengths))
    avg_ntd_ORF = '%.2f' % (sum(ntd_lengths)/float(len(ntd_ORF)))
    min_aa_ORF = str(min(aa_lengths))
    max_aa_ORF = str(max(aa_lengths))
    avg_aa_ORF = '%.2f' % (sum(aa_lengths)/float(len(aa_ORF)))

    # The listing path used to be spelled os.curdir+'./PostAssembly_Logs/',
    # which evaluates to this same '../PostAssembly_Logs/' string.
    log_tag = filename.split('/')[2].split('_WTA')[0]
    for Logname in os.listdir(log_dir):
        if Logname.startswith(log_tag) and Logname.endswith('Log.txt'):
            with open(log_dir+Logname,'a') as LogFile:
                LogFile.write('Nucleotide ORFs\t'+str(len(ntd_ORF))+'\tn/a\tn/a\n')
                LogFile.write('Nucleotide ORF Lengths\t'+avg_ntd_ORF+'\t'+min_ntd_ORF+'\t'+max_ntd_ORF+'\n')
                LogFile.write('Protein ORFs\t'+str(len(aa_ORF))+'\tn/a\tn/a\n')
                LogFile.write('Protein ORF Lengths\t'+avg_aa_ORF+'\t'+min_aa_ORF+'\t'+max_aa_ORF+'\n')
##########################################################################################
###----------------------- Write File with Provided Genetic Code ----------------------###
##########################################################################################
def write_data_out(prot_dict, codon_table, args):
    """Write the nucleotide and protein ORF Fasta files.

    Args:
        prot_dict: mapping of transcript name to its data list; the last two
            items are the nucleotide ORF and the translated ORF.
        codon_table: unused here.
        args: namespace providing ``ntd_out``, ``aa_out`` and
            ``genetic_code``.

    Returns:
        Dict mapping every transcript name to the name written in the Fasta
        headers. Names are passed through unchanged (renaming with the
        updated ORF length is disabled).
    """
    update_spreadsheet_dict = {name: name for name in prot_dict}
    code_label = args.genetic_code.title()

    # (output path, label used in the message, index of the sequence in the list)
    targets = (
        (args.ntd_out, 'ORF', -2),
        (args.aa_out, 'Translated ORF', -1),
    )
    for out_path, label, seq_idx in targets:
        with open(out_path,'w+') as w:
            print (color.BOLD+'\n\nWriting FASTA file with '+color.PURPLE+label+color.END+color.BOLD
                +' sequences using the '+color.DARKCYAN+code_label+' genetic code'+color.END)
            for k, v in prot_dict.items():
                w.write('>'+update_spreadsheet_dict[k]+'\n'+str(v[seq_idx])+'\n')

    return update_spreadsheet_dict
##########################################################################################
###--------------------- Cleans up the Folder and Moves Final Files -------------------###
##########################################################################################
def clean_up(args):
    """Move inputs into their storage folders and copy outputs onward.

    Args:
        args: namespace providing ``input_file``, ``tsv_file``, ``ntd_out``,
            ``aa_out``, ``tsv_out``, ``home_folder``, ``Diamond_Folder``,
            ``StopFreq``, ``all_output_folder`` (ends with '/') and
            ``no_RP``.

    The stop-codon statistics file (when present), the input spreadsheet and
    the input Fasta are moved; the three outputs are copied to
    ``all_output_folder`` (or its ``ToRename/`` sub-folder when ``no_RP`` is
    set); finally the home folder is moved into ``TranslatedTranscriptomes``.
    """
    import subprocess

    def _run(*command):
        # Argument-list invocation: no shell is involved, so paths with
        # spaces or shell metacharacters are passed through intact. As with
        # the os.system calls this replaces, a failing command does not
        # raise.
        subprocess.call(list(command))

    stats_file = args.input_file.split('.fas')[0] + '_StopCodonStats.tsv'
    if os.path.isfile(stats_file):
        _run('mv', stats_file, args.StopFreq)

    _run('mv', args.tsv_file, args.Diamond_Folder)
    _run('mv', args.input_file, args.Diamond_Folder)

    if args.no_RP == True:
        destination = args.all_output_folder + 'ToRename/'
        os.makedirs(destination, exist_ok=True)
    else:
        destination = args.all_output_folder
    for produced in (args.ntd_out, args.aa_out, args.tsv_out):
        _run('cp', produced, destination)

    _run('mv', args.home_folder, args.all_output_folder + 'TranslatedTranscriptomes')
###########################################################################################
###-------------------------------- Next Script Message --------------------------------###
###########################################################################################
def next_script(args):
    """Print where the outputs are and which script should be run next.

    Args:
        args: namespace providing ``ntd_out``, ``aa_out``, ``tsv_out``,
            ``home_folder`` and ``no_RP``.
    """
    ntd_name = args.ntd_out.split('/')[-1]
    aa_name = args.aa_out.split('/')[-1]
    tsv_name = args.tsv_out.split('/')[-1]
    folder_name = args.home_folder.split('/')[-1]

    print (''.join([
        color.BOLD, '\n\nLook for ', color.DARKCYAN, ntd_name, color.END,
        color.BOLD, ',\n', color.DARKCYAN, aa_name, color.END, color.BOLD, ', and\n',
        color.DARKCYAN, tsv_name, color.END, color.BOLD, ',\nwhich are in the ',
        color.ORANGE, folder_name, ' Folder', color.END,
    ]))

    if args.no_RP != True:
        # Default route: partial transcripts still have to be filtered.
        print(''.join([
            color.BOLD, '\n\nNext Script is: ', color.GREEN, '6_FilterPartials.py', color.END, color.BOLD,
            ' in the ', color.PURPLE, 'FinalizeTranscripts Folder', color.END, color.BOLD,
            '\nwith a copy of the outputs of this script!\n\n', color.END,
        ]))
        return

    # --no_RP route: the partial-removal step is skipped.
    print(''.join([
        color.BOLD, '\n\nNext Script is: ', color.GREEN, '7_FinalRename.py', color.END, color.BOLD,
        ' in the ', color.PURPLE, 'RemovePartials Folder', color.END, color.BOLD,
        '\nwith a copy of the outputs of this script!', color.END,
    ]))
    print(''.join([
        color.BOLD, '\n\nRemember that you have chosen ', color.RED, 'NOT ', color.END, color.BOLD,
        'to remove partials\nand are skipping to the renaming step!\n\n', color.END,
    ]))
##########################################################################################
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
##########################################################################################
def main():
    """Run the translation workflow from argument parsing to the final message."""
    args = check_args()
    prep_folders(args)
    codon_table = standardize_gcode(args.genetic_code.lower())
    prot_dict_Prepped = prep_translations(args)
    prot_dict_Final = extract_ORF(prot_dict_Prepped, codon_table, args)
    new_spreadsheet_names = write_data_out(prot_dict_Final, codon_table, args)
    update_spreadsheet(args, new_spreadsheet_names)
    # Log update is currently disabled:
    # update_log(fasta_file, gcode)
    clean_up(args)
    next_script(args)


# Only run the workflow when executed as a script, so the functions above can
# be imported without side effects.
if __name__ == '__main__':
    main()

View File

@ -0,0 +1,652 @@
#!/usr/bin/env python3.5
##__Updated__: 2020-11-29
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com; xyrus.maurer-alcala@izb.unibe.ch
##__Usage__: python 6_FilterPartials.py --help
##################################################################################################
## This script is intended to remove incomplete transcripts that have a more complete mate ##
## ##
## Prior to running this script, ensure the following: ##
## ##
## 1. You have assembled your transcriptome and COPIED the 'assembly' file ##
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ##
## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py ##
## 3. Removed SSU/LSU sequences from your Fasta File ##
## 4. Classified your sequences as Strongly Prokaryotic/Eukaryotic or Undetermined ##
## 5. Classified the Non-Strongly Prokaryotic sequences into OGs ##
## 6. You either know (or have inferred) the genetic code of the organism ##
## 7. You have translated the sequences and checked for the data in the RemovePartials folder ##
## ##
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
## ##
## Next Script(s) to Run: ##
## 7_FinalRename.py ##
## ##
##################################################################################################
from Bio import SeqIO
from Bio.Seq import Seq
from statistics import mean
from distutils import spawn
import argparse, os, sys, time, re
from argparse import RawTextHelpFormatter,SUPPRESS
#------------------------------ Colors For Print Statements ------------------------------#
class color:
    """ANSI escape sequences used to colorize the script's terminal messages."""

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    # Resets every colour/attribute set by the other codes.
    END = '\033[0m'

    # Foreground colours
    DARKCYAN = '\033[36m'
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    ORANGE = '\033[38;5;214m'
#------------------------------- Main Functions of Script --------------------------------#
###########################################################################################
###---------------------------- UPDATE DIAMOND PATH BELOW! -----------------------------###
###########################################################################################
## IF Diamond is IN YOUR PATH then no updating is needed...
def check_diamond_path():
    """Return the path of the "diamond" executable, or exit if it is missing.

    Edit ``diamond_path`` below to hard-code a location; while it is left
    empty the executable is looked up on the PATH.

    Returns:
        Path of the executable as a string.

    Raises:
        SystemExit: when no executable is found (after printing a hint).
    """
    # shutil.which replaces distutils.spawn.find_executable: distutils is
    # deprecated and no longer ships with Python 3.12+.
    import shutil

    diamond_path = ''
    if diamond_path == '':
        diamond_path = shutil.which("diamond")
        #diamond_path = '/path/to/diamond'

    if diamond_path is None:
        print (color.BOLD + '\n\nPlease open this script and check that you have included'
            +' the PATH to the'+color.BLUE+' "diamond" '+color.END+color.BOLD+'executable.\n\n'+color.END)
        print (color.BOLD+color.BLUE+'LOOK FOR:\n\n'+color.RED
            +'#------------------------------ UPDATE DIAMOND PATH BELOW! -------------------------------#'
            +color.BLUE+'\n\nThis is somewhere around lines 50 - 80...\n\n'+color.END)
        sys.exit()

    return diamond_path
###########################################################################################
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
###########################################################################################
def check_args():
    """Parse the command line and locate the files that share the prefix.

    Prints the description and exits when called without arguments, and
    exits when ``return_more_info`` reports a problem.

    Returns:
        The argparse namespace, extended with ``id_print`` (identity as a
        whole-number percentage string), ``all_output_folder`` (folder part
        of the prefix, ending with '/'), ``file_prefix`` (reduced to its
        last path component) and the lists ``file_listNTD``,
        ``file_listAA`` and ``file_listTSV``.
    """
    description = (
        color.BOLD + '\n\nThis script is intended to '+color.RED+'Identify and Collapse '+color.END
        +color.BOLD+'partial '+color.PURPLE+'ORFS\n'+color.END+color.BOLD+'present within a '
        +color.RED+'Given'+color.END+color.BOLD+' transcriptome (or replicate) transcriptome(s)'
        +usage_msg())
    parser = argparse.ArgumentParser(description=description, usage=SUPPRESS,
        formatter_class=RawTextHelpFormatter)

    required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
    required_arg_group.add_argument('--file_prefix','-fp', action='store',
        help=color.BOLD+color.GREEN+' File prefix that is unique (or common)\n to the files '
        'to be processed\n'+color.END)

    optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
    optional_arg_group.add_argument('--identity','-id', type=float, action='store', default=0.98,
        help=color.BOLD+color.GREEN+' Identity threshold for identifying \n "partials" to larger'
        ' contigs\n (default = 0.98)\n'+color.END)
    optional_arg_group.add_argument('-author', action='store_true',
        help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
    optional_arg_group.add_argument('--hook_fasta','-f', help='Path to the fasta file of the Hook DB in the Databases/db_OG folder')

    if len(sys.argv[1:]) == 0:
        print (parser.description)
        print ('\n')
        sys.exit()

    args = parser.parse_args()
    args.id_print = str(int(float(args.identity)*100))

    if args.file_prefix is None:
        # No prefix (e.g. only -author was given): nothing can match, so
        # leave the lists empty and let return_more_info() report it instead
        # of crashing on None.split().
        args.all_output_folder = './'
        args.file_listNTD, args.file_listAA, args.file_listTSV = [], [], []
    else:
        folder, has_folder, prefix = args.file_prefix.rpartition('/')
        # A prefix without any '/' refers to the current directory; joining
        # an empty folder part with '/' used to point at the file-system
        # root.
        args.all_output_folder = folder + '/' if has_folder else './'
        args.file_prefix = prefix

        candidates = [i for i in os.listdir(args.all_output_folder) if args.file_prefix in i]
        args.file_listNTD = [args.all_output_folder + i for i in candidates if i.endswith('NTD.ORF.fasta')]
        args.file_listAA = [args.all_output_folder + i for i in candidates if i.endswith('AA.ORF.fasta')]
        args.file_listTSV = [args.all_output_folder + i for i in candidates if i.endswith('results.tsv')]

    quit_eval = return_more_info(args)
    if quit_eval > 0:
        print ('\n')
        sys.exit()

    return args
###########################################################################################
###------------------------------- Script Usage Message --------------------------------###
###########################################################################################
def usage_msg():
    """Return the colorized example command shown in the script description."""
    # The example now names this script as its own usage header does
    # ('6_FilterPartials.py'); it previously said '6_RemovePartials.py'.
    return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 6_FilterPartials.py'
        ' --file_prefix Op_me_Xxma'+color.END)
##########################################################################################
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
##########################################################################################
def return_more_info(args):
    """Report missing inputs and print the author information on request.

    Args:
        args: namespace providing ``author`` and the lists
            ``file_listNTD``, ``file_listAA`` and ``file_listTSV``.

    Returns:
        Number of conditions that should stop the script; ``0`` means the
        run may continue.
    """
    valid_arg = 0

    author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'
        ' maurerax@gmail.com\n\n'+color.END)

    if args.author == True:
        print (author)
        valid_arg += 1

    # One message per kind of input that was not found. Compared with the
    # former three copies, each message now ends with color.END (the
    # terminal was left in bold) and has the missing space before "is present".
    expected_inputs = (
        (args.file_listNTD, 'Nucleotide Fasta Files'),
        (args.file_listAA, 'Protein Fasta Files'),
        (args.file_listTSV, 'OG-Assignment Spreadsheets'),
    )
    for found_files, label in expected_inputs:
        if found_files == []:
            print (color.BOLD+'\n\nNo '+color.ORANGE+label+color.END+color.BOLD+
                ' found!\n\nCheck that your'+color.GREEN+' File Prefix'+color.END+color.BOLD+
                ' is present in\nthe files of interest'+color.END)
            valid_arg += 1

    if not (len(args.file_listNTD) == len(args.file_listAA) == len(args.file_listTSV)):
        print (color.BOLD+color.RED+'\n\nError:'+color.END+color.BOLD+' Unequal numbers of'
            ' input files found.\n\nDouble-check that there are:'+color.CYAN+' SINGLE'+color.END
            +color.BOLD+' Nucleotide and Protein fasta files and OG-assignment Spreadsheet for'
            ' each transcriptome\n\nThen try once again.'+color.END)
        valid_arg += 1

    return valid_arg
##########################################################################################
###------------------------- Creates Folders For Storing Data -------------------------###
##########################################################################################
def prep_folders(args):
    """Create the output folder tree for ``args.file_prefix``.

    All paths are built by plain concatenation onto ``args.all_output_folder``,
    exactly as the rest of the script does, so that value is used as given.

    Directories are created with ``os.makedirs(..., exist_ok=True)`` instead of
    shelling out to ``mkdir``: this works for paths containing spaces and also
    fills in sub-folders that are missing from an already existing parent.
    """
    taxon_folder = args.all_output_folder + args.file_prefix
    wanted = (
        args.all_output_folder + 'ToRename',
        taxon_folder + '/Original/SpreadSheets',
        taxon_folder + '/Original/Concatenated/SpreadSheets',
        taxon_folder + '/Processed/SpreadSheets',
    )
    for folder in wanted:
        # makedirs also creates every missing intermediate directory.
        os.makedirs(folder, exist_ok=True)
##########################################################################################
###-------------------- Merges Fasta Files When Replicates Present --------------------###
##########################################################################################
def merge_fasta_replicates(args, type):
    """Concatenate the replicate fasta files of one kind into a single file.

    ``type`` is ``'NTD'`` to merge ``args.file_listNTD``; any other value
    merges ``args.file_listAA``.  Every record name is prefixed with the
    zero-based index of the file it came from (``>0_name``, ``>1_name`` ...).
    The result is written to
    ``<Original/Concatenated>/<file_prefix>.<type>.Concatenated.fasta``.
    """
    cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/'
    fasta_list = args.file_listNTD if type == 'NTD' else args.file_listAA

    fasta_to_merge = []
    for count, path in enumerate(fasta_list):
        # Context manager so the handle is closed as soon as the file is read.
        with open(path) as handle:
            records = [rec for rec in handle.read().split('>') if rec != '']
        for rec in records:
            # A file without a final newline would otherwise have the next
            # file's first header glued onto its last sequence line.
            if not rec.endswith('\n'):
                rec += '\n'
            fasta_to_merge.append('>' + str(count) + '_' + rec)

    with open(cat_folder+args.file_prefix+'.'+type+'.Concatenated.fasta','w+') as w:
        w.write(''.join(fasta_to_merge))
    # Pause kept from the original implementation.
    time.sleep(.75)
##########################################################################################
###--------------------- Merges TSV Files When Replicates Present ---------------------###
##########################################################################################
def merge_tsv_replicates(args):
    """Concatenate the replicate spreadsheets in ``args.file_listTSV``.

    Every non-empty line is prefixed with the zero-based index of its source
    file followed by ``_``.  Lines are joined with a single newline (no
    trailing newline) and written to
    ``<Original/Concatenated/SpreadSheets>/<file_prefix>_Concatenated.allOGCleanresults.tsv``.
    """
    cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/SpreadSheets/'

    tsv_to_merge = []
    for count, path in enumerate(args.file_listTSV):
        # Context manager so the handle is closed as soon as the file is read.
        with open(path) as handle:
            rows = handle.read().split('\n')
        tsv_to_merge.extend(str(count) + '_' + row for row in rows if row != '')

    with open(cat_folder+args.file_prefix+'_Concatenated.allOGCleanresults.tsv','w+') as w:
        w.write('\n'.join(tsv_to_merge))
    # Pause kept from the original implementation.
    time.sleep(.75)
##########################################################################################
###------------------ Calls on the other Merge Functions by Data Type -----------------###
##########################################################################################
def merge_relevant_data(args):
    """Merge the replicate fasta files (NTD, then AA) and then the spreadsheets."""
    banner = ''.join((color.BOLD, '\n\nMerging Transcriptome data together.', color.END))
    print(banner)
    for seq_kind in ('NTD', 'AA'):
        merge_fasta_replicates(args, seq_kind)
    merge_tsv_replicates(args)
##########################################################################################
###------------------- Uses Diamond to perform Self-vs-Self "BLAST" -------------------###
##########################################################################################
def self_blast(args, diamond_path, query_cover=0.7, threads=60):
    """Run a DIAMOND self-vs-self search on the concatenated protein fasta.

    Builds a DIAMOND database from ``<file_prefix>.AA.Concatenated.fasta`` and
    searches the same file against it with ``--id args.identity``.

    ``query_cover`` and ``threads`` are passed to ``--query-cover`` and
    ``--threads``; their defaults are the values that used to be hard-coded.
    NOTE(review): DIAMOND documents ``--query-cover`` as a percentage, so the
    default of 0.7 looks like it requests 0.7% rather than 70% coverage --
    confirm the intended threshold before changing the default.

    Returns the path of the tabular (outfmt 6) result file.
    """
    cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/'
    aa_base = cat_folder + args.file_prefix + '.AA.Concatenated'
    # id_print is always passed through str() so a numeric value cannot break
    # the string concatenations below.
    id_label = str(args.id_print)
    out_tsv = cat_folder + 'SpreadSheets/' + args.file_prefix + '.Concatenated.Self.' + id_label + 'ID.tsv'

    diamond_makedb = diamond_path + ' makedb --in ' + aa_base + '.fasta -d ' + aa_base
    diamond_self = (diamond_path + ' blastp -q ' + aa_base + '.fasta -d ' + aa_base
                    + ' --strand plus --no-self-hits --id ' + str(args.identity)
                    + ' --query-cover ' + str(query_cover) + ' --evalue 1e-15 --threads '
                    + str(threads) + ' --outfmt 6 -o ' + out_tsv)

    print(color.BOLD+'\n\nBinning ALL '+color.ORANGE+'Nucleotide ORFs'+color.END+color.BOLD
          +' for '+color.GREEN+args.file_prefix+color.END+color.BOLD+' at '+id_label
          +'% identity.\n\n'+color.END)

    os.system(diamond_makedb)
    os.system(diamond_self)
    return out_tsv
##########################################################################################
###--------------- Checks Whether the Self-vs-Self Output Has Any Hits ----------------###
##########################################################################################
def check_Self_vs_Self(tsv_file):
    """Report whether the self-vs-self result file contains any hit lines.

    Returns ``'empty'`` when ``tsv_file`` has no non-empty line; in that case
    the file is overwritten with a short explanatory sentence.  Otherwise
    returns ``'continue'`` and leaves the file untouched.
    """
    # Read inside a context manager so the handle is closed before the file
    # is possibly reopened for writing below.
    with open(tsv_file) as handle:
        has_hits = any(row != '' for row in handle.read().split('\n'))

    if has_hits:
        return 'continue'

    with open(tsv_file,'w+') as w:
        w.write('No Self-vs-Self hits were found')
    return 'empty'
##########################################################################################
###-------------------- Removes Nearly Identical ORFs from Data Set -------------------###
##########################################################################################
def filter_NTD_data(args):
    """Remove partial and abnormally sized ORFs and write the filtered fastas.

    Steps, in order:

    1. For every OG in ``args.hook_fasta`` (keyed by the last 10 characters of
       the record id) compute the mean sequence length.
    2. Read the self-vs-self table and toss every subject whose ``Len`` value
       is above 4.5x or below 1.5x that mean.
    3. Walk the remaining hits sorted by subject length (then coverage when
       the names carry a ``Cov`` field), longest first.  For each hit whose
       subject has not been tossed, record the query as a partial of the
       subject and toss the query.
    4. Rename the surviving records: the leading replicate index is dropped
       and ``_Trans<N>`` is appended, N being the number of distinct replicate
       indices among the record and its partials (1 when it has none).
    5. Drop records shorter than 0.7x or longer than 2x the mean length of the
       surviving records of the same OG.
    6. Write ``<file_prefix>_SeqPairsAbove98.txt`` and the two
       ``_Filtered.Final.*.ORF.fasta`` files.

    Returns the list of kept nucleotide headers (each starting with ``>``).

    Changes from the previous implementation: the summary no longer reports
    zero partial ORFs (the counters used to be reset just before printing),
    tossed names are kept in sets so membership tests are O(1), the TSV file
    handle is closed, and the unreachable "no replicate info" branch is gone.
    """
    cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/'
    proc_folder = args.all_output_folder + args.file_prefix + '/Processed/'

    def og_of(text):
        # Six characters after the last 'OG?_' marker, plus the four
        # characters that precede the first occurrence of those six.
        og_number = re.split('OG.{1}_', text)[-1][:6]
        og_prefix = text.split(og_number)[0][-4:]
        return og_prefix + og_number

    def len_of(text):
        # Integer following the last 'Len' in the text.
        return int(text.split('Len')[-1].split('_')[0])

    def cov_of(text):
        # Integer following the last 'Cov' in the text.
        return int(text.split('Cov')[-1].split('_')[0])

    ##########################################
    ## Mean length per OG in the hook fasta ##
    ##########################################
    hook_lengths = {}
    for rec in SeqIO.parse(args.hook_fasta, 'fasta'):
        hook_lengths.setdefault(rec.id[-10:], []).append(len(str(rec.seq)))
    OGLenDB = {og: mean(lengths) for og, lengths in hook_lengths.items()}

    print(color.BOLD+'\n\nRemoving Partial '+color.PURPLE+'ORFs'+color.END+color.BOLD+
          ' with >'+str(args.id_print)+'% Nucleotide Identity over >70% of\ntheir length when '
          'compared to more complete '+color.PURPLE+'ORFs '+color.END+color.BOLD+'from: '
          +color.CYAN+args.file_prefix+'\n\n'+color.END)

    #####################################################################
    ## Self-v-self BLAST Output Parsing - first checks for Seq-length! ##
    #####################################################################
    self_hits_path = (cat_folder+'SpreadSheets/'+args.file_prefix
                      +'.Concatenated.Self.'+str(args.id_print)+'ID.tsv')
    with open(self_hits_path) as handle:
        nuc_tsv_raw = [row.rstrip('\n') for row in handle if row != '\n']

    # Subjects whose length is far from the hook-fasta mean of their OG.
    # The OG and length are parsed from the whole line; the last 'OG?_' and
    # 'Len' markers of a line belong to the subject column.
    hook_outliers = set()
    for line in nuc_tsv_raw:
        og = og_of(line)
        if og in OGLenDB:
            subject_len = len_of(line)
            if subject_len > 4.5*OGLenDB[og] or subject_len < 1.5*OGLenDB[og]:
                hook_outliers.add(line.split('\t')[1])

    seqs_to_toss = set(hook_outliers)
    nuc_tsv = [line for line in nuc_tsv_raw if line.split('\t')[1] not in seqs_to_toss]

    # Longest subjects first, so shorter sequences end up as their partials.
    if nuc_tsv:
        if 'Cov' in nuc_tsv[0].split('\t')[0].split('_')[-3]:
            nuc_tsv.sort(key=lambda x: (-len_of(x.split('\t')[1]), -cov_of(x.split('\t')[1])))
        else:
            nuc_tsv.sort(key=lambda x: -len_of(x.split('\t')[1]))

    nuc_Above98_hit = {}    # kept subject -> list of queries binned under it
    nuc_tsv_100 = 0         # accepted pairs reported at exactly 100.0 identity
    for line in nuc_tsv:
        fields = line.split('\t')
        if fields[1] not in seqs_to_toss:
            nuc_Above98_hit.setdefault(fields[1], []).append(fields[0])
            seqs_to_toss.add(fields[0])
            if fields[2] == '100.0':
                nuc_tsv_100 += 1

    inFasta_NTD_rawLen = [i for i in SeqIO.parse(cat_folder+args.file_prefix+'.NTD.Concatenated.fasta', 'fasta') if i.description]
    inFasta_NTD = [i for i in inFasta_NTD_rawLen if i.description not in seqs_to_toss]
    inFasta_AA = [i for i in SeqIO.parse(cat_folder+args.file_prefix+'.AA.Concatenated.fasta','fasta') if i.description not in seqs_to_toss]

    def trans_header(description):
        # Drop the replicate index and append the replicate-support count.
        hits = nuc_Above98_hit.get(description)
        if hits is None:
            rep_num = '1'
        else:
            replicate_ids = {description.split('_')[0]}
            replicate_ids.update(hit.split('_')[0] for hit in hits)
            rep_num = str(len(replicate_ids))
        return '>'+'_'.join(description.split('_')[1:])+'_Trans'+rep_num

    prepped_NTD = [trans_header(rec.description)+'\n'+str(rec.seq) for rec in inFasta_NTD]
    # Stop codons ('*') are written as 'X' in the protein output.
    prepped_AA = [trans_header(rec.description)+'\n'+str(rec.seq).replace('*','X') for rec in inFasta_AA]

    with open(args.all_output_folder + args.file_prefix + '/'+args.file_prefix+'_SeqPairsAbove98.txt','w+') as w:
        for k, v in nuc_Above98_hit.items():
            w.write(k+'\t'+'\t'.join(v)+'\n')

    ###################################################################################
    ## Check for abnormally short sequences for the taxon for every Gene Family (OG) ##
    ###################################################################################
    print(color.BOLD+'Removing Abnormally Short (70% length) OR Long (200% length)'
          +color.PURPLE+' ORFs'+color.END+color.BOLD+'\ncompared to typical '+color.ORANGE+'Gene '
          'Family '+color.END+color.BOLD+'member length for: '+color.CYAN+args.file_prefix+'\n\n'+color.END)

    self_OGLenDB = {}
    for entry in prepped_NTD:
        parts = entry.split('\n')
        self_OGLenDB.setdefault(og_of(parts[0]), []).append(len(parts[-1]))

    good_NTD_names = []
    for entry in prepped_NTD:
        parts = entry.split('\n')
        lengths = self_OGLenDB[og_of(parts[0])]
        total, count = sum(lengths), float(len(lengths))
        if (0.7*total/count) <= len(parts[-1]) <= (2*total/count):
            good_NTD_names.append(parts[0])

    good_name_set = set(good_NTD_names)
    good_NTD_seqs = [i for i in prepped_NTD if i.split('\n')[0] in good_name_set]
    good_AA_seqs = [i for i in prepped_AA if i.split('\n')[0] in good_name_set]

    # Summary counters.  Sequences tossed as hook-length outliers are reported
    # together with the own-OG length outliers; everything else tossed above
    # is a partial ORF.
    partial_total = len(seqs_to_toss) - len(hook_outliers)
    odd_length_total = len(hook_outliers) + (len(prepped_NTD) - len(good_NTD_names))

    ####################################################################
    ## Finalized Outputs are Summarized and Written Out to New Fastas ##
    ####################################################################
    print(color.BOLD+'There were '+color.CYAN+str(len(inFasta_NTD_rawLen))+color.END+color.BOLD
          +color.PURPLE+' ORFs '+color.END+color.BOLD+'originally, with '+color.ORANGE+
          str(nuc_tsv_100)+color.END+color.BOLD+' Partial '+color.PURPLE+'ORFs'+color.END+
          color.BOLD+' that\nwere '+color.RED+'100% Identical'+color.END+color.BOLD+' to larger'
          +color.PURPLE+' ORFs.\n\n'+color.END)
    print(color.BOLD+'Of the '+color.CYAN+str(len(inFasta_NTD_rawLen))+color.END+color.BOLD
          +' original'+color.PURPLE+' ORFs'+color.END+color.BOLD+', '+color.ORANGE+str(partial_total)+
          color.END+color.BOLD+' are '+color.PURPLE+'Partial ORFs '+color.END+color.BOLD+'(e.g. '+
          color.RED+'> '+str(args.id_print)+'%'+color.END+color.BOLD+'\nNUCLEOTIDE identity) to larger'
          +color.PURPLE+' ORFs'+color.END+color.BOLD+' with '+color.ORANGE+str(odd_length_total)
          +color.END+color.BOLD+' additional'+color.PURPLE+' ORFs\n'+color.END+color.BOLD+'that were either '+
          color.RED+'TOO LONG or SHORT.\n\n'+color.END)
    print(color.BOLD+'Overall, there are '+color.GREEN+str(len(good_NTD_seqs))+' Unique ORFs'
          +color.END+color.BOLD+' for '+color.CYAN+args.file_prefix+'\n'+color.END)

    with open(proc_folder+args.file_prefix+'_Filtered.Final.NTD.ORF.fasta','w+') as w:
        for i in good_NTD_seqs:
            w.write(i+'\n')
    with open(proc_folder+args.file_prefix+'_Filtered.Final.AA.ORF.fasta','w+') as x:
        for i in good_AA_seqs:
            x.write(i+'\n')

    return good_NTD_names
##########################################################################################
###------------------- Updates SpreadSheet with Update Sequence Names -----------------###
##########################################################################################
def update_tsv(args, NTD_list_names):
    """Write the spreadsheet rows that belong to the kept ORFs, renamed.

    ``NTD_list_names`` holds the kept headers (``>name_Trans<N>``).  Rows of
    the concatenated spreadsheet are indexed by their first column with the
    leading replicate index removed; each kept header is looked up with its
    ``_Trans...`` suffix removed and written as ``name_Trans<N>`` followed by
    the remaining columns.

    Raises ``KeyError`` when a kept header has no row in the spreadsheet.

    Rows are stripped of their line ending before being stored; previously
    every row except the file's last one kept its newline, which produced an
    irregular mix of blank lines in the output.
    """
    cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/SpreadSheets/'
    proc_folder = args.all_output_folder + args.file_prefix + '/Processed/'

    with open(cat_folder+args.file_prefix+'_Concatenated.allOGCleanresults.tsv') as handle:
        rows = [row.rstrip('\n') for row in handle if row != '\n']

    inTSV = {}
    for row in rows:
        columns = row.split('\t')
        # Key: first column without its leading replicate index.
        inTSV['_'.join(columns[0].split('_')[1:])] = '\t'.join(columns[1:])

    Updated_inTSV = [i.strip('>')+'\t'+inTSV[i.split('_Trans')[0].strip('>')] for i in NTD_list_names]

    with open(proc_folder+'/SpreadSheets/'+args.file_prefix+'_Filtered.Final.allOGCleanresults.tsv','w+') as w:
        for line in Updated_inTSV:
            w.write(line+'\n')
def no_partials_present(args, OGLenDB=None):
    """Length-filter the concatenated data when the self-vs-self search is empty.

    ``OGLenDB`` is accepted for backward compatibility only: the table is
    always rebuilt from ``args.hook_fasta``, so the argument is ignored and
    now defaults to ``None`` (the script's ``main`` calls this function with
    ``args`` alone).

    Steps:

    1. Toss nucleotide records at least 4.5x as long as the mean hook-fasta
       length of their OG.  OGs absent from the hook fasta are left alone
       (this used to raise ``KeyError``).
    2. Keep records between 0.7x and 2x the mean length of the remaining
       records of the same OG.
    3. Write the kept nucleotide and protein records and the matching
       spreadsheet rows, with ``_Trans1`` appended to every name.

    NOTE(review): unlike ``filter_NTD_data`` this path keeps the leading
    replicate index in the output names and does not replace ``*`` in the
    protein sequences -- confirm whether that difference is intended.
    """
    print(color.BOLD+color.RED+'\n\nWarning:'+color.END+color.BOLD+' No partial sequences'
          ' were found with > '+str(args.id_print)+'% nucleotide identity.\n\nThe data will still be '
          'checked for ORFs that are unexpectedly '+color.ORANGE+'Short'+color.END+color.BOLD+' or'
          +color.ORANGE+' Long.\n\n'+color.END)

    cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/'
    proc_folder = args.all_output_folder + args.file_prefix + '/Processed/'
    NTD_file = cat_folder+args.file_prefix+'.NTD.Concatenated.fasta'
    AA_file = cat_folder+args.file_prefix+'.AA.Concatenated.fasta'
    TSV_file = cat_folder+'/SpreadSheets/'+args.file_prefix+'_Concatenated.allOGCleanresults.tsv'

    def og_of(name):
        # Six characters after the last 'OG?_' marker, plus the four
        # characters that precede the first occurrence of those six.
        og_number = re.split('OG.{1}_', name)[-1][:6]
        og_prefix = name.split(og_number)[0][-4:]
        return og_prefix + og_number

    # Mean length per OG in the hook fasta (keyed by the last 10 id characters).
    hook_lengths = {}
    for rec in SeqIO.parse(args.hook_fasta, 'fasta'):
        hook_lengths.setdefault(rec.id[-10:], []).append(len(str(rec.seq)))
    OGLenDB = {og: mean(lengths) for og, lengths in hook_lengths.items()}

    inFasta = {i.description:str(i.seq) for i in SeqIO.parse(NTD_file,'fasta')}

    # Pass 1: far longer than the hook-fasta mean of the OG.
    seqs_to_toss = [name for name, seq in inFasta.items()
                    if og_of(name) in OGLenDB and len(seq) >= 4.5*OGLenDB[og_of(name)]]

    print(color.BOLD+'Removing Abnormally Short (70% length) OR Long (200% length)'
          +color.PURPLE+' ORFs'+color.END+color.BOLD+'\ncompared to typical '+color.ORANGE+'Gene '
          'Family '+color.END+color.BOLD+'member length for: '+color.CYAN+args.file_prefix+'\n\n'+color.END)

    for tossed in seqs_to_toss:
        del inFasta[tossed]

    # Pass 2: compare against the mean length of this taxon's own OG members.
    self_OGLenDB = {}
    for name, seq in inFasta.items():
        self_OGLenDB.setdefault(og_of(name), []).append(len(seq))
    self_OGLenDB_Final = {og: sum(lengths)/len(lengths) for og, lengths in self_OGLenDB.items()}

    good_NTD_data = {}
    for name, seq in inFasta.items():
        typical = self_OGLenDB_Final[og_of(name)]
        if 0.7*typical <= len(seq) <= 2*typical:
            good_NTD_data[name] = seq

    good_AA_data = {i.description:str(i.seq) for i in SeqIO.parse(AA_file,'fasta') if i.description in good_NTD_data}

    with open(TSV_file) as handle:
        good_TSV_data = [row for row in handle.read().split('\n')
                         if row != '' and row.split('\t')[0] in good_NTD_data]
    renamed_TSV_data = [row.split('\t')[0]+'_Trans1\t'+'\t'.join(row.split('\t')[1:]) for row in good_TSV_data]

    with open(proc_folder+args.file_prefix+'_Filtered.Final.NTD.ORF.fasta','w+') as w:
        for k, v in good_NTD_data.items():
            w.write('>'+k+'_Trans1\n'+v+'\n')
    with open(proc_folder+args.file_prefix+'_Filtered.Final.AA.ORF.fasta','w+') as x:
        for k, v in good_AA_data.items():
            x.write('>'+k+'_Trans1\n'+v+'\n')
    with open(proc_folder+'/SpreadSheets/'+args.file_prefix+'_Filtered.Final.allOGCleanresults.tsv','w+') as y:
        y.write('\n'.join(renamed_TSV_data))
##########################################################################################
###--------------------- Cleans up the Folder and Moves Final Files -------------------###
##########################################################################################
def clean_up(args):
    """Move the input files of every replicate into the ``Original`` folder.

    For each nucleotide fasta in ``args.file_listNTD`` three ``mv`` commands
    are run through the shell: the fasta itself, its ``AA.ORF.fasta``
    counterpart, and the ``...named*allOGCleanresults.tsv`` spreadsheets that
    share its name up to ``named``.
    """
    original_dir = args.all_output_folder + args.file_prefix + '/Original/'
    for ntd_path in args.file_listNTD:
        aa_path = ntd_path.replace('NTD.ORF.fasta','AA.ORF.fasta')
        tsv_glob = ntd_path.split('named')[0]+'named*allOGCleanresults.tsv '
        commands = (
            'mv ' + ntd_path + ' ' + original_dir,
            'mv ' + aa_path + ' ' + original_dir,
            'mv ' + tsv_glob + original_dir + 'SpreadSheets/',
        )
        for command in commands:
            os.system(command)
###########################################################################################
###-------------------------------- Next Script Message --------------------------------###
###########################################################################################
def next_script():
    """Print the name of the script to run after this one."""
    follow_up = '6b_update_cov_post_removepartials.py\n\n'
    print(''.join((color.BOLD, '\nNext Script is: ', color.GREEN, follow_up, color.END)))
##########################################################################################
###------------------- Checks Command Line Arguments and Calls Steps ------------------###
##########################################################################################
def main():
    """Run the whole pipeline: merge inputs, self-vs-self search, filter, tidy up."""
    diamond_path = check_diamond_path()
    args = check_args()
    prep_folders(args)
    merge_relevant_data(args)
    self_BLAST_out = self_blast(args, diamond_path)
    evaluation = check_Self_vs_Self(self_BLAST_out)
    if evaluation != 'empty':
        NTD_names = filter_NTD_data(args)
        update_tsv(args, NTD_names)
    else:
        # no_partials_present takes a second positional argument that it
        # rebuilds internally; calling it with args alone raised TypeError.
        no_partials_present(args, None)
    clean_up(args)
    next_script()


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,88 @@
#!/usr/bin/python
from __future__ import print_function
__author__ = "Jean-David Grattepanche"
__version__ = "2, August 28, 2017"
__email__ = "jeandavid.grattepanche@gmail.com"
import sys
import os
import re
import time
import string
import os.path
from Bio import SeqIO
from sys import argv
def Addcoverage(code):
    """Recompute coverage for ORFs that absorbed partials and copy the results.

    ``code`` is the path of a taxon folder that contains ``Processed/`` and,
    when partial ORFs were found, ``<name>_SeqPairsAbove98.txt`` (``<name>``
    being the last path component).  A trailing ``/`` is tolerated.

    For every line of the pairs file the sum of ``Cov * Len`` over all names
    on the line is stored under the first name (minus its leading replicate
    index).  Names in ``Processed/`` spreadsheets and fastas that match such
    an entry get their ``Cov`` value replaced by that sum divided by their
    own ``Len``; all names lose their ``_Trans...`` suffix.  Results go to
    ``Updated_Coverage/`` and are then copied to ``../ToRename/``.

    Changes from the previous implementation: the reference name is always
    the first column (it used to be re-assigned whenever the running sum was
    still 0), a missing pairs file is treated as "nothing to update" instead
    of crashing, a bare or slash-terminated folder argument resolves to the
    right paths, and all files are closed via context managers.
    """
    seqfolder = code.rstrip('/')
    # Parent folder; '.' when only a bare folder name was given.
    all_output_folder = os.path.dirname(seqfolder) or '.'
    code = seqfolder.split('/')[-1]

    covupd = {}
    pairs_path = seqfolder + '/' + code + '_SeqPairsAbove98.txt'
    if os.path.isfile(pairs_path):
        with open(pairs_path, 'r') as pairs:
            for seqcoll in pairs:
                if seqcoll.strip() == '':
                    continue
                names = seqcoll.rstrip('\n').split('\t')
                reftrans = ('_').join(names[0].split('_')[1:])
                CL = 0
                for transc in names:
                    coverage = int(transc.split('Cov')[1].split('_')[0])
                    Length = int(transc.split('Len')[1].split('_')[0])
                    CL += coverage * Length
                covupd[reftrans] = CL
    else:
        print('Warning: ' + pairs_path + ' not found; no coverage values will be updated.')

    for folder in (seqfolder + '/Updated_Coverage/', seqfolder + '/Updated_Coverage/SpreadSheets/'):
        if not os.path.isdir(folder):
            os.makedirs(folder)

    for spreadsh in os.listdir(seqfolder + '/Processed/SpreadSheets/'):
        if not spreadsh.endswith('.tsv'):
            continue
        out_path = seqfolder + '/Updated_Coverage/SpreadSheets/' + spreadsh.split('Final')[0] + 'UC.Final' + spreadsh.split('Final')[1]
        with open(out_path, 'w+') as outtsvtokeep, open(seqfolder + '/Processed/SpreadSheets/' + spreadsh, 'r') as intsv:
            for row in intsv:
                if row.split('_Trans')[0] in covupd:
                    og_number = re.split('OG.{1}_', row)[-1][:6]
                    og_prefix = row.split(og_number)[0][-4:]
                    newcov2 = round(covupd[row.split('_Trans')[0]] / int(row.split('_Len')[1].split('_')[0]))
                    outtsvtokeep.write(row.split('Cov')[0]+'Cov'+str(newcov2)+'_' + og_prefix + row.split(og_prefix)[1].split('_Trans')[0] + '\t' + ('\t').join(row.split('\t')[1:]))
                elif 'Trans' in row:
                    outtsvtokeep.write(row.split('_Trans')[0] + '\t' + ('\t').join(row.split('\t')[1:]))
                else:
                    outtsvtokeep.write(row)

    for seqfile in os.listdir(seqfolder + '/Processed'):
        if not seqfile.endswith('.fasta'):
            continue
        out_path = seqfolder + '/Updated_Coverage/' + seqfile.split('Final')[0] + 'UC.Final' + seqfile.split('Final')[1]
        with open(out_path, 'w+') as outseqtokeep:
            for Seq in SeqIO.parse(seqfolder + '/Processed/' + seqfile, 'fasta'):
                if Seq.description.split('_Trans')[0] not in covupd:
                    outseqtokeep.write('>' + Seq.description.split('_Trans')[0] + '\n' + str(Seq.seq) + '\n')
                else:
                    og_number = re.split('OG.{1}_', Seq.description)[-1][:6]
                    og_prefix = Seq.description.split(og_number)[0][-4:]
                    og = og_prefix + og_number
                    newcov = round(covupd[Seq.description.split('_Trans')[0]] / int(Seq.description.split('_Len')[1].split('_')[0]))
                    # NOTE(review): this keeps the two characters that precede
                    # the OG in the old name, whereas the spreadsheet branch
                    # above does not -- confirm both produce the same names.
                    outseqtokeep.write('>' + Seq.description.split('Cov')[0] + 'Cov' + str(newcov) + '_' + Seq.description.split(og)[0][-2:] + og + '\n' + str(Seq.seq) + '\n')

    if not os.path.isdir(all_output_folder + '/ToRename'):
        os.makedirs(all_output_folder + '/ToRename')
    os.system('cp ' + seqfolder + '/Updated_Coverage/*fasta ' + all_output_folder + '/ToRename/')
    os.system('cp ' + seqfolder + '/Updated_Coverage/SpreadSheets/*tsv ' + all_output_folder + '/ToRename/')
def main():
    """Read the taxon folder from the command line and update its coverage values."""
    # Exactly one argument is expected; anything else used to surface as a
    # bare ValueError from tuple unpacking.
    if len(argv) != 2:
        print('Usage: python ' + argv[0] + ' <path/to/taxon_folder>')
        sys.exit(1)
    Addcoverage(argv[1])


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,398 @@
#!/usr/bin/env python3.5
##__Updated__: 31_08_2017
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
##__Usage__: python 7_FinalizeName.py --help
##################################################################################################
## This script is intended to rename the outputs of the FilterPartials script ##
## to a given 10-character code that is used in the Katz lab Phylogenomic Tree building methods ##
## ##
## Prior to running this script, ensure the following: ##
## ##
## 1. You have assembled your transcriptome and COPIED the 'assembly' file ##
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ##
## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py ##
## 3. Removed SSU/LSU sequences from your Fasta File ##
## 4. Classified your sequences as Strongly Prokaryotic/Eukaryotic or Undetermined ##
## 5. Classified the Non-Strongly Prokaryotic sequences into OGs ##
## 6. You either know (or have inferred) the genetic code of the organism ##
## 7. You have translated the sequences and checked for the data in the RemovePartials folder ##
## 8. Partial sequences have been removed from the transcriptomic data sets ##
## ##
## COMMAND Example Below ##
## Extra Notes at Bottom of Script ##
## ##
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
## ##
## Next Script(s) to Run: ##
## NONE! You're FINISHED! :D ##
## ##
##################################################################################################
import argparse, os, sys
from argparse import RawTextHelpFormatter,SUPPRESS
#----------------------- Solely to Make Print Statements Colorful -----------------------#
class color:
    """ANSI escape sequences used to style the script's terminal messages."""

    # Text attributes and the reset sequence.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'

    # Foreground colours.
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    ORANGE = '\033[38;5;214m'
#------------------------------- Main Functions of Script --------------------------------#
###########################################################################################
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
###########################################################################################
def check_args():
    """Parse the command line, validate it and derive the working paths.

    Exits after printing the description when no arguments are given, when
    ``--input_file`` or ``--name`` is missing, or when ``return_more_info``
    flags a problem.

    On success the returned namespace additionally carries
    ``all_output_folder`` (the folder two levels above the input file),
    ``out_XML``, ``file_prefix`` and the four ``r2g_*`` ReadyToGo folders.
    """
    parser = argparse.ArgumentParser(description=
        color.BOLD + '\n\nThis script is intended to '+color.RED+'Rename '+color.END
        +color.BOLD+'the core set of '+color.PURPLE+'ORFS\n'+color.END+color.BOLD+'with a valid '
        +color.RED+'10-character code'+color.END+color.BOLD+' for use in the KatzLab\nPhylogenomic Pipeline'
        +usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter)

    required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
    required_arg_group.add_argument('--input_file','-in', action='store',
        help=color.BOLD+color.GREEN+' One of the Fasta files that is to be renamed\n'+color.END)
    required_arg_group.add_argument('--name','-n', action='store',
        help=color.BOLD+color.GREEN+' A valid 10-Character code for updating the data\n'+color.END)

    optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
    optional_arg_group.add_argument('-author', action='store_true',
        help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)

    if len(sys.argv[1:]) == 0:
        print(parser.description)
        print('\n')
        sys.exit()

    args = parser.parse_args()

    # Both "required" options default to None; every step below assumes they
    # are strings, so stop here with a readable message instead of crashing.
    if args.input_file is None or args.name is None:
        if args.author:
            print(color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'
                  ' maurerax@gmail.com\n\n'+color.END)
        else:
            print(color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Both '+color.GREEN
                  +'--input_file'+color.END+color.BOLD+' and '+color.GREEN+'--name'+color.END
                  +color.BOLD+' are required.\n'+color.END+usage_msg())
        print('\n')
        sys.exit()

    quit_eval = return_more_info(args)
    if quit_eval > 0:
        print('\n')
        sys.exit()

    # Folder two levels above the input file.  Resolved with os.path so that a
    # relative path such as 'ToRename/x.fasta' yields '.' rather than an empty
    # string (which made every derived path start at the filesystem root).
    input_dir = os.path.dirname(args.input_file) or '.'
    args.all_output_folder = os.path.normpath(os.path.join(input_dir, os.pardir))

    tsv_name = args.input_TSV.split('/')[-1]
    if '.allOGCleanresults' in args.input_TSV:
        xml_stem = tsv_name.replace('.allOGCleanresults.','.AA.ORF.')
    else:
        xml_stem = tsv_name.replace('_allOGCleanresults.','_AA.ORF.')
    args.out_XML = args.name+'_XX_'+xml_stem.replace('.tsv','.fasta')+'_1e-10keepall_BlastOutall.oneHit'

    args.file_prefix = args.input_file.split('/')[-1].split('_Filtered.Final')[0]
    # No '_Filtered.Final' in the file name: fall back to the new code.
    if 'fasta' in args.file_prefix:
        args.file_prefix = args.name

    args.r2g_aa = args.all_output_folder + '/ReadyToGo/ReadyToGo_AA/'
    args.r2g_ntd = args.all_output_folder + '/ReadyToGo/ReadyToGo_NTD/'
    args.r2g_tsv = args.all_output_folder + '/ReadyToGo/ReadyToGo_TSV/'
    args.r2g_xml = args.all_output_folder + '/ReadyToGo/ReadyToGo_XML/'
    return args
###########################################################################################
###------------------------------- Script Usage Message --------------------------------###
###########################################################################################
def usage_msg():
    """Return the coloured example command shown in the help text."""
    example_cmd = (' python 7_FinalizeName.py'
                   ' --input_file ../ToRename/Op_me_Xxma_Filtered.Final.AA.ORF.fasta --name Op_me_Xxma')
    pieces = [color.BOLD, color.RED, '\n\nExample usage:', color.CYAN, example_cmd, color.END]
    return ''.join(pieces)
##########################################################################################
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
##########################################################################################
def return_more_info(args):
    """Resolve the three input paths from ``args.input_file`` and check them.

    Sets ``args.input_NTD``, ``args.input_AA`` and ``args.input_TSV`` by
    swapping the ``AA.ORF.fasta`` / ``NTD.ORF.fasta`` suffix of the given
    file, then prints an error for each of the three files that does not
    exist.  Also prints the author contact line when ``args.author`` is set.

    Returns the number of messages printed (0 means everything is in place).

    Fixes: for a nucleotide input the spreadsheet path used to be derived by
    replacing ``'AA.ORF.fasta'``, which does not occur in such a name, so the
    fasta itself was taken as the spreadsheet.  A missing input file or one
    with an unrecognised suffix is now reported instead of raising.  A
    leftover debugging ``print`` of the spreadsheet path was dropped.
    """
    valid_args = 0

    if args.author:
        print(color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'
              ' maurerax@gmail.com\n\n'+color.END)
        valid_args += 1

    input_file = args.input_file
    stem = None
    if input_file is not None:
        for suffix in ('AA.ORF.fasta', 'NTD.ORF.fasta'):
            if input_file.endswith(suffix):
                # Only the trailing suffix is swapped, never a match that
                # happens to occur earlier in the path.
                stem = input_file[:-len(suffix)]
                break

    if stem is None:
        print(color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The input file must end with '
              +color.DARKCYAN+'AA.ORF.fasta'+color.END+color.BOLD+' or '+color.DARKCYAN
              +'NTD.ORF.fasta'+color.END+color.BOLD+'.\n\nDouble-check then try again!\n\n'+color.END)
        return valid_args + 1

    args.input_NTD = stem + 'NTD.ORF.fasta'
    args.input_AA = stem + 'AA.ORF.fasta'
    args.input_TSV = stem + 'allOGCleanresults.tsv'

    if not os.path.isfile(args.input_NTD):
        print(color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Nucleotide '
              'Fasta file ('+color.DARKCYAN+args.input_NTD.split('/')[-1]+color.END+color.BOLD+')\ndoes not'
              ' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
        valid_args += 1
    if not os.path.isfile(args.input_AA):
        print(color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Protein '
              'Fasta file ('+color.DARKCYAN+args.input_AA.split('/')[-1]+color.END+color.BOLD+')\ndoes not'
              ' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
        valid_args += 1
    if not os.path.isfile(args.input_TSV):
        print(color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided TSV '
              ' file ('+color.DARKCYAN+args.input_TSV.split('/')[-1]+color.END+color.BOLD+')\ndoes not'
              ' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
        valid_args += 1

    return valid_args
###########################################################################################
###-------------------- Double Checks Format for 10-Character Code ---------------------###
###########################################################################################
def check_code(args):
    """Validate ``args.name`` as a 10-character ``XX_xx_Xxxx`` style code.

    The code must be 10 characters long, contain exactly two underscores and
    split into parts of length 2, 2 and 4.  On any violation a message with
    three example codes is printed and the script exits; otherwise a
    confirmation line is printed.
    """
    check_name = args.name.split('_')
    examples = ('Three examples below:\n'+color.CYAN+'\n\tSr_ci_Cunc\n\n\tOp_me_Hsap\n\n\t'
                'Am_ar_Ehis\n\n'+color.END)
    bad_format = '\n\nCheck the format of your Species Prefix!\n\n'

    def reject(reason):
        # Print the reason, then the examples, then stop the script.
        print(color.BOLD+reason)
        print(examples)
        sys.exit()

    if len(args.name) != 10:
        reject('\n\nNew Species Prefix is not 10 characters long\n\n')
    if args.name.count('_') != 2:
        reject(bad_format)
    if [len(part) for part in check_name] != [2, 2, 4]:
        reject(bad_format)

    old_label = args.input_file.split('/')[-1].split('_Filtered')[0]
    print(color.BOLD+"\n\nRenaming "+color.ORANGE+old_label
          +color.END+color.BOLD+"'s files with the following 10-character\n"
          "code: "+color.CYAN+args.name+color.END+'\n')
##########################################################################################
###------------------------- Creates Folders For Storing Data -------------------------###
##########################################################################################
def prep_folders(args):
    """Create the ``ReadyToGo`` folders and the taxon's ``Renamed`` folder.

    Uses ``os.makedirs(..., exist_ok=True)`` instead of shelling out to
    ``mkdir``: paths with spaces work, and ``<file_prefix>/Renamed`` is
    created even when the ``<file_prefix>`` folder itself does not exist yet.
    """
    wanted = (
        args.all_output_folder + '/ReadyToGo',
        args.r2g_ntd,
        args.r2g_aa,
        args.r2g_tsv,
        args.r2g_xml,
        args.all_output_folder + '/' + args.file_prefix + '/Renamed',
    )
    for folder in wanted:
        # makedirs also creates every missing intermediate directory.
        os.makedirs(folder, exist_ok=True)
###########################################################################################
###----------- Renames the NTD and AA CDSs with the Given 10-Character Code ------------###
###########################################################################################
def rename_paralogs(args):
    """Prefix every sequence name with ``<args.name>_XX_`` and write the copies.

    The protein fasta, nucleotide fasta and spreadsheet are read from
    ``args.input_AA``, ``args.input_NTD`` and ``args.input_TSV`` and written
    to ``<all_output_folder>/<file_prefix>/Renamed/`` under their original
    file names with the same prefix added.

    Fixes: only a ``>`` that starts a line is treated as a fasta header
    (previously every ``>`` in the file was rewritten), and spreadsheet rows
    are prefixed line by line, so a file mixing single and blank-line
    separated rows no longer ends up with unprefixed rows.  Blank spreadsheet
    lines are dropped and rows are joined with a single newline, with no
    trailing newline, as before.
    """
    home_folder = args.all_output_folder + '/' + args.file_prefix + '/Renamed/'
    prefix = args.name + '_XX_'

    def prefixed_fasta(path):
        # Insert the prefix right after the '>' of each header line.
        with open(path) as handle:
            return ''.join('>' + prefix + line[1:] if line.startswith('>') else line
                           for line in handle)

    print(color.BOLD+'\nRenaming Translated (Protein) '+color.PURPLE+'ORFs\n'+color.END)
    renamed_Final_Prots = prefixed_fasta(args.input_AA)

    print(color.BOLD+'\nRenaming Nucleotide '+color.PURPLE+'ORFs\n'+color.END)
    renamed_Final_Nucs = prefixed_fasta(args.input_NTD)

    print(color.BOLD+'\nUpdating CDS Names in the Spreadsheet'+color.END)
    with open(args.input_TSV) as handle:
        rows = [row for row in handle.read().split('\n') if row != '']
    renamed_Final_tsv = '\n'.join(prefix + row for row in rows)

    with open(home_folder+prefix+args.input_AA.split('/')[-1],'w+') as w:
        w.write(renamed_Final_Prots)
    with open(home_folder+prefix+args.input_NTD.split('/')[-1],'w+') as x:
        x.write(renamed_Final_Nucs)
    with open(home_folder+prefix+args.input_TSV.split('/')[-1],'w+') as y:
        y.write(renamed_Final_tsv)
###########################################################################################
###--------------------------------- Header/Tail Lines ---------------------------------###
###########################################################################################
def header_tail():
    """Return the constant ``(header, tail)`` text that wraps the fake BLAST XML.

    ``header`` is everything up to and including the ``BlastOutput_query-ID``
    element; ``tail`` closes the iterations list and the document.
    """
    header_lines = (
        '<?xml version="1.0"?>\n',
        '<!DOCTYPE BlastOutput PUBLIC "-//NCBI//NCBI BlastOutput/EN" "http://www.ncbi.nlm.nih.gov/dtd/NCBI_BlastOutput.dtd">\n',
        '<BlastOutput>\n',
        ' <BlastOutput_program>blastp</BlastOutput_program>\n',
        ' <BlastOutput_version>BLASTP 2.2.29+</BlastOutput_version>\n',
        ' <BlastOutput_reference>Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&amp;auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), &quot;Gapped BLAST and PSI-BLAST: a new generation of protein database search programs&quot;, Nucleic Acids Res. 25:3389-3402.</BlastOutput_reference>\n',
        ' <BlastOutput_db>../OGBlastDB/renamed_aa_seqs_OrthoMCL-5_12653.fasta</BlastOutput_db>\n',
        ' <BlastOutput_query-ID>Query_1</BlastOutput_query-ID>\n',
    )
    closing = '</BlastOutput_iterations>\n</BlastOutput>'
    return ''.join(header_lines), closing
###########################################################################################
###------------------------------- TSV to XML Conversion -------------------------------###
###########################################################################################
def _fake_blast_iteration(n, fields):
    """Build the XML text of one single-hit ``<Iteration>`` for TSV row ``n`` (0-based).

    Only the very first row (``n == 0``) is preceded by the document-level
    query/parameter elements and the opening ``<BlastOutput_iterations>`` tag.
    """
    query, hit = fields[0], fields[1]
    # The last four columns are read as: start, end, e-value, score.
    start, end, evalue, score = fields[-4], fields[-3], fields[-2], fields[-1]
    # NOTE(review): abs() wraps the whole "+ 1" expression, so a reversed
    # range (end < start) reports a length two short of the true span.
    # Kept as-is so the generated files do not change.
    seq_len = str(abs(int(end) - int(start) + 1))
    aln_len = str(abs(int(end) - int(start)))
    number = str(n + 1)

    parts = []
    if n == 0:
        parts += [
            ' <BlastOutput_query-def>', query, '</BlastOutput_query-def>\n',
            ' <BlastOutput_query-len>', seq_len, '</BlastOutput_query-len>\n',
            ' <BlastOutput_param>\n',
            ' <Parameters>\n',
            ' <Parameters_matrix>BLOSUM62</Parameters_matrix>\n',
            ' <Parameters_expect>1e-10</Parameters_expect>\n',
            ' <Parameters_gap-open>11</Parameters_gap-open>\n',
            ' <Parameters_gap-extend>1</Parameters_gap-extend>\n',
            ' <Parameters_filter>F</Parameters_filter>\n',
            ' </Parameters>\n',
            ' </BlastOutput_param>\n',
            '<BlastOutput_iterations>\n',
        ]
    parts += [
        '<Iteration>\n',
        ' <Iteration_iter-num>', number, '</Iteration_iter-num>\n',
        ' <Iteration_query-ID>Query_', number, '</Iteration_query-ID>\n',
        ' <Iteration_query-def>', query, '</Iteration_query-def>\n',
        ' <Iteration_query-len>', seq_len, '</Iteration_query-len>\n',
        '<Iteration_hits>\n',
        '<Hit>\n',
        ' <Hit_num>1</Hit_num>\n',
        ' <Hit_id>Fake_Entry</Hit_id>\n',
        ' <Hit_def>', hit, '</Hit_def>\n',
        ' <Hit_accession>Fake_Accession</Hit_accession>\n',
        ' <Hit_len>', seq_len, '</Hit_len>\n',
        ' <Hit_hsps>\n',
        ' <Hsp>\n',
        ' <Hsp_num>1</Hsp_num>\n',
        ' <Hsp_bit-score>1234</Hsp_bit-score>\n',
        ' <Hsp_score>', score, '</Hsp_score>\n',
        ' <Hsp_evalue>', evalue, '</Hsp_evalue>\n',
        ' <Hsp_query-from>', start, '</Hsp_query-from>\n',
        ' <Hsp_query-to>', end, '</Hsp_query-to>\n',
        ' <Hsp_hit-from>', start, '</Hsp_hit-from>\n',
        ' <Hsp_hit-to>', end, '</Hsp_hit-to>\n',
        ' <Hsp_query-frame>0</Hsp_query-frame>\n',
        ' <Hsp_hit-frame>0</Hsp_hit-frame>\n',
        ' <Hsp_identity>', aln_len, '</Hsp_identity>\n',
        ' <Hsp_positive>', aln_len, '</Hsp_positive>\n',
        ' <Hsp_gaps>0</Hsp_gaps>\n',
        ' <Hsp_align-len>', aln_len, '</Hsp_align-len>\n',
        ' <Hsp_qseq></Hsp_qseq>\n',
        ' <Hsp_hseq></Hsp_hseq>\n',
        ' <Hsp_midline></Hsp_midline>\n',
        ' </Hsp>\n',
        ' </Hit_hsps>\n',
        '</Hit>\n',
        '\n</Iteration_hits>\n',
        ' <Iteration_stat>\n',
        ' <Statistics>\n',
        ' <Statistics_db-num>379660</Statistics_db-num>\n',
        ' <Statistics_db-len>197499634</Statistics_db-len>\n',
        ' <Statistics_hsp-len>123</Statistics_hsp-len>\n',
        ' <Statistics_eff-space>184705217500</Statistics_eff-space>\n',
        ' <Statistics_kappa>0.041</Statistics_kappa>\n',
        ' <Statistics_lambda>0.267</Statistics_lambda>\n',
        ' <Statistics_entropy>0.14</Statistics_entropy>\n',
        ' </Statistics>\n',
        ' </Iteration_stat>\n',
        '</Iteration>\n',
    ]
    return ''.join(parts)


def convert_TSV_data(args):
    """Turn the renamed TSV into a list of fake BLAST-XML iteration strings.

    The TSV is read from
    ``<all_output_folder>/<file_prefix>/Renamed/<name>_XX_<basename of input_TSV>``.
    Blank lines are ignored. Each remaining row yields one string, in file
    order; the first string additionally carries the document-level query and
    parameter elements.

    Raises:
        IndexError: a row has fewer than four tab-separated columns.
        ValueError: the start/end columns are not integers.
    """
    home_folder = args.all_output_folder + '/' + args.file_prefix + '/Renamed/'
    TSVforConvert = home_folder + args.name + '_XX_' + args.input_TSV.split('/')[-1]
    # The context manager closes the handle, which was previously left open.
    with open(TSVforConvert) as tsv:
        inTSV = [line.rstrip('\n') for line in tsv if line != '\n']
    # Split each row once instead of once per XML field.
    return [_fake_blast_iteration(n, row.split('\t')) for n, row in enumerate(inTSV)]
###########################################################################################
###--------------------------- Writes Out the Fake XML File ----------------------------###
###########################################################################################
def write_Fake_XML(args):
    """Assemble header, per-row iterations and tail into the fake BLAST XML file.

    The file is written to ``<all_output_folder>/<file_prefix>/<out_XML>``.
    """
    destination = args.all_output_folder + '/' + args.file_prefix + '/' + args.out_XML

    # NOTE(review): this reads args.input_file while the other steps use
    # args.input_TSV; the argument parser is not visible here, so confirm the
    # attribute exists.
    shown_name = args.input_file.split('/')[-1]
    print (color.BOLD+'\n\nConverting '+color.ORANGE+shown_name+color.END\
    +color.BOLD+' to XML format\n'+color.END)

    opening, closing = header_tail()
    body = ''.join(convert_TSV_data(args))

    with open(destination, 'w+') as xml_out:
        for chunk in (opening, body, closing):
            xml_out.write(chunk)
##########################################################################################
###-------------------- Cleans up the Folder and Moves Final Files --------------------###
##########################################################################################
def clean_up(args):
    """Copy the final files to the ReadyToGo folders and archive the taxon folder.

    Runs, in order: copies of the XML/TSV/AA/NTD outputs into the ReadyToGo
    folders, copies of the renamed files up into the taxon folder, removal of
    the taxon's files from ``ToRename``, and a move of the taxon folder into
    ``Finished`` (created if missing). All steps are shell commands; their
    exit statuses are not inspected.
    """
    taxon_dir = args.all_output_folder + '/' + args.file_prefix
    home_folder = taxon_dir + '/Renamed/'
    finished_dir = args.all_output_folder + '/Finished'

    # (source pattern, destination) pairs, executed top to bottom.
    copy_jobs = (
        (taxon_dir + '/' + args.out_XML, args.r2g_xml),
        (home_folder + '*tsv', args.r2g_tsv),
        (home_folder + '*_XX_*AA.ORF.fasta', args.r2g_aa),
        (home_folder + '*_XX_*NTD.ORF.fasta', args.r2g_ntd),
        (home_folder + '*_XX_*tsv', taxon_dir),
        (home_folder + '*_XX_*AA.ORF.fasta', taxon_dir),
        (home_folder + '*_XX_*NTD.ORF.fasta', taxon_dir),
    )
    for source, destination in copy_jobs:
        os.system('cp ' + source + ' ' + destination)

    os.system('rm ' + args.all_output_folder + '/ToRename/*' + args.file_prefix + '*')

    if not os.path.isdir(finished_dir + '/'):
        os.system('mkdir ' + finished_dir)
    os.system('mv ' + taxon_dir + ' ' + finished_dir)
###########################################################################################
###-------------------------------- Next Script Message --------------------------------###
###########################################################################################
def next_script(args):
    """Print the closing message naming the finished files and where they are."""
    # Both labels are carved out of the XML file name.
    taxon_label = args.out_XML.split('_XX')[0]
    folder_label = args.out_XML.split('_XX_')[-1].split('_Filtered')[0]

    message = ''.join([
        color.BOLD, '\nThere is no next script! The final ',
        color.ORANGE, taxon_label, color.END,
        color.BOLD, ' files can be\nfound in the ',
        color.RED, folder_label, color.END,
        color.BOLD, ' and ',
        color.RED, 'ReadyToGo folders', color.END,
        color.BOLD, ' and are ready\n'
        'for the KatzLab Phylogenomic Tree-Building Steps!\n\n',
        color.END,
    ])
    print (message)
##########################################################################################
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
##########################################################################################
def main():
    """Parse the arguments, then run every pipeline step in its fixed order."""
    args = check_args()
    steps = (
        check_code,
        prep_folders,
        rename_paralogs,
        write_Fake_XML,
        clean_up,
        next_script,
    )
    for step in steps:
        step(args)


main()

View File

@ -0,0 +1,269 @@
import os, sys
import argparse
from Bio import SeqIO
import CUB
from statistics import mean
from math import ceil, floor
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
def get_args():
    """Define the command-line interface of the stat-summary script and parse it.

    Returns the ``argparse.Namespace`` with ``input`` (required), ``databases``
    (default ``'../Databases'``) and the boolean flag ``r2g_jf``.
    """
    cli = argparse.ArgumentParser(
        prog = 'PTL6p1 Script 8: Stat Summary',
        description = "Updated March 31th, 2023 by Auden Cote-L'Heureux"
    )

    # (flags, keyword settings) in the order they appear in --help.
    option_table = (
        (('-i', '--input'), {
            'type': str,
            'required': True,
            'help': 'Input path to the "Output" folder produced by PhyloToL Part 1. This folder should contain both the "ReadyToGO" and "Intermediate" folders.',
        }),
        (('-d', '--databases'), {
            'type': str,
            'default': '../Databases',
            'help': 'Path to databases folder',
        }),
        (('-r', '--r2g_jf'), {
            'action': 'store_true',
            'help': 'Create ReadyToGo files filtered to only include sequences between the 25th and 75th percentile of silent-site GC content. Please be aware that these are not necessarily the correct or non-contaminant sequences; examine the GC3xENc plots carefully before using these data.',
        }),
    )
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)

    return cli.parse_args()
def hook_lens(args):
    """Return the mean sequence length of every OG found in the Hook database.

    Scans ``<args.databases>/db_OG`` for ``.fasta`` files that have a
    matching ``.dmnd`` file next to them and groups sequence lengths by the
    last ten characters of each record id.
    """
    print('\nGetting average OG lengths in the Hook DB...')

    og_dir = args.databases + '/db_OG'
    lengths_by_og = { }
    for fname in os.listdir(og_dir):
        if not fname.endswith('.fasta'):
            continue
        # Only databases with a companion .dmnd file are counted.
        if not os.path.isfile(og_dir + '/' + fname.replace('.fasta', '.dmnd')):
            continue
        for record in tqdm(SeqIO.parse(og_dir + '/' + fname, 'fasta')):
            lengths_by_og.setdefault(record.id[-10:], []).append(len(str(record.seq)))

    return { og : mean(lengths) for og, lengths in lengths_by_og.items() }
def aa_comp_lengths(args, gcodes):
    """Collect amino-acid composition, CDS lengths and the originating transcripts.

    Args:
        args: namespace whose ``input`` is the pipeline "Output" folder.
        gcodes: mapping of ten-character taxon codes to genetic codes; files
            whose first ten characters are not a key are ignored.

    Returns:
        ``(aa_comp, transcripts, r2g_lengths, transcript_id_corr)`` where
        ``aa_comp[id]`` holds the FYMINK/GARP/Other fractions, ``transcripts``
        maps transcript id -> ``(taxon, sequence)``, ``r2g_lengths[id]`` is
        the protein length times three, and ``transcript_id_corr`` maps a
        ReadyToGo sequence id to the id of the transcript it came from.

    The contig-number lookup is scoped per taxon and keeps every sequence of
    a contig. It used to be one dict keyed by the bare number, so a later
    taxon (or a second ORF on the same contig) overwrote earlier entries.
    """
    print('\nGetting amino acid composition data from ReadyToGo files...')

    r2g_lengths = { }; aa_comp = { }
    recids_by_contig = { }  # (taxon, contig number) -> [ReadyToGo ids]

    aa_dir = args.input + '/ReadyToGo/ReadyToGo_AA'
    for file in tqdm(os.listdir(aa_dir)):
        if not (file.endswith('.fasta') and file[:10] in gcodes):
            continue
        taxon = file[:10]
        for rec in SeqIO.parse(aa_dir + '/' + file, 'fasta'):
            seq = str(rec.seq)
            total = len(seq)
            r2g_lengths[rec.id] = total * 3

            fymink = sum(seq.count(aa) for aa in 'FYMINK')
            garp = sum(seq.count(aa) for aa in 'GARP')
            other = total - fymink - garp
            if total:
                aa_comp[rec.id] = { 'FYMINK' : fymink/total, 'GARP' : garp/total, 'Other' : other/total }
            else:
                # An empty record has no composition; avoid dividing by zero.
                aa_comp[rec.id] = { 'FYMINK' : 0.0, 'GARP' : 0.0, 'Other' : 0.0 }

            contig_n = rec.id.split('Contig_')[-1].split('_')[0]
            recids_by_contig.setdefault((taxon, contig_n), []).append(rec.id)

    print('\nGetting transcript sequence data from original assembled transcript files...')

    # NOTE(review): `transcripts` is still keyed by the bare record id, so
    # identical ids in two taxa would overwrite each other. Callers index it
    # by that id, so it is left unchanged here.
    transcripts = { }; transcript_id_corr = { }
    tt_dir = args.input + '/Intermediate/TranslatedTranscriptomes'
    for tax in tqdm(os.listdir(tt_dir)):
        orig_dir = tt_dir + '/' + tax + '/OriginalFasta'
        if not os.path.isdir(orig_dir):
            continue
        for file in os.listdir(orig_dir):
            if not (file.endswith('Original.fasta') and file[:10] in gcodes):
                continue
            taxon = file[:10]
            for rec in SeqIO.parse(orig_dir + '/' + file, 'fasta'):
                transcripts[rec.id] = (taxon, str(rec.seq))
                node_n = rec.id.split('NODE_')[-1].split('_')[0]
                for r2g_id in recids_by_contig.get((taxon, node_n), ()):
                    transcript_id_corr[r2g_id] = rec.id

    return aa_comp, transcripts, r2g_lengths, transcript_id_corr
def get_nuc_comp(args, gcodes):
    """Gather per-sequence nucleotide statistics for every ReadyToGo NTD file.

    Each ``.fasta`` file whose first ten characters are a key of ``gcodes``
    is passed to ``CUB.CalcRefFasta`` with that taxon's genetic code; the
    first element of the result is merged into one dict keyed by sequence id.
    """
    print('\nGetting nucleotide composition data from ReadyToGo files...')

    ntd_dir = args.input + '/ReadyToGo/ReadyToGo_NTD'
    nuc_comp = { }
    for fname in tqdm(os.listdir(ntd_dir)):
        taxon = fname[:10]
        if fname.endswith('.fasta') and taxon in gcodes:
            per_sequence = CUB.CalcRefFasta(ntd_dir + '/' + fname, gcodes[taxon])[0]
            for seq_id in per_sequence:
                nuc_comp[seq_id] = per_sequence[seq_id]

    return nuc_comp
def per_seq(args, nuc_comp, aa_comp, all_transcripts, r2g_lengths, transcript_id_corr):
    """Write one CSV per taxon with a row of statistics for every sequence.

    Files go to ``<args.input>/PerSequenceStatSummaries/<taxon>.csv``; the
    taxon is the first ten characters of a sequence id and the OG the last
    ten. Sequences without a known transcript get ``NA`` in both transcript
    columns.
    """
    og_mean_lens = hook_lens(args)

    out_dir = args.input + '/PerSequenceStatSummaries'
    if not os.path.isdir(out_dir):
        os.mkdir(out_dir)

    column_line = 'Sequence,Taxon,OG,Transcript,TranscriptLength,CDSLength,AvgLengthOGinHook,AmbiguousCodons,GC-Overall,GC1,GC2,GC3,GC3-Degen,ExpWrightENc,ObsWrightENc_6Fold,ObsWrightENc_No6Fold,ObsWeightedENc_6Fold,ObsWeightedENc_No6Fold,FYMINK,GARP,OtherAA\n'

    taxa = list(dict.fromkeys([seq_id[:10] for seq_id in nuc_comp]))
    for taxon in taxa:
        with open(out_dir + '/' + taxon + '.csv', 'w') as out:
            out.write(column_line)
            for seq_id in nuc_comp:
                if seq_id[:10] != taxon:
                    continue

                out.write(seq_id + ',' + seq_id[:10] + ',' + seq_id[-10:])

                # Transcript id and length, or NA/NA when either lookup fails.
                try:
                    transcript_id = transcript_id_corr[seq_id]
                    transcript_cols = ',' + transcript_id + ',' + str(len(all_transcripts[transcript_id][1]))
                except KeyError:
                    transcript_cols = ',NA,NA'
                out.write(transcript_cols)

                out.write(',' + str(r2g_lengths[seq_id]) + ',' + str(og_mean_lens[seq_id[-10:]]))

                stats = nuc_comp[seq_id]
                gc_cols = [str(stats.gcOverall), str(stats.gc1), str(stats.gc2), str(stats.gc3), str(stats.gc4F)]
                enc_cols = [str(stats.expENc), str(stats.obsENc_6F), str(stats.obsENc_No6F), str(stats.SunENc_6F), str(stats.SunENc_No6F)]
                out.write(',' + ','.join([str(stats.amb_cdn)] + gc_cols + enc_cols))

                fractions = aa_comp[seq_id]
                out.write(',' + str(fractions['FYMINK']) + ',' + str(fractions['GARP']) + ',' + str(fractions['Other']) + '\n')
def _median_and_iqr(sorted_vals):
    """Return ``(median, IQR)`` as strings for pre-sorted values, ``('NA', 'NA')`` if empty.

    Quantiles are picked by index (``floor(n * q)``), without interpolation.
    """
    n = len(sorted_vals)
    if n == 0:
        return 'NA', 'NA'
    median = sorted_vals[floor(n * 0.5)]
    iqr = sorted_vals[floor(n * 0.75)] - sorted_vals[floor(n * 0.25)]
    return str(median), str(iqr)


def per_tax(args, nuc_comp, aa_comp, all_transcripts, r2g_lengths, gcodes):
    """Write ``<args.input>/PerTaxonSummary.csv`` with one summary row per taxon.

    Args:
        args: namespace whose ``input`` is the output folder.
        nuc_comp: sequence id -> statistics object (``gc4F`` and ``obsENc_6F``
            are read); the first ten characters of the id are the taxon and
            the last ten the OG.
        aa_comp: unused; kept so the signature stays stable.
        all_transcripts: transcript id -> ``(taxon, sequence)``.
        r2g_lengths: sequence id -> CDS length.
        gcodes: taxon -> genetic code name, written in the last column.

    A taxon with no original transcripts (or no length entries) gets ``NA``
    in the affected median/IQR columns. Indexing the empty list used to
    raise IndexError and abort the whole summary.
    """
    taxa = list(dict.fromkeys([seq[:10] for seq in nuc_comp]))

    with open(args.input + '/PerTaxonSummary.csv', 'w') as o:
        # NOTE: the 'IRQ_LenTranscripts' spelling is kept because the column
        # name is part of the file format.
        o.write('Taxon,TranscriptsInput,Median_GCTranscripts,IQR_GCTranscripts,Median_LenTranscripts,IRQ_LenTranscripts,SeqsR2G,OGsR2G,Median_GC3R2G,IQR_GC3R2G,Median_ENcR2G,IQR_ENcR2G,Median_LenR2G,IQR_LenR2G,GeneticCode\n')
        for taxon in taxa:
            transcripts = [all_transcripts[seq][1].upper() for seq in all_transcripts if all_transcripts[seq][0] == taxon]
            transcript_gcs = sorted((t.count('G') + t.count('C'))/len(t) for t in transcripts)
            transcript_lens = sorted(len(t) for t in transcripts)

            r2g_ntds = [nuc_comp[seq] for seq in nuc_comp if seq[:10] == taxon]
            r2g_ogs = list(dict.fromkeys([seq[-10:] for seq in nuc_comp if seq[:10] == taxon]))
            r2g_gc3s = sorted(seq.gc4F for seq in r2g_ntds)
            r2g_encs = sorted(seq.obsENc_6F for seq in r2g_ntds)
            tax_r2g_lens = sorted(r2g_lengths[seq] for seq in r2g_lengths if seq[:10] == taxon)

            row = [
                taxon,
                str(len(transcripts)),
                *_median_and_iqr(transcript_gcs),
                *_median_and_iqr(transcript_lens),
                str(len(r2g_ntds)),
                str(len(r2g_ogs)),
                *_median_and_iqr(r2g_gc3s),
                *_median_and_iqr(r2g_encs),
                *_median_and_iqr(tax_r2g_lens),
                gcodes[taxon],
            ]
            o.write(','.join(row) + '\n')
def r2g_jf(args, nuc_comp, gcodes):
    """Write ReadyToGo copies restricted to the middle half of the GC3 distribution.

    For every NTD ``.fasta`` file of a taxon listed in ``gcodes``, sequences
    whose ``gc4F`` lies strictly between the values at the 25% and 75%
    positions of that taxon's sorted ``gc4F`` list are copied to
    ``ReadyToGo_NTD_JF``; the matching AA file is filtered the same way into
    ``ReadyToGo_AA_JF``.
    """
    # Open question from the author: should a maximum IQR cutoff exist above
    # which no filtered file is produced?
    r2g_root = args.input + '/ReadyToGo'
    for folder in ('/ReadyToGo_NTD_JF', '/ReadyToGo_AA_JF'):
        if not os.path.isdir(r2g_root + folder):
            os.mkdir(r2g_root + folder)

    def copy_middle_half(source, destination, sorted_gc3):
        # Index positions of the lower and upper cut in the sorted list.
        low_at = floor(len(sorted_gc3)*0.25)
        high_at = floor(len(sorted_gc3)*0.75)
        with open(destination, 'w') as out:
            for record in SeqIO.parse(source, 'fasta'):
                gc3 = nuc_comp[record.id].gc4F
                if gc3 > sorted_gc3[low_at] and gc3 < sorted_gc3[high_at]:
                    out.write('>' + record.id + '\n' + str(record.seq) + '\n\n')

    for file in os.listdir(r2g_root + '/ReadyToGo_NTD'):
        if not (file.endswith('.fasta') and file[:10] in gcodes):
            continue
        taxon = file[:10]
        taxon_gc3 = sorted([nuc_comp[seq].gc4F for seq in nuc_comp if seq[:10] == taxon])

        jf_name = file.replace('.fasta', '.JF.fasta')
        copy_middle_half(
            r2g_root + '/ReadyToGo_NTD/' + file,
            r2g_root + '/ReadyToGo_NTD_JF/' + jf_name,
            taxon_gc3,
        )
        copy_middle_half(
            r2g_root + '/ReadyToGo_AA/' + file.replace('NTD', 'AA'),
            r2g_root + '/ReadyToGo_AA_JF/' + jf_name.replace('NTD', 'AA'),
            taxon_gc3,
        )
def plot_jf(args, nuc_comp):
    """Save one GC3-vs-ENc scatter plot per taxon under ``<args.input>/GC3xENc_Plots``.

    Every sequence of a taxon (first ten characters of its id) is drawn as a
    point at (``gc4F``, ``obsENc_6F``) on top of a black null-expectation
    curve. Each figure is closed after saving; pyplot otherwise keeps every
    figure alive, so memory grew with the number of taxa.
    """
    if not os.path.isdir(args.input + '/GC3xENc_Plots'):
        os.mkdir(args.input + '/GC3xENc_Plots')

    taxa = list(dict.fromkeys([rec[:10] for rec in nuc_comp]))

    # Null curve: x runs over 0..100; y is symmetric around x = 50, so only
    # the rising half is spelled out and then mirrored.
    gc3_null = list(range(101))
    enc_rising = [
        31, 31.5958, 32.2032, 32.8221, 33.4525, 34.0942, 34.7471, 35.411,
        36.0856, 36.7707, 37.4659, 38.1707, 38.8847, 39.6074, 40.3381,
        41.0762, 41.8208, 42.5712, 43.3264, 44.0854, 44.8471, 45.6102,
        46.3735, 47.1355, 47.8949, 48.65, 49.3991, 50.1406, 50.8725, 51.593,
        52.3, 52.9916, 53.6656, 54.32, 54.9525, 55.561, 56.1434, 56.6975,
        57.2211, 57.7124, 58.1692, 58.5898, 58.9723, 59.3151, 59.6167,
        59.8757, 60.0912, 60.2619, 60.3873, 60.4668, 60.5,
    ]
    enc_null = enc_rising + enc_rising[-2::-1]

    for taxon in taxa:
        comp_data = [(nuc_comp[rec].gc4F, nuc_comp[rec].obsENc_6F) for rec in nuc_comp if rec[:10] == taxon]

        fig = plt.figure()
        plt.plot(np.array(gc3_null), np.array(enc_null), color = 'black', linewidth=2)
        plt.scatter(np.array([val[0] for val in comp_data]), np.array([val[1] for val in comp_data]), s = 1)
        plt.xlabel("GC content (3rd pos, 4-fold sites)")
        plt.ylabel("Observed Wright ENc (6 Fold)")
        plt.savefig(args.input + '/GC3xENc_Plots/' + taxon + '.png')
        # Release the figure now that it is on disk.
        plt.close(fig)
def load_gcodes(path, valid_codes):
    """Read taxon -> genetic-code assignments from a ``gcode_output.tsv`` file.

    A row is accepted when it has exactly five tab-separated columns and the
    fifth, stripped and lower-cased, is in ``valid_codes``; the code is
    stored stripped but with its original case. Rows with a non-empty but
    unacceptable fifth column are reported on stdout and skipped. Rows with
    fewer than five columns (blank or truncated lines) are skipped silently;
    they used to raise IndexError.
    """
    gcodes = { }
    with open(path) as handle:
        for line in handle:
            cols = line.split('\t')
            if len(cols) < 5:
                continue
            code = cols[4].strip()
            if len(cols) == 5 and code.lower() in valid_codes:
                gcodes[cols[0]] = code
            elif code.lower() != '':
                print('\nInvalid genetic code assignment for taxon ' + cols[0] + '. Skipping this taxon in script 8 (summary statistics)\n')
    return gcodes


if __name__ == "__main__":
    args = get_args()

    valid_codes = ['universal', 'blepharisma', 'chilodonella', 'condylostoma', 'euplotes', 'peritrich', 'vorticella', 'mesodinium', 'tag', 'tga', 'taa', 'none']

    gcodes = { }
    gcode_file = args.input + '/Intermediate/gcode_output.tsv'
    if os.path.isfile(gcode_file):
        gcodes = load_gcodes(gcode_file, valid_codes)
    else:
        print('\nGenetic code assignment file (Output/Intermediate/gcode_output.tsv) not found. Quitting script 8 (summary statistics).\n')
        # sys.exit() is always available; the bare exit() helper comes from `site`.
        sys.exit()

    aa_comp, transcripts, r2g_lengths, transcript_id_corr = aa_comp_lengths(args, gcodes)
    nuc_comp = get_nuc_comp(args, gcodes)

    per_tax(args, nuc_comp, aa_comp, transcripts, r2g_lengths, gcodes)
    per_seq(args, nuc_comp, aa_comp, transcripts, r2g_lengths, transcript_id_corr)

    if args.r2g_jf:
        r2g_jf(args, nuc_comp, gcodes)

    plot_jf(args, nuc_comp)

View File

@ -0,0 +1,523 @@
#!/usr/bin/env python3
# coding=utf-8
'''Aim of this script is to generate lots of codon usage statistics to aid in
identifying useful characteristics for de novo ORF calling'''
# Author: Xyrus Maurer-Alcalá
# Contact: maurerax@gmail.com or xyrus.maurer-alcala@izb.unibe.ch
# Last Modified: 2020-09-17
# usage: python CUB.py
# Dependencies:
# Python3, numpy, BioPython
import os
import re
import sys
#import matplotlib.pyplot as plt
import numpy as np
#import seaborn as sns
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqUtils import GC
class CalcCUB:
    """
    Effective Number of Codons (ENc) and related codon-usage calculations,
    following the equations originally from Wright 1990.

    Every function is a static method, so it can be called either on the
    class (``CalcCUB.expWrightENc(x)``) or on an instance.

    Codon tables passed in map codon -> ``[amino acid, degeneracy class, count]``.
    """

    @staticmethod
    def expWrightENc(gc3):
        """Return the expected ENc for a GC3 value, rounded to 4 decimals.

        Values above 1 are taken to be percentages and divided by 100.
        """
        if gc3 > 1:
            # Calculations expect a fraction between 0 and 1.
            gc3 = gc3/100
        exp_enc = 2+gc3+(29/((gc3**2)+(1-gc3)**2))
        return round(exp_enc, 4)

    @staticmethod
    def nullENcGC3():
        """Return 101 ``'<GC3 percent>\\t<expected ENc>'`` strings for GC3 = 0..100.

        The curve is computed for 0-50% and mirrored for 51-100%.
        """
        # Integer steps avoid depending on how a float range rounds its end
        # point; the grid always has exactly 51 points.
        null = [CalcCUB.expWrightENc(n/100) for n in range(51)]
        null += null[:-1][::-1]
        return [str(i)+'\t'+str(j) for i, j in zip(range(0, 101), null)]

    @staticmethod
    def calcWrightENc(cdnTable):
        """Return the observed ENc (Wright 1990) for a codon-count table, capped at 61."""

        def faCalcWright(aa_counts):
            # Codon homozygosity (fa) of one amino acid.
            counts = [i[2] for i in aa_counts]
            # n_aa --> number of codons observed for this amino acid
            n_aa = sum(counts)
            try:
                fa = (((n_aa*sum([(i/float(n_aa))**2 for i in counts]))-1)/(n_aa-1))
            except ZeroDivisionError:
                # Zero or one observation: homozygosity is undefined; 0 marks
                # it as "ignore" for the class average below.
                fa = 0
            return fa

        def ENcWright_by_Degen(fa_data):
            # Averages the non-zero homozygosities within each degeneracy
            # class and adds (amino acids in class) / (average) per class.
            # Non-degenerate codons (e.g. ATG == M) supply the base of 2.
            enc = 2
            for k, v in fa_data.items():
                non_zero = [i for i in v if i != 0]
                try:
                    f_aa = sum(non_zero)/len(non_zero)
                except ZeroDivisionError:
                    # No usable amino acid in this class.
                    f_aa = 1
                enc += k/f_aa
            return enc

        # Amino acids grouped by degeneracy class name (e.g. 'two', 'six').
        degen_cdns = {}
        for k, v in cdnTable.items():
            if v[1] not in degen_cdns.keys():
                degen_cdns[v[1]] = [v[0]]
            else:
                if v[0] not in degen_cdns[v[1]]:
                    degen_cdns[v[1]] += [v[0]]

        # Homozygosity lists keyed by the NUMBER of amino acids in a class.
        # Class names containing 'one' ('one' and also 'none') are skipped.
        # NOTE(review): two classes with the same number of amino acids would
        # share one key here; verify for any newly added codon table.
        fa_cdns = {len(v):[] for k, v in degen_cdns.items() if 'one' not in k}
        for k, v in degen_cdns.items():
            if 'one' in k:
                continue
            for aa in v:
                aa_counts = [cdnTable[k] for k in cdnTable.keys() if cdnTable[k][0] == aa]
                fa_cdns[len(v)] += [faCalcWright(aa_counts)]

        enc_val = min(61, round(ENcWright_by_Degen(fa_cdns),4))
        return enc_val

    @staticmethod
    def SunEq5(cdnTable):
        """Return the pseudocount-weighted ENc for a codon-count table, rounded to 4 decimals.

        Every codon count is incremented by one before use; classes named
        'none' are ignored.
        """

        def calcFcf(aa_counts):
            # Returns (homozygosity with pseudocounts, total pseudocounts).
            counts = [i[2] for i in aa_counts]
            pseudocounts = [i+1 for i in counts]
            na = sum(pseudocounts)
            fcf = sum([(i/float(na))**2 for i in pseudocounts]), sum(pseudocounts)
            return fcf

        ENcWeightedPsuedo = 0
        degen_cdns = {}
        for k, v in cdnTable.items():
            if v[1] == 'none':
                continue
            if v[1] not in degen_cdns.keys():
                degen_cdns[v[1]] = [v[0]]
            else:
                if v[0] not in degen_cdns[v[1]]:
                    degen_cdns[v[1]] += [v[0]]

        for k, v in degen_cdns.items():
            fcf_nc = []
            for aa in v:
                aa_counts = [cdnTable[k] for k in cdnTable.keys() if cdnTable[k][0] == aa]
                fcf_nc.append(calcFcf(aa_counts))
            # Class contribution: amino acids in the class divided by their
            # count-weighted mean homozygosity.
            weightedENc = (len(fcf_nc) /
                (sum([i[0]*i[1] for i in fcf_nc]) /
                sum([i[1] for i in fcf_nc])))
            ENcWeightedPsuedo += weightedENc
        return round(ENcWeightedPsuedo,4)

    @staticmethod
    def calcRCSU(cdnTbl):
        """Return ``{codon: [amino acid, RSCU as a string]}``.

        Only codons whose amino-acid label is purely alphabetic are included.
        An amino acid with no observations gets ``'0.0'`` for each of its codons.
        """
        rscu = {k:[v[0]] for k, v in cdnTbl.items() if v[0].isalpha()}
        for k, v in rscu.items():
            try:
                aa_info = [(key, val[-1]) for key, val in cdnTbl.items() if val[0] == v[0]]
                aa_cnts = [x[1] for x in aa_info]
                cdn_rscu = (cdnTbl[k][-1]*len(aa_cnts))/sum(aa_cnts)
                rscu[k] += [str(round(cdn_rscu,4))]
            except ZeroDivisionError:
                rscu[k] += ['0.0']
        return rscu
class GenUtil(object):
"""
"Overflow" of functions for now. Just a precaution to make the code a
little cleaner/easier to manage.
This class inclues means to normalize/check the user-provided genetic code,
which if not valid will default to the "universal" genetic code.
Similarly, This class will return the appropriate
codon count table and provides a function to update its values.
"""
def convertGenCode(gCode):
# Will interpret the user provided genetic code (gcode) and checks that
# it is currently available for use with the NCBI/biopython
# supported translation tables. Default is universal.
# Dictionary of the possible/functional genetic codes that are supported.
# --- Chilodonella and condylostoma are to come!
transTable = {'universal':1, 'blepharisma':4,
'ciliate':6, 'euplotes':10, 'mesodinium':29, 'myrionecta':29, 'peritrich':30,
'1':1, '4':4, '6':6, '10':10, '29':29, '30':30, 'chilo':'chilo'}
if str(gCode).lower() not in transTable:
print("\nWarning: Provided genetic code is not supported (yet).\n")
print("Currently running using the UNIVERSAL genetic code.\n\n")
print("Alternative genetic codes are as follows (Note: numbers "\
"correspond to NCBI genetic code tables):\n")
print('\n'.join(list(transTable.keys()))+'\n')
return 'Universal',1
else:
return gCode,transTable[str(gCode).lower()]
def getCDNtable(gCode):
# Returns the appropriate codon table to be used for the ENc calculations.
# Universal codon table, with 6-fold degenerate codons split
# into four-fold and two-fold groups.
universal_no6fold = {
'GCT': ['A', 'four', 0], 'GCC': ['A', 'four', 0], 'GCA': ['A', 'four', 0],
'GCG': ['A', 'four', 0], 'CGT': ['R', 'four', 0], 'CGC': ['R', 'four', 0],
'CGG': ['R', 'four', 0], 'CGA': ['R', 'four', 0], 'AGA': ['R_', 'two', 0],
'AGG': ['R_', 'two', 0], 'AAT': ['N', 'two', 0], 'AAC': ['N', 'two', 0],
'GAT': ['D', 'two', 0], 'GAC': ['D', 'two', 0], 'TGT': ['C', 'two', 0],
'TGC': ['C', 'two', 0], 'CAA': ['Q', 'two', 0], 'CAG': ['Q', 'two', 0],
'GAA': ['E', 'two', 0], 'GAG': ['E', 'two', 0], 'GGT': ['G', 'four', 0],
'GGC': ['G', 'four', 0], 'GGA': ['G', 'four', 0], 'GGG': ['G', 'four', 0],
'CAT': ['H', 'two', 0], 'CAC': ['H', 'two', 0], 'ATT': ['I', 'three', 0],
'ATC': ['I', 'three', 0], 'ATA': ['I', 'three', 0], 'ATG': ['M', 'one', 0],
'TTA': ['L_', 'two', 0], 'TTG': ['L_', 'two', 0], 'CTT': ['L', 'four', 0],
'CTC': ['L', 'four', 0], 'CTA': ['L', 'four', 0], 'CTG': ['L', 'four', 0],
'AAA': ['K', 'two', 0], 'AAG': ['K', 'two', 0], 'TTT': ['F', 'two', 0],
'TTC': ['F', 'two', 0], 'CCT': ['P', 'four', 0], 'CCC': ['P', 'four', 0],
'CCA': ['P', 'four', 0], 'CCG': ['P', 'four', 0], 'TCT': ['S', 'four', 0],
'TCC': ['S', 'four', 0], 'TCA': ['S', 'four', 0], 'TCG': ['S', 'four', 0],
'AGT': ['S_', 'two', 0], 'AGC': ['S_', 'two', 0], 'ACT': ['T', 'four', 0],
'ACC': ['T', 'four', 0], 'ACA': ['T', 'four', 0], 'ACG': ['T', 'four', 0],
'TGG': ['W', 'one', 0], 'TAT': ['Y', 'two', 0], 'TAC': ['Y', 'two', 0],
'GTT': ['V', 'four', 0], 'GTC': ['V', 'four', 0], 'GTA': ['V', 'four', 0],
'GTG': ['V', 'four', 0], 'TAA': ['*', 'none', 0], 'TGA': ['*', 'none', 0],
'TAG': ['*', 'none', 0], 'XXX': ['_missing', 'none', 0]}
# Universal codon table, with 6-fold degenerate codons kept
# whole, no splitting! Traditional Universal codon table.
universal_6fold = {
'GCT': ['A', 'four', 0], 'GCC': ['A', 'four', 0], 'GCA': ['A', 'four', 0],
'GCG': ['A', 'four', 0], 'CGT': ['R', 'six', 0], 'CGC': ['R', 'six', 0],
'CGG': ['R', 'six', 0], 'CGA': ['R', 'six', 0], 'AGA': ['R', 'six', 0],
'AGG': ['R', 'six', 0], 'AAT': ['N', 'two', 0], 'AAC': ['N', 'two', 0],
'GAT': ['D', 'two', 0], 'GAC': ['D', 'two', 0], 'TGT': ['C', 'two', 0],
'TGC': ['C', 'two', 0], 'CAA': ['Q', 'two', 0], 'CAG': ['Q', 'two', 0],
'GAA': ['E', 'two', 0], 'GAG': ['E', 'two', 0], 'GGT': ['G', 'four', 0],
'GGC': ['G', 'four', 0], 'GGA': ['G', 'four', 0], 'GGG': ['G', 'four', 0],
'CAT': ['H', 'two', 0], 'CAC': ['H', 'two', 0], 'ATT': ['I', 'three', 0],
'ATC': ['I', 'three', 0], 'ATA': ['I', 'three', 0], 'ATG': ['M', 'one', 0],
'TTA': ['L', 'six', 0], 'TTG': ['L', 'six', 0], 'CTT': ['L', 'six', 0],
'CTC': ['L', 'six', 0], 'CTA': ['L', 'six', 0], 'CTG': ['L', 'six', 0],
'AAA': ['K', 'two', 0], 'AAG': ['K', 'two', 0], 'TTT': ['F', 'two', 0],
'TTC': ['F', 'two', 0], 'CCT': ['P', 'four', 0], 'CCC': ['P', 'four', 0],
'CCA': ['P', 'four', 0], 'CCG': ['P', 'four', 0], 'TCT': ['S', 'six', 0],
'TCC': ['S', 'six', 0], 'TCA': ['S', 'six', 0], 'TCG': ['S', 'six', 0],
'AGT': ['S', 'six', 0], 'AGC': ['S', 'six', 0], 'ACT': ['T', 'four', 0],
'ACC': ['T', 'four', 0], 'ACA': ['T', 'four', 0], 'ACG': ['T', 'four', 0],
'TGG': ['W', 'one', 0], 'TAT': ['Y', 'two', 0], 'TAC': ['Y', 'two', 0],
'GTT': ['V', 'four', 0], 'GTC': ['V', 'four', 0], 'GTA': ['V', 'four', 0],
'GTG': ['V', 'four', 0], 'TAA': ['*', 'none', 0], 'TGA': ['*', 'none', 0],
'TAG': ['*', 'none', 0], 'XXX': ['_missing', 'none', 0]}
# Blepharisma (table 4) genetic code codon table, with 6-fold degenerate
# codons kept whole, no splitting!
blepharisma_6fold = {**universal_6fold,
'TGA': ['W', 'two', 0], 'TGG': ['W', 'two', 0],
'TAA': ['*', 'two', 0], 'TAG': ['*', 'two', 0]}
# Blepharisma (table 4) genetic code codon table, with 6-fold degenerate
# codons split into four-fold and two-fold groups.
blepharisma_no6fold = {**universal_no6fold,
'TGA': ['W', 'two', 0], 'TGG': ['W', 'two', 0],
'TAA': ['*', 'two', 0], 'TAG': ['*', 'two', 0]}
# Chilodonella genetic code codon table, with 6-fold degenerate
# codons kept whole, no splitting!
chilo_6fold = {**universal_6fold,
'CAA': ['Q', 'four', 0], 'CAG': ['Q', 'four', 0],
'TAA': ['*', 'one', 0], 'TAG': ['Q', 'four', 0],
'TGA': ['Q', 'four', 0]}
# Chilodonella genetic code codon table, with 6-fold degenerate
# codons split into four-fold and two-fold groups.
# Note that this also splits four-fold degenerate codons that OUGHT to
# be in "different" functional categories (e.g. CAG =/= TAG)
chilo_no6fold = {**universal_no6fold,
'TAA': ['*', 'one', 0], 'TAG': ['Q_', 'one', 0],
'TGA': ['Q_', 'one', 0]}
# Ciliate (table 6) genetic code codon table, with 6-fold degenerate
# codons kept whole, no splitting! Traditional ciliate codon table.
ciliate_6fold = {**universal_6fold,
'CAA': ['Q', 'four', 0], 'CAG': ['Q', 'four', 0],
'TAA': ['Q', 'four', 0], 'TAG': ['Q', 'four', 0],
'TGA': ['*', 'one', 0]}
# Ciliate (table 6) genetic code codon table, with 6-fold degenerate
# codons split into four-fold and two-fold groups.
# Note that this also splits four-fold degenerate codons that OUGHT to
# be in "different" functional categories (e.g. CAA =/= TAA)
ciliate_no6fold = {**universal_no6fold,
'TAA': ['Q_', 'two', 0], 'TAG': ['Q_', 'two', 0],
'TGA': ['*', 'one', 0]}
# Euplotes codon table, with 6-fold degenerate codons kept
# whole, no splitting! Traditional Universal codon table.
euplotes_6fold = {**universal_6fold,
'TGA': ['C', 'three', 0], 'TGT': ['C', 'three', 0],
'TGC': ['C', 'three', 0], 'TAA': ['*', 'two', 0],
'TAG': ['*', 'two',0]}
# Euplotes genetic code codon table, with 6-fold degenerate codons
# split into four-fold and two-fold groups.
euplotes_no6fold = {**universal_no6fold,
'TGA': ['C', 'three', 0], 'TGT': ['C', 'three', 0],
'TGC': ['C', 'three', 0], 'TAA': ['*', 'two', 0],
'TAG': ['*', 'two',0]}
# Mesodinium/Myrionecta (table 29) genetic code codon table, with 6-fold
# degenerate codons kept whole, no splitting! Traditional ciliate codon table.
mesodinium_6fold = {**universal_6fold,
'TAA': ['Y', 'four', 0], 'TAT': ['Y', 'four', 0],
'TAG': ['Y', 'four', 0], 'TAC': ['Y', 'four', 0],
'TGA': ['*', 'one', 0]}
# Mesodinium/Myrionecta (table 29) genetic code codon table, with 6-fold
# degenerate codons split into four-fold and two-fold groups.
mesodinium_no6fold = {**universal_no6fold,
'TAA': ['Y', 'four', 0], 'TAT': ['Y', 'four', 0],
'TAG': ['Y', 'four', 0], 'TAC': ['Y', 'four', 0],
'TGA': ['*', 'one', 0]}
# Peritrich (table 30) genetic code codon table, with 6-fold degenerate
# codons kept whole, no splitting! Traditional ciliate codon table.
peritrich_6fold = {**universal_6fold,
'GAA': ['E', 'four', 0], 'GAG': ['E', 'four', 0],
'TAA': ['E', 'four', 0], 'TAG': ['E', 'four', 0],
'TGA': ['*', 'one', 0]}
# Peritrich (table 30) genetic code codon table, with 6-fold degenerate
# codons split into four-fold and two-fold groups.
# Note that this also splits four-fold degenerate codons that OUGHT to
# be in "different" functional categories (e.g. CAA =/= TAA)
peritrich_no6fold = {**universal_no6fold,
'TAA': ['E_', 'two', 0], 'TAG': ['E_', 'two', 0],
'TGA': ['*', 'one', 0]}
cdnTableDict = {1:[universal_no6fold,universal_6fold],
4:[blepharisma_no6fold, blepharisma_6fold],
6:[ciliate_no6fold,ciliate_6fold],
10:[euplotes_no6fold,euplotes_6fold],
29:[mesodinium_no6fold,mesodinium_6fold],
30:[peritrich_no6fold,peritrich_6fold],
'chilodonella':[chilo_no6fold,chilo_6fold],
'chilo':[chilo_no6fold,chilo_6fold]}
return cdnTableDict[gCode]
def mapCdns(seq, cdnTable):
    """Add the codon counts of ``seq`` to ``cdnTable`` (updated in place).

    ``seq`` is read in-frame from its first base; trailing bases that do not
    fill a complete codon are ignored. ``cdnTable`` maps a codon string to a
    list whose last element is that codon's running count.

    Codons that are not keys of ``cdnTable`` (for example codons holding an
    ambiguous base) are tallied separately as ambiguous codons.

    Returns ``(cdnTable, amb_cdn)`` when the table labels 'TCC' as 'six'
    (i.e. the 6-fold table), otherwise only ``cdnTable``.
    """
    usable = len(seq) - len(seq) % 3
    amb_cdn = 0
    for start in range(0, usable, 3):
        codon = seq[start:start + 3]
        try:
            cdnTable[codon][-1] += 1
        except KeyError:
            # Only an unknown codon is expected here; any other error should
            # surface instead of being silently counted as "ambiguous".
            amb_cdn += 1
    if cdnTable['TCC'][1] == 'six':
        return cdnTable, amb_cdn
    return cdnTable
class GCeval():
    """
    Returns %GC values from DNA sequences of various types.

    Every value is whatever ``GC`` returns for the selected bases, rounded to
    four decimals. ``GC`` is not defined in this class; it is presumably
    Biopython's ``Bio.SeqUtils.GC`` imported at the top of the file.
    The functions take no ``self`` and are used through the class
    (e.g. ``GCeval.gc1(seq)``), hence the ``staticmethod`` decorators.
    """

    @staticmethod
    def gcTotal(seq):
        # Global GC content of the whole sequence.
        return round(GC(seq), 4)

    @staticmethod
    def gc1(seq):
        # GC content of the first codon position (every third base starting
        # at index 0, including the first base of a trailing partial codon).
        return round(GC(''.join([seq[n] for n in range(0, len(seq), 3)])), 4)

    @staticmethod
    def gc2(seq):
        # GC content of the second codon position, complete codons only.
        # The previous stop value (len(seq) - len(seq[1:]) % 3) dropped the
        # last codon whenever len(seq) was a multiple of three.
        usable = len(seq) - len(seq) % 3
        return round(GC(''.join([seq[n] for n in range(1, usable, 3)])), 4)

    @staticmethod
    def gc3(seq):
        # GC content of the third codon position, complete codons only.
        # The previous stop value (len(seq) - len(seq[2:]) % 3) dropped the
        # last codon unless len(seq) % 3 == 2.
        usable = len(seq) - len(seq) % 3
        return round(GC(''.join([seq[n] for n in range(2, usable, 3)])), 4)

    @staticmethod
    def gc3_4F(cdnTbl):
        # GC content of the third position of degenerate codons: each codon's
        # last base is repeated by its count (last element of the table
        # entry), for every codon whose degeneracy label does not contain
        # 'one'.
        # NOTE(review): despite the name this is not limited to four-fold
        # degenerate codons - confirm that is intended.
        FrFold = round(GC(''.join([k[-1]*v[-1] for k, v in cdnTbl.items() if
            'one' not in v[1]])), 4)
        return FrFold
class SeqInfo(object):
    """
    Container for the statistics of a single contig/gene from a fasta file.

    After the relevant methods have been called the instance carries GC
    content values, Effective Number of Codons values (ENc) and Relative
    Synonymous Codon Usage (RSCU) as attributes.

    Call order matters: ``countCodons`` must run before ``GCstats``,
    ``ENcStats`` and ``RSCUstats`` (they read the codon count tables), and
    ``GCstats`` must run before ``ENcStats`` (it sets ``self.gc3``).
    """

    def __init__(self,seq,gcode='universal'):
        # Nucleotide sequence kept as a plain string.
        self.ntd = str(seq)
        code_name, table_id = GenUtil.convertGenCode(gcode)
        self.gcode = code_name
        self.transTable = table_id
        # Attribute name -> GC function; consumed by GCstats().
        self.gcFuncs = {
            'gcOverall': GCeval.gcTotal,
            'gc1': GCeval.gc1,
            'gc2': GCeval.gc2,
            'gc3': GCeval.gc3,
        }

    def countCodons(self):
        # Fetch the codon tables for this genetic code (index 0 = 6-fold
        # codons split, index 1 = 6-fold codons kept whole) and fill both
        # with the codon counts of this sequence.
        tables = GenUtil.getCDNtable(self.transTable)
        six_fold_result = GenUtil.mapCdns(self.ntd, tables[1])
        self.cdnCounts_6F, self.amb_cdn = six_fold_result
        self.cdnCounts_No6F = GenUtil.mapCdns(self.ntd, tables[0])

    def ENcStats(self):
        # Expected ENc (from GC3) plus the observed ENc values for both
        # codon tables, using the two CalcCUB calculations.
        six_fold = self.cdnCounts_6F
        split = self.cdnCounts_No6F
        self.expENc = CalcCUB.expWrightENc(self.gc3)
        self.obsENc_6F = CalcCUB.calcWrightENc(six_fold)
        self.obsENc_No6F = CalcCUB.calcWrightENc(split)
        self.SunENc_6F = CalcCUB.SunEq5(six_fold)
        self.SunENc_No6F = CalcCUB.SunEq5(split)

    def GCstats(self):
        # Store every GC value as an attribute named after its gcFuncs key
        # (gcOverall, gc1, gc2, gc3), then the degenerate-site GC3.
        for attr_name, gc_func in self.gcFuncs.items():
            setattr(self, attr_name, gc_func(self.ntd))
        self.gc4F = GCeval.gc3_4F(self.cdnCounts_No6F)

    def RSCUstats(self):
        # RSCU for both codon tables.
        self.rscu_No6Fold = CalcCUB.RSCU(self.cdnCounts_No6F)
        self.rscu_6Fold = CalcCUB.RSCU(self.cdnCounts_6F)
def prepFolders(outName):
    """Create ``outName`` and its ``Plots`` and ``SpreadSheets`` subfolders.

    Folders that already exist are left untouched, and missing parent
    folders of ``outName`` are created as well.
    """
    os.makedirs(outName, exist_ok=True)
    for sub in ('Plots', 'SpreadSheets'):
        os.makedirs(outName + '/' + sub, exist_ok=True)
def CalcRefFasta(fasta, gCode):
    """Compute per-sequence statistics and a pooled RSCU table for a fasta.

    Returns ``(seqDB, RSCU)``: ``seqDB`` maps each record description to its
    ``SeqInfo`` object (codon counts, GC and ENc values filled in); ``RSCU``
    is the result of ``CalcCUB.calcRCSU`` on the codon counts summed over
    all sequences (6-fold table).
    """
    seqDB = {}
    for record in SeqIO.parse(fasta,'fasta'):
        seqDB[record.description] = SeqInfo(record.seq, gCode)

    # codon -> [amino acid, total count over all sequences]
    GenCDNtable = {}
    for info in seqDB.values():
        info.countCodons()
        info.GCstats()
        info.ENcStats()
        for codon, entry in info.cdnCounts_6F.items():
            if codon.isalpha() and codon not in GenCDNtable:
                GenCDNtable[codon] = [entry[0], entry[-1]]
            else:
                GenCDNtable[codon][-1] += entry[-1]

    # NOTE(review): SeqInfo.RSCUstats calls CalcCUB.RSCU while this calls
    # CalcCUB.calcRCSU - confirm the latter exists and is not a misspelling.
    RSCU = CalcCUB.calcRCSU(GenCDNtable)
    return seqDB, RSCU
def WriteWrightOut(seqData, outName, comp):
    """Write the per-sequence GC / ENc table as a tab-separated file.

    seqData -- dict mapping a sequence name to an object carrying the
               attributes amb_cdn, gcOverall, gc1, gc2, gc3, gc4F, expENc,
               obsENc_6F, obsENc_No6F, SunENc_6F and SunENc_No6F.
    outName -- output folder; the file goes into its 'SpreadSheets'
               subfolder and is named after the last path component.
    comp    -- False writes '<name>.ENc.Raw.tsv', anything else writes
               '<name>.CompTrans.ENc.Raw.tsv'.

    Both variants share the same columns, so the formerly duplicated
    branches are merged; only the file suffix depends on ``comp``.
    """
    # Equality (not truthiness) is kept so that non-boolean values such as
    # None still select the 'CompTrans' file exactly as before.
    suffix = '.ENc.Raw.tsv' if comp == False else '.CompTrans.ENc.Raw.tsv'
    outPath = outName+'/SpreadSheets/'+outName.split('/')[-1]+suffix
    with open(outPath,'w+') as w:
        w.write('SequenceID\tAmbiguousCodons\tGC-Overall\tGC1\tGC2\tGC3\t'
            'GC3-Degen\tExpWrightENc\tObsWrightENc_6Fold\tObsWrightENc_No6Fold\t'
            'ObsWeightedENc_6Fold\tObsWeightedENc_No6Fold\n')
        for k, v in seqData.items():
            values = (v.amb_cdn, v.gcOverall, v.gc1, v.gc2, v.gc3, v.gc4F,
                v.expENc, v.obsENc_6F, v.obsENc_No6F, v.SunENc_6F,
                v.SunENc_No6F)
            w.write('\t'.join([k]+[str(x) for x in values])+'\n')
def getCompFasta(fasta, gCode):
    """Write the in-frame sequences of ``fasta`` to a '.Comp.fasta' file.

    A record is kept when its length is a multiple of three. The output path
    is ``fasta`` with a trailing '.fasta' replaced by '.Comp.fasta'; when the
    name does not end in '.fasta' the suffix is appended instead, so the
    output can never be the input file itself (which would truncate it).

    Returns the path of the file that was written.
    """
    print(fasta)
    stopCDNs = {'1':['TAA','TAG','TGA'], '4':['TAA','TAG'], '6':['TGA'], '10':['TAA','TAG'],
        '29':['TGA'], '30':['TGA'], 'universal':['TAA','TAG','TGA'], 'blepharisma':['TAA','TAG'],
        'ciliate':['TGA'],'euplotes':['TAA','TAG'], 'mesodinium':['TGA'], 'peritrich':['TGA'],
        'chilo':['TAA']}
    # The lookup is case-insensitive and falls back to the universal stops.
    # (Previously the membership test lower-cased gCode but the lookup did
    # not, so e.g. 'Universal' raised KeyError.)
    # NOTE: 'stops' is only needed by the stricter filters that are disabled
    # below; it is kept so they can be re-enabled.
    stops = stopCDNs.get(gCode.lower(), stopCDNs['1'])

    if fasta.endswith('.fasta'):
        compFasta = fasta[:-len('.fasta')] + '.Comp.fasta'
    else:
        compFasta = fasta + '.Comp.fasta'

    with open(compFasta,'w+') as w:
        for i in SeqIO.parse(fasta,'fasta'):
            # Disabled alternatives for "complete" sequences:
            #if str(i.seq).upper().startswith('ATG') and str(i.seq).upper()[-3:] in stops:
            #if str(i.seq).upper()[-3:] in stops:
            if len(i.seq) % 3 == 0:
                w.write('>'+i.description+'\n'+str(i.seq)+'\n')
    return compFasta
def WriteNullENcOut(outName):
    """Write '<name>.ENc.Null.tsv' into ``outName``/SpreadSheets.

    The file has a 'GC3<TAB>ENc' header followed by the strings returned by
    ``CalcCUB.nullENcGC3()``, one per line (no trailing newline).
    """
    taxon = outName.rsplit('/', 1)[-1]
    target = outName+'/SpreadSheets/'+taxon+'.ENc.Null.tsv'
    with open(target,'w+') as w:
        w.write('GC3\tENc\n')
        null_rows = CalcCUB.nullENcGC3()
        w.write('\n'.join(null_rows))
def WriteRSCUtbl(RSCUtbl, outName):
    """Write '<name>.RSCU.tsv' into ``outName``/SpreadSheets.

    ``RSCUtbl`` maps a codon to a list of strings; each entry becomes one
    tab-separated row (codon first) under a 'Codon / Amino Acid / RSCU'
    header.
    """
    taxon = outName.rsplit('/', 1)[-1]
    target = outName+'/SpreadSheets/'+taxon+'.RSCU.tsv'
    with open(target,'w+') as w:
        w.write('Codon\tAmino Acid\tRSCU\n')
        w.writelines(codon+'\t'+'\t'.join(fields)+'\n'
                     for codon, fields in RSCUtbl.items())
if __name__ == "__main__":
    # Command line: <fasta> <output name> [genetic code]
    # Positional arguments are checked by count rather than with bare
    # ``except:`` clauses, which would also have hidden unrelated errors.
    if len(sys.argv) < 2:
        print('\nUsage:\n')
        print('python CUB.py MyNtds.fasta MyTaxon genetic_code\n')
        print('\nGenetic Codes:\n')
        gcd = ['1', '4', '6', '10', '29', '30', 'universal', 'blepharisma',
            'ciliate','euplotes', 'mesodinium', 'peritrich','chilo']
        print('\n'.join(gcd)+'\n')
        sys.exit()
    if len(sys.argv) < 3:
        print('Missing an output name. Include one, then run again!')
        sys.exit()

    fasta = sys.argv[1]
    outName = sys.argv[2]
    # The genetic code is optional and defaults to the universal code.
    gCode = sys.argv[3] if len(sys.argv) > 3 else 'universal'

    # Sequences whose length is a multiple of three go to a '.Comp.fasta'.
    compFasta = getCompFasta(fasta, gCode)
    prepFolders(outName)

    # Statistics for the raw and for the in-frame ("Comp") sequences. The
    # RSCU table that gets written is the one from the second call; the
    # table of the raw fasta is overwritten.
    fastaDataRaw, RSCUtbl = CalcRefFasta(fasta, gCode)
    fastaDataComp, RSCUtbl = CalcRefFasta(compFasta, gCode)

    WriteWrightOut(fastaDataRaw, outName, comp=False)
    WriteWrightOut(fastaDataComp, outName, comp=True)
    WriteNullENcOut(outName)
    WriteRSCUtbl(RSCUtbl, outName)

    # Keep a copy of the input and move the Comp fasta into the output folder.
    os.system('cp '+fasta+' '+outName+'/')
    os.system('mv '+compFasta+' '+outName+'/')

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,143 @@
Contig_204_Len5938_Cov7976_P_OG6_102770
Contig_3323_Len1201_Cov3_U_OG6_148486
Contig_313_Len4904_Cov3_E_OG6_101730
Contig_3269_Len1219_Cov2_U_OG6_102957
Contig_9180_Len427_Cov1_P_OG6_100774
Contig_4698_Len874_Cov1_U_OG6_111564
Contig_6408_Len634_Cov2_U_OG6_102446
Contig_667_Len3323_Cov11_U_OG6_116443
Contig_2271_Len1620_Cov1_E_OG6_101536
Contig_4504_Len909_Cov11_P_OG6_101629
Contig_333_Len4757_Cov12241_P_OG6_102770
Contig_811_Len2973_Cov4_U_OG6_116443
Contig_4707_Len871_Cov7779_P_OG6_102770
Contig_5488_Len751_Cov2_U_OG6_103872
Contig_317_Len4879_Cov6_E_OG6_107629
Contig_5624_Len732_Cov1_U_OG6_100648
Contig_1895_Len1829_Cov9_U_OG6_107629
Contig_6636_Len610_Cov1_P_OG6_100598
Contig_4338_Len944_Cov2_U_OG6_118109
Contig_3061_Len1290_Cov2_P_OG6_105259
Contig_786_Len3043_Cov3_E_OG6_101725
Contig_4267_Len958_Cov2_U_OG6_107629
Contig_157_Len6644_Cov11_E_OG6_102358
Contig_24_Len10475_Cov4_E_OG6_106943
Contig_145_Len6822_Cov9_U_OG6_145244
Contig_536_Len3701_Cov3_E_OG6_103917
Contig_421_Len4239_Cov31_P_OG6_102198
Contig_2102_Len1703_Cov4_E_OG6_102675
Contig_40_Len9764_Cov77_E_OG6_105576
Contig_4490_Len912_Cov1_P_OG6_106264
Contig_6491_Len626_Cov2_P_OG6_100328
Contig_7231_Len557_Cov1_U_OG6_107629
Contig_756_Len3094_Cov12165_P_OG6_102770
Contig_3009_Len1307_Cov13_U_OG6_116443
Contig_4049_Len1006_Cov3_U_OG6_103879
Contig_426_Len4213_Cov26_U_OG6_108411
Contig_5227_Len788_Cov1_P_OG6_100289
Contig_701_Len3225_Cov2_P_OG6_106492
Contig_4905_Len840_Cov2_P_OG6_115036
Contig_28_Len10287_Cov699_E_OG6_100777
Contig_117_Len7490_Cov12_E_OG6_102035
Contig_748_Len3109_Cov7889_P_OG6_102770
Contig_3013_Len1306_Cov6_U_OG6_116443
Contig_5988_Len682_Cov2_U_OG6_107629
Contig_2349_Len1580_Cov1_P_OG6_124813
Contig_349_Len4667_Cov4_U_OG6_116443
Contig_841_Len2909_Cov21_E_OG6_107629
Contig_8720_Len444_Cov1_P_OG6_115892
Contig_2350_Len1578_Cov8_U_OG6_107629
Contig_7003_Len576_Cov3_P_OG6_115725
Contig_8605_Len452_Cov1_U_OG6_107629
Contig_3057_Len1292_Cov6_U_OG6_102073
Contig_168_Len6512_Cov4_E_OG6_102446
Contig_3018_Len1303_Cov9938_P_OG6_102770
Contig_3148_Len1260_Cov6057_P_OG6_102770
Contig_2866_Len1358_Cov8209_P_OG6_102770
Contig_1129_Len2472_Cov5_E_OG6_100425
Contig_4087_Len998_Cov2_E_OG6_102109
Contig_9116_Len429_Cov2_E_OG6_100420
Contig_2878_Len1356_Cov2_E_OG6_102565
Contig_1436_Len2164_Cov2_U_OG6_121413
Contig_125_Len7242_Cov67_U_OG6_105015
Contig_4250_Len962_Cov4_U_OG6_107629
Contig_9689_Len411_Cov1_P_OG6_110848
Contig_488_Len3892_Cov12068_P_OG6_102770
Contig_5963_Len686_Cov1_P_OG6_101423
Contig_5913_Len692_Cov3_U_OG6_116443
Contig_1768_Len1907_Cov2_P_OG6_106950
Contig_1477_Len2126_Cov2_P_OG6_104145
Contig_7577_Len529_Cov1_E_OG6_116443
Contig_9982_Len402_Cov2_P_OG6_104843
Contig_373_Len4515_Cov26_U_OG6_108411
Contig_4656_Len883_Cov2_E_OG6_102774
Contig_3145_Len1262_Cov2_P_OG6_110223
Contig_5222_Len788_Cov2_E_OG6_116443
Contig_6139_Len665_Cov1_P_OG6_129320
Contig_987_Len2641_Cov4_E_OG6_103026
Contig_213_Len5837_Cov11847_P_OG6_102770
Contig_5568_Len739_Cov1_E_OG6_102109
Contig_3238_Len1232_Cov5_U_OG6_116443
Contig_1549_Len2061_Cov3_E_OG6_102774
Contig_3274_Len1217_Cov2_E_OG6_107629
Contig_6478_Len627_Cov4_U_OG6_107219
Contig_1710_Len1945_Cov2_P_OG6_101690
Contig_172_Len6427_Cov11653_P_OG6_102770
Contig_2759_Len1398_Cov2_U_OG6_102446
Contig_4836_Len848_Cov10167_E_OG6_101051
Contig_665_Len3326_Cov6_E_OG6_103961
Contig_7896_Len504_Cov1_U_OG6_116443
Contig_6369_Len640_Cov2_U_OG6_107629
Contig_218_Len5773_Cov12260_P_OG6_102770
Contig_250_Len5432_Cov11_U_OG6_102109
Contig_1503_Len2102_Cov2_P_OG6_104631
Contig_1781_Len1898_Cov4_E_OG6_103110
Contig_7718_Len517_Cov4_U_OG6_107629
Contig_2524_Len1486_Cov10283_U_OG6_102770
Contig_207_Len5911_Cov4_P_OG6_104171
Contig_1989_Len1770_Cov9572_P_OG6_102770
Contig_6893_Len587_Cov1_P_OG6_103083
Contig_1403_Len2189_Cov6_E_OG6_100617
Contig_4699_Len873_Cov2_U_OG6_102109
Contig_7520_Len533_Cov1_P_OG6_103438
Contig_115_Len7578_Cov15_U_OG6_107629
Contig_1732_Len1931_Cov2_E_OG6_101536
Contig_6482_Len627_Cov1_P_OG6_101673
Contig_849_Len2894_Cov3_U_OG6_116443
Contig_3913_Len1037_Cov5_U_OG6_107629
Contig_9808_Len408_Cov1_P_OG6_113435
Contig_439_Len4172_Cov3_E_OG6_107629
Contig_749_Len3109_Cov7_U_OG6_101143
Contig_376_Len4497_Cov28_E_OG6_116443
Contig_1579_Len2041_Cov3_E_OG6_103026
Contig_787_Len3042_Cov8_U_OG6_101143
Contig_4099_Len996_Cov1_U_OG6_105091
Contig_8084_Len489_Cov1_P_OG6_101427
Contig_2431_Len1534_Cov37_E_OG6_101143
Contig_110_Len7741_Cov24_U_OG6_101172
Contig_4412_Len930_Cov1_P_OG6_101345
Contig_7241_Len556_Cov1_E_OG6_107629
Contig_574_Len3608_Cov16_P_OG6_107278
Contig_739_Len3127_Cov4_E_OG6_102109
Contig_858_Len2877_Cov9661_P_OG6_102770
Contig_3857_Len1055_Cov1_U_OG6_100774
Contig_7308_Len551_Cov1_P_OG6_100463
Contig_61_Len9077_Cov26_E_OG6_101172
Contig_5256_Len784_Cov1_U_OG6_107629
Contig_6823_Len592_Cov1_E_OG6_100210
Contig_2634_Len1443_Cov2_U_OG6_102109
Contig_2511_Len1491_Cov2_P_OG6_118575
Contig_3656_Len1108_Cov2_P_OG6_106027
Contig_7423_Len542_Cov1_U_OG6_132867
Contig_889_Len2819_Cov18_U_OG6_116443
Contig_5665_Len726_Cov1_E_OG6_100769
Contig_3821_Len1064_Cov2_P_OG6_100469
Contig_6635_Len610_Cov1_P_OG6_100578
Contig_573_Len3614_Cov53_U_OG6_102121
Contig_7211_Len558_Cov3_U_OG6_101143
Contig_1293_Len2294_Cov87_E_OG6_106711
Contig_131_Len7117_Cov11286_P_OG6_102770
Contig_5544_Len742_Cov1_P_OG6_111848
Contig_806_Len2987_Cov21_E_OG6_103192
Contig_219_Len5773_Cov12245_P_OG6_102770
Contig_136_Len7040_Cov8_U_OG6_100080

View File

@ -0,0 +1,7 @@
import re
# Scratch check: split an OG identifier into its "OG<x>_" prefix and number.
og = 'OG6_123456'
# Everything that follows the "OG<x>_" prefix.
og_number = re.split('OG.{1}_', og)[1]
# Text in front of that number, trimmed to its last four characters.
ogv = og.split(og_number)[0][-4:]
print(ogv, og_number)

View File

@ -0,0 +1,261 @@
#Dependencies
import os, sys, re
import shutil
import argparse
def get_args():
    """Build the command-line parser of the wrapper and parse ``sys.argv``.

    Returns the ``argparse.Namespace`` with the attributes script,
    conspecific_names, first_script, last_script, assembled_transcripts,
    output, xplate_contam, genetic_code, minlen, maxlen and databases.
    The three script selectors default to -1, meaning "not given".
    """
    parser = argparse.ArgumentParser(
        prog = 'PhyloToL v6.0 Part 1 for Transcriptomes',
        description = "Updated January 19th, 2023 by Auden Cote-L'Heureux. Link to GitHub: https://github.com/AudenCote/PhyloToL_v6.0"
    )

    # Which scripts to run. Tuples (not sets) keep the choices ordered in
    # the generated help text.
    parser.add_argument('-s', '--script', default = -1, type = int, choices = (1, 2, 3, 4, 5, 6, 7), help = 'Script to run if you are only running one script')
    parser.add_argument('-n', '--conspecific_names', type = str, help = 'A .txt or .tsv file with two tab-separated columns; the first should have 10 digit codes, the second species or other identifying names. This is used to determine which sequences to remove (only between "species") in cross-plate contamination assessment')
    parser.add_argument('-1', '--first_script', default = -1, type = int, choices = (1, 2, 3, 4, 5, 6), help = 'First script to run')
    # The help text used to read 'First script to run' (copy/paste slip).
    parser.add_argument('-2', '--last_script', default = -1, type = int, choices = (2, 3, 4, 5, 6, 7), help = 'Last script to run')

    # Inputs, outputs and run options.
    parser.add_argument('-a', '--assembled_transcripts', type = str, help = 'Path to a folder of assembled transcripts, assembled by rnaSPAdes. Each assembled transcript file name should start with a unique 10 digit code, and end in "_assembledTranscripts.fasta", E.g. Op_me_hsap_assembledTranscripts.fasta')
    parser.add_argument('-o', '--output', default = '../', type = str, help = 'An "Output" folder will be created at this directory to contain all output files. By default this folder will be created at the parent directory of the Scripts folder')
    parser.add_argument('-x', '--xplate_contam', action = 'store_true', help = 'Run cross-plate contamination removal (includes all files)')
    parser.add_argument('-g', '--genetic_code', type = str, help = 'If all of your taxa use the same genetic code, you may enter it here (to be used in script 5). Alternatively, if you need to use a variety of genetic codes but know which codes to use, you may fill give here the path to a .txt or .tsv with two tab-separated columns, the first with the ten-digit codes and the second column with the corresponding genetics codes. Otherwise, stop at script 4 and fill in "gcode_output.tsv" before running script 5')
    parser.add_argument('-min', '--minlen', type = int, default = 200, help = 'Minimum transcript length')
    parser.add_argument('-max', '--maxlen', type = int, default = 12000, help = 'Maximum transcript length')
    parser.add_argument('-d', '--databases', type = str, default = '../Databases', help = 'Path to databases folder')

    return parser.parse_args()
#running the first script on all the bare files
def script_one(args, ten_digit_codes):
    """Run 1a_ContigFiltStats.py on every assembled-transcripts file.

    A file in ``args.assembled_transcripts`` is processed when its name is a
    10-character code listed in ``ten_digit_codes`` followed by
    '_assembledTranscripts.fasta'. With ``args.xplate_contam`` set,
    1b_XSpeciesContaminationAgnes.py is run afterwards.

    Exits before doing any work when cross-plate contamination removal is
    requested without ``args.conspecific_names``; previously this only
    failed (TypeError while building the command) after every 1a job had
    already run.
    """
    if args.xplate_contam and args.conspecific_names is None:
        print('\nERROR: Cross-plate contamination removal (--xplate_contam) requires a conspecific names file (--conspecific_names).\n')
        sys.exit(1)

    for file in os.listdir(args.assembled_transcripts):
        if file[10:] == '_assembledTranscripts.fasta' and file[:10] in ten_digit_codes:
            # TODO (from the original author): is the --spades argument right here?
            os.system('python 1a_ContigFiltStats.py --input_file ' + args.assembled_transcripts + '/' + file + ' --output_file ' + args.output + '/Output/' + file[:10] + ' --minLen ' + str(args.minlen) + ' --maxLen ' + str(args.maxlen) + ' --spades')

    if args.xplate_contam:
        os.system('python 1b_XSpeciesContaminationAgnes.py ' + args.output + '/Output/XlaneBleeding ' + str(args.minlen) + ' ' + args.conspecific_names)
def script_two(args):
    """Run rRNA removal, then bacterial removal, for every taxon folder.

    A folder in <output>/Output is processed when it holds
    SizeFiltered/<folder>.<minlen>bp.fasta. 2a_remove_rRNA.py gets that
    file; 2b_remove_Bact.py gets <folder>/<folder>_NorRNAseqs.fasta.
    """
    out_root = args.output + '/Output/'
    for taxon in os.listdir(out_root):
        sized_fasta = out_root + taxon + '/SizeFiltered/' + taxon + '.' + str(args.minlen) + 'bp.fasta'
        if not os.path.isfile(sized_fasta):
            continue
        os.system('python 2a_remove_rRNA.py --input_file ' + sized_fasta + ' --databases ' + args.databases)
        fasta_withBact = out_root + taxon + '/' + taxon + '_NorRNAseqs.fasta'
        os.system('python 2b_remove_Bact.py --input_file ' + fasta_withBact + ' --databases ' + args.databases)
#NEED TO SORT OUT FILE NAMES ETC. BELOW HERE
#running the third script
def script_three(args):
    """Run 3_CountOGsDiamond.py (e-value 1e-15) for every taxon folder.

    A folder in <output>/Output is processed when it contains
    <folder>_WTA_EPU.fasta.
    """
    out_dir = args.output + '/Output'
    for taxon in os.listdir(out_dir):
        wta_fasta = out_dir + '/' + taxon + '/' + taxon + '_WTA_EPU.fasta'
        if not os.path.isfile(wta_fasta):
            continue
        os.system('python 3_CountOGsDiamond.py --input_file ' + wta_fasta + ' --evalue 1e-15 --databases ' + args.databases)
#running the fourth script
def script_four(args):
    """Run 4_InFrameStopFreq.py per taxon and write gcode_output.tsv.

    Steps:
      1. 4_InFrameStopFreq.py is run for every folder in <output>/Output
         that holds <folder>_WTA_EPU.Renamed.fasta.
      2. The 'Summary' line of every
         <folder>_WTA_EPU.Renamed_StopCodonStats.tsv is collected
         (columns 7-9 of that line).
      3. <output>/Output/gcode_output.tsv is written with one row per
         collected line plus a genetic code column, filled from
         ``args.genetic_code``: left empty when no code was given, the code
         itself when it is a valid name, or a per-taxon lookup when it is
         the path of a .txt/.tsv table (taxa missing from the table default
         to 'Universal').

    Exits the program after writing the file when the genetic code
    information is incomplete (no code given, unusable table, or a value
    that is neither a valid code nor a .txt/.tsv path).
    """
    out_dir = args.output + '/Output'

    for folder in os.listdir(out_dir):
        renamed_fasta = out_dir + '/' + folder + '/' + folder + '_WTA_EPU.Renamed.fasta'
        if os.path.isfile(renamed_fasta):
            os.system('python 4_InFrameStopFreq.py --input_file ' + renamed_fasta + ' --databases ' + args.databases)

    #putting all of the gcode summaries produced by fourth script into a spreadsheet
    gcode_info = []
    for folder in os.listdir(out_dir):
        stats_path = out_dir + '/' + folder + '/' + folder + '_WTA_EPU.Renamed_StopCodonStats.tsv'
        if os.path.isfile(stats_path):
            with open(stats_path) as f:
                for line in f:
                    line_sep = line.split('\t')
                    if line_sep[0] == 'Summary':
                        # [:-1] drops the line's final character (the newline).
                        gcode_info.append([folder, line_sep[6], line_sep[7], line_sep[8][:-1]])

    valid_codes = ['bleph','blepharisma','chilo','chilodonella','condy', 'condylostoma','none','eup','euplotes','peritrich','vorticella','ciliate','universal','taa','tag','tga','mesodinium']

    gcode = args.genetic_code
    # Guarded against None: calling .endswith() on a missing genetic code
    # used to raise AttributeError before gcode_output.tsv could be written.
    is_table = gcode is not None and gcode.endswith(('.txt', '.tsv'))

    stop = False
    gcode_file = { }
    if is_table:
        if os.path.isfile(gcode):
            with open(gcode) as table:
                for line in table:
                    try:
                        fields = line.split('\t')
                        code = fields[1].strip()
                        if code.lower() in valid_codes:
                            gcode_file.update({ fields[0] : code })
                        else:
                            print('Genetic code ERROR -- ' + code + ' is not a valid genetic code. Please fill out the "gcode_output.tsv" file and continue with script 5.')
                    except IndexError:
                        print('\nGenetic code ERROR -- it looks like you tried to enter a .txt/.tsv file, but it is improperly formatted. Stopping after script 4; you may fill out the file gcode_output.tsv and continue with script 5.\n')
                        stop = True
        else:
            print('\nGenetic code ERROR -- it looks like you tried to enter a .txt/.tsv file, but it could not be found. Stopping after script 4; you may fill out the file gcode_output.tsv and continue with script 5.\n')
            stop = True

    with open(out_dir + '/gcode_output.tsv', 'w') as g:
        g.write('10 Digit Code\tIn-frame TAG Density\tIn-frame TGA Density\tIn-frame TAA Density\tGenetic Code\n')
        for row in gcode_info:
            row_start = row[0] + '\t' + row[1] + '\t' + row[2] + '\t' + row[3] + '\t'
            if gcode is None:
                g.write(row_start + '\n')
            elif gcode.lower() in valid_codes:
                g.write(row_start + gcode + '\n')
            elif is_table:
                if row[0] in gcode_file:
                    g.write(row_start + gcode_file[row[0]] + '\n')
                else:
                    g.write(row_start + 'Universal' + '\n')
                    print('\nDefaulting to Universal genetic code for taxon ' + row[0] + '\n')
            else:
                # Neither a known code name nor a table path.
                stop = True

    if stop or gcode is None:
        print('\nStopping after script 4 because genetic code information is incomplete. Please fill out the file "gcode_output.tsv" and continue with script 5.\n')
        sys.exit()
def script_five(args):
    """Run 5_GCodeTranslate.py for every taxon with a valid genetic code.

    The codes are read from <output>/Output/gcode_output.tsv (first column:
    taxon folder name, last column: genetic code). A folder in
    <output>/Output is translated when it holds
    <folder>_WTA_EPU.Renamed.fasta and its row carries a valid code; a row
    of that taxon with an invalid code is reported and skipped.

    Only rows belonging to the current folder are looked at. Previously an
    invalid code on any row was reported once for every folder, always
    naming the wrong taxon.
    """
    valid_codes = ['bleph','blepharisma','chilo','chilodonella','condy', 'condylostoma','none','eup','euplotes','peritrich','vorticella','ciliate','universal','taa','tag','tga','mesodinium']

    out_dir = args.output + '/Output'
    # Read the table once and close the file (it used to be opened twice,
    # with one handle never closed and the other never used).
    with open(out_dir + '/gcode_output.tsv', 'r') as g:
        lines = [line.strip().split('\t') for line in g]

    for folder in os.listdir(out_dir):
        renamed_fasta = out_dir + '/' + folder + '/' + folder + '_WTA_EPU.Renamed.fasta'
        if not os.path.isfile(renamed_fasta):
            continue
        for line in lines:
            if line[0] != folder:
                continue
            code = line[-1]
            if code.lower() in valid_codes:
                os.system('python 5_GCodeTranslate.py --input_file ' + renamed_fasta + ' --genetic_code ' + code)
            elif code != 'Genetic Code':
                print('\n' + code + ' is not a valid genetic code. Skipping taxon ' + folder + '.\n')
def script_six(args):
    """Run 6_FilterPartials.py and the coverage update for every taxon.

    Taxon prefixes are the first 10 characters of the files in
    <output>/Output whose names end in '_AA.ORF.fasta'. The hook fasta is a
    file in <databases>/db_OG with the extension fasta, fas, fa or faa (the
    last one listed wins if there are several).

    Exits when no hook fasta is found. The message always announced
    "Quitting", but the function used to carry on and launch
    6_FilterPartials.py with an empty --hook_fasta.
    """
    out_root = args.output + '/Output'
    prefixes = [file[:10] for file in os.listdir(out_root) if file.endswith('_AA.ORF.fasta')]
    # dict.fromkeys removes duplicates while keeping first-seen order.
    unique_prefixes = list(dict.fromkeys(prefixes))

    hook_fasta = ''
    for file in os.listdir(args.databases + '/db_OG'):
        if file.split('.')[-1] in ('fasta', 'fas', 'fa', 'faa'):
            hook_fasta = args.databases + '/db_OG/' + file

    if hook_fasta == '':
        print('\nNo .fasta file could be found containing Hook sequences. This should be supplied along with the .dmnd-formatted database file in the Databases/db_OG folder. Quitting before script 6.\n')
        sys.exit(1)

    for prefix in unique_prefixes:
        os.system('python 6_FilterPartials.py --file_prefix ' + out_root + '/' + prefix + ' --hook_fasta ' + hook_fasta)

    for prefix in unique_prefixes:
        os.system('python 6b_update_cov_post_removepartials.py ' + out_root + '/' + prefix)
def script_seven(args):
    """Finalize names, archive intermediate files and run the summary.

    1. 7_FinalizeName.py is run for every file in <output>/Output/ToRename
       whose name contains '.AA.ORF.fasta' (name = first 10 characters).
    2. <output>/Output/Intermediate is created and everything in
       <output>/Output except 'ReadyToGo' and 'Intermediate' is moved
       into it.
    3. 8_SummaryStats.py is run on <output>/Output.

    Raises FileExistsError if the Intermediate folder already exists.
    """
    out_dir = args.output + '/Output'

    for file in os.listdir(out_dir + '/ToRename'):
        if '.AA.ORF.fasta' in file:
            os.system('python 7_FinalizeName.py --input_file ' + out_dir + '/ToRename/' + file + ' --name ' + file[:10])

    intermediate = out_dir + '/Intermediate'
    os.mkdir(intermediate)

    for file in os.listdir(out_dir):
        if file != 'ReadyToGo' and file != 'Intermediate':
            # shutil.move instead of a shell 'mv' string: works for paths
            # containing spaces and raises if the move fails.
            shutil.move(out_dir + '/' + file, intermediate)

    os.system('python 8_SummaryStats.py -i ' + out_dir + ' -d ' + args.databases)
if __name__ == "__main__":
    # Driver: validate the arguments and the Output folder, then run either
    # the single script given with --script or the range
    # --first_script .. --last_script. Error paths exit with status 1.
    args = get_args()

    starts_at_one = args.first_script == 1 or args.script == 1
    output_dir = args.output + '/Output'

    if starts_at_one and (args.assembled_transcripts is None or not os.path.isdir(args.assembled_transcripts)):
        # Only names ending in "_assembledTranscripts.fasta" are picked up below.
        print('\nERROR: If starting at the first script, a valid path to a folder of assembled transcript files (whose names must end in "_assembledTranscripts.fasta") should be input using the --assembled_transcripts argument')
        sys.exit(1)

    if args.genetic_code is None and args.script == -1:
        if args.first_script < 5 and args.last_script >= 5:
            # The spreadsheet written by script 4 is gcode_output.tsv.
            print('\nERROR: You cannot run script 5 without giving a genetic code! If all of the taxa in the run use the same genetic code, then use the --genetic_code argument (e.g. -g Universal). Otherwise, stop after script 4, fill out the spreadsheet called "gcode_output.tsv," and then run scripts 5-7. If this does not make sense, please ask for help.')
            sys.exit(1)

    # Sample identifiers are the first 10 characters of the input file names.
    ten_digit_codes = []
    if starts_at_one:
        for file in os.listdir(args.assembled_transcripts):
            if file[10:] == '_assembledTranscripts.fasta':
                ten_digit_codes.append(file[:10])
    elif not os.path.isdir(output_dir):
        print('\nERROR: A folder called "Output" is not found at the given output path. Enter the correct path for --output or start from script 1.\n')
        sys.exit(1)

    if len(ten_digit_codes) > len(set(ten_digit_codes)):
        print('\nERROR: Duplicate 10-digit codes are not allowed.\n')
        sys.exit(1)

    # A code must look like Op_me_hsap: underscores at positions 2 and 5,
    # ASCII letters or digits everywhere else.
    allowed_chars = 'qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM1234567890'
    for code in ten_digit_codes:
        for c, char in enumerate(code):
            if c in (2, 5):
                char_ok = char == '_'
            else:
                char_ok = char in allowed_chars
            if not char_ok:
                print('\nERROR: ' + code + ' is an invalid 10-digit code sample identifier. It must be of the format Op_me_hsap (Homo sapiens for example). Please ask for help if this does not make sense.\n')
                sys.exit(1)

    if os.path.isdir(output_dir) and starts_at_one:
        print('\nERROR: An "Output" folder already exists at the given path. Please delete or rename this folder and try again.\n')
        sys.exit(1)
    elif os.path.isdir(output_dir + '/Intermediate'):
        print('\nIt looks like this run is already complete. Try deleting/renaming the Output folder and try again.\n')
        sys.exit()
    elif not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    # Index = script number; index 0 is a placeholder.
    scripts = [0, script_one, script_two, script_three, script_four, script_five, script_six, script_seven]

    if args.script != -1:
        to_run = [args.script]
    elif args.first_script < args.last_script:
        to_run = range(args.first_script, args.last_script + 1)
    else:
        print('\nERROR: Invalid script combination: the first script must be less than the last script. If you want to use only one script, use the --script argument.\n')
        sys.exit(1)

    for number in to_run:
        if args.script == -1:
            print('\nRunning script ' + str(number) + '...\n')
        if number == 1:
            # Script 1 is the only one that needs the sample identifiers.
            if len(ten_digit_codes) == 0:
                print('\nNo properly-named assembled transcripts files found.\n')
                sys.exit(1)
            scripts[number](args, ten_digit_codes)
        else:
            scripts[number](args)

View File

@ -0,0 +1,29 @@
#!/bin/bash
#
# SLURM batch script that launches the PhyloToL Part 1 wrapper (wrapper.py).
# The mail address, module versions and the `parent` path below look
# site-specific; presumably they have to be adapted before submitting.
#
#SBATCH --job-name=PTL1_GBF
#SBATCH --output=PTL1.%j.out # Stdout (%j expands to jobId)
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --ntasks-per-node=64 ##change to number of srun when running multiple instances
#SBATCH --mem=160G
#SBATCH --mail-type=ALL
#SBATCH --mail-user=YOUREMAIL@smith.edu
# Start from a clean module environment, then load the tools used by the run.
module purge #Cleans up any loaded modules
module use /gridapps/modules/all #make sure the module location is loaded
module load slurm
module load tqdm
module load Biopython/1.75-foss-2019b-Python-3.7.4
module load BLAST+/2.9.0-gompi-2019b
module load DIAMOND/0.9.30-GCC-8.3.0
module load VSEARCH/2.21.1-GCC-10.3.0
# Folder holding the Scripts and AssembledTranscripts folders and Conspecific.txt
# (the trailing slash is required: paths below are built by plain concatenation).
parent='/beegfs/fast/katzlab/becky/PTL1/Transcriptomes/Forams/'
# Run scripts 1 through 7 (-1 1 -2 7) with cross-plate contamination removal (-x),
# using the Universal genetic code for every taxon. The command runs in the
# background (&) so that several srun lines can run side by side.
srun -D ${parent}Scripts python3 ${parent}Scripts/wrapper.py -1 1 -2 7 -x --assembled_transcripts ${parent}AssembledTranscripts -o ${parent} -n ${parent}Conspecific.txt --genetic_code Universal &
# Disabled examples: one srun per plate, each with its own genetic code table.
#srun -D ${parent}HQ/Scripts python3 ${parent}HQ/Scripts/wrapper.py -1 2 -2 7 -x --assembled_transcripts ${parent}Plate4/Assembled_Transcripts -o ${parent}Plate4 -n ${parent}Plate4/Conspecific.txt --genetic_code ${parent}Plate4/Gcodes.txt &
#srun -D ${parent}HQ/Scripts python3 ${parent}HQ/Scripts/wrapper.py -1 2 -2 7 -x --assembled_transcripts ${parent}Plate7/Assembled_Transcripts -o ${parent}Plate7 -n ${parent}Plate7/Conspecific.txt --genetic_code ${parent}Plate7/Gcodes.txt &
#srun -D ${parent}HQ/Scripts python3 ${parent}HQ/Scripts/wrapper.py -1 1 -2 7 -x --assembled_transcripts ${parent}Plate11/Assembled_Transcripts -o ${parent}Plate11 -n ${parent}Plate11/Conspecific.txt --genetic_code ${parent}Plate11/Gcodes.txt &
#srun -D ${parent}HQ/Scripts python3 ${parent}HQ/Scripts/wrapper.py -1 2 -2 7 -x --assembled_transcripts ${parent}Plate18/Assembled_Transcripts -o ${parent}Plate18 -n ${parent}Plate18/Conspecific.txt --genetic_code ${parent}Plate18/Gcodes.txt &
# Block until every background srun has finished.
wait