mirror of
http://43.156.76.180:8026/YuuMJ/EukPhylo.git
synced 2025-12-27 03:30:25 +08:00
Small changes, see description
- Default partial filter cov threshold is 20% - Changed script names - Made requirement of conspecific names file explicit when using XPC
This commit is contained in:
parent
b2e381e01d
commit
1ee6f5ceb6
@ -328,8 +328,8 @@ def filter_NTD_data(args, OGLenDB):
|
||||
pairs_per_og = { }
|
||||
for line in open(proc_folder + '/SpreadSheets/All_NTD_SelfBLAST_Results.tsv'):
|
||||
|
||||
#IMPORTANT: This line is where the query coverage threshold is determined, and it might be helpful to adjust this when trying to optimally filter chimeras. By default it is set to 33%
|
||||
if line.split('\t')[0] != line.split('\t')[1] and line.split('\t')[0][-10:] == line.split('\t')[1][-10:] and int(line.split('\t')[-1].strip()) > 33:
|
||||
#IMPORTANT: This line is where the query coverage threshold is determined, and it might be helpful to adjust this when trying to optimally filter chimeras. By default it is set to 20%
|
||||
if line.split('\t')[0] != line.split('\t')[1] and line.split('\t')[0][-10:] == line.split('\t')[1][-10:] and int(line.split('\t')[-1].strip()) > 20:
|
||||
if line.split('\t')[0][-10:] not in pairs_per_og:
|
||||
pairs_per_og.update({ line.split('\t')[0][-10:] : [] })
|
||||
|
||||
|
||||
300
PTL1/Transcriptomes/Scripts/7a_FinalizeName.py
Normal file
300
PTL1/Transcriptomes/Scripts/7a_FinalizeName.py
Normal file
@ -0,0 +1,300 @@
|
||||
#!/usr/bin/env python3.5
|
||||
|
||||
##__Updated__: 9_29_2023 by Auden Cote-L'Heureux
|
||||
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
|
||||
##__Usage__: python 7a_FinalizeName.py --help
|
||||
|
||||
##################################################################################################
|
||||
## This script is intended to rename the outputs of the FilterPartials script ##
|
||||
## to a given 10-character that is used in the Katz lab Phylogenomic Tree building methods ##
|
||||
## ##
|
||||
## Prior to running this script, ensure the following: ##
|
||||
## ##
|
||||
## 1. You have assembled your transcriptome and COPIED the 'assembly' file ##
|
||||
## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ##
|
||||
## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py ##
|
||||
## 3. Removed SSU/LSU sequences from your Fasta File ##
|
||||
## 4. Classified your sequences as Strongly Prokaryotic/Eukaryotic or Undetermined ##
|
||||
## 5. Classified the Non-Strongly Prokaryotic sequences into OGs ##
|
||||
## 6. You either know (or have inferred) the genetic code of the organism ##
|
||||
## 7. You have translated the sequences and checked for the data in the RemovePartials folder ##
|
||||
## 8. Partial sequences have been removed from the transcriptomic data sets ##
|
||||
## ##
|
||||
## COMMAND Example Below ##
|
||||
## Extra Notes at Bottom of Script ##
|
||||
## ##
|
||||
## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ##
|
||||
## ##
|
||||
## Next Script(s) to Run: ##
|
||||
## NONE! You're FINISHED! :D ##
|
||||
## ##
|
||||
##################################################################################################
|
||||
|
||||
import argparse, os, sys
|
||||
from argparse import RawTextHelpFormatter,SUPPRESS
|
||||
|
||||
#----------------------- Solely to Make Print Statements Colorful -----------------------#
|
||||
|
||||
class color:
	"""ANSI escape codes used to colorize terminal print statements."""

	# Foreground colors
	PURPLE = '\033[95m'
	CYAN = '\033[96m'
	DARKCYAN = '\033[36m'
	ORANGE = '\033[38;5;214m'
	BLUE = '\033[94m'
	GREEN = '\033[92m'
	YELLOW = '\033[93m'
	RED = '\033[91m'
	# Text attributes
	BOLD = '\033[1m'
	UNDERLINE = '\033[4m'
	# Reset all formatting
	END = '\033[0m'
|
||||
|
||||
|
||||
#------------------------------- Main Functions of Script --------------------------------#
|
||||
|
||||
###########################################################################################
|
||||
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
|
||||
###########################################################################################
|
||||
|
||||
def check_args():
	"""Parse and validate command-line arguments.

	Returns an argparse.Namespace augmented with derived paths:
	all_output_folder (two levels above the input file), file_prefix,
	and the three ReadyToGo output folder paths (r2g_aa/r2g_ntd/r2g_tsv).
	Exits when called with no arguments, or when return_more_info reports
	missing/invalid input files.
	"""

	parser = argparse.ArgumentParser(description=
		color.BOLD + '\n\nThis script is intended to '+color.RED+'Rename '+color.END\
		+color.BOLD+'the core set of '+color.PURPLE+'ORFS\n'+color.END+color.BOLD+'with a valid '\
		+color.RED+'10-character code'+color.END+color.BOLD+' for use in the KatzLab\nPhylogenomic Pipeline'\
		+usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter)

	required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)

	# required=True yields a clean argparse error instead of a later
	# AttributeError on None when one of these two options is omitted.
	required_arg_group.add_argument('--input_file','-in', action='store', required=True,
		help=color.BOLD+color.GREEN+' One of the Fasta files that is to be renamed\n'+color.END)

	required_arg_group.add_argument('--name','-n', action='store', required=True,
		help=color.BOLD+color.GREEN+' A valid 10-Character code for updating the data\n'+color.END)

	optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)

	optional_arg_group.add_argument('-author', action='store_true',
		help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)

	# With no arguments at all, show the colorized description and stop.
	if len(sys.argv[1:]) == 0:
		print (parser.description)
		print ('\n')
		sys.exit()

	args = parser.parse_args()

	# return_more_info derives input_NTD/input_AA/input_TSV and counts problems.
	quit_eval = return_more_info(args)
	if quit_eval > 0:
		print ('\n')
		sys.exit()

	# The folder two levels above the input file holds all pipeline outputs.
	args.all_output_folder = '/'.join(args.input_file.split('/')[:-2])

	args.file_prefix = args.input_file.split('/')[-1].split('_Filtered.Final')[0]
	if 'fasta' in args.file_prefix:
		# Fallback when the file name does not follow the *_Filtered.Final pattern.
		args.file_prefix = args.name

	args.r2g_aa = args.all_output_folder + '/ReadyToGo/ReadyToGo_AA/'
	args.r2g_ntd = args.all_output_folder + '/ReadyToGo/ReadyToGo_NTD/'
	args.r2g_tsv = args.all_output_folder + '/ReadyToGo/ReadyToGo_TSV/'

	return args
|
||||
|
||||
|
||||
###########################################################################################
|
||||
###------------------------------- Script Usage Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def usage_msg():
	"""Return the colorized example-usage string appended to the argparse description."""
	return (f"{color.BOLD}{color.RED}\n\nExample usage:{color.CYAN} python 7_FinalizeName.py"
		f" --input_file ../ToRename/Op_me_Xxma_Filtered.Final.AA.ORF.fasta --name Op_me_Xxma{color.END}")
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
|
||||
##########################################################################################
|
||||
|
||||
def return_more_info(args):
	"""Derive and validate the NTD/AA/TSV files implied by --input_file.

	Sets args.input_NTD, args.input_AA and args.input_TSV based on whether the
	input fasta is the amino-acid or nucleotide ORF file, then checks that all
	three files exist. Also honors the -author flag. Returns the number of
	problems found (0 means all inputs are in place).
	"""

	valid_args = 0

	author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
	' maurerax@gmail.com\n\n'+color.END)

	if args.author == True:
		print (author)
		valid_args += 1

	print(args.input_file)

	if args.input_file.endswith('AA.ORF.fasta'):
		args.input_NTD = args.input_file.replace('AA.ORF.fasta','NTD.ORF.fasta')
		args.input_AA = args.input_file
		args.input_TSV = args.input_file.replace('AA.ORF.fasta','allOGCleanresults.tsv')

	elif args.input_file.endswith('NTD.ORF.fasta'):
		args.input_NTD = args.input_file
		args.input_AA = args.input_file.replace('NTD.ORF.fasta','AA.ORF.fasta')
		# BUG FIX: this branch previously replaced 'AA.ORF.fasta', which never
		# matches an NTD file name, so input_TSV silently pointed at the fasta
		# file itself and the later isfile check passed on the wrong file.
		args.input_TSV = args.input_file.replace('NTD.ORF.fasta','allOGCleanresults.tsv')
		print(args.input_TSV)

	else:
		# Unrecognized suffix: report and bail out before the isfile checks
		# below would raise AttributeError on the never-set attributes.
		print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided input file'\
		' must end with AA.ORF.fasta or NTD.ORF.fasta\n\n'+color.END)
		return valid_args + 1

	if os.path.isfile(args.input_NTD) != True:
		print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Nucleotide '\
		'Fasta file ('+color.DARKCYAN+args.input_NTD.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
		' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
		valid_args += 1

	if os.path.isfile(args.input_AA) != True:
		print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Protein '\
		'Fasta file ('+color.DARKCYAN+args.input_AA.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
		' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
		valid_args += 1

	if os.path.isfile(args.input_TSV) != True:
		print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided TSV '\
		' file ('+color.DARKCYAN+args.input_TSV.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
		' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
		valid_args += 1

	return valid_args
|
||||
|
||||
###########################################################################################
|
||||
###-------------------- Double Checks Format for 10-Character Code ---------------------###
|
||||
###########################################################################################
|
||||
|
||||
def check_code(args):
	"""Ensure --name is a valid 10-character code (e.g. 'Op_me_Hsap').

	Valid codes are exactly 10 characters shaped as 2_2_4 (major clade,
	minor clade, species), with exactly two underscores. Prints a
	confirmation on success; prints examples and exits on any failure.
	"""

	def _reject(reason):
		# Shared failure path: reason + three valid examples, then exit.
		print (color.BOLD + reason)
		print ('Three examples below:\n'+color.CYAN+'\n\tSr_ci_Cunc\n\n\tOp_me_Hsap\n\n\t'\
		'Am_ar_Ehis\n\n'+color.END)
		sys.exit()

	check_name = args.name.split('_')

	if len(args.name) != 10:
		_reject('\n\nNew Species Prefix is not 10 characters long\n\n')

	elif args.name.count('_') != 2:
		_reject('\n\nCheck the format of your Species Prefix!\n\n')

	# Expected shape: 2-char major clade, 2-char minor clade, 4-char species.
	if len(check_name[0]) == 2 and len(check_name[1]) == 2 and len(check_name[2]) == 4:
		print (color.BOLD+"\n\nRenaming "+color.ORANGE+args.input_file.split('/')[-1]\
		.split('_Filtered')[0]+color.END+color.BOLD+"'s files with the following 10-character\n"\
		"code: "+color.CYAN+args.name+color.END+'\n')
	else:
		_reject('\n\nCheck the format of your Species Prefix!\n\n')
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###------------------------- Creates Folders For Storing Data -------------------------###
|
||||
##########################################################################################
|
||||
|
||||
def prep_folders(args):
	"""Create the ReadyToGo output folders and the per-taxon Renamed folder.

	Uses os.makedirs(exist_ok=True) instead of shelling out to 'mkdir',
	which avoids shell-quoting issues with unusual paths and is idempotent.
	"""

	os.makedirs(args.all_output_folder + '/ReadyToGo', exist_ok=True)

	os.makedirs(args.r2g_ntd, exist_ok=True)
	os.makedirs(args.r2g_aa, exist_ok=True)
	os.makedirs(args.r2g_tsv, exist_ok=True)

	os.makedirs(args.all_output_folder + '/' + args.file_prefix + '/Renamed', exist_ok=True)
|
||||
|
||||
###########################################################################################
|
||||
###----------- Renames the NTD and AA CDSs with the Given 10-Character Code ------------###
|
||||
###########################################################################################
|
||||
|
||||
def rename_paralogs(args):
	"""Prefix every record name in the AA/NTD fastas and the TSV with the 10-character code.

	Writes the three renamed files into <output>/<prefix>/Renamed/, each file
	name also prefixed with '<name>_XX_'.
	"""

	home_folder = args.all_output_folder + '/' + args.file_prefix + '/Renamed/'
	tag = args.name + '_XX_'

	print (color.BOLD+'\nRenaming Translated (Protein) '+color.PURPLE+'ORFs\n'+color.END)
	with open(args.input_AA) as fh:
		renamed_Final_Prots = fh.read().replace('>', '>' + tag)

	print (color.BOLD+'\nRenaming Nucleotide '+color.PURPLE+'ORFs\n'+color.END)
	with open(args.input_NTD) as fh:
		renamed_Final_Nucs = fh.read().replace('>', '>' + tag)

	print (color.BOLD+'\nUpdating CDS Names in the Spreadsheet'+color.END)
	with open(args.input_TSV) as fh:
		tsv_text = fh.read()
	# TSV rows may be separated by blank lines ('\n\n') or single newlines;
	# either way, prefix the first row and every following row with the tag.
	if '\n\n' in tsv_text:
		renamed_Final_tsv = tag + tsv_text.rstrip('\n').replace('\n\n', '\n' + tag)
	else:
		renamed_Final_tsv = tag + tsv_text.rstrip('\n').replace('\n', '\n' + tag)

	with open(home_folder + tag + args.input_AA.split('/')[-1], 'w+') as out:
		out.write(renamed_Final_Prots)

	with open(home_folder + tag + args.input_NTD.split('/')[-1], 'w+') as out:
		out.write(renamed_Final_Nucs)

	with open(home_folder + tag + args.input_TSV.split('/')[-1], 'w+') as out:
		out.write(renamed_Final_tsv)
|
||||
|
||||
|
||||
##########################################################################################
|
||||
###-------------------- Cleans up the Folder and Moves Final Files --------------------###
|
||||
##########################################################################################
|
||||
def clean_up(args):
	"""Copy the renamed outputs into the ReadyToGo folders, then archive the taxon folder.

	Leaves copies of the renamed files at the top of the taxon's own folder,
	removes the original inputs from ToRename/, and moves the whole taxon
	folder under Finished/.
	"""

	home_folder = args.all_output_folder + '/' + args.file_prefix + '/Renamed/'
	taxon_folder = args.all_output_folder + '/' + args.file_prefix

	# Spreadsheets and renamed fastas go to the shared ReadyToGo folders.
	os.system('cp '+home_folder+'*tsv '+args.r2g_tsv)
	os.system('cp '+home_folder+'*_XX_*AA.ORF.fasta '+args.r2g_aa)
	os.system('cp '+home_folder+'*_XX_*NTD.ORF.fasta '+args.r2g_ntd)

	# Also keep copies at the top of the taxon's folder.
	os.system('cp '+home_folder+'*_XX_*tsv ' + taxon_folder)
	os.system('cp '+home_folder+'*_XX_*AA.ORF.fasta ' + taxon_folder)
	os.system('cp '+home_folder+'*_XX_*NTD.ORF.fasta ' + taxon_folder)

	# The un-renamed inputs are no longer needed.
	os.system('rm ' + args.all_output_folder + '/ToRename/*'+args.file_prefix+'*')

	if os.path.isdir(args.all_output_folder + '/Finished/') != True:
		os.system('mkdir ' + args.all_output_folder + '/Finished')

	os.system('mv ' + taxon_folder + ' ' + args.all_output_folder + '/Finished')
|
||||
|
||||
###########################################################################################
|
||||
###-------------------------------- Next Script Message --------------------------------###
|
||||
###########################################################################################
|
||||
|
||||
def next_script(args):
	"""Announce that the pipeline is complete — this is the final script."""
	print(color.BOLD + '\nThere is no next script!\n\n' + color.END)
|
||||
|
||||
##########################################################################################
|
||||
###--------------- Checks Command Line Arguments and Calls on Functions ---------------###
|
||||
##########################################################################################
|
||||
|
||||
def main():
	"""Run the full renaming workflow.

	Order matters: arguments are parsed and inputs validated first, the
	10-character code is checked, output folders are created, files are
	renamed, and finally outputs are copied/archived.
	"""
	args = check_args()
	check_code(args)
	prep_folders(args)
	rename_paralogs(args)
	clean_up(args)
	next_script(args)

main()
|
||||
276
PTL1/Transcriptomes/Scripts/7b_SummaryStats.py
Normal file
276
PTL1/Transcriptomes/Scripts/7b_SummaryStats.py
Normal file
@ -0,0 +1,276 @@
|
||||
import os, sys
|
||||
import argparse
|
||||
from Bio import SeqIO
|
||||
import CUB
|
||||
from statistics import mean
|
||||
from math import ceil, floor
|
||||
from tqdm import tqdm
|
||||
#import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
from datetime import date
|
||||
|
||||
|
||||
today = str(date.today())
|
||||
|
||||
def get_args():
	"""Build and parse the command-line interface for the summary-statistics script."""

	cli = argparse.ArgumentParser(
		prog = 'PTL6p1 Script 8: Stat Summary',
		description = "Updated March 31th, 2023 by Auden Cote-L'Heureux",
	)

	cli.add_argument('-i', '--input', type = str, required = True,
		help = 'Input path to the "Output" folder produced by PhyloToL Part 1. This folder should contain both the "ReadyToGO" and "Intermediate" folders.')
	cli.add_argument('-d', '--databases', type = str, default = '../Databases',
		help = 'Path to databases folder')
	cli.add_argument('-r', '--r2g_jf', action = 'store_true',
		help = 'Create ReadyToGo files filtered to only include sequences between the 25th and 75th percentile of silent-site GC content. Please be aware that these are not necessarily the correct or non-contaminant sequences; examine the GC3xENc plots carefully before using these data.')

	return cli.parse_args()
|
||||
|
||||
|
||||
def hook_lens(args):
	"""Return {OG id: mean reference sequence length} from the Hook database.

	Scans every .fasta in <databases>/db_OG that has a matching .dmnd diamond
	index and averages the lengths of its sequences, keyed by the last 10
	characters of each record id (the OG identifier).
	"""

	print('\nGetting average OG lengths in the Hook DB...')

	og_dir = args.databases + '/db_OG'
	len_by_og = { }
	for fname in os.listdir(og_dir):
		# Only count fastas that were actually indexed for diamond.
		if fname.endswith('.fasta') and os.path.isfile(og_dir + '/' + fname.replace('.fasta', '.dmnd')):
			for rec in tqdm(SeqIO.parse(og_dir + '/' + fname, 'fasta')):
				len_by_og.setdefault(rec.id[-10:], []).append(len(str(rec.seq)))

	# Collapse each OG's list of lengths to its mean.
	for og in len_by_og:
		len_by_og[og] = mean(len_by_og[og])

	return len_by_og
|
||||
|
||||
|
||||
def aa_comp_lengths(args, gcodes):
	"""Collect amino-acid composition and length data from the ReadyToGo AA files.

	Returns four dicts:
	  aa_comp            - rec id -> fractions of FYMINK / GARP / other / X residues
	  transcripts        - original transcript id -> (taxon code, sequence string)
	  r2g_lengths        - rec id -> ORF length in nucleotides (AA length * 3)
	  transcript_id_corr - ReadyToGo rec id -> original transcript id
	Only files whose 10-character prefix has a genetic code in gcodes are used.
	"""

	print('\nGetting amino acid composition data from ReadyToGo files...')

	r2g_lengths = { }; aa_comp = { }; recid_by_contig_n = { }
	for file in tqdm([f for f in os.listdir(args.input + '/ReadyToGo/ReadyToGo_AA')]):
		if file.endswith('.fasta') and file[:10] in gcodes:
			for rec in SeqIO.parse(args.input + '/ReadyToGo/ReadyToGo_AA/' + file, 'fasta'):
				r2g_lengths.update({ rec.id : len(str(rec.seq)) * 3 })

				fymink = 0; garp = 0; other = 0; total = 0; x = 0
				for char in str(rec.seq):
					if char in 'FYMINKfymink':
						fymink += 1
					elif char in 'GARPgarp':
						garp += 1
					elif char in 'Xx':
						x += 1
					else:
						other += 1

					total += 1

				# Guard against zero-length records, which would otherwise
				# raise ZeroDivisionError and abort the whole run.
				if total > 0:
					aa_comp.update({ rec.id : { 'FYMINK' : fymink/total, 'GARP' : garp/total, 'Other' : other/total, 'X' : x/total } })
				else:
					aa_comp.update({ rec.id : { 'FYMINK' : 0.0, 'GARP' : 0.0, 'Other' : 0.0, 'X' : 0.0 } })

				# Contig number links a ReadyToGo record back to its source transcript.
				recid_by_contig_n.update({ rec.id.split('Contig_')[-1].split('_')[0] : rec.id })

	print('\nGetting transcript sequence data from original assembled transcript files...')

	transcripts = { }; transcript_id_corr = { }
	for tax in tqdm([f for f in os.listdir(args.input + '/Intermediate/TranslatedTranscriptomes')]):
		orig_dir = args.input + '/Intermediate/TranslatedTranscriptomes/' + tax + '/OriginalFasta'
		if os.path.isdir(orig_dir):
			for file in os.listdir(orig_dir):
				if file.endswith('Original.fasta') and file[:10] in gcodes:
					for rec in SeqIO.parse(orig_dir + '/' + file, 'fasta'):
						transcripts.update({ rec.id : (file[:10], str(rec.seq)) })
						# SPAdes NODE number corresponds to the Contig number recorded above.
						if rec.id.split('NODE_')[-1].split('_')[0] in recid_by_contig_n:
							transcript_id_corr.update({ recid_by_contig_n[rec.id.split('NODE_')[-1].split('_')[0]] : rec.id})

	return aa_comp, transcripts, r2g_lengths, transcript_id_corr
|
||||
|
||||
|
||||
def get_nuc_comp(args, gcodes):
	"""Run CUB codon-usage analysis on every ReadyToGo NTD file.

	Returns {sequence id: CUB per-sequence stats record} for all taxa whose
	10-character prefix has a genetic code assignment in gcodes.
	"""

	print('\nGetting nucleotide composition data from ReadyToGo files...')

	ntd_dir = args.input + '/ReadyToGo/ReadyToGo_NTD'
	nuc_comp = { }
	for fname in tqdm(list(os.listdir(ntd_dir))):
		if fname.endswith('.fasta') and fname[:10] in gcodes:
			# CUB.CalcRefFasta takes the taxon's genetic code, lower-cased.
			per_seq_stats = CUB.CalcRefFasta(ntd_dir + '/' + fname, gcodes[fname[:10]].lower())[0]
			for seq_id in per_seq_stats:
				nuc_comp[seq_id] = per_seq_stats[seq_id]

	return nuc_comp
|
||||
|
||||
|
||||
def per_seq(args, nuc_comp, aa_comp, all_transcripts, r2g_lengths, transcript_id_corr):
	"""Write one per-sequence statistics CSV per taxon into PerSequenceStatSummaries_<date>/."""

	og_mean_lens = hook_lens(args)

	out_dir = args.input + '/PerSequenceStatSummaries_' + today
	if not os.path.isdir(out_dir):
		os.mkdir(out_dir)

	# One output file per distinct 10-character taxon prefix.
	taxa = list(dict.fromkeys([seq[:10] for seq in nuc_comp]))

	for taxon in taxa:
		with open(out_dir + '/' + taxon + '.csv', 'w') as o:
			o.write('Sequence,Taxon,OG,OrigName,OrigLength,ORFLength,AvgLengthOGinHook,AmbiguousCodons,GC-Overall,GC1,GC2,GC3,GC3-Degen,ExpWrightENc,ObsWrightENc_6Fold,ObsWrightENc_No6Fold,ObsWeightedENc_6Fold,ObsWeightedENc_No6Fold,FYMINK,GARP,OtherAA,N_Xs\n')
			for rec in nuc_comp:
				if rec[:10] != taxon:
					continue

				o.write(rec + ',' + rec[:10] + ',' + rec[-10:])

				# Original transcript name/length may be missing when the record
				# could not be matched back to an assembled transcript.
				try:
					o.write(',' + transcript_id_corr[rec] + ',' + str(len(all_transcripts[transcript_id_corr[rec]][1])))
				except KeyError:
					o.write(',NA,NA')

				o.write(',' + str(r2g_lengths[rec]) + ',' + str(round(og_mean_lens[rec[-10:]], 2)))

				# GC and effective-number-of-codons statistics from CUB.
				v = nuc_comp[rec]
				gcs = [str(round(v.gcOverall, 2)), str(round(v.gc1, 2)), str(round(v.gc2, 2)), str(round(v.gc3, 2)), str(round(v.gc4F, 2))]
				ENc = [str(round(v.expENc, 2)), str(round(v.obsENc_6F, 2)), str(round(v.obsENc_No6F, 2)), str(round(v.SunENc_6F, 2)),str(round(v.SunENc_No6F, 2))]
				o.write(',' + ','.join([str(round(v.amb_cdn, 2))] + gcs + ENc))

				o.write(',' + str(round(aa_comp[rec]['FYMINK'], 2)) + ',' + str(round(aa_comp[rec]['GARP'], 2)) + ',' + str(round(aa_comp[rec]['Other'], 2)) + ',' + str(round(aa_comp[rec]['X'], 2)) + '\n')
|
||||
|
||||
|
||||
def per_tax(args, nuc_comp, aa_comp, all_transcripts, r2g_lengths, gcodes):
	"""Write one summary row per taxon (PerTaxonSummary_<date>.csv).

	Each row holds GC/length distributions for the original transcripts and
	GC3/ENc/length distributions for the filtered ReadyToGo sequences, using
	floor-index percentiles over sorted values.
	"""

	taxa = list(dict.fromkeys([seq[:10] for seq in nuc_comp]))

	with open(args.input + '/PerTaxonSummary_' + today + '.csv', 'w') as o:
		o.write('Taxon,OrigSeqs,Orig_MedianGC,Orig_GCWidth_5-95Perc,Orig_MedianLen,Orig_IQRLen,R2GSeqs,R2G_OGs,R2G_MedianGC3,R2G_5Perc_GC3,R2G_95Perc_GC3,R2G_GC3Width_5-95Perc,R2G_MedianENc,IQR_ENcR2G,Median_LenR2G,IQR_LenR2G,GeneticCode\n')

		for taxon in taxa:
			o.write(taxon)

			# ---- Original assembled transcripts for this taxon ----
			transcripts = [all_transcripts[seq][1].upper() for seq in all_transcripts if all_transcripts[seq][0] == taxon]
			o.write(',' + str(len(transcripts)))

			n_orig = len(transcripts)
			transcript_gcs = sorted((t.count('G') + t.count('C')) / len(t) for t in transcripts)
			# Median GC and 5th-95th percentile width.
			o.write(',' + str(round(transcript_gcs[floor(n_orig*0.5)], 2)))
			o.write(',' + str(round(transcript_gcs[floor(n_orig*0.95)] - transcript_gcs[floor(n_orig*0.05)], 2)))

			transcript_lens = sorted(len(t) for t in transcripts)
			o.write(',' + str(transcript_lens[floor(n_orig*0.5)]))
			o.write(',' + str(transcript_lens[floor(n_orig*0.75)] - transcript_lens[floor(n_orig*0.25)]))

			# ---- Filtered ReadyToGo sequences for this taxon ----
			r2g_ntds = [nuc_comp[seq] for seq in nuc_comp if seq[:10] == taxon]
			o.write(',' + str(len(r2g_ntds)))
			r2g_ogs = list(dict.fromkeys([seq[-10:] for seq in nuc_comp if seq[:10] == taxon]))
			o.write(',' + str(len(r2g_ogs)))

			# GC3 at 4-fold degenerate sites: median, 5th/95th percentiles and width.
			r2g_gc3s = sorted(seq.gc4F for seq in r2g_ntds)
			o.write(',' + str(round(r2g_gc3s[floor(len(r2g_ntds)*0.5)], 2)))
			o.write(',' + str(round(r2g_gc3s[floor(len(r2g_gc3s)*0.05)], 2)))
			o.write(',' + str(round(r2g_gc3s[floor(len(r2g_gc3s)*0.95)], 2)))
			o.write(',' + str(round(r2g_gc3s[floor(len(r2g_gc3s)*0.95)] - r2g_gc3s[floor(len(r2g_gc3s)*0.05)], 2)))

			# Observed ENc (6-fold): median and IQR.
			r2g_encs = sorted(seq.obsENc_6F for seq in r2g_ntds)
			o.write(',' + str(round(r2g_encs[floor(len(r2g_encs)*0.5)], 2)))
			o.write(',' + str(round(r2g_encs[floor(len(r2g_encs)*0.75)] - r2g_encs[floor(len(r2g_encs)*0.25)], 2)))

			# ORF lengths: median and IQR.
			tax_r2g_lens = sorted(r2g_lengths[seq] for seq in r2g_lengths if seq[:10] == taxon)
			o.write(',' + str(tax_r2g_lens[floor(len(tax_r2g_lens)*0.5)]))
			o.write(',' + str(tax_r2g_lens[floor(len(tax_r2g_lens)*0.75)] - tax_r2g_lens[floor(len(tax_r2g_lens)*0.25)]))

			o.write(',' + gcodes[taxon] + '\n')
|
||||
|
||||
|
||||
def r2g_jf(args, nuc_comp, gcodes):
	"""Write GC3-filtered ('JF') ReadyToGo files.

	For each taxon, keeps only sequences whose 4-fold-site GC content falls
	strictly between the taxon's 25th and 75th GC3 percentiles, writing
	matching NTD and AA fastas into dated _JF_ folders.
	"""

	#Q: should there be an maximum IQR cutoff at which we do NOT produce a file here?

	ntd_out = args.input + '/ReadyToGo/ReadyToGo_NTD_JF_' + today
	aa_out = args.input + '/ReadyToGo/ReadyToGo_AA_JF_' + today
	if not os.path.isdir(ntd_out):
		os.mkdir(ntd_out)
	if not os.path.isdir(aa_out):
		os.mkdir(aa_out)

	for file in os.listdir(args.input + '/ReadyToGo/ReadyToGo_NTD'):
		if not (file.endswith('.fasta') and file[:10] in gcodes):
			continue
		taxon = file[:10]

		# Per-taxon GC3 distribution defines the IQR window (strict bounds).
		gc3s = sorted(nuc_comp[seq].gc4F for seq in nuc_comp if seq[:10] == taxon)
		lo = gc3s[floor(len(gc3s)*0.25)]
		hi = gc3s[floor(len(gc3s)*0.75)]

		with open(ntd_out + '/' + file.replace('.fasta', '.JF.fasta'), 'w') as o:
			for rec in SeqIO.parse(args.input + '/ReadyToGo/ReadyToGo_NTD/' + file, 'fasta'):
				if lo < nuc_comp[rec.id].gc4F < hi:
					o.write('>' + rec.id + '\n' + str(rec.seq) + '\n\n')

		with open(aa_out + '/' + file.replace('.fasta', '.JF.fasta').replace('NTD', 'AA'), 'w') as o:
			for rec in SeqIO.parse(args.input + '/ReadyToGo/ReadyToGo_AA/' + file.replace('NTD', 'AA'), 'fasta'):
				if lo < nuc_comp[rec.id].gc4F < hi:
					o.write('>' + rec.id + '\n' + str(rec.seq) + '\n\n')
|
||||
|
||||
|
||||
def plot_jf(args, nuc_comp):
	"""Plot per-taxon GC3 (4-fold sites) vs observed ENc against Wright's null curve.

	Saves one PNG per taxon into GC3xENc_Plots_<date>/.

	BUG FIX: 'plt' was undefined because the module-level matplotlib import is
	commented out; calling this function raised NameError. Import matplotlib
	locally instead, so the rest of the script runs without matplotlib installed.
	"""

	import matplotlib.pyplot as plt  # deferred: matplotlib is optional for this script

	if not os.path.isdir(args.input + '/GC3xENc_Plots_' + today):
		os.mkdir(args.input + '/GC3xENc_Plots_' + today)

	taxa = list(dict.fromkeys([rec[:10] for rec in nuc_comp]))

	# Wright's expected-ENc null curve sampled at integer GC3 percentages 0..100.
	gc3_null = list(range(101))
	enc_null = [31, 31.5958, 32.2032, 32.8221, 33.4525, 34.0942, 34.7471, 35.411, 36.0856, 36.7707, 37.4659, 38.1707, 38.8847, 39.6074, 40.3381, 41.0762, 41.8208, 42.5712, 43.3264, 44.0854, 44.8471, 45.6102, 46.3735, 47.1355, 47.8949, 48.65, 49.3991, 50.1406, 50.8725, 51.593, 52.3, 52.9916, 53.6656, 54.32, 54.9525, 55.561, 56.1434, 56.6975, 57.2211, 57.7124, 58.1692, 58.5898, 58.9723, 59.3151, 59.6167, 59.8757, 60.0912, 60.2619, 60.3873, 60.4668, 60.5, 60.4668, 60.3873, 60.2619, 60.0912, 59.8757, 59.6167, 59.3151, 58.9723, 58.5898, 58.1692, 57.7124, 57.2211, 56.6975, 56.1434, 55.561, 54.9525, 54.32, 53.6656, 52.9916, 52.3, 51.593, 50.8725, 50.1406, 49.3991, 48.65, 47.8949, 47.1355, 46.3735, 45.6102, 44.8471, 44.0854, 43.3264, 42.5712, 41.8208, 41.0762, 40.3381, 39.6074, 38.8847, 38.1707, 37.4659, 36.7707, 36.0856, 35.411, 34.7471, 34.0942, 33.4525, 32.8221, 32.2032, 31.5958, 31]

	for taxon in taxa:
		comp_data = [(nuc_comp[rec].gc4F, nuc_comp[rec].obsENc_6F) for rec in nuc_comp if rec[:10] == taxon]

		plt.figure()
		plt.plot(np.array(gc3_null), np.array(enc_null), color = 'black', linewidth=2)
		plt.scatter(np.array([val[0] for val in comp_data]), np.array([val[1] for val in comp_data]), s = 1)
		plt.xlabel("GC content (3rd pos, 4-fold sites)")
		plt.ylabel("Observed Wright ENc (6 Fold)")
		plt.savefig(args.input + '/GC3xENc_Plots_' + today + '/' + taxon + '.png')
|
||||
|
||||
if __name__ == "__main__":
	args = get_args()

	# Genetic codes the CUB module accepts (matched case-insensitively).
	valid_codes = ['bleph','blepharisma','chilo','chilodonella','condy', 'condylostoma','none','eup','euplotes','peritrich','vorticella','ciliate','universal','taa','tag','tga','mesodinium']

	gcodes = { }
	if os.path.isfile(args.input + '/Intermediate/gcode_output.tsv'):
		for line in open(args.input + '/Intermediate/gcode_output.tsv'):
			fields = line.split('\t')
			if len(fields) == 5 and fields[4].strip().lower() in valid_codes:
				gcodes.update({ fields[0] : fields[4].strip() })
			# BUG FIX: the original elif accessed fields[4] without checking the
			# field count, raising IndexError on rows with fewer than 5 columns.
			elif len(fields) >= 5 and fields[4].strip().lower() != '':
				print('\nInvalid genetic code assignment for taxon ' + fields[0] + '. Skipping this taxon in script 8 (summary statistics)\n')
	else:
		print('\nGenetic code assignment file (Output/Intermediate/gcode_output.tsv) not found. Quitting script 8 (summary statistics).\n')
		exit()

	aa_comp, transcripts, r2g_lengths, transcript_id_corr = aa_comp_lengths(args, gcodes)
	nuc_comp = get_nuc_comp(args, gcodes)

	per_tax(args, nuc_comp, aa_comp, transcripts, r2g_lengths, gcodes)
	per_seq(args, nuc_comp, aa_comp, transcripts, r2g_lengths, transcript_id_corr)

	if args.r2g_jf:
		r2g_jf(args, nuc_comp, gcodes)

	#plot_jf(args, nuc_comp)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@ -33,7 +33,11 @@ def script_one(args, ten_digit_codes):
|
||||
os.system('python 1a_TranscriptLengthFilter.py --input_file ' + args.assembled_transcripts + '/' + file + ' --output_file ' + args.output + '/Output/' + file[:10] + ' --minLen ' + str(args.minlen) + ' --maxLen ' + str(args.maxlen) + ' --spades') #SPADES ARGUMENT??
|
||||
|
||||
if args.xplate_contam:
|
||||
os.system('python 1b_CrossPlateContamination.py ' + args.output + '/Output/XlaneBleeding ' + str(args.minlen) + ' ' + args.conspecific_names)
|
||||
if not os.path.isfile(args.conspecific_names):
|
||||
print('\nERROR: If you are running cross-plate contamination, a file designating species assignments is required for the --conspecific_names argument\n')
|
||||
exit()
|
||||
else:
|
||||
os.system('python 1b_CrossPlateContamination.py ' + args.output + '/Output/XlaneBleeding ' + str(args.minlen) + ' ' + args.conspecific_names)
|
||||
|
||||
|
||||
def script_two(args):
|
||||
@ -153,7 +157,7 @@ def script_seven(args):
|
||||
|
||||
for file in os.listdir(args.output + '/Output/ToRename'):
|
||||
if '.AA.ORF.fasta' in file:
|
||||
os.system('python 7_FinalizeName.py --input_file ' + args.output + '/Output/ToRename/' + file + ' --name ' + file[:10])
|
||||
os.system('python 7a_FinalizeName.py --input_file ' + args.output + '/Output/ToRename/' + file + ' --name ' + file[:10])
|
||||
|
||||
os.mkdir(args.output + '/Output/Intermediate')
|
||||
|
||||
@ -161,7 +165,7 @@ def script_seven(args):
|
||||
if file != 'ReadyToGo' and file != 'Intermediate':
|
||||
os.system('mv ' + args.output + '/Output/' + file + ' ' + args.output + '/Output/Intermediate')
|
||||
|
||||
os.system('python 8_SummaryStats.py -i ' + args.output + '/Output -d ' + args.databases)
|
||||
os.system('python 7b_SummaryStats.py -i ' + args.output + '/Output -d ' + args.databases)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user