From 7d54bb74161061e8fa3e3df095a7353a05478042 Mon Sep 17 00:00:00 2001 From: Auden Cote-L'Heureux <52716489+AudenCote@users.noreply.github.com> Date: Mon, 18 Sep 2023 10:06:36 -0400 Subject: [PATCH] New script 6 with revised length filtration --- .../Scripts/6_FilterPartials.py | 698 ++++++++++++++++++ 1 file changed, 698 insertions(+) create mode 100644 PTL1/Transcriptomes/Scripts/6_FilterPartials.py diff --git a/PTL1/Transcriptomes/Scripts/6_FilterPartials.py b/PTL1/Transcriptomes/Scripts/6_FilterPartials.py new file mode 100644 index 0000000..c818c00 --- /dev/null +++ b/PTL1/Transcriptomes/Scripts/6_FilterPartials.py @@ -0,0 +1,698 @@ +#!/usr/bin/env python3.5 + +##__Updated__: 2023-09-18 +##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com; xyrus.maurer-alcala@izb.unibe.ch +##__Usage__: python 6_FilterPartials.py --help + + +################################################################################################## +## This script is intended to remove incomplete transcripts that have a more complete mate ## +## ## +## Prior to running this script, ensure the following: ## +## ## +## 1. You have assembled your transcriptome and COPIED the 'assembly' file ## +## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ## +## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py ## +## 3. Removed SSU/LSU sequences from your Fasta File ## +## 4. Classified your sequences as Strongly Prokaryotic/Eukaryotic or Undetermined ## +## 5. Classified the Non-Strongly Prokaryotic sequences into OGs ## +## 6. You either know (or have inferred) the genetic code of the organism ## +## 7. You have translated the sequences and checked for the data in the RemovePartials folder ## +## ## +## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ## +## ## +## Next Script(s) to Run: ## +## 7_FinalRename.py ## +## ## +################################################################################################## + +from Bio import SeqIO +from Bio.Seq import Seq +from statistics import mean + +from distutils import spawn +import argparse, os, sys, time, re +from argparse import RawTextHelpFormatter,SUPPRESS + + + +#------------------------------ Colors For Print Statements ------------------------------# +class color: + PURPLE = '\033[95m' + CYAN = '\033[96m' + DARKCYAN = '\033[36m' + ORANGE = '\033[38;5;214m' + BLUE = '\033[94m' + GREEN = '\033[92m' + YELLOW = '\033[93m' + RED = '\033[91m' + BOLD = '\033[1m' + UNDERLINE = '\033[4m' + END = '\033[0m' + + +#------------------------------- Main Functions of Script --------------------------------# + +########################################################################################### +###---------------------------- UPDATE DIAMOND PATH BELOW! -----------------------------### +########################################################################################### + ## IF Diamond is IN YOUR PATH then no updating is needed... + +def check_diamond_path(): + + diamond_path = '' + + if diamond_path == '': + diamond_path = spawn.find_executable("diamond") + #diamond_path = '/path/to/diamond' + else: + pass + + if diamond_path == None: + print (color.BOLD + '\n\nPlease open this script and check that you have included'\ + +' the PATH to the'+color.BLUE+' "diamond" '+color.END+color.BOLD+'executable.\n\n'+color.END) + print (color.BOLD+color.BLUE+'LOOK FOR:\n\n'+color.RED\ + +'#------------------------------ UPDATE DIAMOND PATH BELOW! -------------------------------#'\ + +color.BLUE+'\n\nThis is somewhere around lines 50 - 80...\n\n'+color.END) + + sys.exit() + else: + pass + + return diamond_path + +########################################################################################### +###--------------------- Parses and Checks Command-Line Arguments ----------------------### +########################################################################################### + +def check_args(): + + parser = argparse.ArgumentParser(description= + color.BOLD + '\n\nThis script is intended to '+color.RED+'Identify and Collapse '+color.END\ + +color.BOLD+'partial '+color.PURPLE+'ORFS\n'+color.END+color.BOLD+'present within a '\ + +color.RED+'Given'+color.END+color.BOLD+' transcriptome (or replicate) transcriptome(s)'\ + +usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter) + + required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END) + + required_arg_group.add_argument('--file_prefix','-fp', action='store', + help=color.BOLD+color.GREEN+' File prefix that is unique (or common)\n to the files '\ + 'to be processed\n'+color.END) + + + optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END) + + optional_arg_group.add_argument('--identity','-id', type=float, action='store', default=0.98, + help=color.BOLD+color.GREEN+' Identity threshold for identifying \n "partials" to larger'\ + ' contigs\n (default = 0.98)\n'+color.END) + optional_arg_group.add_argument('-author', action='store_true', + help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END) + optional_arg_group.add_argument('--hook_fasta','-f', help='Path to the fasta file of the Hook DB in the Databases/db_OG folder') + + if len(sys.argv[1:]) == 0: + print (parser.description) + print ('\n') + sys.exit() + + args = parser.parse_args() + + args.id_print = str(int(float(args.identity)*100)) + + args.all_output_folder = '/'.join(args.file_prefix.split('/')[:-1]) + '/' + args.file_prefix = args.file_prefix.split('/')[-1] + + args.file_listNTD = [args.all_output_folder + i for i in os.listdir(args.all_output_folder) if args.file_prefix in i and i.endswith('NTD.ORF.fasta')] + + args.file_listAA = [args.all_output_folder + i for i in os.listdir(args.all_output_folder) if args.file_prefix in i and i.endswith('AA.ORF.fasta')] + + args.file_listTSV = [args.all_output_folder + i for i in os.listdir(args.all_output_folder) if args.file_prefix in i and i.endswith('results.tsv')] + + quit_eval = return_more_info(args) + if quit_eval > 0: + print ('\n') + sys.exit() + + return args + + +########################################################################################### +###------------------------------- Script Usage Message --------------------------------### +########################################################################################### + +def usage_msg(): + return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 6_RemovePartials.py'\ + ' --file_prefix Op_me_Xxma'+color.END) + + +########################################################################################## +###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------### +########################################################################################## + +def return_more_info(args): + + valid_arg = 0 + + author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\ + ' maurerax@gmail.com\n\n'+color.END) + + if args.author == True: + print (author) + valid_arg += 1 + + if args.file_listNTD == []: + print (color.BOLD+'\n\nNo '+color.ORANGE+'Nucleotide Fasta Files'+color.END+color.BOLD+\ + ' found!\n\nCheck that your'+color.GREEN+' File Prefix'+color.END+color.BOLD+\ + 'is present in\nthe files of interest') + valid_arg += 1 + + if args.file_listAA == []: + print (color.BOLD+'\n\nNo '+color.ORANGE+'Protein Fasta Files'+color.END+color.BOLD+\ + ' found!\n\nCheck that your'+color.GREEN+' File Prefix'+color.END+color.BOLD+\ + 'is present in\nthe files of interest') + valid_arg += 1 + + if args.file_listTSV == []: + print (color.BOLD+'\n\nNo '+color.ORANGE+'OG-Assignment Spreadsheets'+color.END+color.BOLD+\ + ' found!\n\nCheck that your'+color.GREEN+' File Prefix'+color.END+color.BOLD+\ + 'is present in\nthe files of interest') + valid_arg += 1 + + if len(args.file_listNTD) == len(args.file_listAA) == len(args.file_listTSV): + pass + else: + print (color.BOLD+color.RED+'\n\nError:'+color.END+color.BOLD+' Unequal numbers of'\ + ' input files found.\n\nDouble-check that there are:'+color.CYAN+'SINGLE'+color.END\ + +color.BOLD+' Nucleotide and Protein fasta files and OG-assignment Spreadsheet for'\ + ' each transcriptome\n\nThen try once again.'+color.END) + valid_arg += 1 + + return valid_arg + + +########################################################################################## +###------------------------- Creates Folders For Storing Data -------------------------### +########################################################################################## + +def prep_folders(args): + + if os.path.isdir(args.all_output_folder + 'ToRename') != True: + os.system('mkdir ' + args.all_output_folder + 'ToRename') + + if os.path.isdir(args.all_output_folder + args.file_prefix) != True: + os.system('mkdir ' + args.all_output_folder + args.file_prefix) + + if os.path.isdir(args.all_output_folder + args.file_prefix + '/Original') != True: + os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Original') + os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Original/SpreadSheets') + os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Original/Concatenated/') + os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Original/Concatenated/SpreadSheets') + + if os.path.isdir(args.all_output_folder + args.file_prefix + '/Processed') != True: + os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Processed') + os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Processed/SpreadSheets') + + +########################################################################################## +###-------------------- Merges Fasta Files When Replicates Present --------------------### +########################################################################################## + +def merge_fasta_replicates(args, type): + + cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/' + + count = 0 + fasta_to_merge = [] + + if type == 'NTD': + fasta_list = args.file_listNTD + else: + fasta_list = args.file_listAA + + for file in fasta_list: + fasta_to_merge += ['>'+str(count)+'_'+i for i in open(file).read().split('>') if i != ''] + count += 1 + + with open(cat_folder+args.file_prefix+'.'+type+'.Concatenated.fasta','w+') as w: + w.write(''.join(fasta_to_merge)) + + time.sleep(.75) + + +########################################################################################## +###--------------------- Merges TSV Files When Replicates Present ---------------------### +########################################################################################## + +def merge_tsv_replicates(args): + + cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/SpreadSheets/' + + count = 0 + tsv_to_merge = [] + + for file in args.file_listTSV: + tsv_to_merge += [str(count)+'_'+i for i in open(file).read().split('\n') if i != ''] + count += 1 + + with open(cat_folder+args.file_prefix+'_Concatenated.allOGCleanresults.tsv','w+') as w: + w.write('\n'.join(tsv_to_merge)) + + time.sleep(.75) + + +########################################################################################## +###------------------ Calls on the other Merge Functions by Data Type -----------------### +########################################################################################## + +def merge_relevant_data(args): + + print (color.BOLD+'\n\nMerging Transcriptome data together.'+color.END) + + merge_fasta_replicates(args, 'NTD') + merge_fasta_replicates(args, 'AA') + merge_tsv_replicates(args) + + +########################################################################################## +###------------------- Uses Diamond to perform Self-vs-Self "BLAST" -------------------### +########################################################################################## + +def self_blast(args, diamond_path): + + cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/' + + diamond_makedb = diamond_path + ' makedb --in ' + cat_folder + args.file_prefix + '.AA.Concatenated.fasta -d ' + cat_folder + args.file_prefix + '.AA.Concatenated' + + diamond_self = diamond_path + ' blastp -q ' + cat_folder + args.file_prefix + '.AA.Concatenated.fasta -d ' + cat_folder + args.file_prefix + '.AA.Concatenated --strand plus --no-self-hits --id '+str(args.identity)+\ + ' --query-cover 0.7 --evalue 1e-15 --threads 60 --outfmt 6 -o ' + cat_folder + 'SpreadSheets/' + args.file_prefix + '.Concatenated.Self.'+str(args.id_print)+'ID.tsv' + + print (color.BOLD+'\n\nBinning ALL '+color.ORANGE+'Nucleotide ORFs'+color.END+color.BOLD\ + +' for '+color.GREEN+args.file_prefix+color.END+color.BOLD+' at '+args.id_print\ + +'% identity.\n\n'+color.END) + + os.system(diamond_makedb) + os.system(diamond_self) + + return cat_folder+'SpreadSheets/'+args.file_prefix+'.Concatenated.Self.'+str(args.id_print)+'ID.tsv' + + +########################################################################################## +###------------------- Uses USearch to perform Self-vs-Self "BLAST" -------------------### +########################################################################################## + +def check_Self_vs_Self(tsv_file): + + evaluation = '' + + tsv_in = [i for i in open(tsv_file).read().split('\n') if i != ''] + + if len(tsv_in) == 0: + evaluation = 'empty' + with open(tsv_file,'w+') as w: + w.write('No Self-vs-Self hits were found') + else: + evaluation = 'continue' + + return evaluation + + + +########################################################################################## +###-------------------- Removes Nearly Identical ORFs from Data Set -------------------### +########################################################################################## + +def filter_NTD_data(args, OGLenDB): + + cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/' + proc_folder = args.all_output_folder + args.file_prefix + '/Processed/' + + ########################################## + ## Set-up Useful Lists and Dictionaries ## + ########################################## + + nuc_Above98_hit = {} + seqs_to_toss = [] + prepped_NTD = [] + prepped_AA = [] + + nuc_tsv_100 = 0 + + orf_lens = { rec.id : len(str(rec.seq)) for file in args.file_listNTD for rec in SeqIO.parse(file, 'fasta') } + + replicates = '' + + if len(args.file_listNTD) > 1: + replicates = 'yes' + else: + replicates = 'nope' + + print (color.BOLD+'\n\nRemoving Partial '+color.PURPLE+'ORFs'+color.END+color.BOLD+\ + ' with >'+args.id_print+'% Nucleotide Identity over >70% of\ntheir length when '\ + 'compared to more complete '+color.PURPLE+'ORFs '+color.END+color.BOLD+'from: '\ + +color.CYAN+args.file_prefix+'\n\n'+color.END) + + ##################################################################### + ## Self-v-self BLAST Output Parsing - first checks for Seq-length! ## + ##################################################################### + + nuc_tsv_raw = [i.rstrip('\n') for i in open(cat_folder+'SpreadSheets/'+args.file_prefix\ + +'.Concatenated.Self.'+str(args.id_print)+'ID.tsv').readlines() if i != '\n'] + + too_long = 0 + for line in nuc_tsv_raw: + og_number = re.split('OG.{1}_', line)[-1][:6] + og_prefix = line.split(og_number)[0][-4:] + og = og_prefix + og_number + + if og in OGLenDB.keys(): + if orf_lens['_'.join(line.split('\t')[1].split('_')[1:])] > 4.5*OGLenDB[og] or orf_lens['_'.join(line.split('\t')[1].split('_')[1:])] < 1.5*OGLenDB[og]: + seqs_to_toss.append(line.split('\t')[1]) + too_long += 1 + + nuc_tsv = [line for line in nuc_tsv_raw if line.split('\t')[1] not in seqs_to_toss] + + if len(nuc_tsv) > 0: + if 'Cov' in nuc_tsv[0].split('\t')[0].split('_')[-3]: + nuc_tsv.sort(key=lambda x: (-int(x.split('\t')[1].split('Len')[-1].split('_')[0]),-int(x.split('\t')[1].split('Cov')[-1].split('_')[0]))) + else: + nuc_tsv.sort(key=lambda x: -int(x.split('\t')[1].split('Len')[-1].split('_')[0])) + + for line in nuc_tsv: + if line.split('\t')[1] not in seqs_to_toss: + nuc_Above98_hit.setdefault(line.split('\t')[1],[]).append(line.split('\t')[0]) + seqs_to_toss.append(line.split('\t')[0]) + if line.split('\t')[2] == '100.0': + nuc_tsv_100 += 1 + + seqs_to_toss = list(set(seqs_to_toss)) + inFasta_NTD_rawLen = [i for i in SeqIO.parse(cat_folder+args.file_prefix+'.NTD.Concatenated.fasta', 'fasta') if i.description] + inFasta_NTD = [i for i in inFasta_NTD_rawLen if i.description not in seqs_to_toss] + inFasta_AA = [i for i in SeqIO.parse(cat_folder+args.file_prefix+'.AA.Concatenated.fasta','fasta') if i.description not in seqs_to_toss] + + if replicates != '': + for i in inFasta_NTD: + if i.description not in nuc_Above98_hit.keys(): + prepped_NTD.append('>'+'_'.join(i.description.split('_')[1:])+'_Trans1\n'+str(i.seq)) + else: + Rep_Num = str(len(set([i.description.split('_')[0]]+[j.split('_')[0] for j in nuc_Above98_hit[i.description]]))) + prepped_NTD.append('>'+'_'.join(i.description.split('_')[1:])+'_Trans'+Rep_Num+'\n'+str(i.seq)) + for i in inFasta_AA: + if i.description not in nuc_Above98_hit.keys(): + prepped_AA.append('>'+'_'.join(i.description.split('_')[1:])+'_Trans1\n'+str(i.seq).replace('*','X')) + else: + Rep_Num = str(len(set([i.description.split('_')[0]]+[j.split('_')[0] for j in nuc_Above98_hit[i.description]]))) + prepped_AA.append('>'+'_'.join(i.description.split('_')[1:])+'_Trans'+Rep_Num+'\n'+str(i.seq).replace('*','X')) + else: + for i in inFasta_NTD: + if i.description not in nuc_Above98_hit.keys(): + prepped_NTD.append('>'+i.description+'\n'+str(i.seq)) + else: + prepped_NTD.append('>'+i.description+'\n'+str(i.seq)) + for i in inFasta_AA: + if i.description not in nuc_Above98_hit.keys(): + prepped_AA.append('>'+i.description+'\n'+str(i.seq).replace('*','X')) + else: + prepped_AA.append('>'+i.description+'\n'+str(i.seq).replace('*','X')) + + with open(args.all_output_folder + args.file_prefix + '/'+args.file_prefix+'_SeqPairsAbove98.txt','w+') as w: + for k, v in nuc_Above98_hit.items(): + w.write(k+'\t'+'\t'.join(v)+'\n') + + ############################################################################################ + ## Check for abnormally short and long sequences for the taxon for every Gene Family (OG) ## + ## ## + ## IMPORTANT: As of September 2023, ACL and LAK think that the following filter, which ## + ## filters ORFs by length relative to the length of other ORFs from the taxon, may be ## + ## worth revising/removing in the next version of PhyloToL Part 1. ## + ## ## + ############################################################################################ + + print (color.BOLD+'Removing Abnormally Short (30% length) OR Long (300% length)'\ + +color.PURPLE+' ORFs'+color.END+color.BOLD+'\ncompared to typical '+color.ORANGE+'Gene '\ + 'Family '+color.END+color.BOLD+'member length for: '+color.CYAN+args.file_prefix+'\n\n'+color.END) + + self_OGLenDB={} ## + seqs_to_toss = [] ## + too_long = too_short = 0 ## + + for i in prepped_NTD: + og_number = re.split('OG.{1}_', i.split('\n')[0])[-1][:6] + og_prefix = i.split('\n')[0].split(og_number)[0][-4:] + og = og_prefix + og_number + + self_OGLenDB.setdefault(og,[]).append(len(i.split('\n')[-1])) + + good_NTD_names = [] + for i in prepped_NTD: + og_number = re.split('OG.{1}_', i.split('\n')[0])[-1][:6] + og_prefix = i.split('\n')[0].split(og_number)[0][-4:] + og = og_prefix + og_number + + if (0.3*sum(self_OGLenDB[og])/float(len(self_OGLenDB[og]))) <= len(i.split('\n')[-1]) <= (3*sum(self_OGLenDB[og])/float(len(self_OGLenDB[og]))): + good_NTD_names.append(i.split('\n')[0]) + + good_NTD_seqs = [i for i in prepped_NTD if i.split('\n')[0] in good_NTD_names] + good_AA_seqs = [i for i in prepped_AA if i.split('\n')[0] in good_NTD_names] + + too_short = len(prepped_NTD) - len(good_NTD_names) + + #################################################################### + ## Finalized Outputs are Summarized and Written Out to New Fastas ## + #################################################################### + + print (color.BOLD+'There were '+color.CYAN+str(len(inFasta_NTD_rawLen))+color.END+color.BOLD\ + +color.PURPLE+' ORFs '+color.END+color.BOLD+'originally, with '+color.ORANGE+\ + str(nuc_tsv_100)+color.END+color.BOLD+' Partial '+color.PURPLE+'ORFs'+color.END+\ + color.BOLD+' that\nwere '+color.RED+'100% Identical'+color.END+color.BOLD+' to larger'\ + +color.PURPLE+' ORFs.\n\n'+color.END) + + print(color.BOLD+'Of the '+color.CYAN+str(len(inFasta_NTD_rawLen))+color.END+color.BOLD\ + +' original'+color.PURPLE+' ORFs'+color.END+color.BOLD+', '+color.ORANGE+str(len(set(seqs_to_toss)))+\ + color.END+color.BOLD+' are '+color.PURPLE+'Partial ORFs '+color.END+color.BOLD+'(e.g. '+\ + color.RED+'> '+args.id_print+'%'+color.END+color.BOLD+'\nNUCLEOTIDE identity) to larger'\ + +color.PURPLE+' ORFs'+color.END+color.BOLD+' with '+color.ORANGE+str(too_short+too_long)\ + +color.END+color.BOLD+' additional'+color.PURPLE+' ORFs\n'+color.END+color.BOLD+'that were either '+\ + color.RED+'TOO LONG or SHORT.\n\n'+color.END) + + print (color.BOLD+'Overall, there are '+color.GREEN+str(len(good_NTD_seqs))+' Unique ORFs'\ + +color.END+color.BOLD+' for '+color.CYAN+args.file_prefix+'\n'+color.END) + + with open(proc_folder+args.file_prefix+'_Filtered.Final.NTD.ORF.fasta','w+') as w: + for i in good_NTD_seqs: + w.write(i+'\n') + with open(proc_folder+args.file_prefix+'_Filtered.Final.AA.ORF.fasta','w+') as x: + for i in good_AA_seqs: + x.write(i+'\n') + + return good_NTD_names + + +########################################################################################## +###------------------- Updates SpreadSheet with Update Sequence Names -----------------### +########################################################################################## + +def update_tsv(args, NTD_list_names): + + cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/SpreadSheets/' + proc_folder = args.all_output_folder + args.file_prefix + '/Processed/' + + inTSV = {'_'.join(i.split('\t')[0].split('_')[1:]):'\t'.join(i.split('\t')[1:]) for i in open(cat_folder+\ + args.file_prefix+'_Concatenated.allOGCleanresults.tsv').readlines() if i != '\n'} + + Updated_inTSV = [i.strip('>')+'\t'+inTSV[i.split('_Trans')[0].strip('>')] for i in NTD_list_names] + + with open(proc_folder+'/SpreadSheets/'+args.file_prefix+'_Filtered.Final.allOGCleanresults.tsv','w+') as w: + for line in Updated_inTSV: + w.write(line+'\n') + + +############################################################################################ +## ## +## IMPORTANT: As of September 2023, ACL and LAK think that the following filter, which ## +## filters ORFs by length that do NOT hit other ORFs from the taxon by self-BLAST, may be ## +## worth revising/removing in the next version of PhyloToL Part 1. ## +## ## +############################################################################################ + + +def no_partials_present(args, OGLenDB): + + print (color.BOLD+color.RED+'\n\nWarning:'+color.END+color.BOLD+' No partial sequences'\ + ' were found with > '+str(args.id_print)+'% nucleotide identity.\n\nThe data will still be '\ + 'checked for ORFs that are unexpectedly '+color.ORANGE+'Short'+color.END+color.BOLD+' or'\ + +color.ORANGE+' Long.\n\n'+color.END) + + cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/' + proc_folder = args.all_output_folder + args.file_prefix + '/Processed/' + + NTD_file = cat_folder+args.file_prefix+'.NTD.Concatenated.fasta' + AA_file = cat_folder+args.file_prefix+'.AA.Concatenated.fasta' + TSV_file = cat_folder+'/SpreadSheets/'+args.file_prefix+'_Concatenated.allOGCleanresults.tsv' + + self_OGLenDB = {} + seqs_to_toss = [] + too_long, too_short = 0, 0 + + ## Small changes in this section for Auden (ought to work now) + ## Lists -> Dictionaries and some data curation steps + + inFasta = {i.description:str(i.seq) for i in SeqIO.parse(NTD_file,'fasta')} + + for k,v in inFasta.items(): + og_number = re.split('OG.{1}_', k)[-1][:6] + og_prefix = k.split(og_number)[0][-4:] + og = og_prefix + og_number + + if len(v) >= 4.5*OGLenDB[og]: + seqs_to_toss.append(k) + too_long+= 1 + + prepped_NTD = [i for i in inFasta if i not in seqs_to_toss] + + print (color.BOLD+'Removing Abnormally Short (30% length) OR Long (300% length)'\ + +color.PURPLE+' ORFs'+color.END+color.BOLD+'\ncompared to typical '+color.ORANGE+'Gene '\ + 'Family '+color.END+color.BOLD+'member length for: '+color.CYAN+args.file_prefix+'\n\n'+color.END) + + ## toss those sequences from the sequence dictonary (less headache) + for crap_seq in seqs_to_toss: + del inFasta[crap_seq] + + for k, v in inFasta.items(): + og_number = re.split('OG.{1}_', k)[-1][:6] + og_prefix = k.split(og_number)[0][-4:] + og = og_prefix + og_number + + self_OGLenDB.setdefault(og,[]).append(len(v)) + + self_OGLenDB_Final = {k:sum(v)/len(v) for k, v in self_OGLenDB.items()} + + good_NTD_data = { } + for k, v in inFasta.items(): + og_number = re.split('OG.{1}_', k)[-1][:6] + og_prefix = k.split(og_number)[0][-4:] + og = og_prefix + og_number + + if 0.3*self_OGLenDB_Final[og] <= len(v) <= 3*self_OGLenDB_Final[og]: + good_NTD_data.update({ k : v }) + + good_AA_data = {i.description:str(i.seq) for i in SeqIO.parse(AA_file,'fasta') if i.description in good_NTD_data.keys()} + + good_TSV_data = [i for i in open(cat_folder+'/SpreadSheets/'+args.file_prefix+'_Concatenated.allOGCleanresults.tsv')\ + .read().split('\n') if i != '' and i.split('\t')[0] in good_NTD_data.keys()] + + renamed_TSV_data = [i.split('\t')[0]+'\t'+'\t'.join(i.split('\t')[1:]) for i in good_TSV_data] + + with open(proc_folder+args.file_prefix+'_Filtered.Final.NTD.ORF.fasta','w+') as w: + for k,v in good_NTD_data.items(): + w.write('>'+k+'\n'+v+'\n') + + with open(proc_folder+args.file_prefix+'_Filtered.Final.AA.ORF.fasta','w+') as x: + for k, v in good_AA_data.items(): + x.write('>'+k+'\n'+v+'\n') + + with open(proc_folder+'/SpreadSheets/'+args.file_prefix+'_Filtered.Final.allOGCleanresults.tsv','w+') as y: + y.write('\n'.join(renamed_TSV_data)) + + +############################################################################################ +## ## +## IMPORTANT: The following function was added in September 2023 by ACL and LAK to ## +## replace the pre-Guidance filter that removes sequences <50% and >150% the average ## +## length of the OG in the Hook. It may be worth revisiting in the next version of ## +## PhyloToL part 1. ## +## ## +############################################################################################ + + +def filter_all_by_hook(args, OGLenDB): + + proc_folder = args.all_output_folder + args.file_prefix + '/Processed/' + + remaining_NTD_seqs = { rec.id : str(rec.seq) for rec in SeqIO.parse(proc_folder+args.file_prefix+'_Filtered.Final.NTD.ORF.fasta', 'fasta') } + remaining_AA_seqs = { rec.id : str(rec.seq) for rec in SeqIO.parse(proc_folder+args.file_prefix+'_Filtered.Final.AA.ORF.fasta', 'fasta') } + + short_from_translation = [] + if os.path.isfile('UnexpexctedShortStuffBlameXyrus.txt'): + for line in open('UnexpexctedShortStuffBlameXyrus.txt'): + short_from_translation.append(line.strip()) + + good_NTD_seqs = []; good_AA_seqs = [] + for rec in remaining_NTD_seqs: + og_number = re.split('OG.{1}_', rec)[-1][:6] + og_prefix = rec.split(og_number)[0][-4:] + og = og_prefix + og_number + + if len(remaining_AA_seqs[rec]) <= 1.5*OGLenDB[og] and len(remaining_AA_seqs[rec]) >= 0.33*OGLenDB[og] and rec.split('_Trans')[0] not in short_from_translation: + good_NTD_seqs.append((rec, remaining_NTD_seqs[rec])) + good_AA_seqs.append((rec, remaining_AA_seqs[rec])) + + with open(proc_folder+args.file_prefix+'_Filtered.Final.NTD.ORF.fasta','w') as w: + for rec in good_NTD_seqs: + w.write('>' + rec[0] + '\n' + rec[1] + '\n\n') + + with open(proc_folder+args.file_prefix+'_Filtered.Final.AA.ORF.fasta','w') as x: + for rec in good_AA_seqs: + x.write('>' + rec[0] + '\n' + rec[1] + '\n\n') + + +########################################################################################## +###--------------------- Cleans up the Folder and Moves Final Files -------------------### +########################################################################################## + +def clean_up(args): + + for i in args.file_listNTD: + os.system('mv ' + i + ' ' + args.all_output_folder + args.file_prefix + '/Original/') + os.system('mv ' + i.replace('NTD.ORF.fasta','AA.ORF.fasta') + ' ' + args.all_output_folder + args.file_prefix + '/Original/') + os.system('mv ' + i.split('named')[0]+'named*allOGCleanresults.tsv ' + args.all_output_folder + args.file_prefix + '/Original/SpreadSheets/') + + +########################################################################################### +###-------------------------------- Next Script Message --------------------------------### +########################################################################################### + +def next_script(): + + print(color.BOLD+'\nNext Script is: '+color.GREEN+'6b_update_cov_post_removepartials.py\n\n'+color.END) + + +########################################################################################## +###------------------- Checks Command Line Arguments and Calls Steps ------------------### +########################################################################################## + +def main(): + + diamond_path = check_diamond_path() + + args = check_args() + + prep_folders(args) + + merge_relevant_data(args) + + self_BLAST_out = self_blast(args, diamond_path) + + evaluation = check_Self_vs_Self(self_BLAST_out) + + OGLenDB = {} + for rec in SeqIO.parse(args.hook_fasta, 'fasta'): + if rec.id[-10:] not in OGLenDB: + OGLenDB.update({ rec.id[-10:] : [] }) + + OGLenDB[rec.id[-10:]].append(len(str(rec.seq))) + + for og in OGLenDB: + OGLenDB[og] = mean(OGLenDB[og]) + + if evaluation != 'empty': + NTD_names = filter_NTD_data(args, OGLenDB) + update_tsv(args, NTD_names) + else: + no_partials_present(args, OGLenDB) + + filter_all_by_hook(args, OGLenDB) + + clean_up(args) + + next_script() + +main()