From eb3db88688cebc1b7cb36f05966351639aefdc8f Mon Sep 17 00:00:00 2001 From: Auden Cote-L'Heureux <52716489+AudenCote@users.noreply.github.com> Date: Fri, 29 Sep 2023 13:20:05 -0400 Subject: [PATCH] Delete PTL1/Transcriptomes/Scripts/6_FilterPartials.py --- .../Scripts/6_FilterPartials.py | 698 ------------------ 1 file changed, 698 deletions(-) delete mode 100644 PTL1/Transcriptomes/Scripts/6_FilterPartials.py diff --git a/PTL1/Transcriptomes/Scripts/6_FilterPartials.py b/PTL1/Transcriptomes/Scripts/6_FilterPartials.py deleted file mode 100644 index c818c00..0000000 --- a/PTL1/Transcriptomes/Scripts/6_FilterPartials.py +++ /dev/null @@ -1,698 +0,0 @@ -#!/usr/bin/env python3.5 - -##__Updated__: 2023-09-18 -##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com; xyrus.maurer-alcala@izb.unibe.ch -##__Usage__: python 6_FilterPartials.py --help - - -################################################################################################## -## This script is intended to remove incomplete transcripts that have a more complete mate ## -## ## -## Prior to running this script, ensure the following: ## -## ## -## 1. You have assembled your transcriptome and COPIED the 'assembly' file ## -## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ## -## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py ## -## 3. Removed SSU/LSU sequences from your Fasta File ## -## 4. Classified your sequences as Strongly Prokaryotic/Eukaryotic or Undetermined ## -## 5. Classified the Non-Strongly Prokaryotic sequences into OGs ## -## 6. You either know (or have inferred) the genetic code of the organism ## -## 7. You have translated the sequences and checked for the data in the RemovePartials folder ## -## ## -## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ## -## ## -## Next Script(s) to Run: ## -## 7_FinalRename.py ## -## ## -################################################################################################## - -from Bio import SeqIO -from Bio.Seq import Seq -from statistics import mean - -from distutils import spawn -import argparse, os, sys, time, re -from argparse import RawTextHelpFormatter,SUPPRESS - - - -#------------------------------ Colors For Print Statements ------------------------------# -class color: - PURPLE = '\033[95m' - CYAN = '\033[96m' - DARKCYAN = '\033[36m' - ORANGE = '\033[38;5;214m' - BLUE = '\033[94m' - GREEN = '\033[92m' - YELLOW = '\033[93m' - RED = '\033[91m' - BOLD = '\033[1m' - UNDERLINE = '\033[4m' - END = '\033[0m' - - -#------------------------------- Main Functions of Script --------------------------------# - -########################################################################################### -###---------------------------- UPDATE DIAMOND PATH BELOW! -----------------------------### -########################################################################################### - ## IF Diamond is IN YOUR PATH then no updating is needed... - -def check_diamond_path(): - - diamond_path = '' - - if diamond_path == '': - diamond_path = spawn.find_executable("diamond") - #diamond_path = '/path/to/diamond' - else: - pass - - if diamond_path == None: - print (color.BOLD + '\n\nPlease open this script and check that you have included'\ - +' the PATH to the'+color.BLUE+' "diamond" '+color.END+color.BOLD+'executable.\n\n'+color.END) - print (color.BOLD+color.BLUE+'LOOK FOR:\n\n'+color.RED\ - +'#------------------------------ UPDATE DIAMOND PATH BELOW! -------------------------------#'\ - +color.BLUE+'\n\nThis is somewhere around lines 50 - 80...\n\n'+color.END) - - sys.exit() - else: - pass - - return diamond_path - -########################################################################################### -###--------------------- Parses and Checks Command-Line Arguments ----------------------### -########################################################################################### - -def check_args(): - - parser = argparse.ArgumentParser(description= - color.BOLD + '\n\nThis script is intended to '+color.RED+'Identify and Collapse '+color.END\ - +color.BOLD+'partial '+color.PURPLE+'ORFS\n'+color.END+color.BOLD+'present within a '\ - +color.RED+'Given'+color.END+color.BOLD+' transcriptome (or replicate) transcriptome(s)'\ - +usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter) - - required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END) - - required_arg_group.add_argument('--file_prefix','-fp', action='store', - help=color.BOLD+color.GREEN+' File prefix that is unique (or common)\n to the files '\ - 'to be processed\n'+color.END) - - - optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END) - - optional_arg_group.add_argument('--identity','-id', type=float, action='store', default=0.98, - help=color.BOLD+color.GREEN+' Identity threshold for identifying \n "partials" to larger'\ - ' contigs\n (default = 0.98)\n'+color.END) - optional_arg_group.add_argument('-author', action='store_true', - help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END) - optional_arg_group.add_argument('--hook_fasta','-f', help='Path to the fasta file of the Hook DB in the Databases/db_OG folder') - - if len(sys.argv[1:]) == 0: - print (parser.description) - print ('\n') - sys.exit() - - args = parser.parse_args() - - args.id_print = str(int(float(args.identity)*100)) - - args.all_output_folder = '/'.join(args.file_prefix.split('/')[:-1]) + '/' - args.file_prefix = args.file_prefix.split('/')[-1] - - args.file_listNTD = [args.all_output_folder + i for i in os.listdir(args.all_output_folder) if args.file_prefix in i and i.endswith('NTD.ORF.fasta')] - - args.file_listAA = [args.all_output_folder + i for i in os.listdir(args.all_output_folder) if args.file_prefix in i and i.endswith('AA.ORF.fasta')] - - args.file_listTSV = [args.all_output_folder + i for i in os.listdir(args.all_output_folder) if args.file_prefix in i and i.endswith('results.tsv')] - - quit_eval = return_more_info(args) - if quit_eval > 0: - print ('\n') - sys.exit() - - return args - - -########################################################################################### -###------------------------------- Script Usage Message --------------------------------### -########################################################################################### - -def usage_msg(): - return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 6_RemovePartials.py'\ - ' --file_prefix Op_me_Xxma'+color.END) - - -########################################################################################## -###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------### -########################################################################################## - -def return_more_info(args): - - valid_arg = 0 - - author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\ - ' maurerax@gmail.com\n\n'+color.END) - - if args.author == True: - print (author) - valid_arg += 1 - - if args.file_listNTD == []: - print (color.BOLD+'\n\nNo '+color.ORANGE+'Nucleotide Fasta Files'+color.END+color.BOLD+\ - ' found!\n\nCheck that your'+color.GREEN+' File Prefix'+color.END+color.BOLD+\ - 'is present in\nthe files of interest') - valid_arg += 1 - - if args.file_listAA == []: - print (color.BOLD+'\n\nNo '+color.ORANGE+'Protein Fasta Files'+color.END+color.BOLD+\ - ' found!\n\nCheck that your'+color.GREEN+' File Prefix'+color.END+color.BOLD+\ - 'is present in\nthe files of interest') - valid_arg += 1 - - if args.file_listTSV == []: - print (color.BOLD+'\n\nNo '+color.ORANGE+'OG-Assignment Spreadsheets'+color.END+color.BOLD+\ - ' found!\n\nCheck that your'+color.GREEN+' File Prefix'+color.END+color.BOLD+\ - 'is present in\nthe files of interest') - valid_arg += 1 - - if len(args.file_listNTD) == len(args.file_listAA) == len(args.file_listTSV): - pass - else: - print (color.BOLD+color.RED+'\n\nError:'+color.END+color.BOLD+' Unequal numbers of'\ - ' input files found.\n\nDouble-check that there are:'+color.CYAN+'SINGLE'+color.END\ - +color.BOLD+' Nucleotide and Protein fasta files and OG-assignment Spreadsheet for'\ - ' each transcriptome\n\nThen try once again.'+color.END) - valid_arg += 1 - - return valid_arg - - -########################################################################################## -###------------------------- Creates Folders For Storing Data -------------------------### -########################################################################################## - -def prep_folders(args): - - if os.path.isdir(args.all_output_folder + 'ToRename') != True: - os.system('mkdir ' + args.all_output_folder + 'ToRename') - - if os.path.isdir(args.all_output_folder + args.file_prefix) != True: - os.system('mkdir ' + args.all_output_folder + args.file_prefix) - - if os.path.isdir(args.all_output_folder + args.file_prefix + '/Original') != True: - os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Original') - os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Original/SpreadSheets') - os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Original/Concatenated/') - os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Original/Concatenated/SpreadSheets') - - if os.path.isdir(args.all_output_folder + args.file_prefix + '/Processed') != True: - os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Processed') - os.system('mkdir ' + args.all_output_folder + args.file_prefix + '/Processed/SpreadSheets') - - -########################################################################################## -###-------------------- Merges Fasta Files When Replicates Present --------------------### -########################################################################################## - -def merge_fasta_replicates(args, type): - - cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/' - - count = 0 - fasta_to_merge = [] - - if type == 'NTD': - fasta_list = args.file_listNTD - else: - fasta_list = args.file_listAA - - for file in fasta_list: - fasta_to_merge += ['>'+str(count)+'_'+i for i in open(file).read().split('>') if i != ''] - count += 1 - - with open(cat_folder+args.file_prefix+'.'+type+'.Concatenated.fasta','w+') as w: - w.write(''.join(fasta_to_merge)) - - time.sleep(.75) - - -########################################################################################## -###--------------------- Merges TSV Files When Replicates Present ---------------------### -########################################################################################## - -def merge_tsv_replicates(args): - - cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/SpreadSheets/' - - count = 0 - tsv_to_merge = [] - - for file in args.file_listTSV: - tsv_to_merge += [str(count)+'_'+i for i in open(file).read().split('\n') if i != ''] - count += 1 - - with open(cat_folder+args.file_prefix+'_Concatenated.allOGCleanresults.tsv','w+') as w: - w.write('\n'.join(tsv_to_merge)) - - time.sleep(.75) - - -########################################################################################## -###------------------ Calls on the other Merge Functions by Data Type -----------------### -########################################################################################## - -def merge_relevant_data(args): - - print (color.BOLD+'\n\nMerging Transcriptome data together.'+color.END) - - merge_fasta_replicates(args, 'NTD') - merge_fasta_replicates(args, 'AA') - merge_tsv_replicates(args) - - -########################################################################################## -###------------------- Uses Diamond to perform Self-vs-Self "BLAST" -------------------### -########################################################################################## - -def self_blast(args, diamond_path): - - cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/' - - diamond_makedb = diamond_path + ' makedb --in ' + cat_folder + args.file_prefix + '.AA.Concatenated.fasta -d ' + cat_folder + args.file_prefix + '.AA.Concatenated' - - diamond_self = diamond_path + ' blastp -q ' + cat_folder + args.file_prefix + '.AA.Concatenated.fasta -d ' + cat_folder + args.file_prefix + '.AA.Concatenated --strand plus --no-self-hits --id '+str(args.identity)+\ - ' --query-cover 0.7 --evalue 1e-15 --threads 60 --outfmt 6 -o ' + cat_folder + 'SpreadSheets/' + args.file_prefix + '.Concatenated.Self.'+str(args.id_print)+'ID.tsv' - - print (color.BOLD+'\n\nBinning ALL '+color.ORANGE+'Nucleotide ORFs'+color.END+color.BOLD\ - +' for '+color.GREEN+args.file_prefix+color.END+color.BOLD+' at '+args.id_print\ - +'% identity.\n\n'+color.END) - - os.system(diamond_makedb) - os.system(diamond_self) - - return cat_folder+'SpreadSheets/'+args.file_prefix+'.Concatenated.Self.'+str(args.id_print)+'ID.tsv' - - -########################################################################################## -###------------------- Uses USearch to perform Self-vs-Self "BLAST" -------------------### -########################################################################################## - -def check_Self_vs_Self(tsv_file): - - evaluation = '' - - tsv_in = [i for i in open(tsv_file).read().split('\n') if i != ''] - - if len(tsv_in) == 0: - evaluation = 'empty' - with open(tsv_file,'w+') as w: - w.write('No Self-vs-Self hits were found') - else: - evaluation = 'continue' - - return evaluation - - - -########################################################################################## -###-------------------- Removes Nearly Identical ORFs from Data Set -------------------### -########################################################################################## - -def filter_NTD_data(args, OGLenDB): - - cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/' - proc_folder = args.all_output_folder + args.file_prefix + '/Processed/' - - ########################################## - ## Set-up Useful Lists and Dictionaries ## - ########################################## - - nuc_Above98_hit = {} - seqs_to_toss = [] - prepped_NTD = [] - prepped_AA = [] - - nuc_tsv_100 = 0 - - orf_lens = { rec.id : len(str(rec.seq)) for file in args.file_listNTD for rec in SeqIO.parse(file, 'fasta') } - - replicates = '' - - if len(args.file_listNTD) > 1: - replicates = 'yes' - else: - replicates = 'nope' - - print (color.BOLD+'\n\nRemoving Partial '+color.PURPLE+'ORFs'+color.END+color.BOLD+\ - ' with >'+args.id_print+'% Nucleotide Identity over >70% of\ntheir length when '\ - 'compared to more complete '+color.PURPLE+'ORFs '+color.END+color.BOLD+'from: '\ - +color.CYAN+args.file_prefix+'\n\n'+color.END) - - ##################################################################### - ## Self-v-self BLAST Output Parsing - first checks for Seq-length! ## - ##################################################################### - - nuc_tsv_raw = [i.rstrip('\n') for i in open(cat_folder+'SpreadSheets/'+args.file_prefix\ - +'.Concatenated.Self.'+str(args.id_print)+'ID.tsv').readlines() if i != '\n'] - - too_long = 0 - for line in nuc_tsv_raw: - og_number = re.split('OG.{1}_', line)[-1][:6] - og_prefix = line.split(og_number)[0][-4:] - og = og_prefix + og_number - - if og in OGLenDB.keys(): - if orf_lens['_'.join(line.split('\t')[1].split('_')[1:])] > 4.5*OGLenDB[og] or orf_lens['_'.join(line.split('\t')[1].split('_')[1:])] < 1.5*OGLenDB[og]: - seqs_to_toss.append(line.split('\t')[1]) - too_long += 1 - - nuc_tsv = [line for line in nuc_tsv_raw if line.split('\t')[1] not in seqs_to_toss] - - if len(nuc_tsv) > 0: - if 'Cov' in nuc_tsv[0].split('\t')[0].split('_')[-3]: - nuc_tsv.sort(key=lambda x: (-int(x.split('\t')[1].split('Len')[-1].split('_')[0]),-int(x.split('\t')[1].split('Cov')[-1].split('_')[0]))) - else: - nuc_tsv.sort(key=lambda x: -int(x.split('\t')[1].split('Len')[-1].split('_')[0])) - - for line in nuc_tsv: - if line.split('\t')[1] not in seqs_to_toss: - nuc_Above98_hit.setdefault(line.split('\t')[1],[]).append(line.split('\t')[0]) - seqs_to_toss.append(line.split('\t')[0]) - if line.split('\t')[2] == '100.0': - nuc_tsv_100 += 1 - - seqs_to_toss = list(set(seqs_to_toss)) - inFasta_NTD_rawLen = [i for i in SeqIO.parse(cat_folder+args.file_prefix+'.NTD.Concatenated.fasta', 'fasta') if i.description] - inFasta_NTD = [i for i in inFasta_NTD_rawLen if i.description not in seqs_to_toss] - inFasta_AA = [i for i in SeqIO.parse(cat_folder+args.file_prefix+'.AA.Concatenated.fasta','fasta') if i.description not in seqs_to_toss] - - if replicates != '': - for i in inFasta_NTD: - if i.description not in nuc_Above98_hit.keys(): - prepped_NTD.append('>'+'_'.join(i.description.split('_')[1:])+'_Trans1\n'+str(i.seq)) - else: - Rep_Num = str(len(set([i.description.split('_')[0]]+[j.split('_')[0] for j in nuc_Above98_hit[i.description]]))) - prepped_NTD.append('>'+'_'.join(i.description.split('_')[1:])+'_Trans'+Rep_Num+'\n'+str(i.seq)) - for i in inFasta_AA: - if i.description not in nuc_Above98_hit.keys(): - prepped_AA.append('>'+'_'.join(i.description.split('_')[1:])+'_Trans1\n'+str(i.seq).replace('*','X')) - else: - Rep_Num = str(len(set([i.description.split('_')[0]]+[j.split('_')[0] for j in nuc_Above98_hit[i.description]]))) - prepped_AA.append('>'+'_'.join(i.description.split('_')[1:])+'_Trans'+Rep_Num+'\n'+str(i.seq).replace('*','X')) - else: - for i in inFasta_NTD: - if i.description not in nuc_Above98_hit.keys(): - prepped_NTD.append('>'+i.description+'\n'+str(i.seq)) - else: - prepped_NTD.append('>'+i.description+'\n'+str(i.seq)) - for i in inFasta_AA: - if i.description not in nuc_Above98_hit.keys(): - prepped_AA.append('>'+i.description+'\n'+str(i.seq).replace('*','X')) - else: - prepped_AA.append('>'+i.description+'\n'+str(i.seq).replace('*','X')) - - with open(args.all_output_folder + args.file_prefix + '/'+args.file_prefix+'_SeqPairsAbove98.txt','w+') as w: - for k, v in nuc_Above98_hit.items(): - w.write(k+'\t'+'\t'.join(v)+'\n') - - ############################################################################################ - ## Check for abnormally short and long sequences for the taxon for every Gene Family (OG) ## - ## ## - ## IMPORTANT: As of September 2023, ACL and LAK think that the following filter, which ## - ## filters ORFs by length relative to the length of other ORFs from the taxon, may be ## - ## worth revising/removing in the next version of PhyloToL Part 1. ## - ## ## - ############################################################################################ - - print (color.BOLD+'Removing Abnormally Short (30% length) OR Long (300% length)'\ - +color.PURPLE+' ORFs'+color.END+color.BOLD+'\ncompared to typical '+color.ORANGE+'Gene '\ - 'Family '+color.END+color.BOLD+'member length for: '+color.CYAN+args.file_prefix+'\n\n'+color.END) - - self_OGLenDB={} ## - seqs_to_toss = [] ## - too_long = too_short = 0 ## - - for i in prepped_NTD: - og_number = re.split('OG.{1}_', i.split('\n')[0])[-1][:6] - og_prefix = i.split('\n')[0].split(og_number)[0][-4:] - og = og_prefix + og_number - - self_OGLenDB.setdefault(og,[]).append(len(i.split('\n')[-1])) - - good_NTD_names = [] - for i in prepped_NTD: - og_number = re.split('OG.{1}_', i.split('\n')[0])[-1][:6] - og_prefix = i.split('\n')[0].split(og_number)[0][-4:] - og = og_prefix + og_number - - if (0.3*sum(self_OGLenDB[og])/float(len(self_OGLenDB[og]))) <= len(i.split('\n')[-1]) <= (3*sum(self_OGLenDB[og])/float(len(self_OGLenDB[og]))): - good_NTD_names.append(i.split('\n')[0]) - - good_NTD_seqs = [i for i in prepped_NTD if i.split('\n')[0] in good_NTD_names] - good_AA_seqs = [i for i in prepped_AA if i.split('\n')[0] in good_NTD_names] - - too_short = len(prepped_NTD) - len(good_NTD_names) - - #################################################################### - ## Finalized Outputs are Summarized and Written Out to New Fastas ## - #################################################################### - - print (color.BOLD+'There were '+color.CYAN+str(len(inFasta_NTD_rawLen))+color.END+color.BOLD\ - +color.PURPLE+' ORFs '+color.END+color.BOLD+'originally, with '+color.ORANGE+\ - str(nuc_tsv_100)+color.END+color.BOLD+' Partial '+color.PURPLE+'ORFs'+color.END+\ - color.BOLD+' that\nwere '+color.RED+'100% Identical'+color.END+color.BOLD+' to larger'\ - +color.PURPLE+' ORFs.\n\n'+color.END) - - print(color.BOLD+'Of the '+color.CYAN+str(len(inFasta_NTD_rawLen))+color.END+color.BOLD\ - +' original'+color.PURPLE+' ORFs'+color.END+color.BOLD+', '+color.ORANGE+str(len(set(seqs_to_toss)))+\ - color.END+color.BOLD+' are '+color.PURPLE+'Partial ORFs '+color.END+color.BOLD+'(e.g. '+\ - color.RED+'> '+args.id_print+'%'+color.END+color.BOLD+'\nNUCLEOTIDE identity) to larger'\ - +color.PURPLE+' ORFs'+color.END+color.BOLD+' with '+color.ORANGE+str(too_short+too_long)\ - +color.END+color.BOLD+' additional'+color.PURPLE+' ORFs\n'+color.END+color.BOLD+'that were either '+\ - color.RED+'TOO LONG or SHORT.\n\n'+color.END) - - print (color.BOLD+'Overall, there are '+color.GREEN+str(len(good_NTD_seqs))+' Unique ORFs'\ - +color.END+color.BOLD+' for '+color.CYAN+args.file_prefix+'\n'+color.END) - - with open(proc_folder+args.file_prefix+'_Filtered.Final.NTD.ORF.fasta','w+') as w: - for i in good_NTD_seqs: - w.write(i+'\n') - with open(proc_folder+args.file_prefix+'_Filtered.Final.AA.ORF.fasta','w+') as x: - for i in good_AA_seqs: - x.write(i+'\n') - - return good_NTD_names - - -########################################################################################## -###------------------- Updates SpreadSheet with Update Sequence Names -----------------### -########################################################################################## - -def update_tsv(args, NTD_list_names): - - cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/SpreadSheets/' - proc_folder = args.all_output_folder + args.file_prefix + '/Processed/' - - inTSV = {'_'.join(i.split('\t')[0].split('_')[1:]):'\t'.join(i.split('\t')[1:]) for i in open(cat_folder+\ - args.file_prefix+'_Concatenated.allOGCleanresults.tsv').readlines() if i != '\n'} - - Updated_inTSV = [i.strip('>')+'\t'+inTSV[i.split('_Trans')[0].strip('>')] for i in NTD_list_names] - - with open(proc_folder+'/SpreadSheets/'+args.file_prefix+'_Filtered.Final.allOGCleanresults.tsv','w+') as w: - for line in Updated_inTSV: - w.write(line+'\n') - - -############################################################################################ -## ## -## IMPORTANT: As of September 2023, ACL and LAK think that the following filter, which ## -## filters ORFs by length that do NOT hit other ORFs from the taxon by self-BLAST, may be ## -## worth revising/removing in the next version of PhyloToL Part 1. ## -## ## -############################################################################################ - - -def no_partials_present(args, OGLenDB): - - print (color.BOLD+color.RED+'\n\nWarning:'+color.END+color.BOLD+' No partial sequences'\ - ' were found with > '+str(args.id_print)+'% nucleotide identity.\n\nThe data will still be '\ - 'checked for ORFs that are unexpectedly '+color.ORANGE+'Short'+color.END+color.BOLD+' or'\ - +color.ORANGE+' Long.\n\n'+color.END) - - cat_folder = args.all_output_folder + args.file_prefix + '/Original/Concatenated/' - proc_folder = args.all_output_folder + args.file_prefix + '/Processed/' - - NTD_file = cat_folder+args.file_prefix+'.NTD.Concatenated.fasta' - AA_file = cat_folder+args.file_prefix+'.AA.Concatenated.fasta' - TSV_file = cat_folder+'/SpreadSheets/'+args.file_prefix+'_Concatenated.allOGCleanresults.tsv' - - self_OGLenDB = {} - seqs_to_toss = [] - too_long, too_short = 0, 0 - - ## Small changes in this section for Auden (ought to work now) - ## Lists -> Dictionaries and some data curation steps - - inFasta = {i.description:str(i.seq) for i in SeqIO.parse(NTD_file,'fasta')} - - for k,v in inFasta.items(): - og_number = re.split('OG.{1}_', k)[-1][:6] - og_prefix = k.split(og_number)[0][-4:] - og = og_prefix + og_number - - if len(v) >= 4.5*OGLenDB[og]: - seqs_to_toss.append(k) - too_long+= 1 - - prepped_NTD = [i for i in inFasta if i not in seqs_to_toss] - - print (color.BOLD+'Removing Abnormally Short (30% length) OR Long (300% length)'\ - +color.PURPLE+' ORFs'+color.END+color.BOLD+'\ncompared to typical '+color.ORANGE+'Gene '\ - 'Family '+color.END+color.BOLD+'member length for: '+color.CYAN+args.file_prefix+'\n\n'+color.END) - - ## toss those sequences from the sequence dictonary (less headache) - for crap_seq in seqs_to_toss: - del inFasta[crap_seq] - - for k, v in inFasta.items(): - og_number = re.split('OG.{1}_', k)[-1][:6] - og_prefix = k.split(og_number)[0][-4:] - og = og_prefix + og_number - - self_OGLenDB.setdefault(og,[]).append(len(v)) - - self_OGLenDB_Final = {k:sum(v)/len(v) for k, v in self_OGLenDB.items()} - - good_NTD_data = { } - for k, v in inFasta.items(): - og_number = re.split('OG.{1}_', k)[-1][:6] - og_prefix = k.split(og_number)[0][-4:] - og = og_prefix + og_number - - if 0.3*self_OGLenDB_Final[og] <= len(v) <= 3*self_OGLenDB_Final[og]: - good_NTD_data.update({ k : v }) - - good_AA_data = {i.description:str(i.seq) for i in SeqIO.parse(AA_file,'fasta') if i.description in good_NTD_data.keys()} - - good_TSV_data = [i for i in open(cat_folder+'/SpreadSheets/'+args.file_prefix+'_Concatenated.allOGCleanresults.tsv')\ - .read().split('\n') if i != '' and i.split('\t')[0] in good_NTD_data.keys()] - - renamed_TSV_data = [i.split('\t')[0]+'\t'+'\t'.join(i.split('\t')[1:]) for i in good_TSV_data] - - with open(proc_folder+args.file_prefix+'_Filtered.Final.NTD.ORF.fasta','w+') as w: - for k,v in good_NTD_data.items(): - w.write('>'+k+'\n'+v+'\n') - - with open(proc_folder+args.file_prefix+'_Filtered.Final.AA.ORF.fasta','w+') as x: - for k, v in good_AA_data.items(): - x.write('>'+k+'\n'+v+'\n') - - with open(proc_folder+'/SpreadSheets/'+args.file_prefix+'_Filtered.Final.allOGCleanresults.tsv','w+') as y: - y.write('\n'.join(renamed_TSV_data)) - - -############################################################################################ -## ## -## IMPORTANT: The following function was added in September 2023 by ACL and LAK to ## -## replace the pre-Guidance filter that removes sequences <50% and >150% the average ## -## length of the OG in the Hook. It may be worth revisiting in the next version of ## -## PhyloToL part 1. ## -## ## -############################################################################################ - - -def filter_all_by_hook(args, OGLenDB): - - proc_folder = args.all_output_folder + args.file_prefix + '/Processed/' - - remaining_NTD_seqs = { rec.id : str(rec.seq) for rec in SeqIO.parse(proc_folder+args.file_prefix+'_Filtered.Final.NTD.ORF.fasta', 'fasta') } - remaining_AA_seqs = { rec.id : str(rec.seq) for rec in SeqIO.parse(proc_folder+args.file_prefix+'_Filtered.Final.AA.ORF.fasta', 'fasta') } - - short_from_translation = [] - if os.path.isfile('UnexpexctedShortStuffBlameXyrus.txt'): - for line in open('UnexpexctedShortStuffBlameXyrus.txt'): - short_from_translation.append(line.strip()) - - good_NTD_seqs = []; good_AA_seqs = [] - for rec in remaining_NTD_seqs: - og_number = re.split('OG.{1}_', rec)[-1][:6] - og_prefix = rec.split(og_number)[0][-4:] - og = og_prefix + og_number - - if len(remaining_AA_seqs[rec]) <= 1.5*OGLenDB[og] and len(remaining_AA_seqs[rec]) >= 0.33*OGLenDB[og] and rec.split('_Trans')[0] not in short_from_translation: - good_NTD_seqs.append((rec, remaining_NTD_seqs[rec])) - good_AA_seqs.append((rec, remaining_AA_seqs[rec])) - - with open(proc_folder+args.file_prefix+'_Filtered.Final.NTD.ORF.fasta','w') as w: - for rec in good_NTD_seqs: - w.write('>' + rec[0] + '\n' + rec[1] + '\n\n') - - with open(proc_folder+args.file_prefix+'_Filtered.Final.AA.ORF.fasta','w') as x: - for rec in good_AA_seqs: - x.write('>' + rec[0] + '\n' + rec[1] + '\n\n') - - -########################################################################################## -###--------------------- Cleans up the Folder and Moves Final Files -------------------### -########################################################################################## - -def clean_up(args): - - for i in args.file_listNTD: - os.system('mv ' + i + ' ' + args.all_output_folder + args.file_prefix + '/Original/') - os.system('mv ' + i.replace('NTD.ORF.fasta','AA.ORF.fasta') + ' ' + args.all_output_folder + args.file_prefix + '/Original/') - os.system('mv ' + i.split('named')[0]+'named*allOGCleanresults.tsv ' + args.all_output_folder + args.file_prefix + '/Original/SpreadSheets/') - - -########################################################################################### -###-------------------------------- Next Script Message --------------------------------### -########################################################################################### - -def next_script(): - - print(color.BOLD+'\nNext Script is: '+color.GREEN+'6b_update_cov_post_removepartials.py\n\n'+color.END) - - -########################################################################################## -###------------------- Checks Command Line Arguments and Calls Steps ------------------### -########################################################################################## - -def main(): - - diamond_path = check_diamond_path() - - args = check_args() - - prep_folders(args) - - merge_relevant_data(args) - - self_BLAST_out = self_blast(args, diamond_path) - - evaluation = check_Self_vs_Self(self_BLAST_out) - - OGLenDB = {} - for rec in SeqIO.parse(args.hook_fasta, 'fasta'): - if rec.id[-10:] not in OGLenDB: - OGLenDB.update({ rec.id[-10:] : [] }) - - OGLenDB[rec.id[-10:]].append(len(str(rec.seq))) - - for og in OGLenDB: - OGLenDB[og] = mean(OGLenDB[og]) - - if evaluation != 'empty': - NTD_names = filter_NTD_data(args, OGLenDB) - update_tsv(args, NTD_names) - else: - no_partials_present(args, OGLenDB) - - filter_all_by_hook(args, OGLenDB) - - clean_up(args) - - next_script() - -main()