diff --git a/PTL1/Transcriptomes/Scripts/2b_remove_Bact.py b/PTL1/Transcriptomes/Scripts/2b_remove_Bact.py deleted file mode 100644 index a67ee47..0000000 --- a/PTL1/Transcriptomes/Scripts/2b_remove_Bact.py +++ /dev/null @@ -1,410 +0,0 @@ -#!/usr/bin/env python3.5 - -##__Updated__: 18_08_2017 -##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com -##__Usage__: python 2b_remove_Bact.py --help - -########################################################################################## -## This script is intended to identify and isolate SSU/LSU sequences ## -## Prior to running this script, ensure the following: ## -## ## -## 1. You have assembled your transcriptome and COPIED the 'assembly' file ## -## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ## -## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py ## -## 3. Have the Databases set up correctly (e.g. with BLAST or Diamond) and in their ## -## respective folders! See the manual if you need help ## -## 4. Run removeSSU.py on your Fasta file ## -## ## -## COMMAND Example Below ## -## ## -## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ## -## ## -## Next Script(s) to Run: ## -## 3_CountOGsDiamond.py ## -## ## -########################################################################################## - - -import argparse, os, sys -from argparse import RawTextHelpFormatter,SUPPRESS -from distutils import spawn -from Bio import SeqIO - - -#------------------------------ Colors For Print Statements ------------------------------# -class color: - PURPLE = '\033[95m' - CYAN = '\033[96m' - DARKCYAN = '\033[36m' - ORANGE = '\033[38;5;214m' - BLUE = '\033[94m' - GREEN = '\033[92m' - YELLOW = '\033[93m' - RED = '\033[91m' - BOLD = '\033[1m' - UNDERLINE = '\033[4m' - END = '\033[0m' - - -#------------------------------- Main Functions of Script --------------------------------# - -########################################################################################### -###---------------------------- UPDATE DIAMOND PATH BELOW! -----------------------------### -########################################################################################### - ## IF Diamond is IN YOUR PATH then no updating is needed... - -def check_diamond_path(): - - diamond_path = '' - - if diamond_path == '': - diamond_path = spawn.find_executable("diamond") - #diamond_path = '/path/to/diamond' - else: - pass - - if diamond_path == None: - print (color.BOLD + '\n\nPlease open this script and check that you have included'\ - +' the PATH to the'+color.BLUE+' "Diamond" '+color.END+color.BOLD+'executable.\n\n'+color.END) - print (color.BOLD+color.BLUE+'LOOK FOR:\n\n'+color.RED\ - +'#------------------------------ UPDATE DIAMOND PATH BELOW! -------------------------------#'\ - +color.BLUE+'\n\nThis is somewhere around lines 50 - 80...\n\n'+color.END) - - sys.exit() - else: - pass - - return diamond_path - - -########################################################################################### -###--------------------- Parses and Checks Command-Line Arguments ----------------------### -########################################################################################### - -def check_args(): - - parser = argparse.ArgumentParser(description= - color.BOLD + '\nThis script will categorize Contigs as'+color.ORANGE+' STRONGLY '+color.END\ - +color.BOLD+color.RED+'Eukaryotic \nOR Prokaryotic'+color.END+color.BOLD+' using a set of Proteins'\ - ' from diverse\n'+color.ORANGE+'Eukaryotes, Bacteria and Archaea'+color.END\ - +color.BOLD+'.'+color.END+usage_msg(), usage=SUPPRESS,formatter_class=RawTextHelpFormatter) - - required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END) - - required_arg_group.add_argument('--input_file','-in', action='store', - help=color.BOLD+color.GREEN+'Fasta file of Nucleotide sequences (with rRNAs removed)'+color.END) - required_arg_group.add_argument('--databases','-d', action='store', - help=color.BOLD+color.GREEN+"Path to databases"+color.END) - - optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END) - optional_arg_group.add_argument('-author', action='store_true', - help=color.BOLD+color.GREEN+' Print author contact information\n'+color.END) - - if len(sys.argv[1:]) == 0: - print (parser.description) - print ('\n') - sys.exit() - - args = parser.parse_args() - - quit_eval = return_more_info(args) - if quit_eval > 0: - sys.exit() - - return args - - -########################################################################################### -###------------------------------- Script Usage Message --------------------------------### -########################################################################################### - -def usage_msg(): - return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 2b_remove_Bact.py --input_file'\ - ' ../Op_me_Xxma/Op_me_Xxma_NorRNAseqs.fasta'+color.END) - - -########################################################################################## -###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------### -########################################################################################## - -def return_more_info(args): - - valid_arg = 0 - - author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\ - ' maurerax@gmail.com\n\n'+color.END) - - if args.author == True: - print (author) - valid_arg += 1 - - print(args.input_file) - - if args.input_file != None: - if os.path.isfile(args.input_file) != False: - if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])): - print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\ - '('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\ - ' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) - valid_arg += 1 - elif args.input_file.endswith('NorRNAseqs.fasta') != True: - print (color.BOLD+'\n\nInvalid Fasta File! Only Fasta Files that were processed'\ - ' with '+color.GREEN+'2a_remove_rRNA.py '+color.END+color.BOLD+'are valid\n\n'\ - 'However, to bypass that issue, Fasta Files MUST end with '+color.CYAN+\ - '"NorRNAseqs.fas"\n\n'+color.END) - valid_arg += 1 - else: - print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\ - '('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\ - ' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) - valid_arg += 1 - - if os.path.isdir(args.databases + '/db_BvsE') != True: - print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\ - +color.ORANGE+'db_BvsE Folder!\n\n'+color.END+color.BOLD+'Ensure that this folder '\ - 'can be found in the main '+color.ORANGE+'Databases Folder'+color.END+color.BOLD\ - +'\n\nThen try once again.') - valid_arg += 1 - elif os.path.isfile(args.databases + '/db_BvsE/eukout.dmnd') != True: - print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\ - 'Diamond formatted '+color.ORANGE+'Eukaryotic Protein database!\n\n'+color.END+color.BOLD+\ - 'Ensure that it can be found in the '+color.ORANGE+'db_BvsE folder'+color.END+\ - color.BOLD+',\nwhich can be found in the main '+color.ORANGE+'Databases Folder'+\ - color.END+color.BOLD+'\n\nThen try once again.'+color.END) - elif os.path.isfile(args.databases + '/db_BvsE/micout.dmnd') != True: - print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' Cannot find the '\ - 'Diamond formatted '+color.ORANGE+'Bacterial/Archaeal Protein database!\n\n'+color.END+color.BOLD+\ - 'Ensure that it can be found in the '+color.ORANGE+'db_BvsE folder'+color.END+\ - color.BOLD+',\nwhich can be found in the main '+color.ORANGE+'Databases Folder'+\ - color.END+color.BOLD+'\n\nThen try once again.'+color.END) - - valid_arg += 1 - - return valid_arg - - -########################################################################################### -###--------------------------- Does the Inital Folder Prep -----------------------------### -########################################################################################### - -def prep_folders(args): - - BvE_folder = '/'.join(args.input_file.split('/')[:-1]) + '/BvE/' - - if os.path.isdir(BvE_folder) != True: - os.system('mkdir '+BvE_folder) - - -########################################################################################### -###---------------- Runs Diamond on Bact and Euk small RefSeq Databases ----------------### -########################################################################################### - -def ublast_BvE(args, diamond_path): - - BvE_folder = '/'.join(args.input_file.split('/')[:-1]) + '/BvE/' - mic_output = args.input_file.split('/')[-1]+'micresults.' - euk_output = args.input_file.split('/')[-1]+'eukresults.' - - print(color.BOLD+'\n\n"BLAST"-ing against PROK database using DIAMOND: ' + color.DARKCYAN + 'micout.dmnd' + color.END + '\n\n') - - Prok_diamond_cmd = diamond_path + ' blastx -q ' + args.input_file + ' --max-target-seqs 1 -d ' + args.databases + '/db_BvsE/micout.dmnd --evalue 1e-5 --threads 60 --outfmt 6 -o ' + BvE_folder + 'allmicresults.tsv' - - os.system(Prok_diamond_cmd) - - print(color.BOLD+'\n\n"BLAST"-ing against EUK database using DIAMOND: ' + color.DARKCYAN + 'eukout.dmnd' + color.END + '\n\n') - - Euk_diamond_cmd = diamond_path + ' blastx -q ' + args.input_file + ' --max-target-seqs 1 -d ' + args.databases + '/db_BvsE/eukout.dmnd --evalue 1e-5 --threads 60 --outfmt 6 -o ' + BvE_folder + 'alleukresults.tsv' - - os.system(Euk_diamond_cmd) - - - -########################################################################################### -###---------------- Compares Bacterial and Euk Hits for Classification -----------------### -########################################################################################### - -def compare_hits(args): - - BvE_folder = '/'.join(args.input_file.split('/')[:-1]) + '/BvE/' - - EukDict = {} - ProkDict = {} - CompDict = {} - - inFasta = [seq_rec for seq_rec in SeqIO.parse(args.input_file,'fasta')] - - for seq_rec in inFasta: - EukDict[seq_rec.description] = '' - ProkDict[seq_rec.description] = '' - CompDict[seq_rec.description] = [] - - inEukHits = [i for i in open(BvE_folder + 'alleukresults.tsv').readlines()] - inEukHits.sort(key=lambda x: (float(x.split('\t')[-2]), -int(x.split('\t')[3]))) - - inProkHits = [i for i in open(BvE_folder + 'allmicresults.tsv').readlines()] - inProkHits.sort(key=lambda x: (float(x.split('\t')[-2]), -int(x.split('\t')[3]))) - - for i in inEukHits: - if EukDict[i.split('\t')[0]] == '': - EukDict[i.split('\t')[0]] = float(i.split('\t')[-2]) - - for i in inProkHits: - if ProkDict[i.split('\t')[0]] == '': - ProkDict[i.split('\t')[0]] = float(i.split('\t')[-2]) - - for k in CompDict.keys(): - if EukDict[k] != '': - CompDict[k].append(EukDict[k]) - else: - CompDict[k].append('no hit') - if ProkDict[k] != '': - CompDict[k].append(ProkDict[k]) - else: - CompDict[k].append('no hit') - - for k, v in CompDict.items(): - - ### Contigs lacking STRONG Eukaryotic OR Prokaryotic Hits - if v[0] == 'no hit' and v[1] == 'no hit': - CompDict[k].append('UNDETERMINED') - - ### Contigs lacking STRONG Eukaryotic with a Prokaryotic Hit - elif v[0] != 'no hit' and v[1] == 'no hit': - CompDict[k].append('EUKARYOTIC') - - ### Contigs with a Eukaryotic but without a Prokaryotic Hit - elif v[0] == 'no hit' and v[1] != 'no hit': - CompDict[k].append('PROKARYOTIC') - - ### Uses Basic math to determine if contigs with are MORE Eukaryotic than Prokaryotic - else: - try: - prok_euk_ratio = float(v[1])/float(v[0]) - euk_prok_ratio = float(v[0])/float(v[1]) - - if prok_euk_ratio >= 100: - CompDict[k].append('EUKARYOTIC') - - elif euk_prok_ratio >= 1000: - CompDict[k].append('PROKARYOTIC') - - else: - CompDict[k].append('UNDETERMINED') - - except: - CompDict[k].append('divide by zero') - - with open(BvE_folder + 'comparisons.txt','w+') as w: - for k, v in CompDict.items(): - w.write(k+':'+':'.join([str(i) for i in v])+'\n') - - BvE_folder = '/'.join(args.input_file.split('/')[:-1]) + '/BvE/' - BvE_output_base = BvE_folder+args.input_file.split('/')[-1].split('.fas')[0] - -### Gathers the sequences and categorizes them - Euk_Fasta = sorted((seq_rec for seq_rec in inFasta if CompDict[seq_rec.description][-1] == 'EUKARYOTIC'), key=lambda x: -int(len(x.seq))) - Prok_Fasta = sorted((seq_rec for seq_rec in inFasta if CompDict[seq_rec.description][-1] == 'PROKARYOTIC'), key=lambda x: -int(len(x.seq))) - Und_Fasta = sorted((seq_rec for seq_rec in inFasta if CompDict[seq_rec.description][-1] == 'UNDETERMINED'), key=lambda x: -int(len(x.seq))) - Zero_Fasta = sorted((seq_rec for seq_rec in inFasta if CompDict[seq_rec.description][-1] == 'divide by zero'), key=lambda x: -int(len(x.seq))) - - -### Writes out all of the categorized sequences - with open(args.input_file.split('NorRNA')[0] + 'WTA_EPU.fasta', 'w') as epu: - with open(BvE_output_base+'.Not_Bact.fasta','w+') as nb: - for euk_seq in Euk_Fasta: - nb.write('>' + euk_seq.description + '\n' + str(euk_seq.seq) + '\n') - epu.write('>' + euk_seq.description + '_E' + '\n' + str(euk_seq.seq) + '\n') - - - with open(BvE_output_base+'.Bact_Hit.fasta','w+') as pr: - for prok_seq in Prok_Fasta: - pr.write('>' + prok_seq.description + '\n' + str(prok_seq.seq) + '\n') - epu.write('>' + prok_seq.description + '_P' + '\n' + str(prok_seq.seq) + '\n') - - with open(BvE_output_base+'.Undetermined.fasta','w+') as und: - for und_seq in Und_Fasta: - und.write('>' + und_seq.description + '\n' + str(und_seq.seq) + '\n') - epu.write('>' + und_seq.description + '_U' + '\n' + str(und_seq.seq) + '\n') - - if len(Zero_Fasta) != 0: - with open(BvE_output_base+'.DivideByZero.fasta','w+') as w: - for zero_seq in Zero_Fasta: - w.write('>' + zero_seq.description + '\n' + str(zero_seq.seq) + '\n') - epu.write('>' + zero_seq.description + '_U' + '\n' + str(zero_seq.seq) + '\n') - else: - pass - - return str(len(Euk_Fasta)), str(len(Prok_Fasta)), str(len(Und_Fasta)) - - -########################################################################################### -###----------------------- Updates Log of Prok vs Euk Comparisons ----------------------### -########################################################################################### - -def update_log(args, Euk_Contigs, Prok_Contigs, Und_Contigs): - - if os.path.isdir('../PostAssembly_Logs/') != True: - os.system('mkdir ../PostAssembly_Logs/') - else: - pass - - print (color.BOLD +'\n\nThere are '+color.RED+Prok_Contigs+' Strongly Prokaryotic contigs'+color.END\ - +color.BOLD+',\n'+color.ORANGE+Euk_Contigs+' Strongly Eukaryotic contigs'+color.END\ - +color.BOLD+',\nand '+color.PURPLE+Und_Contigs+' Undetermined Contigs\n'+color.END\ - +color.BOLD+'in '+args.input_file.split('/')[-1]+color.END) - - for Logname in os.listdir(os.curdir+'./PostAssembly_Logs/'): - if Logname.startswith(args.input_file.split('/')[-1].split('_No')[0]) and Logname.endswith('Log.txt'): # ACL - ??? - with open('../PostAssembly_Logs/'+Logname,'a') as Logfilename: - Logfilename.write('Prokaryotic Contigs\t'+Prok_Contigs+'\tn/a\tn/a\n') - Logfilename.write('Eukaryotic Contigs\t'+Euk_Contigs+'\tn/a\tn/a\n') - Logfilename.write('Undetermined Contigs\t'+Und_Contigs+'\tn/a\tn/a\n') - - -########################################################################################### -###-------------------------------- Next Script Message --------------------------------### -########################################################################################### - -def next_script(args): - - print (color.BOLD+'\nLook for '+color.DARKCYAN+args.input_file.split('/')[-1]\ - .split('NorRNA')[0]+'WTA_EPU.fasta'+color.END+color.BOLD+' in the '\ - +args.input_file.split('/')[1]+' Folder\n\n' + color.END) - print (color.BOLD + 'Next Script is: ' + color.GREEN + '3_CountOGsDiamond.py\n\n'+ color.END) - - -########################################################################################## -###--------------------- Cleans up the Folder and Moves Final Files -------------------### -########################################################################################## - -def clean_up(args): - - home_folder = '/'.join(args.input_file.split('/')[:-1]) - - os.system('cp '+home_folder+'/*WTA_EPU.fasta '+home_folder+'/BvE/') - os.system('mv '+home_folder+'/*NorRNA*fasta '+home_folder+'/rRNA_Removal/') - - -########################################################################################## -###--------------- Checks Command Line Arguments and Calls on Functions ---------------### -########################################################################################## - -def main(): - - usearch_path = check_diamond_path() - - args = check_args() - - prep_folders(args) - - ublast_BvE(args, usearch_path) - - Euk_Contigs, Prok_Contigs, Und_Contigs = compare_hits(args) - - #update_log(args, Euk_Contigs, Prok_Contigs, Und_Contigs) - - clean_up(args) - - next_script(args) - -main()