diff --git a/PTL1/Transcriptomes/Scripts/7_FinalizeName.py b/PTL1/Transcriptomes/Scripts/7_FinalizeName.py deleted file mode 100644 index 677a836..0000000 --- a/PTL1/Transcriptomes/Scripts/7_FinalizeName.py +++ /dev/null @@ -1,398 +0,0 @@ -#!/usr/bin/env python3.5 - -##__Updated__: 31_08_2017 -##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com -##__Usage__: python 6_FilterPartials.py --help - -################################################################################################## -## This script is intended to rename the outputs of the FilterPartials script ## -## to a given 10-character that is used in the Katz lab Phylogenomic Tree building methods ## -## ## -## Prior to r`ning this script, ensure the following: ## -## ## -## 1. You have assembled your transcriptome and COPIED the 'assembly' file ## -## (contigs.fasta, or scaffolds.fasta) to the PostAssembly Folder ## -## 2. Removed small sequences (usually sequences < 300bp) with ContigFilterPlusStats.py ## -## 3. Removed SSU/LSU sequences from your Fasta File ## -## 4. Classified your sequences as Strongly Prokaryotic/Eukaryotic or Undetermined ## -## 5. Classified the Non-Strongly Prokaryotic sequences into OGs ## -## 6. You either know (or have inferred) the genetic code of the organism ## -## 7. You have translated the sequences and checked for the data in the RemovePartials folder ## -## 8. Partial sequences have been removed from the transcriptomic data sets ## -## ## -## COMMAND Example Below ## -## Extra Notes at Bottom of Script ## -## ## -## E-mail Xyrus (author) for help if needed: maurerax@gmail.com ## -## ## -## Next Script(s) to Run: ## -## NONE! You're FINISHED! :D ## -## ## -################################################################################################## - -import argparse, os, sys -from argparse import RawTextHelpFormatter,SUPPRESS - -#----------------------- Solely to Make Print Statements Colorful -----------------------# - -class color: - PURPLE = '\033[95m' - CYAN = '\033[96m' - DARKCYAN = '\033[36m' - ORANGE = '\033[38;5;214m' - BLUE = '\033[94m' - GREEN = '\033[92m' - YELLOW = '\033[93m' - RED = '\033[91m' - BOLD = '\033[1m' - UNDERLINE = '\033[4m' - END = '\033[0m' - - -#------------------------------- Main Functions of Script --------------------------------# - -########################################################################################### -###--------------------- Parses and Checks Command-Line Arguments ----------------------### -########################################################################################### - -def check_args(): - - parser = argparse.ArgumentParser(description= - color.BOLD + '\n\nThis script is intended to '+color.RED+'Rename '+color.END\ - +color.BOLD+'the core set of '+color.PURPLE+'ORFS\n'+color.END+color.BOLD+'with a valid '\ - +color.RED+'10-character code'+color.END+color.BOLD+' for use in the KatzLab\nPhylogenomic Pipeline'\ - +usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter) - - required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END) - - required_arg_group.add_argument('--input_file','-in', action='store', - help=color.BOLD+color.GREEN+' One of the Fasta files that is to be renamed\n'+color.END) - required_arg_group.add_argument('--name','-n', action='store', - help=color.BOLD+color.GREEN+' A valid 10-Character code for updating the data\n'+color.END) - - - optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END) - - optional_arg_group.add_argument('-author', action='store_true', - help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END) - - if len(sys.argv[1:]) == 0: - print (parser.description) - print ('\n') - sys.exit() - - args = parser.parse_args() - - quit_eval = return_more_info(args) - if quit_eval > 0: - print ('\n') - sys.exit() - - args.all_output_folder = '/'.join(args.input_file.split('/')[:-2]) - - if '.allOGCleanresults' in args.input_TSV: - args.out_XML = args.name+'_XX_'+args.input_TSV.split('/')[-1].replace('.allOGCleanresults.','.AA.ORF.')\ - .replace('.tsv','.fasta')+'_1e-10keepall_BlastOutall.oneHit' - else: - args.out_XML = args.name+'_XX_'+args.input_TSV.split('/')[-1].replace('_allOGCleanresults.','_AA.ORF.')\ - .replace('.tsv','.fasta')+'_1e-10keepall_BlastOutall.oneHit' - - args.file_prefix = args.input_file.split('/')[-1].split('_Filtered.Final')[0] - if 'fasta' in args.file_prefix: - args.file_prefix = args.name - - args.r2g_aa = args.all_output_folder + '/ReadyToGo/ReadyToGo_AA/' - args.r2g_ntd = args.all_output_folder + '/ReadyToGo/ReadyToGo_NTD/' - args.r2g_tsv = args.all_output_folder + '/ReadyToGo/ReadyToGo_TSV/' - args.r2g_xml = args.all_output_folder + '/ReadyToGo/ReadyToGo_XML/' - - - return args - - -########################################################################################### -###------------------------------- Script Usage Message --------------------------------### -########################################################################################### - -def usage_msg(): - return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 7_FinalizeName.py'\ - ' --input_file ../ToRename/Op_me_Xxma_Filtered.Final.AA.ORF.fasta --name Op_me_Xxma'+color.END) - - -########################################################################################## -###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------### -########################################################################################## - -def return_more_info(args): - - valid_args = 0 - - author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\ - ' maurerax@gmail.com\n\n'+color.END) - - if args.author == True: - print (author) - valid_args += 1 - - if args.input_file.endswith('AA.ORF.fasta'): - args.input_NTD = args.input_file.replace('AA.ORF.fasta','NTD.ORF.fasta') - args.input_AA = args.input_file -# args.input_TSV = ('/').join(args.input_file.split('/')[:-1])+'/SpreadSheets/'+args.input_file.split('/')[-1].replace('AA.ORF.fasta','allOGCleanresults.tsv') - args.input_TSV = args.input_file.replace('AA.ORF.fasta','allOGCleanresults.tsv') - - elif args.input_file.endswith('NTD.ORF.fasta'): - args.input_NTD = args.input_file - args.input_AA = args.input_file.replace('NTD.ORF.fasta','AA.ORF.fasta') -# args.input_TSV = ('/').join(args.input_file.split('/')[:-1])+'/SpreadSheets/'+args.input_file.split('/')[-1].replace('NTD.ORF.fasta','allOGCleanresults.tsv') - args.input_TSV = args.input_file.replace('AA.ORF.fasta','allOGCleanresults.tsv') - print(args.input_TSV) - - if os.path.isfile(args.input_NTD) != True: - print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Nucleotide '\ - 'Fasta file ('+color.DARKCYAN+args.input_NTD.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\ - ' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) - valid_args += 1 - - if os.path.isfile(args.input_AA) != True: - print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Protein '\ - 'Fasta file ('+color.DARKCYAN+args.input_AA.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\ - ' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) - valid_args += 1 - - if os.path.isfile(args.input_TSV) != True: - print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided TSV '\ - ' file ('+color.DARKCYAN+args.input_TSV.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\ - ' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END) - valid_args += 1 - - return valid_args - -########################################################################################### -###-------------------- Double Checks Format for 10-Character Code ---------------------### -########################################################################################### - -def check_code(args): - - check_name = args.name.split('_') - - if len(args.name) != 10: - print (color.BOLD+'\n\nNew Species Prefix is not 10 characters long\n\n') - print ('Three examples below:\n'+color.CYAN+'\n\tSr_ci_Cunc\n\n\tOp_me_Hsap\n\n\t'\ - 'Am_ar_Ehis\n\n'+color.END) - sys.exit() - - elif args.name.count('_') != 2: - print (color.BOLD+'\n\nCheck the format of your Species Prefix!\n\n') - print ('Three examples below:\n'+color.CYAN+'\n\tSr_ci_Cunc\n\n\tOp_me_Hsap\n\n\t'\ - 'Am_ar_Ehis\n\n'+color.END) - - sys.exit() - - if len(check_name[0]) == 2 and len(check_name[1]) == 2 and len(check_name[2]) == 4: - print (color.BOLD+"\n\nRenaming "+color.ORANGE+args.input_file.split('/')[-1]\ - .split('_Filtered')[0]+color.END+color.BOLD+"'s files with the following 10-character\n"\ - "code: "+color.CYAN+args.name+color.END+'\n') - else: - print (color.BOLD+'\n\nCheck the format of your Species Prefix!\n\n') - print ('Three examples below:\n'+color.CYAN+'\n\tSr_ci_Cunc\n\n\tOp_me_Hsap\n\n\t'\ - 'Am_ar_Ehis\n\n'+color.END) - sys.exit() - - -########################################################################################## -###------------------------- Creates Folders For Storing Data -------------------------### -########################################################################################## - -def prep_folders(args): - - - if os.path.isdir(args.all_output_folder + '/ReadyToGo/') != True: - os.system('mkdir ' + args.all_output_folder + '/ReadyToGo') - - - if os.path.isdir(args.r2g_ntd) != True: - os.system('mkdir ' + args.r2g_ntd) - if os.path.isdir(args.r2g_aa) != True: - os.system('mkdir ' + args.r2g_aa) - if os.path.isdir(args.r2g_tsv) != True: - os.system('mkdir ' + args.r2g_tsv) - if os.path.isdir(args.r2g_xml) != True: - os.system('mkdir ' + args.r2g_xml) - - if os.path.isdir(args.all_output_folder + '/' + args.file_prefix + '/Renamed') != True: - os.system('mkdir ' + args.all_output_folder + '/' + args.file_prefix + '/Renamed') - -########################################################################################### -###----------- Renames the NTD and AA CDSs with the Given 10-Character Code ------------### -########################################################################################### - -def rename_paralogs(args): - - home_folder = args.all_output_folder + '/' + args.file_prefix + '/Renamed/' - - print (color.BOLD+'\nRenaming Translated (Protein) '+color.PURPLE+'ORFs\n'+color.END) - renamed_Final_Prots = open(args.input_AA).read().replace('>','>'+args.name+'_XX_') - - print (color.BOLD+'\nRenaming Nucleotide '+color.PURPLE+'ORFs\n'+color.END) - renamed_Final_Nucs = open(args.input_NTD).read().replace('>','>'+args.name+'_XX_') - - - print (color.BOLD+'\nUpdating CDS Names in the Spreadsheet'+color.END) - if '\n\n' in open(args.input_TSV).read(): - renamed_Final_tsv = args.name+'_XX_'+open(args.input_TSV).read().rstrip('\n')\ - .replace('\n\n','\n'+args.name+'_XX_') - else: - renamed_Final_tsv = args.name+'_XX_'+open(args.input_TSV).read().rstrip('\n')\ - .replace('\n','\n'+args.name+'_XX_') - - with open(home_folder+args.name+'_XX_'+args.input_AA.split('/')[-1],'w+') as w: - w.write(renamed_Final_Prots) - - with open(home_folder+args.name+'_XX_'+args.input_NTD.split('/')[-1],'w+') as x: - x.write(renamed_Final_Nucs) - - - with open(home_folder+args.name+'_XX_'+args.input_TSV.split('/')[-1],'w+') as y: - y.write(renamed_Final_tsv) - - -########################################################################################### -###--------------------------------- Header/Tail Lines ---------------------------------### -########################################################################################### - -def header_tail(): - header = '\n\n'\ - '\n blastp\n BLASTP 2.2.29+\n'\ - ' Stephen F. Altschul, Thomas L. Madden, Alejandro A. Sch&auml;ffer, Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), "Gapped BLAST and PSI-BLAST: a new generation of protein database search programs", Nucleic Acids Res. 25:3389-3402.\n'\ - ' ../OGBlastDB/renamed_aa_seqs_OrthoMCL-5_12653.fasta\n Query_1\n' - - tail = '\n' - return header, tail - - -########################################################################################### -###------------------------------- TSV to XML Conversion -------------------------------### -########################################################################################### - -def convert_TSV_data(args): - - home_folder = args.all_output_folder + '/' + args.file_prefix + '/Renamed/' - - TSVforConvert = home_folder+args.name+'_XX_'+args.input_TSV.split('/')[-1] - - inTSV = [line.rstrip('\n') for line in open(TSVforConvert).readlines() if line != '\n'] - - iterations = [] - - for n in range(len(inTSV)): - if n == 0: - iterations.append(' '+inTSV[n].split('\t')[0]+'\n '+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'\n'\ - ' \n \n BLOSUM62\n 1e-10\n'\ - ' 11\n 1\n F\n'\ - ' \n \n\n\n 1\n Query_1\n'\ - ' '+inTSV[n].split('\t')[0]+'\n '+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'\n'\ - '\n\n 1\n Fake_Entry\n '+inTSV[n].split('\t')[1]+'\n Fake_Accession\n'\ - ' '+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'\n \n \n 1\n 1234\n'\ - ' '+inTSV[n].split('\t')[-1]+'\n '+inTSV[n].split('\t')[-2]+'\n '+inTSV[n].split('\t')[-4]+'\n'\ - ' '+inTSV[n].split('\t')[-3]+'\n '+inTSV[n].split('\t')[-4]+'\n '+inTSV[n].split('\t')[-3]+'\n'\ - ' 0\n 0\n '+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'\n'\ - ' '+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'\n 0\n '+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'\n'\ - ' \n \n \n \n \n\n'\ - '\n\n \n \n 379660\n 197499634\n'\ - ' 123\n 184705217500\n 0.041\n'\ - ' 0.267\n 0.14\n \n \n\n') - else: - iterations.append('\n '+str(n+1)+'\n Query_'+str(n+1)+'\n'\ - ' '+inTSV[n].split('\t')[0]+'\n '+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'\n'\ - '\n\n 1\n Fake_Entry\n '+inTSV[n].split('\t')[1]+'\n Fake_Accession\n'\ - ' '+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])+1))+'\n \n \n 1\n 1234\n'\ - ' '+inTSV[n].split('\t')[-1]+'\n '+inTSV[n].split('\t')[-2]+'\n '+inTSV[n].split('\t')[-4]+'\n'\ - ' '+inTSV[n].split('\t')[-3]+'\n '+inTSV[n].split('\t')[-4]+'\n '+inTSV[n].split('\t')[-3]+'\n'\ - ' 0\n 0\n '+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'\n'\ - ' '+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'\n 0\n '+str(abs(int(inTSV[n].split('\t')[-3])-int(inTSV[n].split('\t')[-4])))+'\n'\ - ' \n \n \n \n \n\n'\ - '\n\n \n \n 379660\n 197499634\n'\ - ' 123\n 184705217500\n 0.041\n'\ - ' 0.267\n 0.14\n \n \n\n') - - return iterations - - -########################################################################################### -###--------------------------- Writes Out the Fake XML File ----------------------------### -########################################################################################### - -def write_Fake_XML(args): - - home_folder = args.all_output_folder + '/' + args.file_prefix + '/' - - print (color.BOLD+'\n\nConverting '+color.ORANGE+args.input_file.split('/')[-1]+color.END\ - +color.BOLD+' to XML format\n'+color.END) - - header, tail = header_tail() - - iterations = convert_TSV_data(args) - - with open(home_folder+args.out_XML,'w+') as w: - w.write(header) - w.write(''.join(iterations)) - w.write(tail) - -########################################################################################## -###-------------------- Cleans up the Folder and Moves Final Files --------------------### -########################################################################################## -def clean_up(args): - - home_folder = args.all_output_folder + '/' + args.file_prefix + '/Renamed/' - - os.system('cp ' + args.all_output_folder + '/' + args.file_prefix+'/'+args.out_XML+' '+args.r2g_xml) - - os.system('cp '+home_folder+'*tsv '+args.r2g_tsv) - - os.system('cp '+home_folder+'*_XX_*AA.ORF.fasta '+args.r2g_aa) - os.system('cp '+home_folder+'*_XX_*NTD.ORF.fasta '+args.r2g_ntd) - - os.system('cp '+home_folder+'*_XX_*tsv ' + args.all_output_folder + '/' + args.file_prefix) - os.system('cp '+home_folder+'*_XX_*AA.ORF.fasta ' + args.all_output_folder + '/' + args.file_prefix) - os.system('cp '+home_folder+'*_XX_*NTD.ORF.fasta ' + args.all_output_folder + '/' + args.file_prefix) - - os.system('rm ' + args.all_output_folder + '/ToRename/*'+args.file_prefix+'*') - - if os.path.isdir(args.all_output_folder + '/Finished/') != True: - os.system('mkdir ' + args.all_output_folder + '/Finished') - - os.system('mv ' + args.all_output_folder + '/' + args.file_prefix + ' ' + args.all_output_folder + '/Finished') - -########################################################################################### -###-------------------------------- Next Script Message --------------------------------### -########################################################################################### - -def next_script(args): - - print (color.BOLD+'\nThere is no next script! The final '+color.ORANGE+args.out_XML\ - .split('_XX')[0]+color.END+color.BOLD+' files can be\nfound in the '+color.RED+\ - args.out_XML.split('_XX_')[-1].split('_Filtered')[0]+color.END+color.BOLD+' and '\ - +color.RED+'ReadyToGo folders'+color.END+color.BOLD+' and are ready\n'\ - 'for the KatzLab Phylogenomic Tree-Building Steps!\n\n'+color.END) - -########################################################################################## -###--------------- Checks Command Line Arguments and Calls on Functions ---------------### -########################################################################################## - -def main(): - - args = check_args() - - check_code(args) - - prep_folders(args) - - rename_paralogs(args) - - write_Fake_XML(args) - - clean_up(args) - - next_script(args) - -main() \ No newline at end of file