mirror of
http://43.156.76.180:8026/YuuMJ/EukPhylo.git
synced 2025-12-27 07:20:25 +08:00
These are PTL1 scripts from ACL on Feb8. NOTE: These are WITHOUT databases..... these are at (Katzlab only) link here https://drive.google.com/drive/folders/10n1GdTJqEOBg1nGA1hjxNHgaNZgmbwb5
217 lines
8.5 KiB
Python
217 lines
8.5 KiB
Python
#!/usr/bin/env python3.5
|
|
|
|
##__Updated__: 19_09_2017
|
|
##__Author__: Xyrus Maurer-Alcala; maurerax@gmail.com
|
|
##__Usage__: python 1g_RenameCDS.py --help
|
|
|
|
|
|
from Bio import SeqIO
|
|
from Bio.SeqUtils import GC
|
|
import argparse, os, sys, time
|
|
from argparse import RawTextHelpFormatter,SUPPRESS
|
|
|
|
|
|
#----------------------------- Colors For Print Statements ------------------------------#
|
|
class color:
|
|
PURPLE = '\033[95m'
|
|
CYAN = '\033[96m'
|
|
DARKCYAN = '\033[36m'
|
|
ORANGE = '\033[38;5;214m'
|
|
BLUE = '\033[94m'
|
|
GREEN = '\033[92m'
|
|
YELLOW = '\033[93m'
|
|
RED = '\033[91m'
|
|
BOLD = '\033[1m'
|
|
UNDERLINE = '\033[4m'
|
|
END = '\033[0m'
|
|
|
|
#------------------------------- Main Functions of Script --------------------------------#
|
|
###########################################################################################
|
|
###--------------------- Parses and Checks Command-Line Arguments ----------------------###
|
|
###########################################################################################
|
|
|
|
def check_args():
|
|
|
|
parser = argparse.ArgumentParser(description=
|
|
color.BOLD + '\n\nThis script is intended to extract '+color.RED+'Annotated '+\
|
|
color.PURPLE+'ORFS\n'+color.END+color.BOLD+'from a provided Genbank formatted file.'\
|
|
+usage_msg(), usage=SUPPRESS, formatter_class=RawTextHelpFormatter)
|
|
|
|
required_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Required Options'+color.END)
|
|
|
|
required_arg_group.add_argument('--input_file','-in', action='store',
|
|
help=color.BOLD+color.GREEN+' Fasta file with CDSs\n'+color.END)
|
|
|
|
required_arg_group.add_argument('--output_dir','-o', action='store',
|
|
help=color.BOLD+color.GREEN+' Output directory\n'+color.END)
|
|
|
|
optional_arg_group = parser.add_argument_group(color.ORANGE+color.BOLD+'Options'+color.END)
|
|
|
|
optional_arg_group.add_argument('--source','-s', action='store', default='GenBank',
|
|
help=color.BOLD+color.GREEN+' Data Source of CDSs (default = "GenBank")\n'+color.END)
|
|
|
|
optional_arg_group.add_argument('--list_source','-lsrc', action='store_true',
|
|
help=color.BOLD+color.GREEN+' Lists supported data sources\n'+color.END)
|
|
|
|
optional_arg_group.add_argument('-author', action='store_true',
|
|
help=color.BOLD+color.GREEN+' Prints author contact information\n'+color.END)
|
|
|
|
|
|
if len(sys.argv[1:]) == 0:
|
|
print (parser.description)
|
|
print ('\n')
|
|
sys.exit()
|
|
|
|
args = parser.parse_args()
|
|
|
|
more_info = return_more_info(args)
|
|
if more_info != None:
|
|
print (parser.description)
|
|
print (more_info)
|
|
sys.exit()
|
|
|
|
args.folder = args.output_dir + '/' + args.input_file.split('/')[-1][:10]
|
|
|
|
return args
|
|
|
|
|
|
###########################################################################################
|
|
###------------------------------- Script Usage Message --------------------------------###
|
|
###########################################################################################
|
|
|
|
def usage_msg():
|
|
return (color.BOLD+color.RED+'\n\nExample usage:'+color.CYAN+' python 1g_RenameCDS.py'\
|
|
' --input_file ../Stentor_coeruleus.WGS.CDS.Prep/Stentor_coeruleus.WGS.CDS.fasta --source'\
|
|
' GenBank'+color.END)
|
|
|
|
|
|
##########################################################################################
|
|
###-------- Storage for LARGE (Annoying) Print Statements for Flagged Options ---------###
|
|
##########################################################################################
|
|
|
|
def return_more_info(args):
|
|
|
|
acceptable_sources = ['in-house', 'in-lab', 'GenBank', 'gb', 'NCBI']
|
|
|
|
author = (color.BOLD+color.ORANGE+'\n\n\tQuestions/Comments? Email Xyrus (author) at'\
|
|
' maurerax@gmail.com\n\n'+color.END)
|
|
|
|
if args.author == True:
|
|
return author
|
|
|
|
if args.list_source == True:
|
|
print (color.BOLD+color.RED+'\nThese are the currently supported data sources.\n'+color.END)
|
|
print (color.BOLD+color.ORANGE+'\n'.join(acceptable_sources)+'\n\n'+color.END)
|
|
sys.exit()
|
|
|
|
if args.source.lower() not in [i.lower() for i in acceptable_sources]:
|
|
print (color.BOLD+color.RED+'\nUnsupported source was provided.\n\nEnsure that '\
|
|
'you are providing a valid data source (see below).\n'+color.END)
|
|
print (color.BOLD+color.ORANGE+'\n'.join(acceptable_sources)+'\n'+color.END)
|
|
sys.exit()
|
|
|
|
if args.input_file != None:
|
|
if args.input_file.split('/')[-1] not in os.listdir('/'.join(args.input_file.split('/')[:-1])):
|
|
print (color.BOLD+color.RED+'\nError:'+color.END+color.BOLD+' The provided Fasta file '\
|
|
'('+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+color.BOLD+')\ndoes not'\
|
|
' exist or is incorrectly formatted.\n\nDouble-check then try again!\n\n'+color.END)
|
|
sys.exit()
|
|
|
|
|
|
###########################################################################################
|
|
###--------------------------- Does the Inital Folder Prep -----------------------------###
|
|
###########################################################################################
|
|
|
|
def prep_folders(args):
|
|
|
|
if os.path.isdir(args.folder) != True:
|
|
os.system('mkdir '+args.folder)
|
|
os.system('cp '+args.input_file+' '+args.folder)
|
|
args.input_file = args.folder+'/'+args.input_file.split('/')[-1]
|
|
|
|
if os.path.isdir(args.folder+'/Original') != True:
|
|
os.system('mkdir '+args.folder+'/Original')
|
|
|
|
os.system('cp '+args.input_file+' '+args.folder+'/Original/')
|
|
|
|
###########################################################################################
|
|
###------------- Renames Protein-Coding CDS Sequences to Standard Format ---------------###
|
|
###########################################################################################
|
|
|
|
def renamed_GenomeCDS(args):
|
|
|
|
print (color.BOLD+'\n\nPrepping to rename '+color.GREEN+args.input_file.split('/')[-1]+\
|
|
color.END+color.BOLD+"'s CDS sequences"+color.END)
|
|
inFasta = sorted((i for i in SeqIO.parse(args.input_file,'fasta')),key=lambda seq_rec: -len(seq_rec.seq))
|
|
|
|
renamed_seqs = []
|
|
seq_code_dict = {}
|
|
|
|
count = 1
|
|
for seq_rec in inFasta:
|
|
seq_code_dict.setdefault(seq_rec.description,[]).append('Contig_'+str(count)+'_Len'+str(len(seq_rec.seq)))
|
|
seq_code_dict[seq_rec.description].append(str(seq_rec.seq).upper())
|
|
renamed_seqs.append('>Contig_' + str(count) + '_Len' + str(len(seq_rec.seq)) + '\n' + str(seq_rec.seq).upper())
|
|
count += 1
|
|
|
|
## keeps only CDSs that are greater than 30 bp (10 AA --> This is a cut-off in the
|
|
## phylogenomic pipeline too!)
|
|
renamed_seqs = [i for i in renamed_seqs if len(i.split('\n')[-1]) > 30]
|
|
|
|
print (color.BOLD+'\n\nFor '+color.DARKCYAN+args.input_file.split('/')[-1]+color.END+\
|
|
color.BOLD+', '+color.RED+str(len(renamed_seqs))+' CDS sequences\n'+color.END+color.BOLD+
|
|
'were renamed while preserving the '+color.ORANGE+args.source+color.END+color.BOLD+' formatting'\
|
|
+color.END+'\n')
|
|
|
|
with open(args.input_file.replace('.fasta','.Prepped.fasta'),'w+') as w:
|
|
w.write('\n'.join(renamed_seqs))
|
|
|
|
with open(args.input_file.split('/')[-1].replace('.fasta','.SeqCodes.tsv'),'w+') as w:
|
|
w.write('Original Name\tNew Name\tSeq Length\t Seq GC\n')
|
|
for k, v in seq_code_dict.items():
|
|
w.write(k+'\t'+v[0]+'\t'+str(len(v[1]))+'\t'+str(GC(v[1]))+'\n')
|
|
|
|
|
|
###########################################################################################
|
|
###--------------------- Cleans up the Folder and Moves Final Files --------------------###
|
|
###########################################################################################
|
|
|
|
def clean_up(args):
|
|
|
|
# os.system('rm '+args.input_file)
|
|
os.system('mv ' + args.input_file.split('/')[-1].replace('.fasta','.SeqCodes.tsv') + ' ' + args.folder + '/Original/')
|
|
os.system('mv ' + args.input_file + ' ' + args.folder + '/Original/')
|
|
|
|
|
|
###########################################################################################
|
|
###-------------------------------- Next Script Message --------------------------------###
|
|
###########################################################################################
|
|
|
|
def next_script(args):
|
|
|
|
print (color.BOLD+'\nLook for '+color.DARKCYAN+args.input_file.split('/')[-1].replace('.fasta','.Renamed.fasta')\
|
|
+'.fasta'+color.END+color.BOLD+'\nin the '+color.ORANGE+args.folder.split('/')[-1]+\
|
|
' Folder\n\n'+color.END+color.BOLD)
|
|
|
|
print ('Next Script(s) are:\n\n'+color.PURPLE+'2g_GCodeEval.py'+color.END+color.BOLD\
|
|
+' (if Genetic Code is '+color.RED+'Unknown'+color.END+color.BOLD+')\n\nOtherwise:\n\n'+\
|
|
color.PURPLE+'3g_GCodeTranslate.py\n\n'+color.END)
|
|
|
|
|
|
##########################################################################################
|
|
###----------------------------- Calls on Above Functions -----------------------------###
|
|
##########################################################################################
|
|
|
|
def main():
|
|
|
|
args = check_args()
|
|
|
|
prep_folders(args)
|
|
|
|
renamed_GenomeCDS(args)
|
|
|
|
clean_up(args)
|
|
|
|
next_script(args)
|
|
|
|
main() |