Adding CheckSetup.py to the Genomes

This commit is contained in:
Auden Cote-L'Heureux 2023-10-30 17:18:56 -04:00 committed by GitHub
parent fbac4f6b09
commit 72660b742b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 83 additions and 9 deletions

View File

@ -0,0 +1,71 @@
import os, sys, re
from Bio import SeqIO
def check_cds(params):
    """Validate the naming of every file in the CDS input folder.

    Each entry in ``params.cds`` must consist of a ten-character taxon
    identifier followed by ``_GenBankCDS.fasta``. Entries whose name contains
    ``DS_Store`` are ignored. The first badly named entry, or a missing
    folder, prints an error and stops the program.

    Args:
        params: Object exposing a ``cds`` attribute holding the folder path.

    Raises:
        SystemExit: With status 1 when the folder is missing or contains an
            incorrectly named file.
    """
    if not os.path.isdir(params.cds):
        print('\nERROR: CDS folder could not be found. Please ensure the given path is correct.\n')
        # Non-zero status so calling shells/schedulers see the failure.
        sys.exit(1)

    for file_name in os.listdir(params.cds):
        if 'DS_Store' in file_name:
            continue
        # Everything after the ten-character taxon code must be the fixed suffix.
        if file_name[10:] != '_GenBankCDS.fasta':
            print('\nERROR: The file ' + file_name + ' in the given folder of CDS files is incorrectly formatted. The files must start with a ten digit taxon identifier and then be named like Op_me_Hsap_GenBankCDS.fasta\n')
            sys.exit(1)
def _hook_id_is_valid(seq_id):
    """Return True when ``seq_id`` ends with its ten-character gene family ID."""
    try:
        # Up to six characters following the last "OG?_" marker.
        og_number = re.split('OG.{1}_', seq_id)[-1][:6]
        # The four characters right before that number (e.g. "OG6_").
        og_prefix = seq_id.split(og_number)[-2][-4:]
    except (IndexError, ValueError):
        # ValueError: nothing follows the marker, so og_number is '' and
        # str.split rejects an empty separator.
        return False
    return seq_id[-10:] == og_prefix + og_number


def check_databases(params):
    """Validate the layout and contents of the Hook database folder.

    ``params.databases`` must contain a ``db_OG`` folder holding exactly one
    ``.fasta`` file and exactly one ``.dmnd`` file, and every sequence ID in
    the fasta file must end with a gene family identifier such as
    ``OG6_110767``. The first problem found prints an error and stops the
    program. The file-count checks run before the fasta file is parsed so
    that a missing or duplicated file is reported without reading the whole
    database first.

    Args:
        params: Object exposing a ``databases`` attribute holding the folder
            path.

    Raises:
        SystemExit: With status 1 when any check fails.
    """
    if not os.path.isdir(params.databases):
        print('\nERROR: Databases folder could not be found. Please ensure the given path is correct.\n')
        sys.exit(1)

    og_dir = params.databases + '/db_OG'
    if not os.path.isdir(og_dir):
        print('\nERROR: The db_OG folder could not be found in the databases folder.\n')
        sys.exit(1)

    entries = os.listdir(og_dir)
    fasta = [entry for entry in entries if entry.endswith('.fasta')]
    dmnd = [entry for entry in entries if entry.endswith('.dmnd')]

    if not fasta:
        print('\nERROR: No Hook fasta file found in the Databases/db_OG folder\n')
        sys.exit(1)
    if len(fasta) > 1:
        print('\nERROR: More than one Hook fasta file found in the Databases/db_OG folder. Please delete all except for the correct file.\n')
        sys.exit(1)

    if not dmnd:
        print('\nERROR: No Hook Diamond database (.dmnd) file found in the Databases/db_OG folder.\n')
        sys.exit(1)
    if len(dmnd) > 1:
        print('\nERROR: More than one Hook Diamond database (.dmnd) file found in the Databases/db_OG folder. Please delete all except for the correct file.\n')
        sys.exit(1)

    for rec in SeqIO.parse(og_dir + '/' + fasta[0], 'fasta'):
        if not _hook_id_is_valid(rec.id):
            print('\nError: The sequence name ' + rec.id + ' in the given Hook database fasta file is incorrectly formatted. Each sequence ID should start with a ten-digit taxon identifier and end with a ten-digit gene family identifier (which must start with OGX_, with "X" being any digit. E.g. Op_me_Hsap_0_OG6_110767)\n')
            sys.exit(1)
def run(params):
    """Run every setup check in order, announcing the start and the success.

    Each check stops the program itself when it finds a problem, so the
    closing message is printed only if all of them return.

    Args:
        params: Parsed arguments object handed unchanged to each check.
    """
    print('\nChecking the input files and scripts setup...\n')
    for check in (check_cds, check_databases):
        check(params)
    print('\nAll checks passed!\n')

View File

@ -1,5 +1,6 @@
import os, sys, re import os, sys, re
import argparse import argparse
import CheckSetup
def get_args(): def get_args():
@ -22,6 +23,8 @@ def get_args():
def script_one(args, ten_digit_codes): def script_one(args, ten_digit_codes):
CheckSetup.run(args)
for file in os.listdir(args.cds): for file in os.listdir(args.cds):
if file[10:] == '_GenBankCDS.fasta' and file[:10] in ten_digit_codes: if file[10:] == '_GenBankCDS.fasta' and file[:10] in ten_digit_codes:
os.system('python 1_RenameCDS.py -in ' + args.cds + '/' + file + ' -s GenBank -o ' + args.output + '/Output') os.system('python 1_RenameCDS.py -in ' + args.cds + '/' + file + ' -s GenBank -o ' + args.output + '/Output')
@ -125,7 +128,7 @@ def script_five(args):
if os.path.isdir(args.output + '/Output/' + folder): if os.path.isdir(args.output + '/Output/' + folder):
gcode_formatted = gcode_by_folder[folder][0].upper() + gcode_by_folder[folder].lower()[1:] gcode_formatted = gcode_by_folder[folder][0].upper() + gcode_by_folder[folder].lower()[1:]
if os.path.isfile(args.output + '/Output/' + folder + '/' + folder + '_GenBankCDS.Renamed.' + gcode_formatted + '.AA.fasta'): if os.path.isfile(args.output + '/Output/' + folder + '/' + folder + '_GenBankCDS.Renamed.' + gcode_formatted + '.AA.fasta'):
step5_cmd = 'python 5_FinalizeName.py -in ' + args.output + '/Output/' + folder + '/DiamondOG/' + folder + '_GenBankCDS.Renamed.' + gcode_formatted + '.AA.fasta -n ' + folder step5_cmd = 'python 5a_FinalizeName.py -in ' + args.output + '/Output/' + folder + '/DiamondOG/' + folder + '_GenBankCDS.Renamed.' + gcode_formatted + '.AA.fasta -n ' + folder
os.system(step5_cmd) os.system(step5_cmd)
os.mkdir(args.output + '/Output/Intermediate') os.mkdir(args.output + '/Output/Intermediate')
@ -134,7 +137,7 @@ def script_five(args):
if file != 'ReadyToGo' and file != 'Intermediate': if file != 'ReadyToGo' and file != 'Intermediate':
os.system('mv ' + args.output + '/Output/' + file + ' ' + args.output + '/Output/Intermediate') os.system('mv ' + args.output + '/Output/' + file + ' ' + args.output + '/Output/Intermediate')
os.system('python 6_SummaryStats.py -i ' + args.output + '/Output -d ' + args.databases) os.system('python 5b_SummaryStats.py -i ' + args.output + '/Output -d ' + args.databases)
if __name__ == "__main__": if __name__ == "__main__":
@ -143,7 +146,7 @@ if __name__ == "__main__":
if (args.first_script == 1 or args.script == 1) and not os.path.isdir(args.cds): if (args.first_script == 1 or args.script == 1) and not os.path.isdir(args.cds):
print('\nIf starting at the first script, a valid path to a folder of nucleotide CDS files (which must end in .fasta) should be input using the --cds argument') print('\nIf starting at the first script, a valid path to a folder of nucleotide CDS files (which must end in .fasta) should be input using the --cds argument')
quit() exit()
ten_digit_codes = [] ten_digit_codes = []
if args.first_script == 1 or args.script == 1: if args.first_script == 1 or args.script == 1:
@ -153,24 +156,24 @@ if __name__ == "__main__":
else: else:
if not os.path.isdir(args.output + '/Output'): if not os.path.isdir(args.output + '/Output'):
print('\nA folder called "Output" is not found at the given output path. Enter the correct path for --output or start from script 1.\n') print('\nA folder called "Output" is not found at the given output path. Enter the correct path for --output or start from script 1.\n')
quit() exit()
if(len(ten_digit_codes) > len(list(dict.fromkeys(ten_digit_codes)))): if(len(ten_digit_codes) > len(list(dict.fromkeys(ten_digit_codes)))):
print('\nDuplicate 10-digit codes are not allowed. Aborting.\n') print('\nDuplicate 10-digit codes are not allowed. Aborting.\n')
quit() exit()
for code in ten_digit_codes: for code in ten_digit_codes:
for c, char in enumerate(code): for c, char in enumerate(code):
if (c != 2 and c != 5 and char not in 'qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM1234567890') or ((c == 2 or c == 5) and char != '_'): if (c != 2 and c != 5 and char not in 'qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM1234567890') or ((c == 2 or c == 5) and char != '_'):
print('\n' + code + ' is an invalid 10-digit code sample identifier. It must of the format Op_me_hsap (Homo sapiens for example). Please ask for help if this does not make sense.\n') print('\n' + code + ' is an invalid 10-digit code sample identifier. It must of the format Op_me_hsap (Homo sapiens for example). Please ask for help if this does not make sense.\n')
quit() exit()
if os.path.isdir(args.output + '/Output') and (args.first_script == 1 or args.script == 1): if os.path.isdir(args.output + '/Output') and (args.first_script == 1 or args.script == 1):
print('\nAn "Output" folder already exists at the given path. Please delete or rename this folder and try again.\n') print('\nAn "Output" folder already exists at the given path. Please delete or rename this folder and try again.\n')
quit() exit()
elif os.path.isdir(args.output + '/Output/Intermediate'): elif os.path.isdir(args.output + '/Output/Intermediate'):
print('\nIt looks like this run is already complete. Try deleting/renaming the Output folder and try again.\n') print('\nIt looks like this run is already complete. Try deleting/renaming the Output folder and try again.\n')
quit() exit()
elif not os.path.isdir(args.output + '/Output'): elif not os.path.isdir(args.output + '/Output'):
os.mkdir(args.output + '/Output') os.mkdir(args.output + '/Output')
@ -186,7 +189,7 @@ if __name__ == "__main__":
scripts[i + args.first_script](args) scripts[i + args.first_script](args)
else: else:
print('\nInvalid script combination: the first script must be less than the last script. If you want to use only once script, use the --script argument.\n') print('\nInvalid script combination: the first script must be less than the last script. If you want to use only once script, use the --script argument.\n')
quit() exit()
else: else:
if args.script == 1: if args.script == 1:
scripts[args.script](args, ten_digit_codes) scripts[args.script](args, ten_digit_codes)