mirror of
http://43.156.76.180:8026/YuuMJ/EukPhylo.git
synced 2025-12-27 05:20:24 +08:00
Adding CheckSetup.py to the Genomes
This commit is contained in:
parent
fbac4f6b09
commit
72660b742b
71
PTL1/Genomes/Scripts/CheckSetup.py
Normal file
71
PTL1/Genomes/Scripts/CheckSetup.py
Normal file
@ -0,0 +1,71 @@
|
||||
import os, sys, re
|
||||
from Bio import SeqIO
|
||||
|
||||
|
||||
def check_cds(params):
    """Validate the folder of input CDS files before the pipeline starts.

    Every entry in ``params.cds`` whose name does not contain ``DS_Store``
    must consist of a ten-character taxon identifier followed by
    ``_GenBankCDS.fasta`` (for example ``Op_me_Hsap_GenBankCDS.fasta``).

    Args:
        params: Parsed command-line arguments. Only ``params.cds`` (path to
            the folder of CDS files) is read here.

    Raises:
        SystemExit: With status 1, after printing an explanation, when the
            folder does not exist or contains an incorrectly named file.
    """
    if not os.path.isdir(params.cds):
        print('\nERROR: CDS folder could not be found. Please ensure the given path is correct.\n')
        # sys.exit (not the site-provided exit()) so the failure is reported
        # to the calling shell with a non-zero status.
        sys.exit(1)

    for file_name in os.listdir(params.cds):
        # Finder metadata files are tolerated rather than reported.
        if 'DS_Store' in file_name:
            continue

        # Everything after the first ten characters must be the fixed suffix;
        # this also rejects names whose taxon code is shorter or longer.
        if file_name[10:] != '_GenBankCDS.fasta':
            print('\nERROR: The file ' + file_name + ' in the given folder of CDS files is incorrectly formatted. The files must start with a ten digit taxon identifier and then be named like Op_me_Hsap_GenBankCDS.fasta\n')
            sys.exit(1)
|
||||
|
||||
|
||||
def _hook_id_is_valid(seq_id):
    """Return True if *seq_id* ends with a ten-character gene family ID.

    The last ten characters must be ``OG`` + one character + ``_`` followed
    by six characters, as in ``Op_me_Hsap_0_OG6_110767``.
    """
    try:
        # Six characters following the last "OGx_" marker in the ID.
        og_number = re.split(r'OG.{1}_', seq_id)[-1][:6]
        # Four characters immediately preceding that number ("OGx_").
        og_prefix = seq_id.split(og_number)[-2][-4:]
    except (IndexError, ValueError):
        # ValueError: nothing follows the "OGx_" marker (or the ID is empty),
        # so og_number is '' and str.split rejects the empty separator.
        return False
    return seq_id[-10:] == og_prefix + og_number


def check_databases(params):
    """Validate the Hook database folder before the pipeline starts.

    ``params.databases`` must contain a ``db_OG`` folder holding exactly one
    ``.fasta`` file and exactly one ``.dmnd`` file, and every sequence ID in
    the fasta file must end with a gene family identifier accepted by
    ``_hook_id_is_valid``.

    The file-count checks run before the fasta file is parsed so that a
    missing or duplicated file is reported without reading the whole
    database first.

    Args:
        params: Parsed command-line arguments. Only ``params.databases``
            (path to the databases folder) is read here.

    Raises:
        SystemExit: With status 1, after printing an explanation, on the
            first problem found.
    """
    if not os.path.isdir(params.databases):
        print('\nERROR: Databases folder could not be found. Please ensure the given path is correct.\n')
        sys.exit(1)

    og_dir = os.path.join(params.databases, 'db_OG')
    if not os.path.isdir(og_dir):
        print('\nERROR: The db_OG folder could not be found in the databases folder.\n')
        sys.exit(1)

    entries = os.listdir(og_dir)
    fasta = [name for name in entries if name.endswith('.fasta')]
    dmnd = [name for name in entries if name.endswith('.dmnd')]

    if not fasta:
        print('\nERROR: No Hook fasta file found in the Databases/db_OG folder\n')
        sys.exit(1)
    if len(fasta) > 1:
        print('\nERROR: More than one Hook fasta file found in the Databases/db_OG folder. Please delete all except for the correct file.\n')
        sys.exit(1)

    if not dmnd:
        print('\nERROR: No Hook Diamond database (.dmnd) file found in the Databases/db_OG folder.\n')
        sys.exit(1)
    if len(dmnd) > 1:
        print('\nERROR: More than one Hook Diamond database (.dmnd) file found in the Databases/db_OG folder. Please delete all except for the correct file.\n')
        sys.exit(1)

    for rec in SeqIO.parse(os.path.join(og_dir, fasta[0]), 'fasta'):
        if not _hook_id_is_valid(rec.id):
            print('\nError: The sequence name ' + rec.id + ' in the given Hook database fasta file is incorrectly formatted. Each sequence ID should start with a ten-digit taxon identifier and end with a ten-digit gene family identifier (which must start with OGX_, with "X" being any digit. E.g. Op_me_Hsap_0_OG6_110767)\n')
            sys.exit(1)
|
||||
|
||||
|
||||
def run(params):
    """Run every setup check in order, printing a banner before and after.

    Args:
        params: Parsed command-line arguments, passed unchanged to each
            check (``check_cds`` first, then ``check_databases``).

    The success message is only printed if both checks return; each check
    terminates the process itself when it finds a problem.
    """
    print('\nChecking the input files and scripts setup...\n')

    for perform_check in (check_cds, check_databases):
        perform_check(params)

    print('\nAll checks passed!\n')
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
import os, sys, re
|
||||
import argparse
|
||||
import CheckSetup
|
||||
|
||||
|
||||
def get_args():
|
||||
@ -22,6 +23,8 @@ def get_args():
|
||||
|
||||
def script_one(args, ten_digit_codes):
|
||||
|
||||
CheckSetup.run(args)
|
||||
|
||||
for file in os.listdir(args.cds):
|
||||
if file[10:] == '_GenBankCDS.fasta' and file[:10] in ten_digit_codes:
|
||||
os.system('python 1_RenameCDS.py -in ' + args.cds + '/' + file + ' -s GenBank -o ' + args.output + '/Output')
|
||||
@ -125,7 +128,7 @@ def script_five(args):
|
||||
if os.path.isdir(args.output + '/Output/' + folder):
|
||||
gcode_formatted = gcode_by_folder[folder][0].upper() + gcode_by_folder[folder].lower()[1:]
|
||||
if os.path.isfile(args.output + '/Output/' + folder + '/' + folder + '_GenBankCDS.Renamed.' + gcode_formatted + '.AA.fasta'):
|
||||
step5_cmd = 'python 5_FinalizeName.py -in ' + args.output + '/Output/' + folder + '/DiamondOG/' + folder + '_GenBankCDS.Renamed.' + gcode_formatted + '.AA.fasta -n ' + folder
|
||||
step5_cmd = 'python 5a_FinalizeName.py -in ' + args.output + '/Output/' + folder + '/DiamondOG/' + folder + '_GenBankCDS.Renamed.' + gcode_formatted + '.AA.fasta -n ' + folder
|
||||
os.system(step5_cmd)
|
||||
|
||||
os.mkdir(args.output + '/Output/Intermediate')
|
||||
@ -134,7 +137,7 @@ def script_five(args):
|
||||
if file != 'ReadyToGo' and file != 'Intermediate':
|
||||
os.system('mv ' + args.output + '/Output/' + file + ' ' + args.output + '/Output/Intermediate')
|
||||
|
||||
os.system('python 6_SummaryStats.py -i ' + args.output + '/Output -d ' + args.databases)
|
||||
os.system('python 5b_SummaryStats.py -i ' + args.output + '/Output -d ' + args.databases)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
@ -143,7 +146,7 @@ if __name__ == "__main__":
|
||||
|
||||
if (args.first_script == 1 or args.script == 1) and not os.path.isdir(args.cds):
|
||||
print('\nIf starting at the first script, a valid path to a folder of nucleotide CDS files (which must end in .fasta) should be input using the --cds argument')
|
||||
quit()
|
||||
exit()
|
||||
|
||||
ten_digit_codes = []
|
||||
if args.first_script == 1 or args.script == 1:
|
||||
@ -153,24 +156,24 @@ if __name__ == "__main__":
|
||||
else:
|
||||
if not os.path.isdir(args.output + '/Output'):
|
||||
print('\nA folder called "Output" is not found at the given output path. Enter the correct path for --output or start from script 1.\n')
|
||||
quit()
|
||||
exit()
|
||||
|
||||
if(len(ten_digit_codes) > len(list(dict.fromkeys(ten_digit_codes)))):
|
||||
print('\nDuplicate 10-digit codes are not allowed. Aborting.\n')
|
||||
quit()
|
||||
exit()
|
||||
|
||||
for code in ten_digit_codes:
|
||||
for c, char in enumerate(code):
|
||||
if (c != 2 and c != 5 and char not in 'qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM1234567890') or ((c == 2 or c == 5) and char != '_'):
|
||||
print('\n' + code + ' is an invalid 10-digit code sample identifier. It must of the format Op_me_hsap (Homo sapiens for example). Please ask for help if this does not make sense.\n')
|
||||
quit()
|
||||
exit()
|
||||
|
||||
if os.path.isdir(args.output + '/Output') and (args.first_script == 1 or args.script == 1):
|
||||
print('\nAn "Output" folder already exists at the given path. Please delete or rename this folder and try again.\n')
|
||||
quit()
|
||||
exit()
|
||||
elif os.path.isdir(args.output + '/Output/Intermediate'):
|
||||
print('\nIt looks like this run is already complete. Try deleting/renaming the Output folder and try again.\n')
|
||||
quit()
|
||||
exit()
|
||||
elif not os.path.isdir(args.output + '/Output'):
|
||||
os.mkdir(args.output + '/Output')
|
||||
|
||||
@ -186,7 +189,7 @@ if __name__ == "__main__":
|
||||
scripts[i + args.first_script](args)
|
||||
else:
|
||||
print('\nInvalid script combination: the first script must be less than the last script. If you want to use only once script, use the --script argument.\n')
|
||||
quit()
|
||||
exit()
|
||||
else:
|
||||
if args.script == 1:
|
||||
scripts[args.script](args, ten_digit_codes)
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user