mirror of
http://43.156.76.180:8026/YuuMJ/EukPhylo.git
synced 2025-12-27 12:10:25 +08:00
Updated preguidance with no logger dependency
This commit is contained in:
parent
a15ba2402d
commit
b466a30a96
@ -1,5 +1,4 @@
|
|||||||
import os, sys, re
|
import os, sys, re
|
||||||
from logger import Logger
|
|
||||||
from Bio import SeqIO
|
from Bio import SeqIO
|
||||||
|
|
||||||
|
|
||||||
@ -8,53 +7,71 @@ def run(params):
|
|||||||
try:
|
try:
|
||||||
ogs = list(dict.fromkeys([line.strip() for line in open(params.gf_list)]))
|
ogs = list(dict.fromkeys([line.strip() for line in open(params.gf_list)]))
|
||||||
except (FileNotFoundError, TypeError) as e:
|
except (FileNotFoundError, TypeError) as e:
|
||||||
Logger.Error('Unable to read GF list file. Please make sure that the path is correct and that the file is formatted correctly.\n\n' + str(e))
|
print('\nERROR: Unable to read GF list file. Please make sure that the path is correct and that the file is formatted correctly.\n\n' + str(e) + '\n')
|
||||||
|
exit()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
taxa = list(dict.fromkeys([line.strip() for line in open(params.taxon_list)]))
|
taxa = list(dict.fromkeys([line.strip() for line in open(params.taxon_list)]))
|
||||||
except (FileNotFoundError, TypeError) as e:
|
except (FileNotFoundError, TypeError) as e:
|
||||||
Logger.Error('Unable to read taxon list file. Please make sure that the path is correct and that the file is formatted correctly.\n\n' + str(e))
|
print('\nERROR: Unable to read taxon list file. Please make sure that the path is correct and that the file is formatted correctly.\n\n' + str(e) + '\n')
|
||||||
|
exit()
|
||||||
|
|
||||||
|
if params.sim_taxa != None:
|
||||||
try:
|
try:
|
||||||
sim_taxa = list(dict.fromkeys([line.strip() for line in open(params.sim_taxa)]))
|
sim_taxa = list(dict.fromkeys([line.strip() for line in open(params.sim_taxa)]))
|
||||||
except (FileNotFoundError, TypeError) as e:
|
except (FileNotFoundError, TypeError) as e:
|
||||||
Logger.Error('Unable to read similarity taxa list file. Please make sure that the path is correct and that the file is formatted correctly.\n\n' + str(e))
|
print('\nERROR: Unable to read similarity taxa list file. Please make sure that the path is correct and that the file is formatted correctly.\n\n' + str(e) + '\n')
|
||||||
|
exit()
|
||||||
|
else:
|
||||||
|
sim_taxa = 'all'
|
||||||
|
|
||||||
|
if params.blacklist != None:
|
||||||
try:
|
try:
|
||||||
blacklist_seqs = list(dict.fromkeys([line.strip() for line in open(params.blacklist)]))
|
blacklist_seqs = list(dict.fromkeys([line.strip() for line in open(params.blacklist)]))
|
||||||
except (FileNotFoundError, TypeError) as e:
|
except (FileNotFoundError, TypeError) as e:
|
||||||
print('\nUnable to read blacklist file. Please make sure that the path is correct and that the file is formatted correctly.\n\n' + str(e))
|
print('\nERROR: Unable to read blacklist file. Please make sure that the path is correct and that the file is formatted correctly.\n\n' + str(e) + '\n')
|
||||||
|
exit()
|
||||||
|
else:
|
||||||
|
blacklist_seqs = []
|
||||||
|
|
||||||
if not os.path.isdir(params.data):
|
if not os.path.isdir(params.data):
|
||||||
Logger.Error(Logger.Error('Input amino-acid data files not found. Please make sure that the given path (--data) is correct.'))
|
print('\nInput amino-acid data files not found. Please make sure that the given path (--data) is correct.\n')
|
||||||
|
|
||||||
aa_files = [f for f in os.listdir(params.data) if f[:10] in taxa if f.endswith('.faa') or f.endswith('.fa') or f.endswith('.fasta')]
|
aa_files = [f for f in os.listdir(params.data) if f[:10] in taxa if f.endswith('.faa') or f.endswith('.fa') or f.endswith('.fasta')]
|
||||||
|
|
||||||
missing_taxa = [tax for tax in taxa if tax not in [f[:10] for f in aa_files]]
|
missing_taxa = [tax for tax in taxa if tax not in [f[:10] for f in aa_files]]
|
||||||
if(len(missing_taxa) > 0):
|
if(len(missing_taxa) > 0):
|
||||||
Logger.Warning('The following taxa in the taxon list are missing amino-acid files in ' + params.data + ':\n' + '\n'.join(['\t' + t for t in missing_taxa]))
|
print('\nWARNING: The following taxa in the taxon list are missing amino-acid files in ' + params.data + ':\n' + '\n'.join(['\t' + t for t in missing_taxa]) + '\n')
|
||||||
|
|
||||||
os.mkdir(params.output + '/Output/Intermediate/SF_Diamond')
|
os.mkdir(params.output + '/Output/Intermediate/SF_Diamond')
|
||||||
|
|
||||||
removed_file = open(params.output + '/Output/Pre-Guidance/SimFilter_removed.txt', 'w')
|
removed_file = open(params.output + '/Output/Pre-Guidance/SimFilter_removed.txt', 'w')
|
||||||
|
|
||||||
for og in ogs:
|
for og in ogs:
|
||||||
Logger.Message('Processing ' + og)
|
print('\nProcessing ' + og + '\n')
|
||||||
with open(params.output + '/Output/Pre-Guidance/' + og + '_preguidance.faa', 'w') as preguidance_file:
|
with open(params.output + '/Output/Pre-Guidance/' + og + '_preguidance.fasta', 'w') as preguidance_file:
|
||||||
for taxon_file in aa_files:
|
for taxon_file in aa_files:
|
||||||
recs = []
|
recs = []
|
||||||
for rec in sorted([rec for rec in SeqIO.parse(params.data + '/' + taxon_file, 'fasta') if rec.id[-10:] == og and rec.id not in blacklist_seqs and params.og_identifier in rec.id], key=lambda x: -len(x.seq)):
|
for rec in sorted([rec for rec in SeqIO.parse(params.data + '/' + taxon_file, 'fasta') if rec.id[-10:] == og and rec.id not in blacklist_seqs and params.og_identifier in rec.id], key=lambda x: -len(x.seq)):
|
||||||
if(rec.id == rec.description):
|
if(rec.id == rec.description):
|
||||||
recs.append(rec)
|
recs.append(rec)
|
||||||
else:
|
else:
|
||||||
Logger.Warning('\tThe sequence ID ' + rec.description + ' is invalid. Please make sure that sequence IDs contain no spaces, tabs, etc. This sequence is being excluded.')
|
print('\n\tThe sequence ID ' + rec.description + ' is invalid. Please make sure that sequence IDs contain no spaces, tabs, etc. This sequence is being excluded.\n')
|
||||||
|
|
||||||
|
if sim_taxa == 'all':
|
||||||
|
use_taxon = True
|
||||||
|
else:
|
||||||
|
if taxon_file[:10] in sim_taxa:
|
||||||
|
use_taxon = True
|
||||||
|
else:
|
||||||
|
use_taxon = False
|
||||||
|
|
||||||
masters = []; removed = 0; flag = 0; cycle = 0
|
masters = []; removed = 0; flag = 0; cycle = 0
|
||||||
if params.similarity_filter and taxon_file[:10] in sim_taxa:
|
if params.similarity_filter and use_taxon:
|
||||||
if len(recs) > 1:
|
if len(recs) > 1:
|
||||||
while flag == 0:
|
while flag == 0:
|
||||||
master_file_name = params.output + '/Output/Intermediate/SF_Diamond/' + og + '_' + taxon_file[:10] + '_master_' + str(cycle)
|
master_file_name = params.output + '/Output/Intermediate/SF_Diamond/' + og + '_' + taxon_file[:10] + '_master_' + str(cycle)
|
||||||
query_file_name = params.output + '/Output/Intermediate/SF_Diamond/' + og + '_' + taxon_file[:10] + '_queries_' + str(cycle) + '.faa'
|
query_file_name = params.output + '/Output/Intermediate/SF_Diamond/' + og + '_' + taxon_file[:10] + '_queries_' + str(cycle) + '.fasta'
|
||||||
diamond_out_name = params.output + '/Output/Intermediate/SF_Diamond/' + og + '_' + taxon_file[:10] + '_diamond_results_' + str(cycle) + '.tsv'
|
diamond_out_name = params.output + '/Output/Intermediate/SF_Diamond/' + og + '_' + taxon_file[:10] + '_diamond_results_' + str(cycle) + '.tsv'
|
||||||
|
|
||||||
open(master_file_name + '.faa', 'w').write('>' + recs[0].id + '\n' + str(recs[0].seq) + '\n\n')
|
open(master_file_name + '.faa', 'w').write('>' + recs[0].id + '\n' + str(recs[0].seq) + '\n\n')
|
||||||
@ -85,7 +102,7 @@ def run(params):
|
|||||||
for item in recs_to_remove:
|
for item in recs_to_remove:
|
||||||
removed_file.write(f"{item}\n")
|
removed_file.write(f"{item}\n")
|
||||||
|
|
||||||
Logger.Message('\t' + str(removed) + ' sequence(s) removed by the similarity filter (' + str(cycle + 1) + ' iterations) from ' + taxon_file[:10])
|
print('\n\t' + str(removed) + ' sequence(s) removed by the similarity filter (' + str(cycle + 1) + ' iterations) from ' + taxon_file[:10] + '\n')
|
||||||
|
|
||||||
for rec in recs + masters:
|
for rec in recs + masters:
|
||||||
preguidance_file.write('>' + rec.id + '\n' + str(rec.seq) + '\n\n')
|
preguidance_file.write('>' + rec.id + '\n' + str(rec.seq) + '\n\n')
|
||||||
@ -95,43 +112,3 @@ def run(params):
|
|||||||
if(not params.keep_temp):
|
if(not params.keep_temp):
|
||||||
os.system('rm -r ' + params.output + '/Output/Intermediate/SF_Diamond')
|
os.system('rm -r ' + params.output + '/Output/Intermediate/SF_Diamond')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user