EukPhylo/PTL2/Scripts/preguidance.py

import os, sys, re
from Bio import SeqIO


def run(params):

	try:
		ogs = list(dict.fromkeys([line.strip() for line in open(params.gf_list)]))
	except (FileNotFoundError, TypeError) as e:
		print('\nERROR: Unable to read GF list file. Please make sure that the path is correct and that the file is formatted correctly.\n\n' + str(e) + '\n')
		exit()

	try:
		taxa = list(dict.fromkeys([line.strip() for line in open(params.taxon_list)]))
	except (FileNotFoundError, TypeError) as e:
		print('\nERROR: Unable to read taxon list file. Please make sure that the path is correct and that the file is formatted correctly.\n\n' + str(e) + '\n')
		exit()

	if params.sim_taxa != None:
		try:
			sim_taxa = list(dict.fromkeys([line.strip() for line in open(params.sim_taxa)]))
		except (FileNotFoundError, TypeError) as e:
			print('\nERROR: Unable to read similarity taxa list file. Please make sure that the path is correct and that the file is formatted correctly.\n\n' + str(e) + '\n')
			exit()
	else:
		sim_taxa = 'all'

	if params.blacklist != None:
		try:
			blacklist_seqs = list(dict.fromkeys([line.strip() for line in open(params.blacklist)]))
		except (FileNotFoundError, TypeError) as e:
			print('\nERROR: Unable to read blacklist file. Please make sure that the path is correct and that the file is formatted correctly.\n\n' + str(e) + '\n')
			exit()
	else:
		blacklist_seqs = []

	if not os.path.isdir(params.data):
		print('\nInput amino-acid data files not found. Please make sure that the given path (--data) is correct.\n')

	aa_files = [f for f in os.listdir(params.data) if f[:10] in taxa if f.endswith('.faa') or f.endswith('.fa') or f.endswith('.fasta')]

	missing_taxa = [tax for tax in taxa if tax not in [f[:10] for f in aa_files]]
	if(len(missing_taxa) > 0):
		print('\nWARNING: The following taxa in the taxon list are missing amino-acid files in ' + params.data + ':\n' + '\n'.join(['\t' + t for t in missing_taxa]) + '\n')

	os.mkdir(params.output + '/Output/Intermediate/SF_Diamond')

	removed_file = open(params.output + '/Output/Pre-Guidance/SimFilter_removed.txt', 'w')

	for og in ogs:
		print('\nProcessing ' + og + '\n')
		with open(params.output + '/Output/Pre-Guidance/' + og + '_preguidance.fasta', 'w') as preguidance_file:
			for taxon_file in aa_files:
				recs = []
				for rec in sorted([rec for rec in SeqIO.parse(params.data + '/' + taxon_file, 'fasta') if rec.id[-10:] == og and rec.id not in blacklist_seqs and rec.id[-10:].startswith(params.og_identifier)], key=lambda x: -len(x.seq)):
					if(rec.id == rec.description):
						recs.append(rec)
					else:
						print('\n\tThe sequence ID ' + rec.description + ' is invalid. Please make sure that sequence IDs contain no spaces, tabs, etc. This sequence is being excluded.\n')

				if sim_taxa == 'all':
					use_taxon = True
				else:
					if taxon_file[:10] in sim_taxa:
						use_taxon = True
					else:
						use_taxon = False

				masters = []; removed = 0; flag = 0; cycle = 0
				if params.similarity_filter and use_taxon:
					if len(recs) > 1:
						while flag == 0:
							master_file_name = params.output + '/Output/Intermediate/SF_Diamond/' + og + '_' + taxon_file[:10] + '_master_' + str(cycle)
							query_file_name = params.output + '/Output/Intermediate/SF_Diamond/' + og + '_' + taxon_file[:10] + '_queries_' + str(cycle) + '.fasta'
							diamond_out_name = params.output + '/Output/Intermediate/SF_Diamond/' + og + '_' + taxon_file[:10] + '_diamond_results_' + str(cycle) + '.tsv'

							open(master_file_name + '.faa', 'w').write('>' + recs[0].id + '\n' + str(recs[0].seq) + '\n\n')
							masters.append(recs[0])

							with open(query_file_name, 'w') as queries:
								for rec in recs[1:]:
									queries.write('>' + rec.id + '\n' + str(rec.seq) + '\n\n')

							os.system('diamond makedb --in ' + master_file_name + '.faa -d ' + master_file_name)
							os.system('diamond blastp -d ' + master_file_name + '.dmnd -q ' + query_file_name + ' --outfmt 6 -o ' + diamond_out_name)

							diamond_out = open(diamond_out_name).readlines()
							recs_to_remove = []
							for line in diamond_out:
								line = line.strip().split('\t')

								if float(line[2])/100 >= params.sim_cutoff:
									recs_to_remove.append(line[0]); removed =+ 1

							if len([rec for rec in recs[1:] if rec.id not in recs_to_remove]) < 2:
								recs = [rec for rec in recs[1:] if rec.id not in recs_to_remove]
								flag = 1
							else:
								recs = [rec for rec in recs[1:] if rec.id not in recs_to_remove]
								cycle += 1

							for item in recs_to_remove:
								removed_file.write(f"{item}\n")

					print('\n\t' + str(removed) + ' sequence(s) removed by the similarity filter (' + str(cycle + 1) + ' iterations) from ' + taxon_file[:10] + '\n')

				for rec in recs + masters:
					preguidance_file.write('>' + rec.id + '\n' + str(rec.seq) + '\n\n')

	removed_file.close()

	if(not params.keep_temp):
		os.system('rm -r ' + params.output + '/Output/Intermediate/SF_Diamond')