EukPhylo/PTL1/Genomes/Scripts/6_SummaryStats.py

import os, sys
import argparse
from Bio import SeqIO
import CUB
from statistics import mean
from math import ceil, floor
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np


def get_args():

	parser = argparse.ArgumentParser(
		prog = 'PTL6p1 Script 8: Stat Summary',
		description = "Updated March 31th, 2023 by Auden Cote-L'Heureux"
	)

	parser.add_argument('-i', '--input', type = str, required = True, help = 'Input path to the "Output" folder produced by PhyloToL Part 1. This folder should contain both the "ReadyToGO" and "Intermediate" folders.')
	parser.add_argument('-d', '--databases', type = str, default = '../Databases', help = 'Path to databases folder')
	parser.add_argument('-r', '--r2g_jf', action = 'store_true', help = 'Create ReadyToGo files filtered to only include sequences between the 25th and 75th percentile of silent-site GC content. Please be aware that these are not necessarily the correct or non-contaminant sequences; examine the GC3xENc plots carefully before using these data.')

	#Curate genetic code

	return parser.parse_args()


def hook_lens(args):

	print('\nGetting average OG lengths in the Hook DB...')

	len_by_og = { }
	for file in os.listdir(args.databases + '/db_OG'):
		if file.endswith('.fasta') and os.path.isfile(args.databases + '/db_OG/' + file.replace('.fasta', '.dmnd')):
			for rec in tqdm(SeqIO.parse(args.databases + '/db_OG/' + file, 'fasta')):
				if rec.id[-10:] not in len_by_og:
					len_by_og.update({ rec.id[-10:] : [] })

				len_by_og[rec.id[-10:]].append(len(str(rec.seq)))

	for og in len_by_og:
		len_by_og[og] = mean(len_by_og[og])

	return len_by_og


def aa_comp_lengths(args, gcodes):

	print('\nGetting amino acid composition data from ReadyToGo files...')

	r2g_lengths = { }; aa_comp = { }; recid_by_contig_n = { }
	for file in tqdm([f for f in os.listdir(args.input + '/ReadyToGo/ReadyToGo_AA')]):
		if file.endswith('.fasta') and file[:10] in gcodes:
			for rec in SeqIO.parse(args.input + '/ReadyToGo/ReadyToGo_AA/' + file, 'fasta'):
				r2g_lengths.update({ rec.id : len(str(rec.seq)) * 3 })

				fymink = 0; garp = 0; other = 0; total = 0; x = 0
				for char in str(rec.seq):
					if char in 'FYMINKfymink':
						fymink += 1
					elif char in 'GARPgarp':
						garp += 1
					elif char in 'Xx':
						x += 1
					else:
						other += 1

					total += 1

				aa_comp.update({ rec.id : { 'FYMINK' : fymink/total, 'GARP' : garp/total, 'Other' : other/total, 'X' : x} })

				recid_by_contig_n.update({ rec.id.split('Contig_')[-1].split('_')[0] : rec.id })

	print('\nGetting transcript sequence data from original assembled transcript files...')

	transcripts = { }; transcript_id_corr = { }
	for tax in tqdm([f for f in os.listdir(args.input + '/Intermediate/')]):
		if os.path.isdir(args.input + '/Intermediate/' + tax + '/Original'):
			for file in os.listdir(args.input + '/Intermediate/' + tax + '/Original'):
				if file.endswith('_GenBankCDS.fasta'):
					for rec in SeqIO.parse(args.input + '/Intermediate/' + tax + '/Original/' + file, 'fasta'):
						transcripts.update({ rec.id : (file[:10], str(rec.seq)) })
						if rec.id.split('NODE_')[-1].split('_')[0] in recid_by_contig_n:
							transcript_id_corr.update({ recid_by_contig_n[rec.id.split('NODE_')[-1].split('_')[0]] : rec.id})

	return aa_comp, transcripts, r2g_lengths, transcript_id_corr


def get_nuc_comp(args, gcodes):

	print('\nGetting nucleotide composition data from ReadyToGo files...')

	nuc_comp = { }
	for file in tqdm([f for f in os.listdir(args.input + '/ReadyToGo/ReadyToGo_NTD')]):
		if file.endswith('.fasta') and file[:10] in gcodes:
			cub_out = CUB.CalcRefFasta(args.input + '/ReadyToGo/ReadyToGo_NTD/' + file, gcodes[file[:10]].lower())[0]
			for k in cub_out:
				nuc_comp.update({ k : cub_out[k] })

	return nuc_comp


def per_seq(args, nuc_comp, aa_comp, all_transcripts, r2g_lengths, transcript_id_corr, og_mean_lens):

	if not os.path.isdir(args.input + '/PerSequenceStatSummaries'):
		os.mkdir(args.input + '/PerSequenceStatSummaries')

	taxa = list(dict.fromkeys([seq[:10] for seq in nuc_comp]))

	for taxon in taxa:
		with open(args.input + '/PerSequenceStatSummaries/' + taxon + '.csv', 'w') as o:
			o.write('Sequence,Taxon,OG,Transcript,TranscriptLength,CDSLength,AvgLengthOGinHook,AmbiguousCodons,GC-Overall,GC1,GC2,GC3,GC3-Degen,ExpWrightENc,ObsWrightENc_6Fold,ObsWrightENc_No6Fold,ObsWeightedENc_6Fold,ObsWeightedENc_No6Fold,FYMINK,GARP,OtherAA,N.Xs\n')
			for rec in nuc_comp:
				if rec[:10] == taxon:
					o.write(rec + ',' + rec[:10] + ',' + rec[-10:])

					try:
						o.write(',' + transcript_id_corr[rec] + ',' + str(len(all_transcripts[transcript_id_corr[rec]][1])))
					except KeyError:
						o.write(',NA,NA')

					o.write(',' + str(r2g_lengths[rec]) + ',' + str(og_mean_lens[rec[-10:]]))

					v = nuc_comp[rec]
					gcs = [str(v.gcOverall), str(v.gc1), str(v.gc2), str(v.gc3), str(v.gc4F)]
					ENc = [str(v.expENc), str(v.obsENc_6F), str(v.obsENc_No6F), str(v.SunENc_6F),str(v.SunENc_No6F)]
					o.write(',' + ','.join([str(v.amb_cdn)] + gcs + ENc))

					o.write(',' + str(aa_comp[rec]['FYMINK']) + ',' + str(aa_comp[rec]['GARP']) + ',' + str(aa_comp[rec]['Other']) + ',' + str(aa_comp[rec]['X']) + '\n')


def per_tax(args, nuc_comp, aa_comp, all_transcripts, r2g_lengths, gcodes, og_mean_lens):

	taxa = list(dict.fromkeys([seq[:10] for seq in nuc_comp]))

	with open(args.input + '/PerTaxonSummary.csv', 'w') as o:
		o.write('Taxon,TranscriptsInput,Median_GCTranscripts,5-95Perc_GCTranscripts,Median_LenTranscripts,IQR_LenTranscripts,SeqsR2G,OGsR2G,Median_GC3R2G,5Perc_GC3R2G,95Perc_GC3R2G,5-95Perc_GC3R2G,Median_ENcR2G,IQR_ENcR2G,Median_LenR2G,IQR_LenR2G,Prop.G1.5_OGAvg,Prop.L0.5_OGAvg,GeneticCode\n')

		for taxon in taxa:
			try:
				o.write(taxon)

				transcripts = [all_transcripts[seq][1].upper() for seq in all_transcripts if all_transcripts[seq][0] == taxon]
				o.write(',' + str(len(transcripts)))

				transcript_gcs = []
				for transcript in transcripts:
					transcript_gcs.append((transcript.count('G') + transcript.count('C'))/len(transcript))

				transcript_gcs = sorted(transcript_gcs)
				o.write(',' + str(transcript_gcs[floor(len(transcripts)*0.5)]))
				o.write(',' + str(transcript_gcs[floor(len(transcripts)*0.95)] - transcript_gcs[floor(len(transcripts)*0.05)]))

				transcript_lens = sorted([len(transcript) for transcript in transcripts])
				o.write(',' + str(transcript_lens[floor(len(transcripts)*0.5)]))
				o.write(',' + str(transcript_lens[floor(len(transcripts)*0.75)] - transcript_lens[floor(len(transcripts)*0.25)]))

				r2g_ntds = [nuc_comp[seq] for seq in nuc_comp if seq[:10] == taxon]
				o.write(',' + str(len(r2g_ntds)))
				r2g_ogs = list(dict.fromkeys([seq[-10:] for seq in nuc_comp if seq[:10] == taxon]))
				o.write(',' + str(len(r2g_ogs)))

				r2g_gc3s = sorted([seq.gc4F for seq in r2g_ntds])
				o.write(',' + str(r2g_gc3s[floor(len(r2g_ntds)*0.5)]))
				o.write(',' + str(r2g_gc3s[floor(len(r2g_gc3s)*0.05)]))
				o.write(',' + str(r2g_gc3s[floor(len(r2g_gc3s)*0.95)]))
				o.write(',' + str(r2g_gc3s[floor(len(r2g_gc3s)*0.95)] - r2g_gc3s[floor(len(r2g_gc3s)*0.05)]))

				r2g_encs = sorted([seq.obsENc_6F for seq in r2g_ntds])
				o.write(',' + str(r2g_encs[floor(len(r2g_encs)*0.5)]))
				o.write(',' + str(r2g_encs[floor(len(r2g_encs)*0.75)] - r2g_encs[floor(len(r2g_encs)*0.25)]))

				tax_r2g_lens = sorted([r2g_lengths[seq] for seq in r2g_lengths if seq[:10] == taxon])
				o.write(',' + str(tax_r2g_lens[floor(len(tax_r2g_lens)*0.5)]))
				o.write(',' + str(tax_r2g_lens[floor(len(tax_r2g_lens)*0.75)] - tax_r2g_lens[floor(len(tax_r2g_lens)*0.25)]))

				prop_len_g = len([seq for seq in r2g_lengths if seq[:10] == taxon and r2g_lengths[seq] > 4.5 * og_mean_lens[seq[-10:]]])/len(tax_r2g_lens)
				prop_len_l = len([seq for seq in r2g_lengths if seq[:10] == taxon and r2g_lengths[seq] < 1.5 * og_mean_lens[seq[-10:]]])/len(tax_r2g_lens)

				o.write(',' + str(prop_len_g) + ',' + str(prop_len_l))

				o.write(',' + gcodes[taxon] + '\n')
			except:
				pass


def r2g_jf(args, nuc_comp, gcodes):

	#Q: should there be an maximum IQR cutoff at which we do NOT produce a file here?

	if not os.path.isdir(args.input + '/ReadyToGo/ReadyToGo_NTD_JF'):
		os.mkdir(args.input + '/ReadyToGo/ReadyToGo_NTD_JF')

	if not os.path.isdir(args.input + '/ReadyToGo/ReadyToGo_AA_JF'):
		os.mkdir(args.input + '/ReadyToGo/ReadyToGo_AA_JF')

	for file in os.listdir(args.input + '/ReadyToGo/ReadyToGo_NTD'):
		if file.endswith('.fasta') and file[:10] in gcodes:
			taxon = file[:10]

			r2g_ntds = [nuc_comp[seq] for seq in nuc_comp if seq[:10] == taxon]
			r2g_gc3s = sorted([seq.gc4F for seq in r2g_ntds])

			with open(args.input + '/ReadyToGo/ReadyToGo_NTD_JF/' + file.replace('.fasta', '.JF.fasta'), 'w') as o:
				for rec in SeqIO.parse(args.input + '/ReadyToGo/ReadyToGo_NTD/' + file, 'fasta'):
					if nuc_comp[rec.id].gc4F > r2g_gc3s[floor(len(r2g_gc3s)*0.25)] and nuc_comp[rec.id].gc4F < r2g_gc3s[floor(len(r2g_gc3s)*0.75)]:
						o.write('>' + rec.id + '\n' + str(rec.seq) + '\n\n')

			with open(args.input + '/ReadyToGo/ReadyToGo_AA_JF/' + file.replace('.fasta', '.JF.fasta').replace('NTD', 'AA'), 'w') as o:
				for rec in SeqIO.parse(args.input + '/ReadyToGo/ReadyToGo_AA/' + file.replace('NTD', 'AA'), 'fasta'):
					if nuc_comp[rec.id].gc4F > r2g_gc3s[floor(len(r2g_gc3s)*0.25)] and nuc_comp[rec.id].gc4F < r2g_gc3s[floor(len(r2g_gc3s)*0.75)]:
						o.write('>' + rec.id + '\n' + str(rec.seq) + '\n\n')


def plot_jf(args, nuc_comp):

	if not os.path.isdir(args.input + '/GC3xENc_Plots'):
		os.mkdir(args.input + '/GC3xENc_Plots')

	taxa = list(dict.fromkeys([rec[:10] for rec in nuc_comp]))

	gc3_null = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100]
	enc_null = [31, 31.5958, 32.2032, 32.8221, 33.4525, 34.0942, 34.7471, 35.411, 36.0856, 36.7707, 37.4659, 38.1707, 38.8847, 39.6074, 40.3381, 41.0762, 41.8208, 42.5712, 43.3264, 44.0854, 44.8471, 45.6102, 46.3735, 47.1355, 47.8949, 48.65, 49.3991, 50.1406, 50.8725, 51.593, 52.3, 52.9916, 53.6656, 54.32, 54.9525, 55.561, 56.1434, 56.6975, 57.2211, 57.7124, 58.1692, 58.5898, 58.9723, 59.3151, 59.6167, 59.8757, 60.0912, 60.2619, 60.3873, 60.4668, 60.5, 60.4668, 60.3873, 60.2619, 60.0912, 59.8757, 59.6167, 59.3151, 58.9723, 58.5898, 58.1692, 57.7124, 57.2211, 56.6975, 56.1434, 55.561, 54.9525, 54.32, 53.6656, 52.9916, 52.3, 51.593, 50.8725, 50.1406, 49.3991, 48.65, 47.8949, 47.1355, 46.3735, 45.6102, 44.8471, 44.0854, 43.3264, 42.5712, 41.8208, 41.0762, 40.3381, 39.6074, 38.8847, 38.1707, 37.4659, 36.7707, 36.0856, 35.411, 34.7471, 34.0942, 33.4525, 32.8221, 32.2032, 31.5958, 31]

	for taxon in taxa:
		comp_data = [(nuc_comp[rec].gc4F, nuc_comp[rec].obsENc_6F) for rec in nuc_comp if rec[:10] == taxon]

		plt.figure()
		plt.plot(np.array(gc3_null), np.array(enc_null), color = 'black', linewidth=2)
		plt.scatter(np.array([val[0] for val in comp_data]), np.array([val[1] for val in comp_data]), s = 1)
		plt.xlabel("GC content (3rd pos, 4-fold sites)")
		plt.ylabel("Observed Wright ENc (6 Fold)")
		plt.savefig(args.input + '/GC3xENc_Plots/' + taxon + '.png')

if __name__ == "__main__":
	args = get_args()

	valid_codes = ['universal', 'blepharisma', 'chilodonella', 'condylostoma', 'euplotes', 'peritrich', 'vorticella', 'mesodinium', 'tag', 'tga', 'taa', 'none']

	gcodes = { }
	if os.path.isfile(args.input + '/Intermediate/gcode_output.tsv'):
		for line in open(args.input + '/Intermediate/gcode_output.tsv'):
			if len(line.split('\t')) == 5 and line.split('\t')[4].strip().lower() in valid_codes:
				gcodes.update({ line.split('\t')[0] : line.split('\t')[4].strip() })
			elif line.split('\t')[4].strip().lower() != '':
				print('\nInvalid genetic code assignment for taxon ' + line.split('\t')[0] + '. Skipping this taxon in script 6 (summary statistics)\n')
	else:
		print('\nGenetic code assignment file (Output/Intermediate/gcode_output.tsv) not found. Quitting script 6 (summary statistics).\n')
		exit()

	aa_comp, transcripts, r2g_lengths, transcript_id_corr = aa_comp_lengths(args, gcodes)
	nuc_comp = get_nuc_comp(args, gcodes)
	og_mean_lens = hook_lens(args)

	per_tax(args, nuc_comp, aa_comp, transcripts, r2g_lengths, gcodes, og_mean_lens)
	per_seq(args, nuc_comp, aa_comp, transcripts, r2g_lengths, transcript_id_corr, og_mean_lens)

	if args.r2g_jf:
		r2g_jf(args, nuc_comp, gcodes)

	plot_jf(args, nuc_comp)