EukPhylo/Utilities/For_Assemblies/Assess_transcriptomes.py

'''
#Author, date: Elinor Sterner(esterner27@gmail.com), March 2023. Last updated Nov 27th 2023
#Motivation: To describe transcriptomes.
#Intent: Plot length, coverage and GC of assembled transcripts with stats generated.
#Dependencies: Python3
#Inputs: Folder called Renamed_assembled_files of previously renamed files (if this is the case, put -r or --renamed in the command line)
	tsv file of LKH number and new names formatted like this: 10_digit_code\tdescriptor_of_taxon called new_names.tsv
#Outputs: A spreadsheet containing the length, GC, and coverage of each transcript.
#Example: python assess_transcriptomes.py -input <pathway to directory of spades output>
'''
import os
import sys
from pathlib import Path
import sys
from Bio.SeqUtils import GC
from Bio import SeqIO

def script_help():

	print('\nThis script grabs and plots GC, length and coverage of transcriptomes. \n\nInput:\ntsv file of tab separated ten digit code and taxon info (taxonomy, lifestage, etc).\nAND\n\ndirectory of assemblies renamed in this format: ten_digit_code_assembledTranscripts.fasta\n\nOutput is multiple R plots faceted by taxon and a csv file of data. \n\nIt plots GC by length, and distributions of coverage, length and GC.\n\n To run: \n\n\tpython assess_transcriptomes.py -i <pathway to directory of assemblies>\n\nRun -h or --help for this message\n\n')

def get_args():
	#this parses user arguments. Checks if the files are renamed already or not (--renamed or --raw), and gets the directory of those files.

	if('--help' in sys.argv or '-h' in sys.argv):#check for help function in command line
		script_help()
		exit()


	if ('--input'in sys.argv or '-i' in sys.argv):#check for renamed parameter
		renamed = True
		try:
			if('--input' in sys.argv):
				input_dir = sys.argv[sys.argv.index('--input') + 1]
			else:
				input_dir = sys.argv[sys.argv.index('-i') + 1]
		except IndexError:
			print('\nSomething went wrong went parsing the arguments. Did you input a directory of assemblies?\n')


		make_dirs(input_dir)


def make_dirs(input_dir):

	Path(f'plots').mkdir(parents=True, exist_ok=True)#makes output folder for r plots
	assess_transcriptomes(input_dir)#skip renaming funtion if theyre already renamed


def assess_transcriptomes(input_dir):

	#iterate through renamed fasta files
	for file in os.listdir(input_dir):
		if file.endswith('fasta'):
			print(f'gathering data from {file}..\n')
			data = []#initiate list of data per LKH
			records = list(SeqIO.parse(f'{input_dir}/{file}', 'fasta'))#parse the fasta files

			taxon_info = get_taxon_info(file)#send to get_taxon_info function
			#parse output from that function
			ten_digit_code = taxon_info[0]
			taxon = taxon_info[1]

		#	extract all data for each transcript
			for record in records:
				gc = GC(record.seq)
				length = len(record.seq)
				iden = record.id
				cov = record.id.split('_')[5]

				transcript_data = f'{iden}, {length}, {gc}, {cov}, {ten_digit_code}, {taxon}'#list of data per transcriptome
				data.append(transcript_data)#data for each LKH

			all_data.update({file.split('_assembledTranscripts')[0]: data})#make dict of ten digit code: all information

	write_to_file()

def get_taxon_info(file):#parse the info in the new_names.tsv file to get info on the taxon
	with open('new_names.tsv', 'r') as o:
		cell_info = o.readlines()
		for line in cell_info:
			if '_'.join(file.split('_')[0:3]) in line:
				ten_digit_code = line.split('\t')[0]
				taxon = line.split('\t')[1]

	try:
		return ten_digit_code, taxon
	except UnboundLocalError:
		print(f'no taxon information in new_names.tsv for {file}')

	print('done getting taxon info from new_names.tsv')


def write_to_file():

	with open('assembly_assessment.csv', 'w') as o:#create output csv
		print('writing data to assembly_assessment.csv\n')
		o.write('seqID, length, GC, cov, ten_digit_code, taxon_info\n')#write header
		for lkh, data in all_data.items():
			for transcript in data:
				o.write(f'{transcript.strip()}\n')#write each line

	plot_assessment()

def plot_assessment():
	print('Plotting your data\n')
	os.system('Rscript plot_assemblies.R')
	for file in os.listdir():
		if file.endswith('.png') or file.endswith('.pdf'):
			os.system(f'mv {file} plots/{file}')

	print('\n\n\n\nDone! All your plots are saved to the folder called plots\n\n')

if __name__ == '__main__':
	all_data = {}

	try:
		f = open("new_names.tsv")

	except FileNotFoundError:
		print('\n\n Did you include a tsv file of your cells? It should be called new_names.tsv and formatted like this: 10_digit_code\tdescriptor_of_taxon\n\n')
		exit()

	else:
		get_args()