From a86a7d761002080ea342383ad762e33e4b8311c3 Mon Sep 17 00:00:00 2001
From: ElinorSterner <86856150+ElinorSterner@users.noreply.github.com>
Date: Thu, 8 Jun 2023 17:17:12 -0400
Subject: [PATCH] Update assess_transcriptomes.py

Changed arguments and removed renaming
---
 .../For_Assemblies/assess_transcriptomes.py   | 100 +++++++-----------
 1 file changed, 38 insertions(+), 62 deletions(-)
diff --git a/Utilities/For_Assemblies/assess_transcriptomes.py b/Utilities/For_Assemblies/assess_transcriptomes.py
index 6168625..f56bb32 100644
--- a/Utilities/For_Assemblies/assess_transcriptomes.py
+++ b/Utilities/For_Assemblies/assess_transcriptomes.py
@@ -1,15 +1,14 @@
 '''
 Written March 2023 by Elinor (esterner27@gmail.com) to plot length, coverage and GC of assembled transcripts
 
-This script will rename the spades output to new names in the txt file, then iterate through them all and gather GC, length and coverage. With that data, it plots R scripts
+This script will iterate through all assembled files (named 10_digit_code_assembledTranscripts.fasta) and gather the GC content, length and coverage of each transcript. With that data, it makes plots in R
 
 Input: 
-	Directory of directories output by rnaSpades OR folder called Renamed_assembled_files of previously renamed files (if this is the case, put -r or --renamed in the command line)
-	txt file of LKH number and new names formatted like this: LKHxxx\tLKHxxx-10_digit_code\tdescriptor_of_taxon
+	Folder of assembled transcript files named 10_digit_code_assembledTranscripts.fasta (give its path with -i or --input on the command line)
+	tsv file called new_names.tsv, in the working directory, of ten digit codes and taxon descriptors formatted like this: 10_digit_code\tdescriptor_of_taxon
 
-To run if your files are already renamed:
-	python assess_transcriptomes.py <pathway to directory of spades output>
-	python assess_transcriptomes.py --renamed
+To run:
+	python assess_transcriptomes.py --input <path to directory of assembled transcript files>
 
 Output: csv file of length, GC, coverage of each transcript, and multiple R plots, faceted by taxon and a csv file of data. It plots GC by length, and distributions of coverage, length and GC
 
@@ -23,66 +22,36 @@ from Bio import SeqIO
 
 def script_help():
 
-	print('\nThis script grabs and plots GC, length and coverage of transcriptomes. \n\nInput:\ntxt file of tab separated LKH number, ten digit code and taxon info (taxonomy, lifestage, etc).\nAND\nfolder of the folders output by spades named with LKH number \n(LKH999 or WTALKH999)\nOR\ndirectory of renamed assemblies in this format: ten_digit_code_assembledTranscripts.fasta\n\nOutput is multiple R plots, faceted by taxon and a csv file of data. \n\nIt plots GC by length, and distributions of coverage, length and GC.\n\n To run: \n\n\tpython assess_transcriptomes.py <pathway to directory of spades output>\n\n-r or --renamed if your assemblies are already renamed to this format: ten_digit_code_assembledTranscript.fasta/nand this command if they are not yet named: --raw <path to directory of spades output folders>\n\n-h or --help for this message\n\n')
+	print('\nThis script grabs and plots GC, length and coverage of transcriptomes. \n\nInput:\ntsv file of tab separated LKH number, ten digit code and taxon info (taxonomy, lifestage, etc).\nAND\nfolder of the folders output by spades named with LKH number \n(LKH999 or WTALKH999)\nOR\ndirectory of renamed assemblies in this format: ten_digit_code_assembledTranscripts.fasta\n\nOutput is multiple R plots, faceted by taxon and a csv file of data. \n\nIt plots GC by length, and distributions of coverage, length and GC.\n\n To run: \n\n\tpython assess_transcriptomes.py <pathway to directory of spades output>\n\n-r or --renamed if your assemblies are already renamed to this format: ten_digit_code_assembledTranscript.fasta/nand this command if they are not yet named: --raw <path to directory of spades output folders>\n\n-h or --help for this message\n\n')
 
 def get_args():
 	#this parses user arguments. Checks if the files are renamed already or not (--renamed or --raw), and gets the directory of those files.
 	
-	renamed = False
 	if('--help' in sys.argv or '-h' in sys.argv):#check for help function in command line
 		script_help()
 		exit()
 
 
-	if ('--renamed'in sys.argv or '-r' in sys.argv):#check for renamed parameter
+	if ('--input'in sys.argv or '-i' in sys.argv):#check for renamed parameter
 		renamed = True
 		try:
-			if('--renamed' in sys.argv):
-				input_dir = sys.argv[sys.argv.index('--renamed') + 1]
+			if('--input' in sys.argv):
+				input_dir = sys.argv[sys.argv.index('--input') + 1]
 			else:
-				input_dir = sys.argv[sys.argv.index('-r') + 1]
+				input_dir = sys.argv[sys.argv.index('-i') + 1]
 		except IndexError:
 			print('\nSomething went wrong went parsing the arguments. Did you input a directory of assemblies?\n')
 
-	if ('--raw' in sys.argv or '-w' in sys.argv):#check for renamed parameter 
-		renamed = False
-		try:
-			if('--raw' in sys.argv):
-				input_dir = sys.argv[sys.argv.index('--raw') + 1]
-			else:
-				input_dir = sys.argv[sys.argv.index('-r') + 1]
-		except IndexError:
-			print('\nSomething went wrong went parsing the arguments. Did you input a directory of assemblies?\n')
-	make_dirs(renamed, input_dir)
-
 	
-def make_dirs(renamed, input_dir):
+		make_dirs(input_dir)
+
+	
+def make_dirs(input_dir):
 
 	Path(f'plots').mkdir(parents=True, exist_ok=True)#makes output folder for r plots
+	assess_transcriptomes(input_dir)#skip renaming funtion if theyre already renamed
+	
 	
-	if renamed == True:
-		assess_transcriptomes(input_dir)#skip renaming funtion if theyre already renamed
-	elif renamed == False:
-		Path(f'Renamed_assembled_files').mkdir(parents=True, exist_ok=True)#makes output folder for renamed fasta files
-		#print('renaming')
-		#print(input_dir)
-		rename_assembled_transcriptomes(input_dir)#send to renaming function if they're not renamed yet.
-	else:
-		print('\nplease specify if your files are already renamed (--renamed) or if they are not yet renamed (--raw)\n')
-
-
-def rename_assembled_transcriptomes(input_dir):#this function grabs and renames assembled transcripts files to the format required for this script and for phylotolv6 part1 
-
-	names = { line.split('\t')[0].strip() : line.split('\t')[1].strip() for line in open('new_names.txt')}#make dictionary of original name: new name to replace
-	for dir in os.listdir(input_dir):
-		for old_name, new_name in names.items():
-			if(old_name in dir):
-				if(os.path.isfile(f'{input_dir}/{dir}/transcripts.fasta')):#go into spades output directory
-					print(f'{old_name} is being renamed to {new_name}\n')
-					os.system(f'cp {input_dir}/{dir}/transcripts.fasta Renamed_assembled_files/{new_name}_assembledTranscripts.fasta')#rename and copy to renamed_assembled files
-	input_dir = 'Renamed_assembled_files'
-
-	assess_transcriptomes(input_dir)
 
 def assess_transcriptomes(input_dir):
 	
@@ -95,9 +64,8 @@ def assess_transcriptomes(input_dir):
 			
 			taxon_info = get_taxon_info(file)#send to get_taxon_info function
 			#parse output from that function
-			lkh = taxon_info[0]
-			ten_digit_code = taxon_info[1]
-			taxon = taxon_info[2]
+			ten_digit_code = taxon_info[0]
+			taxon = taxon_info[1]
 
 		#	extract all data for each transcript
 			for record in records:
@@ -106,38 +74,37 @@ def assess_transcriptomes(input_dir):
 				iden = record.id
 				cov = record.id.split('_')[5]
 		
-				transcript_data = f'{iden}, {lkh}, {length}, {gc}, {cov}, {ten_digit_code}, {taxon}'#list of data per transcriptome
+				transcript_data = f'{iden}, {length}, {gc}, {cov}, {ten_digit_code}, {taxon}'#list of data per transcriptome
 				data.append(transcript_data)#data for each LKH
 		
 			all_data.update({file.split('_assembledTranscripts')[0]: data})#make dict of ten digit code: all information
 
 	write_to_file()
 
-def get_taxon_info(file):#parse the info in the new_names.txt file to get info on the taxon
-	with open('new_names.txt', 'r') as o:
+def get_taxon_info(file):#parse the info in the new_names.tsv file to get info on the taxon
+	with open('new_names.tsv', 'r') as o:
 		cell_info = o.readlines()
 		for line in cell_info:
 			if '_'.join(file.split('_')[0:3]) in line:
-				lkh = line.split('\t')[0]
-				ten_digit_code = line.split('\t')[1]
-				taxon = line.split('\t')[2]
+				ten_digit_code = line.split('\t')[0]
+				taxon = line.split('\t')[1]
 	
 	try:
-		return lkh, ten_digit_code, taxon
+		return ten_digit_code, taxon
 	except UnboundLocalError:
-		print(f'no taxon information in new_names.txt for {file}')
+		print(f'no taxon information in new_names.tsv for {file}')
 
-	print('done with getting taxon info')
+	print('done getting taxon info from new_names.tsv')
 
 
 def write_to_file():
 
 	with open('assembly_assessment.csv', 'w') as o:#create output csv
 		print('writing data to assembly_assessment.csv\n')
-		o.write('seqID, lkh, length, GC, cov, ten_digit_code, taxon_info\n')#write header
+		o.write('seqID, length, GC, cov, ten_digit_code, taxon_info\n')#write header
 		for lkh, data in all_data.items():
 			for transcript in data:
-				o.write(f'{transcript}')#write each line
+				o.write(f'{transcript.strip()}\n')#write each line
 
 	plot_assessment()
 
@@ -152,4 +119,13 @@ def plot_assessment():
 
 if __name__ == '__main__':
 	all_data = {}
-	get_args()
+	
+	try:
+		f = open("new_names.tsv")
+
+	except FileNotFoundError:
+		print('\n\n Did you include a tsv file of your cells? It should be called new_names.tsv and formatted like this: 10_digit_code\tdescriptor_of_taxon\n\n')	
+		exit()
+
+	else:
+		get_args()