''' #Author, date: Elinor Sterner(esterner27@gmail.com), March 2023. Last updated Nov 27th 2023 #Motivation: To describe transcriptomes. #Intent: Plot length, coverage and GC of assembled transcripts with stats generated. #Dependencies: Python3 #Inputs: Folder called Renamed_assembled_files of previously renamed files (if this is the case, put -r or --renamed in the command line) tsv file of LKH number and new names formatted like this: 10_digit_code\tdescriptor_of_taxon called new_names.tsv #Outputs: A spreadsheet containing the length, GC, and coverage of each transcript. #Example: python Assess_transcriptomes_v2.0.py -input ''' import os import sys from pathlib import Path import sys from Bio.SeqUtils import GC from Bio import SeqIO def script_help(): print('\nThis script grabs and plots GC, length and coverage of transcriptomes. \n\nInput:\ntsv file of tab separated ten digit code and taxon info (taxonomy, lifestage, etc).\nAND\n\ndirectory of assemblies renamed in this format: ten_digit_code_assembledTranscripts.fasta\n\nOutput is multiple R plots faceted by taxon and a csv file of data. \n\nIt plots GC by length, and distributions of coverage, length and GC.\n\n To run: \n\n\tpython assess_transcriptomes.py -i \n\nRun -h or --help for this message\n\n') def get_args(): #this parses user arguments. Checks if the files are renamed already or not (--renamed or --raw), and gets the directory of those files. if('--help' in sys.argv or '-h' in sys.argv):#check for help function in command line script_help() exit() if ('--input'in sys.argv or '-i' in sys.argv):#check for renamed parameter renamed = True try: if('--input' in sys.argv): input_dir = sys.argv[sys.argv.index('--input') + 1] else: input_dir = sys.argv[sys.argv.index('-i') + 1] except IndexError: print('\nSomething went wrong went parsing the arguments. Did you input a directory of assemblies?\n') make_dirs(input_dir) def make_dirs(input_dir): Path(f'plots').mkdir(parents=True, exist_ok=True)#makes output folder for r plots assess_transcriptomes(input_dir)#skip renaming funtion if theyre already renamed def assess_transcriptomes(input_dir): #iterate through renamed fasta files for file in os.listdir(input_dir): if file.endswith('fasta'): print(f'gathering data from {file}..\n') data = []#initiate list of data per LKH records = list(SeqIO.parse(f'{input_dir}/{file}', 'fasta'))#parse the fasta files taxon_info = get_taxon_info(file)#send to get_taxon_info function #parse output from that function ten_digit_code = taxon_info[0] taxon = taxon_info[1] # extract all data for each transcript for record in records: gc = GC(record.seq) length = len(record.seq) iden = record.id cov = record.id.split('_')[5] transcript_data = f'{iden}, {length}, {gc}, {cov}, {ten_digit_code}, {taxon}'#list of data per transcriptome data.append(transcript_data)#data for each LKH all_data.update({file.split('_assembledTranscripts')[0]: data})#make dict of ten digit code: all information write_to_file() def get_taxon_info(file):#parse the info in the new_names.tsv file to get info on the taxon with open('new_names.tsv', 'r') as o: cell_info = o.readlines() for line in cell_info: if '_'.join(file.split('_')[0:3]) in line: ten_digit_code = line.split('\t')[0] taxon = line.split('\t')[1] try: return ten_digit_code, taxon except UnboundLocalError: print(f'no taxon information in new_names.tsv for {file}') print('done getting taxon info from new_names.tsv') def write_to_file(): with open('assembly_assessment.csv', 'w') as o:#create output csv print('writing data to assembly_assessment.csv\n') o.write('seqID, length, GC, cov, ten_digit_code, taxon_info\n')#write header for lkh, data in all_data.items(): for transcript in data: o.write(f'{transcript.strip()}\n')#write each line plot_assessment() def plot_assessment(): print('Plotting your data\n') os.system('Rscript plot_assemblies.R') for file in os.listdir(): if file.endswith('.png') or file.endswith('.pdf'): os.system(f'mv {file} plots/{file}') print('\n\n\n\nDone! All your plots are saved to the folder called plots\n\n') if __name__ == '__main__': all_data = {} try: f = open("new_names.tsv") except FileNotFoundError: print('\n\n Did you include a tsv file of your cells? It should be called new_names.tsv and formatted like this: 10_digit_code\tdescriptor_of_taxon\n\n') exit() else: get_args()