mirror of
http://43.156.76.180:8026/YuuMJ/EukPhylo.git
synced 2025-12-27 03:10:24 +08:00
127 lines
4.5 KiB
Python
127 lines
4.5 KiB
Python
'''
|
|
#Author, date: Elinor Sterner(esterner27@gmail.com), March 2023. Last updated Nov 27th 2023
|
|
#Motivation: To describe transcriptomes.
|
|
#Intent: Plot length, coverage and GC of assembled transcripts with stats generated.
|
|
#Dependencies: Python3
|
|
#Inputs: Folder called Renamed_assembled_files of previously renamed files (if this is the case, put -r or --renamed in the command line)
|
|
tsv file of LKH number and new names formatted like this: 10_digit_code\tdescriptor_of_taxon called new_names.tsv
|
|
#Outputs: A spreadsheet containing the length, GC, and coverage of each transcript.
|
|
#Example: python Assess_transcriptomes_v2.0.py -input <pathway to directory of spades output>
|
|
'''
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
import sys
|
|
from Bio.SeqUtils import GC
|
|
from Bio import SeqIO
|
|
|
|
def script_help():
|
|
|
|
print('\nThis script grabs and plots GC, length and coverage of transcriptomes. \n\nInput:\ntsv file of tab separated ten digit code and taxon info (taxonomy, lifestage, etc).\nAND\n\ndirectory of assemblies renamed in this format: ten_digit_code_assembledTranscripts.fasta\n\nOutput is multiple R plots faceted by taxon and a csv file of data. \n\nIt plots GC by length, and distributions of coverage, length and GC.\n\n To run: \n\n\tpython assess_transcriptomes.py -i <pathway to directory of assemblies>\n\nRun -h or --help for this message\n\n')
|
|
|
|
def get_args():
|
|
#this parses user arguments. Checks if the files are renamed already or not (--renamed or --raw), and gets the directory of those files.
|
|
|
|
if('--help' in sys.argv or '-h' in sys.argv):#check for help function in command line
|
|
script_help()
|
|
exit()
|
|
|
|
|
|
if ('--input'in sys.argv or '-i' in sys.argv):#check for renamed parameter
|
|
renamed = True
|
|
try:
|
|
if('--input' in sys.argv):
|
|
input_dir = sys.argv[sys.argv.index('--input') + 1]
|
|
else:
|
|
input_dir = sys.argv[sys.argv.index('-i') + 1]
|
|
except IndexError:
|
|
print('\nSomething went wrong went parsing the arguments. Did you input a directory of assemblies?\n')
|
|
|
|
|
|
make_dirs(input_dir)
|
|
|
|
|
|
def make_dirs(input_dir):
|
|
|
|
Path(f'plots').mkdir(parents=True, exist_ok=True)#makes output folder for r plots
|
|
assess_transcriptomes(input_dir)#skip renaming funtion if theyre already renamed
|
|
|
|
|
|
|
|
def assess_transcriptomes(input_dir):
|
|
|
|
#iterate through renamed fasta files
|
|
for file in os.listdir(input_dir):
|
|
if file.endswith('fasta'):
|
|
print(f'gathering data from {file}..\n')
|
|
data = []#initiate list of data per LKH
|
|
records = list(SeqIO.parse(f'{input_dir}/{file}', 'fasta'))#parse the fasta files
|
|
|
|
taxon_info = get_taxon_info(file)#send to get_taxon_info function
|
|
#parse output from that function
|
|
ten_digit_code = taxon_info[0]
|
|
taxon = taxon_info[1]
|
|
|
|
# extract all data for each transcript
|
|
for record in records:
|
|
gc = GC(record.seq)
|
|
length = len(record.seq)
|
|
iden = record.id
|
|
cov = record.id.split('_')[5]
|
|
|
|
transcript_data = f'{iden}, {length}, {gc}, {cov}, {ten_digit_code}, {taxon}'#list of data per transcriptome
|
|
data.append(transcript_data)#data for each LKH
|
|
|
|
all_data.update({file.split('_assembledTranscripts')[0]: data})#make dict of ten digit code: all information
|
|
|
|
write_to_file()
|
|
|
|
def get_taxon_info(file):#parse the info in the new_names.tsv file to get info on the taxon
|
|
with open('new_names.tsv', 'r') as o:
|
|
cell_info = o.readlines()
|
|
for line in cell_info:
|
|
if '_'.join(file.split('_')[0:3]) in line:
|
|
ten_digit_code = line.split('\t')[0]
|
|
taxon = line.split('\t')[1]
|
|
|
|
try:
|
|
return ten_digit_code, taxon
|
|
except UnboundLocalError:
|
|
print(f'no taxon information in new_names.tsv for {file}')
|
|
|
|
print('done getting taxon info from new_names.tsv')
|
|
|
|
|
|
def write_to_file():
|
|
|
|
with open('assembly_assessment.csv', 'w') as o:#create output csv
|
|
print('writing data to assembly_assessment.csv\n')
|
|
o.write('seqID, length, GC, cov, ten_digit_code, taxon_info\n')#write header
|
|
for lkh, data in all_data.items():
|
|
for transcript in data:
|
|
o.write(f'{transcript.strip()}\n')#write each line
|
|
|
|
plot_assessment()
|
|
|
|
def plot_assessment():
|
|
print('Plotting your data\n')
|
|
os.system('Rscript plot_assemblies.R')
|
|
for file in os.listdir():
|
|
if file.endswith('.png') or file.endswith('.pdf'):
|
|
os.system(f'mv {file} plots/{file}')
|
|
|
|
print('\n\n\n\nDone! All your plots are saved to the folder called plots\n\n')
|
|
|
|
if __name__ == '__main__':
|
|
all_data = {}
|
|
|
|
try:
|
|
f = open("new_names.tsv")
|
|
|
|
except FileNotFoundError:
|
|
print('\n\n Did you include a tsv file of your cells? It should be called new_names.tsv and formatted like this: 10_digit_code\tdescriptor_of_taxon\n\n')
|
|
exit()
|
|
|
|
else:
|
|
get_args()
|