EukPhylo/Utilities/For_Assemblies/plot_transcriptomes_v2.0.R

64 lines
2.6 KiB
R

#Author, date: Elinor Sterner(esterner27@gmail.com), March 2023.
#Motivation: To describe transcriptomes.
#Intent: Plot length, coverage and GC of assembled transcripts with stats generated from assess_transcriptomes.py.
#Dependencies: Python3
#Inputs: assess_transcriptomes.py output (assembly_assessment.csv)
#Outputs: Several plots showing the distribution of length, GC content, and Coverage.
library(ggplot2)
library(tidyverse)
library(scales)
#read in data output by python script
transcriptome_data <- read.csv('assembly_assessment.csv')
#make a column combining ten digit code and taxon information
transcriptome_data_expert_level <- transcriptome_data|>
mutate(taxon_id = paste(ten_digit_code, taxon_info, sep = "-"))
#plot length vs coverage, faceted and labeled by taxon
len_cov <- ggplot(transcriptome_data, aes(x = GC, y = length, color = taxon_info))+
geom_point(size = .5)+
ylim(200, 15000)+
facet_wrap(~ten_digit_code + taxon_info)+
ggtitle('GC% and length of Allogromia assembled transcripts')+
theme(strip.text = element_text(size = 7))
ggsave('len_cov.png', device = 'png', width = 8.5, height = 6)#save plot
#plot distribution of GC
dist_gc <- ggplot(transcriptome_data_expert_level, aes(x = GC, fill = taxon_info))+
geom_histogram()+
facet_wrap(~factor(taxon_id, levels = unique(taxon_id)))+# facet_grid(~factor(my_variable, levels=c('val1', 'val2', 'val3', ...)))
ggtitle('Distribution of GC')+
theme(strip.text = element_text(size = 5))
dist_gc
ggsave('gc_dist.png', device = 'png', width = 8, height = 6)
#plot distribution of length
dist_len <- ggplot(transcriptome_data_expert_level, aes(x = reorder(taxon_id,length), y = length, color = taxon_info, fill= taxon_info))+
geom_violin()+
geom_boxplot(color = 'Black', fill = 'NA', outlier.shape = NA, width = 0.5)+
ggtitle('Length Distribution')+
scale_y_continuous(labels = comma, trans = 'log10')+
ylab('Length (log10)')+
xlab('Ten digit code, taxon info')
dist_len + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
ggsave('length_violin.png', device = 'png',width = 7.5, height = 6)
#plot distribution of coverage
dist_cov <- ggplot(transcriptome_data_expert_level, aes(x = reorder(taxon_id,cov), y = cov, color = taxon_info, fill= taxon_info))+
geom_violin()+
geom_boxplot(color = 'Black', fill = 'NA', outlier.shape = NA, width = 0.5)+
ggtitle('Distribution of coverage')+
scale_y_continuous(labels = comma, trans = 'log10')+
ylab('Coverage (log10)')+
xlab('Ten digit code, taxon info')
dist_cov+theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
ggsave('cov_violin.png', device = 'png', width = 7.5, height = 6)