# EukPhylo/Utilities/for_fastas/Plotcomps_SppName.R
#
# 48 lines
# 1.8 KiB
# R

#load necessary packages
library(tidyverse)
# File paths below are relative to the working directory; run getwd() in the
# console to see it, and change the paths if your files live elsewhere.
# If you used the CUB_v2.1.py script from Github, the raw ENc table is in the
# CUBOutput folder, inside the SpreadSheets folder, and is named
# CompTrans.ENc.Raw.tsv -- adjust the file name below to match yours.
#
# data.frame() is kept around read_tsv() on purpose: it applies its default
# column-name checking, and the plotting code refers to the column as GC3.Degen.
# NOTE(review): presumably the header in the .tsv is not a syntactic R name
# (e.g. "GC3-Degen") -- confirm before replacing data.frame() with a tibble.
#
# `taxon` holds the ten-digit code, i.e. the first 10 characters of SequenceID.
gc3 <- data.frame(read_tsv('ENc.Raw.tsv')) |>
  mutate(taxon = substr(SequenceID, 1, 10))
# Lookup table of species names; the script uses its `ten_digit_code` and
# `Species` columns.
spp <- data.frame(read_tsv('Species_name.tsv'))
# Set the maximum text length you want: a name whose words add up to more than
# this many characters (spaces not counted) gets its second word replaced by
# "spp" so that the facet labels stay short.
max_name_chars <- 20
spp$Species <- vapply(
  strsplit(as.character(spp$Species), " ", fixed = TRUE),
  function(words) {
    # A missing name stays missing instead of stopping the script at the
    # length comparison below.
    if (anyNA(words)) {
      return(NA_character_)
    }
    if (sum(nchar(words)) > max_name_chars) {
      words[2] <- "spp"
    }
    paste(words, collapse = " ")
  },
  character(1)
)
# taxon_a starts out as the ten-digit code; every code listed in `spp` is then
# swapped for its species name. Codes without an entry in `spp` are left as is.
gc3$taxon_a <- gc3$taxon
for (i in seq_len(nrow(spp))) {
  code_i <- spp$ten_digit_code[i]
  name_i <- spp$Species[i]
  # Skip incomplete rows: a missing code or name would blank out labels rather
  # than rename them, and an empty code is not a usable search pattern.
  if (is.na(code_i) || is.na(name_i) || !nzchar(code_i)) {
    next
  }
  # fixed = TRUE matches the code literally instead of as a regular expression.
  gc3$taxon_a <- gsub(code_i, name_i, gc3$taxon_a, fixed = TRUE)
}
# Facet label for each sequence: the species name on the first line, then the
# ten-digit code and the number of sequences belonging to that code.
# The grouping is only needed for n(), so it is dropped again afterwards to
# keep later operations on gc3 from silently running per taxon.
gc3 <- gc3 %>%
  group_by(taxon) %>%
  mutate(taxon_c = paste0(taxon_a, '\n', taxon, ', ', n())) %>%
  ungroup()
# Null-expectation curve drawn as a line on the plot. The CUB script writes
# this .tsv into the same folder as the raw ENc table read above.
enc_null <- read_tsv('ENc.Null.tsv') |>
  data.frame()
# One panel per taxon label (taxon_c): sequences as points, with the null
# expectation from enc_null drawn as a line in every panel.
# Change the two columns inside aes() to plot different variables;
# as.numeric() makes sure R treats them as numbers.
gc3_plot <- ggplot(
  gc3,
  aes(x = as.numeric(GC3.Degen), y = as.numeric(ObsWrightENc_No6Fold))
) +
  geom_point(size = 0.1) +
  geom_line(data = enc_null, mapping = aes(x = GC3, y = ENc)) +
  facet_wrap(~taxon_c) +
  labs(x = 'GC3 Degen', y = 'ObsWrightENc_No6fold') +
  ggtitle("R2G NTD files") +
  theme_classic() +
  # All tweaks to the classic theme are collected in a single call.
  theme(
    legend.position = 'none',
    strip.text = element_text(size = 7),
    axis.text.x = element_text(colour = "black"),
    axis.text.y = element_text(colour = "black")
  )
# Display the finished plot.
gc3_plot