#load necessary packages
library(tidyverse)

# Read the raw per-sequence ENc table produced by the CUB script.
#
# The path below is relative to the working directory; run "getwd()" in the
# console to see it, and edit the file name/path to match your own output.
# If you used the CUB_v2.1.py script from Github, the file is in the
# CUBOutput folder, inside the SpreadSheets folder, and is named like
# CompTrans.ENc.Raw.tsv.
#
# data.frame() is kept on purpose: with its default check.names = TRUE it
# rewrites non-syntactic column headers into syntactic names.
# NOTE(review): the plot code refers to GC3.Degen, which presumably relies on
# this renaming -- confirm against the TSV header before removing it.
#
# taxon is the 10-digit code, i.e. the first 10 characters of SequenceID.
# A single substr() call is used so that a missing SequenceID stays NA
# instead of becoming the literal label "NANA".
gc3 <- data.frame(read_tsv('ENc.Raw.tsv')) |>
  mutate(taxon = substr(SequenceID, 1, 10))

# Lookup table of species names; this script uses its ten_digit_code and
# Species columns.
spp <- data.frame(read_tsv('Species_name.tsv'))

# Set the maximum text length you want. Only the letters of the name are
# counted (the spaces between words are not).
max_label_chars <- 20

# Shorten over-long names so they fit in the facet strips: when a name exceeds
# the limit, its second word (the species epithet) is replaced with "spp".
# - Missing names stay NA instead of aborting the script.
# - Names with a single word are left alone, since adding "spp" would only
#   make them longer.
# vapply() guarantees a character vector with one label per row.
spp$Species <- vapply(
  strsplit(as.character(spp$Species), " "),
  function(words) {
    if (anyNA(words)) {
      return(NA_character_)
    }
    if (length(words) >= 2 && sum(nchar(words)) > max_label_chars) {
      words[2] <- "spp"
    }
    paste(words, collapse = " ")
  },
  character(1),
  USE.NAMES = FALSE
)
# taxon_a is the display name for each sequence: the species name whose
# ten_digit_code equals the sequence's taxon code, or the code itself when
# the lookup table has no (non-missing) name for it.
#
# match() compares the codes literally and in a single pass, so characters
# that are special in regular expressions cannot cause false matches and a
# blank/NA code in the lookup table cannot wipe out every label.
# When a code is listed more than once, the first entry is used.
species_idx <- match(
  gc3$taxon,
  as.character(spp$ten_digit_code),
  incomparables = NA
)
species_hit <- spp$Species[species_idx]
has_species <- !is.na(species_hit)

gc3$taxon_a <- gc3$taxon
gc3$taxon_a[has_species] <- species_hit[has_species]

# Build the facet label for every row: display name on the first line, then
# the taxon code and the number of sequences that share that code.
# n() is evaluated per taxon group; ungroup() afterwards so the grouping does
# not leak into any later operation on gc3.
gc3 <- gc3 %>%
  group_by(taxon) %>%
  mutate(taxon_c = paste0(taxon_a, '\n', taxon, ', ', n())) %>%
  ungroup()

# Null-expectation curve for the plot. This .tsv is also written by the CUB
# script and sits in the same folder as the raw ENc table read above.
enc_null <- read_tsv('ENc.Null.tsv') |>
  data.frame()

# One panel per taxon label (taxon_c): observed values as points, with the
# null-expectation curve from enc_null drawn in every panel.
# To plot different variables, change the two columns in the first aes() call.
# as.numeric() makes sure R treats the plotted columns as numbers.
gc3_plot <- ggplot(
  data = gc3,
  mapping = aes(
    x = as.numeric(GC3.Degen),
    y = as.numeric(ObsWrightENc_No6Fold)
  )
) +
  geom_point(size = 0.1) +
  geom_line(data = enc_null, mapping = aes(x = GC3, y = ENc)) +
  facet_wrap(~taxon_c) +
  labs(
    title = "R2G NTD files",
    x = 'GC3 Degen',
    y = 'ObsWrightENc_No6fold'
  ) +
  theme_classic() +
  theme(
    legend.position = 'none',
    strip.text = element_text(size = 7),
    axis.text.x = element_text(colour = "black"),
    axis.text.y = element_text(colour = "black")
  )
gc3_plot