# Plot observed ENc against GC3 for every taxon (one facet per taxon) with the
# null-expectation curve overlaid on each facet.

# Load necessary packages ----
library(tidyverse)

# Read the per-sequence ENc table ----
# Change to the path of the directory you're working from.
# Use "getwd()" in the console to get the path.
# You may need to change the name of the input file below. If you used the
# CUB_v2.1.py script from GitHub, the file should be in the CUBOutput folder
# and then inside the SpreadSheets folder; you are looking for the
# CompTrans.ENc.Raw.tsv file.
#
# NOTE: the data.frame() wrapper is intentional. data.frame() sanitises column
# names (check.names = TRUE), and the plot below refers to the sanitised name
# GC3.Degen -- presumably the raw header contains a space or dash; confirm
# against your .tsv before removing the wrapper.
#
# The first 10 characters of SequenceID are the 10-digit code; they are stored
# in a column called taxon.
gc3 <- data.frame(read_tsv("ENc.Raw.tsv")) |>
  mutate(taxon = substr(SequenceID, 1, 10))

# Read the code -> species name lookup table ----
# Expected columns: ten_digit_code and Species.
spp <- data.frame(read_tsv("Species_name.tsv"))

# Set the maximum text length you want. Names whose words total more than this
# many characters (spaces not counted) get their second word replaced by "spp"
# so that the facet labels stay short.
max_label_chars <- 20

spp$Species <- vapply(
  strsplit(as.character(spp$Species), " ", fixed = TRUE),
  function(words) {
    # A missing species name stays missing instead of aborting the script.
    if (anyNA(words)) {
      return(NA_character_)
    }
    if (sum(nchar(words)) > max_label_chars) {
      words[2] <- "spp"
    }
    paste(words, collapse = " ")
  },
  character(1)
)

# Translate 10-digit codes into species names ----
# taxon_a starts as a copy of the code; every code found in the lookup table
# is replaced by its species name. Codes without an entry are left unchanged.
gc3$taxon_a <- gc3$taxon
for (i in seq_len(nrow(spp))) {
  code <- as.character(spp$ten_digit_code[i])
  species <- spp$Species[i]
  # Skip incomplete rows: an NA pattern or replacement would turn every
  # label into NA, and an empty pattern is not a meaningful code.
  if (is.na(code) || is.na(species) || !nzchar(code)) {
    next
  }
  # fixed = TRUE: match the code literally, not as a regular expression.
  gc3$taxon_a <- gsub(code, species, gc3$taxon_a, fixed = TRUE)
}

# Build the facet label: "<species>\n<code>, <number of sequences>" ----
gc3 <- gc3 |>
  group_by(taxon) |>
  mutate(taxon_c = paste0(taxon_a, "\n", taxon, ", ", n())) |>
  ungroup()

# Read the null expectation ----
# This .tsv is generated by the CUB script and will be in the same folder as
# the .tsv above. It provides the GC3 and ENc columns used to draw the null
# expectation line.
enc_null <- data.frame(read_tsv("ENc.Null.tsv"))

# Build the plot ----
# Change the columns in the first aes() call to what you want plotted.
# as.numeric() ensures R treats the variables as numbers.
gc3_plot <- ggplot(
  gc3,
  aes(as.numeric(GC3.Degen), as.numeric(ObsWrightENc_No6Fold))
) +
  geom_point(size = 0.1) +
  # enc_null has no taxon_c column, so the line is repeated in every facet.
  geom_line(data = enc_null, aes(GC3, ENc)) +
  theme_classic() +
  labs(x = "GC3 Degen", y = "ObsWrightENc_No6fold") +
  ggtitle("R2G NTD files") +
  theme(
    legend.position = "none",
    strip.text = element_text(size = 7),
    axis.text.x = element_text(colour = "black"),
    axis.text.y = element_text(colour = "black")
  ) +
  facet_wrap(~taxon_c)

# print() draws the plot whether the script is run interactively, via
# source(), or via Rscript.
print(gc3_plot)