1 Application of NovelTree to TSAR Eukaryotes.

Here, we summarize the multitude of results produced by the NovelTree workflow. Because results come from numerous distinct software, we have provided a complementary script containing helper functions to summarize these outputs.


1.1 Preparation

We will begin by creating a variable that specifies where all workflow outputs are stored. Additionally we will source the script containing helper functions, and setting to variable some colors that we’ll use throughout.

suppressPackageStartupMessages(
  suppressWarnings(source('./noveltree-summary-functions.R')))
source('./arcadia-color-gradients.R')

# Specify where the workflow outputs are stored
result_directory <- 'results-tsar-eukaryotes-06062023/'

# And where summarized results should be saved. 
dir.create('./summarized-results', showWarnings = F)

# Some colors used for plotting each of the four supergroups 
# within TSAR Eukaryotes
tsar_cols <- 
  c(Telonemia = '#8A99AD', Rhizaria = '#FFB984', 
    Alveolata = '#C85152', Stramenopiles = '#97CD78')

1.2 BUSCO

Let’s begin by plotting the results of our BUSCO analyses. These were conducted at a broad taxonomic scale using the eukaryota_odb10 database for each species, as well as at shallower taxonomic scales, using either the alveolate, or stramenopile odb10 busco database.

In the section of code below, we read in BUSCO results, summarize them into useful visualizations, and plot these alongside the manually rooted species tree inferred from Asteroid, with branch lengths optimized using SpeciesRax.

busco_summary_fpaths <- list.files(path = paste0(result_directory, 'busco'), pattern = 'batch', full.names = T)
busco_broad_fpaths <- busco_summary_fpaths[grepl("eukaryota", busco_summary_fpaths)]
busco_shallow_fpaths <- busco_summary_fpaths[!grepl("eukaryota", busco_summary_fpaths)]

busco_broad_res <- do.call(rbind, lapply(busco_broad_fpaths, read_busco_summary))
busco_shallow_res <- do.call(rbind, lapply(busco_shallow_fpaths, read_busco_summary))

# Read in the species tree to order plotting by taxonomy
# Since we haven't explicitly done so yet, read in the SpeciesRax tree to variable
spptree <- ladderize(read.tree(paste0(result_directory, 'speciesrax/species_trees/inferred_species_tree.newick')), right = F)

# Read in samplesheet to plot taxonomic group along with the species tree
samplesheet <- read.table(paste0(result_directory, 'pipeline_info/complete_samplesheet.valid.csv'),
                          sep = ",", header = T)
taxa_tibble <- tibble(
  tip = samplesheet$species,
  grp = samplesheet$taxonomy)

# Below, we figure out if branches are terminal or not (i.e. correspond to
# extant species), and then pull transfer events corresponding to these
# branches out, converting into a pairwise matrix, with donors as columns,
# recipients as rows.
is_tip <- spptree$edge[,2] <= length(spptree$tip.label)
ordered_tips <- rev(spptree$edge[is_tip, 2])

# Now, plot the species tree alongside a heatmap of pairwise transfers.
tree_p <- 
  ggtree(spptree, ladderize = T, size = 1.25) %<+% taxa_tibble +
  new_scale_fill() +
  geom_tippoint(aes(fill = grp), pch = 22, 
                      size = 3, color = "black") + 
  scale_fill_manual(values = tsar_cols) + 
  guides(fill = guide_legend(title = 'Supergroup', nrow = 2)) + 
  theme(legend.position = 'top')

busco_broad_res$File <- factor(busco_broad_res$File, levels = rev(spptree$tip.label[ordered_tips]))
busco_shallow_res$File <- factor(busco_shallow_res$File, levels = rev(spptree$tip.label[ordered_tips]))
busco_broad_plt <- plot_busco(busco_broad_res) + 
  theme(legend.position = 'none', 
        axis.title.y = element_blank(),
        title = element_blank())
# Use the dataset as the label, and shift to the right
busco_broad_plt$data$Label <- busco_broad_plt$data$Lineage
busco_broad_plt$layers[[2]]$mapping$x <- 75

busco_shallow_plt <- plot_busco(busco_shallow_res) + 
  theme(axis.text.y = element_blank(), 
        axis.title.y = element_blank(), 
        title = element_blank())
# Add the dataset to the labels
busco_shallow_plt$data$Label <- busco_shallow_plt$data$Lineage
busco_shallow_plt$layers[[2]]$mapping$x <- 75
# Plot together and combine with the tree
busco_plt <- 
  plot_grid(busco_broad_plt, busco_shallow_plt, 
            ncol = 2, rel_widths = c(1, 0.95)) 
busco_plt <- aplot::insert_right(tree_p + theme(legend.position = 'none'), busco_plt, width = 3.5)

# Plot it and save!
ggsave(busco_plt, filename = 'summarized-results/busco_results_w_spp_tree.png', height = 10, width = 20, dpi = 600)
ggsave(busco_plt, filename = 'summarized-results/busco_results_w_spp_tree.pdf', height = 10, width = 20)

# Modify font size of labels for plotting in R-markdown.
busco_broad_plt$layers[[2]]$aes_params$size <- 4
busco_shallow_plt$layers[[2]]$aes_params$size <- 4
busco_plt <- 
  plot_grid(busco_broad_plt, busco_shallow_plt, 
            ncol = 2, rel_widths = c(1, 0.95)) 
busco_plt <- aplot::insert_right(tree_p + theme(legend.position = 'none'), busco_plt, width = 3.5)
busco_plt


1.3 Gene Family Summaries

Now, let’s go ahead and summarize the results from OrthoFinder. To start, this will just include some high-level summaries - number of species per gene family, gene copy number per gene family, etc. This will create and populate a new directory within the NovelTree directory, summarized-results. Within this directory, we will store OrthoFinder-specific results within a subdirectory, intuitively named orthofinder.

# Summarize gene family content, both with respect to number of species 
# included in each gene family, and the per-species mean copy number
summ_genefam_composition(results_dir = result_directory, show_plots = T)

# And get orthogroup summary statistics per species
per_spp_of_stats <-
  get_per_spp_ofinder_stats(tree_fpath = paste0(result_directory, 'speciesrax/species_trees/inferred_species_tree.newick'),
                        samplesheet_fpath = paste0(result_directory, 'pipeline_info/complete_samplesheet.valid.csv'),
                        results_dir = result_directory, grp_name = 'Supergroup',
                        tip_grp_cols = tsar_cols, 
                        count_cols = arcadia_cividis,
                        prop_cols = arcadia_viridis)

# Lastly, get the per-species orthogroup counts/copy numbers
og_counts <- get_per_spp_og_counts(results_dir = result_directory)

1.4 Species Trees

Let’s go ahead and visualize the species trees inferred from Asteroid and SpeciesRax:

  1. The Asteroid species tree is unrooted:
    • Bipartition support values are obtained through 100 bootstrap replicates.
  2. The SpeciesRax tree is rooted:
    • Inferred using the set of gene family trees (GFTs) that:
      • include at least four species, and
      • a maximum copy number of 5.
    • The tree inference procedure includes:
      • Inferring a reasonable starting tree with MiniNJ, and
      • subsequently optimizing the species tree topology and root position under an explicit model of gene family Duplication, Transfer, and Loss (DTL) through reconciliation and topology optimization of the GFTs used to infer it.
    • Bipartition support values are EPQIC values, a paralogy-aware quartet-based method that ranges between -1 and 1.
    • Interpretation is as follows (as described on the SpeciesRax wiki):
      • 1.0: perfect support
      • > 0.5: excellent support
      • > 0.2: very good support
      • > 0.0: positive support
      • < 0.0: negative support (one alternative quartet is more represented)
asteroid_plt <-
  plot_spp_tree(tree_fpath = paste0(result_directory, 'asteroid/asteroid.bestTree.newick'),
                samplesheet_fpath = paste0(result_directory, 'pipeline_info/complete_samplesheet.valid.csv'),
                grp_name = 'Supergroup', outgroup = c("Telonema_sp_P-2", "Telonema_subtile"),
                tip_grp_cols = tsar_cols, cladogram = T)

spprax_plt <-
  plot_spp_tree(tree_fpath = paste0(result_directory, 'speciesrax/species_trees/inferred_species_tree.newick'),
                samplesheet_fpath = paste0(result_directory, 'pipeline_info/complete_samplesheet.valid.csv'),
                grp_name = 'Supergroup',
                nodelab_name = 'EPQIC',
                tip_grp_cols = tsar_cols)

ggsave(asteroid_plt, height = 7, width = 6, filename = './summarized-results/asteroid_spptree_w_boots.pdf')
ggsave(spprax_plt, height = 7, width = 6, filename = './summarized-results/speciesrax_spptree_w_epqic.pdf')
ggsave(asteroid_plt, height = 7, width = 6, filename = './summarized-results/asteroid_spptree_w_boots.png', dpi = 600)
ggsave(spprax_plt, height = 7, width = 6, filename = './summarized-results/speciesrax_spptree_w_epqic.png', dpi = 600)

# And go ahead and view these:
asteroid_plt

spprax_plt


1.5 GeneRax

Now lets go ahead and summarize, plot, and reformat in more user-friendly ways the large amount of results that are generated from GeneRax.

1.5.1 Preparation

We will begin by first reading in all outputs and reformatting, saving the resultant tables for any additional downstream analyses. The functions used to reformat GeneRax outputs are fairly involved, producing numerous summarized outputs, facilitating downstream analysis and visualization. Briefly, these functions are:

  1. summarize_generax_per_family, which summarizes the results of the per-family model of GeneRax. The object created from this function is a named list that contains:
    • rates_per_og: For each gene family, inferred rates of duplication, transfer, and loss, assuming constant rates across the gene family.
    • events_per_og: Counts of duplications, transfers, and losses per gene family, summed across all species.
    • lgt_count_mat: A matrix of the sum of transfer events between pairs of species, across all gene families.
    • speciations_per_spp: For each gene family, the counts of gene speciation events per species.
    • duplications_per_spp: For each gene family, the counts of duplication events per species.
    • losses_per_spp: For each gene family, the counts of gene loss events per species.
    • transfer_donor_counts: For each gene family, the counts of transfer donor events per species.
    • transfer_recip_counts: For each gene family, the counts of transfer recipient events per species.
  2. generax_res_per_species, which summarizes the results of the per-species model of GeneRax. The object created from this function is a named list that contains the same elements as in the previous results, but additionally includes:
    • spp_rates_per_og: The rates of each event type for each branch in the species tree.

Because there are typically thousands to tends of thousands of gene families, there are a large number of calculations that need to be conducted. Consequently, to expedite this process, these functions are parallelized - I suggest taking advantage of this!

generax_res_per_fam <-
  summarize_generax_per_family(results_dir = result_directory, 
                               per_spp_og_counts = og_counts,
                               nparallel = detectCores(), 
                               out_dir = 'summarized-results/generax-per-family/')
## Extracting event counts per-orthogroup, across all species.
## Extracting event rates per-orthogroup, across all species.
## Extracting event counts per-species, per-orthogroup.
## Now, pulling out each event type individually.
## Summarizing gene transfer recipient events.
## Summarizing gene transfer events into a matrix of donor-recipient species pairs.
## Pulling out the count of transfer-donor events for each species per gene family
## Pulling out the count of transfer-recipient events for each species per gene family
generax_res_per_species <-
  summarize_generax_per_species(results_dir = result_directory, 
                                per_spp_og_counts = og_counts,
                                nparallel = detectCores(), 
                                out_dir = 'summarized-results/generax-per-species/')
## Extracting event counts for each species per gene family.
## Extracting event rates per-species, for each gene family.
## Extracting event counts per-species, per-orthogroup.
## Now, pulling out each event type individually.
## Summarizing gene transfer recipient events.
## Summarizing gene transfer events into a matrix of donor-recipient species pairs.
## Pulling out the count of transfer-donor events for each species per gene family
## Pulling out the count of transfer-recipient events for each species per gene family

1.5.2 Results: Per-Family

In the following section, we summarize results obtained from the per-family model of gene family evolution implemented in GeneRax. Here, it is assumed that rates of duplication/transfer/loss are constant across all species/branches in the species tree.

1.5.2.1 D/T/L rates: Per-family

Now let’s visualize rates of duplication, transfer, and loss per gene family.

# reformat the per-og event rates to a melted data frame for plotting
rates_per_og <-
  melt(generax_res_per_fam$rates_per_og, id.vars = "gene_family",
       variable.name = "Event", value.name = "Rate")
rates_per_og$Rate <-
  as.numeric(rates_per_og$Rate)

# Plot the rates per orthogroup
rates_per_og_plt <-
  ggplot(data = rates_per_og,
         aes(fill = Event, x = Event, y = Rate)) +
  geom_sina(aes(color = Event), alpha = 0.5, pch = 20, size = 1) +
  geom_violin(alpha = 0.25) +
  stat_boxplot(geom = 'errorbar', width = 0.05, linewidth = 1) +
  geom_boxplot(width = 0.05, linewidth = 0.5, outlier.color = NA, notch = T) +
  scale_fill_brewer(type = 'qual', palette = 2) +
  scale_color_brewer(type = 'qual', palette = 2) +
  theme_classic(base_size = 14) +
  theme(legend.position = 'none') +
  xlab('Event Rate')

# Save out to a pdf
ggsave(rates_per_og_plt, filename = "summarized-results/generax-per-family/per_og_generax_event_rates.pdf",
       height = 8, width = 8)

# And let's take a look:
rates_per_og_plt

Let’s also visualize in three separate panes showing how the three rates co-vary.

rates_per_og <- generax_res_per_fam$rates_per_og
rates_per_og[,-1] <- apply(rates_per_og[,-1], 2, as.numeric)
ylimit <- c(0, round(max(rates_per_og[,-1]) + 0.005, 2))
dup_loss_rate_plt <-
  ggplot(data = rates_per_og,
         aes(fill = transfer, x = duplication, y = loss)) +
  geom_point(pch = 21, size = 3, alpha = 0.5) +
  scale_fill_distiller(type = 'seq', palette = 3, name = 'Transfer Rate') +
  theme_classic(base_size = 14) +
  theme(legend.position = 'top', legend.justification = 'left',
        legend.key.width = unit(1,"cm")) +
  guides(fill = guide_colorbar(title.position="top", title.hjust = 0)) +
  xlab('Duplication Rate') +
  ylab('Loss Rate') +
  ylim(ylimit)

dup_transf_rate_plt <-
  ggplot(data = rates_per_og,
         aes(fill = loss, x = duplication, y = transfer)) +
  geom_point(pch = 21, size = 3, alpha = 0.5) +
  scale_fill_distiller(type = 'seq', palette = 7, name = 'Loss Rate') +
  theme_classic(base_size = 14) +
  theme(legend.position = 'top', legend.justification = 'left',
        legend.key.width = unit(1,"cm")) +
  guides(fill = guide_colorbar(title.position="top", title.hjust = 0)) +
  xlab('Duplication Rate') +
  ylab('Transfer Rate') +
  ylim(ylimit)

transf_loss_rate_plt <-
  ggplot(data = rates_per_og,
         aes(fill = duplication, x = transfer, y = loss)) +
  geom_point(pch = 21, size = 3, alpha = 0.5) +
  scale_fill_distiller(type = 'seq', palette = 2, name = 'Duplication Rate') +
  theme_classic(base_size = 14) +
  theme(legend.position = 'top', legend.justification = 'left',
        legend.key.width = unit(1,"cm")) +
  guides(fill = guide_colorbar(title.position="top", title.hjust = 0)) +
  xlab('Transfer Rate') +
  ylab('Loss Rate') +
  ylim(ylimit)

# Combine them, visualize, and save
og_event_rate_2d_plts <-
  plot_grid(dup_loss_rate_plt, dup_transf_rate_plt,
            transf_loss_rate_plt, ncol = 3)

ggsave(og_event_rate_2d_plts, filename = 'summarized-results/generax-per-family/dup_transf_loss_2d_plots.pdf',
       height = 6, width = 14)

og_event_rate_2d_plts

And finally, let’s plot these rates interactively in three dimensions!

rates_per_og <- generax_res_per_fam$rates_per_og
rates_per_og[,-1] <- apply(rates_per_og[,-1], 2, as.numeric)
rates_per_og$verticality <- 
  rowSums(apply(generax_res_per_fam$events_per_og[,c(2,3,4,7)], 2, as.numeric)) / 
  rowSums(apply(generax_res_per_fam$events_per_og[,c(2:5,7)], 2, as.numeric))

# Get the total copy number and count of species for each gene family, either across all 
# species, or per taxonomic supergroup:
og_tax_count <- 
  data.frame("gene_family" = generax_res_per_fam$events_per_og$gene_family,
             "total_number_species" = generax_res_per_fam$events_per_og$number_species,
             "alveolata_number_species" = NA, "rhizaria_number_species" = NA, 
             "stramenopile_number_species" = NA, "telonemia_number_species" = NA)
og_tax_copynum <- 
  data.frame("gene_family" = generax_res_per_fam$events_per_og$gene_family,
             "total_copy_number" = generax_res_per_fam$events_per_og$number_gene_copies,
             "alveolata_copy_number" = NA, "rhizaria_copy_number" = NA,
             "stramenopile_copy_number" = NA, "telonemia_copy_number" = NA)

grps <- unique(samplesheet$taxonomy)
for(i in 1:length(grps[order(grps)])){
  # Identify the group we're dealing with
  grp <- grps[i]
  # Pull out count data for the corresponding species
  spps <- samplesheet$species[which(samplesheet$taxonomy == grp)]
  count_dat <- og_counts[which(og_counts$Orthogroup %in% rates_per_og$gene_family),which(colnames(og_counts) %in% spps)]
  # Identify the count of species within with taxonomic group that occur in each gene family
  num_spp <- apply(count_dat, 1, function(x) length(which(x > 0)))
  copynum <- rowSums(count_dat)
  og_tax_count[,2+i] <- num_spp
  og_tax_copynum[,2+i] <- copynum
}
og_tax_summary <- merge(og_tax_count, og_tax_copynum)

event_rates_3d_plt <-
  plot_ly(x = rates_per_og$duplication, y = rates_per_og$loss,
          z = rates_per_og$transfer, hoverinfo = 'text',
          marker = list(size = 4, opacity = 0.95, showscale = T, color = rates_per_og$transfer,
                        colorscale = list(c(0, 0.33, 0.66, 1.0), unlist(arcadia_viridis$color_dict)),
                        colorbar = list(title = "Verticality")),
          text = ~paste0('Family: ', rates_per_og$gene_family, '<br>', 
                          'Total # Spp.: ', og_tax_summary$total_number_species, '<br>',
                          'Total Copy #: ', og_tax_summary$total_copy_number, '<br>',
                          '# Telonemia: ', og_tax_summary$telonemia_number_species, '<br>',
                          '# Alveolates: ', og_tax_summary$alveolata_number_species, '<br>',
                          '# Stramenopiles: ', og_tax_summary$stramenopile_number_species, '<br>',
                          '# Rhizariates: ', og_tax_summary$rhizaria_number_species, '<br>'),
          mode = 'markers', type = 'scatter3d') %>%
  layout(scene = list(xaxis = list(title = "Duplication Rate"),
                      yaxis = list(title = "Loss Rate"),
                      zaxis = list(title = "Transfer Rate")))

event_rates_3d_plt
# And save this out to an HTML
htmlwidgets::saveWidget(as_widget(event_rates_3d_plt), "summarized-results/generax-per-family/generax_per_og_event_rates_3d.html")

1.5.2.2 Tranfer counts among species

Now, lets summarize the rates/occurrence of pairwise horizontal transfer between species (terminal branches of the phylogeny). Eventually will build/include additional visualization of transfer rates between internal branches; that is a significantly more complicated problem to solve, however.

# Since we haven't explicitly done so yet, read in the SpeciesRax tree to variable
spptree <- ladderize(read.tree(paste0(result_directory, 'speciesrax/species_trees/inferred_species_tree.newick')), right = F)

# Read in samplesheet to plot taxonomic group along with the species tree
samplesheet <- read.table(paste0(result_directory, 'pipeline_info/complete_samplesheet.valid.csv'),
                          sep = ",", header = T)
taxa_tibble <- tibble(
  tip = samplesheet$species,
  grp = samplesheet$taxonomy)
    
# Pull these data out and reformat for plotting
transf_count_mat <-
  melt(generax_res_per_fam$lgt_count_mat, varnames = c('Donor', 'Recipient'))

# Below, we figure out if branches are terminal or not (i.e. correspond to
# extant species), and then pull transfer events corresponding to these
# branches out, converting into a pairwise matrix, with donors as columns,
# recipients as rows.
is_tip <- spptree$edge[,2] <= length(spptree$tip.label)
ordered_tips <- rev(spptree$edge[is_tip, 2])

transf_count_mat$Donor <-
  factor(transf_count_mat$Donor, levels = spptree$tip.label[ordered_tips])
transf_count_mat$Recipient <-
  factor(transf_count_mat$Recipient, levels = rev(spptree$tip.label[ordered_tips]))
transf_count_mat$value <- as.numeric(transf_count_mat$value)

# Now, plot the species tree alongside a heatmap of pairwise transfers.
tree_p <- 
  ggtree(spptree, ladderize = T, size = 1.25) %<+% taxa_tibble +
  new_scale_fill() +
  geom_tippoint(aes(fill = grp), pch = 22, 
                      size = 3, color = "black") + 
  scale_fill_manual(values = tsar_cols) + 
  guides(fill = guide_legend(title = 'Supergroup', nrow = 2)) + 
  theme(legend.position = 'top')

tax_leg <- get_legend(tree_p)

# Now plot with the taxonomic group tip labels:
transf_heatmap <-
  ggplot(data = transf_count_mat,
         aes(x = Donor, y = Recipient, fill = value)) +
  geom_tile() +
  theme_classic(base_size = 14) +
  scale_y_discrete(position = 'right') +
  guides(fill = guide_colorbar(title = 'Transfer Count',
                      title.position = 'top',
                      title.hjust = 0)) +
  theme(legend.position = 'top',
        axis.text.x = element_text(size = 10, angle = 45, hjust=1),
        axis.text.y = element_text(size = 10),
        legend.key.width = unit(2,'cm')) +
  scale_fill_gradientn(colors = arcadia_magma$color_dict,
                       values = arcadia_magma$values, 
                       trans = "log10", 
                       na.value = arcadia_magma$values[1])

transfer_leg <- get_legend(transf_heatmap)
transf_heatmap <- transf_heatmap + theme(legend.position = 'none')
transf_heatmap <- transf_heatmap %>% aplot::insert_left(tree_p + theme(legend.position = 'none'), width = 0.5)

legends <- plot_grid(tax_leg, transfer_leg, nrow = 1)
transf_heatmap_plt <-
  plot_grid(legends, ggplotify::as.grob(transf_heatmap),
            nrow = 2, rel_heights = c(0.1, 0.9))

# Save the figure to a pdf
ggsave(transf_heatmap_plt, filename = 'summarized-results/generax-per-family/species_pairwise_hgt_event_count.pdf',
       height = 9.5, width = 12)
ggsave(transf_heatmap_plt, filename = 'summarized-results/generax-per-family/species_pairwise_hgt_event_count.png',
       height = 9.5, width = 12, dpi = 600)

transf_heatmap_plt

# And let's visualize this heatmap interactively with plotly!
gg <- 
  ggplot(data = transf_count_mat,
         aes(x = Donor, y = Recipient, fill = value+1)) +
  geom_tile(aes(text = value)) +
  theme_classic(base_size = 14) +
  scale_fill_gradientn(colors = arcadia_magma$color_dict,
                       values = arcadia_magma$values, 
                       na.value = arcadia_magma$values[1],
                       trans = 'log10') +
  scale_y_discrete(position = 'right') +
  theme(legend.position = 'top',
        axis.text.x = element_text(size = 6, angle = 45, hjust=1),
        axis.text.y = element_text(size = 6),
        legend.key.width = unit(2,'cm'))

transf_heatmap_plotly <- ggplotly(gg)
transf_heatmap_plotly$x$data[[2]]$marker$colorbar$title <- "Transfer\nCount"
transf_heatmap_plotly$x$data[[1]]$text <- sub("value \\+ 1: .*/>", "Number Transfers:", transf_heatmap_plotly$x$data[[1]]$text)
transf_heatmap_plotly$x$data[[2]]$marker$colorbar$thickness <- 35
transf_heatmap_plotly
# And save this out to an HTML
htmlwidgets::saveWidget(as_widget(transf_heatmap_plotly), "summarized-results/generax-per-family/species_pairwise_hgt_event_count.html")

1.5.3 Results: Per-Species

In the following section, we summarize results obtained from the per-species model of gene family evolution implemented in GeneRax. Here, each species/branch in the species tree may have its own rate of duplication/transfer/loss.

1.5.3.1 D/T/L rates: Per-family

Now let’s visualize rates of duplication, transfer, and loss per species, measured for each gene family.

# reformat the per-og event rates to a melted data frame for plotting
rates_per_spp <- generax_res_per_species$spp_rates_per_og
rates_per_spp[,-c(1,2)] <- apply(rates_per_spp[,-c(1,2)], 2, as.numeric)
rates_per_og <- rates_per_spp %>% 
  summarise(.by = gene_family, duplication = mean(duplication), 
            loss = mean(loss), transfer = mean(transfer)) 
rates_per_og <-
  melt(rates_per_og, id.vars = "gene_family",
       variable.name = "Event", value.name = "Rate")

# Plot the rates per orthogroup
rates_per_og_plt <-
  ggplot(data = rates_per_og,
         aes(fill = Event, x = Event, y = Rate)) +
  geom_sina(aes(color = Event), alpha = 0.5, pch = 20, size = 1) +
  geom_violin(alpha = 0.25) +
  stat_boxplot(geom = 'errorbar', width = 0.05, linewidth = 1) +
  geom_boxplot(width = 0.05, linewidth = 0.5, outlier.color = NA, notch = T) +
  scale_fill_brewer(type = 'qual', palette = 2) +
  scale_color_brewer(type = 'qual', palette = 2) +
  theme_classic(base_size = 14) +
  theme(legend.position = 'none') +
  xlab('Event Rate')

# Save out to a pdf
ggsave(rates_per_og_plt, filename = "summarized-results/generax-per-species/per_og_generax_event_rates.pdf",
       height = 8, width = 8)

# And let's take a look:
rates_per_og_plt

Let’s also visualize in three separate panes showing how the three rates co-vary.

rates_per_spp <- generax_res_per_species$spp_rates_per_og
rates_per_spp[,-c(1,2)] <- apply(rates_per_spp[,-c(1,2)], 2, as.numeric)
rates_per_og <- rates_per_spp %>% 
  summarise(.by = gene_family, duplication = mean(duplication), 
            loss = mean(loss), transfer = mean(transfer)) 

ylimit <- c(0, round(max(rates_per_og[,-1]) + 0.005, 2))
dup_loss_rate_plt <-
  ggplot(data = rates_per_og,
         aes(fill = transfer, x = duplication, y = loss)) +
  geom_point(pch = 21, size = 3, alpha = 0.5) +
  scale_fill_distiller(type = 'seq', palette = 3, name = 'Transfer Rate') +
  theme_classic(base_size = 14) +
  theme(legend.position = 'top', legend.justification = 'left',
        legend.key.width = unit(1,"cm")) +
  guides(fill = guide_colorbar(title.position="top", title.hjust = 0)) +
  xlab('Duplication Rate') +
  ylab('Loss Rate') +
  ylim(ylimit)

dup_transf_rate_plt <-
  ggplot(data = rates_per_og,
         aes(fill = loss, x = duplication, y = transfer)) +
  geom_point(pch = 21, size = 3, alpha = 0.5) +
  scale_fill_distiller(type = 'seq', palette = 7, name = 'Loss Rate') +
  theme_classic(base_size = 14) +
  theme(legend.position = 'top', legend.justification = 'left',
        legend.key.width = unit(1,"cm")) +
  guides(fill = guide_colorbar(title.position="top", title.hjust = 0)) +
  xlab('Duplication Rate') +
  ylab('Transfer Rate') +
  ylim(ylimit)

transf_loss_rate_plt <-
  ggplot(data = rates_per_og,
         aes(fill = duplication, x = transfer, y = loss)) +
  geom_point(pch = 21, size = 3, alpha = 0.5) +
  scale_fill_distiller(type = 'seq', palette = 2, name = 'Duplication Rate') +
  theme_classic(base_size = 14) +
  theme(legend.position = 'top', legend.justification = 'left',
        legend.key.width = unit(1,"cm")) +
  guides(fill = guide_colorbar(title.position="top", title.hjust = 0)) +
  xlab('Transfer Rate') +
  ylab('Loss Rate') +
  ylim(ylimit)

# Combine them, visualize, and save
og_event_rate_2d_plts <-
  plot_grid(dup_loss_rate_plt, dup_transf_rate_plt,
            transf_loss_rate_plt, ncol = 3)

ggsave(og_event_rate_2d_plts, filename = 'summarized-results/generax-per-species/dup_transf_loss_2d_plots.pdf',
       height = 6, width = 14)

counts <- generax_res_per_fam$events_per_og
rates_per_og$verticality <- rowSums(apply(counts[,c(2,3,4,7)], 2, as.numeric)) / rowSums(apply(counts[,c(2:5,7)], 2, as.numeric))
rates_per_og$td_ratio <- rates_per_og$transfer / rates_per_og$duplication
rates_per_og$dl_ratio <- rates_per_og$duplication / rates_per_og$loss
rates_per_og$tl_ratio <- rates_per_og$transfer / rates_per_og$loss

td_d_plt <- 
  ggplot(data = rates_per_og,
         aes(fill = verticality, x = duplication, y = td_ratio)) +
  geom_point(pch = 21, size = 3) +
  scale_fill_distiller(type = 'seq', palette = 2, name = 'Verticality') +
  theme_classic(base_size = 14) +
  theme(legend.position = 'top', legend.justification = 'left',
        legend.key.width = unit(1,"cm")) +
  guides(fill = guide_colorbar(title.position="top", title.hjust = 0)) +
  xlab('Duplication Rate') +scale_y_log10() + scale_x_log10() +
  ylab('Transfer Rate / Duplication Rate')

dl_l_plt <- 
  ggplot(data = rates_per_og,
         aes(fill = verticality, x = loss, y = dl_ratio)) +
  geom_point(pch = 21, size = 3) +
  scale_fill_distiller(type = 'seq', palette = 2, name = 'Verticality') +
  theme_classic(base_size = 14) +
  theme(legend.position = 'top', legend.justification = 'left',
        legend.key.width = unit(1,"cm")) +
  guides(fill = guide_colorbar(title.position="top", title.hjust = 0)) +
  xlab('Loss Rate') + scale_y_log10() + scale_x_log10() +
  ylab('Duplication Rate / Loss Rate')

tl_l_plt <- 
  ggplot(data = rates_per_og,
         aes(fill = verticality, x = loss, y = tl_ratio)) +
  geom_point(pch = 21, size = 3) +
  scale_fill_distiller(type = 'seq', palette = 2, name = 'Verticality') +
  theme_classic(base_size = 14) +
  theme(legend.position = 'top', legend.justification = 'left',
        legend.key.width = unit(1,"cm")) +
  guides(fill = guide_colorbar(title.position="top", title.hjust = 0)) +
  xlab('Loss Rate') + scale_y_log10() + scale_x_log10() +
  ylab('Transfer Rate / Loss Rate')

og_event_rate_2d_plts

And let’s plot these rates interactively in three dimensions!

rates_per_spp <- generax_res_per_species$spp_rates_per_og
rates_per_spp[,-c(1,2)] <- apply(rates_per_spp[,-c(1,2)], 2, as.numeric)
rates_per_og <- rates_per_spp %>% 
  summarise(.by = gene_family, duplication = mean(duplication), 
            loss = mean(loss), transfer = mean(transfer)) 

counts <- generax_res_per_fam$events_per_og
rates_per_og$verticality <- 
  rowSums(apply(generax_res_per_species$events_per_og[,c(2,3,4,7)], 2, as.numeric)) / 
  rowSums(apply(generax_res_per_species$events_per_og[,c(2:5,7)], 2, as.numeric))

event_rates_3d_plt <-
  plot_ly(x=rates_per_og$duplication, y= rates_per_og$loss,
          z=rates_per_og$transfer, hoverinfo = 'text',
          marker = list(size = 4, opacity = 0.95, showscale = T, color = rates_per_og$transfer,
                        colorscale = list(c(0, 0.33, 0.66, 1.0), unlist(arcadia_viridis$color_dict)),
                        colorbar = list(title = "Verticality")),
          text = ~paste0('Family: ', rates_per_og$gene_family, '<br>', 
                          'Total # Spp.: ', og_tax_summary$total_number_species, '<br>',
                          'Total Copy #: ', og_tax_summary$total_copy_number, '<br>',
                          '# Telonemia: ', og_tax_summary$telonemia_number_species, '<br>',
                          '# Alveolates: ', og_tax_summary$alveolata_number_species, '<br>',
                          '# Stramenopiles: ', og_tax_summary$stramenopile_number_species, '<br>',
                          '# Rhizariates: ', og_tax_summary$rhizaria_number_species, '<br>'),
          mode = 'markers', type = 'scatter3d') %>%
  layout(scene = list(xaxis = list(title = "Duplication Rate"),
                      yaxis = list(title = "Loss Rate"),
                      zaxis = list(title = "Transfer Rate")))

event_rates_3d_plt
# And save this out to an HTML
htmlwidgets::saveWidget(as_widget(event_rates_3d_plt), "summarized-results/generax-per-species/generax_per_og_event_rates_3d.html")

1.5.3.2 D/T/L rates: Per-species

Using the results of GeneRax where rates of D/T/L are inferred for each species and branch in the phylogeny, let’s summarize across all gene families. Specifically, we’ll visualize mean rates of duplication, transfer, loss and the ratio of duplication/transfer along each branch.

rates_per_spp <- generax_res_per_species$spp_rates_per_og
rates_per_spp[,-c(1,2)] <- apply(rates_per_spp[,-c(1,2)], 2, as.numeric)
rates_per_spp$dt_ratio <- apply(rates_per_spp[,c(3,5)], 1, function(x) x[1] / x[2])

rates_per_spp <- rates_per_spp %>% 
  summarise(.by = species, duplication = mean(duplication), 
            loss = mean(loss), transfer = mean(transfer), 
            dt_ratio = median(dt_ratio))
rownames(rates_per_spp) <- rates_per_spp$species
colnames(rates_per_spp)[1] <- 'node'

rates_per_spp_melted <- 
  melt(rates_per_spp, id.vars = "node", 
       variable.name = "event", 
       value.name = "value")

# Use the SpeciesRax species tree to correspond these values to each branch 
spprax_tree <- read.tree(paste0(result_directory, 'speciesrax/species_trees/inferred_species_tree.newick'))
spprax_tree$node.label <- NULL

# Update the node names in the tree to correspond with their numeric ID
rates_per_spp <- update_node_names(rates_per_spp, spprax_tree)

rates_per_spp_plts <- 
  plot_rates_per_spp(tree = spprax_tree, species_rates = rates_per_spp, 
                     rate_cols = list(arcadia_dahlias, arcadia_pansies, arcadia_poppies))

# Combine the plots into a four-pane figure:
spp_dtl_rates_plt <- 
  plot_grid(rates_per_spp_plts$duplication, rates_per_spp_plts$transfer,
          rates_per_spp_plts$loss, rates_per_spp_plts$dt_ratio,
          ncol = 2)

# and one that has each along a single row
rates_per_spp_plts_dup <- rates_per_spp_plts
rates_per_spp_plts_dup$duplication$layers[[4]] <- NULL
rates_per_spp_plts_dup$transfer$layers[[4]] <- NULL
rates_per_spp_plts_dup$loss$layers[[4]] <- NULL

spp_dtl_rates_plt_horiz <- 
  plot_grid(rates_per_spp_plts_dup$duplication + theme(plot.margin = margin(5,10,10,5)), 
            rates_per_spp_plts_dup$transfer + theme(plot.margin = margin(5,10,10,5)),
            rates_per_spp_plts_dup$loss + theme(plot.margin = margin(5,10,10,5)), 
            rates_per_spp_plts_dup$dt_ratio, nrow = 1, rel_widths = c(0.65, 0.65, 0.65, 1))
spp_dtl_rates_plt_horiz

# Save this to file, along with the four individual panels
ggsave(spp_dtl_rates_plt, file = 'summarized-results/generax-per-species/dtl_rates_per_species.png', 
       dpi = 600, height = 12, width = 12)
ggsave(spp_dtl_rates_plt, file = 'summarized-results/generax-per-species/dtl_rates_per_species.pdf', 
       height = 12, width = 10)
ggsave(spp_dtl_rates_plt_horiz, file = 'summarized-results/generax-per-species/dtl_rates_per_species_landscape.png', 
       dpi = 600, height = 6, width = 14)
ggsave(spp_dtl_rates_plt_horiz, file = 'summarized-results/generax-per-species/dtl_rates_per_species_landscape.pdf', 
       height = 6, width = 14)
ggsave(rates_per_spp_plts$duplication, file = 'summarized-results/generax-per-species/duplication_rates_per_species.png', 
       dpi = 600, height = 6, width = 6)
ggsave(rates_per_spp_plts$duplication, file = 'summarized-results/generax-per-species/duplication_rates_per_species.pdf', 
       height = 6, width = 6)
ggsave(rates_per_spp_plts$transfer, file = 'summarized-results/generax-per-species/transfer_rates_per_species.png', 
       dpi = 600, height = 6, width = 6)
ggsave(rates_per_spp_plts$transfer, file = 'summarized-results/generax-per-species/transfer_rates_per_species.pdf', 
       height = 6, width = 6)
ggsave(rates_per_spp_plts$loss, file = 'summarized-results/generax-per-species/loss_rates_per_species.png', 
       dpi = 600, height = 6, width = 6)
ggsave(rates_per_spp_plts$loss, file = 'summarized-results/generax-per-species/loss_rates_per_species.pdf', 
       height = 6, width = 5)
ggsave(rates_per_spp_plts$dt_ratio, file = 'summarized-results/generax-per-species/duplication_transfer_ratio_per_species.png', 
       dpi = 600, height = 6, width = 6)
ggsave(rates_per_spp_plts$dt_ratio, file = 'summarized-results/generax-per-species/duplication_transfer_ratio_per_species.pdf', 
       height = 6, width = 6)

1.5.3.3 Tranfer counts among species

Now, lets summarize the rates/occurrence of pairwise horizontal transfer between species (terminal branches of the phylogeny). Eventually will build/include additional visualization of transfer rates between internal branches; that is a significantly more complicated problem to solve, however.

# Read in the species tree
spptree <- ladderize(read.tree(paste0(result_directory, 'speciesrax/species_trees/inferred_species_tree.newick')), right = F)

# Read in samplesheet to plot taxonomic group along with the species tree
samplesheet <- read.table(paste0(result_directory, 'pipeline_info/complete_samplesheet.valid.csv'),
                          sep = ",", header = T)
taxa_tibble <- tibble(
  tip = samplesheet$species,
  grp = samplesheet$taxonomy)
    
# Pull these data out and reformat for plotting
transf_count_mat <-
  melt(generax_res_per_species$lgt_count_mat, varnames = c('Donor', 'Recipient'))

# Below, we figure out if branches are terminal or not (i.e. correspond to
# extant species), and then pull transfer events corresponding to these
# branches out, converting into a pairwise matrix, with donors as columns,
# recipients as rows.
is_tip <- spptree$edge[,2] <= length(spptree$tip.label)
ordered_tips <- rev(spptree$edge[is_tip, 2])

transf_count_mat$Donor <-
  factor(transf_count_mat$Donor, levels = spptree$tip.label[ordered_tips])
transf_count_mat$Recipient <-
  factor(transf_count_mat$Recipient, levels = rev(spptree$tip.label[ordered_tips]))
transf_count_mat$value <- as.numeric(transf_count_mat$value)

# Now, plot the species tree alongside a heatmap of pairwise transfers.
tree_p <- 
  ggtree(spptree, ladderize = T, size = 1.25) %<+% taxa_tibble +
  new_scale_fill() +
  geom_tippoint(aes(fill = grp), pch = 22, 
                      size = 3, color = "black") + 
  scale_fill_manual(values = tsar_cols) + 
  guides(fill = guide_legend(title = 'Supergroup', nrow = 2)) + 
  theme(legend.position = 'top')

tax_leg <- get_legend(tree_p)

# Now plot with the taxonomic group tip labels:
transf_heatmap <-
  ggplot(data = transf_count_mat,
         aes(x = Donor, y = Recipient, fill = value)) +
  geom_tile() +
  theme_classic(base_size = 14) +
  scale_y_discrete(position = 'right') +
  guides(fill = guide_colorbar(title = 'Transfer Count',
                      title.position = 'top',
                      title.hjust = 0)) +
  theme(legend.position = 'top',
        axis.text.x = element_text(size = 10, angle = 45, hjust=1),
        axis.text.y = element_text(size = 10),
        legend.key.width = unit(2,'cm')) +
  scale_fill_gradientn(colors = arcadia_magma$color_dict,
                       values = arcadia_magma$values, 
                       trans = "log10", 
                       na.value = arcadia_magma$values[1])

transfer_leg <- get_legend(transf_heatmap)
transf_heatmap <- transf_heatmap + theme(legend.position = 'none')
transf_heatmap <- transf_heatmap %>% aplot::insert_left(tree_p + theme(legend.position = 'none'), width = 0.5)

legends <- plot_grid(tax_leg, transfer_leg, nrow = 1)
transf_heatmap_plt <-
  plot_grid(legends, ggplotify::as.grob(transf_heatmap),
            nrow = 2, rel_heights = c(0.1, 0.9))

# Save the figure to a pdf
ggsave(transf_heatmap_plt, filename = 'summarized-results/generax-per-species/species_pairwise_hgt_event_count.pdf',
       height = 9.5, width = 12)
ggsave(transf_heatmap_plt, filename = 'summarized-results/generax-per-species/species_pairwise_hgt_event_count.png',
       height = 9.5, width = 12, dpi = 600)

transf_heatmap_plt

# And let's visualize this heatmap interactively with plotly!
transf_count_mat$Donor <- gsub("_", " ", transf_count_mat$Donor)
transf_count_mat$Recipient <- gsub("_", " ", transf_count_mat$Recipient)
transf_count_mat$Donor <-
  factor(transf_count_mat$Donor, levels = gsub("_", " ", spptree$tip.label[ordered_tips]))
transf_count_mat$Recipient <-
  factor(transf_count_mat$Recipient, levels = gsub("_", " ", rev(spptree$tip.label[ordered_tips])))

gg <- 
  ggplot(data = transf_count_mat,
         aes(x = Donor, y = Recipient, fill = value+1)) +
  geom_tile(aes(text = value)) +
  theme_classic(base_size = 12) +
  scale_fill_gradientn(colors = arcadia_magma$color_dict,
                       values = arcadia_magma$values, 
                       na.value = arcadia_magma$values[1],
                       trans = 'log10') +
  scale_y_discrete(position = 'right') +
  theme(legend.position = 'top',
        axis.text.x = element_text(size = 6, angle = 45, hjust=1),
        axis.text.y = element_text(size = 6),
        legend.key.width = unit(2,'cm'))

transf_heatmap_plotly <- ggplotly(gg)
transf_heatmap_plotly$x$data[[2]]$marker$colorbar$title <- "Transfer\nCount"
transf_heatmap_plotly$x$data[[1]]$text <- sub("value \\+ 1: .*/>", "Number Transfers:", transf_heatmap_plotly$x$data[[1]]$text)
transf_heatmap_plotly$x$data[[2]]$marker$colorbar$thickness <- 35
transf_heatmap_plotly
# And save this out to an HTML
htmlwidgets::saveWidget(as_widget(transf_heatmap_plotly), "summarized-results/generax-per-species/species_pairwise_hgt_event_count.html")