NovelTree to TSAR Eukaryotes.Here, we summarize the multitude of results produced by the
NovelTree workflow. Because results come from numerous
distinct software, we have provided a complementary script containing
helper functions to summarize these outputs.
We will begin by creating a variable that specifies where all workflow outputs are stored. Additionally we will source the script containing helper functions, and setting to variable some colors that we’ll use throughout.
suppressPackageStartupMessages(
suppressWarnings(source('./noveltree-summary-functions.R')))
source('./arcadia-color-gradients.R')
# Specify where the workflow outputs are stored
result_directory <- 'results-tsar-eukaryotes-06062023/'
# And where summarized results should be saved.
dir.create('./summarized-results', showWarnings = F)
# Some colors used for plotting each of the four supergroups
# within TSAR Eukaryotes
tsar_cols <-
c(Telonemia = '#8A99AD', Rhizaria = '#FFB984',
Alveolata = '#C85152', Stramenopiles = '#97CD78')Let’s begin by plotting the results of our BUSCO analyses. These were conducted at a broad taxonomic scale using the eukaryota_odb10 database for each species, as well as at shallower taxonomic scales, using either the alveolate, or stramenopile odb10 busco database.
In the section of code below, we read in BUSCO results, summarize them into useful visualizations, and plot these alongside the manually rooted species tree inferred from Asteroid, with branch lengths optimized using SpeciesRax.
busco_summary_fpaths <- list.files(path = paste0(result_directory, 'busco'), pattern = 'batch', full.names = T)
busco_broad_fpaths <- busco_summary_fpaths[grepl("eukaryota", busco_summary_fpaths)]
busco_shallow_fpaths <- busco_summary_fpaths[!grepl("eukaryota", busco_summary_fpaths)]
busco_broad_res <- do.call(rbind, lapply(busco_broad_fpaths, read_busco_summary))
busco_shallow_res <- do.call(rbind, lapply(busco_shallow_fpaths, read_busco_summary))
# Read in the species tree to order plotting by taxonomy
# Since we haven't explicitly done so yet, read in the SpeciesRax tree to variable
spptree <- ladderize(read.tree(paste0(result_directory, 'speciesrax/species_trees/inferred_species_tree.newick')), right = F)
# Read in samplesheet to plot taxonomic group along with the species tree
samplesheet <- read.table(paste0(result_directory, 'pipeline_info/complete_samplesheet.valid.csv'),
sep = ",", header = T)
taxa_tibble <- tibble(
tip = samplesheet$species,
grp = samplesheet$taxonomy)
# Below, we figure out if branches are terminal or not (i.e. correspond to
# extant species), and then pull transfer events corresponding to these
# branches out, converting into a pairwise matrix, with donors as columns,
# recipients as rows.
is_tip <- spptree$edge[,2] <= length(spptree$tip.label)
ordered_tips <- rev(spptree$edge[is_tip, 2])
# Now, plot the species tree alongside a heatmap of pairwise transfers.
tree_p <-
ggtree(spptree, ladderize = T, size = 1.25) %<+% taxa_tibble +
new_scale_fill() +
geom_tippoint(aes(fill = grp), pch = 22,
size = 3, color = "black") +
scale_fill_manual(values = tsar_cols) +
guides(fill = guide_legend(title = 'Supergroup', nrow = 2)) +
theme(legend.position = 'top')
busco_broad_res$File <- factor(busco_broad_res$File, levels = rev(spptree$tip.label[ordered_tips]))
busco_shallow_res$File <- factor(busco_shallow_res$File, levels = rev(spptree$tip.label[ordered_tips]))
busco_broad_plt <- plot_busco(busco_broad_res) +
theme(legend.position = 'none',
axis.title.y = element_blank(),
title = element_blank())
# Use the dataset as the label, and shift to the right
busco_broad_plt$data$Label <- busco_broad_plt$data$Lineage
busco_broad_plt$layers[[2]]$mapping$x <- 75
busco_shallow_plt <- plot_busco(busco_shallow_res) +
theme(axis.text.y = element_blank(),
axis.title.y = element_blank(),
title = element_blank())
# Add the dataset to the labels
busco_shallow_plt$data$Label <- busco_shallow_plt$data$Lineage
busco_shallow_plt$layers[[2]]$mapping$x <- 75
# Plot together and combine with the tree
busco_plt <-
plot_grid(busco_broad_plt, busco_shallow_plt,
ncol = 2, rel_widths = c(1, 0.95))
busco_plt <- aplot::insert_right(tree_p + theme(legend.position = 'none'), busco_plt, width = 3.5)
# Plot it and save!
ggsave(busco_plt, filename = 'summarized-results/busco_results_w_spp_tree.png', height = 10, width = 20, dpi = 600)
ggsave(busco_plt, filename = 'summarized-results/busco_results_w_spp_tree.pdf', height = 10, width = 20)
# Modify font size of labels for plotting in R-markdown.
busco_broad_plt$layers[[2]]$aes_params$size <- 4
busco_shallow_plt$layers[[2]]$aes_params$size <- 4
busco_plt <-
plot_grid(busco_broad_plt, busco_shallow_plt,
ncol = 2, rel_widths = c(1, 0.95))
busco_plt <- aplot::insert_right(tree_p + theme(legend.position = 'none'), busco_plt, width = 3.5)
busco_pltNow, let’s go ahead and summarize the results from OrthoFinder. To
start, this will just include some high-level summaries - number of
species per gene family, gene copy number per gene family, etc. This
will create and populate a new directory within the
NovelTree directory, summarized-results.
Within this directory, we will store OrthoFinder-specific results within
a subdirectory, intuitively named orthofinder.
# Summarize gene family content, both with respect to number of species
# included in each gene family, and the per-species mean copy number
summ_genefam_composition(results_dir = result_directory, show_plots = T)# And get orthogroup summary statistics per species
per_spp_of_stats <-
get_per_spp_ofinder_stats(tree_fpath = paste0(result_directory, 'speciesrax/species_trees/inferred_species_tree.newick'),
samplesheet_fpath = paste0(result_directory, 'pipeline_info/complete_samplesheet.valid.csv'),
results_dir = result_directory, grp_name = 'Supergroup',
tip_grp_cols = tsar_cols,
count_cols = arcadia_cividis,
prop_cols = arcadia_viridis)# Lastly, get the per-species orthogroup counts/copy numbers
og_counts <- get_per_spp_og_counts(results_dir = result_directory)Let’s go ahead and visualize the species trees inferred from Asteroid and SpeciesRax:
asteroid_plt <-
plot_spp_tree(tree_fpath = paste0(result_directory, 'asteroid/asteroid.bestTree.newick'),
samplesheet_fpath = paste0(result_directory, 'pipeline_info/complete_samplesheet.valid.csv'),
grp_name = 'Supergroup', outgroup = c("Telonema_sp_P-2", "Telonema_subtile"),
tip_grp_cols = tsar_cols, cladogram = T)
spprax_plt <-
plot_spp_tree(tree_fpath = paste0(result_directory, 'speciesrax/species_trees/inferred_species_tree.newick'),
samplesheet_fpath = paste0(result_directory, 'pipeline_info/complete_samplesheet.valid.csv'),
grp_name = 'Supergroup',
nodelab_name = 'EPQIC',
tip_grp_cols = tsar_cols)
ggsave(asteroid_plt, height = 7, width = 6, filename = './summarized-results/asteroid_spptree_w_boots.pdf')
ggsave(spprax_plt, height = 7, width = 6, filename = './summarized-results/speciesrax_spptree_w_epqic.pdf')
ggsave(asteroid_plt, height = 7, width = 6, filename = './summarized-results/asteroid_spptree_w_boots.png', dpi = 600)
ggsave(spprax_plt, height = 7, width = 6, filename = './summarized-results/speciesrax_spptree_w_epqic.png', dpi = 600)
# And go ahead and view these:
asteroid_pltNow lets go ahead and summarize, plot, and reformat in more user-friendly ways the large amount of results that are generated from GeneRax.
We will begin by first reading in all outputs and reformatting, saving the resultant tables for any additional downstream analyses. The functions used to reformat GeneRax outputs are fairly involved, producing numerous summarized outputs, facilitating downstream analysis and visualization. Briefly, these functions are:
summarize_generax_per_family, which summarizes the
results of the per-family model of GeneRax. The object created from this
function is a named list that contains:
rates_per_og: For each gene family, inferred rates of
duplication, transfer, and loss, assuming constant rates across the gene
family.events_per_og: Counts of duplications, transfers, and
losses per gene family, summed across all species.lgt_count_mat: A matrix of the sum of transfer events
between pairs of species, across all gene families.speciations_per_spp: For each gene family, the counts
of gene speciation events per species.duplications_per_spp: For each gene family, the counts
of duplication events per species.losses_per_spp: For each gene family, the counts of
gene loss events per species.transfer_donor_counts: For each gene family, the counts
of transfer donor events per species.transfer_recip_counts: For each gene family, the counts
of transfer recipient events per species.generax_res_per_species, which summarizes the results
of the per-species model of GeneRax. The object created from this
function is a named list that contains the same elements as in the
previous results, but additionally includes:
spp_rates_per_og: The rates of each event type for each
branch in the species tree.Because there are typically thousands to tends of thousands of gene families, there are a large number of calculations that need to be conducted. Consequently, to expedite this process, these functions are parallelized - I suggest taking advantage of this!
generax_res_per_fam <-
summarize_generax_per_family(results_dir = result_directory,
per_spp_og_counts = og_counts,
nparallel = detectCores(),
out_dir = 'summarized-results/generax-per-family/')
## Extracting event counts per-orthogroup, across all species.
## Extracting event rates per-orthogroup, across all species.
## Extracting event counts per-species, per-orthogroup.
## Now, pulling out each event type individually.
## Summarizing gene transfer recipient events.
## Summarizing gene transfer events into a matrix of donor-recipient species pairs.
## Pulling out the count of transfer-donor events for each species per gene family
## Pulling out the count of transfer-recipient events for each species per gene familygenerax_res_per_species <-
summarize_generax_per_species(results_dir = result_directory,
per_spp_og_counts = og_counts,
nparallel = detectCores(),
out_dir = 'summarized-results/generax-per-species/')
## Extracting event counts for each species per gene family.
## Extracting event rates per-species, for each gene family.
## Extracting event counts per-species, per-orthogroup.
## Now, pulling out each event type individually.
## Summarizing gene transfer recipient events.
## Summarizing gene transfer events into a matrix of donor-recipient species pairs.
## Pulling out the count of transfer-donor events for each species per gene family
## Pulling out the count of transfer-recipient events for each species per gene familyIn the following section, we summarize results obtained from the per-family model of gene family evolution implemented in GeneRax. Here, it is assumed that rates of duplication/transfer/loss are constant across all species/branches in the species tree.
Now let’s visualize rates of duplication, transfer, and loss per gene family.
# reformat the per-og event rates to a melted data frame for plotting
rates_per_og <-
melt(generax_res_per_fam$rates_per_og, id.vars = "gene_family",
variable.name = "Event", value.name = "Rate")
rates_per_og$Rate <-
as.numeric(rates_per_og$Rate)
# Plot the rates per orthogroup
rates_per_og_plt <-
ggplot(data = rates_per_og,
aes(fill = Event, x = Event, y = Rate)) +
geom_sina(aes(color = Event), alpha = 0.5, pch = 20, size = 1) +
geom_violin(alpha = 0.25) +
stat_boxplot(geom = 'errorbar', width = 0.05, linewidth = 1) +
geom_boxplot(width = 0.05, linewidth = 0.5, outlier.color = NA, notch = T) +
scale_fill_brewer(type = 'qual', palette = 2) +
scale_color_brewer(type = 'qual', palette = 2) +
theme_classic(base_size = 14) +
theme(legend.position = 'none') +
xlab('Event Rate')
# Save out to a pdf
ggsave(rates_per_og_plt, filename = "summarized-results/generax-per-family/per_og_generax_event_rates.pdf",
height = 8, width = 8)
# And let's take a look:
rates_per_og_pltLet’s also visualize in three separate panes showing how the three rates co-vary.
rates_per_og <- generax_res_per_fam$rates_per_og
rates_per_og[,-1] <- apply(rates_per_og[,-1], 2, as.numeric)
ylimit <- c(0, round(max(rates_per_og[,-1]) + 0.005, 2))
dup_loss_rate_plt <-
ggplot(data = rates_per_og,
aes(fill = transfer, x = duplication, y = loss)) +
geom_point(pch = 21, size = 3, alpha = 0.5) +
scale_fill_distiller(type = 'seq', palette = 3, name = 'Transfer Rate') +
theme_classic(base_size = 14) +
theme(legend.position = 'top', legend.justification = 'left',
legend.key.width = unit(1,"cm")) +
guides(fill = guide_colorbar(title.position="top", title.hjust = 0)) +
xlab('Duplication Rate') +
ylab('Loss Rate') +
ylim(ylimit)
dup_transf_rate_plt <-
ggplot(data = rates_per_og,
aes(fill = loss, x = duplication, y = transfer)) +
geom_point(pch = 21, size = 3, alpha = 0.5) +
scale_fill_distiller(type = 'seq', palette = 7, name = 'Loss Rate') +
theme_classic(base_size = 14) +
theme(legend.position = 'top', legend.justification = 'left',
legend.key.width = unit(1,"cm")) +
guides(fill = guide_colorbar(title.position="top", title.hjust = 0)) +
xlab('Duplication Rate') +
ylab('Transfer Rate') +
ylim(ylimit)
transf_loss_rate_plt <-
ggplot(data = rates_per_og,
aes(fill = duplication, x = transfer, y = loss)) +
geom_point(pch = 21, size = 3, alpha = 0.5) +
scale_fill_distiller(type = 'seq', palette = 2, name = 'Duplication Rate') +
theme_classic(base_size = 14) +
theme(legend.position = 'top', legend.justification = 'left',
legend.key.width = unit(1,"cm")) +
guides(fill = guide_colorbar(title.position="top", title.hjust = 0)) +
xlab('Transfer Rate') +
ylab('Loss Rate') +
ylim(ylimit)
# Combine them, visualize, and save
og_event_rate_2d_plts <-
plot_grid(dup_loss_rate_plt, dup_transf_rate_plt,
transf_loss_rate_plt, ncol = 3)
ggsave(og_event_rate_2d_plts, filename = 'summarized-results/generax-per-family/dup_transf_loss_2d_plots.pdf',
height = 6, width = 14)
og_event_rate_2d_pltsAnd finally, let’s plot these rates interactively in three dimensions!
rates_per_og <- generax_res_per_fam$rates_per_og
rates_per_og[,-1] <- apply(rates_per_og[,-1], 2, as.numeric)
rates_per_og$verticality <-
rowSums(apply(generax_res_per_fam$events_per_og[,c(2,3,4,7)], 2, as.numeric)) /
rowSums(apply(generax_res_per_fam$events_per_og[,c(2:5,7)], 2, as.numeric))
# Get the total copy number and count of species for each gene family, either across all
# species, or per taxonomic supergroup:
og_tax_count <-
data.frame("gene_family" = generax_res_per_fam$events_per_og$gene_family,
"total_number_species" = generax_res_per_fam$events_per_og$number_species,
"alveolata_number_species" = NA, "rhizaria_number_species" = NA,
"stramenopile_number_species" = NA, "telonemia_number_species" = NA)
og_tax_copynum <-
data.frame("gene_family" = generax_res_per_fam$events_per_og$gene_family,
"total_copy_number" = generax_res_per_fam$events_per_og$number_gene_copies,
"alveolata_copy_number" = NA, "rhizaria_copy_number" = NA,
"stramenopile_copy_number" = NA, "telonemia_copy_number" = NA)
grps <- unique(samplesheet$taxonomy)
for(i in 1:length(grps[order(grps)])){
# Identify the group we're dealing with
grp <- grps[i]
# Pull out count data for the corresponding species
spps <- samplesheet$species[which(samplesheet$taxonomy == grp)]
count_dat <- og_counts[which(og_counts$Orthogroup %in% rates_per_og$gene_family),which(colnames(og_counts) %in% spps)]
# Identify the count of species within with taxonomic group that occur in each gene family
num_spp <- apply(count_dat, 1, function(x) length(which(x > 0)))
copynum <- rowSums(count_dat)
og_tax_count[,2+i] <- num_spp
og_tax_copynum[,2+i] <- copynum
}
og_tax_summary <- merge(og_tax_count, og_tax_copynum)
event_rates_3d_plt <-
plot_ly(x = rates_per_og$duplication, y = rates_per_og$loss,
z = rates_per_og$transfer, hoverinfo = 'text',
marker = list(size = 4, opacity = 0.95, showscale = T, color = rates_per_og$transfer,
colorscale = list(c(0, 0.33, 0.66, 1.0), unlist(arcadia_viridis$color_dict)),
colorbar = list(title = "Verticality")),
text = ~paste0('Family: ', rates_per_og$gene_family, '<br>',
'Total # Spp.: ', og_tax_summary$total_number_species, '<br>',
'Total Copy #: ', og_tax_summary$total_copy_number, '<br>',
'# Telonemia: ', og_tax_summary$telonemia_number_species, '<br>',
'# Alveolates: ', og_tax_summary$alveolata_number_species, '<br>',
'# Stramenopiles: ', og_tax_summary$stramenopile_number_species, '<br>',
'# Rhizariates: ', og_tax_summary$rhizaria_number_species, '<br>'),
mode = 'markers', type = 'scatter3d') %>%
layout(scene = list(xaxis = list(title = "Duplication Rate"),
yaxis = list(title = "Loss Rate"),
zaxis = list(title = "Transfer Rate")))
event_rates_3d_pltNow, lets summarize the rates/occurrence of pairwise horizontal transfer between species (terminal branches of the phylogeny). Eventually will build/include additional visualization of transfer rates between internal branches; that is a significantly more complicated problem to solve, however.
# Since we haven't explicitly done so yet, read in the SpeciesRax tree to variable
spptree <- ladderize(read.tree(paste0(result_directory, 'speciesrax/species_trees/inferred_species_tree.newick')), right = F)
# Read in samplesheet to plot taxonomic group along with the species tree
samplesheet <- read.table(paste0(result_directory, 'pipeline_info/complete_samplesheet.valid.csv'),
sep = ",", header = T)
taxa_tibble <- tibble(
tip = samplesheet$species,
grp = samplesheet$taxonomy)
# Pull these data out and reformat for plotting
transf_count_mat <-
melt(generax_res_per_fam$lgt_count_mat, varnames = c('Donor', 'Recipient'))
# Below, we figure out if branches are terminal or not (i.e. correspond to
# extant species), and then pull transfer events corresponding to these
# branches out, converting into a pairwise matrix, with donors as columns,
# recipients as rows.
is_tip <- spptree$edge[,2] <= length(spptree$tip.label)
ordered_tips <- rev(spptree$edge[is_tip, 2])
transf_count_mat$Donor <-
factor(transf_count_mat$Donor, levels = spptree$tip.label[ordered_tips])
transf_count_mat$Recipient <-
factor(transf_count_mat$Recipient, levels = rev(spptree$tip.label[ordered_tips]))
transf_count_mat$value <- as.numeric(transf_count_mat$value)
# Now, plot the species tree alongside a heatmap of pairwise transfers.
tree_p <-
ggtree(spptree, ladderize = T, size = 1.25) %<+% taxa_tibble +
new_scale_fill() +
geom_tippoint(aes(fill = grp), pch = 22,
size = 3, color = "black") +
scale_fill_manual(values = tsar_cols) +
guides(fill = guide_legend(title = 'Supergroup', nrow = 2)) +
theme(legend.position = 'top')
tax_leg <- get_legend(tree_p)
# Now plot with the taxonomic group tip labels:
transf_heatmap <-
ggplot(data = transf_count_mat,
aes(x = Donor, y = Recipient, fill = value)) +
geom_tile() +
theme_classic(base_size = 14) +
scale_y_discrete(position = 'right') +
guides(fill = guide_colorbar(title = 'Transfer Count',
title.position = 'top',
title.hjust = 0)) +
theme(legend.position = 'top',
axis.text.x = element_text(size = 10, angle = 45, hjust=1),
axis.text.y = element_text(size = 10),
legend.key.width = unit(2,'cm')) +
scale_fill_gradientn(colors = arcadia_magma$color_dict,
values = arcadia_magma$values,
trans = "log10",
na.value = arcadia_magma$values[1])
transfer_leg <- get_legend(transf_heatmap)
transf_heatmap <- transf_heatmap + theme(legend.position = 'none')
transf_heatmap <- transf_heatmap %>% aplot::insert_left(tree_p + theme(legend.position = 'none'), width = 0.5)
legends <- plot_grid(tax_leg, transfer_leg, nrow = 1)
transf_heatmap_plt <-
plot_grid(legends, ggplotify::as.grob(transf_heatmap),
nrow = 2, rel_heights = c(0.1, 0.9))
# Save the figure to a pdf
ggsave(transf_heatmap_plt, filename = 'summarized-results/generax-per-family/species_pairwise_hgt_event_count.pdf',
height = 9.5, width = 12)
ggsave(transf_heatmap_plt, filename = 'summarized-results/generax-per-family/species_pairwise_hgt_event_count.png',
height = 9.5, width = 12, dpi = 600)
transf_heatmap_plt# And let's visualize this heatmap interactively with plotly!
gg <-
ggplot(data = transf_count_mat,
aes(x = Donor, y = Recipient, fill = value+1)) +
geom_tile(aes(text = value)) +
theme_classic(base_size = 14) +
scale_fill_gradientn(colors = arcadia_magma$color_dict,
values = arcadia_magma$values,
na.value = arcadia_magma$values[1],
trans = 'log10') +
scale_y_discrete(position = 'right') +
theme(legend.position = 'top',
axis.text.x = element_text(size = 6, angle = 45, hjust=1),
axis.text.y = element_text(size = 6),
legend.key.width = unit(2,'cm'))
transf_heatmap_plotly <- ggplotly(gg)
transf_heatmap_plotly$x$data[[2]]$marker$colorbar$title <- "Transfer\nCount"
transf_heatmap_plotly$x$data[[1]]$text <- sub("value \\+ 1: .*/>", "Number Transfers:", transf_heatmap_plotly$x$data[[1]]$text)
transf_heatmap_plotly$x$data[[2]]$marker$colorbar$thickness <- 35
transf_heatmap_plotlyIn the following section, we summarize results obtained from the per-species model of gene family evolution implemented in GeneRax. Here, each species/branch in the species tree may have its own rate of duplication/transfer/loss.
Now let’s visualize rates of duplication, transfer, and loss per species, measured for each gene family.
# reformat the per-og event rates to a melted data frame for plotting
rates_per_spp <- generax_res_per_species$spp_rates_per_og
rates_per_spp[,-c(1,2)] <- apply(rates_per_spp[,-c(1,2)], 2, as.numeric)
rates_per_og <- rates_per_spp %>%
summarise(.by = gene_family, duplication = mean(duplication),
loss = mean(loss), transfer = mean(transfer))
rates_per_og <-
melt(rates_per_og, id.vars = "gene_family",
variable.name = "Event", value.name = "Rate")
# Plot the rates per orthogroup
rates_per_og_plt <-
ggplot(data = rates_per_og,
aes(fill = Event, x = Event, y = Rate)) +
geom_sina(aes(color = Event), alpha = 0.5, pch = 20, size = 1) +
geom_violin(alpha = 0.25) +
stat_boxplot(geom = 'errorbar', width = 0.05, linewidth = 1) +
geom_boxplot(width = 0.05, linewidth = 0.5, outlier.color = NA, notch = T) +
scale_fill_brewer(type = 'qual', palette = 2) +
scale_color_brewer(type = 'qual', palette = 2) +
theme_classic(base_size = 14) +
theme(legend.position = 'none') +
xlab('Event Rate')
# Save out to a pdf
ggsave(rates_per_og_plt, filename = "summarized-results/generax-per-species/per_og_generax_event_rates.pdf",
height = 8, width = 8)
# And let's take a look:
rates_per_og_pltLet’s also visualize in three separate panes showing how the three rates co-vary.
rates_per_spp <- generax_res_per_species$spp_rates_per_og
rates_per_spp[,-c(1,2)] <- apply(rates_per_spp[,-c(1,2)], 2, as.numeric)
rates_per_og <- rates_per_spp %>%
summarise(.by = gene_family, duplication = mean(duplication),
loss = mean(loss), transfer = mean(transfer))
ylimit <- c(0, round(max(rates_per_og[,-1]) + 0.005, 2))
dup_loss_rate_plt <-
ggplot(data = rates_per_og,
aes(fill = transfer, x = duplication, y = loss)) +
geom_point(pch = 21, size = 3, alpha = 0.5) +
scale_fill_distiller(type = 'seq', palette = 3, name = 'Transfer Rate') +
theme_classic(base_size = 14) +
theme(legend.position = 'top', legend.justification = 'left',
legend.key.width = unit(1,"cm")) +
guides(fill = guide_colorbar(title.position="top", title.hjust = 0)) +
xlab('Duplication Rate') +
ylab('Loss Rate') +
ylim(ylimit)
dup_transf_rate_plt <-
ggplot(data = rates_per_og,
aes(fill = loss, x = duplication, y = transfer)) +
geom_point(pch = 21, size = 3, alpha = 0.5) +
scale_fill_distiller(type = 'seq', palette = 7, name = 'Loss Rate') +
theme_classic(base_size = 14) +
theme(legend.position = 'top', legend.justification = 'left',
legend.key.width = unit(1,"cm")) +
guides(fill = guide_colorbar(title.position="top", title.hjust = 0)) +
xlab('Duplication Rate') +
ylab('Transfer Rate') +
ylim(ylimit)
transf_loss_rate_plt <-
ggplot(data = rates_per_og,
aes(fill = duplication, x = transfer, y = loss)) +
geom_point(pch = 21, size = 3, alpha = 0.5) +
scale_fill_distiller(type = 'seq', palette = 2, name = 'Duplication Rate') +
theme_classic(base_size = 14) +
theme(legend.position = 'top', legend.justification = 'left',
legend.key.width = unit(1,"cm")) +
guides(fill = guide_colorbar(title.position="top", title.hjust = 0)) +
xlab('Transfer Rate') +
ylab('Loss Rate') +
ylim(ylimit)
# Combine them, visualize, and save
og_event_rate_2d_plts <-
plot_grid(dup_loss_rate_plt, dup_transf_rate_plt,
transf_loss_rate_plt, ncol = 3)
ggsave(og_event_rate_2d_plts, filename = 'summarized-results/generax-per-species/dup_transf_loss_2d_plots.pdf',
height = 6, width = 14)
counts <- generax_res_per_fam$events_per_og
rates_per_og$verticality <- rowSums(apply(counts[,c(2,3,4,7)], 2, as.numeric)) / rowSums(apply(counts[,c(2:5,7)], 2, as.numeric))
rates_per_og$td_ratio <- rates_per_og$transfer / rates_per_og$duplication
rates_per_og$dl_ratio <- rates_per_og$duplication / rates_per_og$loss
rates_per_og$tl_ratio <- rates_per_og$transfer / rates_per_og$loss
td_d_plt <-
ggplot(data = rates_per_og,
aes(fill = verticality, x = duplication, y = td_ratio)) +
geom_point(pch = 21, size = 3) +
scale_fill_distiller(type = 'seq', palette = 2, name = 'Verticality') +
theme_classic(base_size = 14) +
theme(legend.position = 'top', legend.justification = 'left',
legend.key.width = unit(1,"cm")) +
guides(fill = guide_colorbar(title.position="top", title.hjust = 0)) +
xlab('Duplication Rate') +scale_y_log10() + scale_x_log10() +
ylab('Transfer Rate / Duplication Rate')
dl_l_plt <-
ggplot(data = rates_per_og,
aes(fill = verticality, x = loss, y = dl_ratio)) +
geom_point(pch = 21, size = 3) +
scale_fill_distiller(type = 'seq', palette = 2, name = 'Verticality') +
theme_classic(base_size = 14) +
theme(legend.position = 'top', legend.justification = 'left',
legend.key.width = unit(1,"cm")) +
guides(fill = guide_colorbar(title.position="top", title.hjust = 0)) +
xlab('Loss Rate') + scale_y_log10() + scale_x_log10() +
ylab('Duplication Rate / Loss Rate')
tl_l_plt <-
ggplot(data = rates_per_og,
aes(fill = verticality, x = loss, y = tl_ratio)) +
geom_point(pch = 21, size = 3) +
scale_fill_distiller(type = 'seq', palette = 2, name = 'Verticality') +
theme_classic(base_size = 14) +
theme(legend.position = 'top', legend.justification = 'left',
legend.key.width = unit(1,"cm")) +
guides(fill = guide_colorbar(title.position="top", title.hjust = 0)) +
xlab('Loss Rate') + scale_y_log10() + scale_x_log10() +
ylab('Transfer Rate / Loss Rate')
og_event_rate_2d_pltsAnd let’s plot these rates interactively in three dimensions!
rates_per_spp <- generax_res_per_species$spp_rates_per_og
rates_per_spp[,-c(1,2)] <- apply(rates_per_spp[,-c(1,2)], 2, as.numeric)
rates_per_og <- rates_per_spp %>%
summarise(.by = gene_family, duplication = mean(duplication),
loss = mean(loss), transfer = mean(transfer))
counts <- generax_res_per_fam$events_per_og
rates_per_og$verticality <-
rowSums(apply(generax_res_per_species$events_per_og[,c(2,3,4,7)], 2, as.numeric)) /
rowSums(apply(generax_res_per_species$events_per_og[,c(2:5,7)], 2, as.numeric))
event_rates_3d_plt <-
plot_ly(x=rates_per_og$duplication, y= rates_per_og$loss,
z=rates_per_og$transfer, hoverinfo = 'text',
marker = list(size = 4, opacity = 0.95, showscale = T, color = rates_per_og$transfer,
colorscale = list(c(0, 0.33, 0.66, 1.0), unlist(arcadia_viridis$color_dict)),
colorbar = list(title = "Verticality")),
text = ~paste0('Family: ', rates_per_og$gene_family, '<br>',
'Total # Spp.: ', og_tax_summary$total_number_species, '<br>',
'Total Copy #: ', og_tax_summary$total_copy_number, '<br>',
'# Telonemia: ', og_tax_summary$telonemia_number_species, '<br>',
'# Alveolates: ', og_tax_summary$alveolata_number_species, '<br>',
'# Stramenopiles: ', og_tax_summary$stramenopile_number_species, '<br>',
'# Rhizariates: ', og_tax_summary$rhizaria_number_species, '<br>'),
mode = 'markers', type = 'scatter3d') %>%
layout(scene = list(xaxis = list(title = "Duplication Rate"),
yaxis = list(title = "Loss Rate"),
zaxis = list(title = "Transfer Rate")))
event_rates_3d_pltUsing the results of GeneRax where rates of D/T/L are inferred for each species and branch in the phylogeny, let’s summarize across all gene families. Specifically, we’ll visualize mean rates of duplication, transfer, loss and the ratio of duplication/transfer along each branch.
rates_per_spp <- generax_res_per_species$spp_rates_per_og
rates_per_spp[,-c(1,2)] <- apply(rates_per_spp[,-c(1,2)], 2, as.numeric)
rates_per_spp$dt_ratio <- apply(rates_per_spp[,c(3,5)], 1, function(x) x[1] / x[2])
rates_per_spp <- rates_per_spp %>%
summarise(.by = species, duplication = mean(duplication),
loss = mean(loss), transfer = mean(transfer),
dt_ratio = median(dt_ratio))
rownames(rates_per_spp) <- rates_per_spp$species
colnames(rates_per_spp)[1] <- 'node'
rates_per_spp_melted <-
melt(rates_per_spp, id.vars = "node",
variable.name = "event",
value.name = "value")
# Use the SpeciesRax species tree to correspond these values to each branch
spprax_tree <- read.tree(paste0(result_directory, 'speciesrax/species_trees/inferred_species_tree.newick'))
spprax_tree$node.label <- NULL
# Update the node names in the tree to correspond with their numeric ID
rates_per_spp <- update_node_names(rates_per_spp, spprax_tree)
rates_per_spp_plts <-
plot_rates_per_spp(tree = spprax_tree, species_rates = rates_per_spp,
rate_cols = list(arcadia_dahlias, arcadia_pansies, arcadia_poppies))
# Combine the plots into a four-pane figure:
spp_dtl_rates_plt <-
plot_grid(rates_per_spp_plts$duplication, rates_per_spp_plts$transfer,
rates_per_spp_plts$loss, rates_per_spp_plts$dt_ratio,
ncol = 2)
# and one that has each along a single row
rates_per_spp_plts_dup <- rates_per_spp_plts
rates_per_spp_plts_dup$duplication$layers[[4]] <- NULL
rates_per_spp_plts_dup$transfer$layers[[4]] <- NULL
rates_per_spp_plts_dup$loss$layers[[4]] <- NULL
spp_dtl_rates_plt_horiz <-
plot_grid(rates_per_spp_plts_dup$duplication + theme(plot.margin = margin(5,10,10,5)),
rates_per_spp_plts_dup$transfer + theme(plot.margin = margin(5,10,10,5)),
rates_per_spp_plts_dup$loss + theme(plot.margin = margin(5,10,10,5)),
rates_per_spp_plts_dup$dt_ratio, nrow = 1, rel_widths = c(0.65, 0.65, 0.65, 1))
spp_dtl_rates_plt_horiz# Save this to file, along with the four individual panels
ggsave(spp_dtl_rates_plt, file = 'summarized-results/generax-per-species/dtl_rates_per_species.png',
dpi = 600, height = 12, width = 12)
ggsave(spp_dtl_rates_plt, file = 'summarized-results/generax-per-species/dtl_rates_per_species.pdf',
height = 12, width = 10)
ggsave(spp_dtl_rates_plt_horiz, file = 'summarized-results/generax-per-species/dtl_rates_per_species_landscape.png',
dpi = 600, height = 6, width = 14)
ggsave(spp_dtl_rates_plt_horiz, file = 'summarized-results/generax-per-species/dtl_rates_per_species_landscape.pdf',
height = 6, width = 14)
ggsave(rates_per_spp_plts$duplication, file = 'summarized-results/generax-per-species/duplication_rates_per_species.png',
dpi = 600, height = 6, width = 6)
ggsave(rates_per_spp_plts$duplication, file = 'summarized-results/generax-per-species/duplication_rates_per_species.pdf',
height = 6, width = 6)
ggsave(rates_per_spp_plts$transfer, file = 'summarized-results/generax-per-species/transfer_rates_per_species.png',
dpi = 600, height = 6, width = 6)
ggsave(rates_per_spp_plts$transfer, file = 'summarized-results/generax-per-species/transfer_rates_per_species.pdf',
height = 6, width = 6)
ggsave(rates_per_spp_plts$loss, file = 'summarized-results/generax-per-species/loss_rates_per_species.png',
dpi = 600, height = 6, width = 6)
ggsave(rates_per_spp_plts$loss, file = 'summarized-results/generax-per-species/loss_rates_per_species.pdf',
height = 6, width = 5)
ggsave(rates_per_spp_plts$dt_ratio, file = 'summarized-results/generax-per-species/duplication_transfer_ratio_per_species.png',
dpi = 600, height = 6, width = 6)
ggsave(rates_per_spp_plts$dt_ratio, file = 'summarized-results/generax-per-species/duplication_transfer_ratio_per_species.pdf',
height = 6, width = 6)Now, lets summarize the rates/occurrence of pairwise horizontal transfer between species (terminal branches of the phylogeny). Eventually will build/include additional visualization of transfer rates between internal branches; that is a significantly more complicated problem to solve, however.
# Read in the species tree
spptree <- ladderize(read.tree(paste0(result_directory, 'speciesrax/species_trees/inferred_species_tree.newick')), right = F)
# Read in samplesheet to plot taxonomic group along with the species tree
samplesheet <- read.table(paste0(result_directory, 'pipeline_info/complete_samplesheet.valid.csv'),
sep = ",", header = T)
taxa_tibble <- tibble(
tip = samplesheet$species,
grp = samplesheet$taxonomy)
# Pull these data out and reformat for plotting
transf_count_mat <-
melt(generax_res_per_species$lgt_count_mat, varnames = c('Donor', 'Recipient'))
# Below, we figure out if branches are terminal or not (i.e. correspond to
# extant species), and then pull transfer events corresponding to these
# branches out, converting into a pairwise matrix, with donors as columns,
# recipients as rows.
is_tip <- spptree$edge[,2] <= length(spptree$tip.label)
ordered_tips <- rev(spptree$edge[is_tip, 2])
transf_count_mat$Donor <-
factor(transf_count_mat$Donor, levels = spptree$tip.label[ordered_tips])
transf_count_mat$Recipient <-
factor(transf_count_mat$Recipient, levels = rev(spptree$tip.label[ordered_tips]))
transf_count_mat$value <- as.numeric(transf_count_mat$value)
# Now, plot the species tree alongside a heatmap of pairwise transfers.
tree_p <-
ggtree(spptree, ladderize = T, size = 1.25) %<+% taxa_tibble +
new_scale_fill() +
geom_tippoint(aes(fill = grp), pch = 22,
size = 3, color = "black") +
scale_fill_manual(values = tsar_cols) +
guides(fill = guide_legend(title = 'Supergroup', nrow = 2)) +
theme(legend.position = 'top')
tax_leg <- get_legend(tree_p)
# Now plot with the taxonomic group tip labels:
transf_heatmap <-
ggplot(data = transf_count_mat,
aes(x = Donor, y = Recipient, fill = value)) +
geom_tile() +
theme_classic(base_size = 14) +
scale_y_discrete(position = 'right') +
guides(fill = guide_colorbar(title = 'Transfer Count',
title.position = 'top',
title.hjust = 0)) +
theme(legend.position = 'top',
axis.text.x = element_text(size = 10, angle = 45, hjust=1),
axis.text.y = element_text(size = 10),
legend.key.width = unit(2,'cm')) +
scale_fill_gradientn(colors = arcadia_magma$color_dict,
values = arcadia_magma$values,
trans = "log10",
na.value = arcadia_magma$values[1])
transfer_leg <- get_legend(transf_heatmap)
transf_heatmap <- transf_heatmap + theme(legend.position = 'none')
transf_heatmap <- transf_heatmap %>% aplot::insert_left(tree_p + theme(legend.position = 'none'), width = 0.5)
legends <- plot_grid(tax_leg, transfer_leg, nrow = 1)
transf_heatmap_plt <-
plot_grid(legends, ggplotify::as.grob(transf_heatmap),
nrow = 2, rel_heights = c(0.1, 0.9))
# Save the figure to a pdf
ggsave(transf_heatmap_plt, filename = 'summarized-results/generax-per-species/species_pairwise_hgt_event_count.pdf',
height = 9.5, width = 12)
ggsave(transf_heatmap_plt, filename = 'summarized-results/generax-per-species/species_pairwise_hgt_event_count.png',
height = 9.5, width = 12, dpi = 600)
transf_heatmap_plt# And let's visualize this heatmap interactively with plotly!
transf_count_mat$Donor <- gsub("_", " ", transf_count_mat$Donor)
transf_count_mat$Recipient <- gsub("_", " ", transf_count_mat$Recipient)
transf_count_mat$Donor <-
factor(transf_count_mat$Donor, levels = gsub("_", " ", spptree$tip.label[ordered_tips]))
transf_count_mat$Recipient <-
factor(transf_count_mat$Recipient, levels = gsub("_", " ", rev(spptree$tip.label[ordered_tips])))
gg <-
ggplot(data = transf_count_mat,
aes(x = Donor, y = Recipient, fill = value+1)) +
geom_tile(aes(text = value)) +
theme_classic(base_size = 12) +
scale_fill_gradientn(colors = arcadia_magma$color_dict,
values = arcadia_magma$values,
na.value = arcadia_magma$values[1],
trans = 'log10') +
scale_y_discrete(position = 'right') +
theme(legend.position = 'top',
axis.text.x = element_text(size = 6, angle = 45, hjust=1),
axis.text.y = element_text(size = 6),
legend.key.width = unit(2,'cm'))
transf_heatmap_plotly <- ggplotly(gg)
transf_heatmap_plotly$x$data[[2]]$marker$colorbar$title <- "Transfer\nCount"
transf_heatmap_plotly$x$data[[1]]$text <- sub("value \\+ 1: .*/>", "Number Transfers:", transf_heatmap_plotly$x$data[[1]]$text)
transf_heatmap_plotly$x$data[[2]]$marker$colorbar$thickness <- 35
transf_heatmap_plotly