13 Evaluating the integration results
Having investigated the results of an integration method in Chapter 12, we will show in this chapter how to evaluate the integration results against the results of single-omics analyses or against prior biological knowledge.
As a reminder, here is what the _targets.R script should look like so far:
library(targets)
library(tarchetypes)
library(moiraine)
tar_option_set(
packages = c(
"moiraine",
"MOFA2",
"mixOmics",
"readr",
"tibble",
"tidyr",
"dplyr",
"ggplot2",
"patchwork"
)
)
list(
# Importing data ---------------------------------------------------------------
## Data import using a target factory
import_dataset_csv_factory(
files = c(
system.file("extdata/genomics_dataset.csv", package = "moiraine"),
system.file("extdata/transcriptomics_dataset.csv", package = "moiraine"),
system.file("extdata/metabolomics_dataset.csv", package = "moiraine")
),
col_ids = c("marker", "gene_id", "sample_id"),
features_as_rowss = c(TRUE, TRUE, FALSE),
target_name_suffixes = c("geno", "transcripto", "metabo")
),
## Genomics features metadata file
tar_target(
fmetadata_file_geno,
system.file("extdata/genomics_features_info.csv", package = "moiraine"),
format = "file"
),
## Genomics features metadata import
tar_target(
fmetadata_geno,
import_fmetadata_csv(
fmetadata_file_geno,
col_id = "marker",
col_types = c("chromosome" = "c")
)
),
## Metabolomics features metadata import
import_fmetadata_csv_factory(
files = c(
system.file("extdata/metabolomics_features_info.csv", package = "moiraine")
),
col_ids = c("feature_id"),
target_name_suffixes = c("metabo")
),
## Transcriptomics features metadata import
import_fmetadata_gff_factory(
files = system.file("extdata/bos_taurus_gene_model.gff3", package = "moiraine"),
feature_types = "genes",
add_fieldss = c("Name", "description"),
target_name_suffixes = "transcripto"
),
## Samples metadata import
import_smetadata_csv_factory(
files = system.file("extdata/samples_info.csv", package = "moiraine"),
col_ids = "animal_id",
target_name_suffixes = "all"
),
## Creating omics sets for each dataset
create_omics_set_factory(
datasets = c(data_geno, data_transcripto, data_metabo),
omics_types = c("genomics", "transcriptomics", "metabolomics"),
features_metadatas = c(fmetadata_geno, fmetadata_transcripto, fmetadata_metabo),
samples_metadatas = c(smetadata_all, smetadata_all, smetadata_all)
),
## Creating the MultiDataSet object
tar_target(
mo_set,
create_multiomics_set(
list(set_geno,
set_transcripto,
set_metabo)
)
),
# Inspecting the `MultiDataSet` object -----------------------------------------
## Creating a density plot for each dataset
tar_target(
density_plots,
plot_density_data(
mo_set,
combined = FALSE,
scales = "free"
)
),
## Plotting the relationship between features mean and standard deviation
## for each dataset
tar_target(
mean_sd_plots,
plot_meansd_data(mo_set)
),
## Assessing missing values
tar_target(
n_missing_values,
check_missing_values(mo_set)
),
# Modifying the `MultiDataSet` object ------------------------------------------
## RNAseq differential expression results file
tar_target(
rnaseq_de_res_file,
system.file(
"extdata/transcriptomics_de_results.csv",
package = "moiraine"
),
format = "file"
),
## Reading the RNAseq differential expression results
tar_target(
rnaseq_de_res_df,
read_csv(rnaseq_de_res_file) |>
rename(feature_id = gene_id) |>
mutate(dataset = "rnaseq")
),
## Adding the differential expression results to the MultiDataSet object
tar_target(
mo_set_de,
add_features_metadata(mo_set, rnaseq_de_res_df)
),
# Data pre-processing ----------------------------------------------------------
## Applying transformations to the datasets
transformation_datasets_factory(
mo_set_de,
c("rnaseq" = "vst-deseq2",
"metabolome" = "logx"),
log_bases = 2,
pre_log_functions = zero_to_half_min,
transformed_data_name = "mo_set_transformed"
),
## Density plot for each transformed dataset
tar_target(
density_plots_transformed,
plot_density_data(
mo_set_transformed,
combined = FALSE,
scales = "free"
)
),
## Plotting the mean-SD trend for each transformed dataset
tar_target(
mean_sd_plots_transformed,
plot_meansd_data(mo_set_transformed)
),
## Summary table of the transformations applied
tar_target(
transformation_summary,
get_table_transformations(transformations_runs_list)
),
## Running a PCA on each dataset
pca_complete_data_factory(
mo_set_transformed,
complete_data_name = "mo_set_complete"
),
## PCA screeplots
tar_target(
pca_screeplots,
plot_screeplot_pca(pca_runs_list)
),
## PCA sample plots
tar_target(
pca_sample_plots,
plot_samples_coordinates_pca(
pca_runs_list,
datasets = "snps",
pcs = 1:3,
mo_data = mo_set_de,
colour_upper = "geno_comp_cluster",
shape_upper = "status",
colour_lower = "feedlot"
)
),
# Dataset pre-filtering --------------------------------------------------------
## Unsupervised feature selection based on MAD score
feature_preselection_mad_factory(
mo_set_complete,
to_keep_ns = c("snps" = 1000, "rnaseq" = 1000),
with_ties = TRUE,
filtered_set_target_name = "mo_presel_unsupervised"
),
## Diagnostic plot for MAD-based feature selection
tar_target(
preselection_mad_plot,
plot_feature_preselection_mad(individual_mad_values)
),
## Supervised feature selection based on disease status
feature_preselection_splsda_factory(
mo_set_complete,
group = "status",
to_keep_ns = c("snps" = 1000, "rnaseq" = 1000),
filtered_set_target_name = "mo_presel_supervised"
),
## Diagnostic plot for sPLS-DA based feature selection
tar_target(
preselection_splsda_plot,
plot_feature_preselection_splsda(individual_splsda_perf)
),
# Integration with sPLS --------------------------------------------------------
## Creating sPLS input
tar_target(
spls_input,
get_input_spls(
mo_presel_supervised,
mode = "canonical",
datasets = c("rnaseq", "metabolome")
)
),
## Initial PLS run with no feature selection and large number of components
tar_target(
spls_novarsel,
spls_run(
spls_input,
ncomp = 4
)
),
## Cross-validation for number of components
tar_target(
spls_perf_res,
mixOmics::perf(
spls_novarsel,
validation = "Mfold",
folds = 10,
nrepeat = 10,
cpus = 3
)
),
## Plotting cross-validation results (for number of components)
## Can try criterion = 'Q2.total', 'cor.tpred', 'cor.upred', 'RSS.tpred',
## 'RSS.upred' (but avoid 'RSS' and 'PRESS')
tar_target(
spls_perf_plot,
plot(spls_perf_res, criterion = "Q2.total")
),
## Selected value for ncomp
tar_target(
spls_optim_ncomp,
spls_get_optim_ncomp(spls_perf_res, min_ncomp = 2)
),
## Cross-validation for number of features to retain
tar_target(
spls_tune_res,
spls_tune(
spls_input,
ncomp = spls_optim_ncomp,
keepX = seq(10, 100, 10),
keepY = seq(10, 100, 10),
validation = "Mfold",
folds = 10,
nrepeat = 5,
measure = "cor",
cpus = 3
)
),
## Plotting cross-validation results (for number of features)
tar_target(
spls_tune_plot,
spls_plot_tune(spls_tune_res)
),
## Final sPLS run
tar_target(
spls_final_run,
spls_run(
spls_input,
ncomp = spls_optim_ncomp,
keepX = spls_tune_res$choice.keepX,
keepY = spls_tune_res$choice.keepY
)
),
# Integration with sO2PLS ------------------------------------------------------
## Creating sO2PLS input
tar_target(
omicspls_input,
get_input_omicspls(
mo_presel_supervised,
datasets = c("rnaseq", "metabolome")
)
),
## Adjusted cross-validation for number of components
tar_target(
so2pls_cv_adj,
so2pls_crossval_o2m_adjR2(
omicspls_input,
a = 1:5,
ax = seq(0, 10, by = 2),
ay = seq(0, 10, by = 2),
nr_folds = 10,
nr_cores = 6,
seed = 127
)
),
tar_target(
so2pls_cv_adj_res,
so2pls_get_optim_ncomp_adj(so2pls_cv_adj)
),
## Plotting adjusted cross-validation results
tar_target(
so2pls_cv_adj_plot,
so2pls_plot_cv_adj(so2pls_cv_adj)
),
## Standard cross-validation for number of components
tar_target(
so2pls_cv,
so2pls_crossval_o2m(
omicspls_input,
so2pls_cv_adj,
nr_folds = 10,
nr_cores = 6,
seed = 356
)
),
tar_target(
so2pls_cv_res,
so2pls_get_optim_ncomp(so2pls_cv)
),
## Plotting standard cross-validation results
tar_target(
so2pls_cv_plot,
so2pls_plot_cv(so2pls_cv)
),
## Cross-validation for sparsity parameters
tar_target(
so2pls_cv_sparsity,
so2pls_crossval_sparsity(
omicspls_input,
n = so2pls_cv_res["n"],
nx = so2pls_cv_res["nx"],
ny = so2pls_cv_res["ny"],
nr_folds = 10,
keepx_seq = c(seq(5, 30, 5), seq(40, 100, 10)),
keepy_seq = c(seq(5, 40, 5))
)
),
tar_target(
so2pls_cv_sparsity_res,
so2pls_get_optim_keep(so2pls_cv_sparsity)
),
## Plotting the results of the cross-validation for the number of features
## to retain from each dataset for the different joint components
tar_target(
so2pls_cv_sparsity_plot,
so2pls_plot_cv_sparsity(so2pls_cv_sparsity)
),
## Extracting sparsity results in table format
tar_target(
so2pls_cv_sparsity_table,
so2pls_print_cv_sparsity(so2pls_cv_sparsity_res)
),
## Final sO2PLS run
tar_target(
so2pls_final_run,
so2pls_o2m(
omicspls_input,
so2pls_cv_res,
so2pls_cv_sparsity_res
)
),
## Summary plot of percentage of variance explained
tar_target(
so2pls_summary_plot,
so2pls_plot_summary(so2pls_final_run)
),
## Screeplot
tar_target(
so2pls_screeplot,
so2pls_screeplot(so2pls_final_run)
),
## Comparison of samples score for joint components
tar_target(
so2pls_joint_components_comparison_plot,
so2pls_compare_samples_joint_components(
so2pls_final_run,
mo_data = mo_set_de,
colour_by = "status",
shape_by = "feedlot"
)
),
## Coefficient plot for joint components
tar_target(
so2pls_joint_components_coefficients_plot,
so2pls_plot_joint_components_coefficients(so2pls_final_run)
),
## Joint component samples score plot
tar_target(
so2pls_joint_components_samples_score_plot,
so2pls_plot_samples_joint_components(
so2pls_final_run,
mo_data = mo_set_de,
colour_upper = "status",
scale_colour_upper = scale_colour_brewer(palette = "Paired"),
shape_upper = "feedlot"
) +
theme(legend.box = "vertical")
),
## Specific components samples score plot
tar_target(
so2pls_specific_components_samples_score_plot,
so2pls_plot_samples_specific_components(
so2pls_final_run,
mo_data = mo_set_de,
colour_upper = "feedlot",
scale_colour_upper = scale_colour_brewer(palette = "Paired"),
colour_lower = "rnaseq_batch",
shape_upper = "gender"
) |>
map(\(x) x + theme(legend.box = "vertical"))
),
# Integration with MOFA --------------------------------------------------------
## Creating MOFA input
tar_target(
mofa_input,
get_input_mofa(
mo_presel_supervised,
options_list = list(
data_options = list(scale_views = TRUE),
model_options = list(likelihoods = c(
"snps" = "poisson",
"rnaseq" = "gaussian",
"metabolome" = "gaussian")
),
training_options = list(seed = 43)
),
only_common_samples = FALSE
)
),
## Overview plot of the samples in each dataset
tar_target(
mofa_input_plot,
plot_data_overview(mofa_input)
),
## Training MOFA model
tar_target(
mofa_trained,
run_mofa(
mofa_input,
save_data = TRUE,
use_basilisk = TRUE
)
),
## Formatting MOFA output
tar_target(
mofa_output,
get_output(mofa_trained)
),
## Plots of variance explained
tar_target(
mofa_var_explained_plot,
plot_variance_explained(
mofa_trained,
x = "view", ## datasets on the x-axis
y = "factor" ## factors on the y-axis
)
),
tar_target(
mofa_total_var_explained_plot,
plot_variance_explained(
mofa_trained,
x = "view",
y = "factor",
plot_total = TRUE
)[[2]]
),
## Plot of factors correlation with covariates
tar_target(
mofa_factors_covariates_cor_plot,
mofa_plot_cor_covariates(mofa_trained)
),
# Integration with DIABLO ------------------------------------------------------
## Creating the DIABLO input
tar_target(
diablo_input,
get_input_mixomics_supervised(
mo_presel_supervised,
group = "status"
)
),
## Running sPLS on each dataset to construct the design matrix
diablo_pairwise_pls_factory(diablo_input),
## Initial DIABLO run with no feature selection and large number of components
tar_target(
diablo_novarsel,
diablo_run(
diablo_input,
diablo_design_matrix,
ncomp = 7
)
),
## Cross-validation for number of components
tar_target(
diablo_perf_res,
mixOmics::perf(
diablo_novarsel,
validation = "Mfold",
folds = 10,
nrepeat = 10,
cpus = 3
)
),
## Plotting cross-validation results (for number of components)
tar_target(
diablo_perf_plot,
diablo_plot_perf(diablo_perf_res)
),
## Selected value for ncomp
tar_target(
diablo_optim_ncomp,
diablo_get_optim_ncomp(diablo_perf_res)
),
## Cross-validation for number of features to retain
tar_target(
diablo_tune_res,
diablo_tune(
diablo_input,
diablo_design_matrix,
ncomp = diablo_optim_ncomp,
validation = "Mfold",
folds = 10,
nrepeat = 5,
dist = "centroids.dist",
cpus = 3
)
),
## Plotting cross-validation results (for number of features)
tar_target(
diablo_tune_plot,
diablo_plot_tune(diablo_tune_res)
),
## Final DIABLO run
tar_target(
diablo_final_run,
diablo_run(
diablo_input,
diablo_design_matrix,
ncomp = diablo_optim_ncomp,
keepX = diablo_tune_res$choice.keepX
)
),
# Interpreting the integration results -----------------------------------------
## Generating standardised output
tar_target(
spls_output,
get_output(spls_final_run)
),
## Percentage of variance explained
tar_target(
spls_plot_variance_explained,
plot_variance_explained(spls_output)
),
## Samples score matrix plot
tar_target(
spls_samples_scores_plot,
plot_samples_score(
spls_output,
mo_data = mo_set_complete,
colour_upper = "status",
scale_colour_upper = palette_status,
shape_upper = "gender",
colour_lower = "feedlot",
scale_colour_lower = palette_feedlot
) +
theme(legend.box = "vertical")
),
## Distribution of features weight
tar_target(
spls_features_weight_distribution,
plot_features_weight_distr(spls_output)
),
## Plot of top contributing features
tar_target(
spls_top_features_plot,
plot_top_features(
spls_output,
mo_data = mo_set_complete,
label_cols = list(
"rnaseq" = "Name",
"metabolome" = "name"
)
)
),
## Table of selected features
tar_target(
spls_selected_features,
get_selected_features(spls_output)
),
## Generating standardised output
tar_target(
so2pls_output,
get_output(so2pls_final_run)
),
## Percentage of variance explained
tar_target(
so2pls_plot_variance_explained,
plot_variance_explained(so2pls_output, ncol = 1) +
theme(axis.text.x = element_text(size = 9, angle = 30, hjust = 1))
),
## Samples score matrix plot
tar_target(
so2pls_samples_scores_plot,
plot_samples_score(
so2pls_output,
latent_dimensions = "joint component 1",
mo_data = mo_set_complete,
colour_upper = "status",
scale_colour_upper = palette_status,
shape_upper = "gender"
)
),
## Distribution of features weight
tar_target(
so2pls_features_weight_distribution,
plot_features_weight_distr(so2pls_output) +
plot_layout(ncol = 2)
),
## Plot of top contributing features
tar_target(
so2pls_top_features_plot,
plot_top_features(
so2pls_output,
mo_data = mo_set_complete,
label_cols = list(
"rnaseq" = "Name",
"metabolome" = "name"
)
)
),
## Table of selected features
tar_target(
so2pls_selected_features,
get_selected_features(
so2pls_output,
latent_dimensions = "joint component 1"
)
),
## Percentage of variance explained
tar_target(
mofa_plot_variance_explained,
plot_variance_explained(mofa_output, ncol = 1) +
theme(axis.text.x = element_text(size = 9, angle = 30, hjust = 1))
),
## Samples score matrix plot
tar_target(
mofa_samples_scores_plot,
plot_samples_score(
mofa_output,
latent_dimensions = paste("Factor", 1:4),
mo_data = mo_set_complete,
colour_upper = "status",
scale_colour_upper = palette_status,
shape_upper = "gender",
colour_lower = "geno_comp_cluster",
scale_colour_lower = palette_geno_comp
) +
theme(legend.box = "vertical")
),
## Distribution of features weight
tar_target(
mofa_features_weight_distribution,
plot_features_weight_distr(
mofa_output,
latent_dimensions = paste("Factor", 1:4)
) +
plot_layout(ncol = 1)
),
## Plot of top contributing features
tar_target(
mofa_top_features_plot,
plot_top_features(
mofa_output,
mo_data = mo_set_complete,
label_cols = list(
"rnaseq" = "Name",
"metabolome" = "name"
),
latent_dimensions = paste("Factor", 1:2)
)
),
## Table of top contributing features
tar_target(
mofa_top_features,
get_top_features(
mofa_output,
min_importance = 0.8,
mo_data = mo_set_complete
)
),
## Generating standardised output
tar_target(
diablo_output,
get_output(diablo_final_run)
),
## Percentage of variance explained
tar_target(
diablo_plot_variance_explained,
plot_variance_explained(diablo_output, ncol = 2)
),
## Samples score matrix plot
tar_target(
diablo_samples_scores_plot,
plot_samples_score(
diablo_output,
mo_data = mo_set_complete,
colour_upper = "status",
scale_colour_upper = palette_status,
shape_upper = "gender",
colour_lower = "rnaseq_batch",
scale_colour_lower = palette_rnaseq_batch
) +
theme(legend.box = "vertical")
),
## Distribution of features weight
tar_target(
diablo_features_weight_distribution,
plot_features_weight_distr(
diablo_output
) +
plot_layout(ncol = 1)
),
## Plot of top contributing features
tar_target(
diablo_top_features_plot,
plot_top_features(
diablo_output,
mo_data = mo_set_complete,
label_cols = list(
"rnaseq" = "Name",
"metabolome" = "name"
),
latent_dimensions = paste("Component", 1:2)
)
),
## Table of top contributing features
tar_target(
diablo_selected_features,
get_selected_features(diablo_output)
)
)
13.1 Introduction
Evaluating the results of a multi-omics integration analysis is not an easy task, mainly because when analysing biological data there is no ground truth about the underlying biological mechanisms at play. However, it is possible to gain some measure of confidence in our results by comparing them to what we already know about the biological system studied, or to other analyses (typically single-omics analyses) performed on the same dataset. In the latter case, this can also show us what insights are gained by analysing the omics datasets together that would have been missed by looking at each omics dataset separately. Note that in this chapter, we refer to evaluation as the comparison of an integration method's results with prior knowledge or previous studies, while in Chapter 14 we will cover the comparison of results obtained with different integration tools.
When evaluating integration results, we can compare the features contribution (see Section 12.1) to some prior information that we have about the features: for example, their score in a differential expression analysis, whether they were found differentially expressed or not, or the group to which they belong (e.g. biological pathway or chemical class). On the other hand, we can assess whether the latent dimensions created separate different sample groups (e.g. control vs treatment, or different time points).
For this chapter, we will use the results from the DIABLO (Chapter 11) and MOFA (Chapter 10) analyses as examples. We will use the dimension reduction output objects that we created in Chapter 12, which contain the results of the DIABLO and MOFA runs stored in a standardised format (Section 12.2):
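The corresponding loading step is not shown here; a minimal version, using the diablo_output and mofa_output targets defined in the pipeline above, would be:
tar_load(diablo_output)
tar_load(mofa_output)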
Note that for the DIABLO run, the samples scores correspond to the weighted average of each latent component across the datasets, rather than to the dataset-specific latent components (this is controlled through the use_average_dimensions argument of the get_output() function).
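As a sketch, this is how that option would be set explicitly when generating the output; the argument name comes from the paragraph above, and TRUE is assumed here to be the default behaviour described:
## Sketch: use_average_dimensions = TRUE (assumed default) uses the weighted
## average of the dataset-specific components, as described above
diablo_output <- get_output(diablo_final_run, use_average_dimensions = TRUE)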
We will also use the MultiDataSet
object that contains information about the samples and features:
tar_load(mo_set_complete)
13.2 Evaluating features prioritisation
First, we can evaluate the integration results in terms of the features prioritisation, i.e. the ranking of the features for each latent dimension in terms of their importance score. We can for example assess whether the features that were given the highest scores for some latent components were also highlighted through single-omics analyses, or compare the importance score of groups of features, for example compounds from different chemical classes or genes from different gene families.
Here, we will compare the DIABLO results to the results of traditional single-omics analyses. In our case, GWAS and eQTL analyses were performed to detect genomic variants associated with the disease status of the samples. The mode of action of each significant marker (i.e. QTL, cis-eQTL or trans-eQTL), or its lack of significance, was recorded in the genomics features metadata table:
get_features_metadata(mo_set_complete)[["snps"]] |>
pull(qtl_type) |>
unique()
#> [1] "non signif." "cis eQTL" "trans eQTL" "QTL"
For the transcriptomics and metabolomics datasets, a differential expression (DE) analysis was performed on each to compare control and BRD animals. The resulting adjusted p-value, log2-fold change or t-value, status (upregulated, downregulated or not differentially expressed) and significance (differentially expressed or not) of each gene or compound are recorded in each dataset’s features metadata:
get_features_metadata(mo_set_complete)[["rnaseq"]] |>
select(feature_id, log_fc:de_status) |>
head()
#> feature_id log_fc log_cpm f
#> ENSBTAG00000000005 ENSBTAG00000000005 0.13603041 5.9051177 4.1626465
#> ENSBTAG00000000008 ENSBTAG00000000008 -0.12965356 -0.7437052 1.0676845
#> ENSBTAG00000000009 ENSBTAG00000000009 1.27141158 -2.5627934 23.9562951
#> ENSBTAG00000000010 ENSBTAG00000000010 0.39567729 6.2563594 60.5303416
#> ENSBTAG00000000011 ENSBTAG00000000011 0.07766873 -2.7608361 0.1418457
#> ENSBTAG00000000012 ENSBTAG00000000012 0.15756169 3.6628775 11.0448775
#> p_value fdr de_signif de_status
#> ENSBTAG00000000005 4.324808e-02 7.170137e-02 Not DE Not DE
#> ENSBTAG00000000008 3.032916e-01 3.938643e-01 Not DE Not DE
#> ENSBTAG00000000009 2.731328e-06 9.665375e-06 Not DE Not DE
#> ENSBTAG00000000010 1.581955e-12 1.521160e-11 Not DE Not DE
#> ENSBTAG00000000011 7.070363e-01 7.773874e-01 Not DE Not DE
#> ENSBTAG00000000012 1.141125e-03 2.623062e-03 Not DE Not DE
get_features_metadata(mo_set_complete)[["metabolome"]] |>
select(feature_id, t_value:de_status) |>
head()
#> feature_id t_value p_value padj de_signif
#> HMDB00001 HMDB00001 -0.5557020 5.797635e-01 6.784466e-01 Not DE
#> HMDB00008 HMDB00008 0.2181562 8.276321e-01 8.925444e-01 Not DE
#> HMDB00042 HMDB00042 -12.5323491 1.753101e-24 4.821028e-23 DE
#> HMDB00043 HMDB00043 -7.9073179 7.827088e-13 3.913544e-12 DE
#> HMDB00060 HMDB00060 -0.4369834 6.628164e-01 7.439776e-01 Not DE
#> HMDB00062 HMDB00062 8.2347595 1.271549e-13 6.993518e-13 DE
#> de_status
#> HMDB00001 Not DE
#> HMDB00008 Not DE
#> HMDB00042 downregulated
#> HMDB00043 downregulated
#> HMDB00060 Not DE
#> HMDB00062 upregulated
13.2.1 Table of counts of selected features against a grouping
As DIABLO performs feature selection, it is of interest to assess how many (e)QTLs and DE genes and compounds were retained for each latent component. We can use the evaluate_feature_selection_table()
function to generate a table of counts comparing the features selected for each latent component to a features grouping (in our case, the results of the single-omics analyses). The features grouping is obtained from the features metadata stored in the MultiDataSet
object. The function takes as input the integration result object, the MultiDataSet
object and a named list giving for each dataset the name of the column from their features metadata table containing the features grouping:
diablo_evaluation_counts_table <- evaluate_feature_selection_table(
diablo_output,
mo_data = mo_set_complete,
col_names = list(
"snps" = "qtl_type",
"rnaseq" = "de_signif",
"metabolome" = "de_signif"
)
)
diablo_evaluation_counts_table |>
filter(latent_dimension == "Component 1")
#> # A tibble: 8 × 6
#> method latent_dimension dataset feature_label selected not_selected
#> <chr> <fct> <fct> <chr> <int> <int>
#> 1 DIABLO Component 1 snps cis eQTL 1 2
#> 2 DIABLO Component 1 snps non signif. 17 975
#> 3 DIABLO Component 1 snps QTL 1 0
#> 4 DIABLO Component 1 snps trans eQTL 1 3
#> 5 DIABLO Component 1 rnaseq DE 23 52
#> 6 DIABLO Component 1 rnaseq Not DE 7 912
#> 7 DIABLO Component 1 metabolome DE 10 20
#> 8 DIABLO Component 1 metabolome Not DE 0 25
diablo_evaluation_counts_table |>
filter(latent_dimension == "Component 2")
#> # A tibble: 8 × 6
#> method latent_dimension dataset feature_label selected not_selected
#> <chr> <fct> <fct> <chr> <int> <int>
#> 1 DIABLO Component 2 snps cis eQTL 0 3
#> 2 DIABLO Component 2 snps non signif. 25 967
#> 3 DIABLO Component 2 snps QTL 0 1
#> 4 DIABLO Component 2 snps trans eQTL 0 4
#> 5 DIABLO Component 2 rnaseq DE 0 75
#> 6 DIABLO Component 2 rnaseq Not DE 15 904
#> 7 DIABLO Component 2 metabolome DE 6 24
#> 8 DIABLO Component 2 metabolome Not DE 4 21
We can see that for the first latent component, only three of the eight (e)QTLs were selected¹; roughly three-quarters (23 out of 30) of the genes selected were found differentially expressed; and all ten of the selected compounds were found differentially expressed. Conversely, for the second latent component, none of the selected markers were (e)QTLs, and none of the selected genes were differentially expressed. This makes sense: the DIABLO analysis was run with the goal of separating the control and infected animals, so the features selected for the first latent component, which is the one best able to separate the groups, are expected to be those detected as differentially expressed. On the contrary, the second latent component is constructed to be orthogonal to the first one, therefore the features it selects will likely not be differentially expressed.
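These proportions can be computed directly from the counts table; for example, a small dplyr sketch (column names as shown above; the frac_of_selected column is added here):
diablo_evaluation_counts_table |>
  filter(latent_dimension == "Component 1") |>
  group_by(dataset) |>
  mutate(frac_of_selected = selected / sum(selected)) |> ## share of selected features per label
  select(dataset, feature_label, selected, frac_of_selected)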
With this function, we can focus on only some datasets, by changing which datasets are present in the list passed to col_names
. Similarly, we can select specific latent dimensions through the latent_dimensions
parameter. For example, let us count the number of DE genes selected with the first latent component:
evaluate_feature_selection_table(
diablo_output,
mo_data = mo_set_complete,
col_names = list("rnaseq" = "de_signif"),
latent_dimensions = "Component 1"
)
#> # A tibble: 2 × 6
#> method latent_dimension dataset feature_label selected not_selected
#> <chr> <fct> <fct> <chr> <int> <int>
#> 1 DIABLO Component 1 rnaseq DE 23 52
#> 2 DIABLO Component 1 rnaseq Not DE 7 912
Note that this function is useful for integration methods that perform feature selection, such as DIABLO or sO2PLS, but will be irrelevant for methods that do not perform feature selection (for example MOFA tends to assign a non-null weight to most features).
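For such methods, thresholding on the importance score is a more natural way to shortlist features; for example, the mofa_top_features target in the pipeline above uses get_top_features() with an (arbitrary) minimum importance of 0.8:
mofa_top_features <- get_top_features(
  mofa_output,
  min_importance = 0.8,
  mo_data = mo_set_complete
)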
13.2.2 Plotting features weight against a covariate
We can go further and compare the weights or importance scores given to the features for each latent component to the outcome of the single-omics results. This can be done with the plot_features_weight_covariate()
function. As its name suggests, this function displays the features weight for each latent dimension against some covariate obtained from the features metadata tables. Again, we will look at the results of the GWAS/QTL and DE analyses and see whether (e)QTLs, DE genes and compounds were assigned higher weights in some of the latent components constructed with DIABLO. The input parameters for this function are similar to those for the evaluate_feature_selection_table()
function: we need to pass the results of the integration method, the MultiDataSet
object, and the named list giving the column in each features metadata table containing information about the features grouping. The latter is passed to the covariate
argument of the function. If some datasets are not present in this list, they won’t be present in the plot.
plot_features_weight_covariate(
diablo_output,
mo_data = mo_set_complete,
covariate = list(
"snps" = "qtl_type",
"rnaseq" = "de_status",
"metabolome" = "de_status"
)
)
#> Warning: Groups with fewer than two data points have been dropped.
#> Groups with fewer than two data points have been dropped.
#> Groups with fewer than two data points have been dropped.
#> Groups with fewer than two data points have been dropped.
#> Groups with fewer than two data points have been dropped.
#> Groups with fewer than two data points have been dropped.
#> Groups with fewer than two data points have been dropped.
#> Groups with fewer than two data points have been dropped.
By default, the function plots on the y-axis the signed importance score of the features, that is, their importance score with the sign of their weight applied. This can be controlled via the features_metric
argument. We see that most of the points have an importance score of 0, because most of the features were not selected. We can focus on only those features with a non-null weight with the remove_null_weight
argument:
plot_features_weight_covariate(
diablo_output,
mo_data = mo_set_complete,
covariate = list(
"snps" = "qtl_type",
"rnaseq" = "de_status",
"metabolome" = "de_status"
),
remove_null_weight = TRUE
)
#> Warning: Groups with fewer than two data points have been dropped.
#> Groups with fewer than two data points have been dropped.
#> Groups with fewer than two data points have been dropped.
#> Groups with fewer than two data points have been dropped.
#> Groups with fewer than two data points have been dropped.
Here, we see that for the first latent component, selected genes that were also found up-regulated were assigned on average a higher importance score than selected genes that were found not DE. Also, for the metabolomics dataset, selected compounds that were found up-regulated were assigned a positive weight while those found down-regulated were assigned a negative weight.
Note that with the plot_features_weight_covariate()
function, we can also plot the features against a continuous covariate, e.g. the log2-fold change obtained with the differential expressions:
plot_features_weight_covariate(
diablo_output,
mo_data = mo_set_complete,
covariate = list(
"rnaseq" = "log_fc",
"metabolome" = "t_value"
)
)
#> Warning: Removed 508 rows containing non-finite values (`stat_smooth()`).
#> Warning: Removed 508 rows containing missing values (`geom_point()`).
We can see a clear trend between the compounds’ t-values from the differential expression analysis and their signed importance scores for the first latent component.
We can further change the colour or shape of the points according to some information from the features metadata (information must be passed as a named list as for the covariate).
plot_features_weight_covariate(
diablo_output,
mo_data = mo_set_complete,
covariate = list(
"rnaseq" = "log_fc",
"metabolome" = "t_value"
),
colour_by = list(
"rnaseq" = "fdr",
"metabolome" = "padj"
)
) +
scale_colour_viridis_c(aesthetics = c("colour", "fill"))
#> Warning: Removed 508 rows containing non-finite values (`stat_smooth()`).
#> Warning: Removed 508 rows containing missing values (`geom_point()`).
Note that for continuous covariates, as with the function plot_samples_score_covariate()
, a loess curve is fit to summarise the trend between the covariates and the features weight. If colour_by
is used, and the corresponding variables are categorical, a different loess curve is fitted for each category. If instead the colour_by
variables are numeric, a single loess curve will be plotted for each latent dimension and dataset.
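For instance, a sketch (not part of the original code) colouring the points by the categorical de_signif column shown earlier, so that a separate loess curve is fitted for the DE and Not DE features:
plot_features_weight_covariate(
  diablo_output,
  mo_data = mo_set_complete,
  covariate = list(
    "rnaseq" = "log_fc",
    "metabolome" = "t_value"
  ),
  colour_by = list(
    ## categorical columns: one loess curve per significance group
    "rnaseq" = "de_signif",
    "metabolome" = "de_signif"
  )
)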
13.3 Features set enrichment
In addition to looking at the distribution of features importance scores or counting the number of selected features, we can also perform an enrichment analysis to assess whether each of the latent dimensions constructed is enriched for some feature sets of interest. Feature sets are groups of features that belong to a similar category: e.g. all features involved in the same biological pathway, or with the same GO term annotation, or even all up- or down-regulated features. They are built using prior knowledge, for example information extracted from databases, or constructed from the results of previous analyses. Importantly, a feature can belong to more than one set (for example a compound can be involved in several biological pathways).
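In practice, a collection of feature sets is simply a named list of feature ID vectors, as returned by the functions shown below; a toy sketch (set and feature names are made up) illustrating that a feature can belong to several sets:
example_sets <- list(
  "pathway A" = c("feature_1", "feature_2", "feature_3"),
  "pathway B" = c("feature_2", "feature_4") ## feature_2 belongs to both sets
)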
13.3.1 Constructing feature sets
With the moiraine
package, there are two ways to obtain feature sets to perform enrichment or for plots. The first one is to extract information from the features metadata tables stored in our MultiDataSet
object, which is done with the make_feature_sets_from_fm()
function. Note that if some features grouping is stored in the features metadata, it means that features can only belong to one set. For example, we will use the results of the single-omics analyses to group the features into sets: QTLs, cis-eQTLs, trans-eQTLs and non significant genomics markers, and up-regulated, down-regulated or non DE genes and compounds. Again, the function accepts a named list giving for each dataset the name of the column in the corresponding features metadata table to use to generate the groups, passed to the col_names
argument of the function. It returns a named list where each element is a feature set, and contains the ID of the features in the set:
tar_target(
sets_single_omics,
make_feature_sets_from_fm(
mo_set_complete,
col_names = list(
"snps" = "qtl_type",
"rnaseq" = "de_status",
"metabolome" = "de_status"
)
)
)
tar_read(sets_single_omics) |>
str()
#> List of 10
#> $ non signif. : chr [1:23000] "1_41768691" "10-27008241-A-C-rs42918694" "10-37505419-T-C-rs136559242" "10-49904259-G-A-rs471723345" ...
#> $ cis eQTL : chr [1:31] "ARS-BFGL-BAC-13210" "ARS-BFGL-NGS-113310" "ARS-BFGL-NGS-5022" "ARS-BFGL-NGS-59463" ...
#> $ trans eQTL : chr [1:4] "ARS-BFGL-NGS-80280" "BovineHD0100032240" "BovineHD2300010006" "Hapmap55381-rs29025399"
#> $ QTL : chr "BovineHD1800016801"
#> $ Not DE - rnaseq : chr [1:20224] "ENSBTAG00000000005" "ENSBTAG00000000008" "ENSBTAG00000000009" "ENSBTAG00000000010" ...
#> $ upregulated - rnaseq : chr [1:102] "ENSBTAG00000000377" "ENSBTAG00000000783" "ENSBTAG00000001051" "ENSBTAG00000001785" ...
#> $ downregulated - rnaseq : chr [1:9] "ENSBTAG00000001032" "ENSBTAG00000004824" "ENSBTAG00000011990" "ENSBTAG00000012403" ...
#> $ Not DE - metabolome : chr [1:25] "HMDB00001" "HMDB00008" "HMDB00060" "HMDB00097" ...
#> $ downregulated - metabolome: chr [1:20] "HMDB00042" "HMDB00043" "HMDB00064" "HMDB00094" ...
#> $ upregulated - metabolome : chr [1:10] "HMDB00062" "HMDB00092" "HMDB00159" "HMDB00169" ...
By default, the function keeps the different omics features separate: here for example, there is a “Not DE” set for the transcriptomics dataset, and a “Not DE” set for the metabolomics dataset. When there is no ambiguity as to which omics dataset they represent, the name of the dataset is not included in the set’s name. It is possible to merge feature sets with the same name between the omics datasets, by setting the combine_omics_sets
argument to TRUE
:
tar_target(
sets_single_omics_merged,
make_feature_sets_from_fm(
mo_set_complete,
col_names = list(
"snps" = "qtl_type",
"rnaseq" = "de_status",
"metabolome" = "de_status"
),
combine_omics_sets = TRUE
)
)
tar_read(sets_single_omics_merged) |>
str()
#> List of 7
#> $ non signif. : chr [1:23000] "1_41768691" "10-27008241-A-C-rs42918694" "10-37505419-T-C-rs136559242" "10-49904259-G-A-rs471723345" ...
#> $ cis eQTL : chr [1:31] "ARS-BFGL-BAC-13210" "ARS-BFGL-NGS-113310" "ARS-BFGL-NGS-5022" "ARS-BFGL-NGS-59463" ...
#> $ trans eQTL : chr [1:4] "ARS-BFGL-NGS-80280" "BovineHD0100032240" "BovineHD2300010006" "Hapmap55381-rs29025399"
#> $ QTL : chr "BovineHD1800016801"
#> $ Not DE : chr [1:20249] "ENSBTAG00000000005" "ENSBTAG00000000008" "ENSBTAG00000000009" "ENSBTAG00000000010" ...
#> $ upregulated : chr [1:112] "ENSBTAG00000000377" "ENSBTAG00000000783" "ENSBTAG00000001051" "ENSBTAG00000001785" ...
#> $ downregulated: chr [1:29] "ENSBTAG00000001032" "ENSBTAG00000004824" "ENSBTAG00000011990" "ENSBTAG00000012403" ...
Now the Not DE set contains the IDs of all genes and compounds that were not found differentially expressed.
Alternatively, feature sets can be constructed from an input data-frame. This is useful when looking at pathways or GO annotations, for which features can belong to several sets at once, as this information cannot be stored in a features metadata table. Here, we will use the GO terms assigned to the genes. Note that in this example, the feature sets will contain features from only one omics dataset, but this is not a requirement: sets could contain features from several omics datasets.
We start by reading in the data-frame of GO terms. We will focus on GO terms relating to biological processes for this example:
list(
tar_target(
rnaseq_go_terms_file,
system.file(
"extdata/transcriptomics_go_annotation.csv",
package = "moiraine"
),
format = "file"
),
tar_target(
rnaseq_go_df,
read_csv(rnaseq_go_terms_file) |>
filter(go_domain == "Biological process")
)
)
tar_load(rnaseq_go_df)
dim(rnaseq_go_df)
#> [1] 89226 4
head(rnaseq_go_df)
#> # A tibble: 6 × 4
#> gene_id go_id go_name go_domain
#> <chr> <chr> <chr> <chr>
#> 1 ENSBTAG00000000005 GO:0006468 protein phosphorylation Biologic…
#> 2 ENSBTAG00000000005 GO:0007165 signal transduction Biologic…
#> 3 ENSBTAG00000000005 GO:0031623 receptor internalization Biologic…
#> 4 ENSBTAG00000000008 GO:0006813 potassium ion transport Biologic…
#> 5 ENSBTAG00000000009 GO:0006355 regulation of DNA-templated transcrip… Biologic…
#> 6 ENSBTAG00000000009 GO:0006357 regulation of transcription by RNA po… Biologic…
The data-frame contains one row per gene (gene_id
) / GO term (go_id
) pair. In addition, the name of each GO term and the category (biological process, molecular function or cellular component) are indicated in the go_name
and go_domain
columns, respectively. When reading in the file, we’ve restricted the GO terms to only those that correspond to biological processes.
We can turn this data-frame into a list of feature sets with the make_feature_sets_from_df()
function. The function takes as input the data-frame, as well as the name of the column in the data-frame that contains the features ID (col_id
argument) and the name of the column in the data-frame that contains the feature sets ID or name (col_set
argument):
tar_target(
go_sets,
make_feature_sets_from_df(
rnaseq_go_df,
col_id = "gene_id",
col_set = "go_id"
)
)
tar_load(go_sets)
length(go_sets)
#> [1] 10745
head(go_sets) |>
str()
#> List of 6
#> $ GO:0006468: chr [1:580] "ENSBTAG00000000005" "ENSBTAG00000000013" "ENSBTAG00000000056" "ENSBTAG00000000184" ...
#> $ GO:0007165: chr [1:737] "ENSBTAG00000000005" "ENSBTAG00000000073" "ENSBTAG00000000081" "ENSBTAG00000000105" ...
#> $ GO:0031623: chr [1:31] "ENSBTAG00000000005" "ENSBTAG00000000308" "ENSBTAG00000001050" "ENSBTAG00000001139" ...
#> $ GO:0006813: chr [1:96] "ENSBTAG00000000008" "ENSBTAG00000000082" "ENSBTAG00000000745" "ENSBTAG00000000973" ...
#> $ GO:0006355: chr [1:915] "ENSBTAG00000000009" "ENSBTAG00000000040" "ENSBTAG00000000052" "ENSBTAG00000000074" ...
#> $ GO:0006357: chr [1:627] "ENSBTAG00000000009" "ENSBTAG00000000039" "ENSBTAG00000000052" "ENSBTAG00000000154" ...
13.3.2 Filtering feature sets against the datasets
Before performing any enrichment, we want to make sure that the only features in the sets are the ones that were measured in the datasets. This is part of ensuring that an appropriate background or reference features set is used when performing enrichment (see for example Timmons, Szkop, and Gallagher (2015)). To this end, the reduce_feature_sets_data()
function filters a list of feature sets so that only features present in a given MultiDataSet
object are kept in the sets; it then removes empty feature sets. Note that for this filtering, we are using the MultiDataSet
object that contains the full omics datasets, before pre-filtering:
tar_target(
go_sets_filtered,
reduce_feature_sets_data(go_sets, mo_set_complete)
)
The function yields the following message:
#> All features in sets are in the multi-omics dataset.
tar_load(go_sets_filtered)
13.3.3 Checking the number of features assigned to sets
We can then check the number of features that are assigned to at least one set with the check_feature_sets()
function, to get an idea of the coverage of our annotation. We will restrict the check to the transcriptomics dataset, since the GO annotation only covers genes:
sets_check <- check_feature_sets(
go_sets_filtered,
mo_set_complete,
datasets = "rnaseq"
)
#> rnaseq dataset: 14,364 of 20,335 (70.6%) features assigned to at least one set.
sets_check
#> # A tibble: 1 × 5
#> dataset n_annotated n frac_annotated message
#> <fct> <int> <int> <dbl> <chr>
#> 1 rnaseq 14364 20335 0.706 rnaseq dataset: 14,364 of 20,335 (70…
The function returns a tibble giving the name of the datasets, the number and fraction of features from the dataset that are present in at least one feature set (n_annotated
and frac_annotated
columns), the total number of features in the corresponding dataset (n
column), as well as sentence summarising this information (message
column) that is useful for reporting. In this example, 70.6% of the genes from the transcriptomics dataset are assigned to at least one GO term. This is important to check as the annotation coverage will impact the quality of the enrichment results that we get.
13.3.4 Enrichment of latent dimensions
We are now ready to perform an enrichment of the latent dimensions. There are many ways to perform functional enrichment (e.g. see Zhao and Rhee (2023)): based on over-representation, feature ranking, or even topology when the feature sets represent biological pathways. Here, we are not trying to provide an exhaustive list of options; however, it is relatively straightforward to extract from an integration method's output the features importance scores or the list of selected features for a specific latent dimension, and to use them for enrichment with the package of your choice.
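As an illustration of this manual route, here is a hypothetical sketch (not part of the moiraine pipeline) that extracts the genes selected on the first DIABLO component, which could then be passed to an external enrichment package; it assumes that the table returned by get_selected_features() contains latent_dimension, dataset and feature_id columns:
## Hypothetical sketch: column names of the selected-features table are assumed
diablo_comp1_genes <- get_selected_features(diablo_output) |>
  filter(latent_dimension == "Component 1", dataset == "rnaseq") |>
  pull(feature_id)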
In moiraine
, we use the gage
package (Luo et al. 2009) to perform the enrichment of each latent dimension against a list of feature sets. It uses a two-sample t-test to assess whether features in a set have higher importance scores than features not in the set. The evaluate_method_enrichment()
function takes as input the results from an integration method in a standardised format (here our diablo_output
object), and performs an enrichment for each latent dimension using the features sets provided as a list. We can focus on specific datasets or latent dimensions by passing their name to the datasets
or latent_dimensions
arguments, respectively. There are a number of parameters that need to be set:
- use_abs: by default, the function uses the absolute value of the importance scores to perform the enrichment, which allows it to detect feature sets in which features are given strong weights that are both positive and negative. If we instead want to search for feature sets in which the weights of the features are coordinated (i.e. either all positive or all negative), we can set the use_abs argument to FALSE.
- min_set_size: the minimum number of features a set must contain to be considered for enrichment. The default value is 5, i.e. sets with fewer than 5 features will not be tested for enrichment.
- add_missing_features and mo_data: when performing multi-omics integration, it is highly likely that the omics datasets have been pre-filtered prior to the analysis, which means that some features will not have received a weight in the results of the integration analysis. This can bias the enrichment results, as these are based only on the features that have a weight. So if a feature set contains 50 features, but 40 of these were discarded during the pre-filtering step, only the remaining 10 features will be considered when performing the enrichment. This might lead to finding a latent dimension enriched for this set, even though the set might not be biologically relevant since 80% of its features did not even pass the pre-filtering step. To account for that, it is possible to add to the integration results all of the features of a MultiDataSet object that are not present in the results. These features are assigned a weight of 0 to emphasise that they were not found to play any role for the latent dimensions. This is done by setting add_missing_features to TRUE, and passing to mo_data a MultiDataSet object that contains the omics datasets that have not been filtered. This way, the background set used in the enrichment will be all features measured in the omics datasets, and not only the ones that passed the pre-filtering step.
- sets_info_df and col_set: the feature sets are passed as a named list, therefore the only information that we have about these sets is their names as found in the list. To facilitate the interpretation of the enrichment results, it is possible to pass a data-frame giving information about the sets, through the sets_info_df argument. This information will be added to the enrichment results table. In that case, the col_set argument must also be specified: it takes the name of the column in the data-frame passed to sets_info_df that contains the sets' ID or name as seen in the list.
Here we will create this data-frame as follows:
tar_target(
go_sets_info,
rnaseq_go_df |>
dplyr::select(go_id, go_name) |>
dplyr::distinct()
)
tar_read(go_sets_info) |>
head()
#> # A tibble: 6 × 2
#> go_id go_name
#> <chr> <chr>
#> 1 GO:0006468 protein phosphorylation
#> 2 GO:0007165 signal transduction
#> 3 GO:0031623 receptor internalization
#> 4 GO:0006813 potassium ion transport
#> 5 GO:0006355 regulation of DNA-templated transcription
#> 6 GO:0006357 regulation of transcription by RNA polymerase II
We can run the enrichment analysis:
tar_target(
diablo_enrichment_results,
evaluate_method_enrichment(
diablo_output,
go_sets_filtered,
datasets = "rnaseq",
use_abs = TRUE,
min_set_size = 10,
add_missing_features = TRUE,
mo_data = mo_set_complete,
sets_info_df = go_sets_info,
col_set = "go_id"
)
)
The function returns a table giving for each latent component (latent_dimension
) and feature set (set_id
) the enrichment statistics (stat_mean
), p-value (pvalue
), adjusted p-value (adj_pvalue
) as well as the number of features in the set that are present in the integration results (set_size
). The table is arranged by increasing adjusted p-value, such that the significant results are at the top of the table. It is important to note that the multiple testing correction applied to the p-values occurs independently for each latent dimension, but there is no multiple testing correction performed across the latent dimensions. This can easily be done by the user, and should be considered when reporting the results.
tar_read(diablo_enrichment_results) |>
head()
#> # A tibble: 6 × 7
#> latent_dimension set_id stat_mean pvalue adj_pvalue set_size go_name
#> <fct> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 Component 2 GO:0006935 1.39 0.0846 0.601 57 chemotaxis
#> 2 Component 2 GO:0042274 1.37 0.0889 0.601 54 ribosomal sm…
#> 3 Component 2 GO:0050727 1.27 0.104 0.601 56 regulation o…
#> 4 Component 2 GO:0006364 0.977 0.166 0.601 77 rRNA process…
#> 5 Component 2 GO:0030890 0.979 0.168 0.601 30 positive reg…
#> 6 Component 2 GO:0043029 0.983 0.168 0.601 25 T cell homeo…
In this case, unsurprisingly, neither of the latent components was enriched for any GO term. This makes sense, since DIABLO performed a strong feature selection, so very few genes had non-null weights. We can repeat the enrichment with the MOFA integration results, which give more interesting results. For conciseness, we will run the enrichment only on the first three factors:
tar_target(
mofa_enrichment_results,
evaluate_method_enrichment(
mofa_output,
go_sets_filtered,
datasets = "rnaseq",
latent_dimensions = paste("Factor", 1:3),
use_abs = TRUE,
min_set_size = 10,
add_missing_features = TRUE,
mo_data = mo_set_complete,
sets_info_df = go_sets_info,
col_set = "go_id"
)
)
This time, there are a couple of small p-values (even though the adjusted p-values are all quite large):
tar_load(mofa_enrichment_results)
mofa_enrichment_results |> head()
#> # A tibble: 6 × 7
#> latent_dimension set_id stat_mean pvalue adj_pvalue set_size go_name
#> <fct> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 Factor 3 GO:0031424 4.13 0.000275 0.549 20 keratiniza…
#> 2 Factor 3 GO:0045109 3.60 0.000713 0.711 25 intermedia…
#> 3 Factor 3 GO:0030855 2.56 0.00651 0.764 56 epithelial…
#> 4 Factor 3 GO:0031069 2.07 0.0244 0.764 25 hair folli…
#> 5 Factor 3 GO:0030216 1.90 0.0317 0.764 49 keratinocy…
#> 6 Factor 3 GO:0001942 1.82 0.0373 0.764 38 hair folli…
We can see for example that MOFA factor 3 seems enriched in genes associated with the keratinization GO term (GO:0031424
). For factor 1 (the factor separating the healthy and infected animals), the keratinization term also yields the smallest p-value, together with several GO terms related to inflammation (even though after correction for multiple testing these p-values are no longer significant):
mofa_enrichment_results |>
filter(latent_dimension == "Factor 1") |>
head()
#> # A tibble: 6 × 7
#> latent_dimension set_id stat_mean pvalue adj_pvalue set_size go_name
#> <fct> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
#> 1 Factor 1 GO:0031424 2.70 0.00516 0.786 20 keratinizat…
#> 2 Factor 1 GO:0045109 2.43 0.00948 0.786 25 intermediat…
#> 3 Factor 1 GO:0010951 2.15 0.0208 0.786 25 negative re…
#> 4 Factor 1 GO:0006954 1.86 0.0318 0.786 165 inflammator…
#> 5 Factor 1 GO:0050727 1.75 0.0421 0.786 56 regulation …
#> 6 Factor 1 GO:0007166 1.62 0.0533 0.786 138 cell surfac…
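As noted above, no multiple testing correction is applied across the latent dimensions; a minimal sketch of a global adjustment the user could add (using the pvalue column of the results table):
mofa_enrichment_results |>
  mutate(adj_pvalue_global = p.adjust(pvalue, method = "BH")) |> ## BH correction across all factors and sets
  arrange(adj_pvalue_global) |>
  head()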
13.3.5 Plotting features weight for feature sets
We can dig into the results of the enrichment by visualising the distribution of features weight or importance score between features that are in a specific features set vs features that are not in the set. For example, to continue with the MOFA example, let’s look at the importance score of genes associated with the GO term GO:0031424
. We’ll only look at factors 1 and 3 for this example:
plot_features_weight_set(
mofa_output,
go_sets_filtered[["GO:0031424"]],
set_name = "GO:0031424 (keratinization)",
features_metric = "importance",
datasets = "rnaseq",
latent_dimensions = paste("Factor", c(1, 3)),
point_alpha = 0.2
)
In the resulting plot, we can see for each dataset (in this case, we’re only looking at the transcriptomics dataset since we have gene sets) and latent dimensions (rows of the plot) the distribution of features importance for genes in the set vs not in the set. We can see that the GO term investigated contains 20 genes, and in total, 994 genes have been assigned a weight with MOFA (because we have performed some pre-filtering on the transcriptomics dataset prior to running MOFA). Therefore, out of all the genes in the set, 9 of them have no weight (they did not pass the pre-filtering step). We can add these features that did not make it into the MOFA input data in the same way as we did for the enrichment function, namely by setting the add_missing_features
argument to TRUE and passing the unfiltered MultiDataSet
object through mo_data
:
plot_features_weight_set(
mofa_output,
go_sets_filtered[["GO:0031424"]],
set_name = "GO:0031424 (keratinization)",
features_metric = "importance",
datasets = "rnaseq",
latent_dimensions = paste("Factor", c(1, 3)),
point_alpha = 0.2,
add_missing_features = TRUE,
mo_data = mo_set_complete
)
This is the distribution of values that were used for the enrichment test, but it’s not very useful to look at since the vast majority of the genes have a weight of zero.
13.4 Assessing samples clustering
Another aspect that we can evaluate is whether the integration method was able to separate different groups of samples, for example to identify differences between control and treatment samples. This is done by comparing the coordinates of the samples in the space spanned by the latent dimensions created (i.e. the samples score) to a samples label obtained from the samples metadata. For this, we can use the silhouette score, a metric that quantifies how well-defined different clusters are in a given space, i.e. how well points from the same cluster (here, samples group) are grouped together, and how separated they are from points in other clusters. First, a silhouette score (or silhouette width) is computed for each individual point (here, sample). Then, these values are averaged over all points in a given cluster to yield a cluster silhouette score. The resulting values, both at the point and cluster level, range from -1 to 1. Negative values indicate that a point is grouped with points from another cluster, while positive values indicate that it is grouped with other points from its own cluster. Values close to 0 show that the clustering is ambiguous, while scores close to 1 indicate well-defined clusters.
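To make the metric concrete, here is a minimal, self-contained sketch of the underlying computation on made-up data; the compute_samples_silhouette() function described below performs an analogous computation on the samples scores of an integration result:
library(cluster)

set.seed(42)
scores <- matrix(rnorm(20), nrow = 10)         ## 10 samples x 2 latent dimensions
groups <- factor(rep(c("Control", "BRD"), 5))  ## samples grouping

## silhouette() takes integer cluster codes and a dissimilarity matrix
sil <- silhouette(as.integer(groups), dist(scores))

sil[, "sil_width"]                             ## per-sample silhouette widths
tapply(sil[, "sil_width"], groups, mean)       ## group average silhouettes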
The compute_samples_silhouette()
function relies on the cluster::silhouette()
function from the cluster
package to compute silhouette score. It takes as input the integration results in a standardised format, as well as the MultiDataSet
object (to access the samples metadata) and the name of the column in the samples metadata table that contains the samples grouping information. In addition, it is possible to specify the distance metric that should be used to compute distances between the samples (through the distance_metric
argument). By default, the function uses euclidean distances, but other options are available.
Here, we will check how well DIABLO managed to separate the disease status groups:
tar_target(
diablo_silhouette,
compute_samples_silhouette(
diablo_output,
mo_set_complete,
"status"
)
)
tar_load(diablo_silhouette)
diablo_silhouette
#> $samples_silhouette
#> # A tibble: 135 × 4
#> sample_id group neighbour_group silhouette_width
#> <chr> <fct> <fct> <dbl>
#> 1 G1979 BRD Control 0.334
#> 2 G2500 BRD Control 0.255
#> 3 G3030 BRD Control 0.328
#> 4 G3068 BRD Control 0.370
#> 5 G3121 BRD Control 0.423
#> 6 G3315 Control BRD 0.529
#> 7 G3473 Control BRD 0.355
#> 8 G3474 Control BRD 0.477
#> 9 G3550 BRD Control 0.362
#> 10 G3594 Control BRD 0.567
#> # ℹ 125 more rows
#>
#> $groups_average_silhouette
#> # A tibble: 2 × 2
#> group group_average_width
#> <fct> <dbl[1d]>
#> 1 BRD 0.309
#> 2 Control 0.455
The function returns a list with two elements: a tibble of samples silhouette widths (samples_silhouette
), and a tibble of groups average silhouette (groups_average_silhouette
). In the first table, the group
column indicates for each sample the group to which it is assigned (based on the samples metadata), and the neighbour_group
column indicates the closest other group in the latent dimensions space (which is not very useful in this case since we only have two groups). The samples silhouette width is shown in the silhouette_width
column. This is very useful to identify “outlier” samples that were not grouped as expected:
diablo_silhouette$samples_silhouette |>
filter(silhouette_width < 0)
#> # A tibble: 1 × 4
#> sample_id group neighbour_group silhouette_width
#> <chr> <fct> <fct> <dbl>
#> 1 P4735 BRD Control -0.217
Here, there is one sample from an infected animal that clusters with the control samples in the latent dimensions space.
The average silhouette scores for the two disease status groups are both positive, although not close to 1, indicating that DIABLO did a reasonable job of identifying differences between the two groups:
diablo_silhouette$groups_average_silhouette
#> # A tibble: 2 × 2
#> group group_average_width
#> <fct> <dbl[1d]>
#> 1 BRD 0.309
#> 2 Control 0.455
This is expected, since DIABLO is a supervised integration method which tries to maximise the separation between groups of samples.
We can repeat the operation for the MOFA results. By default, the compute_samples_silhouette()
function uses all latent dimensions to compute distances between samples, but it is possible to specify which latent dimensions should be considered through the latent_dimensions
argument. For example, we will see how the first three MOFA factors separate the disease status groups:
tar_target(
mofa_silhouette,
compute_samples_silhouette(
mofa_output,
mo_set_complete,
"status",
latent_dimensions = paste("Factor", 1:3)
)
)
tar_load(mofa_silhouette)
mofa_silhouette$groups_average_silhouette
#> # A tibble: 2 × 2
#> group group_average_width
#> <fct> <dbl[1d]>
#> 1 BRD 0.257
#> 2 Control 0.524
Both groups’ silhouette scores are positive, although the BRD group’s score is not very high. Also, more samples have a silhouette width below 0 than with the DIABLO results:
mofa_silhouette$samples_silhouette |>
filter(silhouette_width < 0)
#> # A tibble: 11 × 4
#> sample_id group neighbour_group silhouette_width
#> <chr> <fct> <fct> <dbl>
#> 1 O3720 BRD Control -0.430
#> 2 O4492 BRD Control -0.246
#> 3 O4919 BRD Control -0.440
#> 4 O5003 BRD Control -0.243
#> 5 P4666 BRD Control -0.0994
#> 6 P4735 BRD Control -0.429
#> 7 R50 BRD Control -0.168
#> 8 R5979 BRD Control -0.00625
#> 9 R9887 BRD Control -0.0659
#> 10 Y3059 BRD Control -0.00504
#> 11 Y9745 BRD Control -0.0358
Note that with MOFA, since the different factors represent different trends within the data, it is possible that using more factors will yield smaller average silhouette scores for the groups. For example, if we used all of the constructed factors:
compute_samples_silhouette(
mofa_output,
mo_set_complete,
"status"
)$groups_average_silhouette
#> # A tibble: 2 × 2
#> group group_average_width
#> <fct> <dbl[1d]>
#> 1 BRD 0.0658
#> 2 Control 0.272
In such cases, it makes more sense to evaluate the ability of one or two latent dimensions to separate the samples groups.
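For example, a quick sketch of the group silhouettes computed from Factor 1 alone (the factor described earlier as separating the healthy and infected animals):
compute_samples_silhouette(
  mofa_output,
  mo_set_complete,
  "status",
  latent_dimensions = "Factor 1"
)$groups_average_silhouette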
13.5 Recap – targets list
For convenience, here is the list of targets that we created in this section:
Targets list for evaluating the output of an integration method:
list(
## Evaluating DIABLO selected features against single-omics results
tar_target(
diablo_selected_vs_singleomics_table,
evaluate_feature_selection_table(
diablo_output,
mo_data = mo_set_complete,
col_names = list(
"snps" = "qtl_type",
"rnaseq" = "de_signif",
"metabolome" = "de_signif"
)
)
),
## Plotting DIABLO features weight against single-omics results
tar_target(
diablo_features_weight_vs_singleomics_plot,
plot_features_weight_covariate(
diablo_output,
mo_data = mo_set_complete,
covariate = list(
"snps" = "qtl_type",
"rnaseq" = "de_status",
"metabolome" = "de_status"
),
remove_null_weight = TRUE
)
),
## Genes GO annotation file
tar_target(
rnaseq_go_terms_file,
system.file(
"extdata/transcriptomics_go_annotation.csv",
package = "moiraine"
),
format = "file"
),
## Genes GO annotation data-frame
tar_target(
rnaseq_go_df,
read_csv(rnaseq_go_terms_file) |>
filter(go_domain == "Biological process")
),
## GO term sets
tar_target(
go_sets,
make_feature_sets_from_df(
rnaseq_go_df,
col_id = "gene_id",
col_set = "go_id"
)
),
## Filtering GO term sets against measured features
tar_target(
go_sets_filtered,
reduce_feature_sets_data(go_sets, mo_set_complete)
),
## Checking genes GO term sets against datasets
tar_target(
go_sets_check,
check_feature_sets(
go_sets_filtered,
mo_set_complete,
datasets = "rnaseq"
)
),
## Table of information about GO terms
tar_target(
go_sets_info,
rnaseq_go_df |>
dplyr::select(go_id, go_name) |>
dplyr::distinct()
),
## MOFA latent components enrichment analysis
tar_target(
mofa_enrichment_results,
evaluate_method_enrichment(
mofa_output,
go_sets_filtered,
datasets = "rnaseq",
latent_dimensions = paste("Factor", 1:3),
use_abs = TRUE,
min_set_size = 10,
add_missing_features = TRUE,
mo_data = mo_set_complete,
sets_info_df = go_sets_info,
col_set = "go_id"
)
),
## Plotting features weight for GO term 'GO:0031424'
tar_target(
mofa_enrichment_go0031424_plot,
plot_features_weight_set(
mofa_output,
go_sets_filtered[["GO:0031424"]],
set_name = "GO:0031424 (keratinization)",
features_metric = "importance",
datasets = "rnaseq",
latent_dimensions = paste("Factor", c(1, 3)),
point_alpha = 0.2
)
),
## Assessing DIABLO samples clustering
tar_target(
diablo_silhouette,
compute_samples_silhouette(
diablo_output,
mo_set_complete,
"status"
)
)
)
¹ Note that in the full genomics dataset, 36 markers were detected as QTLs or eQTLs; however, only eight of them were retained in the features preselection step (Section 7.3.2). For a real analysis, all 36 markers would also have been included in the prefiltered dataset even if they did not pass the preselection.↩︎