13 Evaluating the integration results

library(targets)
library(moiraine)

## For data-frames manipulation
library(dplyr)

## For colour palettes
library(ggplot2)

After having investigated the results of an integration method (Chapter 12), we will show in this chapter how to evaluate the integration results against single-omics analyses results or prior biological knowledge.

As a reminder, here is what the _targets.R script should look like so far:

_targets.R script

library(targets)
library(tarchetypes)
library(moiraine)

tar_option_set(
  packages = c(
    "moiraine",
    "MOFA2",
    "mixOmics",
    "readr",
    "tibble",
    "tidyr",
    "dplyr",
    "ggplot2",
    "patchwork"
  )
)

list(
# Importing data ---------------------------------------------------------------

  ## Data import using a target factory
  import_dataset_csv_factory(
    files = c(
      system.file("extdata/genomics_dataset.csv", package = "moiraine"),
      system.file("extdata/transcriptomics_dataset.csv", package = "moiraine"),
      system.file("extdata/metabolomics_dataset.csv", package = "moiraine")
    ),
    col_ids = c("marker", "gene_id", "sample_id"),
    features_as_rowss = c(TRUE, TRUE, FALSE),
    target_name_suffixes = c("geno", "transcripto", "metabo")
  ),
  
  ## Genomics features metadata file
  tar_target(
    fmetadata_file_geno,
    system.file("extdata/genomics_features_info.csv", package = "moiraine"),
    format = "file"
  ),
  
  ## Genomics features metadata import
  tar_target(
    fmetadata_geno,
    import_fmetadata_csv(
      fmetadata_file_geno,
      col_id = "marker",
      col_types = c("chromosome" = "c")
    )
  ),
  
  ## Metabolomics features metadata import
  import_fmetadata_csv_factory(
    files = c(
      system.file("extdata/metabolomics_features_info.csv", package = "moiraine")
    ),
    col_ids = c("feature_id"),
    target_name_suffixes = c("metabo")
  ),
  
  ## Transcriptomics features metadata import
  import_fmetadata_gff_factory(
    files = system.file("extdata/bos_taurus_gene_model.gff3", package = "moiraine"),
    feature_types = "genes",
    add_fieldss = c("Name", "description"),
    target_name_suffixes = "transcripto"
  ),
  
  ## Samples metadata import
  import_smetadata_csv_factory(
    files = system.file("extdata/samples_info.csv", package = "moiraine"),
    col_ids = "animal_id",
    target_name_suffixes = "all"
  ),
  
  ## Creating omics sets for each dataset
  create_omics_set_factory(
    datasets = c(data_geno, data_transcripto, data_metabo),
    omics_types = c("genomics", "transcriptomics", "metabolomics"),
    features_metadatas = c(fmetadata_geno, fmetadata_transcripto, fmetadata_metabo),
    samples_metadatas = c(smetadata_all, smetadata_all, smetadata_all)
  ),
  
  ## Creating the MultiDataSet object
  tar_target(
    mo_set,
    create_multiomics_set(
      list(set_geno,
           set_transcripto,
           set_metabo)
    )
  ),

# Inspecting the `MultiDataSet` object -----------------------------------------

  ## Creating a density plot for each dataset
  tar_target(
    density_plots,
    plot_density_data(
      mo_set,
      combined = FALSE,
      scales = "free"
    )
  ),

  ## Plotting the relationship between features mean and standard deviation
  ## for each dataset
  tar_target(
    mean_sd_plots,
    plot_meansd_data(mo_set)
  ),
  
  ## Assessing missing values
  tar_target(
    n_missing_values,
    check_missing_values(mo_set)
  ),

# Modifying the `MultiDataSet` object ------------------------------------------

  ## RNAseq differential expression results file
  tar_target(
    rnaseq_de_res_file,
    system.file(
      "extdata/transcriptomics_de_results.csv",
      package = "moiraine"
    ),
    format = "file"
  ),

  ## Reading the RNAseq differential expression results
  tar_target(
    rnaseq_de_res_df,
    read_csv(rnaseq_de_res_file) |>
      rename(feature_id = gene_id) |>
      mutate(dataset = "rnaseq")
  ),

  ## Adding the differential expression results to the MultiDataSet object
  tar_target(
    mo_set_de,
    add_features_metadata(mo_set, rnaseq_de_res_df)
  ),

# Data pre-processing ----------------------------------------------------------

  ## Applying transformations to the datasets
  transformation_datasets_factory(
    mo_set_de,
    c("rnaseq" = "vst-deseq2",
      "metabolome" = "logx"),
    log_bases = 2,
    pre_log_functions = zero_to_half_min,
    transformed_data_name = "mo_set_transformed"
  ),
  
  ## Density plot for each transformed dataset
  tar_target(
    density_plots_transformed,
    plot_density_data(
      mo_set_transformed,
      combined = FALSE,
      scales = "free"
    )
  ),
  
  ## Plotting the mean-SD trend for transformed each dataset
  tar_target(
    mean_sd_plots_transformed,
    plot_meansd_data(mo_set_transformed)
  ),
  
  ## Summary table of the transformations applied
  tar_target(
    transformation_summary,
    get_table_transformations(transformations_runs_list)
  ),
  
  ## Running a PCA on each dataset
  pca_complete_data_factory(
    mo_set_transformed,
    complete_data_name = "mo_set_complete"
  ),
  
  ## PCA screeplots
  tar_target(
    pca_screeplots,
    plot_screeplot_pca(pca_runs_list)
  ),
  
  ## PCA sample plots
  tar_target(
    pca_sample_plots,
    plot_samples_coordinates_pca(
      pca_runs_list,
      datasets = "snps",
      pcs = 1:3,
      mo_data = mo_set_de,
      colour_upper = "geno_comp_cluster",
      shape_upper = "status",
      colour_lower = "feedlot"
    )
  ),

# Dataset pre-filtering --------------------------------------------------------

  ## Unsupervised feature selection based on MAD score
  feature_preselection_mad_factory(
    mo_set_complete,
    to_keep_ns = c("snps" = 1000, "rnaseq" = 1000),
    with_ties = TRUE,
    filtered_set_target_name = "mo_presel_unsupervised"
  ),
  
  ## Diagnostic plot for MAD-based feature selection
  tar_target(
    preselection_mad_plot,
    plot_feature_preselection_mad(individual_mad_values)
  ),
  
  ## Supervised feature selection based on bruising groups
  feature_preselection_splsda_factory(
    mo_set_complete,
    group = "status",
    to_keep_ns = c("snps" = 1000, "rnaseq" = 1000),
    filtered_set_target_name = "mo_presel_supervised"
  ),
  
  ## Diagnostic plot for sPLS-DA based feature selection
  tar_target(
    preselection_splsda_plot,
    plot_feature_preselection_splsda(individual_splsda_perf)
  ),

# Integration with sPLS --------------------------------------------------------

  ## Creating sPLS input
  tar_target(
    spls_input,
    get_input_spls(
      mo_presel_supervised,
      mode = "canonical",
      datasets = c("rnaseq", "metabolome")
    )
  ),
  
  ## Initial PLS run with no feature selection and large number of components
  tar_target(
    spls_novarsel,
    spls_run(
      spls_input,
      ncomp = 4
    )
  ),
  
  ## Cross-validation for number of components
  tar_target(
    spls_perf_res,
    mixOmics::perf(
      spls_novarsel,
      validation = "Mfold",
      folds = 10,
      nrepeat = 10,
      cpus = 3
    )
  ),
  
  ## Plotting cross-validation results (for number of components)
  ## Can try criterion = 'Q2.total', 'cor.tpred', 'cor.upred', 'RSS.tpred',
  ## 'RSS.upred' (but avoid 'RSS' and 'PRESS')
  tar_target(
    spls_perf_plot,
    plot(spls_perf_res, criterion = "Q2.total")
  ),
  
  ## Selected value for ncomp
  tar_target(
    spls_optim_ncomp,
    spls_get_optim_ncomp(spls_perf_res, min_ncomp = 2)
  ),
  
  ## Cross-validation for number of features to retain
  tar_target(
    spls_tune_res,
    spls_tune(
      spls_input,
      ncomp = spls_optim_ncomp,
      keepX = seq(10, 100, 10),
      keepY = seq(10, 100, 10),
      validation = "Mfold",
      folds = 10,
      nrepeat = 5,
      measure = "cor",
      cpus = 3
    )
  ),
  
  ## Plotting cross-validation results (for number of features)
  tar_target(
    spls_tune_plot,
    spls_plot_tune(spls_tune_res)
  ),
  
  ## Final sPLS run
  tar_target(
    spls_final_run,
    spls_run(
      spls_input,
      ncomp = spls_optim_ncomp,
      keepX = spls_tune_res$choice.keepX,
      keepY = spls_tune_res$choice.keepY
    )
  ),

# Integration with sO2PLS ------------------------------------------------------

  ## Creating sO2PLS input
  tar_target(
    omicspls_input,
    get_input_omicspls(
      mo_presel_supervised,
      datasets = c("rnaseq", "metabolome")
    )
  ),
  
  ## Adjusted cross-validation for number of components
  tar_target(
    so2pls_cv_adj,
    so2pls_crossval_o2m_adjR2(
      omicspls_input,
      a = 1:5,
      ax = seq(0, 10, by = 2),
      ay = seq(0, 10, by = 2),
      nr_folds = 10,
      nr_cores = 6,
      seed = 127
    )
  ),
  tar_target(
    so2pls_cv_adj_res,
    so2pls_get_optim_ncomp_adj(so2pls_cv_adj)
  ),
  
  ## Plotting adjusted cross-validation results
  tar_target(
    so2pls_cv_adj_plot,
    so2pls_plot_cv_adj(so2pls_cv_adj)
  ),
  
  ## Standard cross-validation for number of components
  tar_target(
    so2pls_cv,
    so2pls_crossval_o2m(
      omicspls_input,
      so2pls_cv_adj,
      nr_folds = 10,
      nr_cores = 6,
      seed = 356
    )
  ),
  tar_target(
    so2pls_cv_res,
    so2pls_get_optim_ncomp(so2pls_cv)
  ),
  
  ## Plotting standard cross-validation results
  tar_target(
    so2pls_cv_plot,
    so2pls_plot_cv(so2pls_cv)
  ),
  
  ## Cross-validation for sparsity parameters
  tar_target(
    so2pls_cv_sparsity,
    so2pls_crossval_sparsity(
      omicspls_input,
      n = so2pls_cv_res["n"],
      nx = so2pls_cv_res["nx"],
      ny = so2pls_cv_res["ny"],
      nr_folds = 10,
      keepx_seq = c(seq(5, 30, 5), seq(40, 100, 10)),
      keepy_seq = c(seq(5, 40, 5))
    )
  ),
  tar_target(
    so2pls_cv_sparsity_res,
    so2pls_get_optim_keep(so2pls_cv_sparsity)
  ),
  
  ## Plotting the results of the cross-validation for the number of features
  ## to retain from each dataset for the different joint components
  tar_target(
    so2pls_cv_sparsity_plot,
    so2pls_plot_cv_sparsity(so2pls_cv_sparsity)
  ),
  
  ## Extracting sparsity results in table format
  tar_target(
    so2pls_cv_sparsity_table,
    so2pls_print_cv_sparsity(so2pls_cv_sparsity_res)
  ),
  
  ## Final sO2PLS run
  tar_target(
    so2pls_final_run,
    so2pls_o2m(
      omicspls_input,
      so2pls_cv_res,
      so2pls_cv_sparsity_res
    )
  ),
  
  ## Summary plot of percentage of variance explained
  tar_target(
    so2pls_summary_plot,
    so2pls_plot_summary(so2pls_final_run)
  ),
  
  ## Screeplot
  tar_target(
    so2pls_screeplot,
    so2pls_screeplot(so2pls_final_run)
  ),
  
  ## Comparison of samples score for joint components
  tar_target(
    so2pls_joint_components_comparison_plot,
    so2pls_compare_samples_joint_components(
      so2pls_final_run,
      mo_data = mo_set_de,
      colour_by = "status",
      shape_by = "feedlot"
    )
  ),
  
  ## Coefficient plot for joint components
  tar_target(
    so2pls_joint_components_coefficients_plot,
    so2pls_plot_joint_components_coefficients(so2pls_final_run)
  ),
  
  ## Joint component samples score plot
  tar_target(
    so2pls_joint_components_samples_score_plot,
    so2pls_plot_samples_joint_components(
      so2pls_final_run,
      mo_data = mo_set_de,
      colour_upper = "status",
      scale_colour_upper = scale_colour_brewer(palette = "Paired"),
      shape_upper = "feedlot"
    ) +
      theme(legend.box = "vertical")
  ),
  
  ## Specific components samples score plot
  tar_target(
    so2pls_specific_components_samples_score_plot,
    so2pls_plot_samples_specific_components(
      so2pls_final_run,
      mo_data = mo_set_de,
      colour_upper = "feedlot",
      scale_colour_upper = scale_colour_brewer(palette = "Paired"),
      colour_lower = "rnaseq_batch",
      shape_upper = "gender"
    ) |> 
      map(\(x) x + theme(legend.box = "vertical"))
  ),

# Integration with MOFA --------------------------------------------------------

  ## Creating MOFA input
  tar_target(
    mofa_input,
    get_input_mofa(
      mo_presel_supervised,
      options_list = list(
        data_options = list(scale_views = TRUE),
        model_options = list(likelihoods = c(
          "snps" = "poisson",
          "rnaseq" = "gaussian",
          "metabolome" = "gaussian")
        ),
        training_options = list(seed = 43)
      ),
      only_common_samples = FALSE
    )
  ),
  
  ## Overview plot of the samples in each dataset
  tar_target(
    mofa_input_plot,
    plot_data_overview(mofa_input)
  ),
  
  ## Training MOFA model
  tar_target(
    mofa_trained,
    run_mofa(
      mofa_input,
      save_data = TRUE,
      use_basilisk = TRUE
    )
  ),
  
  ## Formatting MOFA output
  tar_target(
    mofa_output,
    get_output(mofa_trained)
  ),
  
  ## Plots of variance explained
  tar_target(
    mofa_var_explained_plot,
    plot_variance_explained(
      mofa_trained,
      x = "view",  ## datasets on the x-axis
      y = "factor" ## factors on the y-axis
    )
  ),
  tar_target(
    mofa_total_var_explained_plot,
    plot_variance_explained(
      mofa_trained,
      x = "view",
      y = "factor",
      plot_total = TRUE
    )[[2]]
  ),
  
  ## Plot of factors correlation with covariates
  tar_target(
    mofa_factors_covariates_cor_plot,
    mofa_plot_cor_covariates(mofa_trained)
  ),

# Integration with DIABLO ------------------------------------------------------

  ## Creating the DIABLO input
  tar_target(
    diablo_input,
    get_input_mixomics_supervised(
      mo_presel_supervised,
      group = "status"
    )
  ),
  
  ## Running sPLS on each dataset to construct the design matrix
  diablo_pairwise_pls_factory(diablo_input),
  
  ## Initial DIABLO run with no feature selection and large number of components
  tar_target(
    diablo_novarsel,
    diablo_run(
      diablo_input,
      diablo_design_matrix,
      ncomp = 7
    )
  ),
  
  ## Cross-validation for number of components
  tar_target(
    diablo_perf_res,
    mixOmics::perf(
      diablo_novarsel,
      validation = "Mfold",
      folds = 10,
      nrepeat = 10,
      cpus = 3
    )
  ),
  
  ## Plotting cross-validation results (for number of components)
  tar_target(
    diablo_perf_plot,
    diablo_plot_perf(diablo_perf_res)
  ),
  
  ## Selected value for ncomp
  tar_target(
    diablo_optim_ncomp,
    diablo_get_optim_ncomp(diablo_perf_res)
  ),
  
  ## Cross-validation for number of features to retain
  tar_target(
    diablo_tune_res,
    diablo_tune(
      diablo_input,
      diablo_design_matrix,
      ncomp = diablo_optim_ncomp,
      validation = "Mfold",
      folds = 10,
      nrepeat = 5,
      dist = "centroids.dist",
      cpus = 3
    )
  ),
  
  ## Plotting cross-validation results (for number of features)
  tar_target(
    diablo_tune_plot,
    diablo_plot_tune(diablo_tune_res)
  ),
  
  ## Final DIABLO run
  tar_target(
    diablo_final_run,
    diablo_run(
      diablo_input,
      diablo_design_matrix,
      ncomp = diablo_optim_ncomp,
      keepX = diablo_tune_res$choice.keepX
    )
  ),

# Interpreting the integration results -----------------------------------------

  ## Generating standardised output
  tar_target(
    spls_output,
    get_output(spls_final_run)
  ),
  
  ## Percentage of variance explained
  tar_target(
    spls_plot_variance_explained,
    plot_variance_explained(spls_output)
  ),
  
  ## Samples score matrix plot
  tar_target(
    spls_samples_scores_plot,
    plot_samples_score(
      spls_output,
      mo_data = mo_set_complete,
      colour_upper = "status",
      scale_colour_upper = palette_status,
      shape_upper = "gender",
      colour_lower = "feedlot",
      scale_colour_lower = palette_feedlot
    ) +
      theme(legend.box = "vertical")
  ),
  
  ## Distribution of features weight
  tar_target(
    spls_features_weight_distribution,
    plot_features_weight_distr(spls_output)
  ),
  
  ## Plot of top contributing features
  tar_target(
    spls_top_features_plot,
    plot_top_features(
      spls_output,
      mo_data = mo_set_complete,
      label_cols = list(
        "rnaseq" = "Name",
        "metabolome" = "name"
      )
    )
  ),
  
  ## Table of selected features
  tar_target(
    spls_selected_features,
    get_selected_features(spls_output)
  ),

  ## Generating standardised output
  tar_target(
    so2pls_output,
    get_output(so2pls_final_run)
  ),
  
  ## Percentage of variance explained
  tar_target(
    so2pls_plot_variance_explained,
    plot_variance_explained(so2pls_output, ncol = 1) +
      theme(axis.text.x = element_text(size = 9, angle = 30, hjust = 1))
  ),
  
  ## Samples score matrix plot
  tar_target(
    so2pls_samples_scores_plot,
    plot_samples_score(
      so2pls_output,
      latent_dimensions = "joint component 1",
      mo_data = mo_set_complete,
      colour_upper = "status",
      scale_colour_upper = palette_status,
      shape_upper = "gender"
    )
  ),
  
  ## Distribution of features weight
  tar_target(
    so2pls_features_weight_distribution,
    plot_features_weight_distr(so2pls_output) +
      plot_layout(ncol = 2)
  ),
  
  ## Plot of top contributing features
  tar_target(
    so2pls_top_features_plot,
    plot_top_features(
      so2pls_output,
      mo_data = mo_set_complete,
      label_cols = list(
        "rnaseq" = "Name",
        "metabolome" = "name"
      )
    )
  ),
  
  ## Table of selected features
  tar_target(
    so2pls_selected_features,
    get_selected_features(
      so2pls_output,
      latent_dimensions = "joint component 1"
    )
  ),

  ## Generating standardised output
  tar_target(
    mofa_output,
    get_output(mofa_trained)
  ),
  
  ## Percentage of variance explained
  tar_target(
    mofa_plot_variance_explained,
    plot_variance_explained(mofa_output, ncol = 1) +
      theme(axis.text.x = element_text(size = 9, angle = 30, hjust = 1))
  ),
  
  ## Samples score matrix plot
  tar_target(
    mofa_samples_scores_plot,
    plot_samples_score(
      mofa_output,
      latent_dimensions = paste("Factor", 1:4),
      mo_data = mo_set_complete,
      colour_upper = "status",
      scale_colour_upper = palette_status,
      shape_upper = "gender",
      colour_lower = "geno_comp_cluster",
      scale_colour_lower = palette_geno_comp
    ) +
      theme(legend.box = "vertical")
  ),
  
  ## Distribution of features weight
  tar_target(
    mofa_features_weight_distribution,
    plot_features_weight_distr(
      mofa_output,
      latent_dimensions = paste("Factor", 1:4)
    ) +
      plot_layout(ncol = 1)
  ),
  
  ## Plot of top contributing features
  tar_target(
    mofa_top_features_plot,
    plot_top_features(
      mofa_output,
      mo_data = mo_set_complete,
      label_cols = list(
        "rnaseq" = "Name",
        "metabolome" = "name"
      ),
      latent_dimensions = paste("Factor", 1:2)
    )
  ),
  
  ## Table of top contributing features
  tar_target(
    mofa_top_features,
    get_top_features(
      mofa_output,
      min_importance = 0.8,
      mo_data = mo_set_complete
    )
  ),

  ## Generating standardised output
  tar_target(
    diablo_output,
    get_output(diablo_final_run)
  ),
  
  ## Percentage of variance explained
  tar_target(
    diablo_plot_variance_explained,
    plot_variance_explained(diablo_output, ncol = 2)
  ),
  
  ## Samples score matrix plot
  tar_target(
    diablo_samples_scores_plot,
    plot_samples_score(
      diablo_output,
      mo_data = mo_set_complete,
      colour_upper = "status",
      scale_colour_upper = palette_status,
      shape_upper = "gender",
      colour_lower = "rnaseq_batch",
      scale_colour_lower = palette_rnaseq_batch
    ) +
      theme(legend.box = "vertical")
  ),
  
  ## Distribution of features weight
  tar_target(
    diablo_features_weight_distribution,
    plot_features_weight_distr(
      diablo_output
    ) +
      plot_layout(ncol = 1)
  ),
  
  ## Plot of top contributing features
  tar_target(
    diablo_top_features_plot,
    plot_top_features(
      diablo_output,
      mo_data = mo_set_complete,
      label_cols = list(
        "rnaseq" = "Name",
        "metabolome" = "name"
      ),
      latent_dimensions = paste("Component", 1:2)
    )
  ),
  
  ## Table of top contributing features
  tar_target(
    diablo_selected_features,
    get_selected_features(diablo_output) 
  )
)

13.1 Introduction

Evaluating the results of a multi-omics integration analysis is not an easy task, mainly because when analysing biological data there is no ground truth about the underlying biological mechanisms at play. However, it is possible to gain some measure of confidence in our results by comparing them to what we already know about the biological system studied, or to other analyses (typically single-omics analyses) performed on the dataset. In the latter case, this can also show us what insights are gained by analysing the omics datasets together that would have been missed by looking at each omics dataset separately. Note that in this vignette, we refer to the evaluation of an integration method as comparison with prior knowledge or previous studies, while in Chapter 14 we will talk about the comparison of results obtained with different integration tools.

When evaluating integration results, we can compare the features contribution (see Section 12.1) to some prior information that we have about the features: for example, their score in a differential expression analysis, whether they were found differentially expressed or not, or the group to which they belong (e.g. biological pathway or chemical class). On the other hand, we can assess whether the latent dimensions created separate different sample groups (e.g. control vs treatment, or different time points).

For this vignette, we will use the results from the DIABLO (Chapter 11) and MOFA (Chapter 10) analyses as examples. We will use the dimension reduction output objects that we created in Chapter 12, which contains the results of the DIABLO and MOFA runs stored in a standardised format (Section 12.2):

tar_load(diablo_output)
tar_load(mofa_output)

Note that for the DIABLO run, the samples score are computed for the weighted average of each latent component, rather than for the dataset-specific latent components (this is controlled through the use_average_dimensions argument from the get_output() function).

We will also use the MultiDataSet object that contains information about the samples and features:

tar_load(mo_set_complete)

13.2 Evaluating features prioritisation

First, we can evaluate the integration results in terms of the features prioritisation, i.e. the ranking of the features for each latent dimension in terms of their importance score. We can for example assess whether the features that were given the highest scores for some latent components were also highlighted through single-omics analyses, or compare the importance score of groups of features, for example compounds from different chemical classes or genes from different gene families.

Here, we will compare the DIABLO results to the results of traditional single-omics analyses. In our case, a GWAS and eQTL analysis were performed to detect genomic variants associated with the disease status of the samples. The mode of action of significant markers (i.e. QTL, cis-eQTL or trans-eQTL). The GWAS status (high-scoring or not) of each marker was recorded in the genomics features metadata table:

get_features_metadata(mo_set_complete)[["snps"]] |> 
  pull(qtl_type) |> 
  unique()
#> [1] "non signif." "cis eQTL"    "trans eQTL"  "QTL"

For the transcriptomics and metabolomics dataset, a differential expression (DE) analysis was performed on each of them to compare control and BRD animals. The resulting adjusted p-value, log2-fold change or t-value, status (upregulated, downregulated or not differentially expressed) and significance (differentially expressed or not) of each gene or compound are recorded in each dataset’s features metadata:

get_features_metadata(mo_set_complete)[["rnaseq"]] |> 
  select(feature_id, log_fc:de_status) |> 
  head()
#>                            feature_id      log_fc    log_cpm          f
#> ENSBTAG00000000005 ENSBTAG00000000005  0.13603041  5.9051177  4.1626465
#> ENSBTAG00000000008 ENSBTAG00000000008 -0.12965356 -0.7437052  1.0676845
#> ENSBTAG00000000009 ENSBTAG00000000009  1.27141158 -2.5627934 23.9562951
#> ENSBTAG00000000010 ENSBTAG00000000010  0.39567729  6.2563594 60.5303416
#> ENSBTAG00000000011 ENSBTAG00000000011  0.07766873 -2.7608361  0.1418457
#> ENSBTAG00000000012 ENSBTAG00000000012  0.15756169  3.6628775 11.0448775
#>                         p_value          fdr de_signif de_status
#> ENSBTAG00000000005 4.324808e-02 7.170137e-02    Not DE    Not DE
#> ENSBTAG00000000008 3.032916e-01 3.938643e-01    Not DE    Not DE
#> ENSBTAG00000000009 2.731328e-06 9.665375e-06    Not DE    Not DE
#> ENSBTAG00000000010 1.581955e-12 1.521160e-11    Not DE    Not DE
#> ENSBTAG00000000011 7.070363e-01 7.773874e-01    Not DE    Not DE
#> ENSBTAG00000000012 1.141125e-03 2.623062e-03    Not DE    Not DE

get_features_metadata(mo_set_complete)[["metabolome"]] |>  
  select(feature_id, t_value:de_status) |>
  head()
#>           feature_id     t_value      p_value         padj de_signif
#> HMDB00001  HMDB00001  -0.5557020 5.797635e-01 6.784466e-01    Not DE
#> HMDB00008  HMDB00008   0.2181562 8.276321e-01 8.925444e-01    Not DE
#> HMDB00042  HMDB00042 -12.5323491 1.753101e-24 4.821028e-23        DE
#> HMDB00043  HMDB00043  -7.9073179 7.827088e-13 3.913544e-12        DE
#> HMDB00060  HMDB00060  -0.4369834 6.628164e-01 7.439776e-01    Not DE
#> HMDB00062  HMDB00062   8.2347595 1.271549e-13 6.993518e-13        DE
#>               de_status
#> HMDB00001        Not DE
#> HMDB00008        Not DE
#> HMDB00042 downregulated
#> HMDB00043 downregulated
#> HMDB00060        Not DE
#> HMDB00062   upregulated

13.2.1 Table of counts of selected genes against a grouping

As DIABLO performs feature selection, it is of interest to assess how many (e)QTLs and DE genes and compounds were retained for each latent component. We can use the evaluate_feature_selection_table() function to generate a table of counts comparing the features selected for each latent component to a features grouping (in our case, the results of the single-omics analyses). The features grouping is obtained from the features metadata stored in the MultiDataSet object. The function takes as input the integration result object, the MultiDataSet object and a named list giving for each dataset the name of the column from their features metadata table containing the features grouping:

diablo_evaluation_counts_table <- evaluate_feature_selection_table(
  diablo_output,
  mo_data = mo_set_complete,
  col_names = list(
    "snps" = "qtl_type",
    "rnaseq" = "de_signif",
    "metabolome" = "de_signif"
  )
)

diablo_evaluation_counts_table |> 
  filter(latent_dimension == "Component 1")
#> # A tibble: 8 × 6
#>   method latent_dimension dataset    feature_label selected not_selected
#>   <chr>  <fct>            <fct>      <chr>            <int>        <int>
#> 1 DIABLO Component 1      snps       cis eQTL             1            2
#> 2 DIABLO Component 1      snps       non signif.         17          975
#> 3 DIABLO Component 1      snps       QTL                  1            0
#> 4 DIABLO Component 1      snps       trans eQTL           1            3
#> 5 DIABLO Component 1      rnaseq     DE                  23           52
#> 6 DIABLO Component 1      rnaseq     Not DE               7          912
#> 7 DIABLO Component 1      metabolome DE                  10           20
#> 8 DIABLO Component 1      metabolome Not DE               0           25

diablo_evaluation_counts_table |> 
  filter(latent_dimension == "Component 2")
#> # A tibble: 8 × 6
#>   method latent_dimension dataset    feature_label selected not_selected
#>   <chr>  <fct>            <fct>      <chr>            <int>        <int>
#> 1 DIABLO Component 2      snps       cis eQTL             0            3
#> 2 DIABLO Component 2      snps       non signif.         25          967
#> 3 DIABLO Component 2      snps       QTL                  0            1
#> 4 DIABLO Component 2      snps       trans eQTL           0            4
#> 5 DIABLO Component 2      rnaseq     DE                   0           75
#> 6 DIABLO Component 2      rnaseq     Not DE              15          904
#> 7 DIABLO Component 2      metabolome DE                   6           24
#> 8 DIABLO Component 2      metabolome Not DE               4           21

We can see that for the first latent component, only three out of the eight (e)QTLs were selected ¹; two-third of the genes selected were found differentially expressed; and all ten of the selected compounds were found differentially expressed. Conversely, for the second latent component, none of the selected markers were QTLs, and none of the selected genes were differentially expressed. This makes sense, as the DIABLO analysis was run with the goal of separating the control and infected animals; therefore it makes sense that the features selected for the first latent component, which is the one best able to separate the groups, are the features detected as differentially expressed. On the contrary, the second latent component is constructed to be orthogonal to the first one, therefore the features selected will likely not be differentially expressed.

With this function, we can focus on only some datasets, by changing which datasets are present in the list passed to col_names. Similarly, we can select specific latent dimensions through the latent_dimensions parameter. For example, let us count the number of DE genes selected with the first latent component:

evaluate_feature_selection_table(
  diablo_output,
  mo_data = mo_set_complete,
  col_names = list("rnaseq" = "de_signif"),
  latent_dimensions = "Component 1"
)
#> # A tibble: 2 × 6
#>   method latent_dimension dataset feature_label selected not_selected
#>   <chr>  <fct>            <fct>   <chr>            <int>        <int>
#> 1 DIABLO Component 1      rnaseq  DE                  23           52
#> 2 DIABLO Component 1      rnaseq  Not DE               7          912

Note that this function is useful for integration methods that perform feature selection, such as DIABLO or sO2PLS, but will be irrelevant for methods that do not perform feature selection (for example MOFA tends to assign a non-null weight to most features).

13.2.2 Plotting features weight against a covariate

We can go further and compare the weights or importance scores given to the features for each latent component to the outcome of the single-omics results. This can be done with the plot_features_weight_covariate() function. As its name suggests, this function displays the features weight for each latent dimension against some covariate obtained from the features metadata tables. Again, we will look at the results of the GWAS/QTL and DE analyses and see whether (e)QTLs, DE genes and compounds were assigned higher weights is some of the latent components constructed with DIABLO. The input parameters for this function are similar to those for the evaluate_feature_selection_table() function: we need to pass the results of the integration method, the MultiDataSet object, and the named list giving the column in each features metadata table containing information about the features grouping. The latter is passed to the covariate argument of the function. If some datasets are not present in this list, they won’t be present in the plot.

plot_features_weight_covariate(
  diablo_output, 
  mo_data = mo_set_complete, 
  covariate = list(
    "snps" = "qtl_type", 
    "rnaseq" = "de_status", 
    "metabolome" = "de_status"
  )
)
#> Warning: Groups with fewer than two data points have been dropped.
#> Groups with fewer than two data points have been dropped.
#> Groups with fewer than two data points have been dropped.
#> Groups with fewer than two data points have been dropped.
#> Groups with fewer than two data points have been dropped.
#> Groups with fewer than two data points have been dropped.
#> Groups with fewer than two data points have been dropped.
#> Groups with fewer than two data points have been dropped.

By default, the function plots on the y-axis the signed importance score of the features, that is, their importance score to which the sign of their weight was added. This can be controlled via the features_metric argument. We see that most of the points have an importance score of 0, because most of the features were not selected. We can focus on only those features with a non-null weight with the remove_null_weight argument:

plot_features_weight_covariate(
  diablo_output, 
  mo_data = mo_set_complete, 
  covariate = list(
    "snps" = "qtl_type", 
    "rnaseq" = "de_status", 
    "metabolome" = "de_status"
  ),
  remove_null_weight = TRUE
)
#> Warning: Groups with fewer than two data points have been dropped.
#> Groups with fewer than two data points have been dropped.
#> Groups with fewer than two data points have been dropped.
#> Groups with fewer than two data points have been dropped.
#> Groups with fewer than two data points have been dropped.

Here, we see that for the first latent component, selected genes that were also found up-regulated were assigned on average a higher importance score that selected genes that were found non DE. Also, for the metabolomics dataset, selected compounds that were found up-regulated were assigned a positive weight while those found down-regulated were assigned a negative weight.

Note that with the plot_features_weight_covariate() function, we can also plot the features against a continuous covariate, e.g. the log2-fold change obtained with the differential expressions:

plot_features_weight_covariate(
  diablo_output, 
  mo_data = mo_set_complete, 
  covariate = list(
    "rnaseq" = "log_fc", 
    "metabolome" = "t_value"
  )
)
#> Warning: Removed 508 rows containing non-finite values (`stat_smooth()`).
#> Warning: Removed 508 rows containing missing values (`geom_point()`).

We can see a nice trend between compounds’ t-value and their signed importance score in the differential expression analysis for the first latent component.

We can further change the colour or shape of the points according to some information from the features metadata (information must be passed as a named list as for the covariate).

plot_features_weight_covariate(
  diablo_output, 
  mo_data = mo_set_complete, 
  covariate = list(
    "rnaseq" = "log_fc", 
    "metabolome" = "t_value"
  ),
  colour_by = list(
    "rnaseq" = "fdr",
    "metabolome" = "padj"
  )
) +
  scale_colour_viridis_c(aesthetics = c("colour", "fill"))
#> Warning: Removed 508 rows containing non-finite values (`stat_smooth()`).
#> Warning: Removed 508 rows containing missing values (`geom_point()`).

Note that for continuous covariates, as with the function plot_samples_score_covariate(), a loess curve is fit to summarise the trend between the covariates and the features weight. If colour_by is used, and the corresponding variables are categorical, a different loess curve is fitted for each category. If instead the colour_by variables are numeric, a single loess curve will be plotted for each latent dimension and dataset.

13.3 Features set enrichment

In addition to looking at the distribution of features importance score or counting the number of selected features, we can also perform an enrichment to assess whether each of the latent dimensions constructed is enriched for some features set of interest. Features sets are groups of features that belong in a similar category: e.g. all features involved in a same biological pathway, or with the same GO term annotation, or even all up- or down-regulated features. They are built using prior knowledge, for example information extracted from databases, or constructed from the results of previous analyses. Importantly, a feature can belong to more than one set (for example a compound can be involved in several biological pathways).

13.3.1 Constructing feature sets

With the moiraine package, there are two ways to obtain feature sets to perform enrichment or for plots. The first one is to extract information from the features metadata tables stored in our MultiDataSet object; which is done with the make_feature_sets_from_fm() function. Note that if some features grouping is stored in the features metadata, it means that features can only belong to one set. For example, we will use the results of the single-omics analyses to group the features into sets: QTLs, cis-eQTLs, trans-eQTLs and non significant genomics markers, and up-regulated, down-regulated or non DE genes and compounds. Again, the function accepts a named list giving for each dataset the name of the column in the corresponding features metadata table to use to generate the groups, passed to the col_names argument of the function. It returns a named list where each element is a feature set, and contains the ID of the features in the set:

tar_target(
  sets_single_omics,
  make_feature_sets_from_fm(
    mo_set_complete,
    col_names = list(
      "snps" = "qtl_type", 
      "rnaseq" = "de_status", 
      "metabolome" = "de_status"
    )
  )
)

tar_read(sets_single_omics) |> 
  str()
#> List of 10
#>  $ non signif.               : chr [1:23000] "1_41768691" "10-27008241-A-C-rs42918694" "10-37505419-T-C-rs136559242" "10-49904259-G-A-rs471723345" ...
#>  $ cis eQTL                  : chr [1:31] "ARS-BFGL-BAC-13210" "ARS-BFGL-NGS-113310" "ARS-BFGL-NGS-5022" "ARS-BFGL-NGS-59463" ...
#>  $ trans eQTL                : chr [1:4] "ARS-BFGL-NGS-80280" "BovineHD0100032240" "BovineHD2300010006" "Hapmap55381-rs29025399"
#>  $ QTL                       : chr "BovineHD1800016801"
#>  $ Not DE - rnaseq           : chr [1:20224] "ENSBTAG00000000005" "ENSBTAG00000000008" "ENSBTAG00000000009" "ENSBTAG00000000010" ...
#>  $ upregulated - rnaseq      : chr [1:102] "ENSBTAG00000000377" "ENSBTAG00000000783" "ENSBTAG00000001051" "ENSBTAG00000001785" ...
#>  $ downregulated - rnaseq    : chr [1:9] "ENSBTAG00000001032" "ENSBTAG00000004824" "ENSBTAG00000011990" "ENSBTAG00000012403" ...
#>  $ Not DE - metabolome       : chr [1:25] "HMDB00001" "HMDB00008" "HMDB00060" "HMDB00097" ...
#>  $ downregulated - metabolome: chr [1:20] "HMDB00042" "HMDB00043" "HMDB00064" "HMDB00094" ...
#>  $ upregulated - metabolome  : chr [1:10] "HMDB00062" "HMDB00092" "HMDB00159" "HMDB00169" ...

By default, the function keeps the different omics features separate: here for example, there is a “Not DE” set for the transcriptomics dataset, and a “Not DE” set for the metabolomics dataset. When there is no ambiguity as to which omics dataset they represent, the name of the dataset is not included in the set’s name. It is possible to merge feature sets with the same name between the omics datasets, by setting the combine_omics_sets argument to TRUE:

tar_target(
  sets_single_omics_merged,
  make_feature_sets_from_fm(
    mo_set_complete,
    col_names = list(
      "snps" = "qtl_type", 
      "rnaseq" = "de_status", 
      "metabolome" = "de_status"
    ),
    combine_omics_sets = TRUE
  )
)

tar_read(sets_single_omics_merged) |> 
  str()
#> List of 7
#>  $ non signif.  : chr [1:23000] "1_41768691" "10-27008241-A-C-rs42918694" "10-37505419-T-C-rs136559242" "10-49904259-G-A-rs471723345" ...
#>  $ cis eQTL     : chr [1:31] "ARS-BFGL-BAC-13210" "ARS-BFGL-NGS-113310" "ARS-BFGL-NGS-5022" "ARS-BFGL-NGS-59463" ...
#>  $ trans eQTL   : chr [1:4] "ARS-BFGL-NGS-80280" "BovineHD0100032240" "BovineHD2300010006" "Hapmap55381-rs29025399"
#>  $ QTL          : chr "BovineHD1800016801"
#>  $ Not DE       : chr [1:20249] "ENSBTAG00000000005" "ENSBTAG00000000008" "ENSBTAG00000000009" "ENSBTAG00000000010" ...
#>  $ upregulated  : chr [1:112] "ENSBTAG00000000377" "ENSBTAG00000000783" "ENSBTAG00000001051" "ENSBTAG00000001785" ...
#>  $ downregulated: chr [1:29] "ENSBTAG00000001032" "ENSBTAG00000004824" "ENSBTAG00000011990" "ENSBTAG00000012403" ...

Now the not DE set contains all genes and compounds ID that have not been found differentially expressed.

Alternatively, feature sets can be constructed from an input data-frame. This is useful when looking at pathways or GO annotations, for which features can belong to several sets at once, as this information cannot be stored in a features metadata table. Here, we will use the GO terms assigned to the genes. Note that in this example, the feature sets will contain features from only one omics dataset, but it doesn’t have to be; instead the sets could contain features from different omics datasets.

We start by reading in the data-frame of GO terms. We will focus on GO terms relating to biological processes for this example:

list(
  tar_target(
    rnaseq_go_terms_file,
    system.file(
      "extdata/transcriptomics_go_annotation.csv", 
      package = "moiraine"
    ),
    format = "file"
  ),
  
  tar_target(
    rnaseq_go_df,
    read_csv(rnaseq_go_terms_file) |> 
      filter(go_domain == "Biological process")
  )
)

tar_load(rnaseq_go_df)

dim(rnaseq_go_df)
#> [1] 89226     4
head(rnaseq_go_df)
#> # A tibble: 6 × 4
#>   gene_id            go_id      go_name                                go_domain
#>   <chr>              <chr>      <chr>                                  <chr>    
#> 1 ENSBTAG00000000005 GO:0006468 protein phosphorylation                Biologic…
#> 2 ENSBTAG00000000005 GO:0007165 signal transduction                    Biologic…
#> 3 ENSBTAG00000000005 GO:0031623 receptor internalization               Biologic…
#> 4 ENSBTAG00000000008 GO:0006813 potassium ion transport                Biologic…
#> 5 ENSBTAG00000000009 GO:0006355 regulation of DNA-templated transcrip… Biologic…
#> 6 ENSBTAG00000000009 GO:0006357 regulation of transcription by RNA po… Biologic…

The data-frame contains one row per gene (gene_id) / GO term (go_id) pair. In addition, the name of each GO term and the category (biological process, molecular function or cellular component) are indicated in the go_name and go_domain columns, respectively. When reading in the file, we’ve restricted the GO terms to only those that correspond to biological processes.

We can turn this data-frame into a list of feature sets with the make_feature_sets_from_df() function. The function takes as input the data-frame, as well as the name of the column in the data-frame that contains the features ID (col_id argument) and the name of the column in the data-frame that contains the feature sets ID or name (col_set argument):

tar_target(
  go_sets,
  make_feature_sets_from_df(
    rnaseq_go_df,
    col_id = "gene_id",
    col_set = "go_id"
  )
)

tar_load(go_sets)

length(go_sets)
#> [1] 10745
head(go_sets) |> 
  str()
#> List of 6
#>  $ GO:0006468: chr [1:580] "ENSBTAG00000000005" "ENSBTAG00000000013" "ENSBTAG00000000056" "ENSBTAG00000000184" ...
#>  $ GO:0007165: chr [1:737] "ENSBTAG00000000005" "ENSBTAG00000000073" "ENSBTAG00000000081" "ENSBTAG00000000105" ...
#>  $ GO:0031623: chr [1:31] "ENSBTAG00000000005" "ENSBTAG00000000308" "ENSBTAG00000001050" "ENSBTAG00000001139" ...
#>  $ GO:0006813: chr [1:96] "ENSBTAG00000000008" "ENSBTAG00000000082" "ENSBTAG00000000745" "ENSBTAG00000000973" ...
#>  $ GO:0006355: chr [1:915] "ENSBTAG00000000009" "ENSBTAG00000000040" "ENSBTAG00000000052" "ENSBTAG00000000074" ...
#>  $ GO:0006357: chr [1:627] "ENSBTAG00000000009" "ENSBTAG00000000039" "ENSBTAG00000000052" "ENSBTAG00000000154" ...

13.3.2 Filtering feature sets against the datasets

Before performing any enrichment, we want to make sure that the only features in the sets are the ones that were measured in the datasets. This is part of ensuring that an appropriate background or reference features set is used when performing enrichment (see for example Timmons, Szkop, and Gallagher (2015)). To this end, the reduce_feature_sets_data() function filters a list of feature sets so that only features present in a given MultiDataSet object are kept in the sets; it then removes empty feature sets. Note that for this filtering, we are using the MultiDataSet object that contains the full omics datasets, before pre-filtering:

tar_target(
  go_sets_filtered,
  reduce_feature_sets_data(go_sets, mo_set_complete)
)

The function yields the following message:

#> All features in sets are in the multi-omics dataset.

tar_load(go_sets_filtered)

13.3.3 Checking the number of features assigned to sets

We can then check the number of features that are assigned to at least one set with the check_feature_sets() function, to get an idea of the coverage of our annotation. We will restrict the check to the transcriptomics dataset, since the GO annotation only covers genes:

sets_check <- check_feature_sets(
  go_sets_filtered, 
  mo_set_complete,
  datasets = "rnaseq"
)
#> rnaseq dataset: 14,364 of 20,335 (70.6%) features assigned to at least one set.

sets_check
#> # A tibble: 1 × 5
#>   dataset n_annotated     n frac_annotated message                              
#>   <fct>         <int> <int>          <dbl> <chr>                                
#> 1 rnaseq        14364 20335          0.706 rnaseq dataset: 14,364 of 20,335 (70…

The function returns a tibble giving the name of the datasets, the number and fraction of features from the dataset that are present in at least one feature set (n_annotated and frac_annotated columns), the total number of features in the corresponding dataset (n column), as well as sentence summarising this information (message column) that is useful for reporting. In this example, 70.6% of the genes from the transcriptomics dataset are assigned to at least one GO term. This is important to check as the annotation coverage will impact the quality of the enrichment results that we get.

13.3.4 Enrichment of latent dimensions

We are now ready to perform an enrichment of the latent dimensions. There are many ways to perform functional enrichment (e.g. see Zhao and Rhee (2023)): based on over-representation, feature ranking, or even topology when the feature sets are representing biological pathways. Here, we are not trying to provide an exhaustive list of options; however it is relatively straightforward to extract from an integration method output the features importance or list of selected features for a specific latent dimension, and use it for enrichment with the package of your choice.

In moiraine, we use the gage package (Luo et al. 2009) to perform the enrichment of each latent dimension against a list of feature sets. It uses a two-sample t-test to assess whether features in a set have higher importance scores than features not in the set. The evaluate_method_enrichment() function takes as input the results from an integration method in a standardised format (here our diablo_output object), and performs an enrichment for each latent dimension using the features sets provided as a list. We can focus on specific datasets or latent dimensions by passing their name to the datasets or latent_dimensions arguments, respectively. There are a number of parameters that need to be set:

use_abs: by default, the function uses the absolute value of the importance score to perform the enrichment, which allows it to detect features sets in which features are given strong weights that are both positive and negative. If we instead want to search for features sets in which the weight of the features are coordinated (i.e. either all positive or negative), we can set the use_abs argument to FALSE.
min_set_size: the minimum number of features in a set needed for the set to be considered for enrichment. The default value is 5, i.e. sets with less than 5 features will not be tested for enrichment.
add_missing_features and mo_data: when performing multi-omics integration, it is highly likely that the omics datasets have been pre-filtered prior to the analysis, which means that some features will not have received a weight in the results of the integration analysis. This can bias the enrichment results, as these are based on only features that have a weight. So if a features set contains 50 features, but 40 of these were discarded during the pre-filtering step, only the remaining 10 features will be considered when performing an enrichment. This might lead to finding a latent dimension enriched for this set, even though the set might not be biologically relevant since 80% of its features did not even pass the pre-filtering step. To account for that, it is possible to add to the integration results all of the features in a MultiDataSet object that are not present in the results. These features will be assigned a weight of 0 to emphasise that they were not found to play any role for the latent dimensions. This is done by setting add_missing_features to TRUE, and passing to mo_data a MultiDataSet object that contains the omics datasets that have not been filtered. This way, the background set used in the enrichment will be all features measured in the omics datasets, and not only the ones that passed the pre-filtering step.
sets_info_df and col_set: the feature sets are passed as a named list, therefore the only information that we have about these sets are their names as found in the list. To facilitate the interpretation of the enrichment results, it is possible to pass a data-frame giving information about the sets, through the sets_info_df argument. This information will be added to the enrichment results table. In that case, the col_set argument must also be specified: it takes the name of the column in the data-frame passed to sets_info_df that contains the sets ID or named as seen in the list. Here we will create this data-frame as follows:

tar_target(
  go_sets_info,
  rnaseq_go_df |>
    dplyr::select(go_id, go_name) |>
    dplyr::distinct()
)

tar_read(go_sets_info) |> 
  head()
#> # A tibble: 6 × 2
#>   go_id      go_name                                         
#>   <chr>      <chr>                                           
#> 1 GO:0006468 protein phosphorylation                         
#> 2 GO:0007165 signal transduction                             
#> 3 GO:0031623 receptor internalization                        
#> 4 GO:0006813 potassium ion transport                         
#> 5 GO:0006355 regulation of DNA-templated transcription       
#> 6 GO:0006357 regulation of transcription by RNA polymerase II

We can run the enrichment analysis:

tar_target(
  diablo_enrichment_results,
  evaluate_method_enrichment(
    diablo_output,
    go_sets_filtered,
    datasets = "rnaseq",
    use_abs = TRUE,
    min_set_size = 10,
    add_missing_features = TRUE,
    mo_data = mo_set_complete,
    sets_info_df = go_sets_info,
    col_set = "go_id"
  )
)

The function returns a table giving for each latent component (latent_dimension) and feature set (set_id) the enrichment statistics (stat_mean), p-value (pvalue), adjusted p-value (adj_pvalue) as well as the number of features in the set that are present in the integration results (set_size). The table is arranged by increasing adjusted p-value, such that the significant results are at the top of the table. It is important to note that the multiple testing correction applied to the p-values occurs independently for each latent dimension, but there is no multiple testing correction performed across the latent dimensions. This can easily be done by the user, and should be considered when reporting the results.

tar_read(diablo_enrichment_results) |> 
  head()
#> # A tibble: 6 × 7
#>   latent_dimension set_id     stat_mean pvalue adj_pvalue set_size go_name      
#>   <fct>            <chr>          <dbl>  <dbl>      <dbl>    <dbl> <chr>        
#> 1 Component 2      GO:0006935     1.39  0.0846      0.601       57 chemotaxis   
#> 2 Component 2      GO:0042274     1.37  0.0889      0.601       54 ribosomal sm…
#> 3 Component 2      GO:0050727     1.27  0.104       0.601       56 regulation o…
#> 4 Component 2      GO:0006364     0.977 0.166       0.601       77 rRNA process…
#> 5 Component 2      GO:0030890     0.979 0.168       0.601       30 positive reg…
#> 6 Component 2      GO:0043029     0.983 0.168       0.601       25 T cell homeo…

In that case, unsurprisingly, neither of the latent components were enriched for any GO term. This makes sense since DIABLO performed a strong feature selection, so very few genes had non-null weights. We can repeat the enrichment with the MOFA integration results to show more interesting results. We will run the enrichment only for the first three factors for conciseness:

tar_target(
  mofa_enrichment_results,
  evaluate_method_enrichment(
    mofa_output,
    go_sets_filtered,
    datasets = "rnaseq",
    latent_dimensions = paste("Factor", 1:3),
    use_abs = TRUE,
    min_set_size = 10,
    add_missing_features = TRUE,
    mo_data = mo_set_complete,
    sets_info_df = go_sets_info,
    col_set = "go_id"
  )
)

This time, there are a couple of small p-values (even though the adjusted p-values are all quite large):

tar_load(mofa_enrichment_results)

mofa_enrichment_results |> head()
#> # A tibble: 6 × 7
#>   latent_dimension set_id     stat_mean   pvalue adj_pvalue set_size go_name    
#>   <fct>            <chr>          <dbl>    <dbl>      <dbl>    <dbl> <chr>      
#> 1 Factor 3         GO:0031424      4.13 0.000275      0.549       20 keratiniza…
#> 2 Factor 3         GO:0045109      3.60 0.000713      0.711       25 intermedia…
#> 3 Factor 3         GO:0030855      2.56 0.00651       0.764       56 epithelial…
#> 4 Factor 3         GO:0031069      2.07 0.0244        0.764       25 hair folli…
#> 5 Factor 3         GO:0030216      1.90 0.0317        0.764       49 keratinocy…
#> 6 Factor 3         GO:0001942      1.82 0.0373        0.764       38 hair folli…

We can see for example that MOFA factor 3 seems enriched in genes associated with the keratinization GO term (GO:0031424). For factor 1 (the factor separating the healthy and infected animals), the keratinization term also yields the smallest p-value, together with several GO terms related to inflammation (even though after correction for multiple testing these p-values are no longer significant):

mofa_enrichment_results |>
  filter(latent_dimension == "Factor 1") |> 
  head()
#> # A tibble: 6 × 7
#>   latent_dimension set_id     stat_mean  pvalue adj_pvalue set_size go_name     
#>   <fct>            <chr>          <dbl>   <dbl>      <dbl>    <dbl> <chr>       
#> 1 Factor 1         GO:0031424      2.70 0.00516      0.786       20 keratinizat…
#> 2 Factor 1         GO:0045109      2.43 0.00948      0.786       25 intermediat…
#> 3 Factor 1         GO:0010951      2.15 0.0208       0.786       25 negative re…
#> 4 Factor 1         GO:0006954      1.86 0.0318       0.786      165 inflammator…
#> 5 Factor 1         GO:0050727      1.75 0.0421       0.786       56 regulation …
#> 6 Factor 1         GO:0007166      1.62 0.0533       0.786      138 cell surfac…

13.3.5 Plotting features weight for feature sets

We can dig into the results of the enrichment by visualising the distribution of features weight or importance score between features that are in a specific features set vs features that are not in the set. For example, to continue with the MOFA example, let’s look at the importance score of genes associated with the GO term GO:0031424. We’ll only look at factors 1 and 3 for this example:

plot_features_weight_set(
  mofa_output,
  go_sets_filtered[["GO:0031424"]],
  set_name = "GO:0031424 (keratinization)",
  features_metric = "importance",
  datasets = "rnaseq",
  latent_dimensions = paste("Factor", c(1, 3)),
  point_alpha = 0.2
)

In the resulting plot, we can see for each dataset (in this case, we’re only looking at the transcriptomics dataset since we have gene sets) and latent dimensions (rows of the plot) the distribution of features importance for genes in the set vs not in the set. We can see that the GO term investigated contains 20 genes, and in total, 994 genes have been assigned a weight with MOFA (because we have performed some pre-filtering on the transcriptomics dataset prior to running MOFA). Therefore, out of all the genes in the set, 9 of them have no weight (they did not pass the pre-filtering step). We can add these features that did not make it into the MOFA input data in the same way as we did for the enrichment function, namely by setting the add_missing_features argument to TRUE and passing the non-filtering MultiDataSet object through mo_data:

plot_features_weight_set(
  mofa_output,
  go_sets_filtered[["GO:0031424"]],
  set_name = "GO:0031424 (keratinization)",
  features_metric = "importance",
  datasets = "rnaseq",
  latent_dimensions = paste("Factor", c(1, 3)),
  point_alpha = 0.2,
  add_missing_features = TRUE,
  mo_data = mo_set_complete
)

This is the distribution of values that were used for the enrichment test, but it’s not very useful to look at since the vast majority of the genes have a weight of zero.

13.4 Assessing samples clustering

Another aspect that we can evaluate is whether the integration method was able to separate different groups of samples, for example to identify differences between control and treatment samples. This is done by comparing the coordinates of the samples in the space spanned by the latent dimensions created (i.e. the samples score) to a samples label that is obtained from the samples metadata. We can use for that the silhouette score, which is a metric that quantifies how well-defined different clusters are in a given space, i.e. how well-grouped points from a same cluster (here samples group) are, and how separate they are from points from other clusters. First, a silhouette score (or silhouette width) is computed for each individual point (here sample). Then, these values are averaged over all points in a given cluster to yield a cluster silhouette score. The resulting values, both at the point- and cluster- level, can range from -1 to 1. Negative values indicate that the point is grouped with points from another cluster, while positive values indicate that the point is grouped with other points from its cluster. Values close to 0 show that the clustering is ambiguous, while scores close to 1 are indicative of well-defined clusters.

The compute_samples_silhouette() function relies on the cluster::silhouette() function from the cluster package to compute silhouette score. It takes as input the integration results in a standardised format, as well as the MultiDataSet object (to access the samples metadata) and the name of the column in the samples metadata table that contains the samples grouping information. In addition, it is possible to specify the distance metric that should be used to compute distances between the samples (through the distance_metric argument). By default, the function uses euclidean distances, but other options are available.

Here, we will check how well DIABLO managed to separate the bruising groups:

tar_target(
  diablo_silhouette,
  compute_samples_silhouette(
    diablo_output,
    mo_set_complete,
    "status"
  )
)

tar_load(diablo_silhouette)

diablo_silhouette
#> $samples_silhouette
#> # A tibble: 135 × 4
#>    sample_id group   neighbour_group silhouette_width
#>    <chr>     <fct>   <fct>                      <dbl>
#>  1 G1979     BRD     Control                    0.334
#>  2 G2500     BRD     Control                    0.255
#>  3 G3030     BRD     Control                    0.328
#>  4 G3068     BRD     Control                    0.370
#>  5 G3121     BRD     Control                    0.423
#>  6 G3315     Control BRD                        0.529
#>  7 G3473     Control BRD                        0.355
#>  8 G3474     Control BRD                        0.477
#>  9 G3550     BRD     Control                    0.362
#> 10 G3594     Control BRD                        0.567
#> # ℹ 125 more rows
#> 
#> $groups_average_silhouette
#> # A tibble: 2 × 2
#>   group   group_average_width
#>   <fct>             <dbl[1d]>
#> 1 BRD                   0.309
#> 2 Control               0.455

The function returns a list with two elements: a tibble of samples silhouette widths (samples_silhouette), and a tibble of groups average silhouette (groups_average_silhouette). In the first table, the group column indicates for each sample the group to which it is assigned (based on the samples metadata), and the neighbour_group column indicates the closest other group in the latent dimensions space (which is not very useful in this case since we only have two groups). The samples silhouette width is shown in the silhouette_width column. This is very useful to identify “outlier” samples that were not grouped as expected:

diablo_silhouette$samples_silhouette |> 
  filter(silhouette_width < 0)
#> # A tibble: 1 × 4
#>   sample_id group neighbour_group silhouette_width
#>   <chr>     <fct> <fct>                      <dbl>
#> 1 P4735     BRD   Control                   -0.217

Here, there is one sample from an infected animal that clusters with the control samples in the latent dimensions space.

The average silhouette score for the two disease status groups are both positive, although not close to 1, indicating that DIABLO did a reasonable job of identifying differences between the two groups:

diablo_silhouette$groups_average_silhouette
#> # A tibble: 2 × 2
#>   group   group_average_width
#>   <fct>             <dbl[1d]>
#> 1 BRD                   0.309
#> 2 Control               0.455

This is expected, since DIABLO is a supervised integration method which tries to maximise the separation between groups of samples.

We can repeat the operation for the MOFA results. By default, the compute_samples_silhouette() function uses all latent dimensions to compute distances between samples, but it is possible to specify which latent dimensions should be considered through the latent_dimensions argument. For example, we will see how the first three MOFA factors separate the disease status groups:

tar_target(
  mofa_silhouette,
  compute_samples_silhouette(
    mofa_output,
    mo_set_complete,
    "status",
    latent_dimensions = paste("Factor", 1:3)
  )
)

tar_load(mofa_silhouette)

mofa_silhouette$groups_average_silhouette
#> # A tibble: 2 × 2
#>   group   group_average_width
#>   <fct>             <dbl[1d]>
#> 1 BRD                   0.257
#> 2 Control               0.524

Both groups’ silhouette scores are positive, although the BRD group silhouette is not very high. Also, more samples have a silhouette width below 0.

mofa_silhouette$samples_silhouette |> 
  filter(silhouette_width < 0)
#> # A tibble: 11 × 4
#>    sample_id group neighbour_group silhouette_width
#>    <chr>     <fct> <fct>                      <dbl>
#>  1 O3720     BRD   Control                 -0.430  
#>  2 O4492     BRD   Control                 -0.246  
#>  3 O4919     BRD   Control                 -0.440  
#>  4 O5003     BRD   Control                 -0.243  
#>  5 P4666     BRD   Control                 -0.0994 
#>  6 P4735     BRD   Control                 -0.429  
#>  7 R50       BRD   Control                 -0.168  
#>  8 R5979     BRD   Control                 -0.00625
#>  9 R9887     BRD   Control                 -0.0659 
#> 10 Y3059     BRD   Control                 -0.00504
#> 11 Y9745     BRD   Control                 -0.0358

Note that with MOFA, since the different factors represent different trends within the data, it is possible that using more factors will yield smaller average silhouette groups. For example if we used all constructed factors:

compute_samples_silhouette(
    mofa_output,
    mo_set_complete,
    "status"
)$groups_average_silhouette
#> # A tibble: 2 × 2
#>   group   group_average_width
#>   <fct>             <dbl[1d]>
#> 1 BRD                  0.0658
#> 2 Control              0.272

In such cases, it makes more sense to evaluate the ability of one or two latent dimensions to separate the samples group.

13.5 Recap – targets list

For convenience, here is the list of targets that we created in this section:

Targets list for evaluating the output of an integration method:

list(
  ## Evaluating DIABLO selected features against single-omics results
  tar_target(
    diablo_selected_vs_singleomics_table,
    evaluate_feature_selection_table(
      diablo_output,
      mo_data = mo_set_complete,
      col_names = list(
        "snps" = "qtl_type",
        "rnaseq" = "de_signif",
        "metabolome" = "de_signif"
      )
    )
  ),
  
  ## Plotting DIABLO features weight against single-omics results
  tar_target(
    diablo_features_weight_vs_singleomics_plot,
    plot_features_weight_covariate(
      diablo_output, 
      mo_data = mo_set_complete, 
      covariate = list(
        "snps" = "qtl_type", 
        "rnaseq" = "de_status", 
        "metabolome" = "de_status"
      ),
      remove_null_weight = TRUE
    )
  ),
  
  ## Genes GO annotation file 
  tar_target(
    rnaseq_go_terms_file,
    system.file(
      "extdata/transcriptomics_go_annotation.csv", 
      package = "moiraine"
    ),
    format = "file"
  ),
  
  ## Genes GO annotation data-frame 
  tar_target(
    rnaseq_go_df,
    read_csv(rnaseq_go_terms_file) |> 
      filter(go_domain == "Biological process")
  ),
  
  ## GO term sets
  tar_target(
    go_sets,
    make_feature_sets_from_df(
      rnaseq_go_df,
      col_id = "gene_id",
      col_set = "go_id"
    )
  ),
  
  ## Filtering GO term sets against measured features
  tar_target(
    go_sets_filtered,
    reduce_feature_sets_data(go_sets, mo_set_complete)
  ),
  
  ## Checking genes GO term sets against datasets
  tar_target(
    go_sets_check,
    check_feature_sets(
      go_sets_filtered, 
      mo_set_complete,
      datasets = "rnaseq"
    )
  ),
  
  ## Table of information about GO terms
  tar_target(
    go_sets_info,
    rnaseq_go_df |>
      dplyr::select(go_id, go_name) |>
      dplyr::distinct()
  ),
  
  ## MOFA latent components enrichment analysis
  tar_target(
    mofa_enrichment_results,
    evaluate_method_enrichment(
      mofa_output,
      go_sets_filtered,
      datasets = "rnaseq",
      latent_dimensions = paste("Factor", 1:3),
      use_abs = TRUE,
      min_set_size = 10,
      add_missing_features = TRUE,
      mo_data = mo_set_complete,
      sets_info_df = go_sets_info,
      col_set = "go_id"
    )
  ),
  
  ## Plotting features weight for GO term 'GO:0031424'
  tar_target(
    mofa_enrichment_go0031424_plot,
    plot_features_weight_set(
      mofa_output,
      go_sets_filtered[["GO:0031424"]],
      set_name = "GO:0031424 (keratinization)",
      features_metric = "importance",
      datasets = "rnaseq",
      latent_dimensions = paste("Factor", c(1, 3)),
      point_alpha = 0.2
    )
  ),
  
  ## Assessing DIABLO samples clustering
  tar_target(
    diablo_silhouette,
    compute_samples_silhouette(
      diablo_output,
      mo_set_complete,
      "status"
    )
  )
)

Note that in the full genomics dataset, 36 markers were detected as QTLs or eQTLs, however only eight of them were retained in the features preselection step ( Section 7.3.2). For a real analysis, all 36 markers would have also been included in the prefiltered dataset even if they did not pass the preselection.↩︎