6 Data pre-processing

library(targets)
library(moiraine)
library(MultiDataSet)

## For PCA results
library(pcaMethods)

## For working with lists
library(purrr)
library(rlang)
## For custom colour palettes
library(ggplot2)
library(RColorBrewer)

## For displaying documentation
library(DESeq2)
library(S4Vectors)
library(stats4)

Once each omics dataset has been imported into R with associated metadata and combined into a MultiDataSet object, there is a number of pre-processing steps that should be considered. In this chapter, we will show how to apply different transformations to the datasets, as well as how to run a PCA on each dataset and impute missing values if needed.

As a reminder, here is what the _targets.R script should look like so far:

_targets.R script

library(targets)
library(tarchetypes)
library(moiraine)

tar_option_set(
  packages = c(
    "moiraine",
    "MOFA2",
    "mixOmics",
    "readr",
    "tibble",
    "tidyr",
    "dplyr",
    "ggplot2",
    "patchwork"
  )
)

list(
# Importing data ---------------------------------------------------------------

  ## Data import using a target factory
  import_dataset_csv_factory(
    files = c(
      system.file("extdata/genomics_dataset.csv", package = "moiraine"),
      system.file("extdata/transcriptomics_dataset.csv", package = "moiraine"),
      system.file("extdata/metabolomics_dataset.csv", package = "moiraine")
    ),
    col_ids = c("marker", "gene_id", "sample_id"),
    features_as_rowss = c(TRUE, TRUE, FALSE),
    target_name_suffixes = c("geno", "transcripto", "metabo")
  ),
  
  ## Genomics features metadata file
  tar_target(
    fmetadata_file_geno,
    system.file("extdata/genomics_features_info.csv", package = "moiraine"),
    format = "file"
  ),
  
  ## Genomics features metadata import
  tar_target(
    fmetadata_geno,
    import_fmetadata_csv(
      fmetadata_file_geno,
      col_id = "marker",
      col_types = c("chromosome" = "c")
    )
  ),
  
  ## Metabolomics features metadata import
  import_fmetadata_csv_factory(
    files = c(
      system.file("extdata/metabolomics_features_info.csv", package = "moiraine")
    ),
    col_ids = c("feature_id"),
    target_name_suffixes = c("metabo")
  ),
  
  ## Transcriptomics features metadata import
  import_fmetadata_gff_factory(
    files = system.file("extdata/bos_taurus_gene_model.gff3", package = "moiraine"),
    feature_types = "genes",
    add_fieldss = c("Name", "description"),
    target_name_suffixes = "transcripto"
  ),
  
  ## Samples metadata import
  import_smetadata_csv_factory(
    files = system.file("extdata/samples_info.csv", package = "moiraine"),
    col_ids = "animal_id",
    target_name_suffixes = "all"
  ),
  
  ## Creating omics sets for each dataset
  create_omics_set_factory(
    datasets = c(data_geno, data_transcripto, data_metabo),
    omics_types = c("genomics", "transcriptomics", "metabolomics"),
    features_metadatas = c(fmetadata_geno, fmetadata_transcripto, fmetadata_metabo),
    samples_metadatas = c(smetadata_all, smetadata_all, smetadata_all)
  ),
  
  ## Creating the MultiDataSet object
  tar_target(
    mo_set,
    create_multiomics_set(
      list(set_geno,
           set_transcripto,
           set_metabo)
    )
  ),

# Inspecting the `MultiDataSet` object -----------------------------------------

  ## Creating a density plot for each dataset
  tar_target(
    density_plots,
    plot_density_data(
      mo_set,
      combined = FALSE,
      scales = "free"
    )
  ),

  ## Plotting the relationship between features mean and standard deviation
  ## for each dataset
  tar_target(
    mean_sd_plots,
    plot_meansd_data(mo_set)
  ),
  
  ## Assessing missing values
  tar_target(
    n_missing_values,
    check_missing_values(mo_set)
  ),

# Modifying the `MultiDataSet` object ------------------------------------------

  ## RNAseq differential expression results file
  tar_target(
    rnaseq_de_res_file,
    system.file(
      "extdata/transcriptomics_de_results.csv",
      package = "moiraine"
    ),
    format = "file"
  ),

  ## Reading the RNAseq differential expression results
  tar_target(
    rnaseq_de_res_df,
    read_csv(rnaseq_de_res_file) |>
      rename(feature_id = gene_id) |>
      mutate(dataset = "rnaseq")
  ),

  ## Adding the differential expression results to the MultiDataSet object
  tar_target(
    mo_set_de,
    add_features_metadata(mo_set, rnaseq_de_res_df)
  )
)

6.1 Datasets transformations

After inspection of the density plots for the different datasets (see Section 4.3), it might be necessary to normalise or transform some or all datasets. This is necessary to mitigate the mean-variance trend that occurs in RNAseq data, for example, or simply to bring the different features to a comparable scale. Transformation here refers to applying a function to each feature (i.e. each row) within a dataset that will transform the measurement values for the feature.

moiraine implements several options to transform an omics dataset:

Variance Stabilising Normalisation (VSN) through the vsn package – recommended for metabolomics datasets or other continuous datasets with a strong mean-variance trend;
Variance Stabilising Transformation (VST) through the DESeq2 package – recommended for RNAseq data or any raw read count-type data;
Automatic selection of the best normalisation method for each feature through the bestNormalize package – recommended for phenotype data, and when the number of features is small (note that the selection of the normalisation method is done independently for each feature, so the same transformation might not be applied to all features);
A selection of common normalisation methods through the bestNormalize package, including center/scale, log, exponential, square-root, arcsinh, Box Cox, Yeo-Johnson and ordered quantile transformations (see details in the bestNormalize vignette) – recommended when applying the same transformation to all features, e.g. log2 transformation or centering.

6.1.1 Transforming a single dataset

The transformation of one dataset is done through the transform_dataset() function, which takes as input a MultiDataSet object, the name of the dataset to transform, and the name of the transformation to be applied, which should be one of vsn, vst-deseq2, logx, best-normalize-auto or best-normalize-manual. For the latter, the name of the normalisation method from the BestNormalize package to use must also be supplied through the method argument.

The return_multidataset argument determines whether a MultiDataSet object with the corresponding dataset transformed should be returned. If it is set to FALSE, the function will instead return a list with the transformed dataset as a matrix as well as other useful information returned by the transformation function applied. It is possible to only return the transformed matrix, by setting return_matrix_only to TRUE. This can be useful to quickly assess the effects of the transformation outside of the analysis pipeline.

For example, we can apply the Variance Stabilising Transformation to the transcriptomics dataset:

tar_load(mo_set_de)

rnaseq_vst <- transform_dataset(
  mo_set_de,
  "rnaseq",
  "vst-deseq2",
  return_multidataset = FALSE
)
#> Applying Variance Stabilising Transformation (DESeq2) to rnaseq dataset.
#> converting counts to integer mode

The function returns a list, with the transformed dataset as matrix in the transformed_data element. Information generated during the transformation by the DESeq2 package is stored in the info_transformation element. The name of the transformation applied is stored in the transformation element:

names(rnaseq_vst)
#> [1] "transformed_data"    "info_transformation" "transformation"

rnaseq_vst$transformed_data[1:5, 1:5]
#>                        R9497     R5969     R5327     R5979     R9504
#> ENSBTAG00000000005 10.357314 11.289047 12.070536 10.201227 10.251942
#> ENSBTAG00000000008  4.836495  4.937948  4.375764  4.986154  5.710581
#> ENSBTAG00000000009  3.486141  4.054636  4.983434  3.486141  3.486141
#> ENSBTAG00000000010 12.216690 11.937084 12.079359 11.383360 11.897780
#> ENSBTAG00000000011  3.486141  4.054636  4.005139  3.979732  3.486141

rnaseq_vst$info_transformation
#> class: DESeqTransform 
#> dim: 20335 143 
#> metadata(1): version
#> assays(1): ''
#> rownames(20335): ENSBTAG00000000005 ENSBTAG00000000008 ...
#>   ENSBTAG00000055312 ENSBTAG00000055314
#> rowData names(6): baseMean baseVar ... dispGeneIter dispFit
#> colnames(143): R9497 R5969 ... Y9747 Y9816
#> colData names(1): sizeFactor

rnaseq_vst$transformation
#> [1] "vst-deseq2"

If we instead want to apply a log2 transformation to the dataset, we will use the logx transformation option. In that case, we have to specify the log base to use (here 2) through the log_base argument, as well as the function that should be applied to the matrix prior to the log-transformation through the pre_log_function argument. This function is here mostly to take care of zero values that could cause an issue during the log transformation. By default, the zero_to_half_min() function is used, which replaces zero values in the dataset with half of the minimum non-null value found in the dataset. However, it is possible to pass any custom function instead, for example function(x) x would not make any change to the dataset before the log-transformation:

rnaseq_log2 <- transform_dataset(
  mo_set_de,
  "rnaseq",
  "logx",
  base_log = 2,
  pre_log_function = function(x) x,
  return_multidataset = TRUE
)
#> Applying Log Transformation to rnaseq dataset.
#> Warning in transform_logx(mat, return_matrix_only = return_matrix_only, : The
#> matrix contains zero values; log-transformation will yield `-Inf`.

In that case, we asked the function to return a MultiDataSet object, in which the rnaseq dataset has been transformed:

rnaseq_log2
#> Object of class 'MultiDataSet'
#>  . assayData: 3 elements
#>     . snps: 23036 features, 139 samples 
#>     . rnaseq: 20335 features, 143 samples 
#>     . metabolome: 55 features, 139 samples 
#>  . featureData:
#>     . snps: 23036 rows, 13 cols (feature_id, ..., p_value)
#>     . rnaseq: 20335 rows, 15 cols (feature_id, ..., de_signif)
#>     . metabolome: 55 rows, 16 cols (feature_id, ..., de_signif)
#>  . rowRanges:
#>     . snps: YES
#>     . rnaseq: YES
#>     . metabolome: NO
#>  . phenoData:
#>     . snps: 139 samples, 10 cols (id, ..., geno_comp_3)
#>     . rnaseq: 143 samples, 10 cols (id, ..., geno_comp_3)
#>     . metabolome: 139 samples, 10 cols (id, ..., geno_comp_3)

get_dataset_matrix(rnaseq_log2, "rnaseq")[1:5, 1:5]
#>                        R9497     R5969     R5327     R5979     R9504
#> ENSBTAG00000000005  9.517669 10.458407 11.511258  9.768184  9.531381
#> ENSBTAG00000000008  2.584963  2.807355  1.584963  3.321928  4.321928
#> ENSBTAG00000000009      -Inf  0.000000  3.169925      -Inf      -Inf
#> ENSBTAG00000000010 11.394999 11.111136 11.520128 10.965784 11.195372
#> ENSBTAG00000000011      -Inf  0.000000  0.000000  0.000000      -Inf

The two other transformation options are best-normalize-auto and best-normalize-manual. Both rely on the bestNormalize package to apply a transformation independently to each feature in the dataset. With best-normalize-auto, the bestNormalize::bestNormalize() function automatically selects for each feature the transformation that results in a distribution of values closest to Gaussian as possible. Note that as the function is applied independently to each feature, a different distribution may be chosen for different features. With the best-normalize-manual option, the same transformation is applied to each feature. The transformation to use must be specified through the method argument, and should be the name of one of the transformation implemented in bestNormalize: "arcsinh_x", "boxcox", "center_scale", "exp_x", "log_x", "orderNorm", "sqrt_x" or "yeojohnson". Note that with this option, even if the same transformation is applied to each feature, the parameters for the transformation, if not specified, will be independently estimated for each feature. For example, with the log_x method, the function automatically adds a small offset (a) to the feature’s measurements if there are any zero values. The value used for this offset might be different for each feature. We can instead set a value for a to be used for all features, as follows:

transform_dataset(
  mo_set_de,
  "rnaseq",
  "best-normalize-manual",
  method = "log_x",
  a = 0.1,
  return_multidataset = TRUE
)

Note

Before using the best-normalize-manual option, it is strongly recommended to have a look at the documentation of the corresponding function from the bestNormalize package, as they might have behaviours that are not expected. For example, by default the bestNormalize::log_x() function uses a log base 10, and centers and scales the transformed values, which may not be what we want.

6.1.2 Transformation target factory

The target factory function transformation_datasets_factory() provides a wrapper to apply (potentially different) transformations to several datasets at once. The function takes as input the MultiDataSet object as well as a named character vector, in which each element corresponds to a transformation that should be applied to a specific dataset. If a dataset is not present in the transformation vector, it will not be transformed (but it will still be present in the resulting MultiDataSet object).

Here, we would like to apply Variance Stabilising Transformation to the transcriptomics dataset, and a log2 transformation to the metabolomics dataset. Note that the VST and VSN transformations are very close to the log2 transformation, especially for features with high means.

transformation_datasets_factory(
  mo_set_de,
  c("rnaseq" = "vst-deseq2",
    "metabolome" = "logx"),
  log_bases = 2,
  pre_log_functions = zero_to_half_min,
  transformed_data_name = "mo_set_transformed"
)

The transformation_datasets_factory() function works as follows:

It creates a grouped tibble listing the transformation to apply to each dataset, stored in the transformations_spec target;

tar_read(transformations_spec)
#> # A tibble: 2 × 6
#>   dsn        transf     meth   log_b     prelog_f tar_group
#>   <chr>      <chr>      <list> <list>    <list>       <int>
#> 1 rnaseq     vst-deseq2 <NULL> <NULL>    <NULL>           2
#> 2 metabolome logx       <NULL> <dbl [1]> <fn>             1

It performs the required transformation on each dataset via dynamic branching. This is done through a call to the transform_dataset() function. The transformed datasets are stored in a list, in the transformations_runs_list target. Note that by default the function will store all details of the transformations, which can be useful for later inspection, but can be memory-intensive. It is possible to only store the transformed datasets instead, by setting the return_matrix_only argument to TRUE in the transformation_datasets_factory() call.

tar_load(transformations_runs_list)

names(transformations_runs_list)
#> [1] "transformations_runs_list_e68c2d99" "transformations_runs_list_46986372"

map_chr(transformations_runs_list, attr, "dataset_name")
#> transformations_runs_list_e68c2d99 transformations_runs_list_46986372 
#>                       "metabolome"                           "rnaseq"

transformations_runs_list[["transformations_runs_list_a1c8db41"]] |> names()
#> NULL

It creates a new MultiDataSet object, with the transformed version of the datasets. By default, this new MultiDataSet object is stored in a target called transformed_set, but a different name can be specified via the transformed_data_name argument (here we called it mo_set_transformed).

tar_load(mo_set_transformed)

mo_set_transformed
#> Object of class 'MultiDataSet'
#>  . assayData: 3 elements
#>     . snps: 23036 features, 139 samples 
#>     . rnaseq: 20335 features, 143 samples 
#>     . metabolome: 55 features, 139 samples 
#>  . featureData:
#>     . snps: 23036 rows, 13 cols (feature_id, ..., p_value)
#>     . rnaseq: 20335 rows, 15 cols (feature_id, ..., de_signif)
#>     . metabolome: 55 rows, 16 cols (feature_id, ..., de_signif)
#>  . rowRanges:
#>     . snps: YES
#>     . rnaseq: YES
#>     . metabolome: NO
#>  . phenoData:
#>     . snps: 139 samples, 10 cols (id, ..., geno_comp_3)
#>     . rnaseq: 143 samples, 10 cols (id, ..., geno_comp_3)
#>     . metabolome: 139 samples, 10 cols (id, ..., geno_comp_3)

get_dataset_matrix(mo_set_de, "metabolome")[1:5, 1:3]
#>             R21 Y3660 Y3243
#> HMDB00001   9.1   9.7   9.3
#> HMDB00008  58.2  22.8   9.1
#> HMDB00042 403.0 392.0 606.0
#> HMDB00043 172.6 163.1 165.2
#> HMDB00060   0.7   1.5   1.4

get_dataset_matrix(mo_set_transformed, "metabolome")[1:5, 1:3]
#>                  R21     Y3660     Y3243
#> HMDB00001  3.1858665 3.2779847 3.2172307
#> HMDB00008  5.8629472 4.5109619 3.1858665
#> HMDB00042  8.6546360 8.6147098 9.2431740
#> HMDB00043  7.4312887 7.3496130 7.3680699
#> HMDB00060 -0.5145732 0.5849625 0.4854268

Converting targets factory to R script

transformations_runs_list <- c(
  "rnaseq" = "vst-deseq2", 
  "metabolome" = "logx"
) |>  imap(\(x, y) {
    transform_dataset(
      mo_set_de,
      dataset = y,
      transformation = x,
      log_base = 2,
      pre_log_function = zero_to_half_min
    )
  }
)

mo_set_transformed <- get_transformed_data(mo_set_de, transformations_runs_list)

We can assess the effect of the transformations by generating density and mean-sd plots for the transformed datasets:

plot_density_data(
  mo_set_transformed,
  combined = FALSE,
  scales = "free"
)

Note how the relationship between features mean and standard deviation has been reduced in both transformed datasets:

plot_meansd_data(mo_set_transformed)

Finally, it can be useful to summarise which transformations have been applied to the datasets, for example when creating a report. The function get_table_transformation() is here for that. It takes as an input the transformations_runs_list target generated by transformation_datasets_factory(), and returns a tibble indicating the transformation applied to each dataset:

get_table_transformations(transformations_runs_list)
#> # A tibble: 2 × 2
#>   Dataset    Transformation                              
#>   <chr>      <chr>                                       
#> 1 metabolome Log-2 transformation                        
#> 2 rnaseq     Variance Stabilising Transformation (DESeq2)

6.2 Running a PCA on each dataset

It is always best practice to run some exploratory analysis on a dataset prior to running analyses. This is largely outside the scope of this package, and we assume that any input dataset has been properly assessed before turning to the integration pipeline. However, running a Principal Component Analysis (PCA) on each of the omics datasets within the integration pipeline serves two purposes:

as a last check to ensure that there are no obvious batch effects or problematic samples that should be addressed,
as a missing data imputation method.

The moiraine package relies on the Bioconductor pcaMethods package to perform the PCA. In particular, the pcaMethods package implements a NIPALS (non-linear iterative partial least squares) method for PCA, which allows for missing values in the input dataset, and imputes missing values based on the results of the PCA.

6.2.1 Running the PCAs

The pca_complete_data_factory() function uses dynamic branching to perform a PCA on each omics dataset within a MultiDataSet object. It takes as input the MultiDataSet object (in our case, mo_set_transformed), and, optionally, the names of the datasets on which a PCA should be run. This is useful if one dataset is very large and has no missing values, and we want to avoid running a PCA on it. It then proceeds as follows:

It creates a target called dataset_names_pca, which stores a vector of dataset names on which a PCA should be applied;
For each value in dataset_names_pca, it extracts the omics dataset as a matrix with features as rows and samples as columns, using the get_dataset_matrix() function. This is done via dynamic branching, and the results are stored as a list in the pca_mats_list target. Note that the names of this list are not meaningful; to check which element of the list corresponds to which dataset, you can run map_chr(pca_mats_list, attr, "dataset_name");
For each matrix in pca_mats_list, it applies the run_pca_matrix() function to the corresponding dataset. This is done via dynamic branching; it results in a list where each element is the PCA result (i.e. a pcaMethods::pcaRes object) for a given dataset. This list is stored in the pca_runs_list target. Note that the names of this list are not meaningful; to check which element of the list corresponds to which dataset, you can run map_chr(pca_runs_list, attr, "dataset_name");
It extracts from the result of each PCA the complete dataset, i.e. with missing values imputed, and uses this information to construct a new MultiDataSet object, in which the datasets are complete (i.e. no missing value). This is done by calling the get_complete_data() function. If no PCA was run on a dataset, the dataset will still be present in the new MultiDataSet object, but its missing values will not be imputed. The resulting complete MultiDataSet object is stored by default in a target called complete_set; this name can be changed via the complete_data_name argument.

Let’s apply this to our multi-omics dataset:

pca_complete_data_factory(
  mo_set_transformed,
  complete_data_name = "mo_set_complete"
)

We can have a look at the different targets constructed. By default, a PCA was run on all datasets:

tar_read(dataset_names_pca)
#> [1] "snps"       "rnaseq"     "metabolome"

tar_load(pca_mats_list)

map_chr(pca_mats_list, attr, "dataset_name")
#> pca_mats_list_302d7473 pca_mats_list_84d36937 pca_mats_list_64d37e6c 
#>                 "snps"               "rnaseq"           "metabolome"
map(pca_mats_list, ~.x[1:5, 1:5])
#> $pca_mats_list_302d7473
#>                             R21 Y3660 Y3243 R5764 P4669
#> 1_41768691                    1     0     2     2     1
#> 10-27008241-A-C-rs42918694    2     2     2     1     2
#> 10-37505419-T-C-rs136559242   0     1     0     2     0
#> 10-49904259-G-A-rs471723345   1     2     2     2     2
#> 1-109550832-G-A-rs209732846   2     2     1     2     2
#> 
#> $pca_mats_list_84d36937
#>                        R9497     R5969     R5327     R5979     R9504
#> ENSBTAG00000000005 10.357314 11.289047 12.070536 10.201227 10.251942
#> ENSBTAG00000000008  4.836495  4.937948  4.375764  4.986154  5.710581
#> ENSBTAG00000000009  3.486141  4.054636  4.983434  3.486141  3.486141
#> ENSBTAG00000000010 12.216690 11.937084 12.079359 11.383360 11.897780
#> ENSBTAG00000000011  3.486141  4.054636  4.005139  3.979732  3.486141
#> 
#> $pca_mats_list_64d37e6c
#>                  R21     Y3660     Y3243    R5764     P4669
#> HMDB00001  3.1858665 3.2779847 3.2172307       NA 3.6322682
#> HMDB00008  5.8629472 4.5109619 3.1858665 2.655352 4.1618877
#> HMDB00042  8.6546360 8.6147098 9.2431740 8.655352 7.9008668
#> HMDB00043  7.4312887 7.3496130 7.3680699 7.707359 6.4025858
#> HMDB00060 -0.5145732 0.5849625 0.4854268 2.906891 0.6780719

tar_load(pca_runs_list)

names(pca_runs_list)
#> [1] "pca_runs_list_74d71ae8" "pca_runs_list_71fd94b4" "pca_runs_list_559272f0"

map_chr(pca_runs_list, attr, "dataset_name")
#> pca_runs_list_74d71ae8 pca_runs_list_71fd94b4 pca_runs_list_559272f0 
#>                 "snps"               "rnaseq"           "metabolome"

The result of the PCA run on the genomics dataset looks like this:

tar_read(pca_runs_list_74d71ae8)
#> nipals calculated PCA
#> Importance of component(s):
#>                   PC1     PC2     PC3     PC4     PC5     PC6      PC7      PC8
#> R2            0.05578 0.02881 0.01305 0.01196 0.01168 0.01005 0.009848 0.009565
#> Cumulative R2 0.05578 0.08459 0.09765 0.10960 0.12128 0.13133 0.141175 0.150739
#>                    PC9     PC10
#> R2            0.009286 0.008915
#> Cumulative R2 0.160026 0.168941
#> 23036    Variables
#> 139  Samples
#> 9615     NAs ( 0.3 %)
#> 10   Calculated component(s)
#> Data was mean centered before running PCA 
#> Data was NOT scaled before running PCA 
#> Scores structure:
#> [1] 139  10
#> Loadings structure:
#> [1] 23036    10

You can notice that there is some information about the number of principal components computed, and whether the dataset was centred and scaled before applying the PCA. This is handled by the default arguments of run_pca_matrix(), but can be specified by passing the corresponding arguments to pca_complete_data_factory(). For example, to scale the datasets before performing a PCA, we could use:

pca_complete_data_factory(
  mo_set_transformed,
  complete_data_name = "mo_set_complete",
  scale = TRUE
)

For convenience, the run_pca() function can be used to run a PCA on one of the omics datasets directly from a MultiDataSet object. It is a wrapper around the run_pca_matrix() function, and takes as input a MultiDataSet object as well as the name of the omics dataset on which a PCA should be run, e.g.:

run_pca(mo_set_de, "rnaseq")

Converting targets factory to R script

pca_mats_list <- names(mo_set_transformed) |> 
  set_names() |> 
  map(\(x) get_dataset_matrix(mo_set_transformed, x, keep_dataset_name = TRUE))

pca_runs_list <- pca_mats_list |> 
  map(run_pca_matrix)

mo_set_complete <- get_complete_data(mo_set_transformed, pca_runs_list)

6.2.2 Visualising the PCA results

It is possible to get an overview of the results of each PCA. First, the function plot_screeplot_pca() displays the percentage of variance explained by the principal components computed for each dataset. It takes as input the pca_runs_list target constructed in the previous step. Note that by default, 10 components are computed for each dataset.

plot_screeplot_pca(pca_runs_list)

In addition, the plot_samples_coordinates_pca allows us to display the samples in the reduced principal components space (the common PCA sample plot). The function returns a list of plots (one plot per dataset). By default, it shows all principal components computed for each dataset, but for clarity we will only look at the first three:

plot_samples_coordinates_pca(
  pca_runs_list,
  pcs = 1:3
)
#> $snps

#> 
#> $rnaseq

#> 
#> $metabolome

Note that it is possible to look at a different set of principal components for each dataset. For that, the index of the principal components should be passed to the pcs argument as a named list (where the name of each element corresponds to a dataset name), e.g.:

plot_samples_coordinates_pca(
  pca_runs_list,
  pcs = list(
    "snps" = 1:4,
    "rnaseq" = 1:2,
    "metabolome" = 1:3
  )
)

By default, the points in the sample plots are not coloured. It is however possible to colour the samples according to the information contained in the sample metadata tables available through the MultiDataset object. We can set different colours and shapes for the upper and lower plots in the scatterplot matrix, see the plot_samples_score() function for more information. For example, we can assess whether the first three principal components show any clustering of the samples according to their cluster computed from genomics similarity, disease status or feedlot (we’ll only show the results for the SNPs dataset here):

plot_samples_coordinates_pca(
  pca_runs_list,
  datasets = "snps",
  pcs = 1:3,
  mo_data = mo_set_de,
  colour_upper = "geno_comp_cluster",
  shape_upper = "status",
  colour_lower = "feedlot"
) +
  theme(legend.box = "vertical")

6.2.3 Missing values imputation

We can check that the complete multi-omics set constructed has no more missing values:

tar_load(mo_set_complete)

mo_set_complete
#> Object of class 'MultiDataSet'
#>  . assayData: 3 elements
#>     . snps: 23036 features, 139 samples 
#>     . rnaseq: 20335 features, 143 samples 
#>     . metabolome: 55 features, 139 samples 
#>  . featureData:
#>     . snps: 23036 rows, 13 cols (feature_id, ..., p_value)
#>     . rnaseq: 20335 rows, 15 cols (feature_id, ..., de_signif)
#>     . metabolome: 55 rows, 16 cols (feature_id, ..., de_signif)
#>  . rowRanges:
#>     . snps: YES
#>     . rnaseq: YES
#>     . metabolome: NO
#>  . phenoData:
#>     . snps: 139 samples, 10 cols (id, ..., geno_comp_3)
#>     . rnaseq: 143 samples, 10 cols (id, ..., geno_comp_3)
#>     . metabolome: 139 samples, 10 cols (id, ..., geno_comp_3)

check_missing_values(mo_set_complete)
#> No missing values in snps dataset.
#> No missing values in rnaseq dataset.
#> No missing values in metabolome dataset.

6.3 Recap – targets list

For convenience, here is the list of targets that we created in this section:

Targets list for datasets preprocessing

list(
  ## Applying transformations to the datasets
  transformation_datasets_factory(
    mo_set_de,
    c("rnaseq" = "vst-deseq2",
      "metabolome" = "logx"),
    log_bases = 2,
    pre_log_functions = zero_to_half_min,
    transformed_data_name = "mo_set_transformed"
  ),
  
  ## Density plot for each transformed dataset
  tar_target(
    density_plots_transformed,
    plot_density_data(
      mo_set_transformed,
      combined = FALSE,
      scales = "free"
    )
  ),
  
  ## Plotting the mean-SD trend for transformed each dataset
  tar_target(
    mean_sd_plots_transformed,
    plot_meansd_data(mo_set_transformed)
  ),
  
  ## Summary table of the transformations applied
  tar_target(
    transformation_summary,
    get_table_transformations(transformations_runs_list)
  ),
  
  ## Running a PCA on each dataset
  pca_complete_data_factory(
    mo_set_transformed,
    complete_data_name = "mo_set_complete"
  ),
  
  ## PCA screeplots
  tar_target(
    pca_screeplots,
    plot_screeplot_pca(pca_runs_list)
  ),
  
  ## PCA sample plots
  tar_target(
    pca_sample_plots,
    plot_samples_coordinates_pca(
      pca_runs_list,
      datasets = "snps",
      pcs = 1:3,
      mo_data = mo_set_de,
      colour_upper = "geno_comp_cluster",
      shape_upper = "status",
      colour_lower = "feedlot"
    )
  )
)