library(targets)
library(moiraine)
library(MultiDataSet)
## For PCA results
library(pcaMethods)
## For working with lists
library(purrr)
library(rlang)
## For custom colour palettes
library(ggplot2)
library(RColorBrewer)
## For displaying documentation
library(DESeq2)
library(S4Vectors)
library(stats4)
6 Data pre-processing
Once each omics dataset has been imported into R with associated metadata and combined into a MultiDataSet
object, there is a number of pre-processing steps that should be considered. In this chapter, we will show how to apply different transformations to the datasets, as well as how to run a PCA on each dataset and impute missing values if needed.
As a reminder, here is what the _targets.R
script should look like so far:
_targets.R
script
library(targets)
library(tarchetypes)
library(moiraine)
tar_option_set(
packages = c(
"moiraine",
"MOFA2",
"mixOmics",
"readr",
"tibble",
"tidyr",
"dplyr",
"ggplot2",
"patchwork"
)
)
list(
# Importing data ---------------------------------------------------------------
## Data import using a target factory
import_dataset_csv_factory(
files = c(
system.file("extdata/genomics_dataset.csv", package = "moiraine"),
system.file("extdata/transcriptomics_dataset.csv", package = "moiraine"),
system.file("extdata/metabolomics_dataset.csv", package = "moiraine")
),
col_ids = c("marker", "gene_id", "sample_id"),
features_as_rowss = c(TRUE, TRUE, FALSE),
target_name_suffixes = c("geno", "transcripto", "metabo")
),
## Genomics features metadata file
tar_target(
fmetadata_file_geno,
system.file("extdata/genomics_features_info.csv", package = "moiraine"),
format = "file"
),
## Genomics features metadata import
tar_target(
fmetadata_geno,
import_fmetadata_csv(
fmetadata_file_geno,
col_id = "marker",
col_types = c("chromosome" = "c")
)
),
## Metabolomics features metadata import
import_fmetadata_csv_factory(
files = c(
system.file("extdata/metabolomics_features_info.csv", package = "moiraine")
),
col_ids = c("feature_id"),
target_name_suffixes = c("metabo")
),
## Transcriptomics features metadata import
import_fmetadata_gff_factory(
files = system.file("extdata/bos_taurus_gene_model.gff3", package = "moiraine"),
feature_types = "genes",
add_fieldss = c("Name", "description"),
target_name_suffixes = "transcripto"
),
## Samples metadata import
import_smetadata_csv_factory(
files = system.file("extdata/samples_info.csv", package = "moiraine"),
col_ids = "animal_id",
target_name_suffixes = "all"
),
## Creating omics sets for each dataset
create_omics_set_factory(
datasets = c(data_geno, data_transcripto, data_metabo),
omics_types = c("genomics", "transcriptomics", "metabolomics"),
features_metadatas = c(fmetadata_geno, fmetadata_transcripto, fmetadata_metabo),
samples_metadatas = c(smetadata_all, smetadata_all, smetadata_all)
),
## Creating the MultiDataSet object
tar_target(
mo_set,
create_multiomics_set(
list(set_geno,
set_transcripto,
set_metabo)
)
),
# Inspecting the `MultiDataSet` object -----------------------------------------
## Creating a density plot for each dataset
tar_target(
density_plots,
plot_density_data(
mo_set,
combined = FALSE,
scales = "free"
)
),
## Plotting the relationship between features mean and standard deviation
## for each dataset
tar_target(
mean_sd_plots,
plot_meansd_data(mo_set)
),
## Assessing missing values
tar_target(
n_missing_values,
check_missing_values(mo_set)
),
# Modifying the `MultiDataSet` object ------------------------------------------
## RNAseq differential expression results file
tar_target(
rnaseq_de_res_file,
system.file(
"extdata/transcriptomics_de_results.csv",
package = "moiraine"
),
format = "file"
),
## Reading the RNAseq differential expression results
tar_target(
rnaseq_de_res_df,
read_csv(rnaseq_de_res_file) |>
rename(feature_id = gene_id) |>
mutate(dataset = "rnaseq")
),
## Adding the differential expression results to the MultiDataSet object
tar_target(
mo_set_de,
add_features_metadata(mo_set, rnaseq_de_res_df)
)
)
6.1 Datasets transformations
After inspection of the density plots for the different datasets (see Section 4.3), it might be necessary to normalise or transform some or all datasets. This is necessary to mitigate the mean-variance trend that occurs in RNAseq data, for example, or simply to bring the different features to a comparable scale. Transformation here refers to applying a function to each feature (i.e. each row) within a dataset that will transform the measurement values for the feature.
moiraine
implements several options to transform an omics dataset:
Variance Stabilising Normalisation (VSN) through the
vsn
package – recommended for metabolomics datasets or other continuous datasets with a strong mean-variance trend;Variance Stabilising Transformation (VST) through the
DESeq2
package – recommended for RNAseq data or any raw read count-type data;Automatic selection of the best normalisation method for each feature through the
bestNormalize
package – recommended for phenotype data, and when the number of features is small (note that the selection of the normalisation method is done independently for each feature, so the same transformation might not be applied to all features);A selection of common normalisation methods through the
bestNormalize
package, including center/scale, log, exponential, square-root, arcsinh, Box Cox, Yeo-Johnson and ordered quantile transformations (see details in the bestNormalize vignette) – recommended when applying the same transformation to all features, e.g. log2 transformation or centering.
6.1.1 Transforming a single dataset
The transformation of one dataset is done through the transform_dataset()
function, which takes as input a MultiDataSet
object, the name of the dataset to transform, and the name of the transformation to be applied, which should be one of vsn
, vst-deseq2
, logx
, best-normalize-auto
or best-normalize-manual
. For the latter, the name of the normalisation method from the BestNormalize
package to use must also be supplied through the method
argument.
The return_multidataset
argument determines whether a MultiDataSet
object with the corresponding dataset transformed should be returned. If it is set to FALSE
, the function will instead return a list with the transformed dataset as a matrix as well as other useful information returned by the transformation function applied. It is possible to only return the transformed matrix, by setting return_matrix_only
to TRUE
. This can be useful to quickly assess the effects of the transformation outside of the analysis pipeline.
For example, we can apply the Variance Stabilising Transformation to the transcriptomics dataset:
tar_load(mo_set_de)
rnaseq_vst <- transform_dataset(
mo_set_de,
"rnaseq",
"vst-deseq2",
return_multidataset = FALSE
)
#> Applying Variance Stabilising Transformation (DESeq2) to rnaseq dataset.
#> converting counts to integer mode
The function returns a list, with the transformed dataset as matrix in the transformed_data
element. Information generated during the transformation by the DESeq2
package is stored in the info_transformation
element. The name of the transformation applied is stored in the transformation
element:
names(rnaseq_vst)
#> [1] "transformed_data" "info_transformation" "transformation"
rnaseq_vst$transformed_data[1:5, 1:5]
#> R9497 R5969 R5327 R5979 R9504
#> ENSBTAG00000000005 10.357314 11.289047 12.070536 10.201227 10.251942
#> ENSBTAG00000000008 4.836495 4.937948 4.375764 4.986154 5.710581
#> ENSBTAG00000000009 3.486141 4.054636 4.983434 3.486141 3.486141
#> ENSBTAG00000000010 12.216690 11.937084 12.079359 11.383360 11.897780
#> ENSBTAG00000000011 3.486141 4.054636 4.005139 3.979732 3.486141
rnaseq_vst$info_transformation
#> class: DESeqTransform
#> dim: 20335 143
#> metadata(1): version
#> assays(1): ''
#> rownames(20335): ENSBTAG00000000005 ENSBTAG00000000008 ...
#> ENSBTAG00000055312 ENSBTAG00000055314
#> rowData names(6): baseMean baseVar ... dispGeneIter dispFit
#> colnames(143): R9497 R5969 ... Y9747 Y9816
#> colData names(1): sizeFactor
rnaseq_vst$transformation
#> [1] "vst-deseq2"
If we instead want to apply a log2 transformation to the dataset, we will use the logx
transformation option. In that case, we have to specify the log base to use (here 2) through the log_base
argument, as well as the function that should be applied to the matrix prior to the log-transformation through the pre_log_function
argument. This function is here mostly to take care of zero values that could cause an issue during the log transformation. By default, the zero_to_half_min()
function is used, which replaces zero values in the dataset with half of the minimum non-null value found in the dataset. However, it is possible to pass any custom function instead, for example function(x) x
would not make any change to the dataset before the log-transformation:
rnaseq_log2 <- transform_dataset(
mo_set_de,
"rnaseq",
"logx",
base_log = 2,
pre_log_function = function(x) x,
return_multidataset = TRUE
)
#> Applying Log Transformation to rnaseq dataset.
#> Warning in transform_logx(mat, return_matrix_only = return_matrix_only, : The
#> matrix contains zero values; log-transformation will yield `-Inf`.
In that case, we asked the function to return a MultiDataSet
object, in which the rnaseq
dataset has been transformed:
rnaseq_log2
#> Object of class 'MultiDataSet'
#> . assayData: 3 elements
#> . snps: 23036 features, 139 samples
#> . rnaseq: 20335 features, 143 samples
#> . metabolome: 55 features, 139 samples
#> . featureData:
#> . snps: 23036 rows, 13 cols (feature_id, ..., p_value)
#> . rnaseq: 20335 rows, 15 cols (feature_id, ..., de_signif)
#> . metabolome: 55 rows, 16 cols (feature_id, ..., de_signif)
#> . rowRanges:
#> . snps: YES
#> . rnaseq: YES
#> . metabolome: NO
#> . phenoData:
#> . snps: 139 samples, 10 cols (id, ..., geno_comp_3)
#> . rnaseq: 143 samples, 10 cols (id, ..., geno_comp_3)
#> . metabolome: 139 samples, 10 cols (id, ..., geno_comp_3)
get_dataset_matrix(rnaseq_log2, "rnaseq")[1:5, 1:5]
#> R9497 R5969 R5327 R5979 R9504
#> ENSBTAG00000000005 9.517669 10.458407 11.511258 9.768184 9.531381
#> ENSBTAG00000000008 2.584963 2.807355 1.584963 3.321928 4.321928
#> ENSBTAG00000000009 -Inf 0.000000 3.169925 -Inf -Inf
#> ENSBTAG00000000010 11.394999 11.111136 11.520128 10.965784 11.195372
#> ENSBTAG00000000011 -Inf 0.000000 0.000000 0.000000 -Inf
The two other transformation options are best-normalize-auto
and best-normalize-manual
. Both rely on the bestNormalize
package to apply a transformation independently to each feature in the dataset. With best-normalize-auto
, the bestNormalize::bestNormalize()
function automatically selects for each feature the transformation that results in a distribution of values closest to Gaussian as possible. Note that as the function is applied independently to each feature, a different distribution may be chosen for different features. With the best-normalize-manual
option, the same transformation is applied to each feature. The transformation to use must be specified through the method
argument, and should be the name of one of the transformation implemented in bestNormalize
: "arcsinh_x"
, "boxcox"
, "center_scale"
, "exp_x"
, "log_x"
, "orderNorm"
, "sqrt_x"
or "yeojohnson"
. Note that with this option, even if the same transformation is applied to each feature, the parameters for the transformation, if not specified, will be independently estimated for each feature. For example, with the log_x
method, the function automatically adds a small offset (a
) to the feature’s measurements if there are any zero values. The value used for this offset might be different for each feature. We can instead set a value for a
to be used for all features, as follows:
transform_dataset(
mo_set_de,
"rnaseq",
"best-normalize-manual",
method = "log_x",
a = 0.1,
return_multidataset = TRUE
)
Before using the best-normalize-manual
option, it is strongly recommended to have a look at the documentation of the corresponding function from the bestNormalize
package, as they might have behaviours that are not expected. For example, by default the bestNormalize::log_x()
function uses a log base 10, and centers and scales the transformed values, which may not be what we want.
6.1.2 Transformation target factory
The target factory function transformation_datasets_factory()
provides a wrapper to apply (potentially different) transformations to several datasets at once. The function takes as input the MultiDataSet
object as well as a named character vector, in which each element corresponds to a transformation that should be applied to a specific dataset. If a dataset is not present in the transformation vector, it will not be transformed (but it will still be present in the resulting MultiDataSet
object).
Here, we would like to apply Variance Stabilising Transformation to the transcriptomics dataset, and a log2 transformation to the metabolomics dataset. Note that the VST and VSN transformations are very close to the log2 transformation, especially for features with high means.
transformation_datasets_factory(
mo_set_de,
c("rnaseq" = "vst-deseq2",
"metabolome" = "logx"),
log_bases = 2,
pre_log_functions = zero_to_half_min,
transformed_data_name = "mo_set_transformed"
)
The transformation_datasets_factory()
function works as follows:
- It creates a grouped tibble listing the transformation to apply to each dataset, stored in the
transformations_spec
target;
tar_read(transformations_spec)
#> # A tibble: 2 × 6
#> dsn transf meth log_b prelog_f tar_group
#> <chr> <chr> <list> <list> <list> <int>
#> 1 rnaseq vst-deseq2 <NULL> <NULL> <NULL> 2
#> 2 metabolome logx <NULL> <dbl [1]> <fn> 1
- It performs the required transformation on each dataset via dynamic branching. This is done through a call to the
transform_dataset()
function. The transformed datasets are stored in a list, in thetransformations_runs_list
target. Note that by default the function will store all details of the transformations, which can be useful for later inspection, but can be memory-intensive. It is possible to only store the transformed datasets instead, by setting thereturn_matrix_only
argument toTRUE
in thetransformation_datasets_factory()
call.
tar_load(transformations_runs_list)
names(transformations_runs_list)
#> [1] "transformations_runs_list_e68c2d99" "transformations_runs_list_46986372"
map_chr(transformations_runs_list, attr, "dataset_name")
#> transformations_runs_list_e68c2d99 transformations_runs_list_46986372
#> "metabolome" "rnaseq"
transformations_runs_list[["transformations_runs_list_a1c8db41"]] |> names()
#> NULL
- It creates a new
MultiDataSet
object, with the transformed version of the datasets. By default, this newMultiDataSet
object is stored in a target calledtransformed_set
, but a different name can be specified via thetransformed_data_name
argument (here we called itmo_set_transformed
).
tar_load(mo_set_transformed)
mo_set_transformed
#> Object of class 'MultiDataSet'
#> . assayData: 3 elements
#> . snps: 23036 features, 139 samples
#> . rnaseq: 20335 features, 143 samples
#> . metabolome: 55 features, 139 samples
#> . featureData:
#> . snps: 23036 rows, 13 cols (feature_id, ..., p_value)
#> . rnaseq: 20335 rows, 15 cols (feature_id, ..., de_signif)
#> . metabolome: 55 rows, 16 cols (feature_id, ..., de_signif)
#> . rowRanges:
#> . snps: YES
#> . rnaseq: YES
#> . metabolome: NO
#> . phenoData:
#> . snps: 139 samples, 10 cols (id, ..., geno_comp_3)
#> . rnaseq: 143 samples, 10 cols (id, ..., geno_comp_3)
#> . metabolome: 139 samples, 10 cols (id, ..., geno_comp_3)
get_dataset_matrix(mo_set_de, "metabolome")[1:5, 1:3]
#> R21 Y3660 Y3243
#> HMDB00001 9.1 9.7 9.3
#> HMDB00008 58.2 22.8 9.1
#> HMDB00042 403.0 392.0 606.0
#> HMDB00043 172.6 163.1 165.2
#> HMDB00060 0.7 1.5 1.4
get_dataset_matrix(mo_set_transformed, "metabolome")[1:5, 1:3]
#> R21 Y3660 Y3243
#> HMDB00001 3.1858665 3.2779847 3.2172307
#> HMDB00008 5.8629472 4.5109619 3.1858665
#> HMDB00042 8.6546360 8.6147098 9.2431740
#> HMDB00043 7.4312887 7.3496130 7.3680699
#> HMDB00060 -0.5145732 0.5849625 0.4854268
Converting targets factory to R script
transformations_runs_list <- c(
"rnaseq" = "vst-deseq2",
"metabolome" = "logx"
) |> imap(\(x, y) {
transform_dataset(
mo_set_de,
dataset = y,
transformation = x,
log_base = 2,
pre_log_function = zero_to_half_min
)
}
)
mo_set_transformed <- get_transformed_data(mo_set_de, transformations_runs_list)
We can assess the effect of the transformations by generating density and mean-sd plots for the transformed datasets:
plot_density_data(
mo_set_transformed,
combined = FALSE,
scales = "free"
)
Note how the relationship between features mean and standard deviation has been reduced in both transformed datasets:
plot_meansd_data(mo_set_transformed)
Finally, it can be useful to summarise which transformations have been applied to the datasets, for example when creating a report. The function get_table_transformation()
is here for that. It takes as an input the transformations_runs_list
target generated by transformation_datasets_factory()
, and returns a tibble indicating the transformation applied to each dataset:
get_table_transformations(transformations_runs_list)
#> # A tibble: 2 × 2
#> Dataset Transformation
#> <chr> <chr>
#> 1 metabolome Log-2 transformation
#> 2 rnaseq Variance Stabilising Transformation (DESeq2)
6.2 Running a PCA on each dataset
It is always best practice to run some exploratory analysis on a dataset prior to running analyses. This is largely outside the scope of this package, and we assume that any input dataset has been properly assessed before turning to the integration pipeline. However, running a Principal Component Analysis (PCA) on each of the omics datasets within the integration pipeline serves two purposes:
as a last check to ensure that there are no obvious batch effects or problematic samples that should be addressed,
as a missing data imputation method.
The moiraine
package relies on the Bioconductor pcaMethods
package to perform the PCA. In particular, the pcaMethods
package implements a NIPALS (non-linear iterative partial least squares) method for PCA, which allows for missing values in the input dataset, and imputes missing values based on the results of the PCA.
6.2.1 Running the PCAs
The pca_complete_data_factory()
function uses dynamic branching to perform a PCA on each omics dataset within a MultiDataSet
object. It takes as input the MultiDataSet
object (in our case, mo_set_transformed
), and, optionally, the names of the datasets on which a PCA should be run. This is useful if one dataset is very large and has no missing values, and we want to avoid running a PCA on it. It then proceeds as follows:
It creates a target called
dataset_names_pca
, which stores a vector of dataset names on which a PCA should be applied;For each value in
dataset_names_pca
, it extracts the omics dataset as a matrix with features as rows and samples as columns, using theget_dataset_matrix()
function. This is done via dynamic branching, and the results are stored as a list in thepca_mats_list
target. Note that the names of this list are not meaningful; to check which element of the list corresponds to which dataset, you can runmap_chr(pca_mats_list, attr, "dataset_name")
;For each matrix in
pca_mats_list
, it applies therun_pca_matrix()
function to the corresponding dataset. This is done via dynamic branching; it results in a list where each element is the PCA result (i.e. apcaMethods::pcaRes
object) for a given dataset. This list is stored in thepca_runs_list
target. Note that the names of this list are not meaningful; to check which element of the list corresponds to which dataset, you can runmap_chr(pca_runs_list, attr, "dataset_name")
;It extracts from the result of each PCA the complete dataset, i.e. with missing values imputed, and uses this information to construct a new
MultiDataSet
object, in which the datasets are complete (i.e. no missing value). This is done by calling theget_complete_data()
function. If no PCA was run on a dataset, the dataset will still be present in the newMultiDataSet
object, but its missing values will not be imputed. The resulting completeMultiDataSet
object is stored by default in a target calledcomplete_set
; this name can be changed via thecomplete_data_name
argument.
Let’s apply this to our multi-omics dataset:
pca_complete_data_factory(
mo_set_transformed,
complete_data_name = "mo_set_complete"
)
We can have a look at the different targets constructed. By default, a PCA was run on all datasets:
tar_read(dataset_names_pca)
#> [1] "snps" "rnaseq" "metabolome"
tar_load(pca_mats_list)
map_chr(pca_mats_list, attr, "dataset_name")
#> pca_mats_list_302d7473 pca_mats_list_84d36937 pca_mats_list_64d37e6c
#> "snps" "rnaseq" "metabolome"
map(pca_mats_list, ~.x[1:5, 1:5])
#> $pca_mats_list_302d7473
#> R21 Y3660 Y3243 R5764 P4669
#> 1_41768691 1 0 2 2 1
#> 10-27008241-A-C-rs42918694 2 2 2 1 2
#> 10-37505419-T-C-rs136559242 0 1 0 2 0
#> 10-49904259-G-A-rs471723345 1 2 2 2 2
#> 1-109550832-G-A-rs209732846 2 2 1 2 2
#>
#> $pca_mats_list_84d36937
#> R9497 R5969 R5327 R5979 R9504
#> ENSBTAG00000000005 10.357314 11.289047 12.070536 10.201227 10.251942
#> ENSBTAG00000000008 4.836495 4.937948 4.375764 4.986154 5.710581
#> ENSBTAG00000000009 3.486141 4.054636 4.983434 3.486141 3.486141
#> ENSBTAG00000000010 12.216690 11.937084 12.079359 11.383360 11.897780
#> ENSBTAG00000000011 3.486141 4.054636 4.005139 3.979732 3.486141
#>
#> $pca_mats_list_64d37e6c
#> R21 Y3660 Y3243 R5764 P4669
#> HMDB00001 3.1858665 3.2779847 3.2172307 NA 3.6322682
#> HMDB00008 5.8629472 4.5109619 3.1858665 2.655352 4.1618877
#> HMDB00042 8.6546360 8.6147098 9.2431740 8.655352 7.9008668
#> HMDB00043 7.4312887 7.3496130 7.3680699 7.707359 6.4025858
#> HMDB00060 -0.5145732 0.5849625 0.4854268 2.906891 0.6780719
The result of the PCA run on the genomics dataset looks like this:
tar_read(pca_runs_list_74d71ae8)
#> nipals calculated PCA
#> Importance of component(s):
#> PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8
#> R2 0.05578 0.02881 0.01305 0.01196 0.01168 0.01005 0.009848 0.009565
#> Cumulative R2 0.05578 0.08459 0.09765 0.10960 0.12128 0.13133 0.141175 0.150739
#> PC9 PC10
#> R2 0.009286 0.008915
#> Cumulative R2 0.160026 0.168941
#> 23036 Variables
#> 139 Samples
#> 9615 NAs ( 0.3 %)
#> 10 Calculated component(s)
#> Data was mean centered before running PCA
#> Data was NOT scaled before running PCA
#> Scores structure:
#> [1] 139 10
#> Loadings structure:
#> [1] 23036 10
You can notice that there is some information about the number of principal components computed, and whether the dataset was centred and scaled before applying the PCA. This is handled by the default arguments of run_pca_matrix()
, but can be specified by passing the corresponding arguments to pca_complete_data_factory()
. For example, to scale the datasets before performing a PCA, we could use:
pca_complete_data_factory(
mo_set_transformed,
complete_data_name = "mo_set_complete",
scale = TRUE
)
For convenience, the run_pca()
function can be used to run a PCA on one of the omics datasets directly from a MultiDataSet
object. It is a wrapper around the run_pca_matrix()
function, and takes as input a MultiDataSet
object as well as the name of the omics dataset on which a PCA should be run, e.g.:
run_pca(mo_set_de, "rnaseq")
Converting targets factory to R script
pca_mats_list <- names(mo_set_transformed) |>
set_names() |>
map(\(x) get_dataset_matrix(mo_set_transformed, x, keep_dataset_name = TRUE))
pca_runs_list <- pca_mats_list |>
map(run_pca_matrix)
mo_set_complete <- get_complete_data(mo_set_transformed, pca_runs_list)
6.2.2 Visualising the PCA results
It is possible to get an overview of the results of each PCA. First, the function plot_screeplot_pca()
displays the percentage of variance explained by the principal components computed for each dataset. It takes as input the pca_runs_list
target constructed in the previous step. Note that by default, 10 components are computed for each dataset.
plot_screeplot_pca(pca_runs_list)
In addition, the plot_samples_coordinates_pca
allows us to display the samples in the reduced principal components space (the common PCA sample plot). The function returns a list of plots (one plot per dataset). By default, it shows all principal components computed for each dataset, but for clarity we will only look at the first three:
Note that it is possible to look at a different set of principal components for each dataset. For that, the index of the principal components should be passed to the pcs
argument as a named list (where the name of each element corresponds to a dataset name), e.g.:
plot_samples_coordinates_pca(
pca_runs_list,
pcs = list(
"snps" = 1:4,
"rnaseq" = 1:2,
"metabolome" = 1:3
)
)
By default, the points in the sample plots are not coloured. It is however possible to colour the samples according to the information contained in the sample metadata tables available through the MultiDataset
object. We can set different colours and shapes for the upper and lower plots in the scatterplot matrix, see the plot_samples_score()
function for more information. For example, we can assess whether the first three principal components show any clustering of the samples according to their cluster computed from genomics similarity, disease status or feedlot (we’ll only show the results for the SNPs dataset here):
plot_samples_coordinates_pca(
pca_runs_list,
datasets = "snps",
pcs = 1:3,
mo_data = mo_set_de,
colour_upper = "geno_comp_cluster",
shape_upper = "status",
colour_lower = "feedlot"
) +
theme(legend.box = "vertical")
6.2.3 Missing values imputation
We can check that the complete multi-omics set constructed has no more missing values:
tar_load(mo_set_complete)
mo_set_complete
#> Object of class 'MultiDataSet'
#> . assayData: 3 elements
#> . snps: 23036 features, 139 samples
#> . rnaseq: 20335 features, 143 samples
#> . metabolome: 55 features, 139 samples
#> . featureData:
#> . snps: 23036 rows, 13 cols (feature_id, ..., p_value)
#> . rnaseq: 20335 rows, 15 cols (feature_id, ..., de_signif)
#> . metabolome: 55 rows, 16 cols (feature_id, ..., de_signif)
#> . rowRanges:
#> . snps: YES
#> . rnaseq: YES
#> . metabolome: NO
#> . phenoData:
#> . snps: 139 samples, 10 cols (id, ..., geno_comp_3)
#> . rnaseq: 143 samples, 10 cols (id, ..., geno_comp_3)
#> . metabolome: 139 samples, 10 cols (id, ..., geno_comp_3)
check_missing_values(mo_set_complete)
#> No missing values in snps dataset.
#> No missing values in rnaseq dataset.
#> No missing values in metabolome dataset.
6.3 Recap – targets list
For convenience, here is the list of targets that we created in this section:
Targets list for datasets preprocessing
list(
## Applying transformations to the datasets
transformation_datasets_factory(
mo_set_de,
c("rnaseq" = "vst-deseq2",
"metabolome" = "logx"),
log_bases = 2,
pre_log_functions = zero_to_half_min,
transformed_data_name = "mo_set_transformed"
),
## Density plot for each transformed dataset
tar_target(
density_plots_transformed,
plot_density_data(
mo_set_transformed,
combined = FALSE,
scales = "free"
)
),
## Plotting the mean-SD trend for transformed each dataset
tar_target(
mean_sd_plots_transformed,
plot_meansd_data(mo_set_transformed)
),
## Summary table of the transformations applied
tar_target(
transformation_summary,
get_table_transformations(transformations_runs_list)
),
## Running a PCA on each dataset
pca_complete_data_factory(
mo_set_transformed,
complete_data_name = "mo_set_complete"
),
## PCA screeplots
tar_target(
pca_screeplots,
plot_screeplot_pca(pca_runs_list)
),
## PCA sample plots
tar_target(
pca_sample_plots,
plot_samples_coordinates_pca(
pca_runs_list,
datasets = "snps",
pcs = 1:3,
mo_data = mo_set_de,
colour_upper = "geno_comp_cluster",
shape_upper = "status",
colour_lower = "feedlot"
)
)
)