5  Modifying the MultiDataSet object

library(targets)
library(moiraine)
library(MultiDataSet)

## For reading data
library(readr)
## For creating data-frames
library(tibble)
## For data-frame manipulation
library(dplyr)
## For working with lists
library(purrr)

In this chapter, we will show how to modify the MultiDataSet object that we created in Chapter 3.

As a reminder, here is what the _targets.R script should look like so far:

_targets.R script
library(targets)
library(tarchetypes)
library(moiraine)

tar_option_set(
  packages = c(
    "moiraine",
    "MOFA2",
    "mixOmics",
    "readr",
    "tibble",
    "tidyr",
    "dplyr",
    "ggplot2",
    "patchwork"
  )
)

list(
# Importing data ---------------------------------------------------------------

  ## Data import using a target factory
  import_dataset_csv_factory(
    files = c(
      system.file("extdata/genomics_dataset.csv", package = "moiraine"),
      system.file("extdata/transcriptomics_dataset.csv", package = "moiraine"),
      system.file("extdata/metabolomics_dataset.csv", package = "moiraine")
    ),
    col_ids = c("marker", "gene_id", "sample_id"),
    features_as_rowss = c(TRUE, TRUE, FALSE),
    target_name_suffixes = c("geno", "transcripto", "metabo")
  ),
  
  ## Genomics features metadata file
  tar_target(
    fmetadata_file_geno,
    system.file("extdata/genomics_features_info.csv", package = "moiraine"),
    format = "file"
  ),
  
  ## Genomics features metadata import
  tar_target(
    fmetadata_geno,
    import_fmetadata_csv(
      fmetadata_file_geno,
      col_id = "marker",
      col_types = c("chromosome" = "c")
    )
  ),
  
  ## Metabolomics features metadata import
  import_fmetadata_csv_factory(
    files = c(
      system.file("extdata/metabolomics_features_info.csv", package = "moiraine")
    ),
    col_ids = c("feature_id"),
    target_name_suffixes = c("metabo")
  ),
  
  ## Transcriptomics features metadata import
  import_fmetadata_gff_factory(
    files = system.file("extdata/bos_taurus_gene_model.gff3", package = "moiraine"),
    feature_types = "genes",
    add_fieldss = c("Name", "description"),
    target_name_suffixes = "transcripto"
  ),
  
  ## Samples metadata import
  import_smetadata_csv_factory(
    files = system.file("extdata/samples_info.csv", package = "moiraine"),
    col_ids = "animal_id",
    target_name_suffixes = "all"
  ),
  
  ## Creating omics sets for each dataset
  create_omics_set_factory(
    datasets = c(data_geno, data_transcripto, data_metabo),
    omics_types = c("genomics", "transcriptomics", "metabolomics"),
    features_metadatas = c(fmetadata_geno, fmetadata_transcripto, fmetadata_metabo),
    samples_metadatas = c(smetadata_all, smetadata_all, smetadata_all)
  ),
  
  ## Creating the MultiDataSet object
  tar_target(
    mo_set,
    create_multiomics_set(
      list(set_geno,
           set_transcripto,
           set_metabo)
    )
  ),

# Inspecting the `MultiDataSet` object -----------------------------------------

  ## Creating a density plot for each dataset
  tar_target(
    density_plots,
    plot_density_data(
      mo_set,
      combined = FALSE,
      scales = "free"
    )
  ),

  ## Plotting the relationship between features mean and standard deviation
  ## for each dataset
  tar_target(
    mean_sd_plots,
    plot_meansd_data(mo_set)
  ),
  
  ## Assessing missing values
  tar_target(
    n_missing_values,
    check_missing_values(mo_set)
  )
)

At this stage, our different omics datasets (and associated metadata) are stored in a MultiDataSet object, saved in the target called mo_set:

tar_load(mo_set)

mo_set
#> Object of class 'MultiDataSet'
#>  . assayData: 3 elements
#>     . snps: 23036 features, 139 samples 
#>     . rnaseq: 20335 features, 143 samples 
#>     . metabolome: 55 features, 139 samples 
#>  . featureData:
#>     . snps: 23036 rows, 13 cols (feature_id, ..., p_value)
#>     . rnaseq: 20335 rows, 8 cols (feature_id, ..., Name)
#>     . metabolome: 55 rows, 16 cols (feature_id, ..., de_signif)
#>  . rowRanges:
#>     . snps: YES
#>     . rnaseq: YES
#>     . metabolome: NO
#>  . phenoData:
#>     . snps: 139 samples, 10 cols (id, ..., geno_comp_3)
#>     . rnaseq: 143 samples, 10 cols (id, ..., geno_comp_3)
#>     . metabolome: 139 samples, 10 cols (id, ..., geno_comp_3)

There are several reasons why we would want to modify the information contained in this object after it has been created. For example, we might want to apply a specific transformation to the omics dataset, using a custom function not implemented in the moiraine package. In that case, it would be preferable to apply the transformation on the datasets after the MultiDataSet object has been created and exploratory analyses have been performed, rather than on the raw data before importing it into the moiraine workflow.

In another scenario, after having created the MultiDataSet object, we might obtain more information about the samples and/or features that we want to add to the object’s features or samples metadata, to be used in visualisations. In that case, it is recommended to include this information in the samples or features metadata files and re-run the integration pipeline; however functions to add information to the metadata tables are included for convenience. This can also be useful if, as with this example data, the features metadata was extracted from a specialised format (e.g. a GFF file for the RNAseq dataset) and we want to add in some information from another source.

5.1 Modifying a dataset matrix

Let us imagine that, prior to performing the data transformation step (which we will see in Chapter 6), we want to replace all zero values in the transcriptomics dataset with a small value, to avoid issues during the log-transformation process (note that this is only to illustrate this functionality, for the actual analysis we will use a different transformation that handles zero values). We will pick this small value as half of the non-null minimum value in the dataset (which will be 0.5, since we are working with count data). We can compute the new matrix of RNAseq counts:

rnaseq_mat <- get_dataset_matrix(mo_set, "rnaseq")
rnaseq_mat[1:5, 1:5]
#>                    R9497 R5969 R5327 R5979 R9504
#> ENSBTAG00000000005   733  1407  2919   872   740
#> ENSBTAG00000000008     6     7     3    10    20
#> ENSBTAG00000000009     0     1     9     0     0
#> ENSBTAG00000000010  2693  2212  2937  2000  2345
#> ENSBTAG00000000011     0     1     1     1     0

small_val <- (1/2) * min(rnaseq_mat[rnaseq_mat != 0])
small_val
#> [1] 0.5

new_mat <- rnaseq_mat
new_mat[new_mat == 0] <- small_val

new_mat[1:5, 1:5]
#>                     R9497 R5969 R5327  R5979  R9504
#> ENSBTAG00000000005  733.0  1407  2919  872.0  740.0
#> ENSBTAG00000000008    6.0     7     3   10.0   20.0
#> ENSBTAG00000000009    0.5     1     9    0.5    0.5
#> ENSBTAG00000000010 2693.0  2212  2937 2000.0 2345.0
#> ENSBTAG00000000011    0.5     1     1    1.0    0.5

In order to replace the rnaseq dataset stored in the MultiDataSet object with this new matrix, we pass both the object and the new matrix to the replace_dataset() function, along with the name of the omics dataset whose matrix should be replaced:

mo_set_modif <- replace_dataset(mo_set, "rnaseq", new_mat)

## Checking that the replacement has been done
get_dataset_matrix(mo_set_modif, "rnaseq")[1:5, 1:5]
#>                     R9497 R5969 R5327  R5979  R9504
#> ENSBTAG00000000005  733.0  1407  2919  872.0  740.0
#> ENSBTAG00000000008    6.0     7     3   10.0   20.0
#> ENSBTAG00000000009    0.5     1     9    0.5    0.5
#> ENSBTAG00000000010 2693.0  2212  2937 2000.0 2345.0
#> ENSBTAG00000000011    0.5     1     1    1.0    0.5

Note that this only works for modifying the values within an omics dataset, and not for filtering, since both features and samples number and IDs in the new dataset matrix should match the ones in the original matrix:

mo_set_modif <- replace_dataset(mo_set, "rnaseq", new_mat[1:10, 1:10])
#> Error in replace_dataset(mo_set, "rnaseq", new_mat[1:10, 1:10]): 'new_data' argument has incorrect dimensions. Should have 20335 rows (features) and 143 columns (samples).
Click here to see a targets version of the code.
list(
  ## Replacing zero values in RNAseq dataset
  ## (note that it is more tidy to write a function for that and call it here)
  tar_target(
    rnaseq_mat_nozero,
    {
      rnaseq_mat <- get_dataset_matrix(mo_set, "rnaseq")
      small_val <- (1/2) * min(rnaseq_mat[rnaseq_mat != 0])
      new_mat <- rnaseq_mat
      new_mat[new_mat == 0] <- small_val

      new_mat
    }
  ),

  ## Replacing RNAseq dataset in MultiDataSet object
  tar_target(
    mo_set_rnaseq_nozero,
    replace_dataset(mo_set, "rnaseq", rnaseq_mat_nozero)
  )
)

5.2 Adding information to features metadata

In the case of the transcriptomics dataset, we extracted the features metadata directly from a GFF file, which provides information about the genome annotation used. However, we might want to add information about the genes from a different source. We could add this information to the data-frame generated with import_fmetadata_gff() (see Section 3.3.3) before creating theMultiDataSet object, but we will demonstrate here how to add information once we’ve already created the object. Note that we will use targets for this example, as we will incorporate these changes in our analysis pipeline.

Let’s start by reading in the differential expression results:

list(
  tar_target(
    rnaseq_de_res_file,
    system.file(
      "extdata/transcriptomics_de_results.csv",
      package = "moiraine"
    ),
    format = "file"
  ),

  tar_target(
    rnaseq_de_res_df,
    read_csv(rnaseq_de_res_file) |>
      rename(feature_id = gene_id) |>
      mutate(dataset = "rnaseq")
  )
)

Notice that in the results file, the gene IDs are stored in the gene_id column. Here, we rename this column as feature_id, which is required for adding it to the features metadata. In addition, we create a dataset column which contains the name of the dataset in the MultiDataSet object to which the features belong. This is also necessary.

The differential results look like this:

tar_read(rnaseq_de_res_df) |>
  head()
#> # A tibble: 6 × 9
#>   feature_id  log_fc log_cpm     f  p_value      fdr de_signif de_status dataset
#>   <chr>        <dbl>   <dbl> <dbl>    <dbl>    <dbl> <chr>     <chr>     <chr>  
#> 1 ENSBTAG000…   4.61    4.19  358. 5.32e-40 5.93e-36 DE        upregula… rnaseq 
#> 2 ENSBTAG000…   3.80    6.83  356. 6.33e-40 5.93e-36 DE        upregula… rnaseq 
#> 3 ENSBTAG000…   5.41    2.24  347. 2.41e-39 1.50e-35 DE        upregula… rnaseq 
#> 4 ENSBTAG000…   4.34    3.52  344. 3.40e-39 1.59e-35 DE        upregula… rnaseq 
#> 5 ENSBTAG000…   2.18    6.74  327. 4.05e-38 1.52e-34 DE        upregula… rnaseq 
#> 6 ENSBTAG000…  -1.31    2.64  316. 2.39e-37 7.48e-34 Not DE    Not DE    rnaseq

We can now use the add_features_metadata() function to add this table to the features metadata of the transcriptomics dataset. The new MultiDataSet object that includes information about the differential expression results will be saved in the mo_set_de target:

tar_target(
  mo_set_de,
  add_features_metadata(mo_set, rnaseq_de_res_df)
)

The new information has been added to the features metadata of the transcriptomics dataset.

tar_read(mo_set_de) |>
  get_features_metadata() |>
  pluck("rnaseq") |>
  head()
#>                            feature_id chromosome    start      end  width
#> ENSBTAG00000000005 ENSBTAG00000000005         17 65389743 65505336 115594
#> ENSBTAG00000000008 ENSBTAG00000000008         29 32214439 32244810  30372
#> ENSBTAG00000000009 ENSBTAG00000000009         18 12338037 12342272   4236
#> ENSBTAG00000000010 ENSBTAG00000000010         21 34209956 34223394  13439
#> ENSBTAG00000000011 ENSBTAG00000000011          8  7950815  7971600  20786
#> ENSBTAG00000000012 ENSBTAG00000000012         20 33708626 33732944  24319
#>                    strand  Name
#> ENSBTAG00000000005      +  GRK3
#> ENSBTAG00000000008      - KCNJ1
#> ENSBTAG00000000009      + FOXF1
#> ENSBTAG00000000010      +  UBL7
#> ENSBTAG00000000011      -   TDH
#> ENSBTAG00000000012      + TTC33
#>                                                                                                       description
#> ENSBTAG00000000005                        G protein-coupled receptor kinase 3 [Source:VGNC Symbol;Acc:VGNC:53904]
#> ENSBTAG00000000008 potassium inwardly rectifying channel subfamily J member 1 [Source:VGNC Symbol;Acc:VGNC:30453]
#> ENSBTAG00000000009                                            forkhead box F1 [Source:VGNC Symbol;Acc:VGNC:29084]
#> ENSBTAG00000000010                                           ubiquitin like 7 [Source:VGNC Symbol;Acc:VGNC:50128]
#> ENSBTAG00000000011                                 L-threonine dehydrogenase [Source:VGNC Symbol;Acc:VGNC:108945]
#> ENSBTAG00000000012                         tetratricopeptide repeat domain 33 [Source:VGNC Symbol;Acc:VGNC:36471]
#>                         log_fc    log_cpm          f      p_value          fdr
#> ENSBTAG00000000005  0.13603041  5.9051177  4.1626465 4.324808e-02 7.170137e-02
#> ENSBTAG00000000008 -0.12965356 -0.7437052  1.0676845 3.032916e-01 3.938643e-01
#> ENSBTAG00000000009  1.27141158 -2.5627934 23.9562951 2.731328e-06 9.665375e-06
#> ENSBTAG00000000010  0.39567729  6.2563594 60.5303416 1.581955e-12 1.521160e-11
#> ENSBTAG00000000011  0.07766873 -2.7608361  0.1418457 7.070363e-01 7.773874e-01
#> ENSBTAG00000000012  0.15756169  3.6628775 11.0448775 1.141125e-03 2.623062e-03
#>                    de_signif de_status
#> ENSBTAG00000000005    Not DE    Not DE
#> ENSBTAG00000000008    Not DE    Not DE
#> ENSBTAG00000000009    Not DE    Not DE
#> ENSBTAG00000000010    Not DE    Not DE
#> ENSBTAG00000000011    Not DE    Not DE
#> ENSBTAG00000000012    Not DE    Not DE

Note that with this function, we can add information about features from different datasets at once, which is why there needs to be a dataset column in the data-frame to add, indicating the dataset to which each feature belongs. Also, not all features from a given dataset need to be present in this new data-frame; it is possible to add information for only a subset of them. In that case, the function will throw a warning giving the number of features from the corresponding dataset missing from the new data-frame, and the new columns in the corresponding features metadata will be filled with NA for features not present. However, it is only possible to add columns that do not already exist in the corresponding features metadata.

5.3 Adding information to samples metadata

Similarly, we can add a data-frame of information to the samples metadata. Here, we will create a table that contains “new” simulated information about the samples that we want to incorporate in our MultiDataSet object.

## Getting the list of samples ID across the datasets
samples_list <- get_samples(mo_set) |>
  unlist() |>
  unname() |>
  unique()

## Simulating new information table, with new samples grouping
new_samples_df <- tibble(id = samples_list) |>
  mutate(new_group = sample(letters[1:3], n(), replace = TRUE))

head(new_samples_df)
#> # A tibble: 6 × 2
#>   id    new_group
#>   <chr> <chr>    
#> 1 R21   b        
#> 2 Y3660 b        
#> 3 Y3243 a        
#> 4 R5764 a        
#> 5 P4669 c        
#> 6 R5452 a

Note that the sample IDs must be stored in a column named id. We will use the add_samples_metadata() function to add this new data-frame to the samples metadata in our MultiDataSet object. There are several options for which samples metadata tables should be modified; this is controlled through the datasets argument of the function. By default, the information is added to the samples metadata table of all omics datasets (case when datasets = NULL):

mo_set_new_samples_info <- add_samples_metadata(mo_set, new_samples_df)
#> Warning: snps dataset: 5 sample IDs not in 'mo_data', will be removed from
#> samples metadata.
#> Warning: rnaseq dataset: 1 sample IDs not in 'mo_data', will be removed from
#> samples metadata.
#> Warning: metabolome dataset: 5 sample IDs not in 'mo_data', will be removed
#> from samples metadata.

mo_set_new_samples_info |>
  get_samples_metadata() |>
  map(head)
#> $snps
#>          id feedlot gender  status day_on_feed rnaseq_batch geno_comp_1
#> R21     R21      F1 female Control          31           B2    0.007853
#> Y3660 Y3660      F1   male Control          19           B2    0.852220
#> Y3243 Y3243      F1   male Control          16           B2    0.044171
#> R5764 R5764      F2   male Control          46           B1    0.213094
#> P4669 P4669      F3   male     BRD          35           B2    0.389393
#> R5452 R5452      F2   male Control          49           B1    0.265998
#>       geno_comp_2 geno_comp_3 geno_comp_cluster new_group
#> R21      0.820691    0.171456                K3         b
#> Y3660    0.093585    0.054195                K2         b
#> Y3243    0.190262    0.765567                K1         a
#> R5764    0.676299    0.110607                K3         a
#> P4669    0.503471    0.107136                K3         c
#> R5452    0.733992    0.000010                K3         a
#> 
#> $rnaseq
#>          id feedlot gender status day_on_feed rnaseq_batch geno_comp_1
#> R9497 R9497      F2   male    BRD          35           B1    0.153716
#> R5969 R5969      F2   male    BRD          24           B1    0.110663
#> R5327 R5327      F2   male    BRD          38           B1    0.140730
#> R5979 R5979      F2   male    BRD          30           B1    0.286733
#> R9504 R9504      F2   male    BRD          31           B1    0.000010
#> R5994 R5994      F2   male    BRD          24           B1    0.129271
#>       geno_comp_2 geno_comp_3 geno_comp_cluster new_group
#> R9497    0.744505    0.101779                K3         a
#> R5969    0.625823    0.263514                K3         b
#> R5327    0.809396    0.049874                K3         b
#> R5979    0.107794    0.605473                K1         c
#> R9504    0.998913    0.001077                K3         c
#> R5994    0.034351    0.836377                K1         a
#> 
#> $metabolome
#>          id feedlot gender  status day_on_feed rnaseq_batch geno_comp_1
#> R21     R21      F1 female Control          31           B2    0.007853
#> Y3660 Y3660      F1   male Control          19           B2    0.852220
#> Y3243 Y3243      F1   male Control          16           B2    0.044171
#> R5764 R5764      F2   male Control          46           B1    0.213094
#> P4669 P4669      F3   male     BRD          35           B2    0.389393
#> R5452 R5452      F2   male Control          49           B1    0.265998
#>       geno_comp_2 geno_comp_3 geno_comp_cluster new_group
#> R21      0.820691    0.171456                K3         b
#> Y3660    0.093585    0.054195                K2         b
#> Y3243    0.190262    0.765567                K1         a
#> R5764    0.676299    0.110607                K3         a
#> P4669    0.503471    0.107136                K3         c
#> R5452    0.733992    0.000010                K3         a

However, it is also possible to specify for which dataset(s) the changes should be made, by passing their name to the datasets argument.

mo_set_new_samples_info <- add_samples_metadata(
  mo_set, 
  new_samples_df, 
  datasets = c("rnaseq", "metabolome")
)
#> Warning: rnaseq dataset: 1 sample IDs not in 'mo_data', will be removed from
#> samples metadata.
#> Warning: metabolome dataset: 5 sample IDs not in 'mo_data', will be removed
#> from samples metadata.

mo_set_new_samples_info |>
  get_samples_metadata() |>
  map(head)
#> $snps
#>          id feedlot gender  status day_on_feed rnaseq_batch geno_comp_1
#> R21     R21      F1 female Control          31           B2    0.007853
#> Y3660 Y3660      F1   male Control          19           B2    0.852220
#> Y3243 Y3243      F1   male Control          16           B2    0.044171
#> R5764 R5764      F2   male Control          46           B1    0.213094
#> P4669 P4669      F3   male     BRD          35           B2    0.389393
#> R5452 R5452      F2   male Control          49           B1    0.265998
#>       geno_comp_2 geno_comp_3 geno_comp_cluster
#> R21      0.820691    0.171456                K3
#> Y3660    0.093585    0.054195                K2
#> Y3243    0.190262    0.765567                K1
#> R5764    0.676299    0.110607                K3
#> P4669    0.503471    0.107136                K3
#> R5452    0.733992    0.000010                K3
#> 
#> $rnaseq
#>          id feedlot gender status day_on_feed rnaseq_batch geno_comp_1
#> R9497 R9497      F2   male    BRD          35           B1    0.153716
#> R5969 R5969      F2   male    BRD          24           B1    0.110663
#> R5327 R5327      F2   male    BRD          38           B1    0.140730
#> R5979 R5979      F2   male    BRD          30           B1    0.286733
#> R9504 R9504      F2   male    BRD          31           B1    0.000010
#> R5994 R5994      F2   male    BRD          24           B1    0.129271
#>       geno_comp_2 geno_comp_3 geno_comp_cluster new_group
#> R9497    0.744505    0.101779                K3         a
#> R5969    0.625823    0.263514                K3         b
#> R5327    0.809396    0.049874                K3         b
#> R5979    0.107794    0.605473                K1         c
#> R9504    0.998913    0.001077                K3         c
#> R5994    0.034351    0.836377                K1         a
#> 
#> $metabolome
#>          id feedlot gender  status day_on_feed rnaseq_batch geno_comp_1
#> R21     R21      F1 female Control          31           B2    0.007853
#> Y3660 Y3660      F1   male Control          19           B2    0.852220
#> Y3243 Y3243      F1   male Control          16           B2    0.044171
#> R5764 R5764      F2   male Control          46           B1    0.213094
#> P4669 P4669      F3   male     BRD          35           B2    0.389393
#> R5452 R5452      F2   male Control          49           B1    0.265998
#>       geno_comp_2 geno_comp_3 geno_comp_cluster new_group
#> R21      0.820691    0.171456                K3         b
#> Y3660    0.093585    0.054195                K2         b
#> Y3243    0.190262    0.765567                K1         a
#> R5764    0.676299    0.110607                K3         a
#> P4669    0.503471    0.107136                K3         c
#> R5452    0.733992    0.000010                K3         a

In both cases, the function throws some warnings to alert about samples missing from this new table, or samples that are not present in the original samples metadata table. These warnings should be checked to avoid issues due to typos, etc.

As with the add_features_metadata() function, it is possible to add information about only a subset of the samples; however the columns in the new data-frame must not already be present in the features metadata tables to which it will be added.

5.4 Recap – targets list

For convenience, here is the list of targets that we created in this section:

Targets list for modifying a MultiDataSet object
list(
  ## RNAseq differential expression results file
  tar_target(
    rnaseq_de_res_file,
    system.file(
      "extdata/transcriptomics_de_results.csv",
      package = "moiraine"
    ),
    format = "file"
  ),

  ## Reading the RNAseq differential expression results
  tar_target(
    rnaseq_de_res_df,
    read_csv(rnaseq_de_res_file) |>
      rename(feature_id = gene_id) |>
      mutate(dataset = "rnaseq")
  ),

  ## Adding the differential expression results to the MultiDataSet object
  tar_target(
    mo_set_de,
    add_features_metadata(mo_set, rnaseq_de_res_df)
  )
)