Return a TreeSummarizedExperiment with data based on sample data and feature data tables
Source:R/readParquet.R
returnSamples.Rd
'returnSamples' takes tables with sample and feature information and retrieves the relevant data as a TreeSummarizedExperiment.
Usage
returnSamples(
data_type,
sample_data = NULL,
feature_data = NULL,
repo = NULL,
local_files = NULL,
include_empty_samples = TRUE,
dry_run = FALSE
)
Arguments
- data_type
Single string: a value found in the 'data_type' column of output_file_types() and also as part of the name of a view found in DBI::dbListTables(con), indicating which views to consider when collecting data.
- sample_data
Data frame: a table of sample metadata with a 'uuid' column. Often created by accessing 'data(sampleMetadata)' and filtering or otherwise transforming the result to only include samples of interest.
- feature_data
Data frame: a table of feature data. Each column will become a filtering argument. Often created by accessing one of the files listed in 'get_ref_info()' with 'load_ref()', then filtering or otherwise transforming the result to only include feature combinations of interest.
- repo
String (optional): Hugging Face repo where the parquet files are stored. If NULL and local_files is also NULL, the repo listed as the default in get_repo_info() will be selected. Default: NULL
- local_files
String or vector of strings (optional): path(s) to parquet file(s). If the elements are named, those names will be used for the created views instead of being inferred from the file names. Default: NULL
- include_empty_samples
Boolean (optional): whether samples listed in the 'uuid' column of 'sample_data' should be included in the final TreeSummarizedExperiment even if they do not appear in the results of filtering the source parquet data file. Default: TRUE
- dry_run
Boolean (optional): if TRUE, the function will return the tbl_duckdb_connection object prior to calling 'dplyr::collect'. Default: FALSE
Value
A TreeSummarizedExperiment object with process metadata, row data, column names, and relevant assays. If dry_run = TRUE, a tbl_duckdb_connection object.
Examples
# \donttest{
if (!exists("sampleMetadata", envir = environment())) {
utils::data("sampleMetadata", package = "parkinsonsMetagenomicData",
envir = environment())
}
table(sampleMetadata$control, useNA = "ifany")
#>
#> Case External Comparison Group Internal Comparison Group
#> 1311 90 59
#> Multiple System Atrophy Study Control <NA>
#> 8 2052 15
sample_data <- sampleMetadata |>
dplyr::filter(control %in% c("Case", "Study Control") &
age >= 16 &
!is.na(sex))
sample_data_small <- sample_data[seq(15),]
clade_name_ref <- load_ref("clade_name_ref")
feature_data_genus <- clade_name_ref |>
dplyr::filter(grepl("Faecalibacterium", clade_name_genus)) |>
dplyr::select(clade_name_genus)
genus_ex <- returnSamples(data_type = "relative_abundance",
sample_data = sample_data_small,
feature_data = feature_data_genus)
genus_ex
#> class: TreeSummarizedExperiment
#> dim: 16 15
#> metadata(0):
#> assays(1): relative_abundance
#> rownames(16):
#> k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Oscillospiraceae|g__Faecalibacterium|s__Faecalibacterium_prausnitzii|t__SGB15317
#> k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Oscillospiraceae|g__Faecalibacterium|s__Faecalibacterium_prausnitzii|t__SGB15316
#> ...
#> k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Oscillospiraceae|g__Faecalibacterium|s__Faecalibacterium_prausnitzii|t__SGB15339
#> k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Oscillospiraceae|g__Faecalibacterium|s__Faecalibacterium_prausnitzii|t__SGB15322
#> rowData names(19): clade_name clade_name_kingdom ...
#> NCBI_tax_id_terminal additional_species
#> colnames(15): 223148f5-c220-40ff-b056-001d2039dccf
#> 22b413e8-b819-4813-b8b7-fefd2a308a6c ...
#> a335b8b1-4f1c-49d8-acc9-ea77bef4c219
#> f1735869-be5c-4bc3-a7c1-a1f42d9ce872
#> colData names(51): uuid db_version ...
#> AsnicarF_2021_uncurated_treatment AsnicarF_2021_uncurated_pregnant
#> reducedDimNames(0):
#> mainExpName: NULL
#> altExpNames(0):
#> rowLinks: NULL
#> rowTree: NULL
#> colLinks: NULL
#> colTree: NULL
# }
if (!exists("sampleMetadata", envir = environment())) {
utils::data("sampleMetadata", package = "parkinsonsMetagenomicData",
envir = environment())
}
uuids <- c("8793b1dc-3ba1-4591-82b8-4297adcfa1d7",
"cc1f30a0-45d9-41b1-b592-7d0892919ee7",
"fb7e8210-002a-4554-b265-873c4003e25f",
"d9cc81ea-c39e-46a6-a6f9-eb5584b87706",
"4985aa08-6138-4146-8ae3-952716575395",
"8eb9f7ae-88c2-44e5-967e-fe7f6090c7af")
sample_data <- sampleMetadata |>
dplyr::filter(uuid %in% uuids) |>
dplyr::select(where(~ !any(is.na(.x))))
fpaths <- c(file.path(system.file("extdata",
package = "parkinsonsMetagenomicData"),
"pathcoverage_unstratified_uuid.parquet"),
file.path(system.file("extdata",
package = "parkinsonsMetagenomicData"),
"pathcoverage_unstratified_pathway.parquet"))
refpath <- file.path(system.file("extdata",
package = "parkinsonsMetagenomicData"),
"pathway_ref.parquet")
pathway_ref <- load_ref("pathway_ref", file_path = refpath)
feature_data_genus <- pathway_ref |>
dplyr::filter(grepl("Faecalibacterium", pathway_genus)) |>
dplyr::select(pathway_uniref) |>
dplyr::rename(pathway = pathway_uniref)
genus_ex <- returnSamples(data_type = "pathcoverage_unstratified",
sample_data = sample_data,
feature_data = feature_data_genus,
local_files = fpaths,
include_empty_samples = FALSE)
genus_ex
#> class: TreeSummarizedExperiment
#> dim: 2 6
#> metadata(0):
#> assays(1): coverage
#> rownames(2): ANAGLYCOLYSIS-PWY: glycolysis III (from glucose)
#> ARGSYN-PWY: L-arginine biosynthesis I (via L-ornithine)
#> rowData names(1): pathway
#> colnames(6): d9cc81ea-c39e-46a6-a6f9-eb5584b87706
#> 4985aa08-6138-4146-8ae3-952716575395 ...
#> 8793b1dc-3ba1-4591-82b8-4297adcfa1d7
#> 8eb9f7ae-88c2-44e5-967e-fe7f6090c7af
#> colData names(305): uuid humann_header ...
#> WallenZD_2022_uncurated_Day_of_stool_collection_digestion_issue
#> WallenZD_2022_uncurated_Day_of_stool_collection_constipation
#> reducedDimNames(0):
#> mainExpName: NULL
#> altExpNames(0):
#> rowLinks: NULL
#> rowTree: NULL
#> colLinks: NULL
#> colTree: NULL