Skip to contents

'returnSamples' takes tables with sample and feature information and retrieves the relevant data as a TreeSummarizedExperiment.

Usage

returnSamples(
  data_type,
  sample_data = NULL,
  feature_data = NULL,
  repo = NULL,
  local_files = NULL,
  include_empty_samples = TRUE,
  dry_run = FALSE
)

Arguments

data_type

Single string: value found in the 'data_type' column of output_file_types() and also as part of the name of a view found in DBI::dbListTables(con), indicating which views to consider when collecting data.

sample_data

Data frame: a table of sample metadata with a 'uuid' column. Often created by accessing 'data(sampleMetadata)' and filtering or otherwise transforming the result to only include samples of interest.

feature_data

Data frame: a table of feature data. Each column will become a filtering argument. Often created by accessing one of the files listed in 'get_ref_info()' with 'load_ref()', then filtering or otherwise transforming the result to only include feature combinations of interest.

repo

String (optional): Hugging Face repo where the parquet files are stored. If NULL and local_files is also NULL, the repo listed as the default in get_repo_info() will be selected. Default: NULL

local_files

String or vector of strings (optional): path(s) to parquet file(s). If the elements are named, those names will be used for the created views instead of being inferred from the file names. Default: NULL

include_empty_samples

Boolean (optional): should samples listed in the 'uuid' column of 'sample_data' be included in the final TreeSummarizedExperiment even if they do not show up in the results from filtering the source parquet data file? Default: TRUE

dry_run

Boolean (optional): if TRUE, the function will return the tbl_duckdb_connection object prior to calling 'dplyr::collect'. Default: FALSE

Value

A TreeSummarizedExperiment object with process metadata, row data, column names, and relevant assays. If dry_run = TRUE, a tbl_duckdb_connection object is returned instead.

Details

Files stored remotely and locally cannot be combined in the same connection.

See also

Examples

# \donttest{
 if (!exists("sampleMetadata", envir = environment())) {
     utils::data("sampleMetadata", package = "parkinsonsMetagenomicData",
     envir = environment())
 }

 table(sampleMetadata$control, useNA = "ifany")
#> 
#>                      Case External Comparison Group Internal Comparison Group 
#>                      1311                        90                        59 
#>   Multiple System Atrophy             Study Control                      <NA> 
#>                         8                      2052                        15 
 sample_data <- sampleMetadata |>
     dplyr::filter(control %in% c("Case", "Study Control") &
                    age >= 16 &
                    !is.na(sex))
 sample_data_small <- sample_data[seq(15),]

 clade_name_ref <- load_ref("clade_name_ref")
 feature_data_genus <- clade_name_ref |>
     dplyr::filter(grepl("Faecalibacterium", clade_name_genus)) |>
     dplyr::select(clade_name_genus)

 genus_ex <- returnSamples(data_type = "relative_abundance",
                           sample_data = sample_data_small,
                           feature_data = feature_data_genus)
 genus_ex
#> class: TreeSummarizedExperiment 
#> dim: 16 15 
#> metadata(0):
#> assays(1): relative_abundance
#> rownames(16):
#>   k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Oscillospiraceae|g__Faecalibacterium|s__Faecalibacterium_prausnitzii|t__SGB15317
#>   k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Oscillospiraceae|g__Faecalibacterium|s__Faecalibacterium_prausnitzii|t__SGB15316
#>   ...
#>   k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Oscillospiraceae|g__Faecalibacterium|s__Faecalibacterium_prausnitzii|t__SGB15339
#>   k__Bacteria|p__Firmicutes|c__Clostridia|o__Eubacteriales|f__Oscillospiraceae|g__Faecalibacterium|s__Faecalibacterium_prausnitzii|t__SGB15322
#> rowData names(19): clade_name clade_name_kingdom ...
#>   NCBI_tax_id_terminal additional_species
#> colnames(15): 223148f5-c220-40ff-b056-001d2039dccf
#>   22b413e8-b819-4813-b8b7-fefd2a308a6c ...
#>   a335b8b1-4f1c-49d8-acc9-ea77bef4c219
#>   f1735869-be5c-4bc3-a7c1-a1f42d9ce872
#> colData names(51): uuid db_version ...
#>   AsnicarF_2021_uncurated_treatment AsnicarF_2021_uncurated_pregnant
#> reducedDimNames(0):
#> mainExpName: NULL
#> altExpNames(0):
#> rowLinks: NULL
#> rowTree: NULL
#> colLinks: NULL
#> colTree: NULL
# }

if (!exists("sampleMetadata", envir = environment())) {
    utils::data("sampleMetadata", package = "parkinsonsMetagenomicData",
    envir = environment())
}

uuids <- c("8793b1dc-3ba1-4591-82b8-4297adcfa1d7",
           "cc1f30a0-45d9-41b1-b592-7d0892919ee7",
           "fb7e8210-002a-4554-b265-873c4003e25f",
           "d9cc81ea-c39e-46a6-a6f9-eb5584b87706",
           "4985aa08-6138-4146-8ae3-952716575395",
           "8eb9f7ae-88c2-44e5-967e-fe7f6090c7af")

sample_data <- sampleMetadata |>
    dplyr::filter(uuid %in% uuids) |>
    dplyr::select(where(~ !any(is.na(.x))))

fpaths <- c(file.path(system.file("extdata",
                                  package = "parkinsonsMetagenomicData"),
                      "pathcoverage_unstratified_uuid.parquet"),
            file.path(system.file("extdata",
                                  package = "parkinsonsMetagenomicData"),
                      "pathcoverage_unstratified_pathway.parquet"))

refpath <- file.path(system.file("extdata",
                                 package = "parkinsonsMetagenomicData"),
                     "pathway_ref.parquet")

pathway_ref <- load_ref("pathway_ref", file_path = refpath)
feature_data_genus <- pathway_ref |>
    dplyr::filter(grepl("Faecalibacterium", pathway_genus)) |>
    dplyr::select(pathway_uniref) |>
    dplyr::rename(pathway = pathway_uniref)

genus_ex <- returnSamples(data_type = "pathcoverage_unstratified",
                          sample_data = sample_data,
                          feature_data = feature_data_genus,
                          local_files = fpaths,
                          include_empty_samples = FALSE)
genus_ex
#> class: TreeSummarizedExperiment 
#> dim: 2 6 
#> metadata(0):
#> assays(1): coverage
#> rownames(2): ANAGLYCOLYSIS-PWY: glycolysis III (from glucose)
#>   ARGSYN-PWY: L-arginine biosynthesis I (via L-ornithine)
#> rowData names(1): pathway
#> colnames(6): d9cc81ea-c39e-46a6-a6f9-eb5584b87706
#>   4985aa08-6138-4146-8ae3-952716575395 ...
#>   8793b1dc-3ba1-4591-82b8-4297adcfa1d7
#>   8eb9f7ae-88c2-44e5-967e-fe7f6090c7af
#> colData names(305): uuid humann_header ...
#>   WallenZD_2022_uncurated_Day_of_stool_collection_digestion_issue
#>   WallenZD_2022_uncurated_Day_of_stool_collection_constipation
#> reducedDimNames(0):
#> mainExpName: NULL
#> altExpNames(0):
#> rowLinks: NULL
#> rowTree: NULL
#> colLinks: NULL
#> colTree: NULL