Set up DuckDB connection with views for available data types
Source: R/readParquet.R
accessParquetData.Rd

'accessParquetData' is a wrapper function for 'db_connect' and 'retrieve_views'. A DuckDB connection is established and views are created either for all provided local files or for all data types available in a repo of interest (see inst/extdata/parquet_repos.csv). When using a remote repo, a vector of specific data types can be supplied, since creating views for all data types can take longer.
Arguments
- dbdir
Location for database files. Should be a path to an existing directory in the file system or the value ':memory:' to keep data in RAM. Default: ':memory:'
- repo
String (optional): Hugging Face repo where the parquet files are stored. If NULL and local_files is also NULL, the repo listed as the default in get_repo_info() will be selected. Default: NULL
- local_files
String or vector of strings (optional): path(s) to parquet file(s). If the elements are named, those names will be used for the created views instead of inferring them from the file names. Default: NULL
- data_types
Character vector (optional): when using a remote repo, a list of data types to establish database views for. If NULL, views will be created for all available data types. Default: NULL
Examples
# \donttest{
prepared_db <- accessParquetData()
#> The following data types are not present in the repo waldronlab/metagenomics_mac and will be skipped:
#> strainphlan_markers, fastqc, kneaddata_log
DBI::dbListTables(prepared_db)
#> [1] "clade_name_ref"
#> [2] "gene_family_ref"
#> [3] "genefamilies_cpm_gene_family_uniref"
#> [4] "genefamilies_cpm_stratified_gene_family_uniref"
#> [5] "genefamilies_cpm_stratified_uuid"
#> [6] "genefamilies_cpm_unstratified_gene_family"
#> [7] "genefamilies_cpm_unstratified_uuid"
#> [8] "genefamilies_cpm_uuid"
#> [9] "genefamilies_gene_family_uniref"
#> [10] "genefamilies_relab_gene_family_uniref"
#> [11] "genefamilies_relab_stratified_gene_family_uniref"
#> [12] "genefamilies_relab_stratified_uuid"
#> [13] "genefamilies_relab_unstratified_gene_family"
#> [14] "genefamilies_relab_unstratified_uuid"
#> [15] "genefamilies_relab_uuid"
#> [16] "genefamilies_stratified_gene_family_uniref"
#> [17] "genefamilies_stratified_uuid"
#> [18] "genefamilies_unstratified_gene_family"
#> [19] "genefamilies_unstratified_uuid"
#> [20] "genefamilies_uuid"
#> [21] "genome_name_ref"
#> [22] "marker_abundance_uniref"
#> [23] "marker_abundance_uuid"
#> [24] "marker_presence_uniref"
#> [25] "marker_presence_uuid"
#> [26] "pathabundance_cpm_pathway_uniref"
#> [27] "pathabundance_cpm_stratified_pathway_uniref"
#> [28] "pathabundance_cpm_stratified_uuid"
#> [29] "pathabundance_cpm_unstratified_pathway"
#> [30] "pathabundance_cpm_unstratified_uuid"
#> [31] "pathabundance_cpm_uuid"
#> [32] "pathabundance_pathway_uniref"
#> [33] "pathabundance_relab_pathway_uniref"
#> [34] "pathabundance_relab_stratified_pathway_uniref"
#> [35] "pathabundance_relab_stratified_uuid"
#> [36] "pathabundance_relab_unstratified_pathway"
#> [37] "pathabundance_relab_unstratified_uuid"
#> [38] "pathabundance_relab_uuid"
#> [39] "pathabundance_stratified_pathway_uniref"
#> [40] "pathabundance_stratified_uuid"
#> [41] "pathabundance_unstratified_pathway"
#> [42] "pathabundance_unstratified_uuid"
#> [43] "pathabundance_uuid"
#> [44] "pathcoverage_pathway_uniref"
#> [45] "pathcoverage_stratified_pathway_uniref"
#> [46] "pathcoverage_stratified_uuid"
#> [47] "pathcoverage_unstratified_pathway"
#> [48] "pathcoverage_unstratified_uuid"
#> [49] "pathcoverage_uuid"
#> [50] "pathway_ref"
#> [51] "relative_abundance_clade_name_species"
#> [52] "relative_abundance_uuid"
#> [53] "uniref_marker_ref"
#> [54] "viral_clusters_genome_name"
#> [55] "viral_clusters_uuid"
# }
# \donttest{
single_type <- accessParquetData(data_types = "pathcoverage_unstratified")
DBI::dbListTables(single_type)
#> [1] "pathcoverage_unstratified_pathway" "pathcoverage_unstratified_uuid"
# }
fpaths <- c(file.path(system.file("extdata",
package = "parkinsonsMetagenomicData"),
"pathcoverage_unstratified_uuid.parquet"),
file.path(system.file("extdata",
package = "parkinsonsMetagenomicData"),
"pathcoverage_unstratified_pathway.parquet"))
local_db <- accessParquetData(local_files = fpaths,
data_types = "pathcoverage_unstratified")
DBI::dbListTables(local_db)
#> [1] "pathcoverage_unstratified_pathway" "pathcoverage_unstratified_uuid"