Skip to contents

'accessParquetData' is a wrapper function for 'db_connect' and 'retrieve_views'. It establishes a DuckDB connection and creates views for either all provided local files or all data types available in a repo of interest (see inst/extdata/parquet_repos.csv). When using a remote repo, a vector of specific data types can be supplied, because creating views for all available data types can take longer.

Usage

accessParquetData(
  dbdir = ":memory:",
  repo = NULL,
  local_files = NULL,
  data_types = NULL
)

Arguments

dbdir

Location for database files. Should be a path to an existing directory in the file system or the value ':memory:' to keep data in RAM. Default: ':memory:'

repo

String (optional): Hugging Face repo where the parquet files are stored. If NULL and local_files is also NULL, the repo listed as the default in get_repo_info() will be selected. Default: NULL

local_files

String or vector of strings (optional): path(s) to parquet file(s). If the elements are named, those names will be used for the created views instead of being inferred from the file names. Default: NULL

data_types

Character vector (optional): when using a remote repo, the data types for which database views should be established. If NULL, views will be created for all available data types. Default: NULL

Value

DuckDB connection object of class 'duckdb_connection'

Details

Files stored remotely and locally cannot be combined in the same connection.

Examples

# \donttest{
 prepared_db <- accessParquetData()
#> The following data types are not present in the repo waldronlab/metagenomics_mac and will be skipped:
#> strainphlan_markers, fastqc, kneaddata_log
 DBI::dbListTables(prepared_db)
#>  [1] "clade_name_ref"                                  
#>  [2] "gene_family_ref"                                 
#>  [3] "genefamilies_cpm_gene_family_uniref"             
#>  [4] "genefamilies_cpm_stratified_gene_family_uniref"  
#>  [5] "genefamilies_cpm_stratified_uuid"                
#>  [6] "genefamilies_cpm_unstratified_gene_family"       
#>  [7] "genefamilies_cpm_unstratified_uuid"              
#>  [8] "genefamilies_cpm_uuid"                           
#>  [9] "genefamilies_gene_family_uniref"                 
#> [10] "genefamilies_relab_gene_family_uniref"           
#> [11] "genefamilies_relab_stratified_gene_family_uniref"
#> [12] "genefamilies_relab_stratified_uuid"              
#> [13] "genefamilies_relab_unstratified_gene_family"     
#> [14] "genefamilies_relab_unstratified_uuid"            
#> [15] "genefamilies_relab_uuid"                         
#> [16] "genefamilies_stratified_gene_family_uniref"      
#> [17] "genefamilies_stratified_uuid"                    
#> [18] "genefamilies_unstratified_gene_family"           
#> [19] "genefamilies_unstratified_uuid"                  
#> [20] "genefamilies_uuid"                               
#> [21] "genome_name_ref"                                 
#> [22] "marker_abundance_uniref"                         
#> [23] "marker_abundance_uuid"                           
#> [24] "marker_presence_uniref"                          
#> [25] "marker_presence_uuid"                            
#> [26] "pathabundance_cpm_pathway_uniref"                
#> [27] "pathabundance_cpm_stratified_pathway_uniref"     
#> [28] "pathabundance_cpm_stratified_uuid"               
#> [29] "pathabundance_cpm_unstratified_pathway"          
#> [30] "pathabundance_cpm_unstratified_uuid"             
#> [31] "pathabundance_cpm_uuid"                          
#> [32] "pathabundance_pathway_uniref"                    
#> [33] "pathabundance_relab_pathway_uniref"              
#> [34] "pathabundance_relab_stratified_pathway_uniref"   
#> [35] "pathabundance_relab_stratified_uuid"             
#> [36] "pathabundance_relab_unstratified_pathway"        
#> [37] "pathabundance_relab_unstratified_uuid"           
#> [38] "pathabundance_relab_uuid"                        
#> [39] "pathabundance_stratified_pathway_uniref"         
#> [40] "pathabundance_stratified_uuid"                   
#> [41] "pathabundance_unstratified_pathway"              
#> [42] "pathabundance_unstratified_uuid"                 
#> [43] "pathabundance_uuid"                              
#> [44] "pathcoverage_pathway_uniref"                     
#> [45] "pathcoverage_stratified_pathway_uniref"          
#> [46] "pathcoverage_stratified_uuid"                    
#> [47] "pathcoverage_unstratified_pathway"               
#> [48] "pathcoverage_unstratified_uuid"                  
#> [49] "pathcoverage_uuid"                               
#> [50] "pathway_ref"                                     
#> [51] "relative_abundance_clade_name_species"           
#> [52] "relative_abundance_uuid"                         
#> [53] "uniref_marker_ref"                               
#> [54] "viral_clusters_genome_name"                      
#> [55] "viral_clusters_uuid"                             
# }
# \donttest{
 single_type <- accessParquetData(data_types = "pathcoverage_unstratified")
 DBI::dbListTables(single_type)
#> [1] "pathcoverage_unstratified_pathway" "pathcoverage_unstratified_uuid"   
# }

fpaths <- c(file.path(system.file("extdata",
                                  package = "parkinsonsMetagenomicData"),
                      "pathcoverage_unstratified_uuid.parquet"),
            file.path(system.file("extdata",
                                  package = "parkinsonsMetagenomicData"),
                      "pathcoverage_unstratified_pathway.parquet"))

local_db <- accessParquetData(local_files = fpaths,
                              data_types = "pathcoverage_unstratified")
DBI::dbListTables(local_db)
#> [1] "pathcoverage_unstratified_pathway" "pathcoverage_unstratified_uuid"