Skip to content

Commit

Permalink
fresh rewrite as hf
Browse files Browse the repository at this point in the history
  • Loading branch information
cboettig committed Aug 9, 2024
1 parent 3c44ce0 commit ef3e913
Show file tree
Hide file tree
Showing 3 changed files with 143 additions and 60 deletions.
79 changes: 79 additions & 0 deletions data-raw/huggingface.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Base URL of the Hugging Face site; interpolated as `{hf}` into the API and
# download URLs built by the functions in this script.
hf <- "https://huggingface.co"


# Build download URLs for every entry listed under a repository path.
#
# Reads the JSON tree listing for `path` from the Hugging Face API, takes the
# "path" field of each listed entry, and returns one "resolve" URL per entry.
#
# path:   path inside the repository whose tree listing is read.
# repo:   repository identifier (including the "datasets/" prefix).
# branch: branch name used in both the listing URL and the result URLs.
#
# Returns a glue character vector of URLs. Uses the file-level `hf` base URL.
hf_urls <- function(path = "data/fb/v24.07/parquet",
                    repo = "datasets/cboettig/fishbase",
                    branch = "main") {
  listing_url <- glue::glue("{hf}/api/{repo}/tree/{branch}/{path}")
  entries <- jsonlite::read_json(listing_url)
  entry_paths <- purrr::map_chr(entries, "path")

  # Inside this call `path` is rebound to the listed entry paths, so the
  # template is expanded once per entry.
  glue::glue("{hf}/{repo}/resolve/{branch}/{path}", path = entry_paths)
}


# Translate a server name into the short code used in repository paths.
#
# server: "fishbase" or "sealifebase" (validated with match.arg, so the
#         default vector resolves to "fishbase").
#
# Returns "fb" for "fishbase" and "slb" for "sealifebase".
server_code <- function(server = c("fishbase", "sealifebase")) {
  server <- match.arg(server)
  codes <- c(fishbase = "fb", sealifebase = "slb")
  codes[[server]]
}

# List the release versions available for a server.
#
# Reads the Hugging Face tree listing of "data/<code>" (where <code> comes
# from server_code()) and extracts the "NN.NN" part of every entry whose path
# contains "/vNN.NN".
#
# server: "fishbase" or "sealifebase".
# branch: branch of the repository to list; defaults to "main", matching
#         hf_urls(). Previously `{branch}` was interpolated without being
#         defined anywhere in this function, so the lookup failed.
#
# Returns a character vector of version strings such as "23.05". Entries that
# do not look like a version directory are dropped instead of being returned
# as NA, so callers can safely take max() of the result.
available_releases <- function(server = c("fishbase", "sealifebase"),
                               branch = "main") {
  sv <- server_code(server)
  repo <- "datasets/cboettig/fishbase"
  path <- glue::glue("data/{sv}")

  versions <-
    glue::glue("{hf}/api/{repo}/tree/{branch}/{path}") |>
    jsonlite::read_json() |>
    purrr::map_chr("path") |>
    stringr::str_extract("\\/v(\\d{2}\\.\\d{2})", group = 1)

  versions[!is.na(versions)]
}
# "23.05" "23.01" "21.06" "19.04"
#url |> duckdbfs::open_dataset()

# Return the parquet file URLs for one release of a server's data.
#
# server:  "fishbase" or "sealifebase".
# version: a version string such as "23.05", or "latest" to use the largest
#          value returned by available_releases(server).
#
# Returns whatever hf_urls() returns for "data/<code>/v<version>/parquet".
fb_urls <- function(server = c("fishbase", "sealifebase"),
                    version = "latest") {
  if (version == "latest") {
    releases <- available_releases(server)
    version <- max(releases)
  }

  code <- server_code(server)
  parquet_dir <- glue::glue("data/{code}/v{version}/parquet")

  hf_urls(parquet_dir)
}

# List the table names available for a server/release.
#
# server:  "fishbase" or "sealifebase".
# version: a version string, or "latest".
#
# Returns the base names of the URLs from fb_urls() with the trailing
# ".parquet" extension removed.
fb_tables <- function(server = c("fishbase", "sealifebase"),
                      version = "latest") {
  fb_urls(server, version) |>
    basename() |>
    # Escape the dot and anchor at the end: the pattern is a regex, so a bare
    # ".parquet" would match any character followed by "parquet" anywhere in
    # the name.
    stringr::str_remove("\\.parquet$")
}

# Open (and optionally collect) one table from a server/release.
#
# tbl:     name of the table, i.e. the parquet file name without extension.
# server:  "fishbase" or "sealifebase".
# version: a version string, or "latest".
# collect: if TRUE, the opened dataset is passed through dplyr::collect()
#          before being returned; otherwise the result of
#          duckdbfs::open_dataset() is returned as is.
#
# Stops with an error when `tbl` is not among the tables found for the
# requested server/release.
fb_table <- function(tbl,
                     server = c("fishbase", "sealifebase"),
                     version = "latest",
                     collect = TRUE) {
  urls <- fb_urls(server, version)

  # The pattern is a regex: escape the dot and anchor it so that only the
  # trailing extension is stripped.
  tbl_names <- urls |>
    basename() |>
    stringr::str_remove("\\.parquet$")
  names(urls) <- tbl_names

  # Indexing by an unknown name would yield NA and fail later with an
  # unhelpful message, so reject it up front.
  unknown <- setdiff(tbl, tbl_names)
  if (length(unknown) > 0) {
    stop(
      "Unknown table(s): ", paste(unknown, collapse = ", "),
      call. = FALSE
    )
  }

  url <- urls[tbl]

  out <- duckdbfs::open_dataset(url)
  if (collect) {
    out <- dplyr::collect(out)
  }

  out
}

60 changes: 0 additions & 60 deletions data-raw/import_db.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,32 +21,6 @@ for(table in tables){
#arrow::write_parquet(df, paste0(table,".parquet"))
}

# Provenance record for the FishBase parquet snapshot.
# Path of the provenance file that is written to and re-framed below.
fb.prov <- "inst/prov/fb.prov"

# Calls prov::write_prov() with append=TRUE and fb.prov as the `prov` target.
# data_out is one URL per file found in the local snapshot directory: the
# GitHub raw prefix pasted onto each file's base name.
# NOTE(review): presumably this appends a schema.org Dataset entry to fb.prov;
# confirm against the prov package documentation.
prov::write_prov(
data_out = paste0("https://github.com/cboettig/rfishbase_board/raw/main/fb_parquet_2023-05/",
basename(fs::dir_ls("../rfishbase_board/fb_parquet_2023-05/"))),
title = "FishBase Snapshots v23.05",
description = "Parquet formatted Snapshots of FishBase Tables, distributed by rOpenSci",
license = "https://creativecommons.org/licenses/by-nc/3.0/",
creator = list("type" = "Organization", name = "FishBase.org", "@id" = "https://fishbase.org"),
version = "23.05",
issued = "2023-02-01",
prov=fb.prov,
schema="http://schema.org",
append=TRUE)

# Copy fb.prov to fb_prov.json, then copy that file back over fb.prov.
# NOTE(review): the first copy passes no `overwrite` argument -- confirm the
# behaviour when fb_prov.json already exists.
fs::file_copy(fb.prov, "fb_prov.json")
fs::file_copy("fb_prov.json", fb.prov, overwrite = TRUE)

# Frame the provenance file with a schema.org "Dataset" frame and write the
# result back to the same path.
jsonld::jsonld_frame(fb.prov,
'{
"@context": "http://schema.org/",
"@type": "Dataset"
}') |>
readr::write_lines(fb.prov)





Expand All @@ -64,37 +38,3 @@ for(table in tables){
}


# Provenance record for the SeaLifeBase parquet snapshot.
# One URL per entry of `tables`, under the slb_parquet_2023-01 prefix.
# NOTE(review): `urls` is not used again in the visible lines, and it points
# at 2023-01 while the write_prov() call below uses 2023-05 -- confirm.
urls <- paste0("https://github.com/cboettig/rfishbase_board/raw/main/slb_parquet_2023-01/", tables, ".parquet")


# Path of the provenance file that is written to and re-framed below.
slb.prov <- "inst/prov/slb.prov"

# Calls prov::write_prov() with append=TRUE and slb.prov as the `prov` target.
# data_out is one URL per file found in the local snapshot directory.
# NOTE(review): the description text says "FishBase Tables" although the title
# is SeaLifeBase -- confirm whether that is intended.
prov::write_prov(
data_out = paste0("https://github.com/cboettig/rfishbase_board/raw/main/slb_parquet_2023-05/",
basename(fs::dir_ls("../rfishbase_board/slb_parquet_2023-05/"))),
title = "SeaLifeBase Snapshots v23.05",
description = "Parquet formatted Snapshots of FishBase Tables, distributed by rOpenSci",
license = "https://creativecommons.org/licenses/by-nc/3.0/",
creator = list("type" = "Organization", name = "FishBase.org", "@id" = "https://fishbase.org"),
version = "23.05",
issued = "2023-02-01",
prov=slb.prov,
schema="http://schema.org",
append=TRUE)

# Keep a copy of the provenance file as slb_prov.json; the copy-back step is
# commented out here.
fs::file_copy(slb.prov, "slb_prov.json")
#fs::file_copy("slb_prov.json", slb.prov, overwrite = TRUE)

# Frame the provenance file with a schema.org "Dataset" frame and write the
# result back to the same path.
jsonld::jsonld_frame(slb.prov,
'{
"@context": "http://schema.org/",
"@type": "Dataset"
}') |>
readr::write_lines(slb.prov)



#mc("cp -r /home/cboettig/cboettig/rfishbase_board/fb_parquet_2023-01 thelio/shared-data/fishbase/")
#mc("cp -r slb_parquet_2023-01 thelio/shared-data/fishbase/")


64 changes: 64 additions & 0 deletions data-raw/write-prov.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@

# Provenance record for the FishBase parquet snapshot.
# Path of the provenance file that is written to and re-framed below.
fb.prov <- "inst/prov/fb.prov"

# Calls prov::write_prov() with append=TRUE and fb.prov as the `prov` target.
# data_out is one URL per file found in the local snapshot directory: the
# GitHub raw prefix pasted onto each file's base name.
# NOTE(review): the values in this call disagree with each other -- the URL
# prefix says "fb_parquet_2023-05", the local directory listed is
# "fb_parquet_2023-07", the title says v24.07, while version and issued are
# "23.05" and "2023-02-01". Confirm which release this entry describes and
# align all five values.
prov::write_prov(
data_out = paste0("https://github.com/cboettig/rfishbase_board/raw/main/fb_parquet_2023-05/",
basename(fs::dir_ls("../rfishbase_board/fb_parquet_2023-07/"))),
title = "FishBase Snapshots v24.07",
description = "Parquet formatted Snapshots of FishBase Tables, distributed by rOpenSci",
license = "https://creativecommons.org/licenses/by-nc/3.0/",
creator = list("type" = "Organization", name = "FishBase.org", "@id" = "https://fishbase.org"),
version = "23.05",
issued = "2023-02-01",
prov=fb.prov,
schema="http://schema.org",
append=TRUE)

# Copy fb.prov to fb_prov.json, then copy that file back over fb.prov.
# NOTE(review): the first copy passes no `overwrite` argument -- confirm the
# behaviour when fb_prov.json already exists.
fs::file_copy(fb.prov, "fb_prov.json")
fs::file_copy("fb_prov.json", fb.prov, overwrite = TRUE)

# Frame the provenance file with a schema.org "Dataset" frame and write the
# result back to the same path.
jsonld::jsonld_frame(fb.prov,
'{
"@context": "http://schema.org/",
"@type": "Dataset"
}') |>
readr::write_lines(fb.prov)






# Provenance record for the SeaLifeBase parquet snapshot.
# One URL per entry of `tables`, under the slb_parquet_2023-01 prefix.
# NOTE(review): `tables` is not assigned anywhere in the visible part of this
# script, and `urls` is not used again below -- confirm whether this line is
# still needed here. It also points at 2023-01 while the write_prov() call
# below uses 2023-05.
urls <- paste0("https://github.com/cboettig/rfishbase_board/raw/main/slb_parquet_2023-01/", tables, ".parquet")


# Path of the provenance file that is written to and re-framed below.
slb.prov <- "inst/prov/slb.prov"

# Calls prov::write_prov() with append=TRUE and slb.prov as the `prov` target.
# data_out is one URL per file found in the local snapshot directory.
# NOTE(review): the description text says "FishBase Tables" although the title
# is SeaLifeBase -- confirm whether that is intended.
prov::write_prov(
data_out = paste0("https://github.com/cboettig/rfishbase_board/raw/main/slb_parquet_2023-05/",
basename(fs::dir_ls("../rfishbase_board/slb_parquet_2023-05/"))),
title = "SeaLifeBase Snapshots v23.05",
description = "Parquet formatted Snapshots of FishBase Tables, distributed by rOpenSci",
license = "https://creativecommons.org/licenses/by-nc/3.0/",
creator = list("type" = "Organization", name = "FishBase.org", "@id" = "https://fishbase.org"),
version = "23.05",
issued = "2023-02-01",
prov=slb.prov,
schema="http://schema.org",
append=TRUE)

# Keep a copy of the provenance file as slb_prov.json; the copy-back step is
# commented out here.
fs::file_copy(slb.prov, "slb_prov.json")
#fs::file_copy("slb_prov.json", slb.prov, overwrite = TRUE)

# Frame the provenance file with a schema.org "Dataset" frame and write the
# result back to the same path.
jsonld::jsonld_frame(slb.prov,
'{
"@context": "http://schema.org/",
"@type": "Dataset"
}') |>
readr::write_lines(slb.prov)



#mc("cp -r /home/cboettig/cboettig/rfishbase_board/fb_parquet_2023-01 thelio/shared-data/fishbase/")
#mc("cp -r slb_parquet_2023-01 thelio/shared-data/fishbase/")

0 comments on commit ef3e913

Please sign in to comment.