Skip to content

Commit

Permalink
Merge pull request #94 from PIP-Technical-Team/auto-aux-update-check
Browse files Browse the repository at this point in the history
Auto aux update check
  • Loading branch information
randrescastaneda authored Mar 4, 2024
2 parents 43f9649 + caa1309 commit 3d4c998
Showing 1 changed file with 125 additions and 60 deletions.
185 changes: 125 additions & 60 deletions R/auto_aux_update.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,30 +10,31 @@ auto_aux_update <- function(measure = NULL,
maindir = gls$PIP_DATA_DIR,
owner = getOption("pipfun.ghowner"),
branch = c("DEV", "PROD", "main"),
tag = match.arg(branch)
) {

tag = match.arg(branch)) {
branch <- match.arg(branch)
from <- match.arg(from)
files_changed <- FALSE

isgls <- ls(sys.frame(), pattern = "^gls$") |>
length() > 0

if (isFALSE(isgls)) {
cli::cli_abort("object {.var gls} is not available in Globel env.
Run {.code gls <- pipfun::pip_create_globals()} first",
wrap = TRUE)
cli::cli_abort(
"object {.var gls} is not available in Globel env.
Run {.code gls <- pipfun::pip_create_globals()} first",
wrap = TRUE
)
}


assertthat::assert_that(Sys.getenv("GITHUB_PAT") != "",
msg = "Enviroment variable `GITHUB_PAT` is empty. Please set it up using Sys.setenv(GITHUB_PAT = 'code')")
msg = "Enviroment variable `GITHUB_PAT` is empty.
Please set it up using Sys.setenv(GITHUB_PAT = 'code')")
gh_user <- "https://raw.githubusercontent.com"
org_data <- paste(gh_user,
owner,
"pipaux/metadata/Data/git_metadata.csv",
sep = "/"
) |>
sep = "/") |>
readr::read_csv(show_col_types = FALSE)


Expand All @@ -45,24 +46,29 @@ auto_aux_update <- function(measure = NULL,
#Keep only those repos that start with "aux_"
grep("^aux_", x = _, value = TRUE)

if(!is.null(measure)) {
if (!is.null(measure)) {
all_repos <- all_repos[all_repos %in% glue::glue("aux_{measure}")]
}
# get hashs
hash <-
purrr::map(all_repos,
.f = ~{
gh::gh("GET /repos/{owner}/{repo}/commits/{branch}",
owner = owner,
repo = .x,
branch = branch)
.f = ~ {
gh::gh(
"GET /repos/{owner}/{repo}/commits/{branch}",
owner = owner,
repo = .x,
branch = branch
)
}) |>
purrr::map_chr(~.x[["sha"]])
purrr::map_chr( ~ .x[["sha"]])

# Get the latest hash of the repo
all_data <- dplyr::tibble(Repo = glue::glue("{owner}/{all_repos}"),
hash = hash,
branch = branch)
all_data <-
dplyr::tibble(
Repo = glue::glue("{owner}/{all_repos}"),
hash = hash,
branch = branch
)

old_data <- org_data %>%
dplyr::filter(.data$branch == branch) %>%
Expand All @@ -71,12 +77,17 @@ auto_aux_update <- function(measure = NULL,
old_data <- old_data %>%
dplyr::inner_join(all_data, by = c("Repo", "branch"))

cli::cli_alert_info("Number of rows from csv file : {nrow(old_data)}")
cli::cli_alert_info("Number of rows from Github : {nrow(all_data)}")
cli::cli_alert_info("Both the numbers above should be equal or else some
debugging is required.", wrap = TRUE)

new_data <- old_data %>%
dplyr::filter(.data$hash != .data$hash_original |
is.na(.data$hash_original) |
is.na(.data$hash))

all_data <- dplyr::rows_update(org_data, all_data, by = c("Repo", "branch"))
# all_data <- dplyr::rows_update(org_data, all_data, by = c("Repo", "branch"))



Expand All @@ -88,50 +99,84 @@ auto_aux_update <- function(measure = NULL,

# For each auxiliary data to be updated
cli::cli_alert_info("Updating data for {length(aux_fns)} files.")
for(aux in aux_fns) {
for (aux in aux_fns) {
# Find the corresponding functions to be run
# Add pip_ suffix so that it becomes function name
list_of_funcs <- paste0("pip_", return_value(aux, dependencies))
for(fn in list_of_funcs) {

for (fn in list_of_funcs) {

aux_file <- sub("pip_", "", fn)
cli::cli_alert_info("Running function {fn} for aux file {aux}.")

before_hash <- read_signature_file(aux_file, maindir, branch)
# Run the pip_.* function
match.fun(fn)(maindir = maindir, branch = branch) |>
suppressMessages()
}
after_hash <- read_signature_file(aux_file, maindir, branch)

if (before_hash != after_hash) {

cli::cli_alert_info("Updating csv for {fn}")
files_changed <- TRUE

# find rows of of org to be modified
aux_row_org <- org_data$Repo |>
fs::path_file() |>
sub('aux_', '', x = _) %in% aux_file &
org_data$branch == branch

# find rows in new that will be copied to org
aux_row_new <- new_data$Repo |>
fs::path_file() |>
sub('aux_', '', x = _) %in% aux_file &
new_data$branch == branch

org_data$hash[aux_row_org] <- new_data$hash[aux_row_new]

} # end of before_hash condition

} # end of list_of_funcs loop
} # end of aux_fns loop
last_updated_time <-
aux_file_last_updated(maindir, names(dependencies), branch)
if (length(aux_fns) > 0 && files_changed) {
# Write the latest auxiliary file and corresponding hash to csv
# Always save at the end.
# sha - hash object of current csv file in Data/git_metadata.csv
# content - base64 of changed data
out <- gh::gh(
"GET /repos/{owner}/{repo}/contents/{file_path}",
owner = "PIP-Technical-Team",
repo = "pipaux",
file_path = "Data/git_metadata.csv",
.params = list(ref = "metadata")
)
# There is no way to update only the lines which has changed using Github API
# We need to update the entire file every time. Refer - https://stackoverflow.com/a/21315234/3962914
res <- gh::gh(
"PUT /repos/{owner}/{repo}/contents/{path}",
owner = "PIP-Technical-Team",
repo = "pipaux",
path = "Data/git_metadata.csv",
.params = list(
branch = "metadata",
message = "updating csv file",
sha = out$sha,
content = convert_df_to_base64(org_data)
),
.token = Sys.getenv("GITHUB_PAT")
)
}

# Write the latest auxiliary file and corresponding hash to csv
# Always save at the end.
# sha - hash object of current csv file in Data/git_metadata.csv
# content - base64 of changed data
out <- gh::gh("GET /repos/{owner}/{repo}/contents/{file_path}",
owner = "PIP-Technical-Team",
repo = "pipaux",
file_path = "Data/git_metadata.csv",
.params = list(ref = "metadata"))

res <- gh::gh("PUT /repos/{owner}/{repo}/contents/{path}",
owner = "PIP-Technical-Team",
repo = "pipaux",
path = "Data/git_metadata.csv",
.params = list(branch = "metadata",
message = "updating csv file",
sha = out$sha, # why does the sha remain the same?
content = convert_df_to_base64(all_data)
),
.token = Sys.getenv("GITHUB_PAT")
)

cli::cli_h2("File updated status.")
out <- aux_file_last_updated(maindir, names(dependencies), branch)
knitr::kable(out)
knitr::kable(last_updated_time)
}


return_value <- function(aux, dependencies) {
val <- dependencies[[aux]]
if(length(val) > 0) {
for(i in val) {
if (length(val) > 0) {
for (i in val) {
val <- c(return_value(i, dependencies), val)
}
}
Expand All @@ -140,30 +185,50 @@ return_value <- function(aux, dependencies) {

convert_df_to_base64 <- function(df) {
df |>
write.table(quote = FALSE, row.names = FALSE, sep=",") |>
write.table(quote = FALSE,
row.names = FALSE,
sep = ",") |>
capture.output() |>
paste(collapse="\n") |>
paste(collapse = "\n") |>
charToRaw() |>
base64enc::base64encode()
}

aux_file_last_updated <- function(data_dir, aux_files, branch) {
filenames <- glue::glue("{data_dir}/_aux/{branch}/{aux_files}/{aux_files}.qs")
data <- sapply(filenames, function(x) qs::qattributes(x)$datetime)
data.frame(filename = basename(names(data)),
time_last_update = as.POSIXct(data, format = "%Y%m%d%H%M%S"), row.names = NULL) |>
filenames <-
glue::glue("{data_dir}/_aux/{branch}/{aux_files}/{aux_files}.qs")
data <- sapply(filenames, function(x)
qs::qattributes(x)$datetime)
data.frame(
filename = basename(names(data)),
time_last_update = as.POSIXct(data, format = "%Y%m%d%H%M%S"),
row.names = NULL
) |>
dplyr::arrange(desc(time_last_update))

}


read_dependencies <- function(gh_user, owner) {
dependencies <- paste(gh_user,
owner,
"pipaux/metadata/Data/dependency.yml",
sep = "/"
) |>
sep = "/") |>
yaml::read_yaml()

sapply(dependencies, \(x) if (length(x)) strsplit(x, ",\\s+")[[1]] else character())
sapply(dependencies, \(x) if (length(x))
strsplit(x, ",\\s+")[[1]]
else
character())
}

read_signature_file <- function(aux_file, maindir, branch) {
# Construct the path to data signature aux file
data_signature_path <-
fs::path(maindir,
"_aux",
branch,
aux_file,
glue::glue("{aux_file}_datasignature.txt"))
signature_hash <- readr::read_lines(data_signature_path)
return(signature_hash)
}

0 comments on commit 3d4c998

Please sign in to comment.