Skip to content

Commit

Permalink
scripts for wrangling data
Browse files Browse the repository at this point in the history
  • Loading branch information
Aariq committed Jan 15, 2025
1 parent a7bc978 commit 6a15a4f
Show file tree
Hide file tree
Showing 2 changed files with 83 additions and 0 deletions.
14 changes: 14 additions & 0 deletions 01-get_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
library(googledrive)
library(fs)

# Download every file listed in a Google Drive folder into a local directory.
#
# folder_url: URL of the Drive folder to list with drive_ls().
# dest_dir:   local directory the files are written to (created if missing).
#
# Returns the listing produced by drive_ls() so callers can inspect what was
# downloaded.
download_drive_folder <- function(folder_url, dest_dir) {
  files <- drive_ls(folder_url)

  # Create the destination once, up front, rather than once per file. This also
  # guarantees the directory exists even when the Drive folder is empty.
  dir_create(dest_dir)

  purrr::walk2(files$id, files$name, \(id, name) {
    # overwrite = TRUE lets the script be re-run to refresh the local copies;
    # with the default (FALSE) drive_download() errors on any existing file.
    drive_download(id, path = path(dest_dir, name), overwrite = TRUE)
  })

  files
}

usage_reports <- download_drive_folder(
  "https://drive.google.com/drive/folders/1E3LuHI1jTm22cz7xXAjScMkQcMupuH8M",
  path("data", "usage")
)

registration_reports <- download_drive_folder(
  "https://drive.google.com/drive/folders/1DOVUhsCiW-nE-6q3ehsjBEYulzt_l4Pl",
  path("data", "registration")
)
69 changes: 69 additions & 0 deletions 02-wrangle_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
library(readr)
library(dplyr)
library(fs)
library(purrr)
library(janitor)

# List the report files found under data/usage and data/registration.
usage_reports <- path("data", "usage") |> dir_ls()
reg_reports <- path("data", "registration") |> dir_ls()

# Read one usage report CSV and return its participant table, tidied.
#
# Two file layouts are handled, told apart by whether the first line of the
# file contains the text "Topic":
#   * With meeting metadata: the first data row is read as a one-row table of
#     meeting information (its `ID` column kept as character), the participant
#     table is read after skipping the first 3 lines, and the meeting
#     information is bound column-wise onto the participant rows.
#   * Without meeting metadata: a warning naming the file is raised, the whole
#     file is read as the participant table, and an `id` column is derived
#     from the file name (extension and "participants_" removed).
#
# usage_report: path to a single usage report CSV.
#
# Returns a tibble with cleaned column names in which `name_original_name` is
# renamed to `name` and `total_duration_minutes` to `attendance_duration`.
# Stops with an error if the file has no lines at all.
read_wrangle_usage <- function(usage_report) {
  # Shared tidying for the participant table, used by both layouts.
  tidy_participants <- function(participants) {
    participants |>
      clean_names() |>
      rename(
        name = name_original_name,
        attendance_duration = total_duration_minutes
      )
  }

  first_line <- readLines(usage_report, n = 1)
  if (length(first_line) == 0L) {
    # An empty file would otherwise make the `if` below fail with the opaque
    # "argument is of length zero" error.
    stop("Usage report is empty: ", usage_report, call. = FALSE)
  }

  # check that metadata exists
  has_metadata <- grepl("Topic", first_line, fixed = TRUE)

  if (!has_metadata) {
    # Name the file so the warning is actionable when mapping over many reports.
    warning(
      "Usage report doesn't contain meeting information: ",
      usage_report,
      call. = FALSE
    )

    participants <- tidy_participants(read_csv(usage_report))

    # add meeting ID from filename
    meeting_id <-
      path_file(usage_report) |>
      path_ext_remove() |>
      gsub("participants_", "", x = _)

    participants |>
      mutate(id = meeting_id, .before = everything())
  } else {
    # The participant table starts after the 3 leading metadata lines.
    participants <- tidy_participants(read_csv(usage_report, skip = 3))

    # One-row meeting information; keep ID as character so it is not parsed
    # as a number.
    meeting_info <-
      read_csv(
        usage_report,
        n_max = 1,
        col_types = cols(ID = col_character())
      ) |>
      clean_names() |>
      rename(meeting_duration = duration_minutes)

    bind_cols(meeting_info, participants)
  }
}

# Wrangle every usage report and stack the results into a single table.
usage_tables <- map(usage_reports, read_wrangle_usage)
usage <- list_rbind(usage_tables)
usage


# Read one registration report CSV and return it tidied.
#
# reg_report: path to a single registration report CSV.
#
# The file path is recorded in a temporary `file` column while reading, column
# names are cleaned, and any columns whose names contain "college",
# "department", "career_stage", "how_did_you_hear" or "mailing_list" are
# renamed to short names. A meeting `id` is derived from the file name
# (extension and "_RegistrationReport" removed) and placed before `file`.
# Finally the `x11` and `file` columns are dropped if present.
read_wrangle_registration <- function(reg_report) {
  raw <- read_csv(reg_report, id = "file")
  cleaned <- clean_names(raw)

  renamed <- rename(
    cleaned,
    college = contains("college"),
    department = contains("department"),
    career_stage = contains("career_stage"),
    how_discovered = contains("how_did_you_hear"),
    mailing_list = contains("mailing_list")
  )

  # get meeting ID
  with_id <- mutate(
    renamed,
    id = gsub(
      "_RegistrationReport",
      "",
      fs::path_ext_remove(path_file(file))
    ),
    .before = file
  )

  select(with_id, -any_of(c("x11", "file")))
}

# Wrangle every registration report and stack the results into a single table.
registration <- reg_reports |>
  map(read_wrangle_registration) |>
  list_rbind()

# Attach registration details to each attendance row.
# NOTE(review): no `by` is given, so the join uses every column name the two
# tables share -- confirm that is the intended key.
attended <- usage |>
  left_join(registration)

write_csv(attended, "data/attended.csv")

0 comments on commit 6a15a4f

Please sign in to comment.