diff --git a/pxmap_scrape.R b/pxmap_scrape.R
new file mode 100644
index 0000000..87ff6c6
--- /dev/null
+++ b/pxmap_scrape.R
@@ -0,0 +1,322 @@
+
+library(RSelenium)
+library(jsonlite)
+library(pxweb)
+
+# The script directory must contain a folder named uttak.
+# All output is saved in uttak.
+
+dname<-dirname(rstudioapi::getActiveDocumentContext()$path)
+setwd(dname)
+
+#Open web driver ----
+
+#'This should be improved by someone with a better understanding of docker.
+#'Docker must be running before the script is executed.
+#'Note that shell() is Windows-only; use system() on other platforms.
+#'There is no particular reason for using this Firefox version (but everything runs!).
+
+shell("docker pull selenium/standalone-firefox:2.53.0")
+shell('docker run -d -p 4445:4444 selenium/standalone-firefox:2.53.0')
+Sys.sleep(10) #Give the container time to start
+remDr <- remoteDriver(port = 4445L, browserName = "firefox")
+remDr$open()
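+
+#' A minimal sketch of a readiness check that could replace the fixed
+#' Sys.sleep(10) above: poll the Selenium status endpoint until the container
+#' answers. The helper name wait_for_selenium and the retry counts are
+#' assumptions, not part of the original script.
+wait_for_selenium <- function(url = "http://localhost:4445/wd/hub/status", tries = 30){
+  for(i in seq_len(tries)){
+    ok <- tryCatch({readLines(url, warn = FALSE); TRUE}, error = function(e) FALSE)
+    if(ok) return(invisible(TRUE)) #Container is answering
+    Sys.sleep(1)
+  }
+  stop("Selenium did not respond at ", url)
+}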
+
+#'Timing start of scraping session
+timi0<-Sys.time()
+
+#Get table of level 1 categories ----
+
+remDr$navigate("https://px.hagstofa.is/pxis/pxweb/is/")
+
+takki_1<-remDr$findElements(using = "xpath", "//a[contains(@href, 'pxweb')]")
+
+tafla_1_nofn<-vector()
+tafla_1_slodir<-vector()
+
+for(i in seq_along(takki_1)){
+  tafla_1_nofn[i]<-unlist(takki_1[[i]]$getElementText())
+  tafla_1_slodir[i]<-unlist(takki_1[[i]]$getElementAttribute("href"))
+}
+
+tafla1<-data.frame("nofn"=tafla_1_nofn, "slodir"=tafla_1_slodir)
+tafla1<-tafla1[tafla1$nofn!="",]
+
+# Scrape table names and urls ----
+
+teljari<-1
+timi1<-Sys.time()
+tafla2<-data.frame(nofn2="", slod2="") #Empty first row is dropped later
+
+#Loop iterates through level 1 categories
+for(k in 1:nrow(tafla1)){
+  curr_path_1<-tafla1$slodir[k]
+  curr_nafn_1<-tafla1$nofn[k]
+  remDr$navigate(curr_path_1)
+
+  #Finding all expandable tree elements
+  takki_2<-remDr$findElements(using = "class", "AspNet-TreeView-Expand")
+
+  #While there are expandable elements, click the first element and search again
+  while(length(takki_2)>0){
+    takki_2[[1]]$clickElement()
+    takki_2<-remDr$findElements(using = "class", "AspNet-TreeView-Expand")
+  }
+
+  #Once all elements are expanded, find all elements with a hypertext reference attribute
+  #This line would have to be changed to scrape the English version
+  takki_3<-remDr$findElements(using = "xpath", "//a[contains(@href, 'pxis')]")
+
+  nofn2<-vector()
+  slodir2<-vector()
+  #Loop iterates through all elements, extracting the text and href attributes
+  for(i in seq_along(takki_3)){
+    nofn2[i]<-unlist(takki_3[[i]]$getElementText())
+    slodir2[i]<-unlist(takki_3[[i]]$getElementAttribute("href"))
+    #Progress messages are in Icelandic: "Skröpun 1: tafla númer" = "Scrape 1: table number"
+    print(paste0("Skröpun 1: tafla númer: ", teljari))
+    print(nofn2[i])
+    teljari<-teljari+1
+  }
+
+  tafla2_temp<-data.frame("nofn2"=nofn2, "slod2"=slodir2)
+  #Remove links that are not tables (table links are longer than 37 characters)
+  tafla2_temp$nchar<-nchar(tafla2_temp$slod2)
+  tafla2_temp<-tafla2_temp[tafla2_temp$nchar>37,]
+  tafla2_temp$nchar<-NULL
+
+  tafla2<-rbind(tafla2, tafla2_temp)
+}
+
+print("Skröpun 1 lokið") #"Scrape 1 finished"
+print(Sys.time()-timi1)
+
+# Make API-urls from px urls ----
+#' This part is brittle:
+#' the code would need to be updated if new level 1 databases were added.
+#' Using the API to list tables and making the pxweb links from those would be faster and more robust if it were possible.
+#' There are occasional eccentricities in pxweb links preventing the use of the API for listing.
+#' The API approach would also miss duplicates on the pxweb.
+#' If these problems are fixed I recommend using https://github.com/bgautijonsson/hagstofa_scraping/tree/master/R instead.
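+
+#' For reference, a minimal sketch of the API-listing alternative described
+#' above, assuming the v1 endpoint returns a JSON array of nodes with id and
+#' type fields ("l" = level, "t" = table). The function name list_tables is an
+#' assumption, and the eccentric links mentioned above would still trip it up.
+list_tables <- function(url){
+  nodes <- jsonlite::fromJSON(url, simplifyVector = FALSE)
+  out <- character()
+  for(n in nodes){
+    child <- paste0(url, "/", n$id)
+    if(identical(n$type, "l")){
+      out <- c(out, list_tables(child)) #Recurse into sub-levels
+    }else{
+      out <- c(out, child) #A table endpoint
+    }
+  }
+  out
+}
+#e.g. list_tables("https://px.hagstofa.is:443/pxis/api/v1/is/Ibuar")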
+
+tafla2<-tafla2[tafla2$nofn2!="",]
+tafla2$apiurl2<-gsub("__", "/", tafla2$slod2)
+tafla2$apiurl3<-gsub("https://px.hagstofa.is/pxis/pxweb/is/", "https://px.hagstofa.is:443/pxis/api/v1/is/", tafla2$apiurl2)
+tafla2$apiurl3<-gsub("/Atvinnuvegir/Atvinnuvegir", "/Atvinnuvegir", tafla2$apiurl3)
+tafla2$apiurl3<-gsub("/Efnahagur/Efnahagur", "/Efnahagur", tafla2$apiurl3)
+tafla2$apiurl3<-gsub("/Ibuar/Ibuar", "/Ibuar", tafla2$apiurl3)
+tafla2$apiurl3<-gsub("/Samfelag/Samfelag", "/Samfelag", tafla2$apiurl3)
+tafla2$apiurl3<-gsub("/Sogulegar/Sogulegar", "/Sogulegar", tafla2$apiurl3)
+tafla2$apiurl3<-gsub("/Umhverfi/Umhverfi", "/Umhverfi", tafla2$apiurl3)
+tafla2$apiurl3<-gsub("\\.px/", ".px", tafla2$apiurl3) #Drop trailing slash after filename
+tafla2$apiurl3<-gsub("%20", " ", tafla2$apiurl3)
+tafla2$apiurl3<-gsub("px_autoopen=true", "", tafla2$apiurl3)
+tafla2$apiurl3<-gsub("\\?", "", tafla2$apiurl3)
+tafla2$apiurl3<-gsub("%C3%B0", "ð", tafla2$apiurl3)
+
+tafla2$apiurl<-tafla2$apiurl3
+
+tafla2$apiurl2<-NULL
+tafla2$apiurl3<-NULL
+
+# Make simple id strings from urls ----
+
+#' The id is made from the filename at the end of the url.
+#' These are not unique for duplicate tables, so a row-number integer is appended in case the metadata might differ.
+
+id<-basename(tafla2$apiurl) #Last path component, i.e. the .px filename
+id<-gsub("\\.px$", "", id)  #Drop the extension
+id<-gsub("\\.", "", id)     #Drop any remaining dots
+
+tafla2$id<-paste0(id, "_", 1:nrow(tafla2))
+
+# Use API to get metadata -----
+
+metadatalisti<-list()
+timi2<-Sys.time()
+breytur<-vector()
+
+for(k in 1:nrow(tafla2)){
+  #' The first API query returns a list of variables.
+  #' No data is requested, so the API does not return other metadata.
+  #' The variable list is used to make a minimal query for data in order to get the metadata.
+  pxd0<-pxweb_get(tafla2$apiurl[k])
+  vars0<-pxd0$variables
+  qlisti<-list()
+  for(i in seq_along(vars0)){
+    #Minimal query: the first value of every variable
+    qlisti[[vars0[[i]]$code]]<-vars0[[i]]$values[1]
+  }
+  pxq <- pxweb_query(qlisti)
+  pxd <- pxweb_get(tafla2$apiurl[k], pxq) #Second query
+  pxd$data<-NULL #Remove data
+  #' Metadata is placed in a list in an item named by table id
+  idtemp<-tafla2$id[k]
+  metadatalisti[[idtemp]]<-pxd
+  #' Make a vector of variable names and paste it into a column of variable names for tafla2
+  breytur_sub<-vector()
+  for(i in seq_along(pxd$pxweb_metadata$variables)){
+    breytur_sub[i]<-pxd$pxweb_metadata$variables[[i]]$text
+  }
+  breytur_sub<-paste0(breytur_sub, collapse = " ; ")
+  breytur[k]<-breytur_sub
+  print(paste0("Skröpun 2: tafla númer: ", k))
+  print(tafla2$nofn2[k])
+  #This approach could be improved using package polite!
+  Sys.sleep(0.5)
+}
+print("Skröpun 2 lokið")
+print(Sys.time()-timi2)
+
+tafla2$breytur<-breytur
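+
+#' A minimal sketch of the politeness improvement suggested above. The polite
+#' package targets rvest sessions, so a plain retry-with-backoff wrapper around
+#' pxweb_get() is sketched instead; the helper name pxweb_get_retry and the
+#' retry counts are assumptions.
+pxweb_get_retry <- function(url, query = NULL, tries = 3, wait = 2){
+  for(i in seq_len(tries)){
+    res <- tryCatch(pxweb_get(url, query), error = function(e) NULL)
+    if(!is.null(res)) return(res)
+    Sys.sleep(wait * i) #Back off a little longer after each failure
+  }
+  stop("pxweb_get failed after ", tries, " tries: ", url)
+}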
+
+# Add value column ----
+
+#' Make a column of all values of all variables.
+#' Can be used to add a hidden searchable column to the main table in the app.
+
+valvec<-vector()
+
+for(i in 1:nrow(tafla2)){
+  sel1<-metadatalisti[[i]]$pxweb_metadata$variables
+  skil<-""
+  for(j in seq_along(sel1)){
+    sel2<-sel1[[j]]$valueTexts
+    skil<-paste0(skil, " ; ", paste0(sel2, collapse = " ; "))
+  }
+  valvec[i]<-skil
+}
+
+tafla2$gildi<-valvec
+
+# Add oldest and most recent time series value ----
+
+first<-vector()
+last<-vector()
+
+for(i in 1:nrow(tafla2)){
+  sel1<-metadatalisti[[i]]$pxweb_metadata$variables
+  for(j in seq_along(sel1)){
+    sel2<-sel1[[j]]
+    if(sel2$time){
+      first[i]<-sel2$valueTexts[1]
+      last[i]<-sel2$valueTexts[length(sel2$valueTexts)]
+    }
+  }
+}
+
+tafla2$first_time<-first
+tafla2$last_time<-last
+
+# Add date of last update ----
+
+updatevec<-vector()
+
+for(i in 1:nrow(tafla2)){
+  updatevec[i]<-metadatalisti[[i]]$metadata[[1]]$updated
+}
+
+tafla2$last_update<-as.Date(updatevec)
+
+# Add column of duplicate titles ----
+
+tafla2<-tafla2 |>
+  dplyr::left_join(
+    tafla2 |>
+      dplyr::group_by(nofn2) |>
+      dplyr::summarise(n_duplicates=dplyr::n()),
+    by = "nofn2"
+  )
+
+# Scrape breadcrumbs, about table and comments from pxweb ----
+
+#' Getting a sensible hierarchical organisation of tables from the API is a bit of a challenge.
+#' Scraping metadata from pxweb is also necessary at this point, as it is not always consistent with the API metadata.
+#' The inconsistency should of course be resolved!
+
+skyringalisti<-list()
+crumbs<-vector()
+timi3<-Sys.time()
+teljari<-1
+remDr$open() #Reopen the driver in case the session expired during the API pass
+for(k in 1:nrow(tafla2)){
+  remDr$navigate(tafla2$slod2[k])
+  saekja1<-remDr$findElements(using = "id", "breadcrumb")
+  crumb_sub<-unlist(saekja1[[1]]$getElementText())
+  crumbs[k]<-crumb_sub
+  #Expand the accordions so their text can be read
+  skyringar<-remDr$findElements(using = "class", "accordion-header")
+  for(i in seq_along(skyringar)){
+    skyringar[[i]]$clickElement()
+  }
+  skyringatexti_em<-remDr$findElements(using = "class", "accordion-body")
+  if(length(skyringatexti_em)>1){
+    skyringatexti<-unlist(skyringatexti_em[[2]]$getElementText())
+  }else{
+    skyringatexti<-"null"
+  }
+  sub_headers<-remDr$findElements(using = "class", "nested-accordion-header")
+  for(i in seq_along(sub_headers)){
+    sub_headers[[i]]$clickElement()
+  }
+  sub_headers<-remDr$findElements(using = "class", "nested-accordion-header")
+  sub_textar<-vector()
+  for(i in seq_along(sub_headers)){
+    sub_textar[i]<-unlist(sub_headers[[i]]$getElementText())
+  }
+  sub_bodies<-remDr$findElements(using = "class", "nested-accordion-body")
+  sub_bodies_textar<-vector()
+  for(i in seq_along(sub_headers)){
+    sub_bodies_textar[i]<-unlist(sub_bodies[[i]]$getElementText())
+  }
+  idtemp<-tafla2$id[k]
+  skyringalisti0<-list()
+  skyringalisti0[["skyringatexti"]]<-skyringatexti
+  skyringalisti0[["sub_textar"]]<-sub_textar
+  skyringalisti0[["sub_bodies_textar"]]<-sub_bodies_textar
+  print(paste0("Skröpun 3: tafla númer: ", teljari))
+  skyringalisti[[idtemp]]<-skyringalisti0
+  teljari<-teljari+1
+}
+
+print("Skröpun 3 lokið")
+print(Sys.time()-timi3)
+
+# Split breadcrumbs to make categories ----
+
+crumbsplit<-strsplit(crumbs, split = "/ ")
+crumblengd<-sapply(crumbsplit, length)
+
+#Safe extraction: "[[" would error on a breadcrumb with fewer parts, so return NA instead
+crumb_safe<-function(x, i) if(length(x)>=i) x[[i]] else NA_character_
+crumb1<-sapply(crumbsplit, crumb_safe, i=2)
+crumb2<-sapply(crumbsplit, crumb_safe, i=3)
+crumb3<-sapply(crumbsplit, crumb_safe, i=4)
+
+tafla2$crumbs<-crumbs
+tafla2$crumbs1<-crumb1
+tafla2$crumbs2<-crumb2
+tafla2$crumbs3<-crumb3
+
+# Add time stamp ----
+
+metadatalisti$timastimpill_char<-as.character(timi0)
+metadatalisti$timastimpill_POSIXct<-timi0
+
+#Use one timestamp for all three files so the outputs of a run group together
+ts<-as.numeric(Sys.time())
+saveRDS(tafla2, paste0(dname,"/uttak/maintab", ts, ".rds"))
+saveRDS(metadatalisti, paste0(dname,"/uttak/apimeta", ts, ".rds"))
+saveRDS(skyringalisti, paste0(dname,"/uttak/pxwebmeta", ts, ".rds"))
+
+print("Uppfæringu lokið") #"Update finished"
+print(Sys.time()-timi0)
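+
+#' A minimal sketch of how a downstream app might pick up the newest maintab
+#' file written above, using file modification times; the object name maintab
+#' is an assumption.
+uttak_skrar <- list.files(file.path(dname, "uttak"), pattern = "^maintab.*\\.rds$", full.names = TRUE)
+maintab <- readRDS(uttak_skrar[which.max(file.mtime(uttak_skrar))])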