library(RSelenium)
library(jsonlite)
library(pxweb)

# The working directory needs a folder named uttak; all output is saved there.

setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
dname <- dirname(rstudioapi::getActiveDocumentContext()$path)
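
#' Sketch (an added convenience, not in the original script): create the uttak
#' output folder if it does not already exist, instead of requiring it up front.
if (!dir.exists(file.path(dname, "uttak"))) {
  dir.create(file.path(dname, "uttak"))
}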

# Open web driver ----

#' This should be improved by someone with a better understanding of Docker.
#' Docker must be running before the script is executed.
#' There is no particular reason for using this Firefox version (but everything runs!).

shell("docker pull selenium/standalone-firefox:2.53.0")
shell('docker run -d -p 4445:4444 selenium/standalone-firefox:2.53.0')
Sys.sleep(10)
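#' The fixed ten-second sleep above is a guess; polling the Selenium status
#' endpoint is sturdier. A sketch of a helper of my own (it assumes an
#' unreachable server makes getStatus() throw, which tryCatch turns into
#' "keep waiting"):
selenium_ready <- function(driver, tries = 30) {
  for (i in 1:tries) {
    ok <- tryCatch({ driver$getStatus(); TRUE }, error = function(e) FALSE)
    if (ok) return(TRUE)
    Sys.sleep(1) # wait a second before polling again
  }
  FALSE
}
# e.g. after remDr is created below: if (!selenium_ready(remDr)) stop("Selenium not up")
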
remDr <- remoteDriver(port = 4445L, browserName = "firefox")
remDr$open()

#' Timing start of scraping session
timi0 <- Sys.time()

# Get table of level 1 categories ----

remDr$navigate("https://px.hagstofa.is/pxis/pxweb/is/")

takki_1 <- remDr$findElements(using = "xpath", "//a[contains(@href, 'pxweb')]")

tafla_1_nofn <- vector()
tafla_1_slodir <- vector()

for (i in 1:length(takki_1)) {
  tafla_1_nofn[i] <- unlist(takki_1[[i]]$getElementText())
  tafla_1_slodir[i] <- unlist(takki_1[[i]]$getElementAttribute("href"))
}

tafla1 <- data.frame("nofn" = tafla_1_nofn, "slodir" = tafla_1_slodir)
tafla1 <- tafla1[tafla1$nofn != "", ]

# Scrape table names and urls ----

teljari <- 1
timi1 <- Sys.time()
tafla2 <- data.frame(nofn2 = "", slod2 = "")

# Loop iterates through the level 1 categories
for (k in 1:nrow(tafla1)) {
  curr_path_1 <- tafla1$slodir[k]
  curr_nafn_1 <- tafla1$nofn[k]
  remDr$navigate(curr_path_1)

  # Find all expandable tree elements
  takki_2 <- remDr$findElements(using = "class", "AspNet-TreeView-Expand")

  # While there are expandable elements, click the first one and search again
  while (length(takki_2) > 0) {
    takki_2[[1]]$clickElement()
    takki_2 <- remDr$findElements(using = "class", "AspNet-TreeView-Expand")
  }

  # Once all elements are expanded, find all elements with a hypertext reference attribute.
  # This line would have to be changed to scrape the English version.
  takki_3 <- remDr$findElements(using = "xpath", "//a[contains(@href, 'pxis')]")

  nofn2 <- vector()
  slodir2 <- vector()
  # Loop iterates through all elements, extracting the text and href attributes
  for (i in 1:length(takki_3)) {
    nofn2[i] <- unlist(takki_3[[i]]$getElementText())
    slodir2[i] <- unlist(takki_3[[i]]$getElementAttribute("href"))
    print(paste0("Skröpun 1: tafla númer: ", teljari)) # "Scrape 1: table number"
    print(nofn2[i])
    teljari <- teljari + 1
  }

  tafla2_temp <- data.frame("nofn2" = nofn2, "slod2" = slodir2)
  # Remove links that are not tables (table urls are longer than 37 characters)
  tafla2_temp$nchar <- nchar(tafla2_temp$slod2)
  tafla2_temp <- tafla2_temp[tafla2_temp$nchar > 37, ]
  tafla2_temp$nchar <- NULL

  tafla2 <- rbind(tafla2, tafla2_temp)
}

print("Skröpun 1 lokið") # "Scrape 1 finished"
Sys.time() - timi1

# Make API urls from px urls ----
#' This part is problematic:
#' the code would need to be updated if new level 1 databases were added.
#' Using the API to list tables and building the pxweb links from that listing
#' would be faster and more robust if it were possible (see the sketch below),
#' but occasional eccentricities in pxweb links prevent using the API for
#' listing, and the API approach would also miss duplicates on the pxweb site.
#' If these problems are fixed, I recommend using
#' https://github.com/bgautijonsson/hagstofa_scraping/tree/master/R instead.
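
#' For reference, a sketch of the API listing approach described above: in the
#' pxweb package, pxweb_get() on a database path returns the child nodes of
#' that level rather than table data, so the tree could in principle be walked
#' from the root (the url is the same API root used later in this script):
# nodes <- pxweb_get("https://px.hagstofa.is:443/pxis/api/v1/is/")
# sapply(nodes, function(x) x$id) # ids of the level 1 databases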

tafla2 <- tafla2[tafla2$nofn2 != "", ]
tafla2$apiurl2 <- gsub("__", "/", tafla2$slod2)
tafla2$apiurl3 <- gsub("https://px.hagstofa.is/pxis/pxweb/is/", "https://px.hagstofa.is:443/pxis/api/v1/is/", tafla2$apiurl2)
tafla2$apiurl3 <- gsub("/Atvinnuvegir/Atvinnuvegir", "/Atvinnuvegir", tafla2$apiurl3)
tafla2$apiurl3 <- gsub("/Efnahagur/Efnahagur", "/Efnahagur", tafla2$apiurl3)
tafla2$apiurl3 <- gsub("/Ibuar/Ibuar", "/Ibuar", tafla2$apiurl3)
tafla2$apiurl3 <- gsub("/Samfelag/Samfelag", "/Samfelag", tafla2$apiurl3)
tafla2$apiurl3 <- gsub("/Sogulegar/Sogulegar", "/Sogulegar", tafla2$apiurl3)
tafla2$apiurl3 <- gsub("/Umhverfi/Umhverfi", "/Umhverfi", tafla2$apiurl3)
tafla2$apiurl3 <- gsub(".px/", ".px", tafla2$apiurl3)
tafla2$apiurl3 <- gsub("%20", " ", tafla2$apiurl3)
tafla2$apiurl3 <- gsub("px_autoopen=true", "", tafla2$apiurl3)
tafla2$apiurl3 <- gsub("\\?", "", tafla2$apiurl3)
tafla2$apiurl3 <- gsub("%C3%B0", "ð", tafla2$apiurl3)

tafla2$apiurl <- tafla2$apiurl3

tafla2$apiurl2 <- NULL
tafla2$apiurl3 <- NULL

# Make simple id strings from urls ----

#' The id is made from the filename at the end of each url.
#' These are not unique for duplicate tables, so a row number integer is
#' appended in case the metadata might differ.

ids <- strsplit(tafla2$apiurl, split = "/")

id <- vector()
for (i in 1:length(ids)) {
  temp <- ids[[i]]
  id[i] <- temp[length(temp)]
}
id <- gsub("px", "", id)
id <- gsub("\\.", "", id)

tafla2$id <- paste0(id, "_", 1:nrow(tafla2))

# Use API to get metadata ----

metadatalisti <- list()
timi2 <- Sys.time()
breytur <- vector()

for (k in 1:nrow(tafla2)) {
  #' The first API query returns the list of variables.
  #' No data is requested, so the API does not return other metadata.
  #' The variable list is used to make a minimal query for data in order to get the metadata.
  pxd0 <- pxweb_get(tafla2$apiurl[k])
  vars0 <- pxd0$variables
  codes <- vector()
  values <- vector()
  for (i in 1:length(vars0)) {
    codes[i] <- vars0[[i]]$code
    values[i] <- vars0[[i]]$values[1]
  }
  qlisti <- list()
  for (i in 1:length(vars0)) {
    qlisti[[codes[i]]] <- values[i]
  }
  pxq <- pxweb_query(qlisti)
  pxd <- pxweb_get(tafla2$apiurl[k], pxq) # Second query
  pxd$data <- NULL # Remove the data
  #' The metadata is placed in a list, in an item named by table id
  idtemp <- tafla2$id[k]
  metadatalisti[[idtemp]] <- pxd
  #' Make a vector of variable names and paste it to make a column of variable names for tafla2
  breytur_sub <- vector()
  for (i in 1:length(pxd$pxweb_metadata$variables)) {
    breytur_sub[i] <- pxd$pxweb_metadata$variables[[i]]$text
  }
  breytur_sub <- paste0(breytur_sub, collapse = " ; ")
  breytur[k] <- breytur_sub
  print(paste0("Skröpun 2: tafla númer: ", k)) # "Scrape 2: table number"
  print(tafla2$nofn2[k])
  # This approach could be improved using the polite package!
  Sys.sleep(0.5)
}
print("Skröpun 2 lokið") # "Scrape 2 finished"
Sys.time() - timi2
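
#' The loop above stops on the first failed request. A minimal retry wrapper,
#' sketched as a plain tryCatch helper (my own addition, not part of pxweb or
#' polite), could stand in for the bare pxweb_get() calls:
pxweb_get_retry <- function(url, query = NULL, tries = 3, wait = 5) {
  for (attempt in 1:tries) {
    res <- tryCatch(pxweb_get(url, query), error = function(e) NULL)
    if (!is.null(res)) return(res)
    Sys.sleep(wait) # back off before retrying
  }
  stop("pxweb_get failed after ", tries, " attempts: ", url)
}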

tafla2$breytur <- breytur

# Add value column ----

#' Make a column of all values of all variables.
#' It can be used to add a hidden searchable column to the main table in the app.

valvec <- vector()

for (i in 1:nrow(tafla2)) {
  sel1 <- metadatalisti[[i]]$pxweb_metadata$variables
  skil <- ""
  for (j in 1:length(sel1)) {
    sel2 <- sel1[[j]]$valueTexts
    skil <- paste0(skil, " ; ", paste0(sel2, collapse = " ; "))
  }
  valvec[i] <- skil
}

tafla2$gildi <- valvec

# Add oldest and most recent time series value ----

first <- vector()
last <- vector()

for (i in 1:nrow(tafla2)) {
  sel1 <- metadatalisti[[i]]$pxweb_metadata$variables
  for (j in 1:length(sel1)) {
    sel2 <- sel1[[j]]
    if (sel2$time) {
      first[i] <- sel2$valueTexts[1]
      last[i] <- sel2$valueTexts[length(sel2$valueTexts)]
    }
  }
}

tafla2$first_time <- first
tafla2$last_time <- last

# Add date of last update ----

updatevec <- vector()

for (i in 1:nrow(tafla2)) {
  updatevec[i] <- metadatalisti[[i]]$metadata[[1]]$updated
}

updatevec2 <- as.Date(updatevec)

tafla2$last_update <- updatevec2

# Add column of duplicate titles ----

tafla2 <- tafla2 |>
  dplyr::left_join(
    tafla2 |>
      dplyr::group_by(nofn2) |>
      dplyr::summarise(n_duplicates = dplyr::n())
  )
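
#' The same column can be produced in one step with dplyr::add_count, an
#' equivalent alternative sketched here for reference:
# tafla2 <- dplyr::add_count(tafla2, nofn2, name = "n_duplicates")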

# Scrape breadcrumbs, about-table text and comments from pxweb ----

#' Getting a sensible hierarchical organisation of the tables from the API is a bit of a challenge.
#' Scraping metadata from pxweb is also necessary at this point, as it is not always consistent with the API metadata.
#' The inconsistency should of course be resolved!

skyringalisti <- list()
crumbs <- vector()
timi3 <- Sys.time()
teljari <- 1
remDr$open()
for (k in 1:nrow(tafla2)) {
  remDr$navigate(tafla2$slod2[k])
  saekja1 <- remDr$findElements(using = "id", "breadcrumb")
  crumb_sub <- unlist(saekja1[[1]]$getElementText())
  crumbs[k] <- crumb_sub
  # Expand every accordion so the explanatory texts become readable
  skyringar <- remDr$findElements(using = "class", "accordion-header")
  for (i in 1:length(skyringar)) {
    skyringar[[i]]$clickElement()
  }
  skyringatexti_em <- remDr$findElements(using = "class", "accordion-body")
  if (length(skyringatexti_em) > 1) {
    skyringatexti <- unlist(skyringatexti_em[[2]]$getElementText())
  } else {
    skyringatexti <- "null"
  }
  sub_headers <- remDr$findElements(using = "class", "nested-accordion-header")
  for (i in 1:length(sub_headers)) {
    sub_headers[[i]]$clickElement()
  }
  sub_headers <- remDr$findElements(using = "class", "nested-accordion-header")
  sub_textar <- vector()
  for (i in 1:length(sub_headers)) {
    sub_textar[i] <- unlist(sub_headers[[i]]$getElementText())
  }
  sub_bodies <- remDr$findElements(using = "class", "nested-accordion-body")
  sub_bodies_textar <- vector()
  for (i in 1:length(sub_headers)) {
    sub_bodies_textar[i] <- unlist(sub_bodies[[i]]$getElementText())
  }
  idtemp <- tafla2$id[k]
  skyringalisti0 <- list()
  skyringalisti0[["skyringatexti"]] <- skyringatexti
  skyringalisti0[["sub_textar"]] <- sub_textar
  skyringalisti0[["sub_bodies_textar"]] <- sub_bodies_textar
  print(paste0("Skröpun 3: tafla númer: ", teljari)) # "Scrape 3: table number"
  skyringalisti[[idtemp]] <- skyringalisti0
  teljari <- teljari + 1
}

print("Skröpun 3 lokið") # "Scrape 3 finished"
Sys.time() - timi3

# Split breadcrumbs to make categories ----

crumbsplit <- strsplit(crumbs, split = "/ ")
crumblengd <- sapply(crumbsplit, length)

crumb1 <- sapply(crumbsplit, "[[", 2)
crumb2 <- sapply(crumbsplit, "[[", 3)
crumb3 <- sapply(crumbsplit, "[[", 4)
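
#' The hard-coded indices above will error if a breadcrumb ever has fewer than
#' four parts (crumblengd is computed but never checked). A defensive sketch:
# crumb_at <- function(x, i) if (length(x) >= i) x[[i]] else NA_character_
# crumb1 <- vapply(crumbsplit, crumb_at, character(1), i = 2)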

tafla2$crumbs <- crumbs
tafla2$crumbs1 <- crumb1
tafla2$crumbs2 <- crumb2
tafla2$crumbs3 <- crumb3

# Add time stamp ----

metadatalisti$timastimpill_char <- as.character(timi0)
metadatalisti$timastimpill_POSIXct <- timi0

# Save outputs; one shared time stamp so the three files from a run pair up
stimpill <- as.numeric(Sys.time())
saveRDS(tafla2, paste0(dname, "/uttak/maintab", stimpill, ".rds"))
saveRDS(metadatalisti, paste0(dname, "/uttak/apimeta", stimpill, ".rds"))
saveRDS(skyringalisti, paste0(dname, "/uttak/pxwebmeta", stimpill, ".rds"))

print("Uppfæringu lokið") # "Update finished"
Sys.time() - timi0