library(RSelenium)
library(jsonlite)
library(pxweb)

# The working directory needs a folder named uttak; all output is saved there.

setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
dname <- dirname(rstudioapi::getActiveDocumentContext()$path)
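
#' Sketch (an added convenience, not in the original script): create the uttak
#' output folder if it does not already exist, instead of requiring it up front.
if (!dir.exists(file.path(dname, "uttak"))) {
  dir.create(file.path(dname, "uttak"))
}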

# Open web driver ----

#' This should be improved by someone with a better understanding of Docker.
#' Docker must be running before the script is executed.
#' There is no particular reason for using this Firefox version (but everything runs!).

shell("docker pull selenium/standalone-firefox:2.53.0")
shell('docker run -d -p 4445:4444 selenium/standalone-firefox:2.53.0')
Sys.sleep(10)
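#' The fixed ten-second sleep above is a guess; polling the Selenium status
#' endpoint is sturdier. A sketch of a helper of my own (it assumes an
#' unreachable server makes getStatus() throw, which tryCatch turns into
#' "keep waiting"):
selenium_ready <- function(driver, tries = 30) {
  for (i in 1:tries) {
    ok <- tryCatch({ driver$getStatus(); TRUE }, error = function(e) FALSE)
    if (ok) return(TRUE)
    Sys.sleep(1) # wait a second before polling again
  }
  FALSE
}
# e.g. after remDr is created below: if (!selenium_ready(remDr)) stop("Selenium not up")
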
remDr <- remoteDriver(port = 4445L, browserName = "firefox")
remDr$open()

#' Timing start of scraping session
timi0 <- Sys.time()

# Get table of level 1 categories ----

remDr$navigate("https://px.hagstofa.is/pxis/pxweb/is/")

takki_1 <- remDr$findElements(using = "xpath", "//a[contains(@href, 'pxweb')]")

tafla_1_nofn <- vector()
tafla_1_slodir <- vector()

for (i in 1:length(takki_1)) {
  tafla_1_nofn[i] <- unlist(takki_1[[i]]$getElementText())
  tafla_1_slodir[i] <- unlist(takki_1[[i]]$getElementAttribute("href"))
}

tafla1 <- data.frame("nofn" = tafla_1_nofn, "slodir" = tafla_1_slodir)
tafla1 <- tafla1[tafla1$nofn != "", ]

# Scrape table names and urls ----

teljari <- 1
timi1 <- Sys.time()
tafla2 <- data.frame(nofn2 = "", slod2 = "")

# Loop iterates through the level 1 categories
for (k in 1:nrow(tafla1)) {
  curr_path_1 <- tafla1$slodir[k]
  curr_nafn_1 <- tafla1$nofn[k]
  remDr$navigate(curr_path_1)

  # Find all expandable tree elements
  takki_2 <- remDr$findElements(using = "class", "AspNet-TreeView-Expand")

  # While there are expandable elements, click the first one and search again
  while (length(takki_2) > 0) {
    takki_2[[1]]$clickElement()
    takki_2 <- remDr$findElements(using = "class", "AspNet-TreeView-Expand")
  }

  # Once all elements are expanded, find all elements with a hypertext reference attribute.
  # This line would have to be changed to scrape the English version.
  takki_3 <- remDr$findElements(using = "xpath", "//a[contains(@href, 'pxis')]")

  nofn2 <- vector()
  slodir2 <- vector()
  # Loop iterates through all elements, extracting the text and href attributes
  for (i in 1:length(takki_3)) {
    nofn2[i] <- unlist(takki_3[[i]]$getElementText())
    slodir2[i] <- unlist(takki_3[[i]]$getElementAttribute("href"))
    print(paste0("Skröpun 1: tafla númer: ", teljari)) # "Scrape 1: table number"
    print(nofn2[i])
    teljari <- teljari + 1
  }

  tafla2_temp <- data.frame("nofn2" = nofn2, "slod2" = slodir2)
  # Remove links that are not tables (table urls are longer than 37 characters)
  tafla2_temp$nchar <- nchar(tafla2_temp$slod2)
  tafla2_temp <- tafla2_temp[tafla2_temp$nchar > 37, ]
  tafla2_temp$nchar <- NULL

  tafla2 <- rbind(tafla2, tafla2_temp)
}

print("Skröpun 1 lokið") # "Scrape 1 finished"
Sys.time() - timi1

# Make API urls from px urls ----
#' This part is problematic:
#' the code would need to be updated if new level 1 databases were added.
#' Using the API to list tables and building the pxweb links from that listing
#' would be faster and more robust if it were possible (see the sketch below),
#' but occasional eccentricities in pxweb links prevent using the API for
#' listing, and the API approach would also miss duplicates on the pxweb site.
#' If these problems are fixed, I recommend using
#' https://github.com/bgautijonsson/hagstofa_scraping/tree/master/R instead.
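
#' For reference, a sketch of the API listing approach described above: in the
#' pxweb package, pxweb_get() on a database path returns the child nodes of
#' that level rather than table data, so the tree could in principle be walked
#' from the root (the url is the same API root used later in this script):
# nodes <- pxweb_get("https://px.hagstofa.is:443/pxis/api/v1/is/")
# sapply(nodes, function(x) x$id) # ids of the level 1 databases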

tafla2 <- tafla2[tafla2$nofn2 != "", ]
tafla2$apiurl2 <- gsub("__", "/", tafla2$slod2)
tafla2$apiurl3 <- gsub("https://px.hagstofa.is/pxis/pxweb/is/", "https://px.hagstofa.is:443/pxis/api/v1/is/", tafla2$apiurl2)
tafla2$apiurl3 <- gsub("/Atvinnuvegir/Atvinnuvegir", "/Atvinnuvegir", tafla2$apiurl3)
tafla2$apiurl3 <- gsub("/Efnahagur/Efnahagur", "/Efnahagur", tafla2$apiurl3)
tafla2$apiurl3 <- gsub("/Ibuar/Ibuar", "/Ibuar", tafla2$apiurl3)
tafla2$apiurl3 <- gsub("/Samfelag/Samfelag", "/Samfelag", tafla2$apiurl3)
tafla2$apiurl3 <- gsub("/Sogulegar/Sogulegar", "/Sogulegar", tafla2$apiurl3)
tafla2$apiurl3 <- gsub("/Umhverfi/Umhverfi", "/Umhverfi", tafla2$apiurl3)
tafla2$apiurl3 <- gsub(".px/", ".px", tafla2$apiurl3)
tafla2$apiurl3 <- gsub("%20", " ", tafla2$apiurl3)
tafla2$apiurl3 <- gsub("px_autoopen=true", "", tafla2$apiurl3)
tafla2$apiurl3 <- gsub("\\?", "", tafla2$apiurl3)
tafla2$apiurl3 <- gsub("%C3%B0", "ð", tafla2$apiurl3)

tafla2$apiurl <- tafla2$apiurl3

tafla2$apiurl2 <- NULL
tafla2$apiurl3 <- NULL

# Make simple id strings from urls ----

#' The id is made from the filename at the end of each url.
#' These are not unique for duplicate tables, so a row number integer is
#' appended in case the metadata might differ.

ids <- strsplit(tafla2$apiurl, split = "/")

id <- vector()
for (i in 1:length(ids)) {
  temp <- ids[[i]]
  id[i] <- temp[length(temp)]
}
id <- gsub("px", "", id)
id <- gsub("\\.", "", id)

tafla2$id <- paste0(id, "_", 1:nrow(tafla2))

# Use API to get metadata ----

metadatalisti <- list()
timi2 <- Sys.time()
breytur <- vector()

for (k in 1:nrow(tafla2)) {
  #' The first API query returns the list of variables.
  #' No data is requested, so the API does not return other metadata.
  #' The variable list is used to make a minimal query for data in order to get the metadata.
  pxd0 <- pxweb_get(tafla2$apiurl[k])
  vars0 <- pxd0$variables
  codes <- vector()
  values <- vector()
  for (i in 1:length(vars0)) {
    codes[i] <- vars0[[i]]$code
    values[i] <- vars0[[i]]$values[1]
  }
  qlisti <- list()
  for (i in 1:length(vars0)) {
    qlisti[[codes[i]]] <- values[i]
  }
  pxq <- pxweb_query(qlisti)
  pxd <- pxweb_get(tafla2$apiurl[k], pxq) # Second query
  pxd$data <- NULL # Remove the data
  #' The metadata is placed in a list, in an item named by table id
  idtemp <- tafla2$id[k]
  metadatalisti[[idtemp]] <- pxd
  #' Make a vector of variable names and paste it to make a column of variable names for tafla2
  breytur_sub <- vector()
  for (i in 1:length(pxd$pxweb_metadata$variables)) {
    breytur_sub[i] <- pxd$pxweb_metadata$variables[[i]]$text
  }
  breytur_sub <- paste0(breytur_sub, collapse = " ; ")
  breytur[k] <- breytur_sub
  print(paste0("Skröpun 2: tafla númer: ", k)) # "Scrape 2: table number"
  print(tafla2$nofn2[k])
  # This approach could be improved using the polite package!
  Sys.sleep(0.5)
}
print("Skröpun 2 lokið") # "Scrape 2 finished"
Sys.time() - timi2
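
#' The loop above stops on the first failed request. A minimal retry wrapper,
#' sketched as a plain tryCatch helper (my own addition, not part of pxweb or
#' polite), could stand in for the bare pxweb_get() calls:
pxweb_get_retry <- function(url, query = NULL, tries = 3, wait = 5) {
  for (attempt in 1:tries) {
    res <- tryCatch(pxweb_get(url, query), error = function(e) NULL)
    if (!is.null(res)) return(res)
    Sys.sleep(wait) # back off before retrying
  }
  stop("pxweb_get failed after ", tries, " attempts: ", url)
}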

tafla2$breytur <- breytur

# Add value column ----

#' Make a column of all values of all variables.
#' It can be used to add a hidden searchable column to the main table in the app.

valvec <- vector()

for (i in 1:nrow(tafla2)) {
  sel1 <- metadatalisti[[i]]$pxweb_metadata$variables
  skil <- ""
  for (j in 1:length(sel1)) {
    sel2 <- sel1[[j]]$valueTexts
    skil <- paste0(skil, " ; ", paste0(sel2, collapse = " ; "))
  }
  valvec[i] <- skil
}

tafla2$gildi <- valvec

# Add oldest and most recent time series value ----

first <- vector()
last <- vector()

for (i in 1:nrow(tafla2)) {
  sel1 <- metadatalisti[[i]]$pxweb_metadata$variables
  for (j in 1:length(sel1)) {
    sel2 <- sel1[[j]]
    if (sel2$time) {
      first[i] <- sel2$valueTexts[1]
      last[i] <- sel2$valueTexts[length(sel2$valueTexts)]
    }
  }
}

tafla2$first_time <- first
tafla2$last_time <- last

# Add date of last update ----

updatevec <- vector()

for (i in 1:nrow(tafla2)) {
  updatevec[i] <- metadatalisti[[i]]$metadata[[1]]$updated
}

updatevec2 <- as.Date(updatevec)

tafla2$last_update <- updatevec2

# Add column of duplicate titles ----

tafla2 <- tafla2 |>
  dplyr::left_join(
    tafla2 |>
      dplyr::group_by(nofn2) |>
      dplyr::summarise(n_duplicates = dplyr::n())
  )
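
#' The same column can be produced in one step with dplyr::add_count, an
#' equivalent alternative sketched here for reference:
# tafla2 <- dplyr::add_count(tafla2, nofn2, name = "n_duplicates")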

# Scrape breadcrumbs, about-table text and comments from pxweb ----

#' Getting a sensible hierarchical organisation of the tables from the API is a bit of a challenge.
#' Scraping metadata from pxweb is also necessary at this point, as it is not always consistent with the API metadata.
#' The inconsistency should of course be resolved!

skyringalisti <- list()
crumbs <- vector()
timi3 <- Sys.time()
teljari <- 1
remDr$open()
for (k in 1:nrow(tafla2)) {
  remDr$navigate(tafla2$slod2[k])
  saekja1 <- remDr$findElements(using = "id", "breadcrumb")
  crumb_sub <- unlist(saekja1[[1]]$getElementText())
  crumbs[k] <- crumb_sub
  # Expand every accordion so the explanatory texts become readable
  skyringar <- remDr$findElements(using = "class", "accordion-header")
  for (i in 1:length(skyringar)) {
    skyringar[[i]]$clickElement()
  }
  skyringatexti_em <- remDr$findElements(using = "class", "accordion-body")
  if (length(skyringatexti_em) > 1) {
    skyringatexti <- unlist(skyringatexti_em[[2]]$getElementText())
  } else {
    skyringatexti <- "null"
  }
  sub_headers <- remDr$findElements(using = "class", "nested-accordion-header")
  for (i in 1:length(sub_headers)) {
    sub_headers[[i]]$clickElement()
  }
  sub_headers <- remDr$findElements(using = "class", "nested-accordion-header")
  sub_textar <- vector()
  for (i in 1:length(sub_headers)) {
    sub_textar[i] <- unlist(sub_headers[[i]]$getElementText())
  }
  sub_bodies <- remDr$findElements(using = "class", "nested-accordion-body")
  sub_bodies_textar <- vector()
  for (i in 1:length(sub_headers)) {
    sub_bodies_textar[i] <- unlist(sub_bodies[[i]]$getElementText())
  }
  idtemp <- tafla2$id[k]
  skyringalisti0 <- list()
  skyringalisti0[["skyringatexti"]] <- skyringatexti
  skyringalisti0[["sub_textar"]] <- sub_textar
  skyringalisti0[["sub_bodies_textar"]] <- sub_bodies_textar
  print(paste0("Skröpun 3: tafla númer: ", teljari)) # "Scrape 3: table number"
  skyringalisti[[idtemp]] <- skyringalisti0
  teljari <- teljari + 1
}

print("Skröpun 3 lokið") # "Scrape 3 finished"
Sys.time() - timi3

# Split breadcrumbs to make categories ----

crumbsplit <- strsplit(crumbs, split = "/ ")
crumblengd <- sapply(crumbsplit, length)

crumb1 <- sapply(crumbsplit, "[[", 2)
crumb2 <- sapply(crumbsplit, "[[", 3)
crumb3 <- sapply(crumbsplit, "[[", 4)
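
#' The hard-coded indices above will error if a breadcrumb ever has fewer than
#' four parts (crumblengd is computed but never checked). A defensive sketch:
# crumb_at <- function(x, i) if (length(x) >= i) x[[i]] else NA_character_
# crumb1 <- vapply(crumbsplit, crumb_at, character(1), i = 2)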

tafla2$crumbs <- crumbs
tafla2$crumbs1 <- crumb1
tafla2$crumbs2 <- crumb2
tafla2$crumbs3 <- crumb3

# Add time stamp ----

metadatalisti$timastimpill_char <- as.character(timi0)
metadatalisti$timastimpill_POSIXct <- timi0

# Save outputs; one shared time stamp so the three files from a run pair up
stimpill <- as.numeric(Sys.time())
saveRDS(tafla2, paste0(dname, "/uttak/maintab", stimpill, ".rds"))
saveRDS(metadatalisti, paste0(dname, "/uttak/apimeta", stimpill, ".rds"))
saveRDS(skyringalisti, paste0(dname, "/uttak/pxwebmeta", stimpill, ".rds"))

print("Uppfæringu lokið") # "Update finished"
Sys.time() - timi0