# pxmap_scrape.R

library(RSelenium)
library(jsonlite)
library(pxweb)

# The working directory must contain a folder named uttak;
# all output data is saved there.

dname<-dirname(rstudioapi::getActiveDocumentContext()$path)
setwd(dname)
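#' Optional guard: create the uttak folder if it is missing, so the
#' saveRDS() calls at the end of the script cannot fail
if(!dir.exists(file.path(dname, "uttak"))) dir.create(file.path(dname, "uttak"))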

#Open web driver ----

#'This should be improved by someone with a better understanding of docker.
#'Docker must be running before the script is executed.
#'There is no particular reason for using this Firefox version (but everything runs!).
#'Note that shell() assumes a Windows host; system() would be needed on other platforms.

shell("docker pull selenium/standalone-firefox:2.53.0")
shell('docker run -d -p 4445:4444 selenium/standalone-firefox:2.53.0')
Sys.sleep(10)
remDr <- remoteDriver(port = 4445L, browserName = "firefox")
remDr$open()
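#' A sketch of a more robust start-up than the fixed Sys.sleep(10): poll the
#' Selenium endpoint until a session opens (untested against this image;
#' the 30-second cap is an arbitrary choice):
# for(i in 1:30){
#   ok<-tryCatch({remDr$open(silent = TRUE); TRUE}, error = function(e) FALSE)
#   if(ok) break
#   Sys.sleep(1)
# }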

#'Timing start of scraping session
timi0<-Sys.time()

#Get table of level 1 categories ----

remDr$navigate("https://px.hagstofa.is/pxis/pxweb/is/")

takki_1<-remDr$findElements(using = "xpath", "//a[contains(@href, 'pxweb')]")

tafla_1_nofn<-vector()
tafla_1_slodir<-vector()

for(i in 1:length(takki_1)){
  tafla_1_nofn[i]<-unlist(takki_1[[i]]$getElementText())
  tafla_1_slodir[i]<-unlist(takki_1[[i]]$getElementAttribute("href"))
}

tafla1<-data.frame("nofn"=tafla_1_nofn, "slodir"=tafla_1_slodir)
tafla1<-tafla1[tafla1$nofn!="",]



# Scrape table names and urls ----

teljari<-1
timi1<-Sys.time()
tafla2<-data.frame(nofn2="", slod2="")

# Loop iterates through level 1 categories
for(k in 1:nrow(tafla1)){
  curr_path_1<-tafla1$slodir[k]
  curr_nafn_1<-tafla1$nofn[k]
  remDr$navigate(curr_path_1)

  #Finding all expandable tree elements
  takki_2<-remDr$findElements(using = "class", "AspNet-TreeView-Expand")

  #While there are expandable elements, click the first element and search again
  while(length(takki_2)>0){
    takki_2[[1]]$clickElement()
    takki_2<-remDr$findElements(using = "class", "AspNet-TreeView-Expand")
  }

  #Once all elements are expanded, find all elements with a hypertext reference attribute
  #This line would have to be changed to scrape the English version
  takki_3<-remDr$findElements(using = "xpath", "//a[contains(@href, 'pxis')]")

  nofn2<-vector()
  slodir2<-vector()
  #Loop iterates through all elements, extracting the text and href attributes
  for(i in 1:length(takki_3)){
    nofn2[i]<-unlist(takki_3[[i]]$getElementText())
    slodir2[i]<-unlist(takki_3[[i]]$getElementAttribute("href"))
    #Progress message ("Skröpun 1: tafla númer" = "Scraping pass 1: table number")
    print(paste0("Skröpun 1: tafla númer: ", teljari))
    print(nofn2[i])
    teljari<-teljari+1
  }

  tafla2_temp<-data.frame("nofn2"=nofn2, "slod2"=slodir2)
  #Remove links that are not tables: the base url
  #https://px.hagstofa.is/pxis/pxweb/is/ is exactly 37 characters long
  tafla2_temp$nchar<-nchar(tafla2_temp$slod2)
  tafla2_temp<-tafla2_temp[tafla2_temp$nchar>37,]
  tafla2_temp$nchar<-NULL

  tafla2<-rbind(tafla2, tafla2_temp)

}
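#' A more explicit filter than the 37-character cutoff would be to keep only
#' links whose url contains the .px table suffix (assumption: every table
#' link contains ".px"):
# tafla2<-tafla2[grepl("\\.px", tafla2$slod2),]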

print("Skröpun 1 lokið")
Sys.time()-timi1

# Make API-urls from px urls ----
#' This part is problematic.
#' The code would need to be updated if new level 1 databases were added.
#' Using the API to list tables and building the pxweb links from those would be faster and more robust if it were possible.
#' There are occasional eccentricities in pxweb links preventing the use of the API for listing.
#' The API approach would also miss duplicates on the pxweb.
#' If these problems are fixed, I recommend using https://github.com/bgautijonsson/hagstofa_scraping/tree/master/R instead.
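#' For reference, a rough sketch of the API-listing approach described above
#' (untested, and subject to the eccentricities noted; assumes pxweb_get()
#' on a non-table url returns nodes with $type "l" for levels and "t" for tables):
# list_tables<-function(url){
#   nodes<-pxweb_get(url)
#   urls<-character()
#   for(node in nodes){
#     if(node$type=="t") urls<-c(urls, paste0(url, "/", node$id))
#     if(node$type=="l") urls<-c(urls, list_tables(paste0(url, "/", node$id)))
#   }
#   urls
# }
# list_tables("https://px.hagstofa.is:443/pxis/api/v1/is")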

tafla2<-tafla2[tafla2$nofn2!="",]
tafla2$apiurl2<-gsub("__", "/", tafla2$slod2)
tafla2$apiurl3<-gsub("https://px.hagstofa.is/pxis/pxweb/is/", "https://px.hagstofa.is:443/pxis/api/v1/is/", tafla2$apiurl2)
tafla2$apiurl3<-gsub("/Atvinnuvegir/Atvinnuvegir", "/Atvinnuvegir", tafla2$apiurl3)
tafla2$apiurl3<-gsub("/Efnahagur/Efnahagur", "/Efnahagur", tafla2$apiurl3)
tafla2$apiurl3<-gsub("/Ibuar/Ibuar", "/Ibuar", tafla2$apiurl3)
tafla2$apiurl3<-gsub("/Samfelag/Samfelag", "/Samfelag", tafla2$apiurl3)
tafla2$apiurl3<-gsub("/Sogulegar/Sogulegar", "/Sogulegar", tafla2$apiurl3)
tafla2$apiurl3<-gsub("/Umhverfi/Umhverfi", "/Umhverfi", tafla2$apiurl3)
tafla2$apiurl3<-gsub(".px/", ".px", tafla2$apiurl3)
tafla2$apiurl3<-gsub("%20", " ", tafla2$apiurl3)
tafla2$apiurl3<-gsub("px_autoopen=true", "", tafla2$apiurl3)
tafla2$apiurl3<-gsub("\\?", "", tafla2$apiurl3)
tafla2$apiurl3<-gsub("%C3%B0", "ð", tafla2$apiurl3)

tafla2$apiurl<-tafla2$apiurl3

tafla2$apiurl2<-NULL
tafla2$apiurl3<-NULL

# Make simple id strings from urls ----

#' The id is made from the filename at the end of each url.
#' These are not unique for duplicate tables, so a row number is appended in case the metadata differs.

ids<-strsplit(tafla2$apiurl, split = "/")

id<-vector()
for(i in 1:length(ids)){
  temp<-ids[[i]]
  id[i]<-temp[length(temp)]
}
id<-gsub("px", "", id)
id<-gsub("\\.", "", id)

tafla2$id<-paste0(id, "_", 1:nrow(tafla2))
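#' e.g. a (hypothetical) apiurl ending in ".../MAN00101.px" on row 5 would yield the id "MAN00101_5"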

# Use API to get metadata -----

metadatalisti<-list()
timi2<-Sys.time()
breytur<-vector()

for(k in 1:nrow(tafla2)){
  #' The first API query returns the list of variables.
  #' No data is requested, so the API does not return other metadata.
  #' The variable list is used to make a minimal query for data in order to get the metadata.
  pxd0<-pxweb_get(tafla2$apiurl[k])
  vars0<-pxd0$variables
  codes<-vector()
  values<-vector()
  for(i in 1:length(vars0)){
    codes[i]<-vars0[[i]]$code
    values[i]<-vars0[[i]]$values[1]
  }
  qlisti<-list()
  for(i in 1:length(vars0)){
    qlisti[[codes[i]]]<-values[i]
  }
  pxq <- pxweb_query(qlisti)
  pxd <- pxweb_get(tafla2$apiurl[k], pxq) #Second query
  pxd$data<-NULL #Remove data
  #' Metadata is placed in a list in an item named by table id
  idtemp<-tafla2$id[k]
  metadatalisti[[idtemp]]<-pxd
  #' Make a vector of variable names and paste to make a column of variable names for tafla2
  breytur_sub<-vector()
  for(i in 1:length(pxd$pxweb_metadata$variables)){
    breytur_sub[i]<-pxd$pxweb_metadata$variables[[i]]$text
  }
  breytur_sub<-paste0(breytur_sub, collapse = " ; ")
  breytur[k]<-breytur_sub
  print(paste0("Skröpun 2: tafla númer: ", k))
  print(tafla2$nofn2[k])
  # This approach could be improved using the polite package (see the sketch after the loop)!
  Sys.sleep(0.5)
}
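#' For reference, the minimal query assembled above simply pairs each
#' variable code with its first value, e.g. (hypothetical codes and values):
# pxweb_query(list("Ár" = "2020", "Kyn" = "Alls"))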
print("Skröpun 2 lokið")
Sys.time()-timi2
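#' The polite package is built around rvest sessions rather than pxweb calls,
#' so it would apply if this step moved to rvest; a minimal sketch of the
#' pattern (untested here):
# session<-polite::bow("https://px.hagstofa.is", delay = 5)
# page<-polite::scrape(session)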

tafla2$breytur<-breytur

# Add value column ----

#' Make a column of all values of all variables.
#' Can be used to add a hidden searchable column to the main table in the app.

valvec<-vector()

for(i in 1:nrow(tafla2)){
  sel1<-metadatalisti[[i]]$pxweb_metadata$variables
  skil<-""
  for(j in 1:length(sel1)){
    sel2<-sel1[[j]]$valueTexts
    skil<-paste0(skil, " ; ", paste0(sel2, collapse = " ; "))
  }
  valvec[i]<-skil
}

tafla2$gildi<-valvec

# Add oldest and most recent time series value ----

first<-vector()
last<-vector()

for(i in 1:nrow(tafla2)){
  sel1<-metadatalisti[[i]]$pxweb_metadata$variables
  for(j in 1:length(sel1)){
    sel2<-sel1[[j]]
    #Tables without a time variable are left as NA
    if(sel2$time){
      first[i]<-sel2$valueTexts[1]
      last[i]<-sel2$valueTexts[length(sel2$valueTexts)]
    }
  }
}

tafla2$first_time<-first
tafla2$last_time<-last

# Add date of last update ----

updatevec<-vector()

for(i in 1:nrow(tafla2)){
  updatevec[i]<-metadatalisti[[i]]$metadata[[1]]$updated
}

updatevec2<-as.Date(updatevec)

tafla2$last_update<-updatevec2

# Add column of duplicate titles ----

tafla2<-tafla2 |>
  dplyr::left_join(
    tafla2 |>
      dplyr::group_by(nofn2) |>
      dplyr::summarise(n_duplicates=dplyr::n()),
    by = "nofn2"
  )
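#' dplyr::add_count() would achieve the same in a single step:
# tafla2<-dplyr::add_count(tafla2, nofn2, name = "n_duplicates")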

# Scrape breadcrumbs, about table and comments from pxweb -----

#' Getting a sensible hierarchical organisation of the tables from the API is a bit of a challenge.
#' Scraping metadata from pxweb is also necessary at this point, as it is not always consistent with the API metadata.
#' The inconsistency should of course be resolved!

skyringalisti<-list()
crumbs<-vector()
timi3<-Sys.time()
teljari<-1
remDr$open()
for(k in 1:nrow(tafla2)){
  remDr$navigate(tafla2$slod2[k])
  saekja1<-remDr$findElements(using = "id", "breadcrumb")
  crumb_sub<-unlist(saekja1[[1]]$getElementText())
  crumbs[k]<-crumb_sub
  skyringar<-remDr$findElements(using = "class", "accordion-header")
  for(i in 1:length(skyringar)){
    skyringar[[i]]$clickElement()
  }
  skyringatexti_em<-remDr$findElements(using = "class", "accordion-body")
  if(length(skyringatexti_em)>1){
    skyringatexti<-unlist(skyringatexti_em[[2]]$getElementText())
  }else{
    skyringatexti<-"null"
  }
  sub_headers<-remDr$findElements(using = "class", "nested-accordion-header")
  for(i in 1:length(sub_headers)){
    sub_headers[[i]]$clickElement()
  }
  sub_headers<-remDr$findElements(using = "class", "nested-accordion-header")
  sub_textar<-vector()
  for(i in 1:length(sub_headers)){
    sub_textar[i]<-unlist(sub_headers[[i]]$getElementText())
  }
  sub_bodies<-remDr$findElements(using = "class", "nested-accordion-body")
  sub_bodies_textar<-vector()
  for(i in 1:length(sub_headers)){
    sub_bodies_textar[i]<-unlist(sub_bodies[[i]]$getElementText())
  }
  idtemp<-tafla2$id[k]
  skyringalisti0<-list()
  skyringalisti0[["skyringatexti"]]<-skyringatexti
  skyringalisti0[["sub_textar"]]<-sub_textar
  skyringalisti0[["sub_bodies_textar"]]<-sub_bodies_textar
  print(paste0("Skröpun 3: tafla númer: ", teljari))
  skyringalisti[[idtemp]]<-skyringalisti0
  teljari<-teljari+1
}

print("Skröpun 3 lokið")
Sys.time()-timi3

# Split breadcrumbs to make categories

crumbsplit<-strsplit(crumbs, split = "/ ")
crumblengd<-sapply(crumbsplit, length)

#' Breadcrumb depth can vary, so missing levels are filled with NA
crumb_at<-function(x, i) if(length(x)>=i) x[[i]] else NA_character_
crumb1<-sapply(crumbsplit, crumb_at, 2)
crumb2<-sapply(crumbsplit, crumb_at, 3)
crumb3<-sapply(crumbsplit, crumb_at, 4)

tafla2$crumbs<-crumbs
tafla2$crumbs1<-crumb1
tafla2$crumbs2<-crumb2
tafla2$crumbs3<-crumb3

# Add time stamp

metadatalisti$timastimpill_char<-as.character(timi0)
metadatalisti$timastimpill_POSIXct<-timi0


#' Use a single timestamp so the three output files from one run match
stamp<-as.numeric(Sys.time())
saveRDS(tafla2, paste0(dname,"/uttak/maintab", stamp, ".rds"))
saveRDS(metadatalisti, paste0(dname,"/uttak/apimeta", stamp, ".rds"))
saveRDS(skyringalisti, paste0(dname,"/uttak/pxwebmeta", stamp, ".rds"))

print("Uppfæringu lokið")
Sys.time()-timi0
