# pxmap_scrape.R
library(RSelenium)
library(jsonlite)
library(pxweb)
# The working directory must contain a folder named uttak;
# all output is saved there.
dname <- dirname(rstudioapi::getActiveDocumentContext()$path)
setwd(dname)
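#' A minimal guard (an addition, not in the original script): create the
#' uttak folder if it is missing, so the saveRDS() calls at the end cannot fail.
if (!dir.exists(file.path(dname, "uttak"))) {
  dir.create(file.path(dname, "uttak"))
}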
# Open web driver ----
#' This should be improved by someone with a better understanding of Docker.
#' Docker must be running before the script is executed.
#' There is no particular reason for using this Firefox version (but everything runs!).
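#' A small pre-flight check (an addition, not in the original): system2()
#' returns the exit status of "docker info", so a non-zero value means the
#' Docker daemon is not reachable and the script should stop early.
if (system2("docker", "info", stdout = FALSE, stderr = FALSE) != 0) {
  stop("Docker does not appear to be running; start it and re-run the script.")
}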
shell("docker pull selenium/standalone-firefox:2.53.0")
shell('docker run -d -p 4445:4444 selenium/standalone-firefox:2.53.0')
Sys.sleep(10)
remDr <- remoteDriver(port = 4445L, browserName = "firefox")
remDr$open()
#' Timing the start of the scraping session
timi0<-Sys.time()
# Get table of level 1 categories ----
remDr$navigate("https://px.hagstofa.is/pxis/pxweb/is/")
takki_1<-remDr$findElements(using = "xpath", "//a[contains(@href, 'pxweb')]")
tafla_1_nofn<-vector()
tafla_1_slodir<-vector()
for(i in 1:length(takki_1)){
  tafla_1_nofn[i]<-unlist(takki_1[[i]]$getElementText())
  tafla_1_slodir[i]<-unlist(takki_1[[i]]$getElementAttribute("href"))
}
tafla1<-data.frame("nofn"=tafla_1_nofn, "slodir"=tafla_1_slodir)
tafla1<-tafla1[tafla1$nofn!="",]
# Scrape table names and urls ----
teljari<-1
timi1<-Sys.time()
tafla2<-data.frame(nofn2="", slod2="")
# Loop iterates through the level 1 categories
for(k in 1:nrow(tafla1)){
  curr_path_1<-tafla1$slodir[k]
  curr_nafn_1<-tafla1$nofn[k]
  remDr$navigate(curr_path_1)
  # Find all expandable tree elements
  takki_2<-remDr$findElements(using = "class", "AspNet-TreeView-Expand")
  # While there are expandable elements, click the first one and search again
  while(length(takki_2)>0){
    takki_2[[1]]$clickElement()
    takki_2<-remDr$findElements(using = "class", "AspNet-TreeView-Expand")
  }
  # Once all elements are expanded, find all elements with an href attribute.
  # This line would have to be changed to scrape the English version.
  takki_3<-remDr$findElements(using = "xpath", "//a[contains(@href, 'pxis')]")
  nofn2<-vector()
  slodir2<-vector()
  # Loop through all elements, extracting the text and href attributes
  for(i in 1:length(takki_3)){
    nofn2[i]<-unlist(takki_3[[i]]$getElementText())
    slodir2[i]<-unlist(takki_3[[i]]$getElementAttribute("href"))
    print(paste0("Scrape 1: table number ", teljari))
    print(nofn2[i])
    teljari<-teljari+1
  }
  tafla2_temp<-data.frame("nofn2"=nofn2, "slod2"=slodir2)
  # Remove links that are not tables (table urls are longer than category urls)
  tafla2_temp$nchar<-nchar(tafla2_temp$slod2)
  tafla2_temp<-tafla2_temp[tafla2_temp$nchar>37,]
  tafla2_temp$nchar<-NULL
  tafla2<-rbind(tafla2, tafla2_temp)
}
print("Skröpun 1 lokið")
Sys.time()-timi1
# Make API urls from px urls ----
#' This part is problematic:
#' the code would need to be updated if new level 1 databases were added.
#' Using the API to list tables and building the pxweb links from that listing
#' would be faster and more robust if it were possible, but occasional
#' eccentricities in pxweb links prevent using the API for listing, and the
#' API approach would also miss duplicates on the pxweb.
#' If these problems are fixed I recommend using
#' https://github.com/bgautijonsson/hagstofa_scraping/tree/master/R instead.
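#' For reference, a minimal sketch of the API-listing approach described
#' above (an assumption about the standard PXWeb v1 listing endpoint, not
#' part of this pipeline): a GET on a path returns nodes whose type is "l"
#' (level) or "t" (table), so table urls can be collected recursively.
list_px_tables <- function(url) {
  nodes <- jsonlite::fromJSON(url, simplifyDataFrame = FALSE)
  urls <- character()
  for (node in nodes) {
    child <- paste0(url, "/", node$id)
    if (node$type == "t") {
      urls <- c(urls, child)
    } else {
      urls <- c(urls, list_px_tables(child))
    }
  }
  urls
}
# Example (not run): list_px_tables("https://px.hagstofa.is:443/pxis/api/v1/is")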
tafla2<-tafla2[tafla2$nofn2!="",] # drop the empty seed row and links without text
tafla2$apiurl2<-gsub("__", "/", tafla2$slod2)
tafla2$apiurl3<-gsub("https://px.hagstofa.is/pxis/pxweb/is/", "https://px.hagstofa.is:443/pxis/api/v1/is/", tafla2$apiurl2)
tafla2$apiurl3<-gsub("/Atvinnuvegir/Atvinnuvegir", "/Atvinnuvegir", tafla2$apiurl3)
tafla2$apiurl3<-gsub("/Efnahagur/Efnahagur", "/Efnahagur", tafla2$apiurl3)
tafla2$apiurl3<-gsub("/Ibuar/Ibuar", "/Ibuar", tafla2$apiurl3)
tafla2$apiurl3<-gsub("/Samfelag/Samfelag", "/Samfelag", tafla2$apiurl3)
tafla2$apiurl3<-gsub("/Sogulegar/Sogulegar", "/Sogulegar", tafla2$apiurl3)
tafla2$apiurl3<-gsub("/Umhverfi/Umhverfi", "/Umhverfi", tafla2$apiurl3)
tafla2$apiurl3<-gsub(".px/", ".px", tafla2$apiurl3)
tafla2$apiurl3<-gsub("%20", " ", tafla2$apiurl3)
tafla2$apiurl3<-gsub("px_autoopen=true", "", tafla2$apiurl3)
tafla2$apiurl3<-gsub("\\?", "", tafla2$apiurl3)
tafla2$apiurl3<-gsub("%C3%B0", "ð", tafla2$apiurl3)
tafla2$apiurl<-tafla2$apiurl3
tafla2$apiurl2<-NULL
tafla2$apiurl3<-NULL
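#' The percent-decoding above only handles %20 and the eth character (%C3%B0).
#' A more general alternative (a sketch, not verified against this pipeline):
#' utils::URLdecode() decodes all percent-encoded bytes, but is not
#' vectorised, so it has to be applied per element.
# tafla2$apiurl <- sapply(tafla2$apiurl, utils::URLdecode, USE.NAMES = FALSE)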
# Make simple id strings from urls ----
#' The id is made from the filename at the end of each url.
#' Filenames are not unique for duplicate tables, so a row-number suffix is
#' added in case their metadata differs.
ids<-strsplit(tafla2$apiurl, split = "/")
id<-vector()
for(i in 1:length(ids)){
  temp<-ids[[i]]
  id[i]<-temp[length(temp)]
}
id<-gsub("px", "", id)
id<-gsub("\\.", "", id)
tafla2$id<-paste0(id, "_", 1:nrow(tafla2))
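#' Equivalent one-liner (a sketch; basename() returns the final path
#' component, so this should reproduce the strsplit loop above):
# stopifnot(identical(id, gsub("px|\\.", "", basename(tafla2$apiurl))))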
# Use API to get metadata -----
metadatalisti<-list()
timi2<-Sys.time()
breytur<-vector()
for(k in 1:nrow(tafla2)){
  #' The first API query returns the list of variables.
  #' No data is requested, so the API does not return other metadata.
  #' The variable list is used to build a minimal data query whose response
  #' carries the metadata.
  pxd0<-pxweb_get(tafla2$apiurl[k])
  vars0<-pxd0$variables
  codes<-vector()
  values<-vector()
  for(i in 1:length(vars0)){
    codes[i]<-vars0[[i]]$code
    values[i]<-vars0[[i]]$values[1]
  }
  qlisti<-list()
  for(i in 1:length(vars0)){
    qlisti[[codes[i]]]<-values[i]
  }
  pxq <- pxweb_query(qlisti)
  pxd <- pxweb_get(tafla2$apiurl[k], pxq) # Second query
  pxd$data<-NULL # Remove the data, keeping only the metadata
  #' Metadata is stored in a list, in an element named by the table id
  idtemp<-tafla2$id[k]
  metadatalisti[[idtemp]]<-pxd
  #' Build a vector of variable names and collapse it into a single string,
  #' giving a column of variable names for tafla2
  breytur_sub<-vector()
  for(i in 1:length(pxd$pxweb_metadata$variables)){
    breytur_sub[i]<-pxd$pxweb_metadata$variables[[i]]$text
  }
  breytur_sub<-paste0(breytur_sub, collapse = " ; ")
  breytur[k]<-breytur_sub
  print(paste0("Scrape 2: table number ", k))
  print(tafla2$nofn2[k])
  # Rate limiting with a fixed delay; this could be improved with the
  # polite package (see the sketch after this loop)
  Sys.sleep(0.5)
}
print("Skröpun 2 lokið")
Sys.time()-timi2
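#' A minimal sketch of the polite-based improvement mentioned in the loop
#' above (an assumption, not part of the original pipeline): bow() reads
#' robots.txt and records the permitted crawl delay for the host, which
#' could replace the hard-coded Sys.sleep(0.5).
if (requireNamespace("polite", quietly = TRUE)) {
  session <- polite::bow("https://px.hagstofa.is/pxis/pxweb/is/",
                         user_agent = "pxmap scraper")
  print(session) # shows whether scraping is permitted and the crawl delay
}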
tafla2$breytur<-breytur
# Add value column ----
#' Make a column holding all values of all variables.
#' Can be used to add a hidden searchable column to the main table in the app.
valvec<-vector()
for(i in 1:nrow(tafla2)){
  sel1<-metadatalisti[[i]]$pxweb_metadata$variables
  skil<-""
  for(j in 1:length(sel1)){
    sel2<-sel1[[j]]$valueTexts
    skil<-paste0(skil, " ; ", paste0(sel2, collapse = " ; "))
  }
  valvec[i]<-skil
}
tafla2$gildi<-valvec
# Add oldest and most recent time series value ----
#' Pre-allocate with NA so the vectors keep the right length even if some
#' table has no time variable (unset trailing elements would otherwise make
#' the column assignments below fail)
first<-rep(NA_character_, nrow(tafla2))
last<-rep(NA_character_, nrow(tafla2))
for(i in 1:nrow(tafla2)){
  sel1<-metadatalisti[[i]]$pxweb_metadata$variables
  for(j in 1:length(sel1)){
    sel2<-sel1[[j]]
    if(sel2$time){
      first[i]<-sel2$valueTexts[1]
      last[i]<-sel2$valueTexts[length(sel2$valueTexts)]
    }
  }
}
tafla2$first_time<-first
tafla2$last_time<-last
# Add date of last update ----
updatevec<-vector()
for(i in 1:nrow(tafla2)){
  updatevec[i]<-metadatalisti[[i]]$metadata[[1]]$updated
}
updatevec2<-as.Date(updatevec)
tafla2$last_update<-updatevec2
# Add column of duplicate titles ----
tafla2<-tafla2 |>
  dplyr::left_join(
    tafla2 |>
      dplyr::group_by(nofn2) |>
      dplyr::summarise(n_duplicates=dplyr::n()),
    by = "nofn2" # make the implicit join key explicit
  )
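#' Equivalent shorthand (a sketch, same result assuming nofn2 is the only
#' shared column): dplyr::add_count() counts rows per group and appends the
#' count as a new column in one step.
# tafla2 <- dplyr::add_count(tafla2, nofn2, name = "n_duplicates")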
# Scrape breadcrumbs, about-table text and comments from pxweb -----
#' Getting a sensible hierarchical organisation of the tables from the API is a bit of a challenge.
#' Scraping metadata from pxweb is also necessary at this point, as it is not always consistent with the API metadata.
#' The inconsistency should of course be resolved!
skyringalisti<-list()
crumbs<-vector()
timi3<-Sys.time()
teljari<-1
remDr$open() # Reopen the browser session for the breadcrumb scrape
for(k in 1:nrow(tafla2)){
  remDr$navigate(tafla2$slod2[k])
  saekja1<-remDr$findElements(using = "id", "breadcrumb")
  crumb_sub<-unlist(saekja1[[1]]$getElementText())
  crumbs[k]<-crumb_sub
  # Expand all accordion sections so their text becomes available
  skyringar<-remDr$findElements(using = "class", "accordion-header")
  for(i in 1:length(skyringar)){
    skyringar[[i]]$clickElement()
  }
  skyringatexti_em<-remDr$findElements(using = "class", "accordion-body")
  if(length(skyringatexti_em)>1){
    skyringatexti<-unlist(skyringatexti_em[[2]]$getElementText())
  }else{
    skyringatexti<-"null"
  }
  sub_headers<-remDr$findElements(using = "class", "nested-accordion-header")
  for(i in 1:length(sub_headers)){
    sub_headers[[i]]$clickElement()
  }
  sub_headers<-remDr$findElements(using = "class", "nested-accordion-header")
  sub_textar<-vector()
  for(i in 1:length(sub_headers)){
    sub_textar[i]<-sub_headers[[i]]$getElementText()
  }
  sub_bodies<-remDr$findElements(using = "class", "nested-accordion-body")
  sub_bodies_textar<-vector()
  for(i in 1:length(sub_headers)){
    sub_bodies_textar[i]<-sub_bodies[[i]]$getElementText()
  }
  idtemp<-tafla2$id[k]
  skyringalisti0<-list()
  skyringalisti0[["skyringatexti"]]<-skyringatexti
  skyringalisti0[["sub_textar"]]<-sub_textar
  skyringalisti0[["sub_bodies_textar"]]<-sub_bodies_textar
  print(paste0("Scrape 3: table number ", teljari))
  skyringalisti[[idtemp]]<-skyringalisti0
  teljari<-teljari+1
}
print("Skröpun 3 lokið")
Sys.time()-timi3
# Split breadcrumbs to make categories ----
crumbsplit<-strsplit(crumbs, split = "/ ")
crumblengd<-sapply(crumbsplit, length)
# The first element is the site root, so elements 2-4 hold the category levels
crumb1<-sapply(crumbsplit, "[[", 2)
crumb2<-sapply(crumbsplit, "[[", 3)
crumb3<-sapply(crumbsplit, "[[", 4)
tafla2$crumbs<-crumbs
tafla2$crumbs1<-crumb1
tafla2$crumbs2<-crumb2
tafla2$crumbs3<-crumb3
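#' A defensive variant (an assumption, only needed if some breadcrumb has
#' fewer than four parts, in which case "[[" above errors): this helper
#' returns NA for missing levels instead.
safe_crumb <- function(x, n) if (length(x) >= n) x[[n]] else NA_character_
# e.g. crumb1 <- sapply(crumbsplit, safe_crumb, 2)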
# Add time stamp and save output ----
metadatalisti$timastimpill_char<-as.character(timi0)
metadatalisti$timastimpill_POSIXct<-timi0
# Use a single time stamp so the three output files of one run match
stamp<-as.numeric(Sys.time())
saveRDS(tafla2, paste0(dname, "/uttak/maintab", stamp, ".rds"))
saveRDS(metadatalisti, paste0(dname, "/uttak/apimeta", stamp, ".rds"))
saveRDS(skyringalisti, paste0(dname, "/uttak/pxwebmeta", stamp, ".rds"))
print("Uppfæringu lokið")
Sys.time()-timi0