-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathharmonized_key_variables.R
156 lines (128 loc) · 6.5 KB
/
harmonized_key_variables.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
library(data.table)
library(dplyr)
# Suggestions for harmonizing the key variables of the auxiliary data files.
# 1- the reference year variable is going to be renamed as surveyid_year
# 2- reporting-level variables such as pop_data_level, cpi_data_level, pce_data_level, ppp_data_level, and gdp_data_level
# will be renamed as reporting_level
# Here are suggestions for each auxiliary dataset to harmonize its key variables:
# cpi ---------------------------------------------------
# Key variables of cpi: country_code, cpi_year, survey_acronym and
# cpi_data_level.
# Harmonized names: cpi_year -> surveyid_year,
#                   cpi_data_level -> reporting_level.
cpi <- pipload::pip_load_aux("cpi")
# setnames() renames the columns by reference, so the result does not need to
# be assigned back.
setnames(
  cpi,
  old = c("cpi_year", "cpi_data_level"),
  new = c("surveyid_year", "reporting_level")
)
# pop ---------------------------------------------------
# Key variables of pop: country_code, year and pop_data_level.
# Harmonized names: year -> surveyid_year,
#                   pop_data_level -> reporting_level.
pop <- pipload::pip_load_aux("pop")
# setnames() renames the columns by reference, so the result does not need to
# be assigned back.
setnames(
  pop,
  old = c("year", "pop_data_level"),
  new = c("surveyid_year", "reporting_level")
)
# pce ---------------------------------------------------
# Key variables of pce: country_code, year and pce_data_level.
# Harmonized names: year -> surveyid_year,
#                   pce_data_level -> reporting_level.
pce <- pipload::pip_load_aux("pce")
# setnames() renames the columns by reference, so the result does not need to
# be assigned back.
setnames(
  pce,
  old = c("year", "pce_data_level"),
  new = c("surveyid_year", "reporting_level")
)
# gdp ---------------------------------------------------
# Key variables of gdp: country_code, year and gdp_data_level.
# Harmonized names: year -> surveyid_year,
#                   gdp_data_level -> reporting_level.
gdp <- pipload::pip_load_aux("gdp")
# setnames() renames the columns by reference, so the result does not need to
# be assigned back.
setnames(
  gdp,
  old = c("year", "gdp_data_level"),
  new = c("surveyid_year", "reporting_level")
)
# gdm ---------------------------------------------------
# Key variables of gdm: country_code, a year variable and pop_data_level.
# Harmonized name: pop_data_level -> reporting_level.
gdm <- pipload::pip_load_aux("gdm")
# setnames() renames the column by reference, so the result does not need to
# be assigned back.
setnames(gdm, old = "pop_data_level", new = "reporting_level")

# Is any combination of the harmonized key variables repeated?
# NOTE(review): the check keys on surveyid_year although only pop_data_level is
# renamed above; confirm that gdm already carries a surveyid_year column.
anyDuplicated(
  gdm,
  by = c("country_code", "surveyid_year", "reporting_level")
) > 0L
# ppp ---------------------------------------------------
# Key variables of ppp: country_code, ppp_year, ppp_data_level, release,
# adaption_version and cpi_data_level (names as listed in the original note;
# TODO confirm the exact spelling against the file).
# To merge ppp with any other auxiliary dataset, keep only the rows flagged by
# the ppp_default variable and rename ppp_data_level to reporting_level.
ppp <- pipload::pip_load_aux("ppp")
ppp <- ppp[ppp_default == TRUE]
# setnames() renames the column by reference, so the result does not need to
# be assigned back.
setnames(ppp, old = "ppp_data_level", new = "reporting_level")

# Is any country_code / reporting_level combination repeated after filtering?
anyDuplicated(ppp, by = c("country_code", "reporting_level")) > 0L
# maddison ---------------------------------------------------
# Key variables of maddison: country_code and year.
# Harmonized name: year -> surveyid_year.
maddison <- pipload::pip_load_aux("maddison")
# setnames() renames the column by reference, so the result does not need to
# be assigned back.
setnames(maddison, old = "year", new = "surveyid_year")
# weo ---------------------------------------------------
# Key variables of weo: country_code and year.
# Harmonized name: year -> surveyid_year.
weo <- pipload::pip_load_aux("weo")
# setnames() renames the column by reference, so the result does not need to
# be assigned back.
setnames(weo, old = "year", new = "surveyid_year")
# npl ---------------------------------------------------
# Key variables of npl: country_code and reporting_year.
# Harmonized name: reporting_year -> surveyid_year. This rename is only valid
# if the content of reporting_year matches that of surveyid_year (to be
# confirmed).
npl <- pipload::pip_load_aux("npl")
# setnames() renames the column by reference, so the result does not need to
# be assigned back.
setnames(npl, old = "reporting_year", new = "surveyid_year")
# pfw ---------------------------------------------------
# Key variables of pfw: country_code, a year variable (year, surveyid_year,
# survey_year or reporting_year) and welfare_type (or survey_acronym).
# A reporting_level variable is needed to merge pfw with any of the other
# auxiliary datasets. Open question: can reporting_level be included in pfw
# itself? If so, the derivation below (which takes it from the cpi dataset) is
# no longer needed.
pfw <- pipload::pip_load_aux("pfw")

# Candidate key columns of pfw together with its cpi domain information.
pfw_key_options <- pfw[, .(
  country_code,
  year,
  surveyid_year,
  survey_acronym,
  survey_coverage,
  welfare_type,
  survey_year,
  cpi_domain,
  cpi_domain_var
)]
# Encode cpi_domain_var as 0 for "urban" and 1 otherwise, so it can be matched
# against cpi_domain_value of the cpi dataset. `:=` adds the column by
# reference; pfw itself is untouched because the selection above is a new
# table.
pfw_key_options[, cpi_domain_value := fifelse(cpi_domain_var == "urban", 0, 1)]

# load cpi ---------------------------------------------------
# NOTE(review): this assignment overwrites any `cpi` object that already exists
# in the session (for instance one whose columns were renamed earlier);
# presumably the reload is there because the original cpi_data_level name is
# needed below -- confirm.
cpi <- pipload::pip_load_aux("cpi")
# Join keys of cpi plus its data level, exposed as reporting_level.
cpi_key <- cpi[, .(
  country_code,
  survey_year,
  survey_acronym,
  cpi_domain,
  cpi_domain_value,
  reporting_level = cpi_data_level
)]
# Recode cpi_domain to numeric codes for the join: 1 for "National", 2
# otherwise. fifelse() with numeric literals already returns a double vector,
# so no separate as.numeric() conversion is required.
cpi_key[, cpi_domain := fifelse(cpi_domain == "National", 1, 2)]

# Keep every row of pfw_key_options and attach the matching reporting_level.
pfw_cpi_key <- cpi_key[
  pfw_key_options,
  on = .(
    country_code, survey_year, survey_acronym,
    cpi_domain, cpi_domain_value
  )
]

# Only the join keys and reporting_level are carried forward. No year
# variables are filled in at this point: they are not part of the selection,
# and cpi_year is not available in cpi_key to fill them from.
pfw_cpi_key <- pfw_cpi_key[, .(
  country_code,
  survey_year,
  survey_acronym,
  reporting_level,
  cpi_domain
)]

# add reporting level variable into pfw dataset
pfw <- pfw_cpi_key[
  pfw,
  on = .(country_code, survey_year, survey_acronym, cpi_domain)
]
# income_groups ---------------------------------------------------
# Key variables of income_groups: country_code and year_data.
# Harmonized name: year_data -> surveyid_year.
income_groups <- pipload::pip_load_aux("income_groups")
# setnames() renames the column by reference, so the result does not need to
# be assigned back.
setnames(income_groups, old = "year_data", new = "surveyid_year")