# 21-sample-for-multilevel-model.R
Sys.setenv(SPARK_HOME = "/usr/hdp/current/spark2-client")
library(sparklyr)
library(tidyverse)
library(arrow)
library(patchwork)
source(here::here("R/helpers.R"))
message("Connecting to spark...")
config <- spark_config()
config$spark.executor.cores <- 5 # this should always stay the same
config$spark.executor.instances <- 8 # this can go up to 27, depending on RAM
config$spark.executor.memory <- "25G"
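# rough resource arithmetic: these settings request 8 executors * 25G = 200G
# of executor memory in total; per the note above, instances could be raised
# towards 27 if the cluster's RAM allows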
sc <- spark_connect(master = "yarn-client", config = config,
app_name = "OA_APCs")
message("Connection to Spark successful!")
spark_read_parquet(
sc, "works", "/user/tklebel/apc_paper/papers_with_concepts.parquet",
memory = TRUE
)
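# memory = TRUE caches the table in Spark memory, so the repeated queries
# below do not re-read the parquet files from HDFS each time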
works <- tbl(sc, "works")
# restrict to the 2016-2019 period for now.
# ideally we would use a single value for pptop10, but here it varies
# within the period.
selected_works <- works %>%
filter(!is.na(field),
# select period 2016-2019, which includes the respective publications
first_year_of_period == 2016)
# check that selection works
selected_works %>%
summarise(min_year = min(publication_year),
max_year = max(publication_year))
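# expected, if the period filter works as intended: min_year = 2016,
# max_year = 2019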
# fractional authorship shares are generally low, and there are duplicate
# entries because authors can have multiple affiliations
# sample individual papers (so the probability of inclusion is similar
# regardless of how many authors or fields the paper belongs to)
only_papers <- selected_works %>%
distinct(id)
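# why sample distinct ids rather than rows? a paper with many
# institution-field rows would otherwise be more likely to be drawn.
# minimal local illustration (toy data, purely hypothetical):
toy <- tibble(id = c(1, 1, 1, 2), field = c("A", "B", "C", "A"))
toy %>%
  distinct(id) %>%
  slice_sample(prop = .5) # each paper is drawn with equal probability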
n <- sdf_nrow(only_papers)
# 960469 papers are in this set
# sample 8% of papers -> this will yield many more rows than papers, since
# each paper can have multiple institutions and fields
frac <- .08
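# with frac = .08 we expect roughly .08 * 960469 ≈ 76838 distinct papers
# (the join below then expands them to multiple rows again)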
the_sample <- only_papers %>%
sdf_sample(fraction = frac, replacement = FALSE, seed = 20220929) %>%
  left_join(selected_works, by = "id") %>%
collect()
the_sample <- the_sample %>%
# calculate weights for each observation
mutate(total_weight = work_frac * concept_frac)
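# quick sanity check (a sketch; assumes work_frac and concept_frac are shares
# in (0, 1], so their product can never exceed either component):
the_sample %>%
  summarise(max_weight = max(total_weight, na.rm = TRUE))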
# checking the sample -----
# did we actually get about 80k papers?
the_sample %>%
distinct(id) %>%
nrow()
# yep: 76447
# what about the distribution of fields?
the_sample %>%
distinct(id, field, concept_frac) %>%
group_by(field) %>%
summarise(n = sum(concept_frac)) %>%
arrange(desc(n))
# # A tibble: 19 × 2
# field n
# <chr> <dbl>
# 1 Medicine 23422.
# 2 Biology 12584.
# 3 Chemistry 7456.
# 4 Computer science 7185.
# 5 Materials science 6358.
# 6 Psychology 4483.
# 7 Physics 2458.
# 8 Environmental science 2257.
# 9 Political science 1875.
# 10 Geography 1602.
# 11 Sociology 1497.
# 12 Art 1213.
# 13 Business 1158.
# 14 Mathematics 911.
# 15 Geology 761.
# 16 Philosophy 527.
# 17 Economics 322.
# 18 History 198.
# 19 Engineering 180.
# this broadly matches the overall pattern, but the ordinal ranking is not
# identical (expected, given this is a sample and some field totals are close)
# country counts below use full counting of authorships (each country is
# counted once per paper)
the_sample %>%
distinct(id, country) %>%
count(country, sort = TRUE)
# # A tibble: 69 × 2
# country n
# <chr> <int>
# 1 China 15407
# 2 United States 13265
# 3 Brazil 6853
# 4 United Kingdom 4339
# 5 Germany 3461
# 6 Spain 2985
# 7 Japan 2790
# 8 South Korea 2723
# 9 Canada 2574
# 10 Australia 2544
# # … with 59 more rows
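# note: under full counting, a paper whose authors span several countries is
# counted once per country, so these counts sum to more than the 76447 papers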
the_sample %>%
write_csv("data/processed/multilevel_sample_large.csv")
spark_disconnect(sc)