---
title: "Relationship between OA publishing, APCs and IF"
author: "Thomas Klebel"
date: "12 October, 2022"
output:
  html_document:
    keep_md: true
---
# Sample description
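
The chunks below assume a sparklyr connection `sc` and a Spark table `works`
(apparently one row per work × author × institution × concept combination,
hence the repeated `distinct()` calls), both established in earlier scripts of
this repository, plus the tidyverse, colorspace, patchwork and scales
namespaces. A minimal sketch of the assumed setup, with illustrative names and
path:

```r
# Assumed setup; the actual connection and data preparation happen in earlier
# scripts of this repository (the parquet path here is illustrative)
library(tidyverse)
library(sparklyr)
library(colorspace) # scale_colour_continuous_sequential(), scale_color_discrete_qualitative()
library(patchwork)  # p1 / p2, plot_layout(), plot_annotation()
library(scales)     # comma(), dollar()

sc <- spark_connect(master = "local")
works <- spark_read_parquet(sc, name = "works", path = "data/works.parquet")
```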

## Institutions per country

```r
universities_per_country <- works %>% 
  distinct(country, University) %>% 
  count(country, name = "n_universities") %>% 
  arrange(desc(n_universities)) %>% 
  collect()

universities_per_country %>% 
  knitr::kable()
```
| country              | n_universities |
|:---------------------|---------------:|
| China                | 216 |
| United States        | 199 |
| United Kingdom       | 60  |
| Japan                | 55  |
| Germany              | 54  |
| South Korea          | 46  |
| Spain                | 42  |
| Italy                | 41  |
| India                | 38  |
| Iran                 | 36  |
| Australia            | 32  |
| Turkey               | 31  |
| Brazil               | 31  |
| Poland               | 31  |
| Canada               | 30  |
| France               | 28  |
| Taiwan               | 21  |
| Netherlands          | 13  |
| Austria              | 12  |
| Sweden               | 12  |
| Russia               | 10  |
| South Africa         | 9   |
| Greece               | 8   |
| Belgium              | 8   |
| Switzerland          | 8   |
| Egypt                | 8   |
| Israel               | 8   |
| Czech Republic       | 7   |
| New Zealand          | 7   |
| Finland              | 7   |
| Thailand             | 6   |
| Portugal             | 6   |
| Hungary              | 6   |
| Ireland              | 6   |
| Mexico               | 6   |
| Malaysia             | 6   |
| Norway               | 6   |
| Saudi Arabia         | 5   |
| Denmark              | 5   |
| Pakistan             | 5   |
| Singapore            | 3   |
| Tunisia              | 3   |
| Romania              | 3   |
| Argentina            | 3   |
| Chile                | 3   |
| Serbia               | 3   |
| Colombia             | 3   |
| Jordan               | 2   |
| Slovakia             | 2   |
| Nigeria              | 2   |
| United Arab Emirates | 2   |
| Slovenia             | 2   |
| Oman                 | 1   |
| Cyprus               | 1   |
| Estonia              | 1   |
| Uganda               | 1   |
| Luxembourg           | 1   |
| Kuwait               | 1   |
| Algeria              | 1   |
| Croatia              | 1   |
| Ghana                | 1   |
| Iceland              | 1   |
| Qatar                | 1   |
| Viet Nam             | 1   |
| Uruguay              | 1   |
| Ethiopia             | 1   |
| Lithuania            | 1   |
| Morocco              | 1   |
| Lebanon              | 1   |
```r
# papers per country
papers_per_country <- works %>% 
  distinct(country, country_code, id, work_frac, author_position, institution_id) %>% 
  group_by(country, country_code) %>% 
  summarise(sum_fractional_works = sum(work_frac) %>% round(digits = 1)) %>% 
  arrange(desc(sum_fractional_works)) %>% 
  collect()
```

```
## `summarise()` has grouped output by 'country'. You can override using the `.groups` argument.
```

```r
papers_per_country %>% 
  select(-country_code) %>% 
  knitr::kable()
```
| country              | sum_fractional_works |
|:---------------------|---------------------:|
| China                | 460,832.3 |
| United States        | 456,678.0 |
| Brazil               | 266,241.3 |
| United Kingdom       | 130,906.7 |
| Germany              | 114,970.8 |
| Japan                | 90,599.2  |
| Canada               | 87,827.8  |
| Spain                | 84,138.8  |
| South Korea          | 79,297.7  |
| Australia            | 78,220.5  |
| Italy                | 77,622.7  |
| Netherlands          | 42,592.2  |
| Taiwan               | 40,111.3  |
| Sweden               | 38,855.0  |
| Poland               | 37,813.5  |
| France               | 31,836.3  |
| Switzerland          | 29,575.2  |
| Iran                 | 28,694.7  |
| India                | 27,611.5  |
| South Africa         | 25,886.0  |
| Belgium              | 24,577.5  |
| Denmark              | 19,803.7  |
| Malaysia             | 18,720.5  |
| Turkey               | 18,233.7  |
| Mexico               | 17,757.8  |
| Austria              | 17,341.0  |
| Portugal             | 16,621.8  |
| Israel               | 15,475.0  |
| Norway               | 14,952.8  |
| Saudi Arabia         | 12,935.7  |
| Finland              | 12,476.2  |
| Russia               | 11,144.3  |
| Singapore            | 11,055.0  |
| Egypt                | 10,171.3  |
| Thailand             | 10,076.0  |
| Greece               | 9,671.2   |
| New Zealand          | 9,474.0   |
| Colombia             | 9,076.3   |
| Czech Republic       | 8,389.5   |
| Chile                | 8,320.2   |
| Ireland              | 7,870.3   |
| Argentina            | 7,579.8   |
| Hungary              | 6,309.3   |
| Serbia               | 6,008.7   |
| Pakistan             | 4,285.5   |
| Slovenia             | 4,175.7   |
| Croatia              | 3,819.3   |
| Nigeria              | 2,140.8   |
| Romania              | 1,835.3   |
| Uruguay              | 1,702.5   |
| Slovakia             | 1,628.2   |
| Estonia              | 1,590.0   |
| Uganda               | 1,561.7   |
| Jordan               | 1,508.5   |
| United Arab Emirates | 1,500.0   |
| Lebanon              | 1,475.3   |
| Oman                 | 1,250.8   |
| Tunisia              | 1,221.0   |
| Lithuania            | 1,221.0   |
| Ghana                | 1,194.2   |
| Ethiopia             | 1,106.8   |
| Kuwait               | 876.5     |
| Iceland              | 778.3     |
| Qatar                | 650.8     |
| Luxembourg           | 643.8     |
| Morocco              | 538.3     |
| Viet Nam             | 463.5     |
| Cyprus               | 383.8     |
| Algeria              | 117.8     |
```r
# average APC
average_apc <- works %>%
  # first get rid of duplicates from concepts
  distinct(country, country_code, id, work_frac, author_position, institution_id,
           APC_in_dollar) %>% 
  group_by(country) %>%
  # compute the average APC using fractional authorships as weights
  mutate(sum_frac = sum(work_frac)) %>%
  group_by(country, sum_frac) %>%
  summarise(mean_apc = sum(work_frac * APC_in_dollar) / sum_frac) %>% 
  collect()
```

```
## `summarise()` has grouped output by 'country'. You can override using the
## `.groups` argument.
```
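
The expression `sum(work_frac * APC_in_dollar) / sum_frac` is a weighted mean
of APCs with fractional authorship as the weight, i.e. equivalent to
`weighted.mean()`; a toy check with made-up numbers:

```r
# Toy check (made-up numbers): the fractional weighting above is a plain
# weighted mean with work_frac as weights
apc  <- c(1000, 2000, 1500)
frac <- c(0.5, 0.25, 1)
sum(frac * apc) / sum(frac) # 1428.571
weighted.mean(apc, frac)    # identical
```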
```r
# average APC over time
average_apc_time <- works %>%
  # first get rid of duplicates from concepts
  distinct(country, country_code, id, work_frac, author_position, institution_id,
           APC_in_dollar, publication_year) %>% 
  group_by(country, country_code, publication_year) %>%
  # compute the average APC using fractional authorships as weights
  mutate(sum_frac = sum(work_frac)) %>%
  group_by(country, country_code, sum_frac, publication_year) %>%
  summarise(mean_apc = sum(work_frac * APC_in_dollar) / sum_frac) %>% 
  collect()
```

```
## `summarise()` has grouped output by 'country', 'country_code', 'sum_frac'. You
## can override using the `.groups` argument.
```
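
The `wdi` lookup joined in below (and repeatedly later on) is assumed to map
ISO2 country codes (`iso2c`) to World Bank regions; a minimal sketch of how
such a table could be built from the country metadata bundled with the WDI
package:

```r
# Hypothetical construction of the `wdi` lookup (iso2c -> region); the actual
# object is created in an earlier script of this repository
wdi <- WDI::WDI_data$country %>% 
  as.data.frame() %>% 
  select(iso2c, region)
```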
```r
average_apc_time %>% 
  left_join(wdi, by = c("country_code" = "iso2c")) %>% 
  ggplot(aes(publication_year, mean_apc)) +
  geom_line(aes(group = country), alpha = .3) +
  geom_smooth(se = FALSE, colour = "#007FA8") +
  facet_wrap(vars(region)) +
  scale_x_continuous(breaks = scales::pretty_breaks(6)) +
  coord_cartesian(ylim = c(0, 3000)) +
  labs(x = NULL, y = "Mean APC")
```

```
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
```

## All three joined

```r
all_three_descriptives <- universities_per_country %>% 
  left_join(papers_per_country, by = "country") %>% 
  left_join(average_apc, by = "country")

all_three_descriptives %>% 
  arrange(desc(n_universities), desc(sum_fractional_works)) %>% 
  mutate(mean_apc = round(mean_apc, digits = 1),
         sum_fractional_works = scales::comma(sum_fractional_works)) %>% 
  select(Country = country, `n universities` = n_universities,
         `n fractional publications` = sum_fractional_works, 
         `Mean APC` = mean_apc) %>% 
  knitr::kable()
```
| Country              | n universities | n fractional publications | Mean APC |
|:---------------------|---------------:|--------------------------:|---------:|
| China                | 216 | 460,832.3 | 2002.6 |
| United States        | 199 | 456,678.0 | 2099.0 |
| United Kingdom       | 60  | 130,906.7 | 2042.1 |
| Japan                | 55  | 90,599.2  | 1920.8 |
| Germany              | 54  | 114,970.8 | 2014.4 |
| South Korea          | 46  | 79,297.7  | 1843.7 |
| Spain                | 42  | 84,138.8  | 1176.2 |
| Italy                | 41  | 77,622.7  | 1919.2 |
| India                | 38  | 27,611.5  | 1052.3 |
| Iran                 | 36  | 28,694.7  | 818.1  |
| Australia            | 32  | 78,220.5  | 1982.5 |
| Brazil               | 31  | 266,241.3 | 429.0  |
| Poland               | 31  | 37,813.5  | 1149.0 |
| Turkey               | 31  | 18,233.7  | 933.4  |
| Canada               | 30  | 87,827.8  | 1926.1 |
| France               | 28  | 31,836.3  | 1963.1 |
| Taiwan               | 21  | 40,111.3  | 1920.0 |
| Netherlands          | 13  | 42,592.2  | 2055.0 |
| Sweden               | 12  | 38,855.0  | 2033.2 |
| Austria              | 12  | 17,341.0  | 1958.8 |
| Russia               | 10  | 11,144.3  | 734.9  |
| South Africa         | 9   | 25,886.0  | 1167.1 |
| Switzerland          | 8   | 29,575.2  | 2190.5 |
| Belgium              | 8   | 24,577.5  | 1955.2 |
| Israel               | 8   | 15,475.0  | 2189.4 |
| Egypt                | 8   | 10,171.3  | 991.9  |
| Greece               | 8   | 9,671.2   | 1643.8 |
| Finland              | 7   | 12,476.2  | 1899.1 |
| New Zealand          | 7   | 9,474.0   | 1861.0 |
| Czech Republic       | 7   | 8,389.5   | 1371.1 |
| Malaysia             | 6   | 18,720.5  | 1271.4 |
| Mexico               | 6   | 17,757.8  | 1072.4 |
| Portugal             | 6   | 16,621.8  | 1231.3 |
| Norway               | 6   | 14,952.8  | 1877.1 |
| Thailand             | 6   | 10,076.0  | 1673.7 |
| Ireland              | 6   | 7,870.3   | 1998.1 |
| Hungary              | 6   | 6,309.3   | 1791.9 |
| Denmark              | 5   | 19,803.7  | 2008.1 |
| Saudi Arabia         | 5   | 12,935.7  | 1561.1 |
| Pakistan             | 5   | 4,285.5   | 1190.0 |
| Singapore            | 3   | 11,055.0  | 2198.9 |
| Colombia             | 3   | 9,076.3   | 445.3  |
| Chile                | 3   | 8,320.2   | 1075.9 |
| Argentina            | 3   | 7,579.8   | 664.7  |
| Serbia               | 3   | 6,008.7   | 765.2  |
| Romania              | 3   | 1,835.3   | 1129.6 |
| Tunisia              | 3   | 1,221.0   | 1378.1 |
| Slovenia             | 2   | 4,175.7   | 1133.1 |
| Nigeria              | 2   | 2,140.8   | 1251.3 |
| Slovakia             | 2   | 1,628.2   | 918.8  |
| Jordan               | 2   | 1,508.5   | 1444.6 |
| United Arab Emirates | 2   | 1,500.0   | 1876.0 |
| Croatia              | 1   | 3,819.3   | 607.2  |
| Uruguay              | 1   | 1,702.5   | 690.7  |
| Estonia              | 1   | 1,590.0   | 1588.3 |
| Uganda               | 1   | 1,561.7   | 1895.4 |
| Lebanon              | 1   | 1,475.3   | 1832.5 |
| Oman                 | 1   | 1,250.8   | 618.6  |
| Lithuania            | 1   | 1,221.0   | 1351.3 |
| Ghana                | 1   | 1,194.2   | 1721.3 |
| Ethiopia             | 1   | 1,106.8   | 1750.5 |
| Kuwait               | 1   | 876.5     | 1673.2 |
| Iceland              | 1   | 778.3     | 1698.7 |
| Qatar                | 1   | 650.8     | 1742.2 |
| Luxembourg           | 1   | 643.8     | 2149.2 |
| Morocco              | 1   | 538.3     | 980.9  |
| Viet Nam             | 1   | 463.5     | 1518.1 |
| Cyprus               | 1   | 383.8     | 1760.4 |
| Algeria              | 1   | 117.8     | 838.8  |
```r
# GDP per capita for 2019 (WDI's default indicator is NY.GDP.PCAP.KD)
gdp <- WDI::WDI(start = 2019, end = 2019)

# plot n papers against average apc
p <- all_three_descriptives %>% 
  left_join(wdi, by = c("country_code" = "iso2c")) %>% 
  ggplot(aes(sum_fractional_works, mean_apc, colour = region, label = country)) +
  geom_point() +
  scale_x_log10(labels = scales::comma) +
  scale_y_continuous(labels = scales::comma) +
  theme(legend.position = "top") +
  labs(y = NULL, colour = NULL, x = "Sum of fractional publications")
p
```

```r
plotly::ggplotly(p)
```


```r
pdata <- all_three_descriptives %>% 
  left_join(gdp, by = c("country_code" = "iso2c")) %>% 
  left_join(wdi, by = c("country_code" = "iso2c"))

labels <- pdata %>% 
  mutate(label = case_when(
    country.x %in% c("China", "India", "United States",
                     "Brazil", "Switzerland", "Israel", "Spain",
                     "Saudi Arabia") ~ country.x,
    TRUE ~ ""))

p <- pdata %>% 
  ggplot(aes(NY.GDP.PCAP.KD, mean_apc, colour = region, label = country.x)) +
  geom_point(aes(alpha = sum_fractional_works)) +
  ggrepel::geom_text_repel(data = labels, aes(label = label),
                           show.legend = FALSE, max.overlaps = Inf,
                           box.padding = 1, min.segment.length = 0) +
  scale_x_continuous(labels = scales::dollar) +
  scale_y_continuous(labels = scales::dollar) +
  scale_alpha_continuous(trans = "log10", range = c(.1, 1),
                         labels = scales::comma) +
  scale_colour_discrete_qualitative(palette = "Dark 3") +
  theme(legend.position = "top", legend.box = "vertical") +
  labs(y = "Mean APC", colour = NULL, x = "GDP per capita", 
       alpha = "Number of fractional publications")
p
```

```r
plotly::ggplotly(p)
```


## Papers per continent

```r
plot_data <- papers_per_country %>% 
  left_join(wdi, by = c("country_code" = "iso2c")) %>% 
  group_by(region) %>% 
  summarise(sum_fractional_works = sum(sum_fractional_works)) %>% 
  mutate(prop = sum_fractional_works / sum(sum_fractional_works),
         label = glue::glue(
           "{scales::comma(sum_fractional_works)} ({scales::percent(prop, accuracy = .1)})"
         )
  )

plot_data %>% 
  ggplot(aes(sum_fractional_works, fct_reorder(region, prop))) +
  geom_segment(aes(xend = 0, yend = region), colour = "grey70") +
  geom_point() +
  geom_text(aes(label = label), nudge_x = 10000, hjust = "left") +
  scale_x_continuous(expand = expansion(mult = c(0.05, .25)),
                     labels = scales::comma) +
  labs(x = "Number of fractional publications", y = NULL) +
  theme(panel.grid = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_text(margin = margin(r = -10)))
```

## Distribution across topics

How many papers do we have that also have a topic assigned?

```r
works %>% 
  distinct(id) %>% 
  sdf_nrow()
```

```
## [1] 1572417
```

This is our total sample size.

Which topics are represented in our sample?

```r
frac_concept_papers <- works %>% 
  distinct(id, field, concept_frac) %>% 
  group_by(field) %>% 
  summarise(frac_papers = sum(concept_frac)) %>% 
  arrange(desc(frac_papers)) %>% 
  collect()

plot_data <- frac_concept_papers %>% 
  drop_na() %>% 
  mutate(prop = frac_papers / sum(frac_papers),
         label = glue::glue(
           "{scales::comma(frac_papers)} ({scales::percent(prop, accuracy = .1)})"
         )
  )
```

```r
plot_data %>% 
  ggplot(aes(frac_papers, fct_reorder(field, prop))) +
  geom_segment(aes(xend = 0, yend = field), colour = "grey70") +
  geom_point() +
  geom_text(aes(label = label), nudge_x = 8000, hjust = "left") +
  scale_x_continuous(expand = expansion(mult = c(0.05, .25)),
                     labels = scales::comma) +
  labs(x = "Number of fractional publications", y = NULL) +
  theme(panel.grid = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_text(margin = margin(r = -10)))
```

What is the average APC across topics?

```r
apc_field <- works %>% 
  distinct(id, field, concept_frac, APC_in_dollar) %>% 
  group_by(field) %>%
  mutate(sum_frac = sum(concept_frac)) %>%
  group_by(field, sum_frac) %>%
  summarise(mean_apc = sum(concept_frac * APC_in_dollar) / sum_frac) %>% 
  collect()
```

```
## `summarise()` has grouped output by 'field'. You can override using the
## `.groups` argument.
```
```r
plot_data %>% 
  mutate(frac_papers = scales::comma(frac_papers),
         prop = scales::percent(prop, accuracy = .1)) %>% 
  knitr::kable()
```
| field                 | frac_papers | prop  | label           |
|:----------------------|------------:|------:|:----------------|
| Medicine              | 481,047 | 30.6% | 481,047 (30.6%) |
| Biology               | 290,499 | 18.5% | 290,499 (18.5%) |
| Chemistry             | 163,767 | 10.4% | 163,767 (10.4%) |
| Computer science      | 140,438 | 8.9%  | 140,438 (8.9%)  |
| Materials science     | 106,550 | 6.8%  | 106,550 (6.8%)  |
| Psychology            | 98,461  | 6.3%  | 98,461 (6.3%)   |
| Environmental science | 46,768  | 3.0%  | 46,768 (3.0%)   |
| Physics               | 44,382  | 2.8%  | 44,382 (2.8%)   |
| Political science     | 36,339  | 2.3%  | 36,339 (2.3%)   |
| Geography             | 31,787  | 2.0%  | 31,787 (2.0%)   |
| Sociology             | 29,340  | 1.9%  | 29,340 (1.9%)   |
| Mathematics           | 24,093  | 1.5%  | 24,093 (1.5%)   |
| Art                   | 21,573  | 1.4%  | 21,573 (1.4%)   |
| Business              | 20,992  | 1.3%  | 20,992 (1.3%)   |
| Geology               | 13,937  | 0.9%  | 13,937 (0.9%)   |
| Philosophy            | 10,596  | 0.7%  | 10,596 (0.7%)   |
| Economics             | 5,709   | 0.4%  | 5,709 (0.4%)    |
| History               | 3,379   | 0.2%  | 3,379 (0.2%)    |
| Engineering           | 2,760   | 0.2%  | 2,760 (0.2%)    |
```r
p_apc_field <- apc_field %>% 
  drop_na() %>% 
  ggplot(aes(mean_apc, fct_reorder(field, mean_apc))) +
  geom_segment(aes(xend = 0, yend = field), colour = "grey70") +
  geom_point() + 
  geom_text(aes(label = scales::comma(mean_apc, accuracy = 1)), nudge_x = 30, 
            hjust = "left") +
  labs(y = NULL, x = "Mean APC (in $)") +
  theme(panel.grid.major.y = element_blank(),
        panel.grid.minor.x = element_blank(),
        axis.ticks = element_blank(),
        panel.border = element_blank()) +
  scale_x_continuous(expand = expansion(mult = c(0, .1)),
                     labels = scales::comma)
p_apc_field
```

# Association between P_top10 and APC

```r
get_mean_apc_by_author_position <- function(df) {
  df %>%
    # first get rid of duplicates from concepts
    distinct(id, author_position, work_frac, APC_in_dollar, University, country,
             publication_year, P_top10) %>% 
    group_by(University, publication_year, country, P_top10) %>%
    # compute the average APC using fractional authorships as weights
    mutate(sum_frac = sum(work_frac)) %>%
    group_by(University, publication_year, country, P_top10, sum_frac,
             author_position) %>%
    summarise(mean_apc = sum(work_frac * APC_in_dollar) / sum_frac,
              fractional_works = sum(work_frac))
}
```

```r
mean_apcs <- works %>% 
  filter(publication_year == last_year_of_period) %>% 
  get_mean_apc_by_author_position()

mean_apcs_local <- mean_apcs %>%
  collect()
```

```
## `summarise()` has grouped output by 'University', 'publication_year', 'country',
## 'P_top10', 'sum_frac'. You can override using the `.groups` argument.
```
```r
mean_apc_16_19 <- works %>% 
  filter(first_year_of_period == 2016) %>% 
  get_mean_apc_by_author_position()

mean_apc_16_19_local <- mean_apc_16_19 %>% 
  collect()
```

```
## `summarise()` has grouped output by 'University', 'publication_year', 'country',
## 'P_top10', 'sum_frac'. You can override using the `.groups` argument.
```
```r
# plot for 2016-19

# taking out the correlations, because they would be misleading: the figure
# shows a non-linear relationship (x-axis logged), while the correlation is
# linear (and quite unsuitable for the skewed P_top10)
p1 <- mean_apc_16_19_local %>%
  mutate(author_position = recode(author_position, first = "First authors", 
                                  last = "Last authors")) %>% 
  ggplot(aes(P_top10, mean_apc, colour = fractional_works)) + 
  geom_point(alpha = .5) +
  scale_colour_continuous_sequential(palette = "Mako", trans = "log10",
                                     labels = comma) +
  geom_smooth(colour = "grey30") +
  facet_wrap(vars(author_position)) +
  scale_x_log10() +
  scale_y_continuous(labels = dollar) +
  labs(caption = "Time period: 2016-2019", y = "Mean APC",
       colour = "Number of papers per institution",
       x = expression(P["top 10%"])) +
  theme(legend.position = "top",
        legend.key.width = unit(1.5, 'cm'))
p1
```

```
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
```
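
If a single summary statistic were still wanted, a rank-based coefficient
would sidestep both objections, since Spearman's rho is invariant under the
log transform and robust to the skew of P_top10. A possible sketch, not part
of the original analysis:

```r
# Possible alternative to the dropped linear correlations: rank-based
# correlation between P_top10 and mean APC, per author position
mean_apc_16_19_local %>%
  group_by(author_position) %>%
  summarise(spearman_rho = cor(P_top10, mean_apc,
                               method = "spearman",
                               use = "complete.obs"))
```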

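The plot below bins institutions into P_top10 quartiles via `cut_quartiles()`,
a helper defined elsewhere in this repository; a minimal sketch of what it is
assumed to do:

```r
# Assumed behaviour of cut_quartiles() (the actual definition lives in a
# helper script of this repo): bin a numeric vector into labelled quartiles
cut_quartiles <- function(x) {
  cut(x,
      breaks = quantile(x, probs = seq(0, 1, .25), na.rm = TRUE),
      labels = paste0("Q", 1:4),
      include.lowest = TRUE)
}
```
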
```r
p2 <- mean_apcs_local %>%
  mutate(author_position = recode(author_position, first = "First authors", 
                                  last = "Last authors")) %>% 
  group_by(publication_year, author_position) %>%
  mutate(ptop10_quantiles = cut_quartiles(P_top10)) %>%
  group_by(ptop10_quantiles, publication_year, author_position) %>%
  summarise(mean_apc = weighted.mean(mean_apc, sum_frac, na.rm = TRUE), 
            .groups = "drop_last") %>%
  ggplot(aes(publication_year, mean_apc, colour = ptop10_quantiles,
             group = ptop10_quantiles)) +
  geom_line() +
  facet_wrap(vars(author_position)) +
  scale_x_continuous(breaks = seq(2010, 2018, by = 4)) +
  scale_y_continuous(labels = dollar) +
  scale_color_discrete_qualitative(palette = "Dark 3") +
  theme(legend.position = "top") +
  labs(y = "Mean APC",
       colour = expression(P["top 10%"]), x = NULL)
p2
```

```r
p1 / p2 +
  plot_layout(heights = c(4.5, 4)) +
  plot_annotation(tag_levels = "A")
```

```
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
```

```r
get_mean_apc_by_concept <- function(df) {
  df %>%
    distinct(id, University, publication_year, P_top10, field, work_frac, 
             APC_in_dollar, author_position) %>% 
    group_by(University, publication_year, P_top10, field) %>%
    # spark is unhappy for some reason, so coerce again to numeric
    mutate(work_frac = as.numeric(work_frac)) %>% 
    # compute the average APC using fractional authorships as weights
    mutate(sum_frac = sum(work_frac)) %>%
    group_by(University, publication_year, P_top10, sum_frac,
             author_position, field) %>%
    summarise(mean_apc = sum(work_frac * APC_in_dollar) / sum_frac)
}
```

```r
mean_apc_concept <- works %>% 
  filter(publication_year == last_year_of_period) %>% 
  get_mean_apc_by_concept()

mean_apc_concept_local <- mean_apc_concept %>%
  collect()
```

```
## `summarise()` has grouped output by 'University', 'publication_year', 'P_top10',
## 'sum_frac', 'author_position'. You can override using the `.groups` argument.
```
```r
mean_apc_concept_16_19 <- works %>% 
  filter(first_year_of_period == 2016) %>% 
  get_mean_apc_by_concept()

mean_apc_concept_16_19_local <- mean_apc_concept_16_19 %>% 
  collect()
```

```
## `summarise()` has grouped output by 'University', 'publication_year', 'P_top10',
## 'sum_frac', 'author_position'. You can override using the `.groups` argument.
```
```r
mean_apc_concept_16_19_local <- mean_apc_concept_16_19_local %>% 
  mutate(author_position = recode(author_position, first = "First authors", 
                                  last = "Last authors"))

# plot for 2016-2019
p <- mean_apc_concept_16_19_local %>%
  drop_na(field) %>% 
  ggplot(aes(P_top10, mean_apc, colour = field)) +
  geom_smooth(alpha = .15) +
  facet_wrap(vars(author_position), nrow = 1) +
  scale_x_log10() +
  scale_y_continuous(labels = dollar) + 
  labs(caption = "2016-2019", y = "Mean APC",
       x = expression(P["top 10%"]))
p
```

```
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
```

```r
plotly::ggplotly(p)
```

```
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
```


## Using ggrepel

```r
field_selection <- mean_apc_concept_16_19_local %>%
  drop_na(field) %>% 
  filter(field %in% c("Medicine", "Philosophy", "Sociology", "Biology",
                      "Materials science", "Physics"))

final_ptop_apc_field <- mean_apc_concept_16_19_local %>% 
  anti_join(field_selection) %>% 
  ggplot(aes(P_top10, mean_apc, group = field)) +
  geom_smooth(alpha = .3, colour = "grey80", fill = "grey90") +
  geom_smooth(aes(colour = field), data = field_selection) +
  facet_wrap(vars(author_position), nrow = 1) +
  scale_x_log10() +
  scale_y_continuous(labels = dollar) + 
  scale_color_discrete_qualitative(palette = "Dark 3") +
  labs(caption = "Time period: 2016-2019", y = "Mean APC", colour = NULL,
       x = expression(P["top 10%"])) +
  theme(legend.position = "top") +
  guides(colour = guide_legend(override.aes = list(alpha = 0),
                               nrow = 1))
```

```
## Joining, by = c("University", "publication_year", "P_top10", "sum_frac",
## "author_position", "field", "mean_apc")
```

```r
final_ptop_apc_field
```

```
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
```

```r
(final_ptop_apc_field + 
   # https://stackoverflow.com/a/65946462/3149349
   theme(axis.title.y = element_text(margin = margin(r = -120, unit = "pt")))
   ) / p_apc_field +
  plot_layout(heights = c(3.5, 5)) +
  plot_annotation(tag_levels = "A")
```

```
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
```

## Over time - first authors

```r
mean_apc_concept_local %>%
  filter(!is.na(field), author_position == "first") %>% 
  group_by(publication_year, field) %>%
  mutate(ptop10_quantiles = cut_quartiles(P_top10)) %>%
  group_by(ptop10_quantiles, publication_year, field) %>%
  summarise(mean_apc = weighted.mean(mean_apc, sum_frac, na.rm = TRUE), 
            .groups = "drop_last") %>%
  ggplot(aes(publication_year, mean_apc, colour = ptop10_quantiles,
             group = ptop10_quantiles)) +
  geom_line() +
  facet_wrap(vars(field)) +
  scale_x_continuous(breaks = seq(2010, 2018, by = 4)) +
  scale_color_discrete_qualitative(palette = "Dark 3") +
  theme(legend.position = "top") +
  labs(y = "Mean APC",
       colour = expression(P["top 10%"]), x = NULL)
```

## Over time - last authors

```r
mean_apc_concept_local %>%
  filter(!is.na(field), author_position == "last") %>% 
  group_by(publication_year, field) %>%
  mutate(ptop10_quantiles = cut_quartiles(P_top10)) %>%
  group_by(ptop10_quantiles, publication_year, field) %>%
  summarise(mean_apc = weighted.mean(mean_apc, sum_frac, na.rm = TRUE), 
            .groups = "drop_last") %>%
  ggplot(aes(publication_year, mean_apc, colour = ptop10_quantiles,
             group = ptop10_quantiles)) +
  geom_line() +
  facet_wrap(vars(field)) +
  scale_x_continuous(breaks = seq(2010, 2018, by = 4)) +
  scale_color_discrete_qualitative(palette = "Dark 3") +
  theme(legend.position = "top") +
  labs(y = "Mean APC",
       colour = expression(P["top 10%"]), x = NULL)
```

## Country comparison

```r
mean_apc_country_16_19 <- works %>%
  filter(first_year_of_period == 2016) %>% 
  # first get rid of duplicates from concepts
  distinct(id, work_frac, APC_in_dollar, University, country, P_top10, country_code) %>% 
  # spark is unhappy for some reason, so coerce again to numeric
  mutate(work_frac = as.numeric(work_frac)) %>% 
  group_by(University, country, P_top10) %>%
  # compute the average APC using fractional authorships as weights
  mutate(sum_frac = sum(work_frac)) %>%
  group_by(University, country, P_top10, sum_frac, country_code) %>%
  summarise(mean_apc = sum(work_frac * APC_in_dollar) / sum_frac)

mean_apc_country_16_19_local <- mean_apc_country_16_19 %>%
  collect()
```

```
## `summarise()` has grouped output by 'University', 'country', 'P_top10',
## 'sum_frac'. You can override using the `.groups` argument.
```
```r
mean_apc_country_16_19_local <- mean_apc_country_16_19_local %>% 
  left_join(wdi, by = c("country_code" = "iso2c"))

mean_apc_country_16_19_local %>% 
  ggplot(aes(P_top10, mean_apc, colour = region)) +
  geom_point(alpha = .3, size = 1.2) +
  geom_smooth(alpha = .3) +
  scale_x_log10() +
  scale_y_continuous(labels = dollar) +
  scale_color_discrete_qualitative(palette = "Dark 3") +
  labs(y = "Mean APC", x = expression(P["top 10%"]), colour = NULL) +
  theme(legend.position = "top") +
  guides(colour = guide_legend(override.aes = list(alpha = 0)))
```

```
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
```

```r
spark_disconnect(sc)
```