22-additional-mini-analyses.Rmd

---
title: "Additional mini analyses"
author: "Thomas Klebel"
date: "`r format(Sys.time(), '%d %B, %Y')`"
output: 
  html_document:
    keep_md: true
---

```{r setup, include=FALSE, message=FALSE, warning=FALSE}
# Defaults for all following chunks: show the code, hide warnings, and render
# figures at 300 dpi.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, dpi = 300)

# setup -----
# Location of the Spark installation; set before the Spark connection is
# opened further down (presumably picked up by sparklyr -- verify).
Sys.setenv(SPARK_HOME = "/usr/hdp/current/spark2-client")
# NOTE: the attach order below decides which package wins when two packages
# export the same name, so keep the order unless masking has been checked.
library(sparklyr)
library(tidyverse)
library(scales)
# NOTE(review): no arrow or dbplot function is called directly in the chunks
# of this document -- possibly needed by R/helpers.R; verify before removing.
library(arrow)
library(dbplot)
library(ggridges)
library(colorspace)
# Project helpers, resolved relative to the project root via here::here().
source(here::here("R/helpers.R"))

message("Connecting to spark...")

# Executor layout requested from the cluster for this session.
config <- spark_config()
config[["spark.executor.cores"]] <- 5 # this should always stay the same
config[["spark.executor.instances"]] <- 20 # can go up to 27, depending on RAM
config[["spark.executor.memory"]] <- "10G"

sc <- spark_connect(
  master = "yarn-client",
  app_name = "mini-analyses",
  config = config
)
message("Connection to Spark successful!")

# Register the paper-level parquet data as the Spark table "works" and keep
# it in memory.
spark_read_parquet(
  sc,
  name = "works",
  path = "/user/tklebel/apc_paper/papers_with_concepts.parquet",
  memory = TRUE
)

# Rerunning the data pipeline introduced a hiccup regarding decimal precision
# of `work_frac`. This is unlikely to be of substantive concern: all values
# are present, and the cast below does not introduce missing values.
works <- tbl(sc, "works") %>%
  mutate(work_frac = as.numeric(work_frac))


# Local datasets. The paths are resolved against the project root with
# here::here(), the same way R/helpers.R is located, so that knitting does not
# depend on the current working directory.
doaj <- read_csv(here::here("data/processed/doaj_cleaned.csv"))

multilevel_sample <- read_csv(
  here::here("data/processed/multilevel_sample.csv")
)

message("Successfully read all datasets!")


# Plot theme used throughout: theme_bw() without panel border and axis ticks,
# with axis text at the base font size.
theme_clean <- theme_bw() +
  theme(
    axis.text = element_text(size = rel(1)),
    axis.ticks = element_blank(),
    panel.border = element_blank()
  )

# Make it the default for every plot created below.
theme_set(theme_clean)
```


# How many of the journals in DOAJ charge APCs?
```{r}
# Number and share of DOAJ journals for each value of the APC column
doaj %>%
  group_by(APC) %>%
  summarise(n = n()) %>%
  mutate(prop = n / sum(n))
```

# Multimodal distribution of APC across fields
```{r}
# One row per paper and APC value. `works` is de-duplicated elsewhere in this
# document because papers appear on several rows, so the same is done here to
# keep the join below from multiplying rows.
apcs <- works %>%
  distinct(id, APC_in_dollar)

# Every paper-field combination with the paper's APC attached. The join key
# is stated explicitly instead of relying on the columns the two tables
# happen to share.
field_apcs <- works %>%
  distinct(id, field) %>%
  left_join(apcs, by = "id")
```


```{r}
# Plot data: one row per paper and field. Papers are repeated in the sample
# when several institutions are involved, hence the de-duplication.
pdata <- multilevel_sample %>%
  select(id, field, APC_in_dollar) %>%
  distinct() %>%
  # keep complete rows only; this removes papers with a null APC
  drop_na(id, field, APC_in_dollar)
```

```{r apc-by-field, fig.height=8, fig.width=8}
# Ridgeline densities of APC per field, with fields ordered by fct_reorder()
# on APC_in_dollar. The fill follows the x value computed by the density
# stat; after_stat() replaces the deprecated stat() helper.
pdata %>% 
  ggplot(aes(APC_in_dollar, fct_reorder(field, APC_in_dollar),
             fill = after_stat(x))) +
  # NOTE(review): ggridges documents that gradient fills do not support
  # transparency, so `alpha` probably has no visible effect here -- confirm.
  geom_density_ridges_gradient(rel_min_height = .01, scale = 1.3, alpha = .7,
                               show.legend = FALSE) + 
  scale_fill_continuous_sequential(palette = "Mako") +
  scale_x_continuous(labels = scales::comma) +
  theme_clean +
  # do not clip the ridges at the panel edge
  coord_cartesian(clip = "off") +
  labs(y = NULL, x = "APC in dollar") 
```

# What are the levels of P top 10% in India?
```{r}
# Institution-level P_top10 values for 2019, pulled into local memory
institutions <- works %>%
  filter(publication_year == 2019) %>%
  select(country, institution_id, P_top10) %>%
  distinct() %>%
  collect()

# 20th and 80th percentile of P_top10 across all institutions
institutions %>%
  summarise(
    q20 = quantile(P_top10, probs = 0.2),
    q80 = quantile(P_top10, probs = 0.8)
  )
```


```{r}
# Lowest and highest P_top10 among institutions located in India
summarise(
  filter(institutions, country == "India"),
  min = min(P_top10),
  max = max(P_top10)
)
```


# Compare the effect of P top 10% on APC between India and China
```{r}
# Fractionally weighted APC per institution, year and author position.
#
# df: table (Spark or local) with the columns id, author_position, work_frac,
#   APC_in_dollar, University, country, publication_year and P_top10.
#
# Returns an ungrouped table with one row per University, publication_year,
# country, P_top10 and author_position, holding `sum_frac`, `mean_apc` and
# `fractional_works`.
#
# Missing values are skipped in all sums (`na.rm = TRUE`). SQL aggregates on
# Spark already behave this way, so local data frames now give the same
# result.
get_mean_apc_by_author_position <- function(df) {
  df %>%
    # first get rid of duplicates from concepts
    distinct(id, author_position, work_frac, APC_in_dollar, University, country,
             publication_year, P_top10) %>% 
    group_by(University, publication_year, country, P_top10) %>%
    # total fractional authorships per institution and year, used as the
    # denominator of the weighted average below
    # NOTE(review): this total spans all author positions, so `mean_apc` is
    # each position's share of the institution-level weighted mean rather
    # than a mean within the position -- confirm this is intended.
    mutate(sum_frac = sum(work_frac, na.rm = TRUE)) %>%
    group_by(University, publication_year, country, P_top10, sum_frac,
             author_position) %>%
    # compute the average APC using fractional authorships as weights
    summarise(
      mean_apc = sum(work_frac * APC_in_dollar, na.rm = TRUE) / sum_frac,
      fractional_works = sum(work_frac, na.rm = TRUE)
    ) %>%
    # drop the grouping that summarise() leaves in place, so that later
    # verbs do not silently operate per group
    ungroup()
}

# Mean APC for Indian and Chinese institutions in the period starting 2016
# (still a lazy Spark query at this point)
mean_apc_16_19 <- get_mean_apc_by_author_position(
  filter(
    works,
    first_year_of_period == 2016,
    country %in% c("India", "China")
  )
)

# Run the query and bring the result into local memory for plotting
mean_apc_16_19_local <- collect(mean_apc_16_19)
```

```{r effect-india-china, fig.width=7, fig.height=4}
# Mean APC against P top 10%, one panel per country. Points are coloured by
# the fractional number of papers; a smoother is drawn on top of them.
p1 <- mean_apc_16_19_local %>%
  mutate(
    author_position = recode(
      author_position,
      first = "First authors",
      last = "Last authors"
    )
  ) %>%
  ggplot(aes(x = P_top10, y = mean_apc, colour = fractional_works)) +
  geom_point(alpha = .5) +
  geom_smooth(colour = "grey30") +
  facet_wrap(vars(country)) +
  scale_x_log10() +
  scale_y_continuous(labels = dollar) +
  scale_colour_continuous_sequential(
    palette = "Mako",
    trans = "log10",
    labels = comma
  ) +
  labs(
    x = expression(P["top 10%"]),
    y = "Mean APC",
    colour = "Number of papers per institution",
    caption = "Fractional counting; 2016-2019"
  ) +
  theme(
    legend.position = "top",
    legend.key.width = unit(1.5, "cm")
  )

p1
```

This comparison demonstrates that there is only a very weak effect in China,
and a much stronger effect in India. However, the pattern in India is also more
complex, with a slight structural break at a $P_{top\ 10\%}$ of a little over
100. Additionally, it should be noted that the above effects are potentially
confounded by field effects.

```{r}
# All analyses are done: close the Spark connection opened in the setup chunk
spark_disconnect(sc = sc)
```