-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconversionProcess.RMD
118 lines (107 loc) · 4.36 KB
/
conversionProcess.RMD
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
---
title: "conversionProcess"
author: "ab"
date: '2022-09-08'
output: pdf_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = F)
```
## Process a SLAMSEQ experiment
### Load data in
```{r}
run_name <- "PCN.ThreeWay_Slamseq"
## Path to conversions
p.in <- file.path("/local/artem/Projects/Stability/Results/Slamseq_runs", run_name, "conversions/")
v.bases <- c("A", "C", "T", "G")
p.in_N <- purrr::map(v.bases,
~ list.files(file.path(p.in, "genomeN", .x), pattern = "*perGene*", full.names = T)) %>%
purrr::reduce(., c)
p.in_NN <- purrr::map(v.bases,
~ list.files(file.path(p.in, "genomeNN", .x), pattern = "*perGene*", full.names = T)) %>%
purrr::reduce(., c)
p.in_SNP <- list.files(file.path(p.in, "SNPs"), pattern = "*perGene*", full.names = T)
## assemble path table to ease the future sample selections
list(
purrr::map_dfr(p.in_N,
~ tibble(stem = str_split_fixed(basename(.x), "perGene.", 2)[, 2] %>% str_remove(., ".txt"),
base = str_split_fixed(basename(.x), ".perGene", 2)[, 1] %>% str_remove(., "counts"),
path_base = .x)),
purrr::map_dfr(p.in_NN,
~ tibble(stem = str_split_fixed(basename(.x), "perGene.", 2)[, 2] %>% str_remove(., ".txt"),
conv = str_split_fixed(basename(.x), ".perGene", 2)[, 1] %>% str_remove(., "genome|counts"),
path_conv = .x,
base = str_sub(conv, 1, 1))),
purrr::map_dfr(p.in_SNP,
~ tibble(stem = str_split_fixed(basename(.x), "perGene.", 2)[, 2] %>% str_remove(., ".txt"),
snp = str_split_fixed(basename(.x), ".perGene", 2)[, 1] %>% str_remove(., "genome|counts"),
path_snp = .x,
base = str_sub(snp, 1, 1)))
) %>%
purrr::reduce(., left_join, by = c("stem", "base")) %>%
filter(is.na(snp) | (snp == conv)) -> t.p_comb
## clean sample names
t.p_comb <- t.p_comb %>% mutate(stem = str_split_fixed(stem, "_S", 2)[, 1])
```
## work only with protein coding genes; exclue miRs, exclue lncRNA
```{r}
mm10_gnames <- read_tsv("/local/shared_scripts/database/mm10/slam_annot/Mus_musculus.GRCm38.96_geneNames.tsv")
mm10_lncRNA <- read_json("/local/artem/Data/Annotations/essential lncRNA.json", simplifyVector = T) %>% filter(Organism == "Mouse")
```
## Check conversion rates at T0
```{r, fig.height=6, warning=FALSE}
t.cr_T0 <- comp_T0NN(tab_comb = t.p_comb)
## keep only protein coding genes
t.cr_T0 <- t.cr_T0 %>%
filter(conv_count > snp_count) %>%
mutate(gene_name = str_split_fixed(gene, "_", 2)[, 2]) %>%
filter(gene_name %in% mm10_gnames[mm10_gnames$gene_biotype == "protein_coding", ]$gene_name,
str_detect(gene_name, "^Mir", negate = T),
!gene_name %in% mm10_lncRNA$Name) %>%
dplyr::select(-gene_name)
##
plot_TONN.qc(t.cr_T0, percent_limit = 7)
```
## Check T -> C conversions
```{r}
t.cr_TC <- comp_TC(t.p_comb)
## keep only protein coding genes
t.cr_TC <- t.cr_TC %>%
filter(conv_count > snp_count) %>%
mutate(gene_name = str_split_fixed(gene, "_", 2)[, 2]) %>%
filter(gene_name %in% mm10_gnames[mm10_gnames$gene_biotype == "protein_coding", ]$gene_name,
str_detect(gene_name, "^Mir", negate = T),
!gene_name %in% mm10_lncRNA$Name) %>%
dplyr::select(-gene_name)
```
```{r, warning=FALSE}
plot_TC.qc_boxes(t.cr_TC,
y_lims = c(0, 8))
```
```{r, fig.width=8, warning=FALSE}
plot_TC.qc_ks(t.cr_TC)
```
## Filter
```{r}
# dnCaf1: T0 br3, T16 br 2
filt_out <- filt_TC(t.cr_TC,
min_T0 = 2,
min_T4 = 2,
min_T8 = 2,
min_T16 = 1,
out_drop = c("Cyto_T0_Rep3", "Cyto_T0_Rep4", "Nuc_T0_Rep4", "Nuc_T8_Rep4", "Pro_T4_Rep3"))
t.cr_TC.filt <- filt_out$tab_cr_TC.filt
```
## Check the number of genes left after filtering
```{r}
purrr::map(split(filt_out$filt_stats$obs_T, filt_out$filt_stats$obs_T$group),
~ sum(!.x$dropped))
```
## Normalize conversions & estimate half-lives
```{r}
# After all, it is better to normalize each individual replciate to an average of T0
t.cr_TC.filt.norm <- norm_TC(t.cr_TC.filt)
t.hl_tw <- estimate_HL(t.cr_TC.filt.norm,
n_cores = 12,
out_path = "/local/artem/Projects/Stability/Results/Slamseq_runs/PCN.ThreeWay_Slamseq/")
```