-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils.R
904 lines (832 loc) · 46.2 KB
/
utils.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
#' Run RCTD deconvolution by first ensuring that single-cell RNA-seq reference only includes genes in ST data.
#'
#' @param st.obj A Seurat object for the ST data
#' @param sc.counts The reference scRNA-seq count matrix (with genes as rows and cells as columns).
#' Genes should be in the same namespace (e.g., symbols or Ensembl ids) as the ST count data in st.obj.
#' @param sc.cell.types The cell type assignments of each cell in sc.counts stored in a vector, named according to the columns/cells of sc.counts
#' @param rds.output.file If non-null, an output file name to store the RCTD results. Default: NULL.
#' If the file exists, the output will be read and returned by this function.
#' @param run.rctd Boolean indicating whether to actually deconvolve data (run.RCTD) or simply to create.RCTD.
#' @param intersect.sc.and.st.genes Boolean indicating whether to subset scRNA-seq genes to those within
#' ST data.
#' @return An RCTD object
#'
#' Assuming st.obj is the filtered feature obj, the associated count matrix should _already_ be filtered to include only
#' targeted genes, as described here:
#' https://support.10xgenomics.com/spatial-gene-expression/software/pipelines/latest/output/matrices
rctd.wrapper <- function(st.obj, sc.counts, sc.cell.types, rds.output.file = NULL, run.rctd = TRUE, intersect.sc.and.st.genes = FALSE) {
if(!is.null(rds.output.file) && file.exists(rds.output.file)) {
myRCTD <- readRDS(rds.output.file)
gc()
return(myRCTD)
}
# Get raw ST coiunts
st.counts <- GetAssayData(st.obj, assay="Spatial", slot="counts")
# Get the spot coordinates
st.coords <- st.obj[[]][, c("col", "row")]
colnames(st.coords) <- c("x","y")
# Get the # of UMI per spot
nUMI <- colSums(st.counts) # In this case, total counts per pixel is nUMI
# Create the RCTD 'puck', representing the ST data
puck <- SpatialRNA(st.coords, st.counts, nUMI)
# Create an RCTD scRNA-seq reference.
# NB: we do this for each of potentially multiple ST samples because some of those
# samples may be from FFPE assays and others from fresh frozen. These will have
# different gene sets that should be reflected (i.e., filtered) in the scRNA-seq reference (sc.counts).
if(intersect.sc.and.st.genes) {
stop("Would not recommend restricting scRNA-seq genes to those in ST!")
# spacexr/RCTD:get_de_genes has an expression threshold (expr_threshold)
# that any marker gene must exceed. Restricting the set of genes (to those in ST),
# impacts their normalization (by total UMI) in RCTD. I have observed that this
# can lead to significant differences in the number of genes passing the
# expression threshold (e.g., twice as many following subsetting the genes).
# Note that get_de_genes will only consider as markers those genes in common
# anyways between scRNA-seq and ST. As such, it intuitively makes sense
# to consistently define the normalized expression values of the scRNA-seq data
# irrespective of the ST data -- i.e., without subsetting.
sc.counts <- sc.counts[rownames(sc.counts) %in% rownames(st.counts),]
}
sc.cell.types <- sc.cell.types[colnames(sc.counts)]
nUMI <- colSums(sc.counts)
reference <- Reference(sc.counts, sc.cell.types, nUMI, n_max_cells = ncol(sc.counts) + 1)
max_cores <- min(5, detectCores() - 1)
# Note that we have keep_reference = TRUE here. Without it, the default is
# keep_reference = FALSE, which would ignore the Reference we have
# intentionally created above.
myRCTD <- create.RCTD(puck, reference, max_cores = max_cores, keep_reference = TRUE)
# NB: running in full mode!
if(run.rctd) {
myRCTD <- suppressPackageStartupMessages(run.RCTD(myRCTD, doublet_mode = 'full'))
}
if(!is.null(rds.output.file)) {
saveRDS(myRCTD, rds.output.file)
}
gc()
myRCTD
}
#' Create a .csv file for the automatic analysis of ST data, as it is described
#' here: https://github.com/sdomanskyi/spatialtranscriptomics
#'
#' @param base_dir A base directory where the .csv file will be saved
#' @param species_sample Vector describing the species of each sample
#'
#'
#' # Write csv file for automatic ST pipeline
write.automated.csv.file <- function(base_dir, dataset_names, species_sample) {
temp_var<- paste0(base_dir,dataset_names,'/spaceranger/')
temp_var2<- replicate(length(dataset_names), "")
ST_automated<- data.frame (sample_id= names(dataset_names), species= species_sample, st_data_dir=temp_var, sc_data_dir= temp_var2 )
dir.create(file.path(base_dir, '/automatic_analysis/'), recursive=TRUE)
temp <- temp <- strsplit(base_dir,"/")
curr_datatypes <- tail(temp[[1]], n=1)
write.csv(ST_automated,paste0(base_dir,'/automatic_analysis/',curr_datatypes,'.csv'), row.names = FALSE ,quote=FALSE)
}
#' Get the position information for each spot in a tissue
#'
#' A function that extracts spot position information from the spaceranger output corresponding to
#' a single tissue. See https://support.10xgenomics.com/spatial-gene-expression/software/pipelines/latest/output/images
#' for a full description.
#'
#' @param spaceranger_dir The path to the spaceranger output.
#' @return A data.frame with columns:
#' barcode: The sequence of the barcode associated to the spot.
#' in_tissue: Binary, indicating if the spot falls inside (1) or outside (0) of tissue.
#' array_row: The row coordinate of the spot in the array from 0 to 77. The array has 78 rows.
#' array_col: The column coordinate of the spot in the array. In order to express the orange crate arrangement of the spots, this column index uses even numbers from 0 to 126 for even rows, and odd numbers from 1 to 127 for odd rows. Notice then that each row (even or odd) has 64 spots.
#' pxl_row_in_fullres: The row pixel coordinate of the center of the spot in the full resolution image.
#' pxl_col_in_fullres: The column pixel coordinate of the center of the spot in the full resolution image.
get.tissue.position.metadata <- function(spaceranger_dir) {
image.dir <- paste0(spaceranger_dir, "/", "spatial/")
file <- file.path(image.dir, "tissue_positions_list.csv")
col.names <- c("barcodes", "tissue", "row", "col", "imagerow", "imagecol")
if(file.exists(file)) {
tissue.positions <-
read.csv(file = file, col.names = col.names, header = FALSE, as.is = TRUE, row.names = 1)
return(tissue.positions)
}
file <- file.path(image.dir, "tissue_positions.csv")
if(!file.exists(file)) {
stop("Found neither tissue_positions_list.csv nor tissue_positions.csv")
}
tissue.positions <-
read.csv(file = file, header = TRUE, as.is = TRUE)
colnames(tissue.positions) <- col.names
rownames(tissue.positions) <- tissue.positions$barcodes
tissue.positions <- tissue.positions[, !(colnames(tissue.positions) %in% c("barcodes"))]
return(tissue.positions)
}
#' Create a Seurat object based on the spaceranger baseline file
#'
#' @param spaceranger_dirs The directory where the folders for each dataset exist
#' @param dataset The name of queried dataset
#' @param spaceranger_bsl_filename The spaceranger file where the details for each Seurat data odject are extracted
#' @param filter_param Binary variable for filtering tissue positions
#' @return A Suerat object
create.Seurat.object <- function(spaceranger_dirs, dataset, spaceranger_bsl_filename, filter_param){
obj <- Seurat::Load10X_Spatial(spaceranger_dirs[[dataset]], filename = spaceranger_bsl_filename, filter.matrix = filter_param, slice = dataset)
obj$orig.ident <- dataset
tissue.positions <- get.tissue.position.metadata(spaceranger_dirs[[dataset]])
tissue.positions$spot_type <- "background"
tissue.positions[tissue.positions$tissue == 1, "spot_type"] <- "tissue"
obj <- AddMetaData(obj, tissue.positions)
obj
}
#' Merge all Seurat objects in one object using harmony (https://portals.broadinstitute.org/harmony/articles/quickstart.html)
#' and cluster the data based on gene expression data and RUN UMAP
#'
#' @param objs A merged Seurat object including all Seurat objects we want to merge
#' @param norm_param Choose which normalization assay will be used
#' @return merged_obj One Seurat object containing all unique Seurat objs
merge.Seurat.objs <- function(objs, norm_param){
merged_obj <- ScaleData(merged_obj, assay=norm_param)
VariableFeatures(merged_obj, assay=norm_param) <- unique(unname(unlist(lapply(filtered.objs, function(x) VariableFeatures(x, assay=norm_param)))))
merged_obj <- RunPCA(merged_obj, verbose = FALSE)
ElbowPlot(merged_obj, ndims = 40)
mat <- Seurat::GetAssayData(merged_obj, assay = norm_param)
pca <- merged_obj[["pca"]]
# Get the total variance:
total_variance <- sum(matrixStats::rowVars(as.matrix(mat)))
eigValues = (pca@stdev)^2 ## EigenValues
varExplained = eigValues / total_variance
cumVarExplained <- cumsum(varExplained)
merged_obj <- RunHarmony(merged_obj, group.by.vars = make.names("orig ident"))
# Pick the number of dimensions based on a flattening of the elbow plot
merged_obj <- FindNeighbors(merged_obj,reduction = "harmony", dims = 1:20)
merged_obj <- FindClusters(merged_obj, reduction = "harmony",verbose = FALSE)
merged_obj <- RunUMAP(merged_obj, reduction = "harmony",dims = 1:20)
merged_obj
}
#' Return count matrix from a Seurat object
#'
#' @param obj A Seurat object.
#' @return A (sparse) matrix holding the spot counts (i.e., from the "counts" slot)
get.count.matrix <- function(obj) {
as.matrix(x = GetAssayData(object = obj, assay = "Spatial", slot = "counts"))
}
#' Downsample each column of a matrix to a desired total count.
#'
#' A function that, independently for each column/cell, uses a multinomial distribution, parameterized
#' by the frequency of each gene within that column/cell and the desired total count, to sample a vector of gene counts.
#' This is in contrast to related functions such as SampleUMI (in Seurat), which applies a binomial to each gene (!?)
#' and thus only approximates the desired total count, and downsampleMatrix (in DropletUtils), which downsamples to
#' a fraction of the original number of reads.
#'
#' @param expr_mat A (possibly sparse) matrix of counts.
#' @param tot_cnt The desired number of counts _for each column_
#' @return A downsample matrix
downsample.matrix <- function (expr_mat, tot_cnt)
{
down_sample <- function(x) {
prob <- x/sum(x)
return(rmultinom(1, tot_cnt, prob))
}
down_sampled_mat <- apply(expr_mat, 2, down_sample)
return(down_sampled_mat)
}
#' Compute the number of expressed genes/features in a downsampled matrix.
#'
#' A function to downsample the input matrix such that each cell/column has tot_cnt reads and
#' that reports the _median_ (across cells/columns) number of genes/features with non-zero counts.
#'
#' @param expr_mat A (possibly sparse) matrix of counts.
#' @param tot_cnt The desired number of counts _for each column_
#' @return The median (across cells/columns) number of genes/features with non-zero counts in the downsampled matrix.
compute.num.features.at.total.reads <- function(expr_mat, tot_cnt) {
ds <- downsample.matrix(expr_mat, tot_cnt)
return(median(colSums(ds > 0)))
}
#' Return most highly-expressed genes
#'
#' A function to return the most highly-expressed genes (i.e., those with highest value) from a matrix.
#'
#' @param mat A (sparse) matrix.
#' @param n.top The number of genes to return.
#' @param assay The assay from which to access the gene exprssion/values in the Seurat object.
#' @param slot The slot in the assay form which to access the gene expression/values.
#' @return A vector of the highest-expressed genes.
get.top.genes.matrix <- function(mat, n.top = 20, assay = "Spatial", slot = "data") {
rs <- rowSums(mat)
rs <- rs[order(rs, decreasing=TRUE)]
top.genes <- names(rs)[1:n.top]
top.genes
}
#' Return most highly-expressed genes
#'
#' A function to return the most highly-expressed genes (i.e., those with highest value) from a Seurat assay.
#'
#' @param obj A Seurat object.
#' @param n.top The number of genes to return.
#' @param assay The assay from which to access the gene exprssion/values in the Seurat object.
#' @param slot The slot in the assay form which to access the gene expression/values.
#' @return A vector of the highest-expressed genes.
get.top.genes <- function(obj, n.top = 20, assay = "Spatial", slot = "data") {
mat <- GetAssayData(obj, assay = assay, slot = slot)
get.top.genes.matrix(mat, n.top = n.top, assay = assay, slot = slot)
}
#' Return data subsetted by spot tissue status
#'
#' A function to return the subset of the data corresponding to spots that are (or are not) in the tissue
#'
#' @param obj A Seurat object.
#' @param tissue.val Spots with tissue.val for the tissue.col metadata field will be retained.
#' @param tissue.col The metadata column of the Seurat object holding the tissue status
#' @param assay The assay from which to access the gene exprssion/values in the Seurat object.
#' @param slot The slot in the assay form which to access the gene expression/values.
#' @return A (sparse) matrix of spots that are (or are not) in tissue
subset.matrix.based.on.tissue.status <- function(obj, tissue.val, tissue.col = "tissue", assay = "Spatial", slot = "data") {
cell.subset <- rownames(([email protected])[([email protected])[,tissue.col] == tissue.val,])
mat <- GetAssayData(obj, assay = assay, slot = slot)
mat[,cell.subset]
}
#' Identify biotype of each gene in a a geneset
#'
#' @param gene_set A character array carrying the querried gene symbols names
#' @param gene_db A Mart object database
#' @return A data.frame giving the biotype of each gene in the gene_set
get.biotypes_ <- function(gene_set,gene_db){
gb <- getBM(attributes=c("external_gene_name", "gene_biotype", "chromosome_name"),filters = c("external_gene_name"), values=gene_set, mart=gene_db)
# Label mitochondrial genes as any encoded on the "MT" chromosome
flag <- grepl(gb$chromosome_name, pattern="MT", ignore.case = TRUE)
gb[flag, "gene_biotype"] <- "MT"
# Label ribosomal proteins (RP) as any with gene name Rps or Rpl.
# Note that these are not the same as ribosomal (t)RNAs, which should be listed as rRNA by biomart
flag <- grepl(gb$external_gene_name, pattern="^Rp[sl]", ignore.case = TRUE) # for mouse
flag <- flag | grepl(gb$external_gene_name, pattern="^RP[SL]", ignore.case = TRUE) # for human
gb[flag, "gene_biotype"] <- "RP"
colnames(gb) <- c("gene", "gene_biotype", "chromosome_name")
gb <- gb[, c("gene", "gene_biotype")]
df <- data.frame(gene = gene_set)
df <- merge(df, gb, all.x = TRUE)
df
}
#' Identify frequency of each biotype within a geneset
#'
#' @param gene_set A character array carrying the querried gene symbols names
#' @param gene_db A Mart object database
#' @return A data.frame giving the count (Freq) of each biotype in gene_set
get.biotypes <- function(gene_set,gene_db){
gb <- get.biotypes_(gene_set, gene_db)
querried.biotypes <- as.data.frame(table(gb$gene_biotype))
colnames(querried.biotypes) <- c("biotype", "Freq")
o <- order(querried.biotypes$Freq, decreasing=TRUE)
querried.biotypes <- querried.biotypes[o,]
}
#' Simplify biotypes by condensing several into large categories
#'
#' @param df A data.frame with a biotype column (e.g., with each row corresponding to a gene)
#' @param biotype.col The name of the biotype column in df
#' @return df modified so that immunoglobulin-related biotypes (IG_) are condensed into an IG biotype,
#' T-cell receptor-related biotypes (TR_) are condensed into a TR biotype,
#' and NA biotypes are converted to unknown.
condense.biotypes <- function(df, biotype.col = "gene_biotype") {
flag <- is.na(df[, biotype.col])
df[flag, biotype.col] <- "unknown"
flag <- grepl(df[, biotype.col], pattern="IG_")
df[flag, biotype.col] <- "IG"
flag <- grepl(df[, biotype.col], pattern="TR_")
df[flag, biotype.col] <- "TR"
flag <- grepl(df[, biotype.col], pattern="pseudogene")
df[flag, biotype.col] <- "pseudogene"
df
}
#' Try to load package, installing first (via pacman) if necessary
#'
#' @param package A string representing the package to load
#' @return Nothing
p_load_ <- function(package) {
if(!require(pacman)) {
install.packages(pacman)
}
suppressPackageStartupMessages(p_load(package))
}
#' Create a panel of plots, each showing deconvolved fractions of a particular population.
#'
#' @param rctd An RCTD object, from the spacexr package.
#' @param normalize Boolean indicating whether weights should be normalized to sum to 1.
#' @return A data frame whose rows are spots, whose columns are deconvolved populations, and whose
#' entries are the (predicted) fraction of a population in a given spot.
#' Also adds columns x and y, holding spatial coordinates of spots.
format.rctd.output_ <- function(rctd, normalize = TRUE) {
barcodes <- colnames(rctd@spatialRNA@counts)
weights <- rctd@results$weights
if(normalize) {
weights <- normalize_weights(weights)
}
df <- as.data.frame(weights)
df$x <- rctd@spatialRNA@coords$x
df$y <- rctd@spatialRNA@coords$y
df
}
#' Parse ligand-receptor interactions from cellphonedb.
#'
#' @return A list with the following entries:
#' interaction.tbl: a data.frame where rows are ligand-receptor interactions and with the following columns:
#' id_cp_interaction: cellphonedb name for ligand-receptor interaction pair
#' partner_a: cellphonedb name for partner a in the interaction (i.e., ligand)
#' partner_b: cellphonedb name for partner b in the interaction (i.e., receptor)
#' ligand_hgnc_symbols: comma-separated list of ligand hgnc symbols
#' ligand_ensembl_ids: comma-separated list of ligand ensembl ids
#' receptor_hgnc_symbols: comma-separated list of ligand hgnc symbols
#' receptor_ensembl_ids: comma-separated list of ligand ensembl ids
#' all.ligand.hgnc.symbols: a vector of all ligand hgnc symbols (i.e., concatenation of ligand_hgnc_symbols from interaction.tbl)
#' all.ligand.ensembl.ids: a vector of all ligand ensembl ids (i.e., concatenation of ligand_ensembl_ids from interaction.tbl)
#' all.receptor.hgnc.symbols: a vector of all receptor hgnc symbols (i.e., concatenation of receptor_hgnc_symbols from interaction.tbl)
#' all.receptor.ensembl.ids: a vector of all receptor ensembl ids (i.e., concatenation of receptor_ensembl_ids from interaction.tbl)
read.cellphonedb.ligand.receptors <- function() {
# Read the cellphonedb tables
interaction.tbl <- read.table("https://raw.githubusercontent.com/ventolab/CellphoneDB/master/cellphonedb/src/core/data/interaction_input.csv", sep=",", header=TRUE)
complex.tbl <- read.table("https://raw.githubusercontent.com/ventolab/CellphoneDB/master/cellphonedb/src/core/data/complex_input.csv", sep=",", header=TRUE)
gene.tbl <- read.table("https://raw.githubusercontent.com/ventolab/CellphoneDB/master/cellphonedb/src/core/data/gene_input.csv", sep=",", header=TRUE)
protein.tbl <- read.table("https://raw.githubusercontent.com/ventolab/CellphoneDB/master/cellphonedb/src/core/data/protein_input.csv", sep=",", header=TRUE)
# The complexes table has a complex_name (e.g., ACVL1_BMPR2) and up to four associated proteins
# (identified by uniprot ids uniprot_{1,4}).
# To translate these proteins to genes, merge with the genes table, which has columns
# gene_name, uniport, hgnc_symbol, and ensembl
uniprot.cols <- grep(x = colnames(complex.tbl), pattern="uniprot", value = TRUE)
suffixes <- gsub(uniprot.cols, pattern = "uniprot", replacement="")
for(i in 1:length(uniprot.cols)) {
gene.i.tbl <- gene.tbl
colnames(gene.i.tbl) <- paste0(colnames(gene.i.tbl), suffixes[i])
complex.tbl <- merge(complex.tbl, gene.i.tbl, all.x = TRUE)
}
# Concatenate all of the hgnc symbol and ensembl columns
hgnc.symbol.cols <- grep(x = colnames(complex.tbl), pattern="hgnc_symbol", value = TRUE)
complex.tbl[, "hgnc_symbols"] <- unlist(apply(complex.tbl[, hgnc.symbol.cols], 1, function(row) paste0(na.omit(row), collapse=",")))
ensembl.cols <- grep(x = colnames(complex.tbl), pattern="ensembl", value = TRUE)
complex.tbl[, "ensembl_ids"] <- unlist(apply(complex.tbl[, ensembl.cols], 1, function(row) paste0(na.omit(row), collapse=",")))
# Now merge the interaction and complex tables -- where partner_{a,b} in the interaction table is the {ligand,receptor} complex (or uniprot protein id)
suffixes <- c("_a", "_b")
for(i in 1:length(suffixes)) {
complex.i.tbl <- complex.tbl[, c("complex_name", "hgnc_symbols", "ensembl_ids")]
colnames(complex.i.tbl) <- paste0(colnames(complex.i.tbl), "_complex", suffixes[i])
interaction.tbl <- merge(interaction.tbl, complex.i.tbl, all.x = TRUE, by.x = paste0("partner", suffixes[i]), by.y = paste0("complex_name_complex", suffixes[i]))
}
# Now merge the interaction and protein tables -- to capture cases in which partner_{a,b} is a uniprot id
# But, first merge the protein and gene tables
protein.tbl <- merge(protein.tbl, gene.tbl, all.x = TRUE, by = "uniprot")
for(i in 1:length(suffixes)) {
protein.i.tbl <- protein.tbl[, c("uniprot", "hgnc_symbol", "ensembl")]
colnames(protein.i.tbl) <- paste0(colnames(protein.i.tbl), "_protein", suffixes[i])
interaction.tbl <- merge(interaction.tbl, protein.i.tbl, all.x = TRUE, by.x = paste0("partner", suffixes[i]), by.y = paste0("uniprot_protein", suffixes[i]))
}
# At this point, we have potentially multiple rows per interaction and multiple columns with both hgnc symbols and ensembl ids.
# Let's concatentate all of these
concat.vals.across.rows.and.cols <- function(df, pattern) {
cols <- grep(colnames(df), pattern=pattern, value=TRUE)
vals <- unlist(apply(df[, cols, drop=FALSE], 1, function(row) paste0(na.omit(row), collapse=",")))
vals <- paste0(vals, collapse=",")
vals <- sort(unique(unlist(strsplit(vals, split=",")[[1]])))
vals <- paste0(vals, collapse=",")
vals
}
interaction.tbl <-
ddply(interaction.tbl, .variables = c("id_cp_interaction", "partner_a", "partner_b"),
.fun = function(df) {
lig.syms <- concat.vals.across.rows.and.cols(df, pattern = ".*symbol.*_a")
rec.syms <- concat.vals.across.rows.and.cols(df, pattern = ".*symbol.*_b")
lig.ids <- concat.vals.across.rows.and.cols(df, pattern = ".*ensembl.*_a")
rec.ids <- concat.vals.across.rows.and.cols(df, pattern = ".*ensembl.*_b")
data.frame("ligand_hgnc_symbols" = lig.syms, "ligand_ensembl_ids" = lig.ids,
"receptor_hgnc_symbols" = rec.syms, "receptor_ensembl_ids" = rec.ids)
})
all.ligand.symbols <- strsplit(concat.vals.across.rows.and.cols(interaction.tbl, pattern="ligand_hgnc_symbols"), split=",")[[1]]
all.ligand.ids <- strsplit(concat.vals.across.rows.and.cols(interaction.tbl, pattern="ligand_ensembl_ids"), split=",")[[1]]
all.receptor.symbols <- strsplit(concat.vals.across.rows.and.cols(interaction.tbl, pattern="receptor_hgnc_symbols"), split=",")[[1]]
all.receptor.ids <- strsplit(concat.vals.across.rows.and.cols(interaction.tbl, pattern="receptor_ensembl_ids"), split=",")[[1]]
lst <- list("interaction.tbl" = interaction.tbl,
"all.ligand.hgnc.symbols" = all.ligand.symbols,
"all.ligand.ensembl.ids" = all.ligand.ids,
"all.receptor.hgnc.symbols" = all.receptor.symbols,
"all.receptor.ensembl.ids" = all.receptor.ids
)
return(lst)
}
#' Calculate an "expression" for a ligand-receptor pair.
#'
#' Represents the LR pair expression as the expression of a gene whose mean expression (across samples)
#' is minimal. This seems similar to the approach taken by cellphonedb, wherein the gene with minimal
#' expression within a multi-subunit heteromeric complex is used as the expression of that (ligand or receptor)
#' complex. Here, we are effectively taking the minimum of the ligand and receptor complexes for use as the
#' LR pair expression.
#'
#' @param expr.mat An expression matrix (with no assumed units), whose rows are genes (in no particular namespace) and whose columns are samples/cells/spots.
#' @param ligand.genes A vector of ligand genes (in the same namespace as the rows of expr.mat)
#' @param receptor.genes A vector of receptor genes (in the same namespace as the rows of expr.mat)
#' @return A vector of expression for the ligand-receptor pair, named with the columns / samples of the expression matrix.
calculate.ligand.receptor.pair.expression <- function(expr.mat, ligand.genes, receptor.genes) {
all.lr.genes <- unique(c(ligand.genes, receptor.genes))
all.lr.genes <- all.lr.genes[all.lr.genes %in% rownames(expr.mat)]
lr.means <- rowMeans(expr.mat[all.lr.genes,])
# Find the gene with the minimum (average) expression. We will use this to represent the
# expression of the lr pair
gene.rep <- names(which.min(lr.means))[1]
expr.mat[gene.rep,]
}
#' Calculate the quantiles of the mean expression (over samples/cell/spots/columns) of an expression matrix.
#'
#' @param expr.mat An expression matrix (with no assumed units), whose rows are genes (in no particular namespace) and whose columns are samples/cells/spots.
#' @param quantiles A vector of probability values
#' @summary.func Function to apply to rows of expression matrix to summarize each gene's expression
#' @return A data.frame containing the estimated quantiles for each probability value in quantiles (one row, and as many columns as values in quantiles)
calculate.expression.quantiles <- function(expr.mat, quantiles = seq(0, 1, by=0.1), summary.func = mean) {
# gene.summaries <- rowMeans(expr.mat)
gene.summaries <- apply(expr.mat, 1, summary.func)
quantile(gene.summaries, probs=quantiles)
}
#' Add metadata to Seurat object.
#'
#' This function is a simple wrapper around AddMetaData
#'
#' @param obj A Seurat object
#' @param metadata.df A data.frame expected to have all of the rownames (corresponding to cells/spots) in existing metadata for obj (obj[[]])
#' @return obj modified to hold the metadata in metadata.df
add.metadata.to.seurat.obj <- function(obj, metadata.df) {
# rows <- rownames(obj[[]])
# print(head(rows))
# print(table(rows %in% rownames(metadata.df)))
# stopifnot(rows %in% rownames(metadata.df))
# metadata.df <- metadata.df[rows,]
colnames(metadata.df) <- make.names(colnames(metadata.df))
metadata.df <- metadata.df[, !(colnames(metadata.df) %in% colnames(obj[[]])), drop=FALSE]
cols.to.add <- colnames(metadata.df)
all.meta <- merge(obj[[]], metadata.df, by = "row.names", all.x = TRUE)
rownames(all.meta) <- all.meta$Row.names
ids <- Cells(obj)
all.meta <- all.meta[ids,]
obj <- AddMetaData(obj, all.meta[, cols.to.add, drop=FALSE])
obj
}
#' Enable parallel execution.
#'
#' #' @param obj A Seurat object
#' @return The number of cores available for parallel execution.
setup.parallel.environment <- function(max.cores = NULL) {
num.cores <- detectCores()
if(!is.na(num.cores) && (num.cores > 1)) {
if(!is.null(max.cores)) { num.cores <- min(num.cores, max.cores) }
suppressPackageStartupMessages(p_load("doMC"))
cat(paste("Registering ", num.cores-1, " cores.\n", sep=""))
registerDoMC(cores=(num.cores-1))
}
num.cores
}
#' Create a Seurat object for a Visium 10X dataset.
#'
#' @param sample.name A string name for the sample / dataset.
#' @param spaceranger.dir A string directory holding the files filtered_feature_bc_matrix.h5 and
#' raw_feature_bc_matrix.h5 and the subdirectory spatial. This is likely
#' the outs directory created by spaceranger count or the equivalent.
#' @param filter.spots A boolean indicating whether to include all spots (filter.spots = FALSE) or only
#' those spots overlapping the tissue (filter.spots = TRUE)
#' @return A Seurat object, annotated with the position and status (spot_type = "tissue" or "background") for each spot.
create.visium.seurat.object <- function(sample.name, spaceranger.dir, filter.spots = TRUE) {
# load filtered_feature_bc_matrix.h5 with filter.matrix = TRUE or
# raw_feature_bc_matrix.h5 with filter.matrix = FALSE
# filtered_feature_bc_matrix.h5 should only contain data from spots overlaying tissue,
# though I can't find this documented conclusively.
# filter.matrix = TRUE definitely filters coordinates in the Seurat object
# according to those that overlap the tissue (i.e., have tissue == 1 in the
# tissue_positions_list.csv file -- see code for Read10X_Image, which is called
# by Load10X_Spatial)
# See this issue for how to set up these individual objects such that we can merge them:
# https://github.com/satijalab/seurat/issues/3732
# Namely, set the slice and then the orig.ident
filename <- 'filtered_feature_bc_matrix.h5'
if(!filter.spots) {
filename <- 'raw_feature_bc_matrix.h5'
}
obj <- Seurat::Load10X_Spatial(spaceranger.dir, filename = filename, filter.matrix = filter.spots, slice = sample.name)
obj$orig.ident <- sample.name
tissue.positions <- get.tissue.position.metadata(spaceranger.dir)
tissue.positions$spot_type <- "background"
tissue.positions[tissue.positions$tissue == 1, "spot_type"] <- "tissue"
obj <- AddMetaData(obj, tissue.positions)
obj
}
#' Create Seurat objects, one for eachVisium 10X sample / dataset.
#'
#' @param spaceranger.dirs A named list, where each entry corresponds to a sample and provides
#' a string directory holding the files filtered_feature_bc_matrix.h5 and
#' raw_feature_bc_matrix.h5 and the subdirectory spatial. This is likely
#' the outs directory created by spaceranger count or the equivalent.
#' @param filter.spots A boolean indicating whether to include all spots (filter.spots = FALSE) or only
#' those spots overlapping the tissue (filter.spots = TRUE)
#' @return A named list of Seurat objects, each annotated with the position and status
#' (spot_type = "tissue" or "background") for each spot. The names are those of spaceranger.dirs.
create.visium.seurat.objects <- function(spaceranger.dirs, filter.spots = TRUE) {
samples <- names(spaceranger.dirs)
names(samples) <- samples
objs <-
llply(samples, .fun = function(sample.name) {
create.visium.seurat.object(sample.name, spaceranger.dirs[[sample.name]], filter.spots = filter.spots)
})
objs
}
#' Concatenate Seurat objects into one object
#'
#' @param objs A list of Seurat objects
#' @return A single Seurat object that is the concatenation of the input objects
concatenate.seurat.objects <- function(objs) {
all.objs <- Reduce(merge, objs)
Idents(all.objs) <- all.objs$orig.ident
all.objs
}
#' Identify biotypes from a geneset given the orgnaism
#'
#' @param gene_set A character array carrying the querried gene symbols names
#' @param gene_db A Mart object database
#' @return querried.biotypes
get.annotationhub.biotypes <- function(gene_set){
stop("Not implemented")
ah <- AnnotationHub()
human_ens <- query(ah, c("Homo sapiens", "EnsDb"))
foo <- human_ens[["AH75011"]]
df <- genes(foo, return.type = "data.frame")
gb <- getBM(attributes=c("external_gene_name", "gene_biotype"),filters = c("external_gene_name"), values=gene_set, mart=gene_db)
querried.biotypes <- as.data.frame(table(gb$gene_biotype))
colnames(querried.biotypes) <- c("biotype", "Freq")
o <- order(querried.biotypes$Freq, decreasing=TRUE)
querried.biotypes <- querried.biotypes[o,]
}
#' Extract per-spot number/fraction of intronic, exonic, and intergenic (mapped) reads and number unmapped reads.
#'
#' @param bam.file A string providing the path to an (indexed) bam file
#' @return A data.frame in which each row corresponds to a spot.
#' It has rownames corresponding to the barcode of the spot and columns:
#' CThe sequence of the barcode associated to the spot.
#' E: The number of exonic reads in the spot
#' I: The number of intronic reads in the spot
#' N: The number of intergenic reads in the spot
#' tot.mapped.reads: The number of mapped reads in the spot
#' tot.unmapped.reads: The number of unmapped reads in the spot
#' tot.reads: tot.mapped.reads + tot.unmapped.reas
#' E.frac.mapped: E / tot.mapped.reads
#' I.frac.mapped: I / tot.mapped.reads
#' N.frac.mapped: N / tot.mapped.reads
get.per.spot.alignment.metrics <- function(bam.file) {
# The following tries to efficiently parse the bam by chromosome,
# but this doesn't appear to work for FFPE/targeted sequencing.
# For some reason, the rname/chromosome is NA for the vast majority (~98%)
# of reads. Probably they are mapped to probe contigs/"chromosomes", which
# aren't reflected in the rname.
if(FALSE) {
# Get the chromosome names
ret <- scanBamHeader(c(bam.file))
# The two elements returned by scanBamHeader are targets and text.
# According to the Rsamtools docs:
# The targets element contains target (reference) sequence lengths. The text element is
# itself a list with each element a list corresponding to tags (e.g., ‘@SQ’) found in the header, and the
# associated tag values.
#sq.indices <- names(ret[[1]]$text) == "@SQ"
#seqnames <- as.vector(unlist(lapply(ret[[1]]$text[sq.indices], function(x) gsub(x[1], pattern="SN:", replacement=""))))
seq.lengths <- as.vector(ret[[1]]$targets)
names(seq.lengths) <- names(ret[[1]]$targets)
seq.names <- names(seq.lengths)
names(seq.names) <- seq.names
# For computational/memory efficiency, iterate over the chromosomes as opposed
# to processing the entire file
res <- ldply(seq.names, .parallel = FALSE,
.fun = function(seqname) {
print(seqname)
# CB: error-corrected spot barcode
# UB: error-corrected molecular barcode
# RE: E = exonic, N = intronic, I = intergenic
param <- ScanBamParam(tag=c('CB','RE'), which=GRanges(seqname, IRanges(1, seq.lengths[[seqname]])), flag=scanBamFlag(isUnmappedQuery=FALSE))
x <- scanBam(bam.file, param=param)[[1]]
if(is.null(x$tag$CB)) { return(NULL) }
if(is.null(x$tag$RE)) { return(NULL) }
df <- as.data.frame(x$tag)
# Combine counts within spot (for this chromosome)
ddply(df, .variables = c("CB"),
.fun = function(df.cb) {
if(nrow(df.cb) == 0) { return(NULL) }
data.frame(E = length(which(!is.na(df.cb$RE) & (df.cb$RE == "E"))),
I = length(which(!is.na(df.cb$RE) & (df.cb$RE == "I"))),
N = length(which(!is.na(df.cb$RE) & (df.cb$RE == "N"))),
tot.mapped.reads = nrow(df.cb))
})
})
} # end if(FALSE)
param <- ScanBamParam(tag=c('CB','RE'), flag=scanBamFlag(isUnmappedQuery=FALSE))
x <- scanBam(bam.file, param=param)[[1]]
df <- as.data.frame(x$tag)
# Combine counts within spot (for this chromosome)
res <- ddply(df, .variables = c("CB"),
.fun = function(df.cb) {
if(nrow(df.cb) == 0) { return(NULL) }
data.frame(E = length(which(!is.na(df.cb$RE) & (df.cb$RE == "E"))),
I = length(which(!is.na(df.cb$RE) & (df.cb$RE == "I"))),
N = length(which(!is.na(df.cb$RE) & (df.cb$RE == "N"))),
tot.mapped.reads = nrow(df.cb))
})
# Combine results for a spot (CB) across chromosomes
spot.res <- ddply(res, .variables=c("CB"), .fun = function(df) data.frame(E = sum(df$E), I = sum(df$I), N = sum(df$N), tot.mapped.reads = sum(df$tot.mapped.reads)))
# See description of _cell_ranger filtering here:
# https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/algorithms/overview
# I believe the bam we are reading already has duplicate UMIs filtered
param <- ScanBamParam(tag=c('CB'),flag=scanBamFlag(isUnmappedQuery=TRUE))
unmr <- scanBam(bam.file, param=param)[[1]]
unmr.tbl <- as.data.frame(table(na.omit(unmr$tag$CB)))
colnames(unmr.tbl) <- c("CB", "tot.unmapped.reads")
spot.res <- merge(spot.res, unmr.tbl, all.x=TRUE)
#spot.res <- spot.res[!is.na(spot.res$CB),]
spot.res$tot.reads <- spot.res$tot.mapped.reads + spot.res$tot.unmapped.reads
spot.res$E.frac.mapped <- spot.res$E / spot.res$tot.mapped.reads
spot.res$I.frac.mapped <- spot.res$I / spot.res$tot.mapped.reads
spot.res$N.frac.mapped <- spot.res$N / spot.res$tot.mapped.reads
# tmp <- merge(unfiltered.objs[[3]][[]], spot.res, by.x = c("row.names"), by.y = c("CB"))
# spot.res[!is.na(spot.res$CB),"CB"] <- "ambiguous.spot"
# rownames(spot.res) <- spot.res$CB
# spot.res[, !(colnames(spot.res) == "CB")]
spot.res
}
#' Extract per-spot number/fraction of intronic, exonic, and intergenic (mapped) reads and number unmapped reads over Seurat objects.
#'
#' Wrapper to call get.per.spot.alignment.metrics across Seurat objects.
#'
#' @param spaceranger_dirs _Named_ list of spacernanger output directories, one per dataset.
#' @return A list of tables of alignment metrics, each output by get.per.spot.alignment.metrics
get.all.per.spot.alignment.metrics <- function(spaceranger_dirs, prefix = NULL) {
nms <- names(spaceranger_dirs)
names(nms) <- nms
align.metrics <-
llply(nms, .parallel = FALSE,
.fun = function(nm) {
print(nm)
alignment.metric.file <- NULL
if(!is.null(prefix)) {
alignment.metric.file <- paste0(prefix, "/", nm, "-alignment-metrics.csv")
}
bam.file <- paste0(spaceranger_dirs[[nm]], "/possorted_genome_bam.bam")
df <- NULL
if(is.null(alignment.metric.file) || !file.exists(alignment.metric.file)) {
df <- get.per.spot.alignment.metrics(bam.file)
write.table(file=alignment.metric.file, df, row.names=FALSE, col.names=TRUE, sep=",", quote=FALSE)
} else {
df <- as.data.frame(fread(alignment.metric.file))
}
df
})
align.metrics
}
#' Extract per-spot number of UMIs, total number of reads, and saturation.
#'
#' Number of UMIs should match nCount_Spatial in metadata.
#'
#' Saturation = PCR duplication = # non-unique reads / tot # reads
#'
#' @param mol.info.file A string providing the path to "molecule_info.h5" file
#' @return A data.frame in which each row corresponds to a spot.
#' It has rownames corresponding to the barcode of the spot and columns:
#' num.reads: total number of reads to barcode
#' num.umis: number of unique reads (UMIs) to barcode
#' num.nonzero.genes: number of genes to barcode with one or more reads
#' saturation: (num.reads - num.unis) / num.reads
get.per.spot.saturation.stats <- function(mol.info.file, filter = TRUE) {
# Get the barcode and count of each UMI (ignoring the feature/gene)
df <- data.frame(barcode = h5read(mol.info.file, "/barcodes")[h5read(mol.info.file, "/barcode_idx")+1],
count = h5read(mol.info.file, "/count"),
feature.idx = h5read(mol.info.file, "/feature_idx"))
# This is how you would extract the associated gene
# gene <- h5read(mol.info.file, "/features/name")[h5read(mol.info.file, "/feature_idx")+1]
# This should tell us whether the read mapped to an intron or exon, but I only see exonic reads
# umi.type <- h5read(mol.info.file, "/umi_type")
ls.info <- h5ls(mol.info.file)
probe.set.key.flag <- grepl(ls.info$name, pattern="Probe Set")
probe.set <- NULL
if(length(which(probe.set.key.flag)) == 1) {
probe.set <- h5read(mol.info.file, paste0(ls.info[probe.set.key.flag,"group"], "/", ls.info[probe.set.key.flag,"name"]))
}
res <- ddply(df,
.variables = c("barcode"),
.fun = function(tbl) {
# As stated here:
# https://support.10xgenomics.com/spatial-gene-expression/software/pipelines/latest/output/matrices
# non-targeted genes are removed from the 'filtered' (but not the 'unfiltered') matrix
if(filter && !is.null(probe.set)) {
tbl <- subset(tbl, feature.idx %in% probe.set)
}
num.reads <- sum(tbl$count)
num.umis <- nrow(tbl)
num.nonzero.genes <- length(unique(tbl$feature.idx[tbl$count > 0]))
saturation <- (num.reads - num.umis) / num.reads
data.frame(num.reads = num.reads, num.umis = num.umis, num.nonzero.genes = num.nonzero.genes, saturation = saturation)
})
rownames(res) <- res$barcode
res <- res[, !(colnames(res) == "barcode")]
res
}
#' Find the empirical distance between spots in the x and y directions.
#'
#' It makes most sense for the input Seurat object to be unfiltered (by background vs tissue/foreground)
#' so that there aren't gaps between rows / columns. Nevertheless, the code attempts to
#' overcome this possibility by taking the most common distance between neighboring spots.
#'
#' @param obj A Seurat object.
#' @return A 2-tuple with elements:
#' dx: the separation between neighboring spots in the x dimension
#' dy: the separation between neighboring spots in the y dimension
get.visium.spot.distance.separation <- function(obj) {
# Within each row, calculate the distance between spots ordered by column.
tmp <- ddply(obj[[]], .variables=c("row"),
.fun = function(df) {
o <- order(df$col, decreasing=FALSE)
df <- df[o,]
diffs <- unlist(lapply(1:(nrow(df)-1), function(i) df[i+1,"imagecol"] - df[i,"imagecol"]))
unit.diffs <- unlist(lapply(1:(nrow(df)-1), function(i) df[i+1,"col"] - df[i,"col"]))
na.omit(data.frame(diff=diffs, unit.diff=unit.diffs))
})
# Take the most common such distance between columns of neighboring spots as the
# separation in the x direction -- dx
tbl <- table(tmp$diff)
dx <- as.numeric(names(sort(tbl, decreasing=TRUE))[1])
tbl <- table(tmp$unit.diff)
unit.dx <- as.numeric(names(sort(tbl, decreasing=TRUE))[1])
# Within each col, calculate the distance between spots ordered by row.
tmp <- ddply(obj[[]], .variables=c("col"),
.fun = function(df) {
o <- order(df$row, decreasing=FALSE)
df <- df[o,]
diffs <- unlist(lapply(1:(nrow(df)-1), function(i) df[i+1,"imagerow"] - df[i,"imagerow"]))
unit.diffs <- unlist(lapply(1:(nrow(df)-1), function(i) df[i+1,"row"] - df[i,"row"]))
na.omit(data.frame(diff=diffs, unit.diff=unit.diffs))
})
# Take the most common such distance between rows of neighboring spots as the
# separation in the y direction -- dy
tbl <- table(tmp$diff)
dy <- as.numeric(names(sort(tbl, decreasing=TRUE))[1])
tbl <- table(tmp$unit.diff)
unit.dy <- as.numeric(names(sort(tbl, decreasing=TRUE))[1])
return(c(dx, dy, unit.dx, unit.dy))
}
#' Return the genes targeted by sequencing.
#'
#' For fresh frozen, this will be all genes (in the Seurat object's expression matrix).
#' For FFPE, this will be those in the probe set.
#'
#' @param mol.info.file A string providing the path to "molecule_info.h5" file
#' @return A list with named entries:
#' gene.targets: A vector of gene names
#' gene.off.targets: A vector of gene names
#' gene.id.targets: A vector of gene ids
#' gene.id.off.targets: A vector of gene ids
get.targeted.genes <- function(mol.info.file) {
gene.ensg.ids <- h5read(mol.info.file, "/features/id")
gene.names <- h5read(mol.info.file, '/features/name')
ls.info <- h5ls(mol.info.file)
probe.set.key.flag <- grepl(ls.info$name, pattern="Probe Set")
if(length(which(probe.set.key.flag)) == 0) {
# This is not a targeted assay
return(list("gene.targets" = gene.names,
"gene.off.targets" = NULL,
"gene.id.targets" = gene.ensg.ids,
"gene.id.off.targets" = NULL))
}
# Empirically, it appears that these indices are zero-based. I can't find that directly stated online, however:
# 1. Other indices are zero-based, e.g., barcode_idx and feature_idx here:
# https://support.10xgenomics.com/spatial-gene-expression/software/pipelines/latest/output/molecule_info
# 2. As stated here
# https://support.10xgenomics.com/spatial-gene-expression/software/pipelines/latest/output/matrices
# the filtered feature for FFPE/targeted assays should only include the targeted genes. When I assume zero-based indices (as below),
# I get a near perfect correspondence between probe.gene.names and the rownames of the (filtered) count matrix.
# When I assume a one-based index, as I had previously, the correspondence is poor.
probe.gene.indices <- h5read(mol.info.file, paste0(ls.info[probe.set.key.flag,"group"], "/", ls.info[probe.set.key.flag,"name"])) + 1
gene.indices <- 1:length(gene.ensg.ids)
nonprobe.gene.indices <- gene.indices[!(gene.indices %in% probe.gene.indices)]
# probe.gene.indices <- gene.indices[(gene.indices %in% probe.gene.ids)]
nonprobe.gene.names <- sort(gene.names[nonprobe.gene.indices])
probe.gene.names <- gene.names[probe.gene.indices]
nonprobe.gene.ids <- gene.ensg.ids[nonprobe.gene.indices]
probe.gene.ids <- gene.ensg.ids[probe.gene.indices]
list("gene.targets" = probe.gene.names,
"gene.off.targets" = nonprobe.gene.names,
"gene.id.targets" = probe.gene.ids,
"gene.id.off.targets" = nonprobe.gene.ids)
}
#' Wrapper to apply sctransform to a seurat object.
#'
#' @param obj A seurat object
#' @param use.v2 Boolean indicating whether to apply the "v2" flavor of sctransform
#' @param assay The assay to apply sctransform to
#' @return obj modified to hold sctransform'ed results in the "SCT" assay
apply.sctransform_ <- function(obj, use.v2 = TRUE, assay = "Spatial") {
# obj <- PercentageFeatureSet(obj, pattern = "^MT-", col.name = "percent.mt")
if(use.v2) {
obj <- SCTransform(obj, assay = assay, vars.to.regress = c(paste0("nCount_", assay)), verbose = FALSE, vst.flavor = "v2", method = "glmGamPoi")
} else {
obj <- SCTransform(obj, assay = assay, vars.to.regress = c(paste0("nCount_", assay)), verbose = FALSE)
}
return(obj)
}
#' Wrapper to apply sctransform to a list of seurat objects.
#'
#' @param objs A list of seurat objects
#' @return the list of objs, with each entry modified to hold sctransform'ed results in the "SCT" assay
apply.sctransform <- function(objs, use.v2 = TRUE) {
objs <-
llply(objs,
.fun = function(obj) {
apply.sctransform_(obj, use.v2 = use.v2)
})
objs
}