Skip to content

Commit 568310d

Browse files
committed
adding R package files
1 parent e8f1301 commit 568310d

13 files changed

+1259
-0
lines changed

.Rbuildignore

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
^renv$
2+
^renv\.lock$
3+
^snpeffr\.Rproj$
4+
^\.Rproj\.user$
5+
^ext$
6+
^README\.Rmd$

.Rprofile

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
source("renv/activate.R")

DESCRIPTION

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
Package: snpeffr
2+
Title: What the Package Does (One Line, Title Case)
3+
Version: 0.0.0.9000
4+
Authors@R:
5+
person("First", "Last", , "[email protected]", role = c("aut", "cre"),
6+
comment = c(ORCID = "YOUR-ORCID-ID"))
7+
Description: What the package does (one paragraph).
8+
License: `use_mit_license()`, `use_gpl3_license()` or friends to pick a
9+
license
10+
Encoding: UTF-8
11+
Roxygen: list(markdown = TRUE)
12+
RoxygenNote: 7.2.1
13+
Imports:
14+
data.table,
15+
R.utils

NAMESPACE

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Generated by roxygen2: do not edit by hand
2+
3+
export(snpeffr)
4+
import(data.table)
5+
importFrom(data.table,":=")
6+
importFrom(data.table,.BY)
7+
importFrom(data.table,.EACHI)
8+
importFrom(data.table,.GRP)
9+
importFrom(data.table,.I)
10+
importFrom(data.table,.N)
11+
importFrom(data.table,.NGRP)
12+
importFrom(data.table,.SD)
13+
importFrom(data.table,data.table)

R/snpeff_fun.R

+130
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
#' Pull in, filter, and parse vcf data and annotations from snpeff
2+
#'
3+
#' This function parses a vcf file generated by snpeff and filters to specific
4+
#' genes, positions, and effects of interest. See the [snpeff](https://pcingola.github.io/SnpEff)
5+
#' documentation for more information on
6+
#' how to interpret these inputs and outputs.
7+
#'
8+
#' @param vcf_path the path to the vcf file output from snpeff (read in using data.table::fread and R.utils if zipped)
9+
#' @param positions a named list of positions to filter to, each element can either
10+
#' be a sequence, i.e. 1000:2000 or a vector i.e. c(1000:2000, 300).
11+
#' @param genes the name of the genes of interest to filter to
12+
#' @param exclude_effects a string formatted as a regular expression,
13+
#' effects to ignore (as categorized by snpeff, see [here](https://pcingola.github.io/SnpEff/se_inputoutput/)),
14+
#'
15+
#' @return a data.frame with n rows and 7 columns:
16+
#' \describe{
17+
#' \item{\code{sample_id}}{ID of the sample, the names of the annotation columns from snpeff}
18+
#' \item{\code{snpeff_gene_name}}{name of the gene with the mutation}
19+
#' \item{\code{region}}{name of the region of interest, i.e. from the named list of positions,
20+
#' defaults to fks1_hs1 and fks1_hs2 (fks1, hotspots 1 & 2)}
21+
#' \item{\code{position}}{the nucleotide position of the mutation}
22+
#' \item{\code{mutation}}{the protein level mutation, annotated using (HGVS notation)(http://varnomen.hgvs.org/bg-material/simple/)}
23+
#' \item{\code{ref_sequence}}{the nucleotide sequence for the reference}
24+
#' \item{\code{sample_sequence}}{the nucleotide sequence for the sample}
25+
#' }
26+
#'
27+
#'
28+
#' @export
29+
#' @import data.table
30+
#'
31+
snpeffr <- function(vcf_path,
32+
positions = list(fks1_hs1 = 221638:221665, fks1_hs2 = 223782:223805),
33+
genes = c("CAB11_002014"),
34+
exclude_effects = "synonymous_variant") {
35+
36+
vcf <- data.table::fread(file = vcf_path)
37+
38+
posits <- unlist(positions, use.names = FALSE)
39+
names(posits) <- rep(names(positions), unlist(lapply(positions, length)))
40+
41+
# filter by position
42+
vcf <- vcf[POS %in% posits]
43+
44+
if(nrow(vcf) == 0) {
45+
# Create an empty data.table
46+
return(
47+
data.table("sample_id" = factor(),
48+
"snpeff_gene_name" = character(),
49+
"region" = character(),
50+
"position" = character(),
51+
"mutation" = character(),
52+
"ref_sequence" = character(),
53+
"sample_sequence" = character())
54+
)
55+
} else {
56+
# parse annotations
57+
# first filter to any with a functional annotation
58+
vcf <- vcf[grepl("ANN=", vcf$INFO)]
59+
vcf[, rowid := 1:nrow(vcf)]
60+
61+
annots <- vcf[, tstrsplit(gsub(".*ANN=", "", INFO), split = ",", fixed=TRUE, fill="<NA>")]
62+
colnames(annots) <- ann_cols <- paste0("parseANN", 1:ncol(annots))
63+
64+
prse_annots <- cbind(vcf[, c("rowid", "#CHROM", "POS", "REF", "ALT"), with = FALSE],
65+
annots)
66+
ids <- colnames(prse_annots)[!grepl("parseANN", colnames(prse_annots))]
67+
prse_annots <- melt(prse_annots, id.vars = ids)
68+
prse_annots <- prse_annots[value != "<NA>"]
69+
70+
# annotation order and fields from snpeff
71+
# http://pcingola.github.io/SnpEff/se_inputoutput/
72+
ann_fields <- c("allele", "effect", "putative_impact", "gene_name", "gene_id", "feature_type",
73+
"feature_id", "transcript_biotype", "rank_total", "HGVS.c", "HGVS.p",
74+
"cDNA_pos_len", "CDS_pos_len", "protein_pos_len", "distance")
75+
76+
# parse by each annotation! (otherwise will create too large a file!)
77+
prsed <- prse_annots[, data.table::tstrsplit(value, split = "|", fill = "<NA>", fixed = TRUE)]
78+
setnames(prsed, 1:15, ann_fields)
79+
80+
if(ncol(prsed) > 15) {
81+
extra_cols <- c("err_warn_info", paste0("V", (length(ann_fields) + 1):ncol(prsed)))
82+
prsed[, err_warn_info := Reduce(function(...) paste(..., sep = "|"), .SD), .SDcols = extra_cols]
83+
prsed[, err_warn_info := gsub("|<NA>", "", err_warn_info, fixed = TRUE)]
84+
} else {
85+
prsed[, err_warn_info := NA]
86+
}
87+
88+
89+
prse_annots <- cbind(prse_annots[, -c("variable", "value")], prsed[, ann_fields, with = FALSE])
90+
91+
# filter by effects and genes
92+
93+
# join up with reference and allele info
94+
# then parse the alternatives & bind them to the reference
95+
nts <- prse_annots[, tstrsplit(ALT, ",", fixed = TRUE, fill = "<NA>")]
96+
nts <- as.matrix(cbind(rep("", nrow(nts)), prse_annots$REF, nts)) # first row is for the dots
97+
98+
# re-merging with sample info ----
99+
samps <- colnames(vcf)[!colnames(vcf) %in% c("#CHROM", "POS" , "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", "rowid")]
100+
samp_info <- vcf[, c("rowid", samps), with = FALSE]
101+
102+
# just get genotypes (helper fun)
103+
genot <- function(x) {
104+
x <- sub("(.*?)[\\.|:].*", "\\1", x)
105+
as.numeric(fifelse(x == "", "-1", x))
106+
}
107+
samp_info[, (samps) := lapply(.SD, genot), .SDcols = samps]
108+
samp_info[, (samps) := lapply(.SD, function(x) nts[cbind(rowid, x + 2)]), .SDcols = samps]
109+
110+
# merge with the parsed results
111+
vcf_prsed <- samp_info[prse_annots, on = "rowid"]
112+
vcf_prsed <- vcf_prsed[HGVS.p != "" & !grepl(exclude_effects, effect) & gene_id %in% genes]
113+
vcf_long <- melt(vcf_prsed[, c(samps, "REF", "POS", "gene_id", "effect", "ALT", "allele", "HGVS.p"), with = FALSE],
114+
measure.vars = samps, variable.name = "sample_id")
115+
116+
# get just the samples with mutations
117+
vcf_long <- vcf_long[value == allele]
118+
posits <- data.table(region = names(posits), POS = posits)
119+
vcf_long <- posits[vcf_long, on = "POS"]
120+
121+
setnames(vcf_long, old = c("POS", "REF", "gene_id", "allele", "HGVS.p"),
122+
new = c("position", "ref_sequence", "snpeff_gene_name", "sample_sequence",
123+
"mutation"))
124+
125+
return(vcf_long[, c("sample_id", "snpeff_gene_name", "region", "position", "mutation",
126+
"ref_sequence", "sample_sequence")])
127+
}
128+
129+
130+
}

R/snpeffr-package.R

+15
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
#' @keywords internal
2+
"snpeffr"
3+
4+
## usethis namespace: start
5+
#' @importFrom data.table .BY
6+
#' @importFrom data.table .EACHI
7+
#' @importFrom data.table .GRP
8+
#' @importFrom data.table .I
9+
#' @importFrom data.table .N
10+
#' @importFrom data.table .NGRP
11+
#' @importFrom data.table .SD
12+
#' @importFrom data.table :=
13+
#' @importFrom data.table data.table
14+
## usethis namespace: end
15+
NULL

inst/snpeffr.R

+57
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
#! /usr/bin/env Rscript
2+
3+
# load docopt
4+
tt <- require(docopt, quietly = TRUE)
5+
6+
if(!tt) {
7+
print("Installing docopt package first!\n")
8+
install.packages("docopt", repos = "https://cloud.r-project.org")
9+
library(docopt, warn.conflicts = FALSE)
10+
}
11+
12+
# configuration for docopt
13+
"
14+
Parse snpeff output to summary of mutations at positions of interest
15+
16+
Usage:
17+
snpeffr.r [--fpath=FPATH] [--pos=POS] [--genes=GENES] [-exc=EXCL] [-out=OUT]
18+
19+
Options:
20+
-v, --version Show version.
21+
-f FPATH --fpath=FPATH path to input vcf file from snpeff
22+
-p POS --pos=POS format as named comma separated list
23+
of positions (no spaces), i.e. see default
24+
[default: fks1_hs1=221638:221665,fks1_hs2=223782:223805]
25+
-g GENES --genes=GENES a list of comma separated gene names (no spaces) [default: CAB11_002014]
26+
-e EXCL --exc=EXCL a quoted regular expression for effects to exclude [default: 'synonymous_variant']
27+
-o OUT --out=OUT csv or gz file to save output to [default: out.csv]
28+
-h, --help show this help text
29+
" -> doc
30+
31+
argus <- docopt(doc, version = 'v1.0\n')
32+
33+
# check arguments
34+
genes <- trimws(unlist(strsplit(argus$genes, split = ",")))
35+
if(length(genes) < 1 || !is.character(genes)) {
36+
37+
stop("List of genes passed not formatted correctly. These should be
38+
formatted as: my_gene_1, my_gene_2")
39+
40+
}
41+
42+
pos <- eval(str2lang(paste0("list(", argus$pos, ")")))
43+
if(!is.list(pos) || length(pos) < 1) {
44+
stop("List of positions passed is not formatted correctly. These should be
45+
formatted as: gene_name1=posfirst:poslast, gene_name2=posfirst:poslast")
46+
}
47+
48+
# deal with quoting of regular expression
49+
argus$exc <- substr(argus$exc, 2, nchar(argus$exc) - 1)
50+
51+
# run parser
52+
out <- snpeffr::snpeffr(vcf_path = argus$fpath,
53+
positions = pos,
54+
genes = genes,
55+
exclude_effects = argus$exc)
56+
57+
data.table::fwrite(out, argus$out)

inst/vcf-filter_ann_mod.vcf.gz

20.3 MB
Binary file not shown.

man/snpeffr.Rd

+43
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

renv.lock

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
{
2+
"R": {
3+
"Version": "4.0.4",
4+
"Repositories": [
5+
{
6+
"Name": "CRAN",
7+
"URL": "https://cloud.r-project.org"
8+
}
9+
]
10+
},
11+
"Packages": {
12+
"renv": {
13+
"Package": "renv",
14+
"Version": "0.15.5",
15+
"Source": "Repository",
16+
"Repository": "CRAN",
17+
"Hash": "6a38294e7d12f5d8e656b08c5bd8ae34",
18+
"Requirements": []
19+
}
20+
}
21+
}

renv/.gitignore

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
library/
2+
local/
3+
cellar/
4+
lock/
5+
python/
6+
staging/

0 commit comments

Comments
 (0)