-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_GenBank_DNA.R
97 lines (80 loc) · 3.63 KB
/
get_GenBank_DNA.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
## Code for downloading batches of DNA sequences from GenBank for multiple
## genes at a time.
##
## Written by Vikram B. Baliga
## Last updated: 2019-12-05
############################### package loading ################################
## Specify the packages you'll use in the script
packages <- c("tidyverse", "ape")
## Now for each package you listed, first check to see if the package is already
## installed. If it is installed, it's simply loaded. If not, it's downloaded
## from CRAN and then installed and loaded.
package.check <- lapply(packages,
FUN = function(x) {
if (!require(x, character.only = TRUE)) {
install.packages(x, dependencies = TRUE)
library(x, character.only = TRUE)
}
}
)
################################# data import ##################################
## The imported csv file should have:
## - column 1 = species names; example file uses "binomial" as this column name
## - each successive column represents one gene
## - each row is one species
accessions_batch <- read_csv("./Baliga_et_al_2019_SciAdv_all_genes.csv")
## Now exclude the column with species names and make a list of each gene set
accessions_list <-
accessions_batch %>%
## exclude species names
dplyr::select(-binomial) %>%
## coerce to list
as.list() %>%
## remove NAs
lapply(function(x) x[!is.na(x)])
## OPTIONAL:
## Perform a check - does the number of genes match expectations?
## Next block of code is commented out so it doesn't run; un-comment if you want
## to use it.
### First state how many genes you expect
#number_of_genes <- 12
### Now test
#ifelse(length(accessions_list) == number_of_genes,
# yes = "Number of genes matches expectations",
# no = stop("\nNumber of genes does not match your expectations.
# \nPlease check how data are being imported."))
################################ batch downloading #############################
## It is possible to use lapply() on accessions_list to apply read.GenBank()
## to each gene within the list. BUT, you'll run into a `429 Too Many Requests`
## after the first few hundred. It's an API issue. The code below gives an
## example of this, which won't work. Maybe someday this will be permissible...
#test <- lapply(accessions_list, ape::read.GenBank, species.names = TRUE)
## Instead, we'll use purrr::slowly() along with map().
## Use rate_delay() to make R momentarily pause before running the next line.
## This prevents overloading the API.
## I've found a 2-sec pause seems to work; YMMV.
sleep_timer <- rate_delay(2)
## Now use slowly() to take ape::read.GenBank() and wait 5 secs
slow_pulls <- slowly(~ read.GenBank(.x, species.names = TRUE),
rate = sleep_timer,
quiet = FALSE)
## Now map this function, using accessions_list
sequences_set <-
map(accessions_list, slow_pulls) %>%
## And rename according to species instead of accession IDs
lapply(function(x) {
attr(x, "species") -> attr(x, "names")
return(x)
})
################################ writing to files ##############################
## First make list of gene names to prep for file export
gene_names <-
colnames(accessions_batch)[-1] %>%
paste0("_seqs.fasta")
## Write each set of sequences to FASTA files
invisible(lapply(1:length(sequences_set),
function(x) {
write.dna(sequences_set[[x]],
gene_names[[x]],
format = "fasta")
}))