-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathevaluate_marker_sets.R
146 lines (131 loc) · 5.12 KB
/
evaluate_marker_sets.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# Functions to assess a set of markers for how well it can differentiate accessions
# freqmat is a matrix of allele frequencies with markers in rows and populations in columns.
# Function to get expected heterozygosity, from a vector of allele frequencies,
# assuming biallelic markers.
Expected_het <- function(freq){
return(1 - freq ^ 2 - (1 - freq) ^ 2)
}
# Get Jost's D from a matrix of allele frequencies.
# Global Jost's D is calculated. To get pairwise Jost's D, do two columns at a time.
JostD <- function(freqmat){
meanfreq <- rowMeans(freqmat, na.rm = TRUE)
Ht <- Expected_het(meanfreq)
Hs_mat <- apply(freqmat, 2, Expected_het)
Hs <- rowMeans(Hs_mat, na.rm = TRUE)
return((Ht - Hs) / (1 - Hs))
}
# The geometric mean of pairwise Jost's D could be a good way to see if we have markers that distinguish all pops
DiffScore <- function(freqmat){
npop <- ncol(freqmat)
if(npop < 2) stop("Can't do population differentiation with less than two populations.")
npairs <- sum(seq_len(npop - 1))
p <- 1
dmat <- matrix(NA_real_, nrow = nrow(freqmat), ncol = npairs,
dimnames = list(rownames(freqmat), NULL))
for(i in 1:(npop - 1)){
for(j in (i + 1):npop){
dmat[,p] <- JostD(freqmat[,c(i, j)])
p <- p + 1
}
}
score <- exp(mean(log(colMeans(dmat, na.rm = TRUE))))
return(score)
}
# Geometric mean of expected heterozygosity across populations
DivScore <- function(freqmat){
hmat <- Expected_het(freqmat)
score <- exp(mean(log(colMeans(hmat, na.rm = TRUE))))
return(score)
}
# Simulated annealing algorithm to find an ideal marker set.
# nSNP: The number of SNPs to have in the set.
# within_pop_weight and between_pop_weight indicate how important it is to have
# variation within and between populations, respectively.
# T0: Starting temperature. The higher this is, the more the algorithm will
# randomly explore the solution space.
# reps: Number of sets to test at each temperature. More reps means a longer
# computational time but more thorough exploration of solutions.
# rho: The factor by which temperature decreases with each round. The higher
# this is, the more the algorithm will randomly explore the solution space.
# maxrounds: The maximum number of rounds of simulated annealing to perform
# before stopping. The algorithm will also stop if no new solutions are found
# within a round.
# min_dist_bp: The minimum distance, in basepairs, between markers in the set.
findMarkerSet <- function(freqmat, nSNP = 20,
within_pop_weight = 1, between_pop_weight = 0.5,
T0 = 0.2, reps = 500, rho = 0.95,
maxrounds = 100,
min_dist_bp = 1e5){
# setup
totSNP <- nrow(freqmat)
Temp <- T0
temps_used <- numeric(maxrounds)
scores_by_round <- numeric(maxrounds)
# function to generate a score for a set of SNPs
scorefn <- function(snps){
num <- log(DiffScore(freqmat[snps,])) * between_pop_weight +
log(DivScore(freqmat[snps,])) * within_pop_weight
den <- between_pop_weight + within_pop_weight
return(exp(num / den))
}
# function to determine if a SNP is too physically close to any already in the set
chromosomes <- sub("_[[:digit:]]+$", "", rownames(freqmat))
positions <- as.integer(sub("^.+_", "", rownames(freqmat)))
snp_pos_ok <- function(snp, set){
samechr <- set[chromosomes[set] == chromosomes[snp]]
return(all(abs(positions[samechr] - positions[snp]) >= min_dist_bp))
}
# initial set of SNPs
thisset <- integer(nSNP)
for(s in seq_len(nSNP)){
snp <- sample(totSNP, 1)
while(!snp_pos_ok(snp, thisset[seq_len(s - 1)])){
snp <- sample(totSNP, 1)
}
thisset[s] <- snp
}
thisscore <- scorefn(thisset)
bestset <- thisset
bestscore <- thisscore
# simulated annealing
for(i in seq_len(maxrounds)){
message(paste("Simulated annealing round", i))
switched <- FALSE
for(j in seq_len(reps)){
# remove one SNP
newset <- sample(thisset, nSNP - 1)
# find a new SNP and confirm it is not too close to any in the set
repeat{
newsnp <- sample(seq_len(totSNP)[-newset], 1)
if(snp_pos_ok(newsnp, newset)) break
}
newset <- c(newset, newsnp)
# score the set of SNPs by diversity and differentiation
newscore <- scorefn(newset)
# determine whether to move to this new solution
if(newscore > thisscore ||
runif(1) < Temp - (thisscore - newscore)){
thisset <- newset
thisscore <- newscore
switched <- TRUE
if(thisscore > bestscore){
bestset <- thisset
bestscore <- thisscore
}
}
}
temps_used[i] <- Temp
scores_by_round[i] <- bestscore
Temp <- Temp * rho
if(!switched) break
}
return(list(Set = rownames(freqmat)[bestset],
Temperatures = temps_used[1:i],
Scores = scores_by_round[1:i]))
}
## Determine the proportion of genotypes that are unique with a given set of markers
propUnique <- function(numgen){
dup1 <- duplicated(numgen, MARGIN = 2)
dup2 <- duplicated(numgen, MARGIN = 2, fromLast = TRUE)
return(mean(!dup1 & !dup2))
}