-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathalign.R
59 lines (49 loc) · 2.75 KB
/
align.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#
# Amadou Gaye - 13 September 2014
# This function processes two file prior to merging: it ensure the two files have only their overlapping SNPs
# included and aligns the alleles information using the alleles information of the first file.
# The arguments to the functions are the names of the input files, the path that indicates their location
# and the output of the ouptu files.
#
align <- function(file1, file2, indir, outdir){
# load the .bim files of the datasets
cat(" Loading .bim files\n")
t1 <- read.table(paste0(indir,file1,".bim"), header=F, sep="\t")
t2 <- read.table(paste0(indir,file2,".bim"), header=F, sep="\t")
# merge the 2 files by and chromosome, rsname and position
# I do not want rsnames with different chromosomal or physical position
colnames(t1) <- c("chr", "name", "gd", "pos", "allele1", "allele2")
colnames(t2) <- c("chr", "name", "gd", "pos", "allele1", "allele2")
mergedtable <- as.matrix(merge(t1,t2,by=c("name","chr","pos")))
# get the common rsnames and save it as a file
argfile1 <- paste0(outdir, file1, "_", file2, ".arg.txt")
write.table(mergedtable[,1], file=argfile1, quote=F, row.names=F, col.names=F)
# generate a plink file from the second file which contains only the overlapping rsnames
infile1 <- paste0(indir,file1)
infile2 <- paste0(indir,file2)
outfile1 <- paste0(outdir, file1)
tempfile2 <- paste0(outdir, file2,"_temp")
cat(" Generating the subset plink files with common markers only\n")
system(paste0("p-link --silent --bfile ", infile1," --extract ", argfile1, " --make-bed --out ", outfile1))
system(paste0("p-link --silent --bfile ", infile2," --extract ", argfile1, " --make-bed --out ", tempfile2))
# now get the allele coding of the merged file and update the allele information in the 2nd dataset
argfile2 <- paste0(outdir, file2,"_arg.txt")
mm <- cbind(mergedtable[,1], mergedtable[,c(8,9)], mergedtable[,c(5,6)])
# check for duplicated rsnames and keep only one, otherwise the aliging step will fail
mm <- unique(mm)
# now remove eventual duplicated rsname with differering allele coding (those will not be caught by the above 'unique' command)
duprs <- as.character(mm[duplicated(mm[,1]),1])
if(length(duprs) > 0){
for(i in 1:length(duprs)){
xx <- which(mm[,1] %in% duprs[i])
mm <- mm[-xx[2:length(xx)],]
}
}
write.table(mm, file=argfile2, quote=F, row.names=F, col.names=F, sep=" ")
# align second dataset allele info to those of the first dataset
outfile2 <- paste0(outdir,file2)
cat(" Aligning the allele information\n")
system(paste0("p-link --silent --bfile ", tempfile2," --update-alleles ", argfile2, " --make-bed --out ", outfile2))
# delete the temprorary file to spare space
system(paste0("rm ", tempfile2, ".*"))
}