-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsummarize.R
94 lines (76 loc) · 3.96 KB
/
summarize.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#
# Amadou Gaye - 17 September 2014
# This script summarizes the results of the plink IBD checks. It counts and records the names of GIDs that
# were expected to be duplicates (i.e. those GIDs that wre present two or more times in within or across
# datasets) and found as so, the GIDs expected to be found as duplicates but not found and the unexpected
# duplicates (i.e. where two different GID have an IBD proportion > 0.9.
#
# names of the plink datasets
files <- c("cng","metabochip","rahi","t1dgc","wtccc1_aff","wtccc1_inf","wtccc2_ill","wtccc1_15k","wtccc2_aff")
size <- length(files)
# path to input files
path0 = "/home/ag13748/1958BC/Merge.31.8.14/checksResults/"
# path to output files
path1 = "/home/ag13748/1958BC/Merge.31.8.14/summaries/"
# load the functions required
source("/home/ag13748/1958BC/1958BC_Merge/countwithin.R")
source("/home/ag13748/1958BC/1958BC_Merge/countacross.R")
# count expected duplicates found, expected duplicates not found and unexpected duplicates found
# the 1st matrix hold the counts of expected found and not found and the 2nd the counts of unexpected
count1 <- matrix(data=NA, nrow=size, ncol=size, dimnames=list(files,files))
count2 <- matrix(data=NA, nrow=size, ncol=size, dimnames=list(files,files))
for(i in 1:size){
if(i == size){
# print file names to monitor progress
cat(files[i],"...\n")
tlast <- read.table(paste0(path0,files[i],".genome"), header=T)
flast <- read.table(paste0(path,files[i], ".txt"))
# get counts of duplicates within the dataset
out1 <- countwithin(tlast, flast, files[i], path1)
count1[i,i] <- paste0(as.character(length(out1[[1]])), " [", length(out1[[2]]), "]")
count2[i,i] <- dim(out1[[3]])[1]
break
}
for(j in (i+1):size){
# load input file
infile <- paste0(path0,files[i], "_", files[j])
t <- read.table(paste0(infile,".genome"), header=T)
# get the duplicated GID names
path <- "/home/ag13748/1958BC/Merge.31.8.14/plinkArgFiles/args4conversion/"
f1 <- read.table(paste0(path,files[i], ".txt"))
f2 <- read.table(paste0(path,files[j], ".txt"))
if(i == j-1){
# print file names to monitor progress
cat(files[i],"...\n")
tdiag <- read.table(paste0(path0,files[i],".genome"), header=T)
# get counts of duplicates within the dataset
out1 <- countwithin(tdiag, f1, files[i], path1)
count1[i,i] <- paste0(as.character(length(out1[[1]])), " [", length(out1[[2]]), "]")
count2[i,i] <- dim(out1[[3]])[1]
}
# print file names to monitor progress
cat(files[i],"-", files[j], "...\n")
# output file
system(paste0("mkdir ", paste0(path1, files[i], "_", files[j])))
outpath <- paste0(path1, files[i], "_", files[j],"/")
# get counts of duplicates across datasets
out2 <- countacross(t, f1, f2, files[i], files[j])
# save the GIDs in each of the categories (found, not found and unexpected
if(length(out2[[1]][,1]) > 0){
write.table(out2[[1]][,1], file=paste0(outpath,"expectedfound.txt"), quote=F, row.names=F, col.names="GID")
}
if(length(out2[[2]]) > 0){
write.table(out2[[2]], file=paste0(outpath,"expectedNOTfound.txt"), quote=F, row.names=F, col.names="GID")
}
if(length(out2[[3]]) > 0){
write.table(out2[[3]], file=paste0(outpath,"unexpected.txt"), quote=F, row.names=F, col.names=c(files[i], files[j]), sep=";")
}
# store the counts of expected found, not found in the cell in the 1st count matrix
count1[i,j] <- paste0(as.character(dim(out2[[1]])[1]), " [", length(out2[[2]]),"]")
# store the counts of unexpected duplicates in the 2nd matrix
count2[i,j] <- dim(out2[[3]])[1]
}
}
# save the count tables
write.table(count1, file=paste0(path1,"countsofexpected.csv"), quote=FALSE, row.names=TRUE, col.names=TRUE, sep=';')
write.table(count2, file=paste0(path1,"countsofUNexpected.csv"), quote=FALSE, row.names=TRUE, col.names=TRUE, sep=';')