-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstep_01_import_raw_data.R
186 lines (155 loc) · 5.55 KB
/
step_01_import_raw_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
library(tidyverse)
library(stringi)
library(conflicted)
# Load the data in data/Pair{n}/*.tsv
# n is the number of the subject pair
# first get a list of all the files
# originals start with Subj*
data_prefix <- "qtm_capture_"
data_dir <- "data/raw/"
data_dir_out <- "data/"
data_out_prefix <- "data_"
metadata_out_prefix <- "metadata_"
events_out_prefix <- "events_"
data_files <- Sys.glob(paste0(data_dir, data_prefix, "*.tsv"))
# id trial_order partner door_or_window handedness age tone_deaf gender years_formal_music_training rating1 rating2 rating3 rating4
subject_info <- read_tsv(
paste0(
data_dir, "subject_information.tsv"
),
col_types = cols(
id = col_character(),
trial_order = col_character(),
partner = col_character(),
door_or_window = col_character(),
handedness = col_character(),
age_range = col_double(),
tone_deaf = col_logical(),
gender = col_character(),
years_formal_music_training = col_double(),
rating1 = col_double(),
rating2 = col_double(),
rating3 = col_double(),
rating4 = col_double()
)
)
# now we need to anonymize the data
# specifically, removing the timestamp
# (this is in the header data starting with TIME_STAMP)
# we also need to extract the events, and make it into
# a proper TSV, we can save the events in a separate file
print("Cleaning data")
print(paste("Found", length(data_files), "files"))
print("=====================")
# read in the data as raw text
for (data_file in data_files) {
print(paste("Processing", data_file))
# read in the data
dat <- readLines(data_file)
subj_ids_match <- stri_match_first_regex(
data_file, paste0(data_prefix, "([^_]+)_([^_\\.]+)"))
subj_id_1 <- subj_ids_match[1, 2]
subj_id_2 <- subj_ids_match[1, 3]
subj_w <- subject_info %>%
filter(door_or_window == "w" & (id == subj_id_1 | id == subj_id_2)) %>%
pull(id)
subj_d <- subject_info %>%
filter(door_or_window == "d" & (id == subj_id_1 | id == subj_id_2)) %>%
pull(id)
if (length(subj_w) == 0 | length(subj_d) == 0) {
stop(paste("Could not find subject",
subj_id_1, "or", subj_id_2, "in subject_info"))
}
if (length(subj_w) > 1 | length(subj_d) > 1) {
stop(paste("Found multiple subjects",
subj_id_1, "or", subj_id_2, "in subject_info"))
}
subj_w <- subj_w[1]
subj_d <- subj_d[1]
data_file <- sub(data_dir, data_dir_out, data_file)
data_file <- sub(".tsv", ".tsv.bz2", data_file)
# remove the timestamp
events <- stri_extract_first_regex(dat, "^EVENT.*")
# remove NAs
events <- events[!is.na(events)]
# now get other relevant metadata
# get frequency
freq <- stri_extract_first_regex(dat, "^FREQUENCY.*")
freq <- freq[!is.na(freq)]
# get marker count
marker_count <- stri_extract_first_regex(dat, "^NO_OF_MARKERS.*")
marker_count <- marker_count[!is.na(marker_count)]
marker_count_value <- as.integer(stri_split_fixed(marker_count, "\t")[[1]][2])
# get frame count
frame_count <- stri_extract_first_regex(dat, "^NO_OF_FRAMES.*")
frame_count <- frame_count[!is.na(frame_count)]
frame_count_value <- as.integer(stri_split_fixed(frame_count, "\t")[[1]][2])
# get marker names
marker_names <- stri_extract_first_regex(dat, "^MARKER_NAMES.*")
marker_names <- marker_names[!is.na(marker_names)]
marker_names_values <- stri_split_fixed(
marker_names,
"\t"
)[[1]][1:marker_count_value + 1]
# now just keep the tracking information
dat <- stri_extract_first_regex(dat, "^[0-9]+\t.*")
dat <- dat[!is.na(dat)]
# now ensure the number of frames is correct
# this is the number of lines in the data
assertthat::assert_that(
length(dat) == frame_count_value,
msg = "Number of frames is not correct"
)
# ensure the number of markers is correct
# this is 3 columns per marker, plus index and time
assertthat::assert_that(
length(stri_split_fixed(dat[[1]], "\t")[[1]]) == marker_count_value * 3 + 2,
msg = "Number of markers is not correct"
)
print(paste("File has", length(dat), "frames"))
print(paste("File has", marker_count_value, "markers"))
print("Writing data files......")
# now we need to add a header row to the data
# this is:
# index\telapsed_time\t{marker_names}[1]x\t
# {marker_names}[1]y\t{marker_names}[1]z ...
# where {marker_names}[1] is the first marker name
header <- c(
"index",
"elapsed_time",
paste0(
rep(marker_names_values, each = 3),
c("_x", "_y", "_z")
),
"subj_w",
"subj_d"
)
header <- paste0(header, collapse = "\t")
print("adding subject IDs")
# now we need to add the subject ids to the data
# this will be the last two columns
dat <- sapply(dat, paste0, "\t", subj_w, "\t", subj_d)
dat <- c(header, dat)
# write the data back out in the same directory but prefix with anon_
data_out_filename <- gsub(data_prefix, data_out_prefix, data_file)
print(paste("Writing", data_out_filename))
con <- bzfile(data_out_filename)
writeLines(dat, con)
close(con)
# add header to events
events <- c("type\tevent_label\tindex\telapsed_time", events)
# write the events out to a separate file
event_out_filename <- gsub(data_prefix, events_out_prefix, data_file)
print(paste("Writing", event_out_filename))
con <- bzfile(event_out_filename)
writeLines(events, con)
close(con)
# write the metadata out to a separate file
metadata <- c(freq, marker_count, frame_count, marker_names)
metadata_out_filename <- gsub(data_prefix, metadata_out_prefix, data_file)
print(paste("Writing", metadata_out_filename))
con <- bzfile(metadata_out_filename)
writeLines(metadata, con)
close(con)
}
print("Done")