-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreduce_db.sql
441 lines (373 loc) · 16.9 KB
/
reduce_db.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
-- This is an sqlite3 sql script to convert a VNTRseek sqlite3 database
-- to a smaller and more efficient version to use with VNTRview
--
-- USAGE:
-- $ sqlite3 source_database < conversion_script (this script)
-- eg.
-- $ sqlite3 hg00096.db < newdb.sql
-- Note that the name of the new database is hardcoded below and this has
-- to be made into a variable
-- ATTACH DATABASE 'temp_reduced.db' as 'newdb';
-- ATTACH DATABASE '{modified_database}' as 'newdb';
BEGIN TRANSACTION;
--*******************************************************
-- rank table
-- fields ties, refdir not used in any queries
-- multi-field primary key, use WITHOUT ROWID
-- ***field score is only used in mapped_reads queries for concatenated fields named Info (in some queries), or unnamed (in other queries); the Info field seems to be used for file naming in the fasta section of server.py
DROP TABLE IF EXISTS newdb.rank;
--CREATE TABLE newdb.rank (
-- refid integer NOT NULL,
-- readid integer NOT NULL,
-- score float DEFAULT NULL,
-- ties integer NOT NULL DEFAULT 0,
-- refdir char(1) NOT NULL,
-- PRIMARY KEY (refid, readid)
--) WITHOUT ROWID;
--INSERT INTO newdb.rank (refid, readid, score) SELECT refid, readid, score FROM rank;
--*******************************************************
-- rankflank table
-- field ties is not used in any query
-- multi-field primary key, use WITHOUT ROWID
-- ***field score is only used in mapped_reads queries for concatenated fields named Info (in some queries), or unnamed (in other queries); the Info field seems to be used for file naming in the fasta section of server.py
DROP TABLE IF EXISTS newdb.rankflank;
--CREATE TABLE newdb.rankflank (
-- refid integer NOT NULL,
-- readid integer NOT NULL,
-- score float DEFAULT NULL,
-- ties integer NOT NULL DEFAULT 0,
-- PRIMARY KEY (refid, readid)
--) WITHOUT ROWID;
-- INSERT INTO newdb.rankflank (refid, readid, score) SELECT refid, readid, score FROM rankflank;
--*******************************************************
-- rankandrankflank table
-- new table, to avoid join of rank and rankflank in reference.sql query
-- may drop this table
DROP TABLE IF EXISTS newdb.rankandrankflank;
-- CREATE TABLE newdb.rankandrankflank (
-- refid integer NOT NULL,
-- readid integer NOT NULL,
-- -- rankscore float DEFAULT NULL, -- do not need, not used in reference.sql
-- -- rankflankscore float DEFAULT NULL, -- do not need, not used in reference.sql
-- PRIMARY KEY (refid,readid)
-- ) WITHOUT ROWID;
--
-- INSERT INTO newdb.rankandrankflank (refid, readid) SELECT refid, readid FROM rank JOIN rankflank USING (refid, readid);
--*******************************************************
-- map table
-- field bbb needed to determine which reads are mapped bbb,
-- -- KA: 8/22
-- "but is never used in the new database once refidcounts table is created, so remove"
-- I don't think this is true
-- field reserved is 1 if the mapping is for a vntr
-- field reserved2 not used in any queries
-- multi-field primary key, use WITHOUT ROWID
-- XXX might be XXX is combined with other tables with same multi-field key
-- Analysis of queries shows that map has lookup of refid and a join using readid, but only after finding the refid, so can drop readid index
DROP TABLE IF EXISTS newdb.map;
--CREATE TABLE newdb.map (
-- refid integer NOT NULL,
-- readid integer NOT NULL,
-- reserved integer NOT NULL,
-- reserved2 integer NOT NULL,
-- bbb integer NOT NULL DEFAULT 0,
-- PRIMARY KEY (refid, readid)
--) WITHOUT ROWID;
--INSERT INTO newdb.map (refid, readid, reserved, bbb) SELECT refid, readid, reserved, bbb FROM map;
--INSERT INTO newdb.map (refid, readid, reserved) SELECT refid, readid, reserved FROM map;
--CREATE INDEX "idx_map_read_index" ON "map" (`readid`);
--*******************************************************
-- scores new table
-- combines map, rank, and rankflank tables
-- field bbb needed to determine which reads are mapped bbb
-- field reserved is 1 if the mapping is for a vntr
-- multi-field primary key, use WITHOUT ROWID
DROP TABLE IF EXISTS newdb.scores;
CREATE TABLE newdb.scores (
refid integer NOT NULL,
readid integer NOT NULL,
reserved integer NOT NULL,
bbb integer NOT NULL DEFAULT 0,
rank_score float DEFAULT NULL,
rankflank_score float DEFAULT NULL,
PRIMARY KEY (refid, readid)
) WITHOUT ROWID;
INSERT INTO newdb.scores (refid, readid, reserved, bbb, rank_score, rankflank_score)
SELECT refid, readid, reserved, bbb, rank.score, rankflank.score
FROM map LEFT JOIN rank using(refid, readid) LEFT JOIN rankflank USING(refid, readid);
--*******************************************************
-- refidcounts new table
-- used to speed up references query
-- and to add BBB mappings without ties
DROP TABLE IF EXISTS newdb.refidcounts;
CREATE TABLE newdb.refidcounts (
refid integer NOT NULL,
mapcount integer,
rankcount integer,
rankflankcount integer,
bbbcount integer,
support text,
PRIMARY KEY (refid)
);
INSERT INTO newdb.refidcounts (refid, mapcount, rankcount, rankflankcount, bbbcount, support)
SELECT rid,
(SELECT COUNT(*) FROM map WHERE map.refid=rid) as mapcount,
(SELECT COUNT(*) FROM rank WHERE rank.refid=rid) as rankcount,
(SELECT COUNT(*) FROM rankflank WHERE rankflank.refid=rid) as rankflankcount,
(SELECT COUNT(*) FROM map WHERE map.refid=rid and bbb=1) as bbbcount,
(SELECT GROUP_CONCAT(vntr_support.copies || "(" || support || ")") FROM vntr_support WHERE vntr_support.refid=-rid ORDER BY vntr_support.copies) as support
FROM fasta_ref_reps;
--*******************************************************
-- clusterlnk table
-- field reserved is VNTR_copy_diff in reference.sql
-- field direction is used in vcf_row...sql and others
-- ***field reserved2 is always zero and is used in mapped_align_reads_format...sql files, but is unnamed
-- clusterid is not used for joins
-- repeatid is unique and thus can be primary key
-- remove multi-field primary key
DROP TABLE IF EXISTS newdb.clusterlnk;
CREATE TABLE newdb.clusterlnk (
clusterid integer NOT NULL,
repeatid integer NOT NULL,
direction char(1) NOT NULL,
reserved integer NOT NULL,
-- reserved2 integer NOT NULL,
--PRIMARY KEY (clusterid,repeatid),
--UNIQUE (repeatid)
PRIMARY KEY (repeatid)
);
INSERT INTO newdb.clusterlnk (clusterid, repeatid, direction, reserved) SELECT clusterid, repeatid, direction, reserved FROM clusterlnk;
-- doesn't work for some reason, syntax error at .
-- CREATE INDEX newdb.idx_clusterlnk_clusterid_index ON clusterlnk (clusterid);
--*******************************************************
-- replnk table
-- all joins use rid
-- no joins use sid, so index is unneeded
-- fields profile, profilerc, profsize, patsize are not used in any queries
DROP TABLE IF EXISTS newdb.replnk;
CREATE TABLE newdb.replnk (
rid integer NOT NULL,
sid integer NOT NULL,
first integer NOT NULL,
last integer NOT NULL,
-- patsize integer NOT NULL,
copynum float NOT NULL,
pattern text NOT NULL,
-- profile text COLLATE BINARY,
-- profilerc text COLLATE BINARY,
-- profsize integer DEFAULT NULL,
PRIMARY KEY (rid)
);
--CREATE INDEX "idx_replnk_sid" ON "replnk" (sid);
INSERT INTO newdb.replnk (rid, sid, first, last, copynum, pattern) SELECT rid, sid, first, last, copynum, pattern FROM replnk;
--*******************************************************
-- fasta_reads table
-- field head is not used in a join, unique index can be removed
-- field qual is not used in any query
DROP TABLE IF EXISTS newdb.fasta_reads;
CREATE TABLE newdb.fasta_reads (
sid integer NOT NULL,
head varchar(500) NOT NULL,
dna varchar(8000) DEFAULT NULL,
-- qual varchar(8000) DEFAULT NULL,
PRIMARY KEY (sid)
-- UNIQUE (head)
);
INSERT INTO newdb.fasta_reads (sid, head, dna) SELECT sid, head, dna FROM fasta_reads;
--*******************************************************
-- clusters table
-- the following fields are empty: refs_flank_undist, refs_flank_dist, refs_flank_undist_l, refs_flank_undist_lr, refs_flank_undist_r
-- the following fields are empty, but used in cluster.sql and cluster_count.sql: assemblyreq, flankdensity, mcpattern, aveentropy
-- they are not used in the cluster.html page, so remove and let it fail
-- they are used in the filtering and ordering by parts of the html pages, so remove
DROP TABLE IF EXISTS newdb.clusters;
CREATE TABLE newdb.clusters (
cid integer NOT NULL,
minpat integer NOT NULL,
maxpat integer NOT NULL,
-- refs_flank_undist integer DEFAULT NULL,
-- refs_flank_dist integer DEFAULT NULL,
-- refs_flank_undist_l integer DEFAULT NULL,
-- refs_flank_undist_r integer DEFAULT NULL,
-- refs_flank_undist_lr integer DEFAULT NULL,
repeatcount integer NOT NULL,
refcount integer NOT NULL,
variability integer NOT NULL DEFAULT '0',
-- assemblyreq integer DEFAULT NULL,
profdensity float DEFAULT NULL,
-- flankdensity float DEFAULT NULL,
-- mcpattern varchar(5000) DEFAULT NULL,
-- aveentropy float DEFAULT NULL,
PRIMARY KEY (cid)
);
INSERT INTO newdb.clusters (cid, minpat, maxpat, repeatcount, refcount, variability, profdensity) SELECT cid, minpat, maxpat, repeatcount, refcount, variability, profdensity FROM clusters;
--*******************************************************
-- vntr_support table
-- all joins use refid, but it is not unique
-- multi-field primary key, use WITHOUT ROWID
-- no joins use representative, copies, or support so remove indexes
-- ***field copiesfloat is not used in any query
DROP TABLE IF EXISTS newdb.vntr_support;
CREATE TABLE newdb.vntr_support (
refid integer NOT NULL,
copies integer NOT NULL,
sameasref integer NOT NULL,
support integer NOT NULL DEFAULT '0',
copiesfloat float NOT NULL,
representative integer DEFAULT NULL,
PRIMARY KEY (refid,copies)
) WITHOUT ROWID;
-- CREATE INDEX "idx_vntr_support_read_index" ON "vntr_support" (`representative`);
-- CREATE INDEX "idx_vntr_support_copies_support" ON "vntr_support" (`copies`,`support`);
INSERT INTO newdb.vntr_support (refid, copies, sameasref, support, copiesfloat, representative) SELECT refid, copies, sameasref, support, copiesfloat, representative FROM vntr_support;
--*******************************************************
-- fasta_ref_reps table (sample)
-- all joins use rid
-- unique index not required
-- fields not used: flank_disting, has_support, span1, spanN, support_vntr, support_vntr_span1
-- fields maybe will be used in future: homez..., hetez...
-- field not required: comment
DROP TABLE IF EXISTS newdb.fasta_ref_reps;
CREATE TABLE newdb.fasta_ref_reps (
rid integer NOT NULL,
-- comment varchar(500) DEFAULT NULL,
-- flank_disting integer DEFAULT NULL,
entropy float NOT NULL DEFAULT '0',
-- has_support integer DEFAULT NULL,
-- span1 integer DEFAULT NULL,
-- spanN integer DEFAULT NULL,
homez_same integer DEFAULT NULL,
homez_diff integer DEFAULT NULL,
hetez_same integer DEFAULT NULL,
hetez_diff integer DEFAULT NULL,
hetez_multi integer DEFAULT NULL,
-- support_vntr integer DEFAULT NULL,
-- support_vntr_span1 integer DEFAULT NULL,
alleles_sup integer DEFAULT NULL,
allele_sup_same_as_ref integer DEFAULT NULL,
PRIMARY KEY (rid)
--UNIQUE (rid,comment)
);
INSERT INTO newdb.fasta_ref_reps (rid, entropy, homez_same, homez_diff, hetez_same, hetez_diff, hetez_multi, alleles_sup, allele_sup_same_as_ref) SELECT rid, entropy, homez_same, homez_diff, hetez_same, hetez_diff, hetez_multi, alleles_sup, allele_sup_same_as_ref FROM fasta_ref_reps;
--*******************************************************
-- stats table
DROP TABLE IF EXISTS newdb.stats;
CREATE TABLE newdb.stats (
id integer NOT NULL,
MAP_ROOT varchar(500) DEFAULT NULL,
PARAM_TRF varchar(500) DEFAULT NULL,
PARAM_PROCLU varchar(500) DEFAULT NULL,
FOLDER_FASTA varchar(500) DEFAULT NULL,
FOLDER_PROFILES varchar(500) DEFAULT NULL,
FOLDER_PROFILES_CLEAN varchar(500) DEFAULT NULL,
FOLDER_REFERENCE varchar(500) DEFAULT NULL,
FILE_REFERENCE_LEB varchar(500) DEFAULT NULL,
FILE_REFERENCE_SEQ varchar(500) DEFAULT NULL,
NUMBER_READS integer DEFAULT NULL,
NUMBER_TRS_IN_READS integer DEFAULT NULL,
NUMBER_TRS_IN_READS_GE7 integer DEFAULT NULL,
NUMBER_READS_WITHTRS integer DEFAULT NULL,
NUMBER_READS_WITHTRS_GE7 integer DEFAULT NULL,
NUMBER_READS_WITHTRS_GE7_AFTER_REDUND integer DEFAULT NULL,
NUMBER_TRS_IN_READS_AFTER_REDUND integer DEFAULT NULL,
NUMBER_REF_TRS integer DEFAULT NULL,
NUMBER_REFS_TRS_AFTER_REDUND integer DEFAULT NULL,
CLUST_NUMBER_OF_PROCLU_CLUSTERS integer DEFAULT NULL,
CLUST_NUMBER_OF_PROCLU_CLUSTERS_BEFORE_REJOIN integer DEFAULT NULL,
CLUST_NUMBER_OF_EXACTPAT_CLUSTERS integer DEFAULT NULL,
CLUST_NUMBER_OF_REF_REPS_IN_CLUSTERS integer DEFAULT NULL,
CLUST_NUMBER_OF_READ_REPS_IN_CLUSTERS integer DEFAULT NULL,
CLUST_LARGEST_NUMBER_OF_TRS_IN_PROCLU_CLUSTER integer DEFAULT NULL,
CLUST_LARGEST_NUMBER_OF_REFS_IN_PROCLU_CLUSTER integer DEFAULT NULL,
CLUST_LARGEST_PATRANGE_IN_PROCLU_CLUSTER integer DEFAULT NULL,
CLUST_LARGEST_NUMBER_OF_TRS_IN_EXACTPAT_CLUSTER integer DEFAULT NULL,
CLUST_LARGEST_NUMBER_OF_REFS_IN_EXACTPAT_CLUSTER integer DEFAULT NULL,
CLUST_NUMBER_OF_REFS_WITH_PREDICTED_VNTR integer DEFAULT NULL,
CLUST_NUMBER_OF_CLUSTERS_WITH_PREDICTED_VNTR integer DEFAULT NULL,
NUMBER_REFS_VNTR_SPAN_N integer DEFAULT NULL,
NUMBER_REFS_SINGLE_REF_CLUSTER integer DEFAULT NULL,
NUMBER_REFS_SINGLE_REF_CLUSTER_WITH_READS_MAPPED integer DEFAULT NULL,
NUMBER_REFS_SINGLE_REF_CLUSTER_WITH_NO_READS_MAPPED integer DEFAULT NULL,
NUMBER_MAPPED integer DEFAULT NULL,
NUMBER_RANK integer DEFAULT NULL,
NUMBER_RANKFLANK integer DEFAULT NULL,
INTERSECT_RANK_AND_RANKFLANK integer DEFAULT NULL,
INTERSECT_RANK_AND_RANKFLANK_BEFORE_PCR integer DEFAULT NULL,
BBB_WITH_MAP_DUPS integer DEFAULT NULL,
BBB integer DEFAULT NULL,
RANK_EDGES_OVERCUTOFF integer DEFAULT NULL,
RANK_REMOVED_SAMEREF integer DEFAULT NULL,
RANK_REMOVED_SAMESEQ integer DEFAULT NULL,
RANK_REMOVED_PCRDUP integer DEFAULT NULL,
RANKFLANK_EDGES_INSERTED integer DEFAULT NULL,
RANKFLANK_REMOVED_SAMEREF integer DEFAULT NULL,
RANKFLANK_REMOVED_SAMESEQ integer DEFAULT NULL,
RANKFLANK_REMOVED_PCRDUP integer DEFAULT NULL,
TIME_MYSQLCREATE integer DEFAULT NULL,
TIME_TRF integer DEFAULT NULL,
TIME_RENUMB integer DEFAULT NULL,
TIME_REDUND integer DEFAULT NULL,
TIME_PROCLU integer DEFAULT NULL,
TIME_JOINCLUST integer DEFAULT NULL,
TIME_DB_INSERT_REFS integer DEFAULT NULL,
TIME_DB_INSERT_READS integer DEFAULT NULL,
TIME_WRITE_FLANKS integer DEFAULT NULL,
TIME_MAP_FLANKS integer DEFAULT NULL,
TIME_MAP_REFFLANKS integer DEFAULT NULL,
TIME_MAP_INSERT integer DEFAULT NULL,
TIME_EDGES integer DEFAULT NULL,
TIME_INDEX_PCR integer DEFAULT NULL,
TIME_PCR_DUP integer DEFAULT NULL,
TIME_MAP_DUP integer DEFAULT NULL,
TIME_VNTR_PREDICT integer DEFAULT NULL,
TIME_ASSEMBLYREQ integer DEFAULT NULL,
TIME_REPORTS integer DEFAULT NULL,
TIME_DB_CONVERSION_AND_READ_COMPRESSION integer DEFAULT NULL,
DATE_MYSQLCREATE text DEFAULT NULL,
DATE_TRF text DEFAULT NULL,
DATE_RENUMB text DEFAULT NULL,
DATE_REDUND text DEFAULT NULL,
DATE_PROCLU text DEFAULT NULL,
DATE_JOINCLUST text DEFAULT NULL,
DATE_DB_INSERT_REFS text DEFAULT NULL,
DATE_DB_INSERT_READS text DEFAULT NULL,
DATE_WRITE_FLANKS text DEFAULT NULL,
DATE_MAP_FLANKS text DEFAULT NULL,
DATE_MAP_REFFLANKS text DEFAULT NULL,
DATE_MAP_INSERT text DEFAULT NULL,
DATE_EDGES text DEFAULT NULL,
DATE_INDEX_PCR text DEFAULT NULL,
DATE_PCR_DUP text DEFAULT NULL,
DATE_MAP_DUP text DEFAULT NULL,
DATE_VNTR_PREDICT text DEFAULT NULL,
DATE_ASSEMBLYREQ text DEFAULT NULL,
DATE_REPORTS text DEFAULT NULL,
DATE_DB_CONVERSION_AND_READ_COMPRESSION text DEFAULT NULL,
ERROR_STEP integer NOT NULL DEFAULT '0',
ERROR_DESC varchar(500) NOT NULL DEFAULT '',
ERROR_CODE integer NOT NULL DEFAULT '0',
N_MIN_SUPPORT integer NOT NULL DEFAULT '0',
MIN_FLANK_REQUIRED integer NOT NULL DEFAULT '0',
MAX_FLANK_CONSIDERED integer NOT NULL DEFAULT '0',
PRIMARY KEY (id)
);
-- copy table without naming fields
INSERT INTO newdb.stats SELECT * FROM stats;
--*******************************************************
COMMIT;
ANALYZE newdb;
-- can't vacuum from within a transaction
-- for vacuum, need to have enough space in the /var/tmp directory
-- During a VACUUM, SQLite creates a temporary file that is approximately the same size as the original database. It does this in order to maintain the database ACID properties. SQLite uses a directory to hold the temporary files it needs while doing the VACUUM. In order to determine what directory to use, it descends a hierarchy looking for the first directory that has the proper access permissions.For Unix (and Linux) the hierarchy is:
-- Whatever is specified by the SQLITE_TMPDIR environment variable,
-- Whatever is specified by the TMPDIR environment variable,
-- /var/tmp,
-- /usr/tmp,
-- /tmp, and finally
-- The current working directory.
-- possible fix (should put these at top)
-- may not work because temp_store_directory is deprecated
-- pragma temp_store = 1;
-- pragma temp_store_directory = '/var/www/VNTRview_1.09/db';
VACUUM newdb;
DETACH newdb;