-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcedarpp.h
877 lines (871 loc) · 36.1 KB
/
cedarpp.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
// cedar -- C++ implementation of Efficiently-updatable Double ARray trie
// $Id: cedarpp.h 1830 2014-06-16 06:17:42Z ynaga $
// Copyright (c) 2009-2014 Naoki Yoshinaga <[email protected]>
//
// Modifications to the code by Gopalakrishna Palem <[email protected]>
// 1. Efficient reset method to reset the trie without reallocating the memory
// 2. Additional CommonPrefixSearch() based on sentinel(without the need for computing the string length)
//
#ifndef CEDAR_H
#define CEDAR_H
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <climits>
#include <cassert>
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#define STATIC_ASSERT(e, msg) typedef char msg[(e) ? 1 : -1]
namespace cedar {
// typedefs
#if LONG_BIT == 64
typedef unsigned long npos_t; // possibly compatible with size_t
#else
typedef unsigned long long npos_t;
#endif
typedef unsigned char uchar;
static const npos_t TAIL_OFFSET_MASK = static_cast <npos_t> (0xffffffff);
static const npos_t NODE_INDEX_MASK = static_cast <npos_t> (0xffffffff) << 32;
template <typename T> struct NaN { enum { N1 = -1, N2 = -2 }; };
template <> struct NaN <float> { enum { N1 = 0x7f800001, N2 = 0x7f800002 }; };
static const int MAX_ALLOC_SIZE = 1 << 16; // must be divisible by 256
// dynamic double array
template <typename value_type,
const int NO_VALUE = NaN <value_type>::N1,
const int NO_PATH = NaN <value_type>::N2,
const bool ORDERED = true,
const int MAX_TRIAL = 1,
const size_t NUM_TRACKING_NODES = 0>
class da {
public:
enum error_code { CEDAR_NO_VALUE = NO_VALUE, CEDAR_NO_PATH = NO_PATH };
typedef value_type result_type;
struct result_pair_type {
value_type value;
size_t length; // prefix length
};
struct result_triple_type { // for predict ()
value_type value;
size_t length; // suffix length
npos_t id; // node id of value
};
struct node {
union { int base; value_type value; }; // negative means prev empty index
int check; // negative means next empty index
node (const int base_ = 0, const int check_ = 0)
: base (base_), check (check_) {}
};
struct ninfo { // x1.5 update speed; +.25 % memory (8n -> 10n)
uchar sibling; // right sibling (= 0 if not exist)
uchar child; // first child
ninfo () : sibling (0), child (0) {}
};
struct block { // a block w/ 256 elements
int prev; // prev block; 3 bytes
int next; // next block; 3 bytes
short num; // # empty elements; 0 - 256
short reject; // minimum # branching failed to locate; soft limit
int trial; // # trial
int ehead; // first empty item
block () : prev (0), next (0), num (256), reject (257), trial (0), ehead (0) {}
};
da () : tracking_node (), _array (0), _tail (0), _tail0 (0), _ninfo (0), _block (0), _bheadF (0), _bheadC (0), _bheadO (0), _capacity (0), _size (0), _quota (0), _quota0 (0), _no_delete (false), _reject () {
STATIC_ASSERT(sizeof (value_type) <= sizeof (int),
value_type_is_not_supported___maintain_a_value_array_by_yourself_and_store_its_index_to_trie
);
_initialize ();
}
~da () { clear (false); }
size_t capacity () const { return static_cast <size_t> (_capacity); }
size_t size () const { return static_cast <size_t> (_size); }
size_t length () const { return static_cast <size_t> (*_length); }
size_t total_size () const { return sizeof (node) * _size; }
size_t unit_size () const { return sizeof (node); }
size_t nonzero_size () const {
size_t i = 0;
for (int to = 0; to < _size; ++to)
if (_array[to].check >= 0) ++i;
return i;
}
size_t nonzero_length () const {
size_t i (0), j (0);
for (int to = 0; to < _size; ++to) {
const node& n = _array[to];
if (n.check >= 0 && _array[n.check].base != to && n.base < 0)
{ ++j; for (const char* p = &_tail[-n.base]; *p; ++p) ++i; }
}
return i + j * (1 + sizeof (value_type));
}
size_t num_keys () const {
size_t i = 0;
for (int to = 0; to < _size; ++to) {
const node& n = _array[to];
if (n.check >= 0 && (_array[n.check].base == to || n.base < 0)) ++i;
}
return i;
}
// interfance
template <typename T>
T exactMatchSearch (const char* key) const
{ return exactMatchSearch <T> (key, std::strlen (key)); }
template <typename T>
T exactMatchSearch (const char* key, size_t len, npos_t from = 0) const {
union { int i; value_type x; } b;
size_t pos = 0;
b.i = _find (key, from, pos, len);
if (b.i == CEDAR_NO_PATH) b.i = CEDAR_NO_VALUE;
T result;
_set_result (&result, b.x, len, from);
return result;
}
template <typename T, typename TSentinel>
size_t commonPrefixSearch (const char* key, T* result, size_t result_len, TSentinel sentinel) const
{
npos_t from = 0;
size_t num = 0;
for (size_t pos = 0; key[pos] != sentinel; ) { // break when key[pos] == '\0'
union { int i; value_type x; } b;
b.i = _find(key, from, pos, pos + 1);
if (b.i == CEDAR_NO_VALUE) continue;
if (b.i == CEDAR_NO_PATH) return num;
if (num < result_len) _set_result(&result[num], b.x, pos, from);
++num;
}
return num;
}
template <typename T>
size_t commonPrefixSearch (const char* key, T* result, size_t result_len) const
{ return commonPrefixSearch (key, result, result_len, std::strlen (key)); }
template <typename T>
size_t commonPrefixSearch (const char* key, T* result, size_t result_len, size_t len, npos_t from = 0) const {
size_t num = 0;
for (size_t pos = 0; pos < len; ) {
union { int i; value_type x; } b;
b.i = _find (key, from, pos, pos + 1);
if (b.i == CEDAR_NO_VALUE) continue;
if (b.i == CEDAR_NO_PATH) return num;
if (num < result_len) _set_result (&result[num], b.x, pos, from);
++num;
}
return num;
}
// predict key from double array
template <typename T>
size_t commonPrefixPredict (const char* key, T* result, size_t result_len)
{ return commonPrefixPredict (key, result, result_len, std::strlen (key)); }
template <typename T>
size_t commonPrefixPredict (const char* key, T* result, size_t result_len, size_t len, npos_t from = 0) {
size_t num (0), pos (0), p (0);
if (_find (key, from, pos, len) == CEDAR_NO_PATH) return 0;
union { int i; value_type x; } b;
const npos_t root = from;
for (b.i = begin (from, p); b.i != CEDAR_NO_PATH; b.i = next (from, p, root)) {
if (num < result_len)
_set_result (&result[num], b.x, p, from);
++num;
}
return num;
}
void suffix (char* key, size_t len, npos_t to) const {
key[len] = '\0';
if (const int offset = static_cast <int> (to >> 32)) {
to &= TAIL_OFFSET_MASK;
size_t len_tail = std::strlen (&_tail[-_array[to].base]);
if (len > len_tail) len -= len_tail; else len_tail = len, len = 0;
std::memcpy (&key[len], &_tail[static_cast <size_t> (offset) - len_tail], len_tail);
}
while (len--) {
const int from = _array[to].check;
key[len] = static_cast <char> (_array[from].base ^ static_cast <int> (to));
to = static_cast <npos_t> (from);
}
}
value_type traverse (const char* key, npos_t& from, size_t& pos) const
{ return traverse (key, from, pos, std::strlen (key)); }
value_type traverse (const char* key, npos_t& from, size_t& pos, size_t len) const {
union { int i; value_type x; } b;
b.i = _find (key, from, pos, len);
return b.x;
}
struct empty_callback { void operator () (const int, const int) {} }; // dummy empty function
value_type& update (const char* key)
{ return update (key, std::strlen (key)); }
value_type& update (const char* key, size_t len, value_type val = value_type (0))
{ npos_t from (0); size_t pos (0); return update (key, from, pos, len, val); }
value_type& update (const char* key, npos_t& from, size_t& pos, size_t len, value_type val = value_type (0))
{ empty_callback cf; return update (key, from, pos, len, val, cf); }
template <typename T>
value_type& update (const char* key, npos_t& from, size_t& pos, size_t len, value_type val, T& cf) {
if (! len && ! from)
_err (__FILE__, __LINE__, "failed to insert zero-length key\n");
#ifndef USE_FAST_LOAD
if (! _ninfo || ! _block) restore ();
#endif
npos_t offset = from >> 32;
if (! offset) { // node on trie
for (const uchar* const key_ = reinterpret_cast <const uchar*> (key);
_array[from].base >= 0; ++pos) {
if (pos == len)
{ const int to = _follow (from, 0, cf); return _array[to].value += val; }
from = static_cast <size_t> (_follow (from, key_[pos], cf));
}
offset = static_cast <npos_t> (-_array[from].base);
}
if (offset >= sizeof (int)) { // go to _tail
const size_t pos_orig = pos;
char* const tail = &_tail[offset] - pos;
while (pos < len && key[pos] == tail[pos]) ++pos;
//
if (pos == len && tail[pos] == '\0') { // found exact key
if (const npos_t moved = pos - pos_orig) { // search end on tail
from &= TAIL_OFFSET_MASK;
from |= (offset + moved) << 32;
}
return *reinterpret_cast <value_type*> (&tail[len + 1]) += val;
}
// otherwise, insert the common prefix in tail if any
if (from >> 32) {
from &= TAIL_OFFSET_MASK; // reset to update tail offset
for (npos_t offset_ = static_cast <npos_t> (-_array[from].base);
offset_ < offset; ) {
from = static_cast <size_t>
(_follow (from, static_cast <uchar> (_tail[offset_]), cf));
++offset_;
// this shows intricacy in debugging updatable double array trie
if (NUM_TRACKING_NODES) // keep the traversed node (on tail) updated
for (size_t j = 0; tracking_node[j] != 0; ++j)
if (tracking_node[j] >> 32 == offset_)
tracking_node[j] = static_cast <npos_t> (from);
}
}
for (size_t pos_ = pos_orig; pos_ < pos; ++pos_)
from = static_cast <size_t>
(_follow (from, static_cast <uchar> (key[pos_]), cf));
npos_t moved = pos - pos_orig;
if (tail[pos]) { // remember to move offset to existing tail
const int to_ = _follow (from, static_cast <uchar> (tail[pos]), cf);
_array[to_].base = - static_cast <int> (offset + ++moved);
moved -= 1 + sizeof (value_type); // keep record
}
moved += offset;
for (npos_t i = offset; i <= moved; i += 1 + sizeof (value_type)) {
if (_quota0 == ++*_length0) {
#ifdef USE_EXACT_FIT
_quota0 += *_length0 >= MAX_ALLOC_SIZE ? MAX_ALLOC_SIZE : *_length0;
#else
_quota0 += _quota0;
#endif
_realloc_array (_tail0, _quota0, *_length0);
}
_tail0[*_length0] = static_cast <int> (i);
}
if (pos == len || tail[pos] == '\0') {
const int to = _follow (from, 0, cf);
if (pos == len) return _array[to].value += val; // set value on tail
_array[to].value += *reinterpret_cast <value_type*> (&tail[pos + 1]);
}
from = static_cast <size_t> (_follow (from, static_cast <uchar> (key[pos]), cf));
++pos;
}
const int needed = static_cast <int> (len - pos + 1 + sizeof (value_type));
if (pos == len && *_length0) { // reuse
const int offset0 = _tail0[*_length0];
_tail[offset0] = '\0';
_array[from].base = -offset0;
--*_length0;
return *reinterpret_cast <value_type*> (&_tail[offset0 + 1]) = val;
}
if (_quota < *_length + needed) {
#ifdef USE_EXACT_FIT
_quota += needed > *_length || needed > MAX_ALLOC_SIZE ? needed :
(*_length >= MAX_ALLOC_SIZE ? MAX_ALLOC_SIZE : *_length);
#else
_quota += _quota >= needed ? _quota : needed;
#endif
_realloc_array (_tail, _quota, *_length);
}
_array[from].base = -*_length;
const size_t pos_orig = pos;
char* const tail = &_tail[*_length] - pos;
if (pos < len) {
do tail[pos] = key[pos]; while (++pos < len);
from |= (static_cast <npos_t> (*_length) + (len - pos_orig)) << 32;
}
*_length += needed;
return *reinterpret_cast <value_type*> (&tail[len + 1]) += val;
}
// easy-going erase () without compression
int erase (const char* key) { return erase (key, std::strlen (key)); }
int erase (const char* key, size_t len, npos_t from = 0) {
size_t pos = 0;
const int i = _find (key, from, pos, len);
if (i == CEDAR_NO_PATH || i == CEDAR_NO_VALUE) return -1;
if (from >> 32) from &= TAIL_OFFSET_MASK; // leave tail as is
bool flag = _array[from].base < 0; // have sibling
int e = flag ? static_cast <int> (from) : _array[from].base ^ 0;
from = _array[e].check;
do {
const node& n = _array[from];
flag = _ninfo[n.base ^ _ninfo[from].child].sibling;
if (flag) _pop_sibling (from, n.base, static_cast <uchar> (n.base ^ e));
_push_enode (e);
e = static_cast <int> (from);
from = static_cast <size_t> (_array[from].check);
} while (! flag);
return 0;
}
int build (size_t num, const char** key, const size_t* len = 0, const value_type* val = 0) {
for (size_t i = 0; i < num; ++i)
update (key[i], len ? len[i] : std::strlen (key[i]), val ? val[i] : value_type (i));
return 0;
}
template <typename T>
void dump (T* result, const size_t result_len) {
union { int i; value_type x; } b;
size_t num (0), p (0);
npos_t from = 0;
for (b.i = begin (from, p); b.i != CEDAR_NO_PATH; b.i = next (from, p))
if (num < result_len)
_set_result (&result[num++], b.x, p, from);
else
_err (__FILE__, __LINE__, "dump() needs array of length = num_keys()\n");
}
void shrink_tail () {
union { char* tail; int* length; } t;
const size_t length_
= static_cast <size_t> (*_length)
- static_cast <size_t> (*_length0) * (1 + sizeof (value_type));
t.tail = static_cast <char*> (std::malloc (length_));
if (! t.tail) _err (__FILE__, __LINE__, "memory allocation failed\n");
*t.length = static_cast <int> (sizeof (int));
for (int to = 0; to < _size; ++to) {
node& n = _array[to];
if (n.check >= 0 && _array[n.check].base != to && n.base < 0) {
char* const tail (&t.tail[*t.length]), * const tail_ (&_tail[-n.base]);
n.base = - *t.length;
int i = 0; do tail[i] = tail_[i]; while (tail[i++]);
*reinterpret_cast <value_type*> (&tail[i])
= *reinterpret_cast <const value_type*> (&tail_[i]);
*t.length += i + static_cast <int> (sizeof (value_type));
}
}
std::free (_tail);
_tail = t.tail;
_realloc_array (_tail, *_length, *_length);
_quota = *_length;
_realloc_array (_tail0, 1);
_quota0 = 1;
}
int save (const char* fn, const char* mode, const bool shrink) {
if (shrink) shrink_tail ();
return save (fn, mode);
}
int save (const char* fn, const char* mode = "wb") const {
// _test ();
FILE* fp = std::fopen (fn, mode);
if (! fp) return -1;
std::fwrite (_tail, sizeof (char), static_cast <size_t> (*_length), fp);
std::fwrite (_array, sizeof (node), static_cast <size_t> (_size), fp);
std::fclose (fp);
#ifdef USE_FAST_LOAD
const char* const info
= std::strcat (std::strcpy (new char[std::strlen (fn) + 5], fn), ".sbl");
fp = std::fopen (info, mode);
delete [] info; // resolve memory leak
if (! fp) return -1;
std::fwrite (&_bheadF, sizeof (int), 1, fp);
std::fwrite (&_bheadC, sizeof (int), 1, fp);
std::fwrite (&_bheadO, sizeof (int), 1, fp);
std::fwrite (_ninfo, sizeof (ninfo), static_cast <size_t> (_size), fp);
std::fwrite (_block, sizeof (block), static_cast <size_t> (_size >> 8), fp);
std::fclose (fp);
#endif
return 0;
}
int open (const char* fn, const char* mode = "rb",
const size_t offset = 0, size_t size_ = 0) {
FILE* fp = std::fopen (fn, mode);
if (! fp) return -1;
// get size
if (! size_) {
if (std::fseek (fp, 0, SEEK_END) != 0) return -1;
size_ = static_cast <size_t> (std::ftell (fp));
if (std::fseek (fp, 0, SEEK_SET) != 0) return -1;
}
if (size_ <= offset) return -1;
if (std::fseek (fp, static_cast <long> (offset), SEEK_SET) != 0) return -1;
int len = 0;
if (std::fread (&len, sizeof (int), 1, fp) != 1) return -1;
const size_t length_ = static_cast <size_t> (len);
if (size_ <= offset + length_) return -1;
// set array
clear (false);
size_ = (size_ - offset - length_) / sizeof (node);
_array = static_cast <node*> (std::malloc (sizeof (node) * size_));
_tail = static_cast <char*> (std::malloc (length_));
_tail0 = static_cast <int*> (std::malloc (sizeof (int)));
#ifdef USE_FAST_LOAD
_ninfo = static_cast <ninfo*> (std::malloc (sizeof (ninfo) * size_));
_block = static_cast <block*> (std::malloc (sizeof (block) * size_));
if (! _array || ! _tail || ! _tail0 || ! _ninfo || ! _block)
#else
if (! _array || ! _tail || ! _tail0)
#endif
_err (__FILE__, __LINE__, "memory allocation failed\n");
if (std::fseek (fp, static_cast <long> (offset), SEEK_SET) != 0) return -1;
if (length_ != std::fread (_tail, sizeof (char), length_, fp) ||
size_ != std::fread (_array, sizeof (node), size_, fp))
return -1;
std::fclose (fp);
_size = static_cast <int> (size_);
*_length0 = 0;
#ifdef USE_FAST_LOAD
const char* const info
= std::strcat (std::strcpy (new char[std::strlen (fn) + 5], fn), ".sbl");
fp = std::fopen (info, mode);
delete [] info; // resolve memory leak
if (! fp) return -1;
std::fread (&_bheadF, sizeof (int), 1, fp);
std::fread (&_bheadC, sizeof (int), 1, fp);
std::fread (&_bheadO, sizeof (int), 1, fp);
if (size_ != std::fread (_ninfo, sizeof (ninfo), size_, fp) ||
size_ >> 8 != std::fread (_block, sizeof (block), size_ >> 8, fp))
return -1;
std::fclose (fp);
_capacity = _size;
_quota = *_length;
_quota0 = 1;
#endif
return 0;
}
#ifndef USE_FAST_LOAD
void restore () { // restore information to update
if (! _block) _restore_block ();
if (! _ninfo) _restore_ninfo ();
_capacity = _size;
_quota = *_length;
_quota0 = 1;
}
#endif
// remove all the keys while keeping the memoryallocation
void reset()
{
// initialize existing arrays while keeping the size
_realloc_array(_array, _capacity, 256);
_realloc_array(_tail, _quota);
_realloc_array(_tail0, _quota0);
_realloc_array(_ninfo, _capacity);
_realloc_array(_block, _capacity >> 8);
// initialize first block
_array[0] = node(0, -1);
_array[1] = node(-255, -2);
for (int i = 2; i < 255; ++i)
_array[i] = node(-(i - 1), -(i + 1));
_array[255] = node(-254, -1);
_size = 256;
_block[0].ehead = 1; // bug fix for erase
*_length = static_cast <int> (sizeof (int));
_bheadF = _bheadC = _bheadO = 0;
for (size_t i = 0; i <= NUM_TRACKING_NODES; ++i)
tracking_node[i] = 0;
for (short i = 0; i <= 256; ++i)
_reject[i] = i + 1;
}
void set_array (void* p, size_t size_ = 0) { // ad-hoc
clear (false);
if (size_)
size_ = size_ * unit_size () - static_cast <size_t> (*static_cast <int*> (p));
_tail = static_cast <char*> (p);
_array = reinterpret_cast <node*> (_tail + *_length);
_size = static_cast <int> (size_ / unit_size () + (size_ % unit_size () ? 1 : 0));
_no_delete = true;
}
const void* array () const { return _array; }
void clear (const bool reuse = true) {
if (_no_delete) _array = 0, _tail = 0;
if (_array) std::free (_array); _array = 0;
if (_tail) std::free (_tail); _tail = 0;
if (_tail0) std::free (_tail0); _tail0 = 0;
if (_ninfo) std::free (_ninfo); _ninfo = 0;
if (_block) std::free (_block); _block = 0;
_bheadF = _bheadC = _bheadO = _capacity = _size = _quota = _quota0 = 0;
if (reuse) _initialize ();
_no_delete = false;
}
// return the first child for a tree rooted by a given node
int begin (npos_t& from, size_t& len) {
#ifndef USE_FAST_LOAD
if (! _ninfo) _restore_ninfo ();
#endif
int base = from >> 32 ? - static_cast <int> (from >> 32) : _array[from].base;
if (base >= 0) { // on trie
uchar c = _ninfo[from].child;
if (! from && ! (c = _ninfo[base ^ c].sibling)) // bug fix
return CEDAR_NO_PATH; // no entry
for (; c && base >= 0; ++len) {
from = static_cast <size_t> (base) ^ c;
base = _array[from].base;
c = _ninfo[from].child;
}
if (base >= 0) return _array[base ^ c].base;
}
const size_t len_ = std::strlen (&_tail[-base]);
from &= TAIL_OFFSET_MASK;
from |= static_cast <npos_t> (static_cast <size_t> (-base) + len_) << 32;
len += len_;
return *reinterpret_cast <int*> (&_tail[-base] + len_ + 1);
}
// return the next child if any
int next (npos_t& from, size_t& len, const npos_t root = 0) {
uchar c = 0;
if (const int offset = static_cast <int> (from >> 32)) { // on tail
if (root >> 32) return CEDAR_NO_PATH;
from &= TAIL_OFFSET_MASK;
len -= static_cast <size_t> (offset - (-_array[from].base));
} else
c = _ninfo[_array[from].base ^ 0].sibling;
for (; ! c && from != root; --len) {
c = _ninfo[from].sibling;
from = static_cast <size_t> (_array[from].check);
}
if (! c) return CEDAR_NO_PATH;
return begin (from = static_cast <size_t> (_array[from].base) ^ c, ++len);
}
npos_t tracking_node[NUM_TRACKING_NODES + 1];
private:
// currently disabled; implement these if you need
da (const da&);
da& operator= (const da&);
node* _array;
union { char* _tail; int* _length; };
union { int* _tail0; int* _length0; };
ninfo* _ninfo;
block* _block;
int _bheadF; // first block of Full; 0
int _bheadC; // first block of Closed; 0 if no Closed
int _bheadO; // first block of Open; 0 if no Open
int _capacity;
int _size;
int _quota;
int _quota0;
int _no_delete;
short _reject[257];
//
static void _err (const char* fn, const int ln, const char* msg)
{ std::fprintf (stderr, "cedar: %s [%d]: %s", fn, ln, msg); std::exit (1); }
template <typename T>
static void _realloc_array (T*& p, const int size_n, const int size_p = 0) {
void* tmp = std::realloc (p, sizeof (T) * static_cast <size_t> (size_n));
if (! tmp)
std::free (p), _err (__FILE__, __LINE__, "memory reallocation failed\n");
p = static_cast <T*> (tmp);
static const T T0 = T ();
for (T* q (p + size_p), * const r (p + size_n); q != r; ++q) *q = T0;
}
void _initialize () { // initilize the first special block
_realloc_array (_array, 256, 256);
_realloc_array (_tail, sizeof (int));
_realloc_array (_tail0, 1);
_realloc_array (_ninfo, 256);
_realloc_array (_block, 1);
_array[0] = node (0, -1);
for (int i = 1; i < 256; ++i)
_array[i] = node (i == 1 ? -255 : - (i - 1), i == 255 ? -1 : - (i + 1));
_capacity = _size = 256;
_block[0].ehead = 1; // bug fix for erase
_quota = *_length = static_cast <int> (sizeof (int));
_quota0 = 1;
for (size_t i = 0 ; i <= NUM_TRACKING_NODES; ++i) tracking_node[i] = 0;
for (short i = 0; i <= 256; ++i) _reject[i] = i + 1;
}
// follow/create edge
template <typename T>
int _follow (npos_t& from, const uchar& label, T& cf) {
int to = 0;
const int base = _array[from].base;
if (base < 0 || _array[to = base ^ label].check < 0) {
to = _pop_enode (base, label, static_cast <int> (from));
_push_sibling (from, to ^ label, label, base >= 0);
} else if (_array[to].check != static_cast <int> (from))
to = _resolve (from, base, label, cf);
return to;
}
// find key from double array
int _find (const char* key, npos_t& from, size_t& pos, const size_t len) const {
npos_t offset = from >> 32;
if (! offset) { // node on trie
for (const uchar* const key_ = reinterpret_cast <const uchar*> (key);
_array[from].base >= 0; ) {
if (pos == len) {
const node& n = _array[_array[from].base ^ 0];
if (n.check != static_cast <int> (from)) return CEDAR_NO_VALUE;
return n.base;
}
size_t to = static_cast <size_t> (_array[from].base); to ^= key_[pos];
if (_array[to].check != static_cast <int> (from)) return CEDAR_NO_PATH;
++pos;
from = to;
}
offset = static_cast <npos_t> (-_array[from].base);
}
// switch to _tail to match suffix
const size_t pos_orig = pos; // start position in reading _tail
const char* const tail = &_tail[offset] - pos;
if (pos < len) {
do if (key[pos] != tail[pos]) break; while (++pos < len);
if (const npos_t moved = pos - pos_orig) {
from &= TAIL_OFFSET_MASK;
from |= (offset + moved) << 32;
}
if (pos < len) return CEDAR_NO_PATH; // input > tail, input != tail
}
if (tail[pos]) return CEDAR_NO_VALUE; // input < tail
return *reinterpret_cast <const int*> (&tail[len + 1]);
}
#ifndef USE_FAST_LOAD
void _restore_ninfo () {
_realloc_array (_ninfo, _size);
for (int to = 0; to < _size; ++to) {
const int from = _array[to].check;
if (from < 0) continue; // skip empty node
const int base = _array[from].base;
if (const uchar label = static_cast <uchar> (base ^ to)) // skip leaf
_push_sibling (static_cast <size_t> (from), base, label,
! from || _ninfo[from].child || _array[base ^ 0].check == from);
}
}
void _restore_block () {
_realloc_array (_block, _size >> 8);
_bheadF = _bheadC = _bheadO = 0;
for (int bi (0), e (0); e < _size; ++bi) { // register blocks to full
block& b = _block[bi];
b.num = 0;
for (; e < (bi << 8) + 256; ++e)
if (_array[e].check < 0 && ++b.num == 1) b.ehead = e;
int& head_out = b.num == 1 ? _bheadC : (b.num == 0 ? _bheadF : _bheadO);
_push_block (bi, head_out, ! head_out && b.num);
}
}
#endif
void _set_result (result_type* x, value_type r, size_t = 0, npos_t = 0) const
{ *x = r; }
void _set_result (result_pair_type* x, value_type r, size_t l, npos_t = 0) const
{ x->value = r; x->length = l; }
void _set_result (result_triple_type* x, value_type r, size_t l, npos_t from) const
{ x->value = r; x->length = l; x->id = from; }
void _pop_block (const int bi, int& head_in, const bool last) {
if (last) { // last one poped; Closed or Open
head_in = 0;
} else {
const block& b = _block[bi];
_block[b.prev].next = b.next;
_block[b.next].prev = b.prev;
if (bi == head_in) head_in = b.next;
}
}
void _push_block (const int bi, int& head_out, const bool empty) {
block& b = _block[bi];
if (empty) { // the destination is empty
head_out = b.prev = b.next = bi;
} else { // use most recently pushed
int& tail_out = _block[head_out].prev;
b.prev = tail_out;
b.next = head_out;
head_out = tail_out = _block[tail_out].next = bi;
}
}
int _add_block () {
if (_size == _capacity) { // allocate memory if needed
#ifdef USE_EXACT_FIT
_capacity += _size >= MAX_ALLOC_SIZE ? MAX_ALLOC_SIZE : _size;
#else
_capacity += _capacity;
#endif
_realloc_array (_array, _capacity, _capacity);
_realloc_array (_ninfo, _capacity, _size);
_realloc_array (_block, _capacity >> 8, _size >> 8);
}
_block[_size >> 8].ehead = _size;
_array[_size] = node (- (_size + 255), - (_size + 1));
for (int i = _size + 1; i < _size + 255; ++i)
_array[i] = node (-(i - 1), -(i + 1));
_array[_size + 255] = node (- (_size + 254), -_size);
_push_block (_size >> 8, _bheadO, ! _bheadO); // append to block Open
_size += 256;
return (_size >> 8) - 1;
}
// transfer block from one start w/ head_in to one start w/ head_out
void _transfer_block (const int bi, int& head_in, int& head_out) {
_pop_block (bi, head_in, bi == _block[bi].next);
_push_block (bi, head_out, ! head_out && _block[bi].num);
}
// pop empty node from block; never transfer the special block (bi = 0)
int _pop_enode (const int base, const uchar label, const int from) {
const int e = base < 0 ? _find_place () : base ^ label;
const int bi = e >> 8;
node& n = _array[e];
block& b = _block[bi];
if (--b.num == 0) {
if (bi) _transfer_block (bi, _bheadC, _bheadF); // Closed to Full
} else { // release empty node from empty ring
_array[-n.base].check = n.check;
_array[-n.check].base = n.base;
if (e == b.ehead) b.ehead = -n.check; // set ehead
if (bi && b.num == 1 && b.trial != MAX_TRIAL) // Open to Closed
_transfer_block (bi, _bheadO, _bheadC);
}
// initialize the released node
if (label) n.base = -1; else n.value = value_type (0);
n.check = from;
if (base < 0) _array[from].base = e ^ label;
return e;
}
// push empty node into empty ring
void _push_enode (const int e) {
const int bi = e >> 8;
block& b = _block[bi];
if (++b.num == 1) { // Full to Closed
b.ehead = e;
_array[e] = node (-e, -e);
if (bi) _transfer_block (bi, _bheadF, _bheadC); // Full to Closed
} else {
const int prev = b.ehead;
const int next = -_array[prev].check;
_array[e] = node (-prev, -next);
_array[prev].check = _array[next].base = -e;
if (b.num == 2 || b.trial == MAX_TRIAL) { // Closed to Open
if (bi) _transfer_block (bi, _bheadC, _bheadO);
}
b.trial = 0;
}
if (b.reject < _reject[b.num]) b.reject = _reject[b.num];
_ninfo[e] = ninfo (); // reset ninfo; no child, no sibling
}
// push label to from's child
void _push_sibling (const npos_t from, const int base, const uchar label, const bool flag = true) {
uchar* c = &_ninfo[from].child;
if (flag && (ORDERED ? label > *c : ! *c))
do c = &_ninfo[base ^ *c].sibling; while (ORDERED && *c && *c < label);
_ninfo[base ^ label].sibling = *c, *c = label;
}
// pop label from from's child
void _pop_sibling (const npos_t from, const int base, const uchar label) {
uchar* c = &_ninfo[from].child;
while (*c != label) c = &_ninfo[base ^ *c].sibling;
*c = _ninfo[base ^ label].sibling;
}
// check whether to replace branching w/ the newly added node
bool _consult (const int base_n, const int base_p, uchar c_n, uchar c_p) const {
do c_n = _ninfo[base_n ^ c_n].sibling, c_p = _ninfo[base_p ^ c_p].sibling;
while (c_n && c_p);
return c_p;
}
// enumerate (equal to or more than one) child nodes
uchar* _set_child (uchar* p, const int base, uchar c, const int label = -1) {
--p;
if (! c) { *++p = c; c = _ninfo[base ^ c].sibling; } // 0: terminal
if (ORDERED)
while (c && c < label) { *++p = c; c = _ninfo[base ^ c].sibling; }
if (label != -1) *++p = static_cast <uchar> (label);
while (c) { *++p = c; c = _ninfo[base ^ c].sibling; }
return p;
}
// explore new block to settle down
int _find_place () {
if (_bheadC) return _block[_bheadC].ehead;
if (_bheadO) return _block[_bheadO].ehead;
return _add_block () << 8;
}
int _find_place (const uchar* const first, const uchar* const last) {
if (int bi = _bheadO) {
const int bz = _block[_bheadO].prev;
const short nc = static_cast <short> (last - first + 1);
while (1) { // set candidate block
block& b = _block[bi];
if (b.num >= nc && nc < b.reject) // explore configuration
for (int e = b.ehead;;) {
const int base = e ^ *first;
for (const uchar* p = first; _array[base ^ *++p].check < 0; )
if (p == last) return b.ehead = e; // no conflict
if ((e = -_array[e].check) == b.ehead) break;
}
b.reject = nc;
if (b.reject < _reject[b.num]) _reject[b.num] = b.reject;
const int bi_ = b.next;
if (++b.trial == MAX_TRIAL) _transfer_block (bi, _bheadO, _bheadC);
if (bi == bz) break;
bi = bi_;
}
}
return _add_block () << 8;
}
// resolve conflict on base_n ^ label_n = base_p ^ label_p
template <typename T>
int _resolve (npos_t& from_n, const int base_n, const uchar label_n, T& cf) {
// examine siblings of conflicted nodes
const int to_pn = base_n ^ label_n;
const int from_p = _array[to_pn].check;
const int base_p = _array[from_p].base;
const bool flag // whether to replace siblings of newly added
= _consult (base_n, base_p, _ninfo[from_n].child, _ninfo[from_p].child);
uchar child[256];
uchar* const first = &child[0];
uchar* const last =
flag ? _set_child (first, base_n, _ninfo[from_n].child, label_n)
: _set_child (first, base_p, _ninfo[from_p].child);
const int base =
(first == last ? _find_place () : _find_place (first, last)) ^ *first;
// replace & modify empty list
const int from = flag ? static_cast <int> (from_n) : from_p;
const int base_ = flag ? base_n : base_p;
if (flag && *first == label_n) _ninfo[from].child = label_n; // new child
_array[from].base = base; // new base
for (const uchar* p = first; p <= last; ++p) { // to_ => to
const int to = _pop_enode (base, *p, from);
const int to_ = base_ ^ *p;
_ninfo[to].sibling = (p == last ? 0 : *(p + 1));
if (flag && to_ == to_pn) continue; // skip newcomer (no child)
cf (to_, to);
node& n = _array[to];
node& n_ = _array[to_];
if ((n.base = n_.base) > 0 && *p) { // copy base; bug fix
uchar c = _ninfo[to].child = _ninfo[to_].child;
do _array[n.base ^ c].check = to; // adjust grand son's check
while ((c = _ninfo[n.base ^ c].sibling));
}
if (! flag && to_ == static_cast <int> (from_n)) // parent node moved
from_n = static_cast <size_t> (to); // bug fix
if (! flag && to_ == to_pn) { // the address is immediately used
_push_sibling (from_n, to_pn ^ label_n, label_n);
_ninfo[to_].child = 0; // remember to reset child
if (label_n) n_.base = -1; else n_.value = value_type (0);
n_.check = static_cast <int> (from_n);
} else
_push_enode (to_);
if (NUM_TRACKING_NODES) // keep the traversed node updated
for (size_t j = 0; tracking_node[j] != 0; ++j) {
if (static_cast <int> (tracking_node[j] & TAIL_OFFSET_MASK) == to_) {
tracking_node[j] &= NODE_INDEX_MASK;
tracking_node[j] |= static_cast <npos_t> (to);
}
}
}
return flag ? base ^ label_n : to_pn;
}
// test the validity of double array for debug
void _test (const npos_t from = 0) const {
const int base = _array[from].base;
if (base < 0) { // validate tail offset
assert (*_length >= static_cast <int> (-base + 1 + sizeof (value_type)));
return;
}
uchar c = _ninfo[from].child;
do {
if (from) assert (_array[base ^ c].check == static_cast <int> (from));
if (c) _test (static_cast <npos_t> (base ^ c));
} while ((c = _ninfo[base ^ c].sibling));
}
};
}
#endif