datafiller/datafiller.py at master · 216software/datafiller · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#! /usr/bin/env python
#
# $Id: datafiller.py 268 2013-06-30 16:01:33Z fabien $
#
# TODO:
# - types: BIT/ARRAY? others?
# - disable/enable constraints? within a transaction?
#   not possible with PostgreSQL, must DROP/CREATE?
# - improve "parsing" regexpr, including some support for ALTER?
# - real parser? PLY? see http://nedbatchelder.com/text/python-parsers.html

VERSION = '1.1.2'

import re
Id='$Id: datafiller.py 268 2013-06-30 16:01:33Z fabien $'
revision, revdate = re.match(r'.*? (\d+) (\d{4}-\d\d-\d\d) ', Id).group(1, 2)
version = "{0} (r{1} on {2})".format(VERSION, revision, revdate)

# python 2/3 compatibility hacks -- this is tiring:-(
# - D.has_key(K) => K in D # better anyway
# - "..." % ... => "...".format(...) # advised, not necessary?
# - print ... => print(...) # ok
# - print => print('')      # hmmm...
# - xrange => range         # grrr...
# - casts: s[:l] => s[:int(l)], sn * fp => sn * int(fp) # this is a regression
# - StringIO: changed place   # should be transparent
import sys
if sys.version_info < (3,):
    # python 2
    from StringIO import StringIO
    range = xrange
else:
    # python 3
    from io import StringIO

# plain old embedded documentation... Yes, the perl thing;-)
# could use pandoc/markdown,but it seems that pandoc cannot display
# a manual page interactively as pod2usage, and as a haskell script
# it is not installed by default.
POD="""

=pod

=head1 NAME

B<datafiller.py> - generate random data from database schema extended with directives

=head1 SYNOPSIS

B<datafiller.py> [--help --man ...] [schema.sql ...] > data.sql

=head1 DESCRIPTION

This script generates random data from a database schema enriched with
simple directives in SQL comments to drive eleven data generators which
cover typical data types.
Reasonable defaults are provided, especially based on key and type constraints,
so that few directives should be necessary. The minimum setup is to specify the
relative size of tables with directive B<mult> so that the data generation can
be scaled.
Run with C<--test=comics> and look at the output for a didactic example.

=head1 OPTIONS

=over 4

=item C<--debug> or C<-D>

Set debug mode.
Default is no debug.

=item C<--drop>

Drop tables before recreating them.
This implies option C<--filter>, otherwise there would be no table to fill.

Default is not to.

=item C<--filter> of C<-f>

Work as a filter, i.e. send the schema input script to stdout and then
the generated data.
This is convenient to pipe the result of the script directly for
execution to the database command.

Default is to only ouput generated data.

=item C<--help> or C<-h>

Show basic help.

=item C<--man> or C<-m>

Show full man page based on POD. Yes, the perl thing:-)

=item C<--mangle>

Use random steps and shifts for integer generators.
This is useful for avoiding strong correlations between tuple keys.

Default is to use 1 for step and 0 for shift.
This can also be set with the B<mangle> directive at the schema level,
or overriden one way or the other with per-attribute with directives
B<mangle> or B<nomangle>, or with explicit B<step> and B<shift> directives.

=item C<--null RATE> or C<-n RATE>

Probability to generate a null value for nullable attributes.

Default is 0.01, which can be overriden by the B<null> directive at
the schema level, or per-attributes provided B<null> rate.

=item C<--offset OFFSET> or C<-O OFFSET>

Set default offset for integer generators on I<primary keys>.
This is useful to extend the already existing content of a database
for larger tests.

Default is 1, which can be overriden by the B<offset> directive at
the schema level, or per-attribute provided B<offset>.

=item C<--pod COMMAND>

Override pod conversion command used by option C<--man>.

Default is 'pod2usage -verbose 3'.

=item C<--seed SEED> or C<-S SEED>

Seed random generated with provided string.

Default uses OS supplied randomness or current time.

=item C<--size SIZE>

Set overall scaling. The size is combined with the B<mult> directive value
on a table to compute the actual number of tuples to generate in each table.

Default is 100, which can be overriden with the B<size> directive at the
schema level.

=item C<--target (postgresql|mysql)> or C<-t ...>

Target database engine. MySQL support is really experimental.

Default is to target PostgreSQL.

=item C<--test=(comics|pgbench|validate)> or C<--test='int:directives...'>

Output test data for B<comics> or B<pgbench> schemas (see L</EXAMPLE> below),
or the internal validation,
or run tests for bool, integer, float or blob generators with some directives.

Example: --test='bool:rate=0.3' may show I<True: 30.68%>,
stating the rate at which I<True> was actually seen during the test.

Option C<--test=...> sets C<--filter> automatically.

Default is to process argument files or standard input.

=item C<--transaction> or C<-T>

Use a global transaction.

Default is not to.

=item C<--tries=NUM>

How hard to try to satisfy a compound unique constraint before giving up
on a given tuple.

Default is 10.

=item C<--truncate>

Delete table contents before filling.

Default is not to.

=item C<--validate>

Shortcut for C<--test=validate --filter --transaction>.

To run the validation in a temporary schema C<df>:

  sh> datafiller.py --validate | psql

=item C<--version> or C<-v>

Show script version.

=back

=head1 ARGUMENTS

Files containing SQL schema definitions, or F<stdin> is processed if empty.

=head1 DIRECTIVES AND DATA GENERATORS

Directives drive the data sizes and the underlying data generators.
They must appear in SQL comments I<after> the object on which they apply,
although possibly on the same line, introduced by S<'-- df: '>.

  CREATE TABLE Stuff(         -- df: mult=2.0
    id SERIAL PRIMARY KEY,    -- df: step=19
    data TEXT UNIQUE NOT NULL -- df: prefix=st length=30 lenvar=3
  );

In the above example, with option C<--size=1000>, 2000 tuples
will be generated I<(2.0*1000)> with B<id> I<1+(i*19)%2000> and
unique text B<data> of length about 30+-3 prefixed with C<st>.
The sequence for B<id> will be restarted at I<2001>.

The default size is the number of tuples of the containing table.
This implies many collisions for a I<uniform> generator.

=head2 DATA GENERATORS

There are eleven data generators which are selected by the attribute type
or possibly directives. All generators are also subject to the B<null>
directive which drives the probability of a C<NULL> value.

=over 4

=item B<bool generator>

This generator is used for the boolean type.
It is subject to the B<rate> directive.

=item B<int generator>

This generator is used directly for integer types, and indirectly
by text, word and date generators.
Its internal working is subject to directives: B<gen>, B<size> (or B<mult>),
B<offset>, B<shift>, B<step>, B<mangle> and B<nomangle>.

=item B<float generator>

This generator is used for floating point types.
It does not support C<UNIQUE>, but uniqueness is very likely.
Its configuration relies on directives B<gen>, B<alpha> and B<beta>.

=item B<date generator>

This generator is used for the date type.
It uses an B<int generator> internally to drive its extent.
Its internal working is subject to directives B<start>, B<end> and B<prec>.

=item B<timestamp generator>

This generator is used for the timestamp type.
It is similar to the date generator but at a finer granularity.
The B<tz> directive allows to specify the target timezone.

=item B<interval generator>

This generator is used for the time interval type.
It uses the B<int generator> internally to drive its extent.
See also the B<unit> directive.

=item B<string generator>

This generator is used by default for text types.
This is a good generator for filling stuff without much ado.

It takes into account B<prefix>, B<length> and B<lenvar> directives.
The generated text is of length B<length> +- B<lenvar>.
For C<CHAR(n)> and C<VARCHAR(n)> text types, automatic defaults are set
for both B<length> and B<lenvar>.

=item B<chars generator>

This alternate generator for text types generates random string of characters.
It is triggered by the B<chars> directive.

In addition to the underlying B<int generator> which allows to select values,
another B<int generator> is used to build words from the provided list
of characters,

The B<cgen> directives is the name of a macro which specifies the
B<int generator> parameters for the random char selection.

It also takes into account the B<length> and B<lenvar> directives.

This generator does not support C<UNIQUE>.

=item B<word generator>

This alternate generator for text types is triggered by the B<word> directive.
It uses B<int generator> to select words from a list or a file.

This generator handles C<UNIQUE> if enough words are provided.

=item B<text generator>

This alternate generator for text types generates sentences of words
drawn from a list of words specified with directive B<word>.
It is triggered by the B<text> directive.

It also takes into account the B<length> and B<lenvar> directives which
handle the number of words to generate.

This generator does not support C<UNIQUE>, but uniqueness is very likely
for a text with a significant length drawn from a dictionnary.

=item B<blob generator>

This is for blob types, such as PostgreSQL's C<BYTEA>.

This generator does not support C<UNIQUE>.

=back

=head2 GLOBAL DIRECTIVES

A directive macro can be defined and then used later by inserting its name
between the introductory C<df> and the C<:>. The specified directives are
stored in the macro and can be reused later.
For instance, macros B<words>, B<mangle> B<cfr> and B<cen> can be defined as:

  --df words: word=/etc/dictionaries-common/words gen=power alpha=1.7
  --df mix: offset=10000 step=17 shift=3
  --df cfr: gen=scale alpha=6.7
  --df cen: gen=scale alpha=5.9

Then they can be used in any datafiller directive with B<use=...>:

  --df: use=words use=mix
  --df: use=mix

Or possibly for chars generators with B<cgen=...>:

  --df: cgen=cfr chars='esaitnru...'

There are four predefined macros:
B<cfr> and B<cen> define skewed integer generators with the above parameters.
B<french>, B<english> define chars generators which tries to mimic the
character frequency of these languages.

The B<size>, B<offset>, B<mangle>, B<null> and B<seed> directives
can be defined at the schema level to override from the SQL script
the default size multiplier, primary key offset, use of random shifts and
steps, null rate and seed.
However, they are ignored if the corresponding options are set.

=head2 TABLE DIRECTIVES

=over 4

=item B<mult=float>

Size multiplier for scaling, that is computing the number of tuples to
generate.

=item B<nogen>

Do not generate data for this table.

=item B<null>

Set defaut B<null> rate for this table.

=item B<size=int>

Use this size, so there is no scaling with the C<--size> option
and B<mult> directive.

=item B<skip=float>

Skip (that is generate but do not insert) some tuples with this probability.
Useful to create some holes in data. Tables with a non-zero B<skip> cannot be
referenced.

=back

=head2 ATTRIBUTE DIRECTIVES

=over 4

=item B<type=(int|float|bool|date|...)>

Force specified type generator regardless of attribute type.
The default is to select the generator based on the attribute type,
so this option should never be necessary.

=item B<gen=GENERATOR>

For integer or float type, use this underlying generator.

The generators for integers are:

=over 4

=item B<serial>

This is really a counter which generates distinct integers,
depending on B<offset>, B<shift> and B<step>.

=item B<uniform>

Generates uniform random number integers between B<offset> and B<offset+size-1>.
This is the default.

=item B<serand>

Generate integers based on B<serial> up to B<size>, then use B<uniform>.
Useful to fill foreign keys.

=item B<power> with parameter B<alpha> or B<rate>

Use probability to this B<alpha> power.
When B<rate> is specified, compute alpha so that value 0 is drawn
at the specified rate.
Uniform is similar to B<power> with B<alpha=1.0>, or I<B<rate>=1.0/size>
The higher B<alpha>, the more skewed towards I<0>.

Example distribution with C<--test='int:gen=power rate=0.3 size=10'>:

  value     0   1   2   3   4   5   6   7   8   9
  percent  30  13  10   9   8   7   6   6   5   5

=item B<scale> with parameter B<alpha> or B<rate>

Another form of skewing. The probability of increasing values drawn
is less steep at the beginning compared to B<power>, thus the probability
of values at the end is lower.

Example distribution with C<--test='int:gen=scale rate=0.3 size=10'>:

  value     0   1   2   3   4   5   6   7   8   9
  percent  30  19  12   9   7   6   5   4   3   2

=back

The random generators for floats are those provided by Python's C<random>:

=over 4

=item B<beta>

Beta distribution, B<alpha> and B<beta> must be >0.

=item B<exp>

Exponential distribution with mean 1.0 / B<alpha>

=item B<gamma>

Gamma distribution, B<alpha> and B<beta> must be >0.

=item B<gauss>

Gaussian distribution with mean B<alpha> and stdev B<beta>.

=item B<log>

Log normal distribution, see B<normal>.

=item B<norm>

Normal distribution with mean B<alpha> and stdev B<beta>.

=item B<pareto>

Pareto distribution with shape B<alpha>.

=item B<uniform>

Uniform distribution between B<alpha> and B<beta>.
This is default distribution.

=item B<vonmises>

Circular data distribution, with mean angle B<alpha> in radians
and concentration B<beta>.

=item B<weibull>

Weibull distribution with scale B<alpha> and shape B<beta>.

=back

=item B<chars='abcdefghijkl' cgen=macro>

The B<chars> directive triggers the B<chars generator> described above.
Directive B<chars> provides a list of characters which are used to build words.
The macro name specified in directive B<cgen> is used to setup the character
selection random generator.

For exemple:

  ...
  -- df skewed: gen=power rate=0.3
  , stuff TEXT -- df: gen=uniform chars='abcdef' size=23 cgen=skewed

The text is chosen uniformly in a list of 23 words, each word being
built from characters 'abcdef' with the I<skewed> generator described
in the corresponding macro definition on the line above.

=item B<mangle> or B<nomangle>

Whether to automatically choose random B<shift> and B<step> for
an integer generator, or not.

=item B<mult=float>

Use this multiplier to compute the generator B<size>.

=item B<nogen>

Do not generate data for this attribute, so it will get its default value.

=item B<null=float>

Probability of generating a null value for this attribute.
This applies to all generators.

=item B<offset=int shift=int step=int>

Various parameters for generated integers.
The generated integer is B<offset+(shift+step*i)%size>.
B<step> must not be a divider of B<size>, it is ignored and replaced
with 1 if so.

Defaults: offset is 1, shift is 0, step is 1.

=item B<prefix=st>

Prefix for string data.

=item B<length=int lenvar=int>

Length and length variation for generated characters of string data or
number of words of text data.

=item B<rate=float>

For the bool generator, rate of generating I<True> vs I<False>.
Must be in [0, 1]. Default is I<0.5>.

For the int generator, rate of generating value 0 for generators
B<power> and B<scale>.

=item B<seed=str>

Set default global seed from the schema level.
This can be overriden by option C<--seed>.
Default is to used the default random generator seed, usually
relying on OS supplied randomness or the current time.

=item B<size=int>

Number of underlying values to generate or draw from, depending on the
generator. For keys (primary, foreign, unique) , this is necessarily the
corresponding number of tuples.

=item B<start=date/time> , B<end=date/time>, B<prec=int>

For the B<date> and B<timestamp> generators,
issue from B<start> up to B<end> at precision B<prec>.
Precision is in days for dates and seconds for timestamp.
Default is to set B<end> to current date/time and B<prec> to
1 day for dates et 60 seconds for timestamps.
If both B<start> and B<end> are specified, the underlying size is
adjusted.

For example, to draw from about 100 years of dates ending on
January 19, 2038:
  -- df: end=2038-01-19 size=36525

=item B<text>

The B<text> directive triggers the B<text generator>.
This generator requires the list of words to draw from to be specified
with the B<word> directive.

=item B<unit>

The B<unit> directive specifies the unit of the generated intervals.
Possible values include B<s m h d mon y>. Default is B<s>, i.e. seconds.

=item B<word=file> or B<word=:list,of,words>

The B<word> directive triggers the B<word generator> described above.
Use provided word list or lines of file to generate data.
The default B<size> is the size of the word list.

If the file contents is ordered by word frequency, and the int generator is
skewed (see B<gen>), the first words can be made to occur more frequently.

=back

=head1 EXAMPLES

The first example is a didactic schema to illustrate directives.
The second example is taken from B<pgbench>.
As both schemas are embedded into this script, they can be invoked
directly with the C<--test> option:

  sh> datafiller.py --test=comics -T --size=10 | psql bench
  sh> datafiller.py --test=pgbench -T --size=10 | psql bench

=head2 COMICS SCHEMA

This schema models B<Comics> books written in a B<Language> and
published by a B<Publisher>. Each book can have several B<Author>s
through B<Written>. The B<Inventory> tells on which shelf books are
stored. Some of the books may be missing and a few may be available
twice or more.

{comics}

=head2 PGBENCH SCHEMA

This schema is taken from the TCP-B benchmark.
Each B<Branch> has B<Tellers> and B<Accounts> attached to it.
The B<History> records operations performed when the benchmark is run.

{pgbench}

The integer I<*balance> figures are generated with a skewed generator
defined in macro B<regress>. The negative B<offset> setting on I<abalance>
will help generate negative values, and the I<regress> skewed generator
will make small values more likely.

If this is put in a C<tpc-b.sql> file, then working test data can be
generated with:

  sh> datafiller.py -f -T --size=10 tpc-b.sql | psql bench

=head1 BUGS AND FEATURES

All software has bug, this is a software, hence it has bugs.

If you find one, please sent a report, or even better a patch that fixes it!

There is no SQL parser, table and attributes are analysed with basic
regular expressions. The C<ALTER> syntax is fully ignored.

Foreign keys cannot reference compound keys.

Inconsistent directives may be set.
Some directives may be ignored in some cases.

Handling of quoted identifiers is partial and may not work at all.

Beware that unique constraint checks for big data generation may require
a lot of memory.

=head1 LICENSE

=for html
<img src="http://www.gnu.org/graphics/gplv3-127x51.png"
alt="GNU GPLv3" align="right" />

Copyright 2013 Fabien Coelho <fabien at coelho dot net>

This is free software, both inexpensive and with sources.

The GNU General Public License v3 applies, see
L<http://www.gnu.org/copyleft/gpl.html> for details.

The summary is: you get as much as you paid for, and I am not responsible
for anything.

If you are happy with this software, feel free to send me a postcard saying so!
See my web page for current address L<http://www.coelho.net/>.

=head1 SEE ALSO

Relational data generation tools are often GUI or even Web applications,
possibly commercial. I did not find a simple filter-oriented tool driven
by directives, and I wanted to do something useful to play with python.

=over 4

=item L<http://en.wikipedia.org/wiki/Test_data_generation>

=item L<http://generatedata.com/> PHP/MySQL

=item L<http://www.databasetestdata.com/> generate one table

=item L<http://www.mobilefish.com/services/random_test_data_generator/random_test_data_generator.php>

=item L<http://www.sqledit.com/dg/> Win GUI/MS SQL Server, Oracle, DB2

=item L<http://sourceforge.net/projects/spawner/> Pascal, GUI for MySQL

=item L<http://sourceforge.net/projects/dbmonster/> 2005 - Java from XML
for PostgreSQL, MySQL, Oracle

=item L<http://sourceforge.net/projects/datagenerator/> 2006, alpha in Pascal, Win GUI

=item L<http://sourceforge.net/projects/dgmaster/> 2009, Java, GUI

=item L<http://www.gsapps.com/products/datagenerator/> GUI/...

=item L<http://rubyforge.org/projects/datagen>

=item L<http://msdn.microsoft.com/en-us/library/dd193262%28v=vs.100%29.asp>

=item L<http://stackoverflow.com/questions/3371503/sql-populate-table-with-random-data>

=item Perl (Data::Faker Data::Random), Ruby (Faker ffaker), Python random

=back

=head1 DOWNLOAD

This is F<{script}> version {version}.

Latest version and online documentation should be available from
L<http://www.coelho.net/datafiller.html>.

Download script at
L<https://www.cri.ensmp.fr/people/coelho/datafiller.py>.

History of versions:

=over 4

=item B<version {version}>

Improved and simplified code, better comments and validation.
Various hacks for python 2 & 3 compatibility.
Make validations stop on errors.
Check that B<lenvar> is less than B<length>.
Fixes for B<length> and B<lenvar> overriding in B<string generator>.

=item B<version 1.1.1 (r250 on 2013-06-29)>

Minor fix to the documentation.

=item B<version 1.1.0 (r248 on 2013-06-29)>

Improved documentation, code and comments.
Add C<--test> option for demonstration and checks,
including an embedded validation.
Add C<--validate> option as a shortcut for script validation.
Add B<seed>, B<skip>, B<mangle> and B<nomangle> directives.
Add B<null> directive on tables.
Add B<nogen> and B<type> directives on attributes.
Accept size 0.
Change B<alpha> behavior under B<gen=scale> so that the higher B<alpha>,
the more skewed towards 0.
Add alternative simpler B<rate> specification for B<scale> and B<power>
integer generators.
Deduce B<size> when both B<start> and B<end> are specified for the
date and timestamp generators.
Add B<tz> directive for the timestamp generator.
Add float, interval and blob generators.
Add some support for user-defined enum types.
Add C<--tries> option to control the effort to satisfy C<UNIQUE> constraints.
Add support for non integer foreign keys.
Remove B<square> and B<cube> integer generators.
Change macro definition syntax so as to be more intuitive.
Add C<-V> option for short version.
Some bug fixes.

=item B<version 1.0.0 (r128 on 2013-06-16)>

Initial distribution.

=back

"""

PGBENCH = """
  -- TPC-B example adapted from pgbench

  \\set ON_ERROR_STOP

  -- df regress: gen=power alpha=1.5
  -- df: size=1

  CREATE TABLE pgbench_branches( -- df: mult=1.0
    bid SERIAL PRIMARY KEY,
    bbalance INTEGER NOT NULL,   -- df: size=100000000 use=regress
    filler CHAR(88) NOT NULL
  );

  CREATE TABLE pgbench_tellers(  -- df: mult=10.0
    tid SERIAL PRIMARY KEY,
    bid INTEGER NOT NULL REFERENCES pgbench_branches,
    tbalance INTEGER NOT NULL,   -- df: size=100000 use=regress
    filler CHAR(84) NOT NULL
  );

  CREATE TABLE pgbench_accounts( -- df: mult=100000.0
    aid BIGSERIAL PRIMARY KEY,
    bid INTEGER NOT NULL REFERENCES pgbench_branches,
    abalance INTEGER NOT NULL,   -- df: offset=-1000 size=100000 use=regress
    filler CHAR(84) NOT NULL
  );

  CREATE TABLE pgbench_history(  -- df: nogen
    tid INTEGER NOT NULL REFERENCES pgbench_tellers,
    bid INTEGER NOT NULL REFERENCES pgbench_branches,
    aid BIGINT NOT NULL REFERENCES pgbench_accounts,
    delta INTEGER NOT NULL,
    mtime TIMESTAMP NOT NULL,
    filler CHAR(22)
    -- UNIQUE (tid, bid, aid, mtime)
  );
"""

# embedded PostgreSQL validation
VALIDATE = """
\\set ON_ERROR_STOP

-- df: size=2000 null=0.0

CREATE SCHEMA df;

CREATE TYPE df.color AS ENUM ('red','blue','green');

CREATE TABLE df.Stuff( -- df: mult=1.0
  id SERIAL PRIMARY KEY
  -- integer
, i0 INTEGER CHECK(i0 IS NULL) -- df: null=1.0 size=1
, i1 INTEGER CHECK(i1 IS NOT NULL AND i1=1) -- df: null=0.0 size=1
, i2 INTEGER NOT NULL CHECK(i2 BETWEEN 1 AND 6) --df: size=5
, i3 INTEGER UNIQUE -- df: offset=1000000
, i4 INTEGER CHECK(i2 BETWEEN 1 AND 6) -- df: gen=power rate=0.7 size=5
, i5 INTEGER CHECK(i2 BETWEEN 1 AND 6) -- df: gen=scale rate=0.7 size=5
, i6 INT8  -- df: size=1800000000000000000 offset=-900000000000000000
, i7 INT4  -- df: size=4000000000 offset=-2000000000
, i8 INT2  -- df: size=65000 offset=-32500
  -- boolean
, b0 BOOLEAN NOT NULL
, b1 BOOLEAN -- df: null=0.5
, b2 BOOLEAN NOT NULL -- df: rate=0.7
  -- float
, f0 REAL NOT NULL CHECK (f0 >= 0.0 AND f0 < 1.0)
, f1 DOUBLE PRECISION -- df: gen=gauss alpha=5.0 beta=2.0
, f2 DOUBLE PRECISION CHECK(f2 >= -10.0 AND f2 < 10.0)
    -- df: gen=uniform alpha=-10.0 beta=10.0
, f3 DOUBLE PRECISION -- df: gen=beta alpha=1.0 beta=2.0
, f4 DOUBLE PRECISION -- df: gen=exp alpha=0.1
, f5 DOUBLE PRECISION -- df: gen=gamma alpha=1.0 beta=2.0
, f6 DOUBLE PRECISION -- df: gen=log alpha=1.0 beta=2.0
, f7 DOUBLE PRECISION -- df: gen=norm alpha=20.0 beta=0.5
, f8 DOUBLE PRECISION -- df: gen=pareto alpha=1.0
, f9 DOUBLE PRECISION -- df: gen=vonmises alpha=1.0 beta=2.0
, fa DOUBLE PRECISION -- df: gen=weibull alpha=1.0 beta=2.0
, fb NUMERIC(2,1) CHECK(fb BETWEEN 0.0 AND 9.9)
    -- df: gen=uniform alpha=0.0 beta=9.9
, fc DECIMAL(5,2) CHECK(fc BETWEEN 100.00 AND 999.99)
    -- df: gen=uniform alpha=100.0 beta=999.99
  -- date
, d0 DATE NOT NULL
       CHECK(d0 <= CURRENT_DATE AND d0 >= CURRENT_DATE - 1) -- df: size=2
, d1 DATE NOT NULL
       CHECK(d1 = DATE '2038-01-19') -- df: start=2038-01-19 end=2038-01-19
, d2 DATE NOT NULL
       CHECK(d2 = DATE '2038-01-19' OR d2 = DATE '2038-01-20')
       -- df: start=2038-01-19 size=2
, d3 DATE NOT NULL
       CHECK(d3 = DATE '2038-01-18' OR d3 = DATE '2038-01-19')
       -- df: end=2038-01-19 size=2
, d4 DATE NOT NULL
       CHECK(d4 = DATE '2013-06-01' OR d4 = DATE '2013-06-08')
       -- df: start=2013-06-01 end=2013-06-08 prec=7
, d5 DATE NOT NULL
       CHECK(d5 = DATE '2013-06-01' OR d5 = DATE '2013-06-08')
       -- df: start=2013-06-01 end=2013-06-14 prec=7
  -- timestamp
, t0 TIMESTAMP NOT NULL
          CHECK(t0 = TIMESTAMP '2013-06-01 00:00:05' OR
                t0 = TIMESTAMP '2013-06-01 00:01:05')
          -- df: start='2013-06-01 00:00:05' end='2013-06-01 00:01:05'
, t1 TIMESTAMP NOT NULL
          CHECK(t1 = TIMESTAMP '2013-06-01 00:02:00' OR
                t1 = TIMESTAMP '2013-06-01 00:02:05')
          -- df: start='2013-06-01 00:02:00' end='2013-06-01 00:02:05' prec=5
, t2 TIMESTAMP NOT NULL
          CHECK(t2 >= TIMESTAMP '2013-06-01 01:00:00' AND
                t2 <= TIMESTAMP '2013-06-01 02:00:00')
          -- df: start='2013-06-01 01:00:00' size=30 prec=120
, t3 TIMESTAMP WITH TIME ZONE NOT NULL
          CHECK(t3 = TIMESTAMP '2013-06-22 09:17:54 CEST')
          -- df: start='2013-06-22 07:17:54' size=1 tz='UTC'
  -- interval
, v0 INTERVAL NOT NULL CHECK(v0 BETWEEN '1 s' AND '1 m')
     -- df: size=59 offset=1 unit='s'
, v1 INTERVAL NOT NULL CHECK(v1 BETWEEN '1 m' AND '1 h')
     -- df: size=59 offset=1 unit='m'
, v2 INTERVAL NOT NULL CHECK(v2 BETWEEN '1 h' AND '1 d')
     -- df: size=23 offset=1 unit='h'
, v3 INTERVAL NOT NULL CHECK(v3 BETWEEN '1 d' AND '1 mon')
     -- df: size=29 offset=1 unit='d'
, v4 INTERVAL NOT NULL CHECK(v4 BETWEEN '1 mon' AND '1 y')
     -- df: size=11 offset=1 unit='mon'
, v5 INTERVAL NOT NULL -- df: size=100 offset=0 unit='y'
, v6 INTERVAL NOT NULL -- df: size=1000000 offset=0 unit='s'
  -- text
, s0 CHAR(12) UNIQUE NOT NULL
, s1 VARCHAR(15) UNIQUE NOT NULL
, s2 TEXT NOT NULL -- df: length=23 lenvar=1 size=20 seed=s2
, s3 TEXT NOT NULL CHECK(s3 LIKE 'stuff%') -- df: prefix='stuff'
, s4 TEXT NOT NULL CHECK(s4 ~ '^[a-f]{9,11}$')
    -- df: chars='abcdef' size=20 length=10 lenvar=1
-- df skewed: gen=scale rate=0.7 seed='Calvin and Hobbes are good friends!'
, s5 TEXT NOT NULL CHECK(s5 ~ '^[ab]{20}$')
    --df: chars='ab' size=50 length=20 lenvar=0 cgen=skewed
, s6 TEXT NOT NULL -- df: word=:calvin,hobbes,susie
, s7 TEXT NOT NULL -- df: word=:one,two,three,four,five,six,seven size=3 mangle
, s8 TEXT NOT NULL CHECK(s8 ~ '^((un|deux) ){3}(un|deux)$')
    -- df: text word=:un,deux length=4 lenvar=0
, s9 VARCHAR(10) NOT NULL CHECK(LENGTH(s9) BETWEEN 8 AND 10)
  -- df: length=9 lenvar=1
, sa VARCHAR(8) NOT NULL CHECK(LENGTH(sa) BETWEEN 5 AND 7) -- df: lenvar=1
  -- user defined enum
, e0 df.color NOT NULL
  -- blob
, l0 BYTEA NOT NULL
, l1 BYTEA NOT NULL CHECK(LENGTH(l1) = 3) -- df: length=3 lenvar=0
, l2 BYTEA NOT NULL CHECK(LENGTH(l2) BETWEEN 0 AND 6) -- df: length=3 lenvar=3
);

CREATE TABLE df.ForeignKeys( -- df: mult=2.0
  id SERIAL PRIMARY KEY
, fk1 INTEGER NOT NULL REFERENCES df.stuff
, fk2 INTEGER REFERENCES df.Stuff -- df: null=0.5
, fk3 INTEGER NOT NULL REFERENCES df.Stuff -- df: gen=serial
, fk4 INTEGER NOT NULL REFERENCES df.Stuff -- df: gen=serial mangle
, fk5 INTEGER NOT NULL REFERENCES df.Stuff -- df: gen=serand nomangle
, fk6 INTEGER NOT NULL REFERENCES df.Stuff -- df: gen=serand mangle
, fk7 INTEGER NOT NULL REFERENCES df.Stuff(id) -- df: gen=serand mangle
, fk8 INTEGER NOT NULL REFERENCES df.stuff(i3) -- df: gen=serand mangle
, fk9 INTEGER NOT NULL REFERENCES df.stuff(i3) -- df: gen=uniform
, fka INTEGER NOT NULL REFERENCES df.stuff(i3) -- df: gen=scale rate=0.2
, fkb CHAR(12) NOT NULL REFERENCES df.stuff(s0)
);

CREATE TABLE df.NotFilled( -- df: nogen
  id SERIAL PRIMARY KEY CHECK(id=1)
);
INSERT INTO df.NotFilled(id) VALUES(1);

CREATE TABLE df.Ten( -- df: size=10 null=1.0
  id SERIAL PRIMARY KEY CHECK(id BETWEEN 18 AND 27) -- df: offset=18
, nogen INTEGER DEFAULT 123 -- df: nogen
, n TEXT
  -- forced generators
, x0 TEXT NOT NULL CHECK(x0 ~ '^[0-9]+$') -- df: type=int size=1000
, x1 TEXT NOT NULL CHECK(x1 = 'TRUE' OR x1 = 'FALSE') -- df: type=bool
, x2 TEXT NOT NULL CHECK(x2::DOUBLE PRECISION >=0 AND
                         x2::DOUBLE PRECISION <= 100.0)
                         -- df: type=float alpha=0.0 beta=100.0
, x3 TEXT NOT NULL CHECK(x3 ~ '^\d{4}-\d\d-\d\d$') -- df: type=date
, x4 TEXT NOT NULL CHECK(x4 ~ '^\d{4}-\d\d-\d\d \d\d:\d\d:\d\d$')
                        -- df: type=timestamp
, x5 TEXT NOT NULL CHECK(x5 LIKE 'boo%') -- df: type=string prefix=boo
, x6 TEXT NOT NULL CHECK(x6 ~ '^\d+\s\w+$') -- df: type=interval unit='day'
  -- more forced generators
, y0 INTEGER NOT NULL CHECK(y0 BETWEEN 2 AND 29)
     -- df: type=word word=:2,3,5,7,11,13,17,19,23,29
, y1 BOOLEAN NOT NULL -- df: type=word word=:TRUE,FALSE
, y2 DOUBLE PRECISION NOT NULL CHECK(y2=0.0 OR y2=1.0)
     -- df: type=word word=:0.0,1.0
, y3 FLOAT NOT NULL CHECK(y3=0.0 OR y3=1.0)
     -- df: type=word word=:0.0,1.0
, y4 DATE NOT NULL CHECK(y4 = DATE '2013-06-23' OR y4 = DATE '2038-01-19')
     -- df: type=word word=:2013-06-23,2038-01-19
, y5 TIMESTAMP NOT NULL CHECK(y5 = TIMESTAMP '2013-06-23 19:54:55')
     -- df: type=word word=':2013-06-23 19:54:55'
, y6 INTEGER NOT NULL CHECK(y6::TEXT ~ '^[4-8]{1,9}$')
     -- df: type=chars chars='45678' length=5 lenvar=4 size=1000000
, y7 INTERVAL NOT NULL
     -- df: type=word word=:1y,1mon,1day,1h,1m,1s
);

CREATE TABLE df.Skip( -- df: skip=0.9 size=1000
  id SERIAL PRIMARY KEY
);

"""

VALIDATE_CHECK = """
-- useful for additional checks
CREATE OR REPLACE FUNCTION df.assert(what TEXT, ok BOOLEAN) RETURNS BOOLEAN
IMMUTABLE CALLED ON NULL INPUT AS $$
BEGIN
  IF ok IS NULL OR NOT ok THEN
    RAISE EXCEPTION 'assert failed: %', what;
  END IF;
  RETURN ok;
END
$$ LANGUAGE plpgsql;

-- one if true else 0
CREATE FUNCTION df.oitez(ok BOOLEAN) RETURNS INTEGER
IMMUTABLE STRICT AS $$
  SELECT CASE WHEN ok THEN 1 ELSE 0 END;
$$ LANGUAGE SQL;

-- check value to a precision
CREATE FUNCTION df.value(d DOUBLE PRECISION,
                         val DOUBLE PRECISION, epsilon DOUBLE PRECISION)