## The following properties are commonly used throughout Sleeper.
# A string to uniquely identify this deployment. This should be no longer than 20 chars. It should be
# globally unique as it will be used to name AWS resources such as S3 buckets.
sleeper.id=full-example
# The S3 bucket containing the jar files of the Sleeper components.
sleeper.jars.bucket=the name of the bucket containing your jars, e.g. sleeper-<insert-unique-name-here>-jars
# A comma-separated list of the jars containing application specific iterator code. These jars are
# assumed to be in the bucket given by sleeper.jars.bucket, e.g. if that bucket contains two iterator
# jars called iterator1.jar and iterator2.jar then the property should be
# 'sleeper.userjars=iterator1.jar,iterator2.jar'.
# sleeper.userjars=
# A name for a tag to identify the stack that deployed a resource. This will be set for all AWS
# resources, to the ID of the CDK stack that they are deployed under. This can be used to organise the
# cost explorer for billing.
sleeper.stack.tag.name=DeploymentStack
# Whether to keep the sleeper table bucket, Dynamo tables, query results bucket, etc., when the
# instance is destroyed.
sleeper.retain.infra.after.destroy=true
# The optional stacks to deploy. Not case sensitive.
# Valid values: [IngestStack, IngestBatcherStack, EmrServerlessBulkImportStack, EmrBulkImportStack,
# PersistentEmrBulkImportStack, EksBulkImportStack, EmrStudioStack, QueryStack, WebSocketQueryStack,
# AthenaStack, KeepLambdaWarmStack, CompactionStack, GarbageCollectorStack, PartitionSplittingStack,
# DashboardStack, TableMetricsStack]
sleeper.optional.stacks=IngestStack,IngestBatcherStack,EmrServerlessBulkImportStack,EmrStudioStack,QueryStack,AthenaStack,CompactionStack,GarbageCollectorStack,PartitionSplittingStack,DashboardStack,TableMetricsStack
# The deployment type for AWS Lambda. Not case sensitive.
# Valid values: [jar, container]
sleeper.lambda.deploy.type=jar
# The AWS account number. This is the AWS account that the instance will be deployed to.
sleeper.account=1234567890
# The AWS region to deploy to.
sleeper.region=eu-west-2
# The id of the VPC to deploy to.
sleeper.vpc=1234567890
# Whether to check that the VPC that the instance is deployed to has an S3 endpoint. If there is no S3
# endpoint then the NAT costs can be very significant.
sleeper.vpc.endpoint.check=true
# A comma-separated list of subnets to deploy to. ECS tasks will be run across multiple subnets. EMR
# clusters will be deployed in a subnet chosen when the cluster is created.
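# For example, to run tasks across two subnets (hypothetical subnet IDs):
# sleeper.subnets=subnet-abcdefgh,subnet-01234567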
sleeper.subnets=subnet-abcdefgh
# The Hadoop filesystem used to connect to S3.
sleeper.filesystem=s3a://
# An email address used by the TopicStack to publish SNS notifications of errors.
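# For example (hypothetical address): sleeper.errors.email=alerts@example.com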
# sleeper.errors.email=
# The length of time in days that CloudWatch logs from lambda functions, ECS containers, etc., are
# retained.
# See https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-logs-loggroup.html
# for valid options.
# Use -1 to indicate infinite retention.
sleeper.log.retention.days=30
# Used to set the value of fs.s3a.connection.maximum on the Hadoop configuration. This controls the
# maximum number of http connections to S3.
# See https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/performance.html
sleeper.fs.s3a.max-connections=100
# Used to set the value of fs.s3a.block.size on the Hadoop configuration. Uploads to S3 happen in
# blocks, and this sets the size of blocks. If a larger value is used, then more data is buffered
# before the upload begins.
# See https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/performance.html
sleeper.fs.s3a.upload.block.size=32M
# The version of Fargate to use.
sleeper.fargate.version=1.4.0
# The amount of memory in MB for the lambda that creates ECS tasks to execute compaction and ingest
# jobs.
sleeper.task.runner.memory.mb=1024
# The timeout in seconds for the lambda that creates ECS tasks to execute compaction jobs and ingest
# jobs.
# This must be >0 and <= 900.
sleeper.task.runner.timeout.seconds=900
# If true, properties will be reloaded every time a long running job is started or a lambda is run.
# This will mainly be used in test scenarios to ensure properties are up to date.
sleeper.properties.force.reload=false
# If set, this property will be used as a prefix for the names of ECR repositories. If unset, then the
# instance ID will be used to determine the names instead.
# Note: This is only used by the deployment scripts to upload Docker images, not the CDK. We may add
# the ability to use this in the CDK in the future.
# sleeper.ecr.repository.prefix=
# A comma-separated list of up to 5 security group IDs to be used when running ECS tasks.
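# For example (hypothetical security group IDs):
# sleeper.ecs.security.groups=sg-0123456789abcdef0,sg-0fedcba9876543210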
# sleeper.ecs.security.groups=
# Default value for the reserved concurrency for each lambda in the Sleeper instance that scales
# according to the number of Sleeper tables.
# The state store committer lambda is an exception to this, as it has reserved concurrency by default.
# This is set in the property sleeper.statestore.committer.concurrency.reserved. Other lambdas are
# present that do not scale by the number of Sleeper tables, and are not set from this property.
# By default no concurrency is reserved for the lambdas. Each lambda also has its own property that
# overrides the value found here.
# See reserved concurrency overview at:
# https://docs.aws.amazon.com/lambda/latest/dg/configuration-concurrency.html
# sleeper.default.lambda.concurrency.reserved=
# Default value for the maximum concurrency for each lambda in the Sleeper instance that scales
# according to the number of Sleeper tables.
# Other lambdas are present that do not scale by the number of Sleeper tables, and are not set from
# this property.
# By default the maximum concurrency is set to 10, which is enough for 10 online tables. If there are
# more online tables, this number may need to be increased. Each lambda also has its own property that
# overrides the value found here.
# See maximum concurrency overview at:
# https://aws.amazon.com/blogs/compute/introducing-maximum-concurrency-of-aws-lambda-functions-when-using-amazon-sqs-as-an-event-source/
sleeper.default.lambda.concurrency.max=10
## The following properties relate to handling the state of Sleeper tables.
# Default value for amount of memory in MB for each lambda that holds the state of Sleeper tables in
# memory. These use a state store provider which caches a number of tables at once, set in
# `sleeper.statestore.provider.cache.size`. Not all lambdas are covered by this, e.g. see
# `sleeper.batch.table.lambdas.memory.mb`.
sleeper.default.table.state.lambda.memory.mb=4096
# The amount of memory in MB for lambdas that create batches of tables to run some operation against,
# e.g. create compaction jobs, run garbage collection, perform partition splitting.
sleeper.batch.table.lambdas.memory.mb=1024
# The timeout in seconds for lambdas that create batches of tables to run some operation against, e.g.
# create compaction jobs, run garbage collection, perform partition splitting.
sleeper.batch.table.lambdas.timeout.seconds=60
# The timeout in minutes for when the table properties provider cache should be cleared, forcing table
# properties to be reloaded from S3.
sleeper.cache.table.properties.provider.timeout.minutes=60
# The maximum number of state stores cached by a state store provider. If a state store is needed
# and the cache is full, the oldest state store in the cache will be removed to make space.
sleeper.statestore.provider.cache.size=10
# This specifies whether point in time recovery is enabled for the DynamoDB state store. This is set
# on the DynamoDB tables.
sleeper.statestore.dynamo.pointintimerecovery=false
# This specifies whether point in time recovery is enabled for the S3 state store. This is set on the
# revision DynamoDB table.
sleeper.statestore.s3.dynamo.pointintimerecovery=false
# The number of tables to create transaction log snapshots for in a single invocation. This will be
# the batch size for a lambda as an SQS FIFO event source. This can be a maximum of 10.
sleeper.statestore.snapshot.creation.batch.size=1
# The frequency in seconds with which the transaction log snapshot creation lambda is run.
sleeper.statestore.snapshot.creation.lambda.period.seconds=60
# The timeout in seconds after which to terminate the transaction log snapshot creation lambda.
sleeper.statestore.snapshot.creation.lambda.timeout.seconds=900
# The amount of memory in MB for the transaction log snapshot creation lambda.
sleeper.statestore.snapshot.creation.memory.mb=4096
# The reserved concurrency for the snapshot creation lambda.
# See reserved concurrency overview at:
# https://docs.aws.amazon.com/lambda/latest/dg/configuration-concurrency.html
# sleeper.statestore.snapshot.creation.concurrency.reserved=
# The maximum given concurrency allowed for the snapshot creation lambda.
# See maximum concurrency overview at:
# https://aws.amazon.com/blogs/compute/introducing-maximum-concurrency-of-aws-lambda-functions-when-using-amazon-sqs-as-an-event-source/
sleeper.statestore.snapshot.creation.concurrency.max=10
# The number of tables to delete old transaction log snapshots for in a single invocation. This will
# be the batch size for a lambda as an SQS FIFO event source. This can be a maximum of 10.
sleeper.statestore.snapshot.deletion.batch.size=1
# The frequency in minutes with which the transaction log snapshot deletion lambda is run.
sleeper.statestore.snapshot.deletion.lambda.period.minutes=60
# The reserved concurrency for the snapshot deletion lambda.
# See reserved concurrency overview at:
# https://docs.aws.amazon.com/lambda/latest/dg/configuration-concurrency.html
# sleeper.statestore.snapshot.deletion.concurrency.reserved=
# The maximum given concurrency allowed for the snapshot deletion lambda.
# See maximum concurrency overview at:
# https://aws.amazon.com/blogs/compute/introducing-maximum-concurrency-of-aws-lambda-functions-when-using-amazon-sqs-as-an-event-source/
sleeper.statestore.snapshot.deletion.concurrency.max=10
# The number of tables to delete old transaction log transactions for in a single invocation. This
# will be the batch size for a lambda as an SQS FIFO event source. This can be a maximum of 10.
sleeper.statestore.transaction.deletion.batch.size=1
# The frequency in minutes with which the transaction log transaction deletion lambda is run.
sleeper.statestore.transaction.deletion.lambda.period.minutes=60
# The reserved concurrency for the transaction deletion lambda.
# See reserved concurrency overview at:
# https://docs.aws.amazon.com/lambda/latest/dg/configuration-concurrency.html
# sleeper.statestore.transaction.deletion.concurrency.reserved=
# The maximum given concurrency allowed for the transaction deletion lambda.
# See maximum concurrency overview at:
# https://aws.amazon.com/blogs/compute/introducing-maximum-concurrency-of-aws-lambda-functions-when-using-amazon-sqs-as-an-event-source/
sleeper.statestore.transaction.deletion.concurrency.max=10
# The maximum timeout for the transaction deletion lambda in seconds.
sleeper.statestore.transaction.deletion.lambda.timeout.seconds=900
# The reserved concurrency for the lambda that follows the state store transaction log to trigger
# updates.
# See reserved concurrency overview at:
# https://docs.aws.amazon.com/lambda/latest/dg/configuration-concurrency.html
# sleeper.statestore.transaction.follower.concurrency.reserved=
# The maximum given concurrency allowed for the lambda that follows the state store transaction log to
# trigger updates.
# See maximum concurrency overview at:
# https://aws.amazon.com/blogs/compute/introducing-maximum-concurrency-of-aws-lambda-functions-when-using-amazon-sqs-as-an-event-source/
sleeper.statestore.transaction.follower.concurrency.max=10
# The maximum timeout in seconds for the lambda that follows the state store transaction log to
# trigger updates.
sleeper.statestore.transaction.follower.lambda.timeout.seconds=900
# The amount of memory in MB for the lambda that follows the state store transaction log to trigger
# updates.
sleeper.statestore.transaction.follower.memory.mb=4096
# This specifies whether point in time recovery is enabled for the Sleeper table index. This is set on
# the DynamoDB tables.
sleeper.tables.index.dynamo.pointintimerecovery=false
# This specifies whether queries and scans against the table index DynamoDB tables are strongly
# consistent.
sleeper.tables.index.dynamo.consistent.reads=true
# The amount of memory in MB for the lambda that commits state store updates.
sleeper.statestore.committer.lambda.memory.mb=4096
# The timeout for the lambda that commits state store updates in seconds.
sleeper.statestore.committer.lambda.timeout.seconds=900
# The number of state store updates to be sent to the state store committer lambda in one invocation.
# This will be the batch size for a lambda as an SQS FIFO event source. This can be a maximum of 10.
sleeper.statestore.committer.batch.size=10
# The reserved concurrency for the state store committer lambda.
# Presently this value defaults to 10 to align with expectations around table efficiency.
# This is to ensure that state store operations can still be applied to at least 10 tables, even when
# concurrency is used up in the account.
# See reserved concurrency overview at:
# https://docs.aws.amazon.com/lambda/latest/dg/configuration-concurrency.html
sleeper.statestore.committer.concurrency.reserved=10
# The maximum given concurrency allowed for the state store committer lambda.
# See maximum concurrency overview at:
# https://aws.amazon.com/blogs/compute/introducing-maximum-concurrency-of-aws-lambda-functions-when-using-amazon-sqs-as-an-event-source/
sleeper.statestore.committer.concurrency.max=10
## The following properties relate to standard ingest.
# The name of the ECR repository for the ingest container. The Docker image from the ingest module
# should have been uploaded to an ECR repository of this name in this account.
sleeper.ingest.repo=<insert-unique-sleeper-id>/ingest
# The maximum number of concurrent ECS tasks to run.
sleeper.ingest.max.concurrent.tasks=200
# The frequency in minutes with which an EventBridge rule runs to trigger a lambda that, if necessary,
# runs more ECS tasks to perform ingest jobs.
sleeper.ingest.task.creation.period.minutes=1
# The frequency, in seconds, with which change message visibility requests are sent to extend the
# visibility of messages on the ingest queue so that they are not processed by other processes.
# This should be less than the value of sleeper.ingest.queue.visibility.timeout.seconds.
sleeper.ingest.keepalive.period.seconds=300
# The visibility timeout in seconds for the standard ingest job queue. This should be greater than
# sleeper.ingest.keepalive.period.seconds.
sleeper.ingest.queue.visibility.timeout.seconds=900
# This sets the value of fs.s3a.experimental.input.fadvise on the Hadoop configuration used to read
# and write files to and from S3 in ingest jobs. Changing this value allows you to fine-tune how files
# are read. Possible values are "normal", "sequential" and "random". More information is available
# here:
# https://hadoop.apache.org/docs/current/hadoop-aws/tools/hadoop-aws/performance.html#fadvise.
sleeper.ingest.fs.s3a.experimental.input.fadvise=sequential
# The amount of CPU used by Fargate tasks that perform ingest jobs.
# Note that only certain combinations of CPU and memory are valid.
# See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html for valid
# options.
sleeper.ingest.task.cpu=2048
# The amount of memory in MB used by Fargate tasks that perform ingest jobs.
# Note that only certain combinations of CPU and memory are valid.
# See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html for valid
# options.
sleeper.ingest.task.memory.mb=4096
# The frequency in seconds with which ingest tasks refresh their view of the partitions.
# (NB: refreshes only happen once a batch of data has been written, so this is a lower bound on the
# time between refreshes.)
sleeper.ingest.partition.refresh.period=120
# A comma-separated list of buckets that contain files to be ingested via ingest jobs. The buckets
# should already exist, i.e. they will not be created as part of the cdk deployment of this instance
# of Sleeper. The ingest and bulk import stacks will be given read access to these buckets so that
# they can consume data from them.
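# For example (hypothetical bucket names):
# sleeper.ingest.source.bucket=my-ingest-source-bucket-1,my-ingest-source-bucket-2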
# sleeper.ingest.source.bucket=
# Flag to enable/disable storage of tracking information for ingest jobs and tasks.
sleeper.ingest.tracker.enabled=true
# The time to live in seconds for ingest job updates in the job tracker. Default is 1 week.
# The expiry time is fixed when an update is saved to the store, so changing this will only affect new
# data.
sleeper.ingest.job.status.ttl=604800
# The time to live in seconds for ingest task updates in the job tracker. Default is 1 week.
# The expiry time is fixed when an update is saved to the store, so changing this will only affect new
# data.
sleeper.ingest.task.status.ttl=604800
# The time in seconds to wait for ingest jobs to appear on the queue before an ingest task terminates.
# Must be >= 0 and <= 20.
# See also
# https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSDeveloperGuide/sqs-short-and-long-polling.html
sleeper.ingest.job.queue.wait.time=20
# The maximum number of records written to local file in an ingest job. (Records are written in sorted
# order to local disk before being uploaded to S3. Increasing this value increases the amount of time
# before data is visible in the system, but increases the number of records written to S3 in a batch,
# therefore reducing costs.)
# (arraylist-based ingest only)
sleeper.ingest.max.local.records=100000000
# The maximum number of records to read into memory in an ingest job. (Up to
# sleeper.ingest.memory.max.batch.size records are read into memory before being sorted and written to
# disk. This process is repeated until sleeper.ingest.max.local.records records have been written to
# local files. Then the sorted files are merged and the data is written to sorted files in S3.)
# (arraylist-based ingest only)
sleeper.ingest.memory.max.batch.size=1000000
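# As an illustration only: with the default values above, each batch of
# sleeper.ingest.memory.max.batch.size (1,000,000) records becomes one sorted local file, so roughly
# 100,000,000 / 1,000,000 = 100 sorted local files are merged together for each upload to S3.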
# The number of bytes to allocate to the Arrow working buffer. This buffer is used for sorting and
# other sundry activities. Note that this is off-heap memory, which is in addition to the memory
# assigned to the JVM.
# (arrow-based ingest only) [256MB]
sleeper.ingest.arrow.working.buffer.bytes=268435456
# The number of bytes to allocate to the Arrow batch buffer, which is used to hold the records before
# they are written to local disk. A larger value means that the local disk holds fewer, larger files,
# which are more efficient to merge together during an upload to S3. Larger values may require a
# larger working buffer. Note that this is off-heap memory, which is in addition to the memory
# assigned to the JVM.
# (arrow-based ingest only) [1GB]
sleeper.ingest.arrow.batch.buffer.bytes=1073741824
# The maximum number of bytes to store on the local disk before uploading to the main Sleeper store. A
# larger value reduces the number of S3 PUTs that are required to upload the data to S3 and results
# in fewer files per partition.
# (arrow-based ingest only) [2GB]
sleeper.ingest.arrow.max.local.store.bytes=2147483648
# The number of records to write at once into an Arrow file in the local store. A single Arrow file
# contains many of these micro-batches and so this parameter does not significantly affect the final
# size of the Arrow file. Larger values may require a larger working buffer.
# (arrow-based ingest only) [1K]
sleeper.ingest.arrow.max.single.write.to.file.records=1024
# The implementation of the async S3 client to use for upload during ingest.
# Valid values are 'java' or 'crt'. This determines the implementation of S3AsyncClient that gets
# used.
# With 'java' it makes a single PutObject request for each file.
# With 'crt' it uses the AWS Common Runtime (CRT) to make multipart uploads.
# Note that the CRT option is recommended. Using the Java option may cause failures if any file is
# >5GB in size, and will lead to the following warning:
# "The provided S3AsyncClient is not an instance of S3CrtAsyncClient, and thus multipart
# upload/download feature is not enabled and resumable file upload is not supported. To benefit from
# maximum throughput, consider using S3AsyncClient.crtBuilder().build() instead."
# (async partition file writer only)
sleeper.ingest.async.client.type=crt
# The part size in bytes to use for multipart uploads.
# (CRT async ingest only) [128MB]
sleeper.ingest.async.crt.part.size.bytes=134217728
# The target throughput for multipart uploads, in GB/s. Determines how many parts should be uploaded
# simultaneously.
# (CRT async ingest only)
sleeper.ingest.async.crt.target.throughput.gbps=10
# The amount of memory in MB for the lambda that receives submitted requests to ingest files.
sleeper.ingest.batcher.submitter.memory.mb=1024
# The timeout in seconds for the lambda that receives submitted requests to ingest files. Also used to
# define the visibility timeout for the batcher submit queue.
sleeper.ingest.batcher.submitter.timeout.seconds=20
# The amount of memory in MB for the lambda that creates ingest jobs from submitted file ingest
# requests.
sleeper.ingest.batcher.job.creation.memory.mb=1024
# The timeout in seconds for the lambda that creates ingest jobs from submitted file ingest requests.
sleeper.ingest.batcher.job.creation.timeout.seconds=900
# The rate at which the ingest batcher job creation lambda runs (in minutes, must be >=1).
sleeper.ingest.batcher.job.creation.period.minutes=1
## The following properties relate to bulk import, i.e. ingesting data using Spark jobs running on EMR
## or EKS.
##
## Note that on EMR, the total resource allocation must align with the instance types used for the
## cluster. For the maximum memory usage, combine the memory and memory overhead properties, and
## compare against the maximum memory allocation for YARN in the Hadoop task configuration:
##
## https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-hadoop-task-config.html
##
## As an example, if we use m7i.4xlarge for executor instances, that has a maximum allocation of 54272
## MiB, or 53 GiB. If we want 3 executors per instance, we can have 53 GiB / 3 = 18,090.666 MiB per
## executor. We can set the executor memory to 16 GiB, and the executor memory overhead to the
## remainder of that amount, which is 18,090 MiB - 16 GiB = 1,706 MiB, or 1.666 GiB. This is just above
## the default Spark memory overhead factor of 0.1, i.e. 16 GiB x 0.1 = 1.6 GiB.
##
## Also see EMR best practices:
##
## https://aws.github.io/aws-emr-best-practices/docs/bestpractices/Applications/Spark/best_practices/#bp-516----tune-driverexecutor-memory-cores-and-sparksqlshufflepartitions-to-fully-utilize-cluster-resources
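##
## As a rough illustration, the worked example above corresponds to the executor settings used
## further down, i.e. sleeper.bulk.import.emr.spark.executor.memory=16g with
## sleeper.bulk.import.emr.spark.executor.memory.overhead=1706m. If you change the executor
## instance types, re-derive these values in the same way.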
# The class to use to perform the bulk import. The default value below uses Spark Dataframes. There is
# an alternative option that uses RDDs (sleeper.bulkimport.runner.rdd.BulkImportJobRDDDriver).
sleeper.bulk.import.class.name=sleeper.bulkimport.runner.dataframelocalsort.BulkImportDataframeLocalSortDriver
# The compression codec for map status results. Used to set spark.shuffle.mapStatus.compression.codec.
# Stops "Decompression error: Version not supported" errors - only a value of "lz4" has been tested.
sleeper.bulk.import.emr.spark.shuffle.mapStatus.compression.codec=lz4
# If true then speculative execution of tasks will be performed. Used to set spark.speculation.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.speculation=false
# Fraction of tasks which must be complete before speculation is enabled for a particular stage. Used
# to set spark.speculation.quantile.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.spark.speculation.quantile=0.75
# The amount of memory in MB for lambda functions that start bulk import jobs.
sleeper.bulk.import.starter.memory.mb=4096
# The amount of memory allocated to a Spark executor. Used to set spark.executor.memory.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.executor.memory=16g
# The amount of memory allocated to the Spark driver. Used to set spark.driver.memory.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.driver.memory=16g
# The number of executors. Used to set spark.executor.instances.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.executor.instances=29
# The memory overhead for an executor. Used to set spark.executor.memoryOverhead.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.executor.memory.overhead=1706m
# The memory overhead for the driver. Used to set spark.driver.memoryOverhead.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.driver.memory.overhead=1706m
# The default parallelism for Spark jobs. Used to set spark.default.parallelism.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.default.parallelism=290
# The number of partitions used in a Spark SQL/dataframe shuffle operation. Used to set
# spark.sql.shuffle.partitions.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.sql.shuffle.partitions=290
# (Non-persistent or persistent EMR mode only) An EC2 keypair to use for the EC2 instances. Specifying
# this will allow you to SSH to the nodes in the cluster while it's running.
sleeper.bulk.import.emr.keypair.name=my-key
# (Non-persistent or persistent EMR mode only) Specifying this security group causes the group to be
# added to the EMR master's list of security groups.
# sleeper.bulk.import.emr.master.additional.security.group=
# (Non-persistent or persistent EMR mode only) The number of cores used by an executor. Used to set
# spark.executor.cores.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.executor.cores=5
# (Non-persistent or persistent EMR mode only) The number of cores used by the driver. Used to set
# spark.driver.cores.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.driver.cores=5
# (Non-persistent or persistent EMR mode only) The default timeout for network interactions in Spark.
# Used to set spark.network.timeout.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.network.timeout=800s
# (Non-persistent or persistent EMR mode only) The interval between heartbeats from executors to the
# driver. Used to set spark.executor.heartbeatInterval.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.executor.heartbeat.interval=60s
# (Non-persistent or persistent EMR mode only) Whether Spark should use dynamic allocation to scale
# resources up and down. Used to set spark.dynamicAllocation.enabled.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.dynamic.allocation.enabled=false
# (Non-persistent or persistent EMR mode only) The fraction of heap space used for execution and
# storage. Used to set spark.memory.fraction.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.memory.fraction=0.80
# (Non-persistent or persistent EMR mode only) The amount of storage memory immune to eviction,
# expressed as a fraction of the heap space used for execution and storage. Used to set
# spark.memory.storageFraction.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.memory.storage.fraction=0.30
# (Non-persistent or persistent EMR mode only) JVM options passed to the executors. Used to set
# spark.executor.extraJavaOptions.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.executor.extra.java.options=-XX:+UseG1GC -XX:+UnlockDiagnosticVMOptions -XX:+G1SummarizeConcMark -XX:InitiatingHeapOccupancyPercent=35 -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p'
# (Non-persistent or persistent EMR mode only) JVM options passed to the driver. Used to set
# spark.driver.extraJavaOptions.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.driver.extra.java.options=-XX:+UseG1GC -XX:+UnlockDiagnosticVMOptions -XX:+G1SummarizeConcMark -XX:InitiatingHeapOccupancyPercent=35 -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p'
# (Non-persistent or persistent EMR mode only) The maximum number of executor failures before YARN can
# fail the application. Used to set spark.yarn.scheduler.reporterThread.maxFailures.
# See
# https://aws.amazon.com/blogs/big-data/best-practices-for-successfully-managing-memory-for-apache-spark-applications-on-amazon-emr/.
sleeper.bulk.import.emr.spark.yarn.scheduler.reporter.thread.max.failures=5
# (Non-persistent or persistent EMR mode only) The storage to use for temporary caching. Used to set
# spark.storage.level.
# See
# https://aws.amazon.com/blogs/big-data/best-practices-for-successfully-managing-memory-for-apache-spark-applications-on-amazon-emr/.
sleeper.bulk.import.emr.spark.storage.level=MEMORY_AND_DISK_SER
# (Non-persistent or persistent EMR mode only) Whether to compress serialized RDD partitions. Used to
# set spark.rdd.compress.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.rdd.compress=true
# (Non-persistent or persistent EMR mode only) Whether to compress map output files. Used to set
# spark.shuffle.compress.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.shuffle.compress=true
# (Non-persistent or persistent EMR mode only) Whether to compress data spilled during shuffles. Used
# to set spark.shuffle.spill.compress.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.shuffle.spill.compress=true
# (Non-persistent or persistent EMR mode only) The size of the EBS volume in gibibytes (GiB).
# This can be a number from 10 to 1024.
sleeper.bulk.import.emr.ebs.volume.size.gb=256
# (Non-persistent or persistent EMR mode only) The type of the EBS volume.
# Valid values are 'gp2', 'gp3', 'io1', 'io2'.
sleeper.bulk.import.emr.ebs.volume.type=gp2
# (Non-persistent or persistent EMR mode only) The number of EBS volumes per instance.
# This can be a number from 1 to 25.
sleeper.bulk.import.emr.ebs.volumes.per.instance=4
# ARN of the KMS Key used to encrypt data at rest on the local file system in AWS EMR.
# See
# https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-encryption-enable.html#emr-encryption-create-keys.
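# For example (hypothetical key ARN):
# sleeper.bulk.import.emr.ebs.encryption.key.arn=arn:aws:kms:eu-west-2:123456789012:key/11111111-2222-3333-4444-555555555555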
# sleeper.bulk.import.emr.ebs.encryption.key.arn=
# The architecture for EMR Serverless to use. X86_64 or ARM64 (Coming soon)
sleeper.bulk.import.emr.serverless.architecture=X86_64
# The version of EMR Serverless to use.
sleeper.bulk.import.emr.serverless.release=emr-7.2.0
# The name of the repository for the EMR serverless container. The Docker image from the bulk-import
# module should have been uploaded to an ECR repository of this name in this account.
sleeper.bulk.import.emr.serverless.repo=<insert-unique-sleeper-id>/bulk-import-runner-emr-serverless
# Set to true to allow an EMR Serverless Application to start automatically when a job is submitted.
sleeper.bulk.import.emr.serverless.autostart.enabled=true
# Set to true to allow an EMR Serverless Application to stop automatically when there are no jobs to
# process.
# Turning this off with pre-initialised capacity turned off is not recommended.
sleeper.bulk.import.emr.serverless.autostop.enabled=true
# The number of minutes of inactivity before EMR Serverless stops the application.
sleeper.bulk.import.emr.serverless.autostop.timeout=15
# The number of cores used by a Serverless executor. Used to set spark.executor.cores.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.executor.cores=4
# The amount of memory allocated to a Serverless executor. Used to set spark.executor.memory.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.executor.memory=16G
# The amount of storage allocated to a Serverless executor.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.emr-serverless.executor.disk=200G
# The number of executors to be used with Serverless. Used to set spark.executor.instances.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.executor.instances=36
# The number of cores used by the Serverless Spark driver. Used to set spark.driver.cores.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.driver.cores=4
# The amount of memory allocated to the Serverless Spark driver. Used to set spark.driver.memory.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.driver.memory=16G
# The path to JAVA_HOME to be used by the custom image for bulk import.
sleeper.bulk.import.emr.serverless.spark.executorEnv.JAVA_HOME=/usr/lib/jvm/jre-11
# Whether Spark should use dynamic allocation to scale resources up and down. Used to set
# spark.dynamicAllocation.enabled. See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.dynamic.allocation.enabled=false
# Whether to compress serialized RDD partitions. Used to set spark.rdd.compress.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.rdd.compress=true
# Whether to compress map output files. Used to set spark.shuffle.compress.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.shuffle.compress=true
# Whether to compress data spilled during shuffles. Used to set spark.shuffle.spill.compress.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.shuffle.spill.compress=true
# The default parallelism for Spark jobs. Used to set spark.default.parallelism.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.default.parallelism=288
# The number of partitions used in a Spark SQL/dataframe shuffle operation. Used to set
# spark.sql.shuffle.partitions.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.sql.shuffle.partitions=288
# The default timeout for network interactions in Spark. Used to set spark.network.timeout.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.network.timeout=800s
# The interval between heartbeats from executors to the driver. Used to set
# spark.executor.heartbeatInterval.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.executor.heartbeat.interval=60s
# The fraction of heap space used for execution and storage. Used to set spark.memory.fraction.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.memory.fraction=0.80
# The amount of storage memory immune to eviction, expressed as a fraction of the heap space used for
# execution and storage. Used to set spark.memory.storageFraction.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.memory.storage.fraction=0.30
# If true then speculative execution of tasks will be performed. Used to set spark.speculation.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.speculation=false
# Fraction of tasks which must be complete before speculation is enabled for a particular stage. Used
# to set spark.speculation.quantile.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.speculation.quantile=0.75
# The compression codec for map status results. Used to set spark.shuffle.mapStatus.compression.codec.
# Stops "Decompression error: Version not supported" errors - only a value of "lz4" has been tested.
sleeper.bulk.import.emr.serverless.spark.shuffle.mapStatus.compression.codec=lz4
# Set to true to enable the pre-initialise capacity option for the EMR Serverless application.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.enabled=false
# The number of executors to pre-initialise.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.executor.count=72
# The number of vCPUs per executor for the pre-initialise capacity.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.executor.cores=4vCPU
# The amount of memory per executor for the pre-initialise capacity.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.executor.memory=18GB
# The amount of storage per executor for the pre-initialise capacity.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.executor.disk=200GB
# The number of drivers to pre-initialise.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.driver.count=5
# The number of vCPUs per driver for the pre-initialise capacity.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.driver.cores=4vCPU
# The amount of memory per driver for the pre-initialise capacity.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.driver.memory=18GB
# The amount of storage per driver for the pre-initialise capacity.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.driver.disk=20GB
# (Non-persistent EMR mode only) The default EMR release label to be used when creating an EMR cluster
# for bulk importing data using Spark running on EMR.
# This property is a default which can be overridden by a table property or by a property in the bulk
# import job specification.
sleeper.default.bulk.import.emr.release.label=emr-7.2.0
# (Non-persistent EMR mode only) Which architecture to be used for EC2 instance types in the EMR
# cluster. Must be either "x86_64", "arm64" or "x86_64,arm64". For more information, see the Bulk
# import using EMR - Instance types section in docs/usage/ingest.md
sleeper.default.bulk.import.emr.instance.architecture=arm64
# (Non-persistent EMR mode only) The default EC2 x86_64 instance types and weights to be used for the
# master node of the EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/usage/ingest.md
sleeper.default.bulk.import.emr.master.x86.instance.types=m7i.xlarge
# (Non-persistent EMR mode only) The default EC2 x86_64 instance types and weights to be used for the
# executor nodes of the EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/usage/ingest.md
sleeper.default.bulk.import.emr.executor.x86.instance.types=m7i.4xlarge
# (Non-persistent EMR mode only) The default EC2 ARM64 instance types and weights to be used for the
# master node of the EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/usage/ingest.md
sleeper.default.bulk.import.emr.master.arm.instance.types=m7g.xlarge
# (Non-persistent EMR mode only) The default EC2 ARM64 instance types and weights to be used for the
# executor nodes of the EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/usage/ingest.md
sleeper.default.bulk.import.emr.executor.arm.instance.types=m7g.4xlarge
# (Non-persistent EMR mode only) The default purchasing option to be used for the executor nodes of
# the EMR cluster.
# Valid values are ON_DEMAND or SPOT.
# This property is a default which can be overridden by a table property or by a property in the bulk
# import job specification.
sleeper.default.bulk.import.emr.executor.market.type=SPOT
# (Non-persistent EMR mode only) The default initial number of capacity units to provision as EC2
# instances for executors in the EMR cluster.
# This is measured in instance fleet capacity units. These are declared alongside the requested
# instance types, as each type will count for a certain number of units. By default the units are the
# number of instances.
# This property is a default which can be overridden by a table property or by a property in the bulk
# import job specification.
sleeper.default.bulk.import.emr.executor.initial.instances=2
# (Non-persistent EMR mode only) The default maximum number of capacity units to provision as EC2
# instances for executors in the EMR cluster.
# This is measured in instance fleet capacity units. These are declared alongside the requested
# instance types, as each type will count for a certain number of units. By default the units are the
# number of instances.
# This property is a default which can be overridden by a table property or by a property in the bulk
# import job specification.
sleeper.default.bulk.import.emr.executor.max.instances=10
# (Persistent EMR mode only) The EMR release used to create the persistent EMR cluster.
sleeper.bulk.import.persistent.emr.release.label=emr-7.2.0
# (Persistent EMR mode only) Which architecture to be used for EC2 instance types in the EMR cluster.
# Must be either "x86_64", "arm64" or "x86_64,arm64". For more information, see the Bulk import using
# EMR - Instance types section in docs/usage/ingest.md
sleeper.bulk.import.persistent.emr.instance.architecture=arm64
# (Persistent EMR mode only) The EC2 x86_64 instance types and weights used for the master node of the
# persistent EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/usage/ingest.md
sleeper.bulk.import.persistent.emr.master.x86.instance.types=m7i.xlarge
# (Persistent EMR mode only) The EC2 x86_64 instance types and weights used for the executor nodes of
# the persistent EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/usage/ingest.md
sleeper.bulk.import.persistent.emr.executor.x86.instance.types=m7i.4xlarge
# (Persistent EMR mode only) The EC2 ARM64 instance types and weights used for the master node of the
# persistent EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/usage/ingest.md
sleeper.bulk.import.persistent.emr.master.arm.instance.types=m7g.xlarge
# (Persistent EMR mode only) The EC2 ARM64 instance types and weights used for the executor nodes of
# the persistent EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/usage/ingest.md
sleeper.bulk.import.persistent.emr.executor.arm.instance.types=m7g.4xlarge
# (Persistent EMR mode only) Whether the persistent EMR cluster should use managed scaling or not.
sleeper.bulk.import.persistent.emr.use.managed.scaling=true
# (Persistent EMR mode only) The minimum number of capacity units to provision as EC2 instances for
# executors in the persistent EMR cluster.
# This is measured in instance fleet capacity units. These are declared alongside the requested
# instance types, as each type will count for a certain number of units. By default the units are the
# number of instances.
# If managed scaling is not used then the cluster will be of fixed size, with a number of instances
# equal to this value.
sleeper.bulk.import.persistent.emr.min.capacity=1
# (Persistent EMR mode only) The maximum number of capacity units to provision as EC2 instances for
# executors in the persistent EMR cluster.
# This is measured in instance fleet capacity units. These are declared alongside the requested
# instance types, as each type will count for a certain number of units. By default the units are the
# number of instances.
# This value is only used if managed scaling is used.
sleeper.bulk.import.persistent.emr.max.capacity=10
# (Persistent EMR mode only) This controls the number of EMR steps that can run concurrently.
sleeper.bulk.import.persistent.emr.step.concurrency.level=2
# (EKS mode only) The name of the ECR repository where the Docker image for the bulk import container
# is stored.
sleeper.bulk.import.eks.repo=<insert-unique-sleeper-id>/bulk-import-runner
# (EKS mode only) Names of AWS IAM roles which should have access to administer the EKS cluster.
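# For example (hypothetical role names):
# sleeper.bulk.import.eks.cluster.admin.roles=my-admin-role,another-admin-role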
# sleeper.bulk.import.eks.cluster.admin.roles=
# (EKS mode only) Set to true if sleeper.bulk.import.eks.repo contains the image built with native
# Hadoop libraries. By default when deploying with the EKS stack enabled, an image will be built based
# on the official Spark Docker image, so this should be false.
sleeper.bulk.import.eks.is.native.libs.image=false
## The following properties relate to the splitting of partitions.
# The frequency in minutes with which the lambda runs to find partitions that need splitting and send
# jobs to the splitting lambda.
sleeper.partition.splitting.period.minutes=30
# When a partition needs splitting, a partition splitting job is created. This reads in the sketch
# files associated with the files in the partition in order to identify the median. This parameter
# controls the maximum number of files that are read in.
sleeper.partition.splitting.files.maximum=50
# The number of tables to find partitions to split for in a single invocation. This will be the batch
# size for a lambda as an SQS FIFO event source. This can be a maximum of 10.
sleeper.partition.splitting.finder.batch.size=1
# The amount of memory in MB for the lambda function used to identify partitions that need to be
# split.
sleeper.partition.splitting.finder.memory.mb=4096
# The timeout in seconds for the lambda function used to identify partitions that need to be split.
sleeper.partition.splitting.finder.timeout.seconds=900
# The reserved concurrency for the find partitions to split lambda.
# See reserved concurrency overview at:
# https://docs.aws.amazon.com/lambda/latest/dg/configuration-concurrency.html
# sleeper.partition.splitting.finder.concurrency.reserved=
# The maximum given concurrency allowed for the find partitions to split lambda.
# See maximum concurrency overview at:
# https://aws.amazon.com/blogs/compute/introducing-maximum-concurrency-of-aws-lambda-functions-when-using-amazon-sqs-as-an-event-source/
sleeper.partition.splitting.finder.concurrency.max=10
# The amount of memory in MB for the lambda function used to split partitions.
sleeper.partition.splitting.memory.mb=4096
# The timeout in seconds for the lambda function used to split partitions.
sleeper.partition.splitting.timeout.seconds=900
# The number of lambda instances to reserve from your AWS account's quota for splitting partitions.
# Note that this will not provision instances until they are needed. Each time partition splitting
# runs, a separate lambda invocation will be made for each partition that needs to be split. If the
# reserved concurrency is less than the number of partitions that need to be split across all Sleeper
# tables in the instance, these invocations may queue up.
sleeper.partition.splitting.reserved.concurrency=10
# This is the default value of the partition splitting threshold. Partitions with more than the
# following number of records in will be split. This value can be overridden on a per-table basis.
sleeper.default.partition.splitting.threshold=1000000000
## The following properties relate to garbage collection.
# The frequency in minutes with which the garbage collector lambda is run.
sleeper.gc.period.minutes=15
# The timeout in seconds for the garbage collector lambda.
sleeper.gc.lambda.timeout.seconds=840
# The amount of memory in MB for the lambda function used to perform garbage collection.
sleeper.gc.memory.mb=4096
# The reserved concurrency for the garbage collection lambda.
# See reserved concurrency overview at:
# https://docs.aws.amazon.com/lambda/latest/dg/configuration-concurrency.html
# sleeper.gc.concurrency.reserved=
# The maximum given concurrency allowed for the garbage collection lambda.
# See maximum concurrency overview at:
# https://aws.amazon.com/blogs/compute/introducing-maximum-concurrency-of-aws-lambda-functions-when-using-amazon-sqs-as-an-event-source/
sleeper.gc.concurrency.max=10
# The number of tables to perform garbage collection for in a single invocation. This will be the
# batch size for a lambda as an SQS FIFO event source. This can be a maximum of 10.
sleeper.gc.table.batch.size=1
# Whether to perform garbage collection for offline tables.
sleeper.gc.offline.enabled=false
# The size of the batch of files ready for garbage collection requested from the State Store.
sleeper.gc.batch.size=2000
# A file will not be deleted until this number of minutes have passed after it has been marked as
# ready for garbage collection. The reason for not deleting files immediately after they have been
# marked as ready for garbage collection is that they may still be in use by queries. This property
# can be overridden on a per-table basis.
sleeper.default.gc.delay.minutes=15
## The following properties relate to compactions.
# The name of the repository for the compaction container. The Docker image from the
# compaction-job-execution module should have been uploaded to an ECR repository of this name in this
# account.
sleeper.compaction.repo=<insert-unique-sleeper-id>/compaction-job-execution
# The number of tables to perform compaction job creation for in a single invocation. This will be the
# batch size for a lambda as an SQS FIFO event source. This can be a maximum of 10.
sleeper.compaction.job.creation.batch.size=1
# The number of finished compaction commits to gather in the batcher before committing to the state
# store. This will be the batch size for a lambda as an SQS event source.
# This can be a maximum of 10,000. In practice the effective maximum is limited by the number of
# messages that fit in a synchronous lambda invocation payload, see the AWS documentation:
# https://docs.aws.amazon.com/lambda/latest/dg/gettingstarted-limits.html
sleeper.compaction.job.commit.batch.size=1000
# The time in seconds that the batcher will wait for compaction commits to appear if the batch size is
# not filled. This will be set in the SQS event source for the lambda. This can be a maximum of 300,
# i.e. 5 minutes.
sleeper.compaction.job.commit.batching.window.seconds=30
# The visibility timeout for the queue of compaction jobs.
sleeper.compaction.queue.visibility.timeout.seconds=900
# The visibility timeout for the queue of pending compaction job batches.
sleeper.compaction.pending.queue.visibility.timeout.seconds=900
# The frequency, in seconds, with which change message visibility requests are sent to extend the
# visibility of messages on the compaction job queue so that they are not processed by other
# processes.
# This should be less than the value of sleeper.compaction.queue.visibility.timeout.seconds.
sleeper.compaction.keepalive.period.seconds=300
# The delay in seconds until a failed compaction job becomes visible on the compaction job queue and
# can be processed again.
sleeper.compaction.job.failed.visibility.timeout.seconds=60
# The time in seconds for a compaction task to wait for a compaction job to appear on the SQS queue
# (must be <= 20).
# When a compaction task waits for compaction jobs to appear on the SQS queue, if the task receives no
# messages in the time defined by this property, it will try to wait for a message again.
sleeper.compaction.task.wait.time.seconds=20
# Set to true if compaction tasks should wait for input files to be assigned to a compaction job
# before starting it. The compaction task will poll the state store for whether the input files have
# been assigned to the job, and will only start once this has occurred.
# This prevents invalid compaction jobs from being run, particularly in the case where the compaction
# job creator runs again before the input files are assigned.
# This also causes compaction tasks to wait idle while input files are assigned, and puts extra load
# on the state store when there are many compaction tasks.
# If this is false, any created job will be executed, and will only be validated when committed to the
# state store.
sleeper.compaction.task.wait.for.input.file.assignment=false
# The time in seconds for a compaction task to wait after receiving no compaction jobs before
# attempting to receive a message again.
# When a compaction task waits for compaction jobs to appear on the SQS queue, if the task receives no
# messages in the time defined by the property "sleeper.compaction.task.wait.time.seconds", it will
# wait for a number of seconds defined by this property, then try to receive a message again.
sleeper.compaction.task.delay.before.retry.seconds=10
# The total time in seconds that a compaction task can be idle before it is terminated.
# When there are no compaction jobs available on the SQS queue, and SQS returns no jobs, the task will
# check whether this idle time has elapsed since the last time it finished a job. If so, the task will