## The following properties are commonly used throughout Sleeper.
# A string to uniquely identify this deployment. This should be no longer than 20 chars. It should be
# globally unique as it will be used to name AWS resources such as S3 buckets.
sleeper.id=full-example
# The S3 bucket containing the jar files of the Sleeper components.
sleeper.jars.bucket=the name of the bucket containing your jars, e.g. sleeper-<insert-unique-name-here>-jars
# A comma-separated list of the jars containing application specific iterator code. These jars are
# assumed to be in the bucket given by sleeper.jars.bucket, e.g. if that bucket contains two iterator
# jars called iterator1.jar and iterator2.jar then the property should be
# 'sleeper.userjars=iterator1.jar,iterator2.jar'.
# sleeper.userjars=
# A name for a tag to identify the stack that deployed a resource. This will be set for all AWS
# resources, to the ID of the CDK stack that they are deployed under. This can be used to organise the
# cost explorer for billing.
sleeper.stack.tag.name=DeploymentStack
# Whether to keep the sleeper table bucket, Dynamo tables, query results bucket, etc., when the
# instance is destroyed.
sleeper.retain.infra.after.destroy=true
# The optional stacks to deploy. Not case sensitive.
# Valid values: [IngestStack, IngestBatcherStack, EmrServerlessBulkImportStack, EmrBulkImportStack,
# PersistentEmrBulkImportStack, EksBulkImportStack, EmrStudioStack, QueryStack, WebSocketQueryStack,
# AthenaStack, KeepLambdaWarmStack, CompactionStack, GarbageCollectorStack, PartitionSplittingStack,
# DashboardStack, TableMetricsStack]
sleeper.optional.stacks=IngestStack,IngestBatcherStack,EmrServerlessBulkImportStack,EmrStudioStack,QueryStack,AthenaStack,CompactionStack,GarbageCollectorStack,PartitionSplittingStack,DashboardStack,TableMetricsStack
# The deployment type for AWS Lambda. Not case sensitive.
# Valid values: [jar, container]
sleeper.lambda.deploy.type=jar
# The AWS account number. This is the AWS account that the instance will be deployed to.
sleeper.account=1234567890
# The AWS region to deploy to.
sleeper.region=eu-west-2
# The id of the VPC to deploy to.
sleeper.vpc=1234567890
# Whether to check that the VPC that the instance is deployed to has an S3 endpoint. If there is no S3
# endpoint then the NAT costs can be very significant.
sleeper.vpc.endpoint.check=true
# A comma-separated list of subnets to deploy to. ECS tasks will be run across multiple subnets. EMR
# clusters will be deployed in a subnet chosen when the cluster is created.
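# For example, to run tasks across two subnets (hypothetical subnet IDs):
# sleeper.subnets=subnet-abcdefgh,subnet-01234567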
sleeper.subnets=subnet-abcdefgh
# The Hadoop filesystem used to connect to S3.
sleeper.filesystem=s3a://
# An email address used by the TopicStack to publish SNS notifications of errors.
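# For example (hypothetical address): sleeper.errors.email=alerts@example.com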
# sleeper.errors.email=
# The length of time in days that CloudWatch logs from lambda functions, ECS containers, etc., are
# retained.
# See https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-logs-loggroup.html
# for valid options.
# Use -1 to indicate infinite retention.
sleeper.log.retention.days=30
# Used to set the value of fs.s3a.connection.maximum on the Hadoop configuration. This controls the
# maximum number of http connections to S3.
# See https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/performance.html
sleeper.fs.s3a.max-connections=100
# Used to set the value of fs.s3a.block.size on the Hadoop configuration. Uploads to S3 happen in
# blocks, and this sets the size of blocks. If a larger value is used, then more data is buffered
# before the upload begins.
# See https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/performance.html
sleeper.fs.s3a.upload.block.size=32M
# The version of Fargate to use.
sleeper.fargate.version=1.4.0
# The amount of memory in MB for the lambda that creates ECS tasks to execute compaction and ingest
# jobs.
sleeper.task.runner.memory.mb=1024
# The timeout in seconds for the lambda that creates ECS tasks to execute compaction jobs and ingest
# jobs.
# This must be >0 and <= 900.
sleeper.task.runner.timeout.seconds=900
# If true, properties will be reloaded every time a long running job is started or a lambda is run.
# This will mainly be used in test scenarios to ensure properties are up to date.
sleeper.properties.force.reload=false
# If set, this property will be used as a prefix for the names of ECR repositories. If unset, then the
# instance ID will be used to determine the names instead.
# Note: This is only used by the deployment scripts to upload Docker images, not the CDK. We may add
# the ability to use this in the CDK in the future.
# sleeper.ecr.repository.prefix=
# A comma-separated list of up to 5 security group IDs to be used when running ECS tasks.
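# For example (hypothetical security group IDs):
# sleeper.ecs.security.groups=sg-0123456789abcdef0,sg-0fedcba9876543210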
# sleeper.ecs.security.groups=
# Default value for the reserved concurrency for each lambda in the Sleeper instance that scales
# according to the number of Sleeper tables.
# The state store committer lambda is an exception to this, as it has reserved concurrency by default.
# This is set in the property sleeper.statestore.committer.concurrency.reserved. Other lambdas are
# present that do not scale by the number of Sleeper tables, and are not set from this property.
# By default no concurrency is reserved for the lambdas. Each lambda also has its own property that
# overrides the value found here.
# See reserved concurrency overview at:
# https://docs.aws.amazon.com/lambda/latest/dg/configuration-concurrency.html
# sleeper.default.lambda.concurrency.reserved=
# Default value for the maximum concurrency for each lambda in the Sleeper instance that scales
# according to the number of Sleeper tables.
# Other lambdas are present that do not scale by the number of Sleeper tables, and are not set from
# this property.
# By default the maximum concurrency is set to 10, which is enough for 10 online tables. If there are
# more online tables, this number may need to be increased. Each lambda also has its own property that
# overrides the value found here.
# See maximum concurrency overview at:
# https://aws.amazon.com/blogs/compute/introducing-maximum-concurrency-of-aws-lambda-functions-when-using-amazon-sqs-as-an-event-source/
sleeper.default.lambda.concurrency.max=10
## The following properties relate to handling the state of Sleeper tables.
# Default value for amount of memory in MB for each lambda that holds the state of Sleeper tables in
# memory. These use a state store provider which caches a number of tables at once, set in
# `sleeper.statestore.provider.cache.size`. Not all lambdas are covered by this, e.g. see
# `sleeper.batch.table.lambdas.memory.mb`.
sleeper.default.table.state.lambda.memory.mb=4096
# The amount of memory in MB for lambdas that create batches of tables to run some operation against,
# e.g. create compaction jobs, run garbage collection, perform partition splitting.
sleeper.batch.table.lambdas.memory.mb=1024
# The timeout in seconds for lambdas that create batches of tables to run some operation against, e.g.
# create compaction jobs, run garbage collection, perform partition splitting.
sleeper.batch.table.lambdas.timeout.seconds=60
# The timeout in minutes for when the table properties provider cache should be cleared, forcing table
# properties to be reloaded from S3.
sleeper.cache.table.properties.provider.timeout.minutes=60
# The maximum number of state stores cached by a state store provider. If a state store is needed
# and the cache is full, the oldest state store in the cache will be removed to make space.
sleeper.statestore.provider.cache.size=10
# This specifies whether point in time recovery is enabled for the DynamoDB state store. This is set
# on the DynamoDB tables.
sleeper.statestore.dynamo.pointintimerecovery=false
# This specifies whether point in time recovery is enabled for the S3 state store. This is set on the
# revision DynamoDB table.
sleeper.statestore.s3.dynamo.pointintimerecovery=false
# The number of tables to create transaction log snapshots for in a single invocation. This will be
# the batch size for a lambda as an SQS FIFO event source. This can be a maximum of 10.
sleeper.statestore.snapshot.creation.batch.size=1
# The frequency in seconds with which the transaction log snapshot creation lambda is run.
sleeper.statestore.snapshot.creation.lambda.period.seconds=60
# The timeout in seconds after which to terminate the transaction log snapshot creation lambda.
sleeper.statestore.snapshot.creation.lambda.timeout.seconds=900
# The amount of memory in MB for the transaction log snapshot creation lambda.
sleeper.statestore.snapshot.creation.memory.mb=4096
# The reserved concurrency for the snapshot creation lambda.
# See reserved concurrency overview at:
# https://docs.aws.amazon.com/lambda/latest/dg/configuration-concurrency.html
# sleeper.statestore.snapshot.creation.concurrency.reserved=
# The maximum given concurrency allowed for the snapshot creation lambda.
# See maximum concurrency overview at:
# https://aws.amazon.com/blogs/compute/introducing-maximum-concurrency-of-aws-lambda-functions-when-using-amazon-sqs-as-an-event-source/
sleeper.statestore.snapshot.creation.concurrency.max=10
# The number of tables to delete old transaction log snapshots for in a single invocation. This will
# be the batch size for a lambda as an SQS FIFO event source. This can be a maximum of 10.
sleeper.statestore.snapshot.deletion.batch.size=1
# The frequency in minutes with which the transaction log snapshot deletion lambda is run.
sleeper.statestore.snapshot.deletion.lambda.period.minutes=60
# The reserved concurrency for the snapshot deletion lambda.
# See reserved concurrency overview at:
# https://docs.aws.amazon.com/lambda/latest/dg/configuration-concurrency.html
# sleeper.statestore.snapshot.deletion.concurrency.reserved=
# The maximum given concurrency allowed for the snapshot deletion lambda.
# See maximum concurrency overview at:
# https://aws.amazon.com/blogs/compute/introducing-maximum-concurrency-of-aws-lambda-functions-when-using-amazon-sqs-as-an-event-source/
sleeper.statestore.snapshot.deletion.concurrency.max=10
# The number of tables to delete old transaction log transactions for in a single invocation. This
# will be the batch size for a lambda as an SQS FIFO event source. This can be a maximum of 10.
sleeper.statestore.transaction.deletion.batch.size=1
# The frequency in minutes with which the transaction log transaction deletion lambda is run.
sleeper.statestore.transaction.deletion.lambda.period.minutes=60
# The reserved concurrency for the transaction deletion lambda.
# See reserved concurrency overview at:
# https://docs.aws.amazon.com/lambda/latest/dg/configuration-concurrency.html
# sleeper.statestore.transaction.deletion.concurrency.reserved=
# The maximum given concurrency allowed for the transaction deletion lambda.
# See maximum concurrency overview at:
# https://aws.amazon.com/blogs/compute/introducing-maximum-concurrency-of-aws-lambda-functions-when-using-amazon-sqs-as-an-event-source/
sleeper.statestore.transaction.deletion.concurrency.max=10
# The maximum timeout for the transaction deletion lambda in seconds.
sleeper.statestore.transaction.deletion.lambda.timeout.seconds=900
# The reserved concurrency for the lambda that follows the state store transaction log to trigger
# updates.
# See reserved concurrency overview at:
# https://docs.aws.amazon.com/lambda/latest/dg/configuration-concurrency.html
# sleeper.statestore.transaction.follower.concurrency.reserved=
# The maximum given concurrency allowed for the lambda that follows the state store transaction log to
# trigger updates.
# See maximum concurrency overview at:
# https://aws.amazon.com/blogs/compute/introducing-maximum-concurrency-of-aws-lambda-functions-when-using-amazon-sqs-as-an-event-source/
sleeper.statestore.transaction.follower.concurrency.max=10
# The maximum timeout in seconds for the lambda that follows the state store transaction log to
# trigger updates.
sleeper.statestore.transaction.follower.lambda.timeout.seconds=900
# The amount of memory in MB for the lambda that follows the state store transaction log to trigger
# updates.
sleeper.statestore.transaction.follower.memory.mb=4096
# This specifies whether point in time recovery is enabled for the Sleeper table index. This is set on
# the DynamoDB tables.
sleeper.tables.index.dynamo.pointintimerecovery=false
# This specifies whether queries and scans against the table index DynamoDB tables are strongly
# consistent.
sleeper.tables.index.dynamo.consistent.reads=true
# The amount of memory in MB for the lambda that commits state store updates.
sleeper.statestore.committer.lambda.memory.mb=4096
# The timeout for the lambda that commits state store updates in seconds.
sleeper.statestore.committer.lambda.timeout.seconds=900
# The number of state store updates to be sent to the state store committer lambda in one invocation.
# This will be the batch size for a lambda as an SQS FIFO event source. This can be a maximum of 10.
sleeper.statestore.committer.batch.size=10
# The reserved concurrency for the state store committer lambda.
# Presently this value defaults to 10 to align with expectations around table efficiency.
# This is to ensure that state store operations can still be applied to at least 10 tables, even when
# concurrency is used up in the account.
# See reserved concurrency overview at:
# https://docs.aws.amazon.com/lambda/latest/dg/configuration-concurrency.html
sleeper.statestore.committer.concurrency.reserved=10
# The maximum given concurrency allowed for the state store committer lambda.
# See maximum concurrency overview at:
# https://aws.amazon.com/blogs/compute/introducing-maximum-concurrency-of-aws-lambda-functions-when-using-amazon-sqs-as-an-event-source/
sleeper.statestore.committer.concurrency.max=10
## The following properties relate to standard ingest.
# The name of the ECR repository for the ingest container. The Docker image from the ingest module
# should have been uploaded to an ECR repository of this name in this account.
sleeper.ingest.repo=<insert-unique-sleeper-id>/ingest
# The maximum number of concurrent ECS tasks to run.
sleeper.ingest.max.concurrent.tasks=200
# The frequency in minutes with which an EventBridge rule runs to trigger a lambda that, if necessary,
# runs more ECS tasks to perform ingest jobs.
sleeper.ingest.task.creation.period.minutes=1
# The frequency, in seconds, with which change message visibility requests are sent to extend the
# visibility of messages on the ingest queue so that they are not processed by other processes.
# This should be less than the value of sleeper.ingest.queue.visibility.timeout.seconds.
sleeper.ingest.keepalive.period.seconds=300
# The visibility timeout in seconds for the standard ingest job queue. This should be greater than
# sleeper.ingest.keepalive.period.seconds.
sleeper.ingest.queue.visibility.timeout.seconds=900
# This sets the value of fs.s3a.experimental.input.fadvise on the Hadoop configuration used to read
# and write files to and from S3 in ingest jobs. Changing this value allows you to fine-tune how files
# are read. Possible values are "normal", "sequential" and "random". More information is available
# here:
# https://hadoop.apache.org/docs/current/hadoop-aws/tools/hadoop-aws/performance.html#fadvise.
sleeper.ingest.fs.s3a.experimental.input.fadvise=sequential
# The amount of CPU used by Fargate tasks that perform ingest jobs.
# Note that only certain combinations of CPU and memory are valid.
# See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html for valid
# options.
sleeper.ingest.task.cpu=2048
# The amount of memory in MB used by Fargate tasks that perform ingest jobs.
# Note that only certain combinations of CPU and memory are valid.
# See https://docs.aws.amazon.com/AmazonECS/latest/developerguide/task-cpu-memory-error.html for valid
# options.
sleeper.ingest.task.memory.mb=4096
# The frequency in seconds with which ingest tasks refresh their view of the partitions.
# (NB: refreshes only happen once a batch of data has been written, so this is a lower bound on the
# time between refreshes.)
sleeper.ingest.partition.refresh.period=120
# A comma-separated list of buckets that contain files to be ingested via ingest jobs. The buckets
# should already exist, i.e. they will not be created as part of the cdk deployment of this instance
# of Sleeper. The ingest and bulk import stacks will be given read access to these buckets so that
# they can consume data from them.
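# For example (hypothetical bucket names):
# sleeper.ingest.source.bucket=my-ingest-source-bucket-1,my-ingest-source-bucket-2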
# sleeper.ingest.source.bucket=
# Flag to enable/disable storage of tracking information for ingest jobs and tasks.
sleeper.ingest.tracker.enabled=true
# The time to live in seconds for ingest job updates in the job tracker. Default is 1 week.
# The expiry time is fixed when an update is saved to the store, so changing this will only affect new
# data.
sleeper.ingest.job.status.ttl=604800
# The time to live in seconds for ingest task updates in the job tracker. Default is 1 week.
# The expiry time is fixed when an update is saved to the store, so changing this will only affect new
# data.
sleeper.ingest.task.status.ttl=604800
# The time in seconds to wait for ingest jobs to appear on the queue before an ingest task terminates.
# Must be >= 0 and <= 20.
# See also
# https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSDeveloperGuide/sqs-short-and-long-polling.html
sleeper.ingest.job.queue.wait.time=20
# The maximum number of records written to local file in an ingest job. (Records are written in sorted
# order to local disk before being uploaded to S3. Increasing this value increases the amount of time
# before data is visible in the system, but increases the number of records written to S3 in a batch,
# therefore reducing costs.)
# (arraylist-based ingest only)
sleeper.ingest.max.local.records=100000000
# The maximum number of records to read into memory in an ingest job. (Up to
# sleeper.ingest.memory.max.batch.size records are read into memory before being sorted and written to
# disk. This process is repeated until sleeper.ingest.max.local.records records have been written to
# local files. Then the sorted files are merged and the data is written to sorted files in S3.)
# (arraylist-based ingest only)
sleeper.ingest.memory.max.batch.size=1000000
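# As an illustration only: with the default values above, each batch of
# sleeper.ingest.memory.max.batch.size (1,000,000) records becomes one sorted local file, so roughly
# 100,000,000 / 1,000,000 = 100 sorted local files are merged together for each upload to S3.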
# The number of bytes to allocate to the Arrow working buffer. This buffer is used for sorting and
# other sundry activities. Note that this is off-heap memory, which is in addition to the memory
# assigned to the JVM.
# (arrow-based ingest only) [256MB]
sleeper.ingest.arrow.working.buffer.bytes=268435456
# The number of bytes to allocate to the Arrow batch buffer, which is used to hold the records before
# they are written to local disk. A larger value means that the local disk holds fewer, larger files,
# which are more efficient to merge together during an upload to S3. Larger values may require a
# larger working buffer. Note that this is off-heap memory, which is in addition to the memory
# assigned to the JVM.
# (arrow-based ingest only) [1GB]
sleeper.ingest.arrow.batch.buffer.bytes=1073741824
# The maximum number of bytes to store on the local disk before uploading to the main Sleeper store. A
# larger value reduces the number of S3 PUTs that are required to upload the data to S3 and results
# in fewer files per partition.
# (arrow-based ingest only) [2GB]
sleeper.ingest.arrow.max.local.store.bytes=2147483648
# The number of records to write at once into an Arrow file in the local store. A single Arrow file
# contains many of these micro-batches and so this parameter does not significantly affect the final
# size of the Arrow file. Larger values may require a larger working buffer.
# (arrow-based ingest only) [1K]
sleeper.ingest.arrow.max.single.write.to.file.records=1024
# The implementation of the async S3 client to use for upload during ingest.
# Valid values are 'java' or 'crt'. This determines the implementation of S3AsyncClient that gets
# used.
# With 'java' it makes a single PutObject request for each file.
# With 'crt' it uses the AWS Common Runtime (CRT) to make multipart uploads.
# Note that the CRT option is recommended. Using the Java option may cause failures if any file is
# >5GB in size, and will lead to the following warning:
# "The provided S3AsyncClient is not an instance of S3CrtAsyncClient, and thus multipart
# upload/download feature is not enabled and resumable file upload is not supported. To benefit from
# maximum throughput, consider using S3AsyncClient.crtBuilder().build() instead."
# (async partition file writer only)
sleeper.ingest.async.client.type=crt
# The part size in bytes to use for multipart uploads.
# (CRT async ingest only) [128MB]
sleeper.ingest.async.crt.part.size.bytes=134217728
# The target throughput for multipart uploads, in GB/s. Determines how many parts should be uploaded
# simultaneously.
# (CRT async ingest only)
sleeper.ingest.async.crt.target.throughput.gbps=10
# The amount of memory in MB for the lambda that receives submitted requests to ingest files.
sleeper.ingest.batcher.submitter.memory.mb=1024
# The timeout in seconds for the lambda that receives submitted requests to ingest files. Also used to
# define the visibility timeout for the batcher submit queue.
sleeper.ingest.batcher.submitter.timeout.seconds=20
# The amount of memory in MB for the lambda that creates ingest jobs from submitted file ingest
# requests.
sleeper.ingest.batcher.job.creation.memory.mb=1024
# The timeout in seconds for the lambda that creates ingest jobs from submitted file ingest requests.
sleeper.ingest.batcher.job.creation.timeout.seconds=900
# The rate at which the ingest batcher job creation lambda runs (in minutes, must be >=1).
sleeper.ingest.batcher.job.creation.period.minutes=1
## The following properties relate to bulk import, i.e. ingesting data using Spark jobs running on EMR
## or EKS.
##
## Note that on EMR, the total resource allocation must align with the instance types used for the
## cluster. For the maximum memory usage, combine the memory and memory overhead properties, and
## compare against the maximum memory allocation for YARN in the Hadoop task configuration:
##
## https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-hadoop-task-config.html
##
## As an example, if we use m7i.4xlarge for executor instances, that has a maximum allocation of 54272
## MiB, or 53 GiB. If we want 3 executors per instance, we can have 53 GiB / 3 = 18,090.666 MiB per
## executor. We can set the executor memory to 16 GiB, and the executor memory overhead to the
## remainder of that amount, which is 18,090 MiB - 16 GiB = 1,706 MiB, or 1.666 GiB. This is just above
## the default Spark memory overhead factor of 0.1, i.e. 16 GiB x 0.1 = 1.6 GiB.
##
## Also see EMR best practices:
##
## https://aws.github.io/aws-emr-best-practices/docs/bestpractices/Applications/Spark/best_practices/#bp-516----tune-driverexecutor-memory-cores-and-sparksqlshufflepartitions-to-fully-utilize-cluster-resources
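##
## As a rough illustration, the worked example above corresponds to the executor settings used
## further down, i.e. sleeper.bulk.import.emr.spark.executor.memory=16g with
## sleeper.bulk.import.emr.spark.executor.memory.overhead=1706m. If you change the executor
## instance types, re-derive these values in the same way.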
# The class to use to perform the bulk import. The default value below uses Spark Dataframes. There is
# an alternative option that uses RDDs (sleeper.bulkimport.runner.rdd.BulkImportJobRDDDriver).
sleeper.bulk.import.class.name=sleeper.bulkimport.runner.dataframelocalsort.BulkImportDataframeLocalSortDriver
# The compression codec for map status results. Used to set spark.shuffle.mapStatus.compression.codec.
# Stops "Decompression error: Version not supported" errors - only a value of "lz4" has been tested.
sleeper.bulk.import.emr.spark.shuffle.mapStatus.compression.codec=lz4
# If true then speculative execution of tasks will be performed. Used to set spark.speculation.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.speculation=false
# Fraction of tasks which must be complete before speculation is enabled for a particular stage. Used
# to set spark.speculation.quantile.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.spark.speculation.quantile=0.75
# The amount of memory in MB for lambda functions that start bulk import jobs.
sleeper.bulk.import.starter.memory.mb=4096
# The amount of memory allocated to a Spark executor. Used to set spark.executor.memory.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.executor.memory=16g
# The amount of memory allocated to the Spark driver. Used to set spark.driver.memory.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.driver.memory=16g
# The number of executors. Used to set spark.executor.instances.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.executor.instances=29
# The memory overhead for an executor. Used to set spark.executor.memoryOverhead.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.executor.memory.overhead=1706m
# The memory overhead for the driver. Used to set spark.driver.memoryOverhead.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.driver.memory.overhead=1706m
# The default parallelism for Spark jobs. Used to set spark.default.parallelism.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.default.parallelism=290
# The number of partitions used in a Spark SQL/dataframe shuffle operation. Used to set
# spark.sql.shuffle.partitions.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.sql.shuffle.partitions=290
# (Non-persistent or persistent EMR mode only) An EC2 keypair to use for the EC2 instances. Specifying
# this will allow you to SSH to the nodes in the cluster while it's running.
sleeper.bulk.import.emr.keypair.name=my-key
# (Non-persistent or persistent EMR mode only) Specifying this security group causes the group to be
# added to the EMR master's list of security groups.
# sleeper.bulk.import.emr.master.additional.security.group=
# (Non-persistent or persistent EMR mode only) The number of cores used by an executor. Used to set
# spark.executor.cores.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.executor.cores=5
# (Non-persistent or persistent EMR mode only) The number of cores used by the driver. Used to set
# spark.driver.cores.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.driver.cores=5
# (Non-persistent or persistent EMR mode only) The default timeout for network interactions in Spark.
# Used to set spark.network.timeout.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.network.timeout=800s
# (Non-persistent or persistent EMR mode only) The interval between heartbeats from executors to the
# driver. Used to set spark.executor.heartbeatInterval.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.executor.heartbeat.interval=60s
# (Non-persistent or persistent EMR mode only) Whether Spark should use dynamic allocation to scale
# resources up and down. Used to set spark.dynamicAllocation.enabled.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.dynamic.allocation.enabled=false
# (Non-persistent or persistent EMR mode only) The fraction of heap space used for execution and
# storage. Used to set spark.memory.fraction.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.memory.fraction=0.80
# (Non-persistent or persistent EMR mode only) The amount of storage memory immune to eviction,
# expressed as a fraction of the heap space used for execution and storage. Used to set
# spark.memory.storageFraction.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.memory.storage.fraction=0.30
# (Non-persistent or persistent EMR mode only) JVM options passed to the executors. Used to set
# spark.executor.extraJavaOptions.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.executor.extra.java.options=-XX:+UseG1GC -XX:+UnlockDiagnosticVMOptions -XX:+G1SummarizeConcMark -XX:InitiatingHeapOccupancyPercent=35 -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p'
# (Non-persistent or persistent EMR mode only) JVM options passed to the driver. Used to set
# spark.driver.extraJavaOptions.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.driver.extra.java.options=-XX:+UseG1GC -XX:+UnlockDiagnosticVMOptions -XX:+G1SummarizeConcMark -XX:InitiatingHeapOccupancyPercent=35 -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCDateStamps -XX:OnOutOfMemoryError='kill -9 %p'
# (Non-persistent or persistent EMR mode only) The maximum number of executor failures before YARN can
# fail the application. Used to set spark.yarn.scheduler.reporterThread.maxFailures.
# See
# https://aws.amazon.com/blogs/big-data/best-practices-for-successfully-managing-memory-for-apache-spark-applications-on-amazon-emr/.
sleeper.bulk.import.emr.spark.yarn.scheduler.reporter.thread.max.failures=5
# (Non-persistent or persistent EMR mode only) The storage to use for temporary caching. Used to set
# spark.storage.level.
# See
# https://aws.amazon.com/blogs/big-data/best-practices-for-successfully-managing-memory-for-apache-spark-applications-on-amazon-emr/.
sleeper.bulk.import.emr.spark.storage.level=MEMORY_AND_DISK_SER
# (Non-persistent or persistent EMR mode only) Whether to compress serialized RDD partitions. Used to
# set spark.rdd.compress.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.rdd.compress=true
# (Non-persistent or persistent EMR mode only) Whether to compress map output files. Used to set
# spark.shuffle.compress.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.shuffle.compress=true
# (Non-persistent or persistent EMR mode only) Whether to compress data spilled during shuffles. Used
# to set spark.shuffle.spill.compress.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.spark.shuffle.spill.compress=true
# (Non-persistent or persistent EMR mode only) The size of the EBS volume in gibibytes (GiB).
# This can be a number from 10 to 1024.
sleeper.bulk.import.emr.ebs.volume.size.gb=256
# (Non-persistent or persistent EMR mode only) The type of the EBS volume.
# Valid values are 'gp2', 'gp3', 'io1', 'io2'.
sleeper.bulk.import.emr.ebs.volume.type=gp2
# (Non-persistent or persistent EMR mode only) The number of EBS volumes per instance.
# This can be a number from 1 to 25.
sleeper.bulk.import.emr.ebs.volumes.per.instance=4
# ARN of the KMS Key used to encrypt data at rest on the local file system in AWS EMR.
# See
# https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-encryption-enable.html#emr-encryption-create-keys.
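# For example (hypothetical key ARN):
# sleeper.bulk.import.emr.ebs.encryption.key.arn=arn:aws:kms:eu-west-2:123456789012:key/11111111-2222-3333-4444-555555555555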
# sleeper.bulk.import.emr.ebs.encryption.key.arn=
# The architecture for EMR Serverless to use. X86_64 or ARM64 (Coming soon)
sleeper.bulk.import.emr.serverless.architecture=X86_64
# The version of EMR Serverless to use.
sleeper.bulk.import.emr.serverless.release=emr-7.2.0
# The name of the repository for the EMR serverless container. The Docker image from the bulk-import
# module should have been uploaded to an ECR repository of this name in this account.
sleeper.bulk.import.emr.serverless.repo=<insert-unique-sleeper-id>/bulk-import-runner-emr-serverless
# Set to true to allow an EMR Serverless Application to start automatically when a job is submitted.
sleeper.bulk.import.emr.serverless.autostart.enabled=true
# Set to true to allow an EMR Serverless Application to stop automatically when there are no jobs to
# process.
# Turning this off with pre-initialised capacity turned off is not recommended.
sleeper.bulk.import.emr.serverless.autostop.enabled=true
# The number of minutes of inactivity before EMR Serverless stops the application.
sleeper.bulk.import.emr.serverless.autostop.timeout=15
# The number of cores used by a Serverless executor. Used to set spark.executor.cores.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.executor.cores=4
# The amount of memory allocated to a Serverless executor. Used to set spark.executor.memory.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.executor.memory=16G
# The amount of storage allocated to a Serverless executor.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.emr-serverless.executor.disk=200G
# The number of executors to be used with Serverless. Used to set spark.executor.instances.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.executor.instances=36
# The number of cores used by the Serverless Spark driver. Used to set spark.driver.cores.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.driver.cores=4
# The amount of memory allocated to the Serverless Spark driver. Used to set spark.driver.memory.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.driver.memory=16G
# The path to JAVA_HOME to be used by the custom image for bulk import.
sleeper.bulk.import.emr.serverless.spark.executorEnv.JAVA_HOME=/usr/lib/jvm/jre-11
# Whether Spark should use dynamic allocation to scale resources up and down. Used to set
# spark.dynamicAllocation.enabled. See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.dynamic.allocation.enabled=false
# Whether to compress serialized RDD partitions. Used to set spark.rdd.compress.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.rdd.compress=true
# Whether to compress map output files. Used to set spark.shuffle.compress.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.shuffle.compress=true
# Whether to compress data spilled during shuffles. Used to set spark.shuffle.spill.compress.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.shuffle.spill.compress=true
# The default parallelism for Spark jobs. Used to set spark.default.parallelism.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.default.parallelism=288
# The number of partitions used in a Spark SQL/dataframe shuffle operation. Used to set
# spark.sql.shuffle.partitions.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.sql.shuffle.partitions=288
# The default timeout for network interactions in Spark. Used to set spark.network.timeout.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.network.timeout=800s
# The interval between heartbeats from executors to the driver. Used to set
# spark.executor.heartbeatInterval.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.executor.heartbeat.interval=60s
# The fraction of heap space used for execution and storage. Used to set spark.memory.fraction.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.memory.fraction=0.80
# The amount of storage memory immune to eviction, expressed as a fraction of the heap space used for
# execution and storage. Used to set spark.memory.storageFraction.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.memory.storage.fraction=0.30
# If true then speculative execution of tasks will be performed. Used to set spark.speculation.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.speculation=false
# Fraction of tasks which must be complete before speculation is enabled for a particular stage. Used
# to set spark.speculation.quantile.
# See https://spark.apache.org/docs/latest/configuration.html.
sleeper.bulk.import.emr.serverless.spark.speculation.quantile=0.75
# The compression codec for map status results. Used to set spark.shuffle.mapStatus.compression.codec.
# Stops "Decompression error: Version not supported" errors - only a value of "lz4" has been tested.
sleeper.bulk.import.emr.serverless.spark.shuffle.mapStatus.compression.codec=lz4
# Set to true to enable the pre-initialise capacity option for the EMR Serverless application.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.enabled=false
# The number of executors to pre-initialise.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.executor.count=72
# The number of vCPUs per executor for the pre-initialise capacity.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.executor.cores=4vCPU
# The amount of memory per executor for the pre-initialise capacity.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.executor.memory=18GB
# The amount of storage per executor for the pre-initialise capacity.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.executor.disk=200GB
# The number of drivers to pre-initialise.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.driver.count=5
# The number of vCPUs per driver for the pre-initialise capacity.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.driver.cores=4vCPU
# The amount of memory per driver for the pre-initialise capacity.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.driver.memory=18GB
# The amount of storage per driver for the pre-initialise capacity.
# See: https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/pre-init-capacity.html
sleeper.bulk.import.emr.serverless.initial.capacity.driver.disk=20GB
# (Non-persistent EMR mode only) The default EMR release label to be used when creating an EMR cluster
# for bulk importing data using Spark running on EMR.
# This property is a default which can be overridden by a table property or by a property in the bulk
# import job specification.
sleeper.default.bulk.import.emr.release.label=emr-7.2.0
# (Non-persistent EMR mode only) Which architecture to be used for EC2 instance types in the EMR
# cluster. Must be either "x86_64", "arm64" or "x86_64,arm64". For more information, see the Bulk
# import using EMR - Instance types section in docs/usage/ingest.md
sleeper.default.bulk.import.emr.instance.architecture=arm64
# (Non-persistent EMR mode only) The default EC2 x86_64 instance types and weights to be used for the
# master node of the EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/usage/ingest.md
sleeper.default.bulk.import.emr.master.x86.instance.types=m7i.xlarge
# (Non-persistent EMR mode only) The default EC2 x86_64 instance types and weights to be used for the
# executor nodes of the EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/usage/ingest.md
sleeper.default.bulk.import.emr.executor.x86.instance.types=m7i.4xlarge
# (Non-persistent EMR mode only) The default EC2 ARM64 instance types and weights to be used for the
# master node of the EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/usage/ingest.md
sleeper.default.bulk.import.emr.master.arm.instance.types=m7g.xlarge
# (Non-persistent EMR mode only) The default EC2 ARM64 instance types and weights to be used for the
# executor nodes of the EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/usage/ingest.md
sleeper.default.bulk.import.emr.executor.arm.instance.types=m7g.4xlarge
# (Non-persistent EMR mode only) The default purchasing option to be used for the executor nodes of
# the EMR cluster.
# Valid values are ON_DEMAND or SPOT.
# This property is a default which can be overridden by a table property or by a property in the bulk
# import job specification.
sleeper.default.bulk.import.emr.executor.market.type=SPOT
# (Non-persistent EMR mode only) The default initial number of capacity units to provision as EC2
# instances for executors in the EMR cluster.
# This is measured in instance fleet capacity units. These are declared alongside the requested
# instance types, as each type will count for a certain number of units. By default the units are the
# number of instances.
# This property is a default which can be overridden by a table property or by a property in the bulk
# import job specification.
sleeper.default.bulk.import.emr.executor.initial.instances=2
# (Non-persistent EMR mode only) The default maximum number of capacity units to provision as EC2
# instances for executors in the EMR cluster.
# This is measured in instance fleet capacity units. These are declared alongside the requested
# instance types, as each type will count for a certain number of units. By default the units are the
# number of instances.
# This property is a default which can be overridden by a table property or by a property in the bulk
# import job specification.
sleeper.default.bulk.import.emr.executor.max.instances=10
# (Persistent EMR mode only) The EMR release used to create the persistent EMR cluster.
sleeper.bulk.import.persistent.emr.release.label=emr-7.2.0
# (Persistent EMR mode only) Which architecture to be used for EC2 instance types in the EMR cluster.
# Must be either "x86_64", "arm64" or "x86_64,arm64". For more information, see the Bulk import using
# EMR - Instance types section in docs/usage/ingest.md
sleeper.bulk.import.persistent.emr.instance.architecture=arm64
# (Persistent EMR mode only) The EC2 x86_64 instance types and weights used for the master node of the
# persistent EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/usage/ingest.md
sleeper.bulk.import.persistent.emr.master.x86.instance.types=m7i.xlarge
# (Persistent EMR mode only) The EC2 x86_64 instance types and weights used for the executor nodes of
# the persistent EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/usage/ingest.md
sleeper.bulk.import.persistent.emr.executor.x86.instance.types=m7i.4xlarge
# (Persistent EMR mode only) The EC2 ARM64 instance types and weights used for the master node of the
# persistent EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/usage/ingest.md
sleeper.bulk.import.persistent.emr.master.arm.instance.types=m7g.xlarge
# (Persistent EMR mode only) The EC2 ARM64 instance types and weights used for the executor nodes of
# the persistent EMR cluster.
# For more information, see the Bulk import using EMR - Instance types section in docs/usage/ingest.md
sleeper.bulk.import.persistent.emr.executor.arm.instance.types=m7g.4xlarge
# (Persistent EMR mode only) Whether the persistent EMR cluster should use managed scaling or not.
sleeper.bulk.import.persistent.emr.use.managed.scaling=true
# (Persistent EMR mode only) The minimum number of capacity units to provision as EC2 instances for
# executors in the persistent EMR cluster.
# This is measured in instance fleet capacity units. These are declared alongside the requested
# instance types, as each type will count for a certain number of units. By default the units are the
# number of instances.
# If managed scaling is not used then the cluster will be of fixed size, with a number of instances
# equal to this value.
sleeper.bulk.import.persistent.emr.min.capacity=1
# (Persistent EMR mode only) The maximum number of capacity units to provision as EC2 instances for
# executors in the persistent EMR cluster.
# This is measured in instance fleet capacity units. These are declared alongside the requested
# instance types, as each type will count for a certain number of units. By default the units are the
# number of instances.
# This value is only used if managed scaling is used.
sleeper.bulk.import.persistent.emr.max.capacity=10
# (Persistent EMR mode only) This controls the number of EMR steps that can run concurrently.
sleeper.bulk.import.persistent.emr.step.concurrency.level=2
# (EKS mode only) The name of the ECR repository where the Docker image for the bulk import container
# is stored.
sleeper.bulk.import.eks.repo=<insert-unique-sleeper-id>/bulk-import-runner
# (EKS mode only) Names of AWS IAM roles which should have access to administer the EKS cluster.
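# For example (hypothetical role names):
# sleeper.bulk.import.eks.cluster.admin.roles=my-admin-role,another-admin-role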
# sleeper.bulk.import.eks.cluster.admin.roles=
# (EKS mode only) Set to true if sleeper.bulk.import.eks.repo contains the image built with native
# Hadoop libraries. By default when deploying with the EKS stack enabled, an image will be built based
# on the official Spark Docker image, so this should be false.
sleeper.bulk.import.eks.is.native.libs.image=false
## The following properties relate to the splitting of partitions.
# The frequency in minutes with which the lambda runs to find partitions that need splitting and send
# jobs to the splitting lambda.
sleeper.partition.splitting.period.minutes=30
# When a partition needs splitting, a partition splitting job is created. This reads in the sketch
# files associated with the files in the partition in order to identify the median. This parameter
# controls the maximum number of files that are read in.
sleeper.partition.splitting.files.maximum=50
# The number of tables to find partitions to split for in a single invocation. This will be the batch
# size for a lambda as an SQS FIFO event source. This can be a maximum of 10.
sleeper.partition.splitting.finder.batch.size=1
# The amount of memory in MB for the lambda function used to identify partitions that need to be
# split.
sleeper.partition.splitting.finder.memory.mb=4096
# The timeout in seconds for the lambda function used to identify partitions that need to be split.
sleeper.partition.splitting.finder.timeout.seconds=900
# The reserved concurrency for the find partitions to split lambda.
# See reserved concurrency overview at:
# https://docs.aws.amazon.com/lambda/latest/dg/configuration-concurrency.html
# sleeper.partition.splitting.finder.concurrency.reserved=
# The maximum given concurrency allowed for the find partitions to split lambda.
# See maximum concurrency overview at:
# https://aws.amazon.com/blogs/compute/introducing-maximum-concurrency-of-aws-lambda-functions-when-using-amazon-sqs-as-an-event-source/
sleeper.partition.splitting.finder.concurrency.max=10
# The amount of memory in MB for the lambda function used to split partitions.
sleeper.partition.splitting.memory.mb=4096
# The timeout in seconds for the lambda function used to split partitions.
sleeper.partition.splitting.timeout.seconds=900
# The number of lambda instances to reserve from your AWS account's quota for splitting partitions.
# Note that this will not provision instances until they are needed. Each time partition splitting
# runs, a separate lambda invocation will be made for each partition that needs to be split. If the
# reserved concurrency is less than the number of partitions that need to be split across all Sleeper
# tables in the instance, these invocations may queue up.
sleeper.partition.splitting.reserved.concurrency=10
# This is the default value of the partition splitting threshold. Partitions with more than the
# following number of records in will be split. This value can be overridden on a per-table basis.
sleeper.default.partition.splitting.threshold=1000000000
## The following properties relate to garbage collection.
# The frequency in minutes with which the garbage collector lambda is run.
sleeper.gc.period.minutes=15
# The timeout in seconds for the garbage collector lambda.
sleeper.gc.lambda.timeout.seconds=840
# The amount of memory in MB for the lambda function used to perform garbage collection.
sleeper.gc.memory.mb=4096
# The reserved concurrency for the garbage collection lambda.
# See reserved concurrency overview at:
# https://docs.aws.amazon.com/lambda/latest/dg/configuration-concurrency.html
# sleeper.gc.concurrency.reserved=
# The maximum given concurrency allowed for the garbage collection lambda.
# See maximum concurrency overview at:
# https://aws.amazon.com/blogs/compute/introducing-maximum-concurrency-of-aws-lambda-functions-when-using-amazon-sqs-as-an-event-source/
sleeper.gc.concurrency.max=10
# The number of tables to perform garbage collection for in a single invocation. This will be the
# batch size for a lambda as an SQS FIFO event source. This can be a maximum of 10.
sleeper.gc.table.batch.size=1
# Whether to perform garbage collection for offline tables.
sleeper.gc.offline.enabled=false
# The size of the batch of files ready for garbage collection requested from the State Store.
sleeper.gc.batch.size=2000
# A file will not be deleted until this number of minutes have passed after it has been marked as
# ready for garbage collection. The reason for not deleting files immediately after they have been
# marked as ready for garbage collection is that they may still be in use by queries. This property
# can be overridden on a per-table basis.
sleeper.default.gc.delay.minutes=15
## The following properties relate to compactions.
# The name of the repository for the compaction container. The Docker image from the
# compaction-job-execution module should have been uploaded to an ECR repository of this name in this
# account.
sleeper.compaction.repo=<insert-unique-sleeper-id>/compaction-job-execution
# The number of tables to perform compaction job creation for in a single invocation. This will be the
# batch size for a lambda as an SQS FIFO event source. This can be a maximum of 10.
sleeper.compaction.job.creation.batch.size=1
# The number of finished compaction commits to gather in the batcher before committing to the state
# store. This will be the batch size for a lambda as an SQS event source.
# This can be a maximum of 10,000. In practice the effective maximum is limited by the number of
# messages that fit in a synchronous lambda invocation payload, see the AWS documentation:
# https://docs.aws.amazon.com/lambda/latest/dg/gettingstarted-limits.html
sleeper.compaction.job.commit.batch.size=1000
# The time in seconds that the batcher will wait for compaction commits to appear if the batch size is
# not filled. This will be set in the SQS event source for the lambda. This can be a maximum of 300,
# i.e. 5 minutes.
sleeper.compaction.job.commit.batching.window.seconds=30
# The visibility timeout for the queue of compaction jobs.
sleeper.compaction.queue.visibility.timeout.seconds=900
# The visibility timeout for the queue of pending compaction job batches.
sleeper.compaction.pending.queue.visibility.timeout.seconds=900
# The frequency, in seconds, with which change message visibility requests are sent to extend the
# visibility of messages on the compaction job queue so that they are not processed by other
# processes.
# This should be less than the value of sleeper.compaction.queue.visibility.timeout.seconds.
sleeper.compaction.keepalive.period.seconds=300
# The delay in seconds until a failed compaction job becomes visible on the compaction job queue and
# can be processed again.
sleeper.compaction.job.failed.visibility.timeout.seconds=60
# The time in seconds for a compaction task to wait for a compaction job to appear on the SQS queue
# (must be <= 20).
# When a compaction task waits for compaction jobs to appear on the SQS queue, if the task receives no
# messages in the time defined by this property, it will try to wait for a message again.
sleeper.compaction.task.wait.time.seconds=20
# Set to true if compaction tasks should wait for input files to be assigned to a compaction job
# before starting it. The compaction task will poll the state store for whether the input files have
# been assigned to the job, and will only start once this has occurred.
# This prevents invalid compaction jobs from being run, particularly in the case where the compaction
# job creator runs again before the input files are assigned.
# This also causes compaction tasks to wait idle while input files are assigned, and puts extra load
# on the state store when there are many compaction tasks.
# If this is false, any created job will be executed, and will only be validated when committed to the
# state store.
sleeper.compaction.task.wait.for.input.file.assignment=false
# The time in seconds for a compaction task to wait after receiving no compaction jobs before
# attempting to receive a message again.
# When a compaction task waits for compaction jobs to appear on the SQS queue, if the task receives no
# messages in the time defined by the property "sleeper.compaction.task.wait.time.seconds", it will
# wait for a number of seconds defined by this property, then try to receive a message again.
sleeper.compaction.task.delay.before.retry.seconds=10
# The total time in seconds that a compaction task can be idle before it is terminated.
# When there are no compaction jobs available on the SQS queue, and SQS returns no jobs, the task will
# check whether this idle time has elapsed since the last time it finished a job. If so, the task will