From 043c9597d860b2f96ac1772537fef61ea24a8e49 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Bylica?= Date: Wed, 8 May 2024 12:06:21 +0200 Subject: [PATCH 01/16] gitignore /venv --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 682451b..e59c531 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ /.idea __pycache__ /corpus +/venv From 75eb1b2cc0fd366673ca65903b03fde9cf54d433 Mon Sep 17 00:00:00 2001 From: Alex Beregszaszi Date: Thu, 25 Apr 2024 16:38:50 +0200 Subject: [PATCH 02/16] Document dense encoding of invalid pushdata in EOFv0 --- spec/eofv0_verkle.md | 145 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) diff --git a/spec/eofv0_verkle.md b/spec/eofv0_verkle.md index f18952d..bf52d42 100644 --- a/spec/eofv0_verkle.md +++ b/spec/eofv0_verkle.md @@ -105,6 +105,151 @@ The same as above except encode the values as 6-bit numbers (minimum number of bits needed for encoding `32`). Such encoding lowers the size overhead from 3.1% to 2.3%. +### Encode only invalid pushdata (dense encoding) + +Alternate option is instead of encoding all valid `JUMPDEST` locations, to only encode invalid ones. + +This is beneficial if our assumption is correct that most contracts only contain a limited number +of offending cases. Our initial analysis suggests this is the case, e.g. Uniswap router has 9 cases, +one of the Arbitrum validator contracts has 6 cases. + +Since Solidity contracts have a trailing metadata, which contains a Keccak-256 (32-byte) hash of the +source, there is a 12% probability ($1 - (255/256)^{32}$) that at least one of the bytes of the hash +will contain the `0x5b` value, which gives our minimum probability of having at least one invalid `JUMPDEST`. + +Let's create a map of `invalid_jumpdests[chunk_no] = position_in_chunk`. We can densely encode this +map using techniques similar to *run-length encoding* to skip distances and delta-encode offsets. 
+ +In *scheme 1*, for each entry in `invalid_jumpdests`: +- 1-bit mode (`skip`, `value`) +- For skip-mode: + - 10-bit delta-encoding of `chunk_no` +- For value-mode: + - 4-bit delta-encoding of `chunk_no` + - 6-bit `position_in_chunk` + +Worst case encoding where each chunk contains an invalid `JUMPDEST`: +``` +total_chunk_count = 24576 / 32 = 768 +total_chunk_count * (1 + 4 + 6) / 8 = 1056 # bytes for the header +number_of_verkle_leafs = total_chunk_count / 32 = 33 +``` + +*Scheme 2* differs slightly: +- 1-bit mode (`skip`, `value`) +- For skip-mode: + - 10-bit delta-encoding of `chunk_no` +- For value-mode: + - 6-bit `position_in_chunk` + +Worst case encoding: +``` +total_chunk_count = 24576 / 32 = 768 +total_chunk_count * (1 + 6) / 8 = 672 # bytes for the header +number_of_verkle_leafs = total_chunk_count / 32 = 21 +``` + +The decision between *scheme 1* and *scheme 2*, as well as the best encoding sizes, can be determined +through analysing existing code. + +#### Header location + +It is possible to place above as part of the "EOFv0" header, but given the upper bound of number of chunks occupied is low (33 vs 21), +it is also possible to make this part of the Verkle account header. + +This second option allows for the simplification of the `code_size` value, as it does not need to change. + +#### Runtime after Verkle + +During runtime execution two checks must be done in this order: +1) Check if the destination is on the invalid list, and abort if so. +2) Check if the value in the chunk is an actual `JUMPDEST`, and abort if not. + +It is possible to reconstruct sparse account code prior to execution with all the submitted chunks of the transaction +and perform `JUMPDEST`-validation to build up a relevant *valid `JUMPDEST` locations* map instead. + +#### Analysis + +We have analyzed two contracts, Arbitrum validator and Uniswap router. 
+ +Arbitrum (2147-bytes long): +``` +(chunk offset, chunk number, pushdata offset) +malicious push byte: 85 2 21 +malicious push byte: 95 2 31 +malicious push byte: 116 3 20 +malicious push byte: 135 4 7 +malicious push byte: 216 6 24 +malicious push byte: 1334 41 22 +``` + +Encoding with *scheme 1*: +``` +[skip, 2] +[value, 0, 21] +[value, 0, 31] +[value, 1, 20] +[value, 1, 7] +[value, 2, 24] +[skip, 35, 22] +``` + +Encoding size: `2 skips (2 * 11 bits) + 5 values (5 * 11 bits)` = 10-bytes header (0.465%) + +Encoding with *scheme 2*: +``` +[skip, 2] +[value, 21] +[value, 31] +[skip, 1] +[value, 20] +[skip, 1] +[value, 7] +[skip, 2] +[value, 24] +[skip, 35] +[value, 22] +``` + +Encoding size: `5 skips (5 * 11 bits) + 6 values (6 * 7 bits)` = 13-bytes header (0.605%) + +Uniswap router contract (17958 bytes): + +``` +(chunk offset, chunk number, pushdata offset) +malicious push byte: 1646 51 14 +malicious push byte: 1989 62 5 +malicious push byte: 4239 132 15 +malicious push byte: 4533 141 21 +malicious push byte: 7043 220 3 +malicious push byte: 8036 251 4 +malicious push byte: 8604 268 28 +malicious push byte: 12345 385 25 +malicious push byte: 15761 492 17 +``` + +Encoding using *scheme 1*: +``` +[skip, 51] +[value, 0, 14] +[value, 11, 5] +[skip, 70] +[value, 0, 15] +[value, 9, 21] +[skip, 79] +[value, 0, 3] +[skip, 31] +[value, 0, 4] +[skip, 17] +[value, 0, 28] +[skip, 117] +[value, 0, 25] +[skip, 107] +[value, 0, 17] +``` + +Encoding size: `7 skips (7 * 11 bits) + 9 values (9 * 11 bits)` = 22-bytes header (0.122%) + ## Backwards Compatibility EOF-packaged code execution if fully compatible with the legacy code execution. 
From d231ec7bcf69787edf31a81aa40c1b6818e4df72 Mon Sep 17 00:00:00 2001 From: Alex Beregszaszi Date: Thu, 25 Apr 2024 16:41:16 +0200 Subject: [PATCH 03/16] Add goal --- spec/eofv0_verkle.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/spec/eofv0_verkle.md b/spec/eofv0_verkle.md index bf52d42..fea93ca 100644 --- a/spec/eofv0_verkle.md +++ b/spec/eofv0_verkle.md @@ -119,6 +119,8 @@ will contain the `0x5b` value, which gives our minimum probability of having at Let's create a map of `invalid_jumpdests[chunk_no] = position_in_chunk`. We can densely encode this map using techniques similar to *run-length encoding* to skip distances and delta-encode offsets. +This map is always fully loaded prior to execution, and so it is important to ensure the encoded +version is as dense as possible (without sacrificing on complexity). In *scheme 1*, for each entry in `invalid_jumpdests`: - 1-bit mode (`skip`, `value`) From 35955cf8321e51c5455e1d7a4e7d3a40822e8e05 Mon Sep 17 00:00:00 2001 From: Alex Beregszaszi Date: Thu, 25 Apr 2024 16:46:51 +0200 Subject: [PATCH 04/16] Clarify wording --- spec/eofv0_verkle.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spec/eofv0_verkle.md b/spec/eofv0_verkle.md index fea93ca..ee31fb3 100644 --- a/spec/eofv0_verkle.md +++ b/spec/eofv0_verkle.md @@ -115,7 +115,8 @@ one of the Arbitrum validator contracts has 6 cases. Since Solidity contracts have a trailing metadata, which contains a Keccak-256 (32-byte) hash of the source, there is a 12% probability ($1 - (255/256)^{32}$) that at least one of the bytes of the hash -will contain the `0x5b` value, which gives our minimum probability of having at least one invalid `JUMPDEST`. +will contain the `0x5b` value, which gives our minimum probability of having at least one invalid +`JUMPDEST` in the contract. Let's create a map of `invalid_jumpdests[chunk_no] = position_in_chunk`. 
We can densely encode this map using techniques similar to *run-length encoding* to skip distances and delta-encode offsets. From df1270520e64893d0cd410ff481dab01d77c88ee Mon Sep 17 00:00:00 2001 From: Alex Beregszaszi Date: Thu, 25 Apr 2024 17:08:57 +0200 Subject: [PATCH 05/16] Add overheads --- spec/eofv0_verkle.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spec/eofv0_verkle.md b/spec/eofv0_verkle.md index ee31fb3..45815d1 100644 --- a/spec/eofv0_verkle.md +++ b/spec/eofv0_verkle.md @@ -134,7 +134,7 @@ In *scheme 1*, for each entry in `invalid_jumpdests`: Worst case encoding where each chunk contains an invalid `JUMPDEST`: ``` total_chunk_count = 24576 / 32 = 768 -total_chunk_count * (1 + 4 + 6) / 8 = 1056 # bytes for the header +total_chunk_count * (1 + 4 + 6) / 8 = 1056 # bytes for the header, i.e. 4.1% overhead number_of_verkle_leafs = total_chunk_count / 32 = 33 ``` @@ -148,7 +148,7 @@ number_of_verkle_leafs = total_chunk_count / 32 = 33 Worst case encoding: ``` total_chunk_count = 24576 / 32 = 768 -total_chunk_count * (1 + 6) / 8 = 672 # bytes for the header +total_chunk_count * (1 + 6) / 8 = 672 # bytes for the header, i.e. 2.7% overhead number_of_verkle_leafs = total_chunk_count / 32 = 21 ``` From 920a4cc0ccf992dda749c14e4c6ea811f1b00ef5 Mon Sep 17 00:00:00 2001 From: Alex Beregszaszi Date: Thu, 25 Apr 2024 18:02:50 +0200 Subject: [PATCH 06/16] Improve terminology about pushdata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Paweł Bylica --- spec/eofv0_verkle.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spec/eofv0_verkle.md b/spec/eofv0_verkle.md index 45815d1..c25c96e 100644 --- a/spec/eofv0_verkle.md +++ b/spec/eofv0_verkle.md @@ -105,9 +105,9 @@ The same as above except encode the values as 6-bit numbers (minimum number of bits needed for encoding `32`). Such encoding lowers the size overhead from 3.1% to 2.3%. 
-### Encode only invalid pushdata (dense encoding) +### Encode only invalid jumpdests (dense encoding) -Alternate option is instead of encoding all valid `JUMPDEST` locations, to only encode invalid ones. +Alternate option is instead of encoding all valid `JUMPDEST` locations, to only encode invalid ones. By invalid `JUMPDEST` we mean a `0x5b` byte in any pushdata. This is beneficial if our assumption is correct that most contracts only contain a limited number of offending cases. Our initial analysis suggests this is the case, e.g. Uniswap router has 9 cases, From 255276795ee4724d1c95489c26632225cf9b8ca2 Mon Sep 17 00:00:00 2001 From: Alex Beregszaszi Date: Thu, 25 Apr 2024 18:03:53 +0200 Subject: [PATCH 07/16] Rename position_in_chunk to first_instruction_offset --- spec/eofv0_verkle.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spec/eofv0_verkle.md b/spec/eofv0_verkle.md index c25c96e..ec6b480 100644 --- a/spec/eofv0_verkle.md +++ b/spec/eofv0_verkle.md @@ -118,7 +118,7 @@ source, there is a 12% probability ($1 - (255/256)^{32}$) that at least one of t will contain the `0x5b` value, which gives our minimum probability of having at least one invalid `JUMPDEST` in the contract. -Let's create a map of `invalid_jumpdests[chunk_no] = position_in_chunk`. We can densely encode this +Let's create a map of `invalid_jumpdests[chunk_no] = first_instruction_offset`. We can densely encode this map using techniques similar to *run-length encoding* to skip distances and delta-encode offsets. This map is always fully loaded prior to execution, and so it is important to ensure the encoded version is as dense as possible (without sacrificing on complexity).
@@ -129,7 +129,7 @@ In *scheme 1*, for each entry in `invalid_jumpdests`: - 10-bit delta-encoding of `chunk_no` - For value-mode: - 4-bit delta-encoding of `chunk_no` - - 6-bit `position_in_chunk` + - 6-bit `first_instruction_offest` Worst case encoding where each chunk contains an invalid `JUMPDEST`: ``` @@ -143,7 +143,7 @@ number_of_verkle_leafs = total_chunk_count / 32 = 33 - For skip-mode: - 10-bit delta-encoding of `chunk_no` - For value-mode: - - 6-bit `position_in_chunk` + - 6-bit `first_instruction_offest` Worst case encoding: ``` From 34fba68bbe087ea60c185d502e5a410196d70915 Mon Sep 17 00:00:00 2001 From: Alex Beregszaszi Date: Thu, 25 Apr 2024 18:04:23 +0200 Subject: [PATCH 08/16] Clarify skipping text --- spec/eofv0_verkle.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/spec/eofv0_verkle.md b/spec/eofv0_verkle.md index ec6b480..6607f40 100644 --- a/spec/eofv0_verkle.md +++ b/spec/eofv0_verkle.md @@ -126,9 +126,9 @@ version is as dense as possible (without sacrificing on complexity). 
In *scheme 1*, for each entry in `invalid_jumpdests`: - 1-bit mode (`skip`, `value`) - For skip-mode: - - 10-bit delta-encoding of `chunk_no` + - 10-bit number of chunks to skip - For value-mode: - - 4-bit delta-encoding of `chunk_no` + - 4-bit number of chunks to skip - 6-bit `first_instruction_offest` Worst case encoding where each chunk contains an invalid `JUMPDEST`: @@ -141,7 +141,7 @@ number_of_verkle_leafs = total_chunk_count / 32 = 33 *Scheme 2* differs slightly: - 1-bit mode (`skip`, `value`) - For skip-mode: - - 10-bit delta-encoding of `chunk_no` + - 10-bit number of chunks to skip - For value-mode: - 6-bit `first_instruction_offest` From 640b51ab05fdaa0f6f0b3e871e9cef4dce17f6f4 Mon Sep 17 00:00:00 2001 From: Alex Beregszaszi Date: Thu, 25 Apr 2024 18:05:58 +0200 Subject: [PATCH 09/16] Swap scheme 1 and 2 --- spec/eofv0_verkle.md | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/spec/eofv0_verkle.md b/spec/eofv0_verkle.md index 6607f40..377b1ed 100644 --- a/spec/eofv0_verkle.md +++ b/spec/eofv0_verkle.md @@ -128,14 +128,13 @@ In *scheme 1*, for each entry in `invalid_jumpdests`: - For skip-mode: - 10-bit number of chunks to skip - For value-mode: - - 4-bit number of chunks to skip - 6-bit `first_instruction_offest` Worst case encoding where each chunk contains an invalid `JUMPDEST`: ``` total_chunk_count = 24576 / 32 = 768 -total_chunk_count * (1 + 4 + 6) / 8 = 1056 # bytes for the header, i.e. 4.1% overhead -number_of_verkle_leafs = total_chunk_count / 32 = 33 +total_chunk_count * (1 + 6) / 8 = 672 # bytes for the header, i.e. 
2.7% overhead +number_of_verkle_leafs = total_chunk_count / 32 = 21 ``` *Scheme 2* differs slightly: @@ -143,13 +142,14 @@ number_of_verkle_leafs = total_chunk_count / 32 = 33 - For skip-mode: - 10-bit number of chunks to skip - For value-mode: + - 4-bit number of chunks to skip - 6-bit `first_instruction_offest` Worst case encoding: ``` total_chunk_count = 24576 / 32 = 768 -total_chunk_count * (1 + 6) / 8 = 672 # bytes for the header, i.e. 2.7% overhead -number_of_verkle_leafs = total_chunk_count / 32 = 21 +total_chunk_count * (1 + 4 + 6) / 8 = 1056 # bytes for the header, i.e. 4.1% overhead +number_of_verkle_leafs = total_chunk_count / 32 = 33 ``` The decision between *scheme 1* and *scheme 2*, as well as the best encoding sizes, can be determined @@ -189,19 +189,6 @@ malicious push byte: 1334 41 22 Encoding with *scheme 1*: ``` [skip, 2] -[value, 0, 21] -[value, 0, 31] -[value, 1, 20] -[value, 1, 7] -[value, 2, 24] -[skip, 35, 22] -``` - -Encoding size: `2 skips (2 * 11 bits) + 5 values (5 * 11 bits)` = 10-bytes header (0.465%) - -Encoding with *scheme 2*: -``` -[skip, 2] [value, 21] [value, 31] [skip, 1] @@ -216,6 +203,19 @@ Encoding with *scheme 2*: Encoding size: `5 skips (5 * 11 bits) + 6 values (6 * 7 bits)` = 13-bytes header (0.605%) +Encoding with *scheme 2*: +``` +[skip, 2] +[value, 0, 21] +[value, 0, 31] +[value, 1, 20] +[value, 1, 7] +[value, 2, 24] +[skip, 35, 22] +``` + +Encoding size: `2 skips (2 * 11 bits) + 5 values (5 * 11 bits)` = 10-bytes header (0.465%) + Uniswap router contract (17958 bytes): ``` @@ -231,7 +231,7 @@ malicious push byte: 12345 385 25 malicious push byte: 15761 492 17 ``` -Encoding using *scheme 1*: +Encoding using *scheme 2*: ``` [skip, 51] [value, 0, 14] From 016893756f407bec5a2e5b40d4ae1cdb600ef096 Mon Sep 17 00:00:00 2001 From: Alex Beregszaszi Date: Thu, 25 Apr 2024 18:06:29 +0200 Subject: [PATCH 10/16] Formatting --- spec/eofv0_verkle.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/spec/eofv0_verkle.md b/spec/eofv0_verkle.md index 377b1ed..8edb4df 100644 --- a/spec/eofv0_verkle.md +++ b/spec/eofv0_verkle.md @@ -107,7 +107,8 @@ Such encoding lowers the size overhead from 3.1% to 2.3%. ### Encode only invalid jumpdests (dense encoding) -Alternate option is instead of encoding all valid `JUMPDEST` locations, to only encode invalid ones. By invalid `JUMPDEST` we mean a `0x5b` byte in any pushdata. +Alternate option is instead of encoding all valid `JUMPDEST` locations, to only encode invalid ones. +By invalid `JUMPDEST` we mean a `0x5b` byte in any pushdata. This is beneficial if our assumption is correct that most contracts only contain a limited number of offending cases. Our initial analysis suggests this is the case, e.g. Uniswap router has 9 cases, From de25406381cc363d8b4d07cf18c87b4017af9d43 Mon Sep 17 00:00:00 2001 From: Alex Beregszaszi Date: Thu, 25 Apr 2024 18:09:05 +0200 Subject: [PATCH 11/16] Add conclusion --- spec/eofv0_verkle.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spec/eofv0_verkle.md b/spec/eofv0_verkle.md index 8edb4df..1a54e18 100644 --- a/spec/eofv0_verkle.md +++ b/spec/eofv0_verkle.md @@ -254,6 +254,9 @@ Encoding using *scheme 2*: Encoding size: `7 skips (7 * 11 bits) + 9 values (9 * 11 bits)` = 22-bytes header (0.122%) +Our current hunch is that in average contracts this results in a sub-1% overhead, while the worst case is 4.1%. +This compares against the constant 3.2% overhead of the current Verkle code chunking. + ## Backwards Compatibility EOF-packaged code execution if fully compatible with the legacy code execution. 
From 462c8c362fc8425d487ddf2215e3e8faebd39c15 Mon Sep 17 00:00:00 2001 From: Alex Beregszaszi Date: Thu, 25 Apr 2024 18:11:25 +0200 Subject: [PATCH 12/16] Typo --- spec/eofv0_verkle.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spec/eofv0_verkle.md b/spec/eofv0_verkle.md index 1a54e18..e7a0a75 100644 --- a/spec/eofv0_verkle.md +++ b/spec/eofv0_verkle.md @@ -129,7 +129,7 @@ In *scheme 1*, for each entry in `invalid_jumpdests`: - For skip-mode: - 10-bit number of chunks to skip - For value-mode: - - 6-bit `first_instruction_offest` + - 6-bit `first_instruction_offset` Worst case encoding where each chunk contains an invalid `JUMPDEST`: ``` @@ -144,7 +144,7 @@ number_of_verkle_leafs = total_chunk_count / 32 = 21 - 10-bit number of chunks to skip - For value-mode: - 4-bit number of chunks to skip - - 6-bit `first_instruction_offest` + - 6-bit `first_instruction_offset` Worst case encoding: ``` From d6e0255fdbd6c48d59378330f8e0ff39ca0bb9dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Bylica?= Date: Wed, 8 May 2024 12:44:37 +0200 Subject: [PATCH 13/16] Update encoding scheme --- spec/eofv0_verkle.md | 61 +++++++++++++++++--------------------------- 1 file changed, 24 insertions(+), 37 deletions(-) diff --git a/spec/eofv0_verkle.md b/spec/eofv0_verkle.md index e7a0a75..f96be00 100644 --- a/spec/eofv0_verkle.md +++ b/spec/eofv0_verkle.md @@ -111,50 +111,34 @@ Alternate option is instead of encoding all valid `JUMPDEST` locations, to only By invalid `JUMPDEST` we mean a `0x5b` byte in any pushdata. This is beneficial if our assumption is correct that most contracts only contain a limited number -of offending cases. Our initial analysis suggests this is the case, e.g. Uniswap router has 9 cases, -one of the Arbitrum validator contracts has 6 cases. +of offending cases. Our initial analysis of the top 1000 used bytecodes suggests this is the case: +only 0.07% of bytecode bytes are invalid jumpdests. 
-Since Solidity contracts have a trailing metadata, which contains a Keccak-256 (32-byte) hash of the -source, there is a 12% probability ($1 - (255/256)^{32}$) that at least one of the bytes of the hash -will contain the `0x5b` value, which gives our minimum probability of having at least one invalid -`JUMPDEST` in the contract. - -Let's create a map of `invalid_jumpdests[chunk_no] = first_instruction_offset`. We can densely encode this -map using techniques similar to *run-length encoding* to skip distances and delta-encode offsets. +Let's create a map of `invalid_jumpdests[chunk_index] = first_instruction_offset`. We can densely encode this +map using techniques similar to *run-length encoding* to skip distances and delta-encode indexes. This map is always fully loaded prior to execution, and so it is important to ensure the encoded version is as dense as possible (without sacrificing on complexity). -In *scheme 1*, for each entry in `invalid_jumpdests`: +We propose the encoding using fixed-size 8-bit elements. +For each entry in `invalid_jumpdests`: - 1-bit mode (`skip`, `value`) - For skip-mode: - - 10-bit number of chunks to skip + - 7-bit number of chunks to skip - For value-mode: - - 6-bit `first_instruction_offset` + - 7-bit number combining number of chunks to skip `s` and `first_instruction_offset` + produced as `s * 33 + first_instruction_offset` -Worst case encoding where each chunk contains an invalid `JUMPDEST`: -``` -total_chunk_count = 24576 / 32 = 768 -total_chunk_count * (1 + 6) / 8 = 672 # bytes for the header, i.e. 2.7% overhead -number_of_verkle_leafs = total_chunk_count / 32 = 21 -``` - -*Scheme 2* differs slightly: -- 1-bit mode (`skip`, `value`) -- For skip-mode: - - 10-bit number of chunks to skip -- For value-mode: - - 4-bit number of chunks to skip - - 6-bit `first_instruction_offset` +For the worst case where each chunk contains an invalid `JUMPDEST` the encoding length is equal +to the number of chunks in the code. I.e. 
the size overhead is 3.1%. -Worst case encoding: -``` -total_chunk_count = 24576 / 32 = 768 -total_chunk_count * (1 + 4 + 6) / 8 = 1056 # bytes for the header, i.e. 4.1% overhead -number_of_verkle_leafs = total_chunk_count / 32 = 33 -``` +| code size limit | code chunks | encoding chunks | +|-----------------|-------------|-----------------| +| 24576 | 768 | 24 | +| 32768 | 1024 | 32 | +| 65536 | 2048 | 64 | -The decision between *scheme 1* and *scheme 2*, as well as the best encoding sizes, can be determined -through analysing existing code. +Our current hunch is that in average contracts this results in a sub-1% overhead, while the worst case is 3.1%. +This is strictly better than the 3.2% overhead of the current Verkle code chunking. #### Header location @@ -165,9 +149,12 @@ This second option allows for the simplification of the `code_size` value, as it #### Runtime after Verkle -During runtime execution two checks must be done in this order: -1) Check if the destination is on the invalid list, and abort if so. -2) Check if the value in the chunk is an actual `JUMPDEST`, and abort if not. +During execution of a jump two checks must be done in this order: + +1. Check if the jump destination is the `JUMPDEST` opcode. +2. Check if the jump destination chunk is in the `invalid_jumpdests` map. + If yes, the jumpdest analysis of the chunk must be performed + to confirm the jump destination is not push data. It is possible to reconstruct sparse account code prior to execution with all the submitted chunks of the transaction and perform `JUMPDEST`-validation to build up a relevant *valid `JUMPDEST` locations* map instead. 
From 3afee13176daed1433a595437a32e306c74ed82f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Bylica?= Date: Wed, 8 May 2024 15:31:07 +0200 Subject: [PATCH 14/16] reference implementation --- spec/eofv0_verkle.md | 64 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/spec/eofv0_verkle.md b/spec/eofv0_verkle.md index f96be00..0291f04 100644 --- a/spec/eofv0_verkle.md +++ b/spec/eofv0_verkle.md @@ -159,7 +159,69 @@ During execution of a jump two checks must be done in this order: It is possible to reconstruct sparse account code prior to execution with all the submitted chunks of the transaction and perform `JUMPDEST`-validation to build up a relevant *valid `JUMPDEST` locations* map instead. -#### Analysis +#### Reference encoding implementation + +```python +class Scheme: + VALUE_MAX = 32 + VALUE_WIDTH = VALUE_MAX.bit_length() + VALUE_MOD = VALUE_MAX + 1 + + def __init__(self, name: str, width: int): + self.name = name + self.WIDTH = width + + payload_max = 2 ** (width - 1) - 1 + + self.SKIP_ONLY = 1 << (self.WIDTH - 1) + self.VALUE_SKIP_MAX = (payload_max - self.VALUE_MAX) // self.VALUE_MOD + self.SKIP_BIAS = self.VALUE_SKIP_MAX + 1 + + def encode(self, chunks: list[Chunk]) -> tuple[list[int], int]: + skip_only_max = self.SKIP_ONLY - 1 + + ops = [] + last_chunk_index = 0 + for i, ch in enumerate(chunks): + if not ch.contains_invalid_jumpdest: + continue # skip chunks without invalid jumpdests + + delta = i - last_chunk_index + + # Generate skips if needed. 
+ while delta > self.VALUE_SKIP_MAX: + d = min(delta - self.SKIP_BIAS, skip_only_max) + assert 0 <= d <= skip_only_max + ops.append(self.SKIP_ONLY | d) + delta -= d + self.SKIP_BIAS + + assert 0 <= delta <= self.VALUE_SKIP_MAX + assert 0 <= ch.first_instruction_offset <= 32 + ops.append(delta * self.VALUE_MOD + ch.first_instruction_offset) + + last_chunk_index = i + + return ops, self.WIDTH * len(ops) + + def decode(self, ops: list[int]) -> dict[int, int]: + m = dict() + i = 0 + for op in ops: + if op & self.SKIP_ONLY: + delta = (op ^ self.SKIP_ONLY) + self.SKIP_BIAS + value = None + else: + delta = op // self.VALUE_MOD + value = op % self.VALUE_MOD + i += delta + print(f"{delta:+4}") + if value is not None: + m[i] = value + print(f"{i:4}: {value}") + return m +``` + +#### Example We have analyzed two contracts, Arbitrum validator and Uniswap router. From a087506bb19e7a2bb3266d8f41948332e21baeee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Bylica?= Date: Wed, 8 May 2024 15:38:55 +0200 Subject: [PATCH 15/16] example --- spec/eofv0_verkle.md | 109 ++++++++++--------------------------------- 1 file changed, 25 insertions(+), 84 deletions(-) diff --git a/spec/eofv0_verkle.md b/spec/eofv0_verkle.md index 0291f04..942ebf1 100644 --- a/spec/eofv0_verkle.md +++ b/spec/eofv0_verkle.md @@ -159,6 +159,31 @@ During execution of a jump two checks must be done in this order: It is possible to reconstruct sparse account code prior to execution with all the submitted chunks of the transaction and perform `JUMPDEST`-validation to build up a relevant *valid `JUMPDEST` locations* map instead. +#### Example + +The top used bytecode: [0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2](https://etherscan.io/address/0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2) (WETH). 
+ +``` +length: 3124 +chunks: 98 + +chunks with invalid jumpdests: +chunk_index first_instruction_offset +37 4 +49 12 +50 14 +87 13 + +encoding (7 bytes (0.22%), 1 chunk (1.02%)): +[skip, 37] +[value, 0, 4] +[skip, 12] +[value, 0, 12] +[value, 1, 14] +[skip, 37] +[value, 0, 13] +``` + #### Reference encoding implementation ```python @@ -221,90 +246,6 @@ class Scheme: return m ``` -#### Example - -We have analyzed two contracts, Arbitrum validator and Uniswap router. - -Arbitrum (2147-bytes long): -``` -(chunk offset, chunk number, pushdata offset) -malicious push byte: 85 2 21 -malicious push byte: 95 2 31 -malicious push byte: 116 3 20 -malicious push byte: 135 4 7 -malicious push byte: 216 6 24 -malicious push byte: 1334 41 22 -``` - -Encoding with *scheme 1*: -``` -[skip, 2] -[value, 21] -[value, 31] -[skip, 1] -[value, 20] -[skip, 1] -[value, 7] -[skip, 2] -[value, 24] -[skip, 35] -[value, 22] -``` - -Encoding size: `5 skips (5 * 11 bits) + 6 values (6 * 7 bits)` = 13-bytes header (0.605%) - -Encoding with *scheme 2*: -``` -[skip, 2] -[value, 0, 21] -[value, 0, 31] -[value, 1, 20] -[value, 1, 7] -[value, 2, 24] -[skip, 35, 22] -``` - -Encoding size: `2 skips (2 * 11 bits) + 5 values (5 * 11 bits)` = 10-bytes header (0.465%) - -Uniswap router contract (17958 bytes): - -``` -(chunk offset, chunk number, pushdata offset) -malicious push byte: 1646 51 14 -malicious push byte: 1989 62 5 -malicious push byte: 4239 132 15 -malicious push byte: 4533 141 21 -malicious push byte: 7043 220 3 -malicious push byte: 8036 251 4 -malicious push byte: 8604 268 28 -malicious push byte: 12345 385 25 -malicious push byte: 15761 492 17 -``` - -Encoding using *scheme 2*: -``` -[skip, 51] -[value, 0, 14] -[value, 11, 5] -[skip, 70] -[value, 0, 15] -[value, 9, 21] -[skip, 79] -[value, 0, 3] -[skip, 31] -[value, 0, 4] -[skip, 17] -[value, 0, 28] -[skip, 117] -[value, 0, 25] -[skip, 107] -[value, 0, 17] -``` - -Encoding size: `7 skips (7 * 11 bits) + 9 values (9 * 11 bits)` = 22-bytes 
header (0.122%) - -Our current hunch is that in average contracts this results in a sub-1% overhead, while the worst case is 4.1%. -This compares against the constant 3.2% overhead of the current Verkle code chunking. ## Backwards Compatibility From 778d10ea66506ffa2bfeed9fd0787edbdb80523c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Bylica?= Date: Fri, 10 May 2024 14:39:45 +0200 Subject: [PATCH 16/16] VLQM33 --- spec/eofv0_verkle.md | 165 +++++++++++++++++++------------------------ 1 file changed, 72 insertions(+), 93 deletions(-) diff --git a/spec/eofv0_verkle.md b/spec/eofv0_verkle.md index 942ebf1..6897c89 100644 --- a/spec/eofv0_verkle.md +++ b/spec/eofv0_verkle.md @@ -110,8 +110,8 @@ Such encoding lowers the size overhead from 3.1% to 2.3%. Alternate option is instead of encoding all valid `JUMPDEST` locations, to only encode invalid ones. By invalid `JUMPDEST` we mean a `0x5b` byte in any pushdata. -This is beneficial if our assumption is correct that most contracts only contain a limited number -of offending cases. Our initial analysis of the top 1000 used bytecodes suggests this is the case: +This is beneficial because most contracts only contain a limited number of offending cases. +Our initial analysis of the top 1000 bytecodes used in last year confirms this: only 0.07% of bytecode bytes are invalid jumpdests. Let's create a map of `invalid_jumpdests[chunk_index] = first_instruction_offset`. We can densely encode this @@ -119,14 +119,14 @@ map using techniques similar to *run-length encoding* to skip distances and delt This map is always fully loaded prior to execution, and so it is important to ensure the encoded version is as dense as possible (without sacrificing on complexity). -We propose the encoding using fixed-size 8-bit elements. 
-For each entry in `invalid_jumpdests`: -- 1-bit mode (`skip`, `value`) -- For skip-mode: - - 7-bit number of chunks to skip -- For value-mode: - - 7-bit number combining number of chunks to skip `s` and `first_instruction_offset` - produced as `s * 33 + first_instruction_offset` +We propose an encoding which uses [VLQ](https://en.wikipedia.org/wiki/Variable-length_quantity): + +For each entry `index, first_instruction_offset` in `invalid_jumpdests`: + +- Compute the chunk index distance to the previously encoded chunk `delta = index - last_chunk_index - 1` (for the first entry use `last_chunk_index = -1`). +- Combine the two numbers into a single unsigned integer `entry = delta * 33 + first_instruction_offset`. + This is reversible because `first_instruction_offset < 33`. +- Encode `entry` into a sequence of bytes using VLQ (e.g. LEB128). For the worst case where each chunk contains an invalid `JUMPDEST` the encoding length is equal to the number of chunks in the code. I.e. the size overhead is 3.1%. @@ -137,9 +137,37 @@ to the number of chunks in the code. I.e. the size overhead is 3.1%. | 32768 | 1024 | 32 | | 65536 | 2048 | 64 | -Our current hunch is that in average contracts this results in a sub-1% overhead, while the worst case is 3.1%. +Our current hunch is that in average contracts this results in ~0.1% overhead, while the worst case is 3.1%. This is strictly better than the 3.2% overhead of the current Verkle code chunking. +Stats from "top 1000 bytecodes used in last year": + +``` +total code length: 11785831 +total encoding length: 11693 (0.099%) +encoding chunks distribution: +0: 109 (10.9%) +1: 838 (83.8%) +2: 49 ( 4.9%) +3: 4 ( 0.4%) +``` + +#### Encoding example + +The top used bytecode: [0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2](https://etherscan.io/address/0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2) (WETH).
+ +``` +length: 3124 +chunks: 98 + +chunks with invalid jumpdests: +chunk_index delta first_instruction_offset entry leb128 +37 37 4 1225 c909 +49 11 12 375 f702 +50 0 14 14 0e +87 36 13 1201 b109 +``` + #### Header location It is possible to place above as part of the "EOFv0" header, but given the upper bound of number of chunks occupied is low (33 vs 21), @@ -159,91 +187,42 @@ During execution of a jump two checks must be done in this order: It is possible to reconstruct sparse account code prior to execution with all the submitted chunks of the transaction and perform `JUMPDEST`-validation to build up a relevant *valid `JUMPDEST` locations* map instead. -#### Example - -The top used bytecode: [0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2](https://etherscan.io/address/0xc02aaa39b223fe8d0a0e5c4f27ead9083c756cc2) (WETH). - -``` -length: 3124 -chunks: 98 - -chunks with invalid jumpdests: -chunk_index first_instruction_offset -37 4 -49 12 -50 14 -87 13 - -encoding (7 bytes (0.22%), 1 chunk (1.02%)): -[skip, 37] -[value, 0, 4] -[skip, 12] -[value, 0, 12] -[value, 1, 14] -[skip, 37] -[value, 0, 13] -``` - #### Reference encoding implementation ```python -class Scheme: - VALUE_MAX = 32 - VALUE_WIDTH = VALUE_MAX.bit_length() - VALUE_MOD = VALUE_MAX + 1 - - def __init__(self, name: str, width: int): - self.name = name - self.WIDTH = width - - payload_max = 2 ** (width - 1) - 1 - - self.SKIP_ONLY = 1 << (self.WIDTH - 1) - self.VALUE_SKIP_MAX = (payload_max - self.VALUE_MAX) // self.VALUE_MOD - self.SKIP_BIAS = self.VALUE_SKIP_MAX + 1 - - def encode(self, chunks: list[Chunk]) -> tuple[list[int], int]: - skip_only_max = self.SKIP_ONLY - 1 - - ops = [] - last_chunk_index = 0 - for i, ch in enumerate(chunks): - if not ch.contains_invalid_jumpdest: - continue # skip chunks without invalid jumpdests - - delta = i - last_chunk_index - - # Generate skips if needed. 
- while delta > self.VALUE_SKIP_MAX: - d = min(delta - self.SKIP_BIAS, skip_only_max) - assert 0 <= d <= skip_only_max - ops.append(self.SKIP_ONLY | d) - delta -= d + self.SKIP_BIAS - - assert 0 <= delta <= self.VALUE_SKIP_MAX - assert 0 <= ch.first_instruction_offset <= 32 - ops.append(delta * self.VALUE_MOD + ch.first_instruction_offset) - - last_chunk_index = i - - return ops, self.WIDTH * len(ops) - - def decode(self, ops: list[int]) -> dict[int, int]: - m = dict() - i = 0 - for op in ops: - if op & self.SKIP_ONLY: - delta = (op ^ self.SKIP_ONLY) + self.SKIP_BIAS - value = None - else: - delta = op // self.VALUE_MOD - value = op % self.VALUE_MOD - i += delta - print(f"{delta:+4}") - if value is not None: - m[i] = value - print(f"{i:4}: {value}") - return m +import leb128 +import io + +class VLQM33: + VALUE_MOD = 33 + + def encode(self, chunks: dict[int, int]) -> tuple[bytes, int]: + ops = b'' + last_chunk_index = 0 + for index, value in chunks.items(): + assert 0 <= value < self.VALUE_MOD + delta = index - last_chunk_index + e = delta * self.VALUE_MOD + value + ops += leb128.u.encode(e) + last_chunk_index = index + 1 + return ops, 8 * len(ops) + + def decode(self, ops: bytes) -> dict[int, int]: + stream = io.BytesIO(ops) + stream.seek(0, 2) + end = stream.tell() + stream.seek(0, 0) + + m = {} + index = 0 + while stream.tell() != end: + e, _ = leb128.u.decode_reader(stream) + delta = e // self.VALUE_MOD + value = e % self.VALUE_MOD + index += delta + m[index] = value + index += 1 + return m ```