From 07cd2adbc10a9f38b122120a51628d7ed3f46ffb Mon Sep 17 00:00:00 2001 From: Peter Rabbitson Date: Thu, 21 May 2020 20:27:53 +0200 Subject: [PATCH 1/3] Base36 byte-encoding specification Uses the alphabet 0-9a-z case insensitively. The prefix K is chosen to limit future clashes with English words based on https://en.wikipedia.org/wiki/Letter_frequency --- multibase.csv | 2 ++ rfcs/Base36.md | 40 ++++++++++++++++++++++++++++++++++++++++ tests/test1.csv | 2 ++ tests/test2.csv | 2 ++ tests/test3.csv | 2 ++ tests/test4.csv | 2 ++ tests/test5.csv | 2 ++ tests/test6.csv | 2 ++ 8 files changed, 54 insertions(+) create mode 100644 rfcs/Base36.md diff --git a/multibase.csv b/multibase.csv index d980df7..5f69d41 100644 --- a/multibase.csv +++ b/multibase.csv @@ -14,6 +14,8 @@ base32upper, B, rfc4648 no padding, base32pad, c, rfc4648 with padding, candidate base32padupper, C, rfc4648 with padding, candidate base32z, h, z-base-32 (used by Tahoe-LAFS), draft +base36upper, K, base36 [0-9a-z] case-insensitive no padding, default +base36, k, base36 [0-9a-z] case-insensitive no padding, default base58flickr, Z, base58 flicker, candidate base58btc, z, base58 bitcoin, default base64, m, rfc4648 no padding, default diff --git a/rfcs/Base36.md b/rfcs/Base36.md new file mode 100644 index 0000000..cd4ff57 --- /dev/null +++ b/rfcs/Base36.md @@ -0,0 +1,40 @@ +# Base36 + +The multibase base36 prefix is the character `k` or `K`. The digit-alphabet +consists of 0..9 and then the case-insensitive range a..z for the values 10..35. + +## Encoding + +A byte array is encoded to base36 by: + +1. Counting the number of leading 0 bytes (Z). +2. Interpreting the rest of the byte array as a big-endian unsigned integer (N). +3. Concatenating a length Z string of '0' characters with the decimal + representation of N. + +A byte array is encoded to multibase base36 by prefixing its base36 encoding +with the character `k` (or `K` when the upper-case alphabet is used). 
+ +## Decoding + +A multibase base36 encoded string is decoded by first dropping the multibase +prefix (which must be `k` or `K`). + +The remaining characters are then converted to a byte array by: + +1. Counting the number of leading '0' characters (Z). +2. Interpreting the rest of the character sequence as a base36 unsigned integer + (N). +3. Concatenating a length Z array of NUL (0x00) bytes with N encoded as a + minimal-length big-endian unsigned integer (no bytes if the rest is empty). + +## Examples + +Byte Array <-> Base36 Multibase: + +| Bytes | == | LC Base36 | OR | UC base36 | +|---|---|---|---|---| +| `[0x00, 0x01]` | == | `"k01"` | | `"K01"` | +| `[0x00, 0x00, 0xff]` | == | `"k0073"` | | `"K0073"` | +| `[0x01, 0x00]` | == | `"k74"` | | `"K74"` | +| `[0x00, 0x01, 0x00]` | == | `"k074"` | | `"K074"` | diff --git a/tests/test1.csv b/tests/test1.csv index 44d6003..97e807c 100644 --- a/tests/test1.csv +++ b/tests/test1.csv @@ -13,6 +13,8 @@ base32padupper, "CIRSWGZLOORZGC3DJPJSSAZLWMVZHS5DINFXGOIJB" base32hexpad, "t8him6pbeehp62r39f9ii0pbmclp7it38d5n6e891" base32hexpadupper, "T8HIM6PBEEHP62R39F9II0PBMCLP7IT38D5N6E891" base32z, "het1sg3mqqt3gn5djxj11y3msci3817depfzgqejb" +base36, "k343ixo7d49hqj1ium15pgy1wzww5fxrid21td7l" +base36upper, "K343IXO7D49HQJ1IUM15PGY1WZWW5FXRID21TD7L" base58flickr, "Ztwe7gVTeK8wswS1gf8hrgAua9fcw9reboD" base58btc, "zUXE7GvtEk8XTXs1GF8HSGbVA9FCX9SEBPe" base64, "mRGVjZW50cmFsaXplIGV2ZXJ5dGhpbmchIQ" diff --git a/tests/test2.csv b/tests/test2.csv index 97ecccb..3b95ebf 100644 --- a/tests/test2.csv +++ b/tests/test2.csv @@ -13,6 +13,8 @@ base32padupper, "CPFSXGIDNMFXGSIBB" base32hexpad, "tf5in683dc5n6i811" base32hexpadupper, "TF5IN683DC5N6I811" base32z, "hxf1zgedpcfzg1ebb" +base36, "k2lcpzo5yikidynfl" +base36upper, "K2LCPZO5YIKIDYNFL" base58flickr, "Z7Pznk19XTTzBtx" base58btc, "z7paNL19xttacUY" base64, "meWVzIG1hbmkgIQ" diff --git a/tests/test3.csv b/tests/test3.csv index 4bfbc5e..8ddea2b 100644 --- a/tests/test3.csv +++ b/tests/test3.csv @@ -13,6 +13,8 @@ base32padupper, 
"CNBSWY3DPEB3W64TMMQ======" base32hexpad, "td1imor3f41rmusjccg======" base32hexpadupper, "TD1IMOR3F41RMUSJCCG======" base32z, "hpb1sa5dxrb5s6hucco" +base36, "kfuvrsivvnfrbjwajo" +base36upper, "KFUVRSIVVNFRBJWAJO" base58flickr, "ZrTu1dk6cWsRYjYu" base58btc, "zStV1DL6CwTryKyV" base64, "maGVsbG8gd29ybGQ" diff --git a/tests/test4.csv b/tests/test4.csv index e02f128..7fd4fc3 100644 --- a/tests/test4.csv +++ b/tests/test4.csv @@ -13,6 +13,8 @@ base32padupper, "CAB4WK4ZANVQW42JAEE======" base32hexpad, "t01smasp0dlgmsq9044======" base32hexpadupper, "T01SMASP0DLGMSQ9044======" base32z, "hybhskh3ypiosh4jyrr" +base36, "k02lcpzo5yikidynfl" +base36upper, "K02LCPZO5YIKIDYNFL" base58flickr, "Z17Pznk19XTTzBtx" base58btc, "z17paNL19xttacUY" base64, "mAHllcyBtYW5pICE" diff --git a/tests/test5.csv b/tests/test5.csv index 9f70104..44e6b26 100644 --- a/tests/test5.csv +++ b/tests/test5.csv @@ -13,6 +13,8 @@ base32padupper, "CAAAHSZLTEBWWC3TJEAQQ====" base32hexpad, "t0007ipbj41mm2rj940gg====" base32hexpadupper, "T0007IPBJ41MM2RJ940GG====" base32z, "hyyy813murbssn5ujryoo" +base36, "k002lcpzo5yikidynfl" +base36upper, "K002LCPZO5YIKIDYNFL" base58flickr, "Z117Pznk19XTTzBtx" base58btc, "z117paNL19xttacUY" base64, "mAAB5ZXMgbWFuaSAh" diff --git a/tests/test6.csv b/tests/test6.csv index a10b180..3037d9c 100644 --- a/tests/test6.csv +++ b/tests/test6.csv @@ -9,3 +9,5 @@ base32pad, "cnbswy3dpeB3W64TMMQ======" base32padupper, "Cnbswy3dpeB3W64TMMQ======" base32hexpad, "td1imor3f41RMUSJCCG======" base32hexpadupper, "Td1imor3f41RMUSJCCG======" +base36, "kfUvrsIvVnfRbjWaJo" +base36upper, "KfUVrSIVVnFRbJWAJo" From 3b7419fd5dfb75922c642ccc870eae99cd7fdc10 Mon Sep 17 00:00:00 2001 From: Peter Rabbitson Date: Fri, 22 May 2020 03:04:20 +0200 Subject: [PATCH 2/3] More docs /o\ --- README.md | 26 +++++++++++++++----------- multibase.csv | 22 +++++++++++----------- 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index c8badf5..8bd1565 100644 --- a/README.md +++ 
b/README.md @@ -8,7 +8,7 @@ > Self identifying base encodings Multibase is a protocol for disambiguating the encoding of base-encoded (e.g., -base32, base64, base58, etc.) binary appearing in text. +base32, base36, base64, base58, etc.) binary appearing in text. When text is encoded as bytes, we can usually use a one-size-fits-all encoding (UTF-8) because we're always encoding to the same set of 256 bytes (+/- the NUL @@ -63,17 +63,19 @@ base8, 7, octal, base10, 9, decimal, draft base16, f, hexadecimal, default base16upper, F, hexadecimal, default -base32hex, v, rfc4648 no padding - highest char, candidate -base32hexupper, V, rfc4648 no padding - highest char, candidate -base32hexpad, t, rfc4648 with padding, candidate -base32hexpadupper, T, rfc4648 with padding, candidate -base32, b, rfc4648 no padding, default -base32upper, B, rfc4648 no padding, default -base32pad, c, rfc4648 with padding, candidate -base32padupper, C, rfc4648 with padding, candidate +base32hex, v, rfc4648 case-insensitive - no padding - highest char, candidate +base32hexupper, V, rfc4648 case-insensitive - no padding - highest char, candidate +base32hexpad, t, rfc4648 case-insensitive - with padding, candidate +base32hexpadupper, T, rfc4648 case-insensitive - with padding, candidate +base32, b, rfc4648 case-insensitive - no padding, default +base32upper, B, rfc4648 case-insensitive - no padding, default +base32pad, c, rfc4648 case-insensitive - with padding, candidate +base32padupper, C, rfc4648 case-insensitive - with padding, candidate base32z, h, z-base-32 (used by Tahoe-LAFS), draft -base58flickr, Z, base58 flicker, candidate +base36, k, base36 [0-9a-z] case-insensitive - no padding, default +base36upper, K, base36 [0-9a-z] case-insensitive - no padding, default base58btc, z, base58 bitcoin, default +base58flickr, Z, base58 flicker, candidate base64, m, rfc4648 no padding, default base64pad, M, rfc4648 with padding - MIME encoding, candidate base64url, u, rfc4648 no padding, default @@ 
-107,6 +109,7 @@ Consider the following encodings of the same binary string: ``` 4D756C74696261736520697320617765736F6D6521205C6F2F # base16 (hex) JV2WY5DJMJQXGZJANFZSAYLXMVZW63LFEEQFY3ZP # base32 +3IY8QKL64VUGCX009XWUHKF6GBBTS3TVRXFRA5R # base36 YAjKoNbau5KiqmHPmSxYCvn66dA1vLmwbt # base58 TXVsdGliYXNlIGlzIGF3ZXNvbWUhIFxvLw== # base64 ``` @@ -116,11 +119,12 @@ And consider the same encodings with their multibase prefix ``` F4D756C74696261736520697320617765736F6D6521205C6F2F # base16 F BJV2WY5DJMJQXGZJANFZSAYLXMVZW63LFEEQFY3ZP # base32 B +K3IY8QKL64VUGCX009XWUHKF6GBBTS3TVRXFRA5R # base36 K zYAjKoNbau5KiqmHPmSxYCvn66dA1vLmwbt # base58 z MTXVsdGliYXNlIGlzIGF3ZXNvbWUhIFxvLw== # base64 M ``` -The base prefixes used are: `F, B, z, M`. +The base prefixes used are: `F, B, K, z, M`. ## FAQ diff --git a/multibase.csv b/multibase.csv index 5f69d41..865b404 100644 --- a/multibase.csv +++ b/multibase.csv @@ -5,19 +5,19 @@ base8, 7, octal, base10, 9, decimal, draft base16, f, hexadecimal, default base16upper, F, hexadecimal, default -base32hex, v, rfc4648 no padding - highest char, candidate -base32hexupper, V, rfc4648 no padding - highest char, candidate -base32hexpad, t, rfc4648 with padding, candidate -base32hexpadupper, T, rfc4648 with padding, candidate -base32, b, rfc4648 no padding, default -base32upper, B, rfc4648 no padding, default -base32pad, c, rfc4648 with padding, candidate -base32padupper, C, rfc4648 with padding, candidate +base32hex, v, rfc4648 case-insensitive - no padding - highest char, candidate +base32hexupper, V, rfc4648 case-insensitive - no padding - highest char, candidate +base32hexpad, t, rfc4648 case-insensitive - with padding, candidate +base32hexpadupper, T, rfc4648 case-insensitive - with padding, candidate +base32, b, rfc4648 case-insensitive - no padding, default +base32upper, B, rfc4648 case-insensitive - no padding, default +base32pad, c, rfc4648 case-insensitive - with padding, candidate +base32padupper, C, rfc4648 case-insensitive - with 
padding, candidate base32z, h, z-base-32 (used by Tahoe-LAFS), draft -base36upper, K, base36 [0-9a-z] case-insensitive no padding, default -base36, k, base36 [0-9a-z] case-insensitive no padding, default -base58flickr, Z, base58 flicker, candidate +base36, k, base36 [0-9a-z] case-insensitive - no padding, default +base36upper, K, base36 [0-9a-z] case-insensitive - no padding, default base58btc, z, base58 bitcoin, default +base58flickr, Z, base58 flicker, candidate base64, m, rfc4648 no padding, default base64pad, M, rfc4648 with padding - MIME encoding, candidate base64url, u, rfc4648 no padding, default From f378d3427fe125057facdbac936c4215cc777920 Mon Sep 17 00:00:00 2001 From: Peter Rabbitson Date: Fri, 22 May 2020 04:51:28 +0200 Subject: [PATCH 3/3] Nits --- README.md | 4 ++-- multibase.csv | 4 ++-- rfcs/Base36.md | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 8bd1565..220789f 100644 --- a/README.md +++ b/README.md @@ -72,8 +72,8 @@ base32upper, B, rfc4648 case-insensitive - no padding, base32pad, c, rfc4648 case-insensitive - with padding, candidate base32padupper, C, rfc4648 case-insensitive - with padding, candidate base32z, h, z-base-32 (used by Tahoe-LAFS), draft -base36, k, base36 [0-9a-z] case-insensitive - no padding, default -base36upper, K, base36 [0-9a-z] case-insensitive - no padding, default +base36, k, base36 [0-9a-z] case-insensitive - no padding, draft +base36upper, K, base36 [0-9a-z] case-insensitive - no padding, draft base58btc, z, base58 bitcoin, default base58flickr, Z, base58 flicker, candidate base64, m, rfc4648 no padding, default diff --git a/multibase.csv b/multibase.csv index 865b404..3b5abe2 100644 --- a/multibase.csv +++ b/multibase.csv @@ -14,8 +14,8 @@ base32upper, B, rfc4648 case-insensitive - no padding, base32pad, c, rfc4648 case-insensitive - with padding, candidate base32padupper, C, rfc4648 case-insensitive - with padding, candidate base32z, h, z-base-32 (used by 
Tahoe-LAFS), draft -base36, k, base36 [0-9a-z] case-insensitive - no padding, default -base36upper, K, base36 [0-9a-z] case-insensitive - no padding, default +base36, k, base36 [0-9a-z] case-insensitive - no padding, draft +base36upper, K, base36 [0-9a-z] case-insensitive - no padding, draft base58btc, z, base58 bitcoin, default base58flickr, Z, base58 flicker, candidate base64, m, rfc4648 no padding, default diff --git a/rfcs/Base36.md b/rfcs/Base36.md index cd4ff57..b050bfd 100644 --- a/rfcs/Base36.md +++ b/rfcs/Base36.md @@ -9,7 +9,7 @@ A byte array is encoded to base36 by: 1. Counting the number of leading 0 bytes (Z). 2. Interpreting the rest of the byte array as a big-endian unsigned integer (N). -3. Concatenating a length Z string of '0' characters with the decimal +3. Concatenating a length Z string of '0' characters with the base36 representation of N. A byte array is encoded to multibase base36 by prefixing its base36 encoding @@ -32,7 +32,7 @@ The remaining characters are then converted to a byte array by: Byte Array <-> Base36 Multibase: -| Bytes | == | LC Base36 | OR | UC base36 | +| Bytes | == | LC Base36 | OR | UC Base36 | |---|---|---|---|---| | `[0x00, 0x01]` | == | `"k01"` | | `"K01"` | | `[0x00, 0x00, 0xff]` | == | `"k0073"` | | `"K0073"` |