From ff25e9673c29fbd0a3fdac972a22eb8e04a2fcdf Mon Sep 17 00:00:00 2001 From: Eric Myhre Date: Fri, 24 Aug 2018 10:53:52 +0200 Subject: [PATCH 1/7] Open research dir; want to explore cid impl perf. It's been discussed in several issues and PRs already that we might want to explore various ways of implementing CIDs for maximum performance and ease-of-use because they show up extremely often. Current CIDs are pointers, which generally speaking means you can't get one without a malloc; and also, they're not particularly well-suited for use in map keys. This branch is to attempt to consolidate all the proposals so far -- and do so in a single branch which can be checked out and contains all the proposals at once, because this will make it easy to do benchmarks and compare all of the various ways we could implement this in one place (and also easier for humans to track what the latest of each proposal is, since they're all in one place). To start with: a Cid implementation backed by a string; and matching interface. (I'm also taking this opportunity to be as minimalistic as possible in what I port over into these experimental new Cid implementations. This might not last; but as long as all this work is to be done, it's a more convenient time than usual to see what can be stripped down and still get work done.) More to come. --- _rsrch/cidiface/cid.go | 46 ++++++++++++++++++++++++ _rsrch/cidiface/cidString.go | 69 ++++++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 _rsrch/cidiface/cid.go create mode 100644 _rsrch/cidiface/cidString.go diff --git a/_rsrch/cidiface/cid.go b/_rsrch/cidiface/cid.go new file mode 100644 index 0000000..8a888e1 --- /dev/null +++ b/_rsrch/cidiface/cid.go @@ -0,0 +1,46 @@ +package cid + +import ( + mh "github.com/multiformats/go-multihash" +) + +// Cid represents a self-describing content adressed identifier. +// +// A CID is composed of: +// +// - a Version of the CID itself, +// - a Multicodec (indicates the encoding of the referenced content), +// - and a Multihash (which identifies the referenced content). +// +// (Note that the Multihash further contains its own version and hash type +// indicators.) +type Cid interface { + // n.b. 'yields' means "without copy", 'produces' means a malloc. + + Version() uint64 // Yields the version prefix as a uint. + Multicodec() uint64 // Yields the multicodec as a uint. + Multihash() mh.Multihash // Yields the multihash segment. + + String() string // Produces the CID formatted as b58 string. + + Prefix() Prefix // Produces a tuple of non-content metadata. + + // some change notes: + // - `KeyString() CidString` is gone because we're natively a map key now, you're welcome. + // - `StringOfBase(mbase.Encoding) (string, error)` is skipped, maybe it can come back but maybe it should be a formatter's job. + // - `Equals(o Cid) bool` is gone because it's now `==`, you're welcome. + // - `Bytes() []byte` is gone because I can't imagine where that should be used except again where a formatter should be involved. +} + +// Prefix represents all the metadata of a Cid, +// that is, the Version, the Codec, the Multihash type +// and the Multihash length. It does not contains +// any actual content information. +// NOTE: The use -1 in MhLength to mean default length is deprecated, +// use the V0Builder or V1Builder structures instead +type Prefix struct { + Version uint64 + Codec uint64 + MhType uint64 + MhLength int +} diff --git a/_rsrch/cidiface/cidString.go b/_rsrch/cidiface/cidString.go new file mode 100644 index 0000000..c0deb9b --- /dev/null +++ b/_rsrch/cidiface/cidString.go @@ -0,0 +1,69 @@ +package cid + +import ( + "encoding/binary" + + mbase "github.com/multiformats/go-multibase" + mh "github.com/multiformats/go-multihash" +) + +var _ Cid = CidStr("") + +// CidStr is a representation of a Cid as a string type containing binary. +// +// Using golang's string type is preferable over byte slices even for binary +// data because golang strings are immutable, usable as map keys, +// trivially comparable with built-in equals operators, etc. +type CidStr string + +// EmptyCid is a constant for a zero/uninitialized/sentinelvalue cid; +// it is declared mainly for readability in checks for sentinel values. +const EmptyCid = CidStr("") + +func (c CidStr) Version() uint64 { + bytes := []byte(c) + v, _ := binary.Uvarint(bytes) + return v +} + +func (c CidStr) Multicodec() uint64 { + bytes := []byte(c) + _, n := binary.Uvarint(bytes) // skip version length + codec, _ := binary.Uvarint(bytes[n:]) + return codec +} + +func (c CidStr) Multihash() mh.Multihash { + bytes := []byte(c) + _, n1 := binary.Uvarint(bytes) // skip version length + _, n2 := binary.Uvarint(bytes[n1:]) // skip codec length + return mh.Multihash(bytes[n1+n2:]) // return slice of remainder +} + +// String returns the default string representation of a Cid. +// Currently, Base58 is used as the encoding for the multibase string. +func (c CidStr) String() string { + switch c.Version() { + case 0: + return c.Multihash().B58String() + case 1: + mbstr, err := mbase.Encode(mbase.Base58BTC, []byte(c)) + if err != nil { + panic("should not error with hardcoded mbase: " + err.Error()) + } + return mbstr + default: + panic("not possible to reach this point") + } +} + +// Prefix builds and returns a Prefix out of a Cid. +func (c CidStr) Prefix() Prefix { + dec, _ := mh.Decode(c.Multihash()) // assuming we got a valid multiaddr, this will not error + return Prefix{ + MhType: dec.Code, + MhLength: dec.Length, + Version: c.Version(), + Codec: c.Multicodec(), + } +} From c724ad0d221df41d1ea04c94d00b05bd6d128695 Mon Sep 17 00:00:00 2001 From: Eric Myhre Date: Fri, 24 Aug 2018 11:43:16 +0200 Subject: [PATCH 2/7] cid impl via struct and via string together. Added back in some of the parser methods. (These were previously named "Cast" and I think that's silly and wrong so I fixed it.) Functions are named overly-literally with their type (e.g. ParseCidString and ParseCidStruct rather than ParseCid or even just Parse) because for this research package I don't want to bother with many sub-packages. (Maybe I'll regret this, but at the moment it seems simpler to hold back on sub-packages.) Functions that produce Cids are literal with their return types, as well. Part of the purpose of this research package is going to be to concretely benchmark exactly how much performance overhead there is to using interfaces (which will likely cause a lot of boxing and unboxing in practice) -- since we want to explore where this boxing happens and how much it costs, it's important that none of our basic implementation functions do the boxing! The entire set of codec enums came along in this commit. Ah well; they would have eventually anyway, I guess. But it's interesting to note the only thing that dragged them along so far is the reference to 'DagProtobuf' when constructing v0 CIDs; otherwise, this enum is quite unused here. --- _rsrch/cidiface/cid.go | 4 +- _rsrch/cidiface/cidString.go | 96 ++++++++++++++++++++- _rsrch/cidiface/cidStruct.go | 162 +++++++++++++++++++++++++++++++++++ _rsrch/cidiface/enums.go | 76 ++++++++++++++++ _rsrch/cidiface/errors.go | 24 ++++++ _rsrch/cidiface/misc.go | 12 +++ 6 files changed, 371 insertions(+), 3 deletions(-) create mode 100644 _rsrch/cidiface/cidStruct.go create mode 100644 _rsrch/cidiface/enums.go create mode 100644 _rsrch/cidiface/errors.go create mode 100644 _rsrch/cidiface/misc.go diff --git a/_rsrch/cidiface/cid.go b/_rsrch/cidiface/cid.go index 8a888e1..cb4b871 100644 --- a/_rsrch/cidiface/cid.go +++ b/_rsrch/cidiface/cid.go @@ -22,6 +22,7 @@ type Cid interface { Multihash() mh.Multihash // Yields the multihash segment. String() string // Produces the CID formatted as b58 string. + Bytes() []byte // Produces the CID formatted as raw binary. Prefix() Prefix // Produces a tuple of non-content metadata. @@ -29,7 +30,8 @@ type Cid interface { // - `KeyString() CidString` is gone because we're natively a map key now, you're welcome. // - `StringOfBase(mbase.Encoding) (string, error)` is skipped, maybe it can come back but maybe it should be a formatter's job. // - `Equals(o Cid) bool` is gone because it's now `==`, you're welcome. - // - `Bytes() []byte` is gone because I can't imagine where that should be used except again where a formatter should be involved. + + // TODO: make a multi-return method for {v,mc,mh} decomposition. CidStr will be able to implement this more efficiently than if one makes a series of the individual getter calls. } // Prefix represents all the metadata of a Cid, diff --git a/_rsrch/cidiface/cidString.go b/_rsrch/cidiface/cidString.go index c0deb9b..af2014f 100644 --- a/_rsrch/cidiface/cidString.go +++ b/_rsrch/cidiface/cidString.go @@ -2,23 +2,32 @@ package cid import ( "encoding/binary" + "fmt" mbase "github.com/multiformats/go-multibase" mh "github.com/multiformats/go-multihash" ) +//================= +// def & accessors +//================= + var _ Cid = CidStr("") +var _ map[CidStr]struct{} = nil // CidStr is a representation of a Cid as a string type containing binary. // // Using golang's string type is preferable over byte slices even for binary // data because golang strings are immutable, usable as map keys, // trivially comparable with built-in equals operators, etc. +// +// Please do not cast strings or bytes into the CidStr type directly; +// use a parse method which validates the data and yields a CidStr. type CidStr string -// EmptyCid is a constant for a zero/uninitialized/sentinelvalue cid; +// EmptyCidStr is a constant for a zero/uninitialized/sentinelvalue cid; // it is declared mainly for readability in checks for sentinel values. -const EmptyCid = CidStr("") +const EmptyCidStr = CidStr("") func (c CidStr) Version() uint64 { bytes := []byte(c) @@ -57,6 +66,21 @@ func (c CidStr) String() string { } } +// Bytes produces a raw binary format of the CID. +// +// (For CidStr, this method is only distinct from casting because of +// compatibility with v0 CIDs.) +func (c CidStr) Bytes() []byte { + switch c.Version() { + case 0: + return c.Multihash() + case 1: + return []byte(c) + default: + panic("not possible to reach this point") + } +} + // Prefix builds and returns a Prefix out of a Cid. func (c CidStr) Prefix() Prefix { dec, _ := mh.Decode(c.Multihash()) // assuming we got a valid multiaddr, this will not error @@ -67,3 +91,71 @@ func (c CidStr) Prefix() Prefix { Codec: c.Multicodec(), } } + +//================================== +// parsers & validators & factories +//================================== + +func newCidStr(version uint64, codecType uint64, mhash mh.Multihash) CidStr { + hashlen := len(mhash) + // two 8 bytes (max) numbers plus hash + buf := make([]byte, 2*binary.MaxVarintLen64+hashlen) + n := binary.PutUvarint(buf, version) + n += binary.PutUvarint(buf[n:], codecType) + cn := copy(buf[n:], mhash) + if cn != hashlen { + panic("copy hash length is inconsistent") + } + return CidStr(buf[:n+hashlen]) +} + +// CidStrParse takes a binary byte slice, parses it, and returns either +// a valid CidStr, or the zero CidStr and an error. +// +// For CidV1, the data buffer is in the form: +// +// +// +// CidV0 are also supported. In particular, data buffers starting +// with length 34 bytes, which starts with bytes [18,32...] are considered +// binary multihashes. +// +// The multicodec bytes are not parsed to verify they're a valid varint; +// no further reification is performed. +// +// Multibase encoding should already have been unwrapped before parsing; +// if you have a multibase-enveloped string, use CidStrDecode instead. +// +// CidStrParse is the inverse of Cid.Bytes(). +func CidStrParse(data []byte) (CidStr, error) { + if len(data) == 34 && data[0] == 18 && data[1] == 32 { + h, err := mh.Cast(data) + if err != nil { + return EmptyCidStr, err + } + return newCidStr(0, DagProtobuf, h), nil + } + + vers, n := binary.Uvarint(data) + if err := uvError(n); err != nil { + return EmptyCidStr, err + } + + if vers != 0 && vers != 1 { + return EmptyCidStr, fmt.Errorf("invalid cid version number: %d", vers) + } + + _, cn := binary.Uvarint(data[n:]) + if err := uvError(cn); err != nil { + return EmptyCidStr, err + } + + rest := data[n+cn:] + h, err := mh.Cast(rest) + if err != nil { + return EmptyCidStr, err + } + + // REVIEW: if the data is longer than the mh.len expects, we silently ignore it? should we? + return CidStr(data[0 : n+cn+len(h)]), nil +} diff --git a/_rsrch/cidiface/cidStruct.go b/_rsrch/cidiface/cidStruct.go new file mode 100644 index 0000000..dcd154e --- /dev/null +++ b/_rsrch/cidiface/cidStruct.go @@ -0,0 +1,162 @@ +package cid + +import ( + "encoding/binary" + "fmt" + + mbase "github.com/multiformats/go-multibase" + mh "github.com/multiformats/go-multihash" +) + +//================= +// def & accessors +//================= + +var _ Cid = CidStruct{} + +//var _ map[CidStruct]struct{} = nil // Will not compile! See struct def docs. + +// CidStruct represents a CID in a struct format. +// +// This format complies with the exact same Cid interface as the CidStr +// implementation, but completely pre-parses the Cid metadata. +// CidStruct is a tad quicker in case of repeatedly accessed fields, +// but requires more reshuffling to parse and to serialize. +// CidStruct is not usable as a map key, because it contains a Multihash +// reference, which is a slice, and thus not "comparable" as a primitive. +// +// Beware of zero-valued CidStruct: it is difficult to distinguish an +// incorrectly-initialized "invalid" CidStruct from one representing a v0 cid. +type CidStruct struct { + version uint64 + codec uint64 + hash mh.Multihash +} + +// EmptyCidStruct is a constant for a zero/uninitialized/sentinelvalue cid; +// it is declared mainly for readability in checks for sentinel values. +// +// Note: it's not actually a const; the compiler does not allow const structs. +var EmptyCidStruct = CidStruct{} + +func (c CidStruct) Version() uint64 { + return c.version +} + +func (c CidStruct) Multicodec() uint64 { + return c.codec +} + +func (c CidStruct) Multihash() mh.Multihash { + return c.hash +} + +// String returns the default string representation of a Cid. +// Currently, Base58 is used as the encoding for the multibase string. +func (c CidStruct) String() string { + switch c.Version() { + case 0: + return c.Multihash().B58String() + case 1: + mbstr, err := mbase.Encode(mbase.Base58BTC, c.Bytes()) + if err != nil { + panic("should not error with hardcoded mbase: " + err.Error()) + } + return mbstr + default: + panic("not possible to reach this point") + } +} + +// Bytes produces a raw binary format of the CID. +func (c CidStruct) Bytes() []byte { + switch c.version { + case 0: + return []byte(c.hash) + case 1: + // two 8 bytes (max) numbers plus hash + buf := make([]byte, 2*binary.MaxVarintLen64+len(c.hash)) + n := binary.PutUvarint(buf, c.version) + n += binary.PutUvarint(buf[n:], c.codec) + cn := copy(buf[n:], c.hash) + if cn != len(c.hash) { + panic("copy hash length is inconsistent") + } + return buf[:n+len(c.hash)] + default: + panic("not possible to reach this point") + } +} + +// Prefix builds and returns a Prefix out of a Cid. +func (c CidStruct) Prefix() Prefix { + dec, _ := mh.Decode(c.hash) // assuming we got a valid multiaddr, this will not error + return Prefix{ + MhType: dec.Code, + MhLength: dec.Length, + Version: c.version, + Codec: c.codec, + } +} + +//================================== +// parsers & validators & factories +//================================== + +// CidStructParse takes a binary byte slice, parses it, and returns either +// a valid CidStruct, or the zero CidStruct and an error. +// +// For CidV1, the data buffer is in the form: +// +// +// +// CidV0 are also supported. In particular, data buffers starting +// with length 34 bytes, which starts with bytes [18,32...] are considered +// binary multihashes. +// +// The multicodec bytes are not parsed to verify they're a valid varint; +// no further reification is performed. +// +// Multibase encoding should already have been unwrapped before parsing; +// if you have a multibase-enveloped string, use CidStructDecode instead. +// +// CidStructParse is the inverse of Cid.Bytes(). +func CidStructParse(data []byte) (CidStruct, error) { + if len(data) == 34 && data[0] == 18 && data[1] == 32 { + h, err := mh.Cast(data) + if err != nil { + return EmptyCidStruct, err + } + return CidStruct{ + codec: DagProtobuf, + version: 0, + hash: h, + }, nil + } + + vers, n := binary.Uvarint(data) + if err := uvError(n); err != nil { + return EmptyCidStruct, err + } + + if vers != 0 && vers != 1 { + return EmptyCidStruct, fmt.Errorf("invalid cid version number: %d", vers) + } + + codec, cn := binary.Uvarint(data[n:]) + if err := uvError(cn); err != nil { + return EmptyCidStruct, err + } + + rest := data[n+cn:] + h, err := mh.Cast(rest) + if err != nil { + return EmptyCidStruct, err + } + + return CidStruct{ + version: vers, + codec: codec, + hash: h, + }, nil +} diff --git a/_rsrch/cidiface/enums.go b/_rsrch/cidiface/enums.go new file mode 100644 index 0000000..53e3d47 --- /dev/null +++ b/_rsrch/cidiface/enums.go @@ -0,0 +1,76 @@ +package cid + +// These are multicodec-packed content types. The should match +// the codes described in the authoritative document: +// https://github.com/multiformats/multicodec/blob/master/table.csv +const ( + Raw = 0x55 + + DagProtobuf = 0x70 + DagCBOR = 0x71 + + GitRaw = 0x78 + + EthBlock = 0x90 + EthBlockList = 0x91 + EthTxTrie = 0x92 + EthTx = 0x93 + EthTxReceiptTrie = 0x94 + EthTxReceipt = 0x95 + EthStateTrie = 0x96 + EthAccountSnapshot = 0x97 + EthStorageTrie = 0x98 + BitcoinBlock = 0xb0 + BitcoinTx = 0xb1 + ZcashBlock = 0xc0 + ZcashTx = 0xc1 + DecredBlock = 0xe0 + DecredTx = 0xe1 +) + +// Codecs maps the name of a codec to its type +var Codecs = map[string]uint64{ + "v0": DagProtobuf, + "raw": Raw, + "protobuf": DagProtobuf, + "cbor": DagCBOR, + "git-raw": GitRaw, + "eth-block": EthBlock, + "eth-block-list": EthBlockList, + "eth-tx-trie": EthTxTrie, + "eth-tx": EthTx, + "eth-tx-receipt-trie": EthTxReceiptTrie, + "eth-tx-receipt": EthTxReceipt, + "eth-state-trie": EthStateTrie, + "eth-account-snapshot": EthAccountSnapshot, + "eth-storage-trie": EthStorageTrie, + "bitcoin-block": BitcoinBlock, + "bitcoin-tx": BitcoinTx, + "zcash-block": ZcashBlock, + "zcash-tx": ZcashTx, + "decred-block": DecredBlock, + "decred-tx": DecredTx, +} + +// CodecToStr maps the numeric codec to its name +var CodecToStr = map[uint64]string{ + Raw: "raw", + DagProtobuf: "protobuf", + DagCBOR: "cbor", + GitRaw: "git-raw", + EthBlock: "eth-block", + EthBlockList: "eth-block-list", + EthTxTrie: "eth-tx-trie", + EthTx: "eth-tx", + EthTxReceiptTrie: "eth-tx-receipt-trie", + EthTxReceipt: "eth-tx-receipt", + EthStateTrie: "eth-state-trie", + EthAccountSnapshot: "eth-account-snapshot", + EthStorageTrie: "eth-storage-trie", + BitcoinBlock: "bitcoin-block", + BitcoinTx: "bitcoin-tx", + ZcashBlock: "zcash-block", + ZcashTx: "zcash-tx", + DecredBlock: "decred-block", + DecredTx: "decred-tx", +} diff --git a/_rsrch/cidiface/errors.go b/_rsrch/cidiface/errors.go new file mode 100644 index 0000000..588c62e --- /dev/null +++ b/_rsrch/cidiface/errors.go @@ -0,0 +1,24 @@ +package cid + +import ( + "errors" +) + +var ( + // ErrVarintBuffSmall means that a buffer passed to the cid parser was not + // long enough, or did not contain an invalid cid + ErrVarintBuffSmall = errors.New("reading varint: buffer too small") + + // ErrVarintTooBig means that the varint in the given cid was above the + // limit of 2^64 + ErrVarintTooBig = errors.New("reading varint: varint bigger than 64bits" + + " and not supported") + + // ErrCidTooShort means that the cid passed to decode was not long + // enough to be a valid Cid + ErrCidTooShort = errors.New("cid too short") + + // ErrInvalidEncoding means that selected encoding is not supported + // by this Cid version + ErrInvalidEncoding = errors.New("invalid base encoding") +) diff --git a/_rsrch/cidiface/misc.go b/_rsrch/cidiface/misc.go new file mode 100644 index 0000000..9a4486a --- /dev/null +++ b/_rsrch/cidiface/misc.go @@ -0,0 +1,12 @@ +package cid + +func uvError(read int) error { + switch { + case read == 0: + return ErrVarintBuffSmall + case read < 0: + return ErrVarintTooBig + default: + return nil + } +} From b4ab25ffda195b7034eade3da127a94004460808 Mon Sep 17 00:00:00 2001 From: Eric Myhre Date: Fri, 24 Aug 2018 12:16:52 +0200 Subject: [PATCH 3/7] Discovered interesting case in map key checking. Using interfaces as a map's key type can cause some things that were otherwise compile-time checks to be pushed off into runtime checks instead. This is a pretty major "caveat emptor" if you use interface-keyed maps. --- _rsrch/cidiface/cidStruct.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/_rsrch/cidiface/cidStruct.go b/_rsrch/cidiface/cidStruct.go index dcd154e..2fd5fa7 100644 --- a/_rsrch/cidiface/cidStruct.go +++ b/_rsrch/cidiface/cidStruct.go @@ -15,6 +15,8 @@ import ( var _ Cid = CidStruct{} //var _ map[CidStruct]struct{} = nil // Will not compile! See struct def docs. +//var _ map[Cid]struct{} = map[Cid]struct{}{CidStruct{}: struct{}{}} // Legal to compile... +// but you'll get panics: "runtime error: hash of unhashable type cid.CidStruct" // CidStruct represents a CID in a struct format. // From 348b9201a63d8dff7c5a8e21252e0450cdc72b23 Mon Sep 17 00:00:00 2001 From: Eric Myhre Date: Fri, 24 Aug 2018 12:24:34 +0200 Subject: [PATCH 4/7] Start a readme for this research project. Right now this is mostly this is to document the behavior of interface-keyed maps. I suspect some of those caveats might be non-obvious to a lot of folks. --- _rsrch/cidiface/README.md | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 _rsrch/cidiface/README.md diff --git a/_rsrch/cidiface/README.md b/_rsrch/cidiface/README.md new file mode 100644 index 0000000..d221a5d --- /dev/null +++ b/_rsrch/cidiface/README.md @@ -0,0 +1,39 @@ +What golang Kinds work best to implement CIDs? +============================================== + +There are many possible ways to implement CIDs. This package explores them. + +- Option A: CIDs as a struct; multihash as bytes. +- Option B: CIDs as a string. +- Option C: CIDs as an interface with multiple implementors. +- Option D: CIDs as a struct; multihash also as a struct or string. + +There's a couple different criteria to consider: + +- We want the best performance when operating on the type (getters, mostly); +- We want to minimize the number of memory allocations we need; +- We want types which can be used as map keys, because this is common. + +The priority of these criteria is open to argument, but it's probably +mapkeys > minalloc > anythingelse. +(Mapkeys and minalloc are also quite entangled, since if we don't pick a +representation that can work natively as a map key, we'll end up needing +a `KeyRepr()` method which gives us something that does work as a map key, +an that will almost certainly involve a malloc itself.) + + +Discoveries +----------- + +### using interfaces as map keys forgoes a lot of safety checks + +Using interfaces as map keys pushes a bunch of type checking to runtime. +E.g., it's totally valid at compile time to push a type which is non-comparable +into a map key; it will panic at *runtime* instead of failing at compile-time. + +There's also no way to define equality checks between implementors of the +interface: golang will always use its innate concept of comparison for the +concrete types. This means its effectively *never safe* to use two different +concrete implementations of an interface in the same map; you may add elements +which are semantically "equal" in your mind, and end up very confused later +when both impls of the same "equal" object have been stored. From fb8ecaccad523f0dfa20ec2e9a4225c72f9c9fcd Mon Sep 17 00:00:00 2001 From: Eric Myhre Date: Fri, 24 Aug 2018 12:37:50 +0200 Subject: [PATCH 5/7] Enumerate some more options in prose. --- _rsrch/cidiface/README.md | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/_rsrch/cidiface/README.md b/_rsrch/cidiface/README.md index d221a5d..9f5aae9 100644 --- a/_rsrch/cidiface/README.md +++ b/_rsrch/cidiface/README.md @@ -3,10 +3,7 @@ What golang Kinds work best to implement CIDs? There are many possible ways to implement CIDs. This package explores them. -- Option A: CIDs as a struct; multihash as bytes. -- Option B: CIDs as a string. -- Option C: CIDs as an interface with multiple implementors. -- Option D: CIDs as a struct; multihash also as a struct or string. +### criteria There's a couple different criteria to consider: @@ -21,6 +18,34 @@ representation that can work natively as a map key, we'll end up needing a `KeyRepr()` method which gives us something that does work as a map key, an that will almost certainly involve a malloc itself.) +### options + +There are quite a few different ways to go: + +- Option A: CIDs as a struct; multihash as bytes. +- Option B: CIDs as a string. +- Option C: CIDs as an interface with multiple implementors. +- Option D: CIDs as a struct; multihash also as a struct or string. +- Option E: CIDs as a struct; content as strings plus offsets. + +The current approach on the master branch is Option A. + +Option D is distinctive from Option A because multihash as bytes transitively +causes the CID struct to be non-comparible and thus not suitable for map keys +as per https://golang.org/ref/spec#KeyType . (It's also a bit more work to +pursue Option D because it's just a bigger splash radius of change; but also, +something we might also want to do soon, because we *do* also have these same +map-key-usability concerns with multihash alone.) + +Option E is distinctive from Option D because Option E would always maintain +the binary format of the cid internally, and so could yield it again without +malloc, while still potentially having faster access to components than +Option B since it wouldn't need to re-parse varints to access later fields. + +Option C is the avoid-choices choice, but note that interfaces are not free; +since "minimize mallocs" is one of our major goals, we cannot use interfaces +whimsically. + Discoveries ----------- From 5a6d4bdf062d560015398c9d1d41b08e115f1ea0 Mon Sep 17 00:00:00 2001 From: Eric Myhre Date: Fri, 24 Aug 2018 13:18:50 +0200 Subject: [PATCH 6/7] More readme on the state of iface research. --- _rsrch/cidiface/README.md | 60 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/_rsrch/cidiface/README.md b/_rsrch/cidiface/README.md index 9f5aae9..740ba9a 100644 --- a/_rsrch/cidiface/README.md +++ b/_rsrch/cidiface/README.md @@ -46,6 +46,9 @@ Option C is the avoid-choices choice, but note that interfaces are not free; since "minimize mallocs" is one of our major goals, we cannot use interfaces whimsically. +Note there is no proposal for migrating to `type Cid []bytes`, because that +is generally considered to be strictly inferior to `type Cid string`. + Discoveries ----------- @@ -62,3 +65,60 @@ concrete types. This means its effectively *never safe* to use two different concrete implementations of an interface in the same map; you may add elements which are semantically "equal" in your mind, and end up very confused later when both impls of the same "equal" object have been stored. + +### sentinel values are possible in any impl, but some are clearer than others + +When using `*Cid`, the nil value is a clear sentinel for 'invalid'; +when using `type Cid string`, the zero value is a clear sentinel; +when using `type Cid struct` per Option A or D... the only valid check is +for a nil multihash field, since version=0 and codec=0 are both valid values. + +### usability as a map key is important + +We already covered this in the criteria section, but for clarity: + +- Option A: ❌ +- Option B: ✔ +- Option C: ~ (caveats, and depends on concrete impl) +- Option D: ✔ +- Option E: ✔ + +### living without offsets requires parsing + +Since CID (and multihash!) are defined using varints, they require parsing; +we can't just jump into the string at a known offset in order to yield e.g. +the multicodec number. + +In order to get to the 'meat' of the CID (the multihash content), we first +must parse: + +- the CID version varint; +- the multicodec varint; +- the multihash type enum varint; +- and the multihash length varint. + +Since there are many applications where we want to jump straight to the +multihash content (for example, when doing CAS sharding -- see the +[disclaimer](https://github.com/multiformats/multihash#disclaimers) about +bias in leading bytes), this overhead may be interesting. + +How much this overhead is significant is hard to say from microbenchmarking; +it depends largely on usage patterns. If these traversals are a significant +timesink, it would be an argument for Option D/E. +If these traversals are *not* a significant timesink, we might be wiser +to keep to Option B, because keeping a struct full of offsets will add several +words of memory usage per CID, and we keep a *lot* of CIDs. + +### one way or another: let's get rid of that star + +We should switch completely to handling `Cid` and remove `*Cid` completely. +Regardless of whether we do this by migrating to interface, or string +implementations, or simply structs with no pointers... once we get there, +refactoring to any of the *others* can become a no-op from the perspective +of any downstream code that uses CIDs. + +(This means all access via functions, never references to fields -- even if +we were to use a struct implementation. *Pretend* there's a interface, +in other words.) + +There are probably `gofix` incantations which can help us with this migration. From 2cf56e3813404626502a3c23a4644e57866f4176 Mon Sep 17 00:00:00 2001 From: Eric Myhre Date: Fri, 24 Aug 2018 13:53:50 +0200 Subject: [PATCH 7/7] Benchmarks of various Cid types as map keys. And writeup on same. tl;dr interfaces are not cheap if you're already at the scale where you started caring about whether or not you have pointers. --- _rsrch/cidiface/README.md | 20 ++++++++ _rsrch/cidiface/cidBoxingBench_test.go | 71 ++++++++++++++++++++++++++ _rsrch/cidiface/cidString.go | 4 +- 3 files changed, 93 insertions(+), 2 deletions(-) create mode 100644 _rsrch/cidiface/cidBoxingBench_test.go diff --git a/_rsrch/cidiface/README.md b/_rsrch/cidiface/README.md index 740ba9a..24dd9b7 100644 --- a/_rsrch/cidiface/README.md +++ b/_rsrch/cidiface/README.md @@ -109,6 +109,26 @@ If these traversals are *not* a significant timesink, we might be wiser to keep to Option B, because keeping a struct full of offsets will add several words of memory usage per CID, and we keep a *lot* of CIDs. +### interfaces cause boxing which is a significant performance cost + +See `BenchmarkCidMap_CidStr` and friends. + +Long story short: using interfaces *anywhere* will cause the compiler to +implicitly generate boxing and unboxing code (e.g. `runtime.convT2E`); +this is both another function call, and more concerningly, results in +large numbers of unbatchable memory allocations. + +Numbers without context are dangerous, but if you need one: 33%. +It's a big deal. + +This means attempts to "use interfaces, but switch to concrete impls when +performance is important" are a red herring: it doesn't work that way. + +This is not a general inditement against using interfaces -- but +if a situation is at the scale where it's become important to mind whether +or not pointers are a performance impact, then that situation also +is one where you have to think twice before using interfaces. + ### one way or another: let's get rid of that star We should switch completely to handling `Cid` and remove `*Cid` completely. diff --git a/_rsrch/cidiface/cidBoxingBench_test.go b/_rsrch/cidiface/cidBoxingBench_test.go new file mode 100644 index 0000000..920ce1f --- /dev/null +++ b/_rsrch/cidiface/cidBoxingBench_test.go @@ -0,0 +1,71 @@ +package cid + +import ( + "testing" +) + +// BenchmarkCidMap_CidStr estimates how fast it is to insert primitives into a map +// keyed by CidStr (concretely). +// +// We do 100 insertions per benchmark run to make sure the map initialization +// doesn't dominate the results. +// +// Sample results on linux amd64 go1.11beta: +// +// BenchmarkCidMap_CidStr-8 100000 16317 ns/op +// BenchmarkCidMap_CidIface-8 100000 20516 ns/op +// +// With benchmem on: +// +// BenchmarkCidMap_CidStr-8 100000 15579 ns/op 11223 B/op 207 allocs/op +// BenchmarkCidMap_CidIface-8 100000 19500 ns/op 12824 B/op 307 allocs/op +// BenchmarkCidMap_StrPlusHax-8 200000 10451 ns/op 7589 B/op 202 allocs/op +// +// We can see here that the impact of interface boxing is significant: +// it increases the time taken to do the inserts to 133%, largely because +// the implied `runtime.convT2E` calls cause another malloc each. +// +// There are also significant allocations in both cases because +// A) we cannot create a multihash without allocations since they are []byte; +// B) the map has to be grown several times; +// C) something I haven't quite put my finger on yet. +// Ideally we'd drive those down further as well. +// +// Pre-allocating the map reduces allocs by a very small percentage by *count*, +// but reduces the time taken by 66% overall (presumably because when a map +// re-arranges itself, it involves more or less an O(n) copy of the content +// in addition to the alloc itself). This isn't topical to the question of +// whether or not interfaces are a good idea; just for contextualizing. +// +func BenchmarkCidMap_CidStr(b *testing.B) { + for i := 0; i < b.N; i++ { + mp := map[CidStr]int{} + for x := 0; x < 100; x++ { + mp[NewCidStr(0, uint64(x), []byte{})] = x + } + } +} + +// BenchmarkCidMap_CidIface is in the family of BenchmarkCidMap_CidStr: +// it is identical except the map key type is declared as an interface +// (which forces all insertions to be boxed, changing performance). +func BenchmarkCidMap_CidIface(b *testing.B) { + for i := 0; i < b.N; i++ { + mp := map[Cid]int{} + for x := 0; x < 100; x++ { + mp[NewCidStr(0, uint64(x), []byte{})] = x + } + } +} + +// BenchmarkCidMap_CidStrAvoidMapGrowth is in the family of BenchmarkCidMap_CidStr: +// it is identical except the map is created with a size hint that removes +// some allocations (5, in practice, apparently). +func BenchmarkCidMap_CidStrAvoidMapGrowth(b *testing.B) { + for i := 0; i < b.N; i++ { + mp := make(map[CidStr]int, 100) + for x := 0; x < 100; x++ { + mp[NewCidStr(0, uint64(x), []byte{})] = x + } + } +} diff --git a/_rsrch/cidiface/cidString.go b/_rsrch/cidiface/cidString.go index af2014f..129fafd 100644 --- a/_rsrch/cidiface/cidString.go +++ b/_rsrch/cidiface/cidString.go @@ -96,7 +96,7 @@ func (c CidStr) Prefix() Prefix { // parsers & validators & factories //================================== -func newCidStr(version uint64, codecType uint64, mhash mh.Multihash) CidStr { +func NewCidStr(version uint64, codecType uint64, mhash mh.Multihash) CidStr { hashlen := len(mhash) // two 8 bytes (max) numbers plus hash buf := make([]byte, 2*binary.MaxVarintLen64+hashlen) @@ -133,7 +133,7 @@ func CidStrParse(data []byte) (CidStr, error) { if err != nil { return EmptyCidStr, err } - return newCidStr(0, DagProtobuf, h), nil + return NewCidStr(0, DagProtobuf, h), nil } vers, n := binary.Uvarint(data)