Rebase and merge some long-ago written serialization-with-backrefs code. #168
Merged
Changes from all commits (13 commits)
91ca96f  First crack at serialization with backrefs.  (richardkiss)
55d846b  Add bomb test.  (richardkiss)
4315713  Address some comments from ChiaMineJP.  (richardkiss)
30dca3a  Use .gz file instead of .hex  (richardkiss)
f906ba3  Add comments. Simplify `traverse_path`.  (richardkiss)
194d8ca  rename  (richardkiss)
51525be  Add tests. Choose lexicographically smallest path.  (richardkiss)
cb6c78c  more tests  (richardkiss)
1748497  Use `reversed_path`.  (richardkiss)
54e7569  fix test  (richardkiss)
c55c42f  Comments, deal with too-large atoms.  (richardkiss)
9e5d97b  Fix name  (richardkiss)
9f9b885  mypy  (richardkiss)
@@ -0,0 +1,106 @@

from typing import Callable, Dict, Generic, Optional, Tuple, TypeVar

import hashlib

from .CLVMObject import CLVMStorage

T = TypeVar("T")


class ObjectCache(Generic[T]):
    """
    `ObjectCache` provides a way to calculate and cache values for each node
    in a clvm object tree. It can be used to calculate the sha256 tree hash
    for an object and save the hash for all the child objects for building
    usage tables, for example.

    It also allows a function that's defined recursively on a clvm tree to
    have a non-recursive implementation (as it keeps a stack of uncached
    objects locally).
    """

    def __init__(self, f: Callable[["ObjectCache[T]", CLVMStorage], Optional[T]]):
        """
        `f`: Callable[ObjectCache, CLVMObject] -> Union[None, T]

        The function `f` is expected to calculate its T value recursively based
        on the T values of the left and right children of a pair. For an atom, the
        function f must calculate the T value directly.

        If a pair is passed and one of the children does not have its T value cached
        in `ObjectCache` yet, return `None` and f will be called with each child in turn.
        Don't recurse in f; that's part of the point of this class.
        """
        self.f = f
        self.lookup: Dict[int, Tuple[T, CLVMStorage]] = dict()

    def get(self, obj: CLVMStorage) -> T:
        obj_id = id(obj)
        if obj_id not in self.lookup:
            obj_list = [obj]
            while obj_list:
                node = obj_list.pop()
                node_id = id(node)
                if node_id not in self.lookup:
                    v = self.f(self, node)
                    if v is None:
                        if node.pair is None:
                            raise ValueError("f returned None for atom", node)
                        obj_list.append(node)
                        obj_list.append(node.pair[0])
                        obj_list.append(node.pair[1])
                    else:
                        self.lookup[node_id] = (v, node)
        return self.lookup[obj_id][0]

    def contains(self, obj: CLVMStorage) -> bool:
        return id(obj) in self.lookup


def treehash(cache: ObjectCache[bytes], obj: CLVMStorage) -> Optional[bytes]:
    """
    This function can be fed to `ObjectCache` to calculate the sha256 tree
    hash for all objects in a tree.
    """
    if obj.pair:
        left, right = obj.pair

        # ensure both `left` and `right` have cached values
        if cache.contains(left) and cache.contains(right):
            left_hash = cache.get(left)
            right_hash = cache.get(right)
            return hashlib.sha256(b"\2" + left_hash + right_hash).digest()
        return None
    assert obj.atom is not None
    return hashlib.sha256(b"\1" + obj.atom).digest()


def serialized_length(cache: ObjectCache[int], obj: CLVMStorage) -> Optional[int]:
    """
    This function can be fed to `ObjectCache` to calculate the serialized
    length for all objects in a tree.
    """
    if obj.pair:
        left, right = obj.pair

        # ensure both `left` and `right` have cached values
        if cache.contains(left) and cache.contains(right):
            left_length = cache.get(left)
            right_length = cache.get(right)
            return 1 + left_length + right_length
        return None
    assert obj.atom is not None
    lb = len(obj.atom)
    if lb == 0 or (lb == 1 and obj.atom[0] < 128):
        return 1
    if lb < 0x40:
        return 1 + lb
    if lb < 0x2000:
        return 2 + lb
    if lb < 0x100000:
        return 3 + lb
    if lb < 0x8000000:
        return 4 + lb
    if lb < 0x400000000:
        return 5 + lb
    raise ValueError("atom of size %d too long" % lb)
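
For orientation (not part of the diff), here is a minimal sketch of how the two callbacks above plug into `ObjectCache`. The `Node` class is a hypothetical stand-in for anything satisfying the `CLVMStorage` protocol (an object exposing `atom` and `pair`); with real clvm objects the calls are the same.

from typing import Optional, Tuple


class Node:
    # hypothetical CLVMStorage-style node: exactly one of `atom` / `pair` is set
    def __init__(
        self,
        atom: Optional[bytes] = None,
        pair: Optional[Tuple["Node", "Node"]] = None,
    ) -> None:
        self.atom = atom
        self.pair = pair


# build the tree (1 . (2 . ())) by hand
nil = Node(atom=b"")
tree = Node(pair=(Node(atom=b"\x01"), Node(pair=(Node(atom=b"\x02"), nil))))

hash_cache = ObjectCache(treehash)
length_cache = ObjectCache(serialized_length)

print(hash_cache.get(tree).hex())  # sha256 tree hash of the root; children cached too
print(length_cache.get(tree))      # serialized length in bytes; 5 for this tree

Because `get` drives the traversal with an explicit stack, the same pattern works for trees deep enough that a naive recursive implementation would hit Python's recursion limit.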
@@ -0,0 +1,178 @@

from collections import Counter
from typing import Dict, Optional, List, Set, Tuple

import hashlib


LEFT = 0
RIGHT = 1


class ReadCacheLookup:
    """
    When deserializing a clvm object, a stack of deserialized child objects
    is created, which can be used with back-references. A `ReadCacheLookup` keeps
    track of the state of this stack and all child objects under each root
    node in the stack so that we can quickly determine if a relevant
    back-reference is available.

    In other words, if we've already serialized an object with tree hash T,
    and we encounter another object with that tree hash, we don't re-serialize
    it, but rather include a back-reference to it. This data structure lets
    us quickly determine which back-reference has the shortest path.

    Note that there is a counter. This is because the stack contains some
    child objects that are transient, and no longer appear in the stack
    at later times in the parsing. We don't want to waste time looking for
    these objects that no longer exist, so we reference-count them.

    All hashes correspond to sha256 tree hashes.
    """

    def __init__(self) -> None:
        """
        Create a new `ReadCacheLookup` object with just the null terminator
        (ie. an empty list of objects).
        """
        self.root_hash = hashlib.sha256(b"\1").digest()
        self.read_stack: List[Tuple[bytes, bytes]] = []
        self.count: Counter[bytes] = Counter()
        self.parent_paths_for_child: Dict[bytes, List[Tuple[bytes, int]]] = {}

    def push(self, obj_hash: bytes) -> None:
        """
        This function is used to note that an object with the given hash has just
        been pushed to the read stack, and update the lookups as appropriate.
        """
        # we add two new entries: the new root of the tree, and this object (by id)
        # new_root: (obj_hash, old_root)
        new_root_hash = hashlib.sha256(b"\2" + obj_hash + self.root_hash).digest()

        self.read_stack.append((obj_hash, self.root_hash))

        self.count.update([obj_hash, new_root_hash])

        new_parent_to_old_root = (new_root_hash, LEFT)
        self.parent_paths_for_child.setdefault(obj_hash, list()).append(
            new_parent_to_old_root
        )

        new_parent_to_id = (new_root_hash, RIGHT)
        self.parent_paths_for_child.setdefault(self.root_hash, list()).append(
            new_parent_to_id
        )
        self.root_hash = new_root_hash

    def pop(self) -> Tuple[bytes, bytes]:
        """
        This function is used to note that the top object has just been popped
        from the read stack. Return the 2-tuple of the child hashes.
        """
        item = self.read_stack.pop()
        self.count[item[0]] -= 1
        self.count[self.root_hash] -= 1
        self.root_hash = item[1]
        return item

    def pop2_and_cons(self) -> None:
        """
        This function is used to note that a "pop-and-cons" operation has just
        happened. We remove two objects, cons them together, and push the cons,
        updating the internal look-ups as necessary.
        """
        # we remove two items: the right side of each left/right pair
        right = self.pop()
        left = self.pop()

        self.count.update([left[0], right[0]])

        new_root_hash = hashlib.sha256(b"\2" + left[0] + right[0]).digest()

        self.parent_paths_for_child.setdefault(left[0], list()).append(
            (new_root_hash, LEFT)
        )
        self.parent_paths_for_child.setdefault(right[0], list()).append(
            (new_root_hash, RIGHT)
        )
        self.push(new_root_hash)

    def find_paths(self, obj_hash: bytes, serialized_length: int) -> Set[bytes]:
        """
        This function looks for a path from the root to a child node with a given hash
        by using the read cache.
        """
        valid_paths: Set[bytes] = set()
        if serialized_length < 3:
            return valid_paths

        seen_ids: Set[bytes] = set()

        max_bytes_for_path_encoding = serialized_length - 2
        # 1 byte for 0xfe, 1 min byte for savings

        max_path_length = max_bytes_for_path_encoding * 8 - 1
        seen_ids.add(obj_hash)

        partial_paths: List[Tuple[bytes, List[int]]] = [(obj_hash, [])]

        while partial_paths:
            new_seen_ids = set(seen_ids)
            new_partial_paths = []
            for node, path in partial_paths:
                if node == self.root_hash:
                    valid_paths.add(reversed_path_to_bytes(path))
                    continue

                parent_paths = self.parent_paths_for_child.get(node)

                if parent_paths:
                    for parent, direction in parent_paths:
                        if self.count[parent] > 0 and parent not in seen_ids:
                            new_path = list(path)
                            new_path.append(direction)
                            if len(new_path) > max_path_length:
                                return set()
                            new_partial_paths.append((parent, new_path))
                        new_seen_ids.add(parent)
            partial_paths = new_partial_paths
            if valid_paths:
                return valid_paths
            seen_ids = set(new_seen_ids)
        return valid_paths

    def find_path(self, obj_hash: bytes, serialized_length: int) -> Optional[bytes]:
        r = self.find_paths(obj_hash, serialized_length)
        return min(r) if len(r) > 0 else None


def reversed_path_to_bytes(path: List[int]) -> bytes:
    """
    Convert a list of 0/1 (for left/right) values to a path expected by clvm.

    Reverse the list; convert to a binary number; prepend a 1; break into bytes.

    [] => bytes([0b1])
    [0] => bytes([0b10])
    [1] => bytes([0b11])
    [0, 0] => bytes([0b100])
    [0, 1] => bytes([0b101])
    [1, 0] => bytes([0b110])
    [1, 1] => bytes([0b111])
    [0, 0, 1] => bytes([0b1001])
    [1, 1, 1, 1, 0, 0, 0, 0, 1] => bytes([0b11, 0b11100001])
    """

    byte_count = (len(path) + 1 + 7) >> 3
    v = bytearray(byte_count)
    index = byte_count - 1
    mask = 1
    for p in reversed(path):
        if p:
            v[index] |= mask
        if mask == 0x80:
            index -= 1
            mask = 1
        else:
            mask <<= 1
    v[index] |= mask
    return bytes(v)
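
And a small, hypothetical walk-through (again, not part of the diff) of how a backref-aware serializer might drive `ReadCacheLookup`: push an atom's tree hash after writing the atom, call `pop2_and_cons` when a pair is completed, and ask `find_path` whether an already-seen subtree can be replaced by a back-reference. The `atom_hash` helper is an assumption for the example; the only requirement is that everything passed in is a sha256 tree hash.

import hashlib


def atom_hash(blob: bytes) -> bytes:
    # sha256 tree hash of an atom, matching `treehash` above
    return hashlib.sha256(b"\1" + blob).digest()


cache = ReadCacheLookup()

foo_hash = atom_hash(b"foo")
cache.push(foo_hash)        # wrote the atom "foo"; note it on the read stack

bar_hash = atom_hash(b"bar")
cache.push(bar_hash)        # wrote the atom "bar"

cache.pop2_and_cons()       # the two top items become the pair ("foo" . "bar")

# "foo" shows up again; re-serializing it would cost 4 bytes
# (1 length-prefix byte plus 3 atom bytes), so ask for a cheaper back-reference
path = cache.find_path(foo_hash, serialized_length=4)
if path is not None:
    # emit the back-reference marker (0xfe) followed by this path;
    # for this tiny stack the path works out to b"\x04"
    print("backref path:", path.hex())
else:
    print("no shorter back-reference; re-serialize the atom")

`find_paths` collects every candidate path found at the shallowest level, and `find_path` takes the `min` of that set, which is the "choose lexicographically smallest path" behavior mentioned in the commit list above.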