nod-ai
diff --git a/‎sharktank/sharktank/evaluate/perplexity_iree.py
+58-52 b/‎sharktank/sharktank/evaluate/perplexity_iree.py
+58-52
diff --git a/‎sharktank/sharktank/utils/iree.py
+42-1 b/‎sharktank/sharktank/utils/iree.py
+42-1
diff --git a/‎sharktank/tests/layers/sharded_conv2d_with_iree_test.py
+43-36 b/‎sharktank/tests/layers/sharded_conv2d_with_iree_test.py
+43-36
diff --git a/‎sharktank/tests/models/clip/clip_test.py
+30-23 b/‎sharktank/tests/models/clip/clip_test.py
+30-23
@@ -19,6 +19,7 @@
 
 import torch
 from torch.nn import CrossEntropyLoss
+import iree.runtime
 
 from sharktank.models.llama.llama import *
 from sharktank.models.mixtral.mixtral import *
@@ -34,7 +35,7 @@
 from sharktank.utils.load_llm import *
 from sharktank.utils.create_cache import *
 from sharktank.utils.export_artifacts import *
-from sharktank.utils.iree import iree_to_torch
+from sharktank.utils.iree import iree_to_torch, with_iree_device_context
 
 log_levels = {
     "info": logging.INFO,
@@ -285,76 +286,81 @@ def decode_vmfb(self, token_batch, i):
 
     @timeit
     def get_logits(self, page_cache_size):
+        def run_iree_module(iree_devices: list[iree.runtime.HalDevice]):
+            is_first_token = True
+            start = 0
+            for i in tqdm(
+                range(start, self.max_prompt_length - 1),
+                mininterval=300,
+                desc="eval: Calculating logits",
+            ):
+                logger.debug(f"Iteration: {i}")
 
-        is_first_token = True
-        start = 0
-        for i in tqdm(
-            range(start, self.max_prompt_length - 1),
-            mininterval=300,
-            desc="eval: Calculating logits",
-        ):
-            logger.debug(f"Iteration: {i}")
+                if is_first_token:
 
-            if is_first_token:
+                    token_batch = self.token_ids[:, : i + 1]
 
-                token_batch = self.token_ids[:, : i + 1]
+                    logger.debug(f"Prefill:")
 
-                logger.debug(f"Prefill:")
+                    logger.debug("Input:")
+                    logger.debug(f"{self.generator.tokenizer.decode(token_batch)}")
 
-                logger.debug("Input:")
-                logger.debug(f"{self.generator.tokenizer.decode(token_batch)}")
+                    token_batch, seq_lens_batch = self.generator.tokenizer.pad_tokens(
+                        token_ids=token_batch.tolist(),
+                        pad_to_multiple_of=self.generator.model.cache.pad_sequence_stride,
+                    )
 
-                token_batch, seq_lens_batch = self.generator.tokenizer.pad_tokens(
-                    token_ids=token_batch.tolist(),
-                    pad_to_multiple_of=self.generator.model.cache.pad_sequence_stride,
-                )
+                    logger.debug(f"{token_batch}")
 
-                logger.debug(f"{token_batch}")
+                    token_batch = torch.tensor(token_batch, device=self.torch_device)
+                    self.seq_lens_batch = torch.tensor(
+                        seq_lens_batch, device=self.torch_device
+                    )
 
-                token_batch = torch.tensor(token_batch, device=self.torch_device)
-                self.seq_lens_batch = torch.tensor(
-                    seq_lens_batch, device=self.torch_device
-                )
+                    self.batch = self.generator.begin_eval_batch(
+                        token_batch=token_batch,
+                        seq_lens_batch=self.seq_lens_batch,
+                        bs=self.bs,
+                        page_cache_size=page_cache_size,
+                    )
 
-                self.batch = self.generator.begin_eval_batch(
-                    token_batch=token_batch,
-                    seq_lens_batch=self.seq_lens_batch,
-                    bs=self.bs,
-                    page_cache_size=page_cache_size,
-                )
+                    if self.kv_cache_dtype in self.halelementtype_map.keys():
 
-                if self.kv_cache_dtype in self.halelementtype_map.keys():
+                        cache_state = self.batch.cache_state[0]
 
-                    cache_state = self.batch.cache_state[0]
+                        cache_as_int16 = cache_state.to(dtype=torch.int16)
 
-                    cache_as_int16 = cache_state.to(dtype=torch.int16)
+                        device_array_as_int16 = ireert.asdevicearray(
+                            self.haldevice,
+                            unbox_tensor(cache_as_int16).to("cpu").numpy(),
+                        )
 
-                    device_array_as_int16 = ireert.asdevicearray(
-                        self.haldevice, unbox_tensor(cache_as_int16).to("cpu").numpy()
-                    )
+                        buffer_view = ireert.HalBufferView(
+                            buffer=device_array_as_int16._buffer_view.get_buffer(),
+                            shape=device_array_as_int16._buffer_view.shape,
+                            element_type=self.halelementtype_map[self.kv_cache_dtype],
+                        )
+                        self.cache_state = ireert.DeviceArray(
+                            self.haldevice, buffer_view
+                        )
 
-                    buffer_view = ireert.HalBufferView(
-                        buffer=device_array_as_int16._buffer_view.get_buffer(),
-                        shape=device_array_as_int16._buffer_view.shape,
-                        element_type=self.halelementtype_map[self.kv_cache_dtype],
-                    )
-                    self.cache_state = ireert.DeviceArray(self.haldevice, buffer_view)
+                    else:
+                        self.cache_state = ireert.asdevicearray(
+                            self.haldevice, self.batch.cache_state[0].to("cpu").numpy()
+                        )
 
-                else:
-                    self.cache_state = ireert.asdevicearray(
-                        self.haldevice, self.batch.cache_state[0].to("cpu").numpy()
-                    )
+                    prefill_logits = self.prefill_vmfb(token_batch, i).clone()
+                    self.out_logits = prefill_logits[:, -1:, :]
 
-                prefill_logits = self.prefill_vmfb(token_batch, i)
-                self.out_logits = prefill_logits[:, -1:, :]
+                    is_first_token = False
 
-                is_first_token = False
+                else:
+                    token_batch = self.token_ids[:, i : i + 1]
 
-            else:
-                token_batch = self.token_ids[:, i : i + 1]
+                    decode_logits = self.decode_vmfb(token_batch, i)
+                    self.out_logits = torch.cat((self.out_logits, decode_logits), 1)
 
-                decode_logits = self.decode_vmfb(token_batch, i)
-                self.out_logits = torch.cat((self.out_logits, decode_logits), 1)
+        with_iree_device_context(run_iree_module, [self.runner.config.device])
 
         pad_logits_shape = self.token_ids.shape[1] - self.out_logits.shape[1]
 
 
@@ -5,13 +5,15 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 import iree.runtime
-from typing import List, Tuple, Optional, Union
+from typing import Any, Callable, Generator, List, Tuple, Optional, Union
 from pathlib import Path
 import torch
 import os
 import numpy as np
 import collections.abc
 from collections import OrderedDict
+from contextlib import contextmanager
+import gc
 from ..types.tensors import (
     AnyTensor,
     InferenceTensor,
@@ -23,6 +25,45 @@
 from .tree import Tree
 
 
+def with_iree_device_context(
+    fn: Callable[[list[iree.runtime.HalDevice]], Any],
+    devices: list[iree.runtime.HalDevice],
+):
+    """Call `fn(devices)`, run a garbage collection pass, and return fn's result.
+
+    The collection pass is meant to destroy the IREE resources that `fn` created
+    locally before the caller can release the devices.
+    This construct is required as iree.runtime.HalBuffer, iree.runtime.HalBufferView
+    and iree.runtime.MappedMemory do not hold a reference to their respective
+    HalDevice, but they must be destroyed before the device is destroyed.
+    They are thin wrappers of the underlying native objects and they do not hold
+    references to their parent devices to avoid circular references.
+    To ensure a correct destruction order, `fn` should not return or otherwise leak
+    arrays backed by IREE native buffers to the external context.
+    If it does, the caller is responsible for the destruction order.
+
+    An example usage that may cause a problem is
+    ```
+    def f():
+        dev: iree.runtime.HalDevice = ...
+        dev_arr: iree.runtime.DeviceArray = ...
+
+        # This creates a numpy array that is backed by iree.runtime.MappedMemory.
+        arr = dev_arr.to_host()
+
+        del dev_arr
+
+        t = torch.tensor(arr)
+    ```
+    Although the dev variable will be deleted after all other variables, in practice
+    with the various object wrappings with numpy and torch, the underlying HalBuffer
+    may get destroyed after the device.
+    """
+    res = fn(devices)
+    gc.collect()
+    return res
+
+
 def get_iree_devices(
     *, driver: str | None = None, device_count: int = 1
 ) -> List[iree.runtime.HalDevice]:
 
@@ -24,6 +24,7 @@
     unbox_tensor,
 )
 from sharktank.types.sharding import Conv2DSplitOutputChannelSharding
+from sharktank.utils.iree import with_iree_device_context
 import iree.runtime
 from typing import List, Optional
 import os
@@ -63,48 +64,54 @@ def run_iree_module(
     devices = [
         hal_driver.create_device(available_devices[0]) for _ in range(shard_count)
     ]
-    hal_module = iree.runtime.create_hal_module(instance=vm_instance, devices=devices)
-    params_path = Path(parameters_path)
-    # TODO: make IREE able to load the parameters from the top parameter file
-    # without having to specify the parameter file for each shard separately.
-    parameter_index = iree.runtime.ParameterIndex()
-    for i in range(shard_count):
-        parameter_index.load(
-            file_path=str(
-                Path(params_path).with_suffix(f".rank{i}{params_path.suffix}")
+
+    def run_iree_module(devices: list[iree.runtime.HalDevice]):
+        hal_module = iree.runtime.create_hal_module(
+            instance=vm_instance, devices=devices
+        )
+        params_path = Path(parameters_path)
+        # TODO: make IREE able to load the parameters from the top parameter file
+        # without having to specify the parameter file for each shard separately.
+        parameter_index = iree.runtime.ParameterIndex()
+        for i in range(shard_count):
+            parameter_index.load(
+                file_path=str(
+                    Path(params_path).with_suffix(f".rank{i}{params_path.suffix}")
+                )
             )
+        parameter_provider = parameter_index.create_provider(scope="model")
+        parameters_module = iree.runtime.create_io_parameters_module(
+            vm_instance, parameter_provider
         )
-    parameter_provider = parameter_index.create_provider(scope="model")
-    parameters_module = iree.runtime.create_io_parameters_module(
-        vm_instance, parameter_provider
-    )
 
-    vm_module = iree.runtime.VmModule.mmap(vm_instance, str(module_path))
+        vm_module = iree.runtime.VmModule.mmap(vm_instance, str(module_path))
 
-    # The context needs to be destroyed after the buffers, although
-    # it is not associate with them on the API level.
-    global vm_context
-    vm_context = iree.runtime.VmContext(
-        instance=vm_instance, modules=(hal_module, parameters_module, vm_module)
-    )
-    module_input_args = [
-        iree.runtime.asdevicearray(
-            devices[i], sharded_input_image.shards[i].as_torch().to("cpu").numpy()
+        # The context needs to be destroyed after the buffers, although
+        # it is not associated with them on the API level.
+        global vm_context
+        vm_context = iree.runtime.VmContext(
+            instance=vm_instance, modules=(hal_module, parameters_module, vm_module)
         )
-        for i in range(shard_count)
-    ]
+        module_input_args = [
+            iree.runtime.asdevicearray(
+                devices[i], sharded_input_image.shards[i].as_torch().to("cpu").numpy()
+            )
+            for i in range(shard_count)
+        ]
+
+        vm_function = vm_module.lookup_function("main")
+        invoker = iree.runtime.FunctionInvoker(
+            vm_context=vm_context,
+            # TODO: rework iree.runtime.FunctionInvoker interface for multiple devices.
+            # This works, but does not look right.
+            device=devices[0],
+            vm_function=vm_function,
+        )
+        results = invoker(*module_input_args)
+        shards = [torch.tensor(tensor.to_host()).clone() for tensor in results]
+        return SplitPrimitiveTensor(ts=shards, shard_dim=1)
 
-    vm_function = vm_module.lookup_function("main")
-    invoker = iree.runtime.FunctionInvoker(
-        vm_context=vm_context,
-        # TODO: rework iree.runtime.FunctionInvoker interface for multiple devices.
-        # This works, but does not look right.
-        device=devices[0],
-        vm_function=vm_function,
-    )
-    results = invoker(*module_input_args)
-    shards = [torch.tensor(tensor.to_host()) for tensor in results]
-    return SplitPrimitiveTensor(ts=shards, shard_dim=1)
+    return with_iree_device_context(run_iree_module, devices)
 
 
 def run_test_sharded_conv2d_with_iree(
 
@@ -7,6 +7,7 @@
 from collections import OrderedDict
 import functools
 import iree.compiler
+import iree.runtime
 import os
 from pathlib import Path
 from parameterized import parameterized
@@ -26,6 +27,7 @@
 )
 
 from sharktank.utils.iree import (
+    with_iree_device_context,
     get_iree_devices,
     load_iree_module,
     run_iree_module_function,
@@ -193,30 +195,35 @@ def runTestCompareIreeAgainstTorchEagerWithInputTokens(
         expected_outputs = flatten_for_iree_signature(reference_result_dict)
 
         iree_devices = get_iree_devices(driver="hip", device_count=1)
-        logger.info("Loading IREE module...")
-        iree_module, iree_vm_context, iree_vm_instance = load_iree_module(
-            module_path=iree_module_path,
-            devices=iree_devices,
-            parameters_path=parameters_path,
-        )
-        iree_args = prepare_iree_module_function_args(
-            args=flatten_for_iree_signature(input_args), devices=iree_devices
-        )
-        logger.info("Invoking IREE function...")
-        iree_result = iree_to_torch(
-            *run_iree_module_function(
-                module=iree_module,
-                vm_context=iree_vm_context,
-                args=iree_args,
-                device=iree_devices[0],
-                function_name=f"forward_bs{batch_size}",
-                trace_path_prefix=f"{target_model_path_prefix}_iree_",
+
+        def run_iree_module(iree_devices: list[iree.runtime.HalDevice]):
+            logger.info("Loading IREE module...")
+            iree_module, iree_vm_context, iree_vm_instance = load_iree_module(
+                module_path=iree_module_path,
+                devices=iree_devices,
+                parameters_path=parameters_path,
             )
-        )
-        actual_outputs = [
-            ops.to(iree_result[i], dtype=expected_outputs[i].dtype)
-            for i in range(len(expected_outputs))
-        ]
+            iree_args = prepare_iree_module_function_args(
+                args=flatten_for_iree_signature(input_args), devices=iree_devices
+            )
+            logger.info("Invoking IREE function...")
+            iree_result = iree_to_torch(
+                *run_iree_module_function(
+                    module=iree_module,
+                    vm_context=iree_vm_context,
+                    args=iree_args,
+                    device=iree_devices[0],
+                    function_name=f"forward_bs{batch_size}",
+                    trace_path_prefix=f"{target_model_path_prefix}_iree_",
+                )
+            )
+            actual_outputs = [
+                ops.to(iree_result[i], dtype=expected_outputs[i].dtype)
+                for i in range(len(expected_outputs))
+            ]
+            return [t.clone() for t in actual_outputs]
+
+        actual_outputs = with_iree_device_context(run_iree_module, iree_devices)
 
         actual_last_hidden_state = actual_outputs[0]
         expected_last_hidden_state = expected_outputs[0]