Skip to content

Memory leak with GL and Cuda rasterizer contexts #226

Description

@kirilllzaitsev

I am trying to debug the problem of ever-growing RAM after initializing a rasterizer context. Although no renderer functionality is used, the memory steadily grows in both single-GPU and nn.DataParallel environments:

  • single GPU

Image

  • multiple GPUs

Image

Following a proposed solution to a similar issue at #23, the rendering code is executed by explicitly created threads:

            # No rasterizer context is stored on this object; each Dispatcher
            # worker thread creates its own context instead.
            self.glctx = None
            # One worker thread is requested for every CUDA device visible to torch.
            gpu_ids = list(range(torch.cuda.device_count()))
            self.rasterizer = Rasterizer(Dispatcher(gpu_ids=gpu_ids), device=self.device)
            # NOTE(review): the flag is named `use_ddp`, but the wrapper applied is
            # torch.nn.DataParallel (not DistributedDataParallel) -- confirm intent.
            if self.use_ddp:
                self.rasterizer = torch.nn.DataParallel(self.rasterizer, gpu_ids)

and


class Dispatcher:
    """Run callables on per-GPU worker threads that each own a rasterizer context.

    One daemon thread is started per entry of ``gpu_ids``.  Each thread creates
    its own ``dr.RasterizeCudaContext`` and then serves requests submitted via
    ``dispatcher(device, func)``: ``func(ctx)`` is executed on the worker that
    belongs to ``device`` and its result is handed back to the (blocked) caller.

    Only one request per device may be in flight at a time.

    Based on https://github.com/NVlabs/nvdiffrast/issues/23
    """

    def __init__(self, gpu_ids):
        """Start one worker thread for every GPU index in ``gpu_ids``."""
        self.threads = {}
        self.events = {}  # device -> Event: "a request is pending"
        self.funcs = {}  # device -> pending callable
        self.return_events = {}  # device -> Event: "the request has finished"
        self.return_values = {}  # device -> result of the finished request
        self.errors = {}  # device -> exception raised by the finished request

        for gpu_id in gpu_ids:
            device = torch.device(gpu_id)
            self.events[device] = threading.Event()
            self.return_events[device] = threading.Event()
            self.threads[device] = threading.Thread(
                target=Dispatcher.worker,
                args=(
                    self,
                    device,
                ),
                daemon=True,
            )
            self.threads[device].start()

    @staticmethod
    def worker(self, device):
        """Serve requests for ``device`` forever (thread entry point).

        Declared as a static method that receives the dispatcher explicitly,
        because it is passed to ``threading.Thread`` as ``Dispatcher.worker``.
        """
        # Has to be set to prevent device mismatches.
        torch.cuda.set_device(device)
        # The reported memory growth is the same for the GL and the Cuda context.
        ctx = dr.RasterizeCudaContext(device=device)
        while True:
            self.events[device].wait()
            try:
                assert device not in self.return_values
                # No local variable holds the callable or its result, so this
                # thread keeps no reference to them while it idles in wait().
                self.return_values[device] = self.funcs[device](ctx)
            except BaseException as exc:
                # Forward the failure to the caller.  Letting it escape would
                # end this thread and leave the caller blocked forever.
                self.errors[device] = exc
            finally:
                self.funcs.pop(device, None)
                self.events[device].clear()
                self.return_events[device].set()

    def __call__(self, device, func):
        """Run ``func(ctx)`` on the worker of ``device`` and return its result.

        Blocks until the worker has finished.

        Raises:
            KeyError: if no worker thread was started for ``device``.
            BaseException: whatever ``func`` raised on the worker thread is
                re-raised here.
        """
        # Validate before touching any state so that a bad device does not
        # leave a stale callable (and the tensors it captures) in self.funcs.
        if device not in self.events:
            raise KeyError(f"no worker thread for device {device!r}")
        assert device not in self.funcs
        self.funcs[device] = func
        self.events[device].set()
        self.return_events[device].wait()
        self.return_events[device].clear()
        error = self.errors.pop(device, None)
        if error is not None:
            raise error
        return self.return_values.pop(device)


class Rasterizer(torch.nn.Module):
    """Module wrapper that hands the nvdiffrast rendering work to a dispatcher.

    The actual rendering is packed into a closure and passed to
    ``dispatcher(device, func)``; the dispatcher supplies the rasterizer
    context and returns whatever the closure returns.
    """

    def __init__(self, dispatcher, device):
        """Store the dispatcher callable and the configured device."""
        super().__init__()
        self.device = device
        self.dispatcher = dispatcher

    def forward(self, pos, tri, resolution, mesh_tensors, pts_cam):
        """Rasterize the mesh and return depth, color and auxiliary maps.

        The work runs through ``self.dispatcher`` keyed by ``pos.device``.
        ``mesh_tensors`` must provide ``"tex"``, ``"uv"`` and ``"uv_idx"`` for
        textured meshes, or ``"vertex_color"`` otherwise.

        Returns:
            The dispatcher's return value for the render closure, which itself
            yields a dict with keys ``"depth"``, ``"color"``, ``"rast_out"``
            and ``"xyz_map"``.
        """

        def render(ctx):
            textured = "tex" in mesh_tensors
            rast_out, _rast_db = dr.rasterize(
                ctx, pos=pos, tri=tri, resolution=resolution
            )
            xyz_map, _xyz_db = dr.interpolate(pts_cam, rast_out, tri)
            # Component 2 of the interpolated points (presumably camera-space z).
            depth = xyz_map[..., 2]
            target = pos.device

            if not textured:
                vertex_color = mesh_tensors["vertex_color"].to(target)
                color, _color_db = dr.interpolate(vertex_color, rast_out, tri)
            else:
                uv = mesh_tensors["uv"].to(target)
                uv_idx = mesh_tensors["uv_idx"].to(target)
                texc, _texc_db = dr.interpolate(uv, rast_out, uv_idx)
                texture = mesh_tensors["tex"].to(target)
                color = dr.texture(texture, texc, filter_mode="linear")

            outputs = {}
            outputs["depth"] = depth
            outputs["color"] = color
            outputs["rast_out"] = rast_out
            outputs["xyz_map"] = xyz_map
            return outputs

        return self.dispatcher(pos.device, render)

At the same time, trying to get more logs with dr.set_log_level(0) in the single-GPU setup results in a deadlock.

The ultimate goal is to render in a distributed training setup, not necessarily with differentiation. Could you give some guidance on which things to check?

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Fields

    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions