I am trying to debug the problem of ever-growing RAM after initializing a rasterizer context. Although no renderer functionality is used, the memory steadily grows in both single-GPU and nn.DataParallel environments:


Following a proposed solution to a similar issue at #23, the rendering code is executed by explicitly created threads:
self.glctx = None
gpu_ids = list(range(torch.cuda.device_count()))
self.rasterizer = Rasterizer(Dispatcher(gpu_ids=gpu_ids), device=self.device)
if self.use_ddp:
self.rasterizer = torch.nn.DataParallel(self.rasterizer, gpu_ids)
and
class Dispatcher:
    """Run rendering callables on one dedicated worker thread per GPU.

    For every id in ``gpu_ids`` a daemon thread is started.  The thread
    selects its CUDA device, creates one ``dr.RasterizeCudaContext`` and then
    serves requests submitted through :meth:`__call__`, handing the context
    to each submitted callable.

    Hand-off between caller and worker uses two ``threading.Event`` objects
    per device (``events`` = "work available", ``return_events`` = "result
    ready") plus the ``funcs`` / ``return_values`` dicts keyed by device.

    Failures are forwarded to the caller: an exception raised by the
    submitted callable (or by the worker's own setup) is re-raised from
    :meth:`__call__` instead of killing the worker thread and leaving the
    caller blocked forever on ``return_events``.
    """

    # https://github.com/NVlabs/nvdiffrast/issues/23
    def __init__(self, gpu_ids):
        """Start one worker thread per entry of ``gpu_ids``.

        Args:
            gpu_ids: iterable of values accepted by ``torch.device(...)``.
        """
        self.threads = {}
        self.events = {}
        self.funcs = {}
        self.return_events = {}
        # Maps device -> (succeeded, result_or_exception) while a result is
        # waiting to be picked up by ``__call__``.
        self.return_values = {}
        for gpu_id in gpu_ids:
            device = torch.device(gpu_id)
            self.events[device] = threading.Event()
            self.return_events[device] = threading.Event()
            self.threads[device] = threading.Thread(
                target=Dispatcher.worker,
                args=(
                    self,
                    device,
                ),
                daemon=True,
            )
            self.threads[device].start()

    # Kept as a staticmethod taking the dispatcher explicitly so existing
    # ``Dispatcher.worker(dispatcher, device)`` call sites keep working.
    @staticmethod
    def worker(self, device):
        """Thread body: set up the device context, then serve requests forever.

        Args:
            self: the ``Dispatcher`` instance whose queues are served.
            device: key into the per-device dicts; also the CUDA device the
                context is created on.
        """
        try:
            # Has to be set to prevent device mismatches.
            torch.cuda.set_device(device)
            # The issue is reported for both GL and CUDA contexts.
            ctx = dr.RasterizeCudaContext(device=device)
        except Exception as exc:
            # Keep the thread alive so callers get an error instead of a hang.
            ctx = None
            setup_error = exc
        else:
            setup_error = None
        self._serve_forever(device, ctx, setup_error)

    def _serve_forever(self, device, ctx, setup_error=None):
        """Execute submitted callables for ``device`` one at a time, forever.

        Args:
            device: key into the per-device dicts.
            ctx: object passed as the single argument to every callable.
            setup_error: if not ``None``, every request fails with a
                ``RuntimeError`` chained to this exception.
        """
        while True:
            self.events[device].wait()
            try:
                assert device not in self.return_values
                if setup_error is not None:
                    raise RuntimeError(
                        f"rasterizer worker setup failed on {device}"
                    ) from setup_error
                outcome = (True, self.funcs[device](ctx))
            except BaseException as exc:
                # Forward *everything*: an uncaught exception here would end
                # the thread and dead-lock the waiting caller.
                outcome = (False, exc)
            self.return_values[device] = outcome
            # Do not keep the result (or a traceback) referenced while idle.
            del outcome
            del self.funcs[device]
            self.events[device].clear()
            self.return_events[device].set()

    def __call__(self, device, func):
        """Run ``func(ctx)`` on the worker thread of ``device`` and wait.

        Args:
            device: key of the target worker (as created in ``__init__``).
            func: callable taking the worker's context as its only argument.

        Returns:
            Whatever ``func`` returned.

        Raises:
            KeyError: if no worker exists for ``device``.
            BaseException: whatever ``func`` raised inside the worker.
            RuntimeError: if the worker failed to set up its context.
        """
        assert device not in self.funcs
        self.funcs[device] = func
        self.events[device].set()
        self.return_events[device].wait()
        succeeded, payload = self.return_values.pop(device)
        self.return_events[device].clear()
        if succeeded:
            return payload
        try:
            raise payload
        finally:
            # Break the exception <-> frame reference cycle.
            del payload
class Rasterizer(torch.nn.Module):
    """Module that renders a mesh by delegating the work to a dispatcher.

    The actual nvdiffrast calls are wrapped in a closure and handed to
    ``dispatcher(device, closure)``; the dispatcher is expected to invoke the
    closure with a rasterization context and return its result.
    """

    def __init__(self, dispatcher, device):
        """Store the dispatcher callable and the configured device."""
        super().__init__()
        self.device = device
        self.dispatcher = dispatcher

    def forward(self, pos, tri, resolution, mesh_tensors, pts_cam):
        """Rasterize and shade, returning whatever the dispatcher returns.

        Args:
            pos: passed to ``dr.rasterize`` as ``pos``; its ``.device`` selects
                the worker and the device auxiliary tensors are moved to.
            tri: passed to ``dr.rasterize`` / ``dr.interpolate``.
            resolution: passed to ``dr.rasterize`` as ``resolution``.
            mesh_tensors: mapping; uses ``"tex"``, ``"uv"``, ``"uv_idx"`` when
                ``"tex"`` is present, otherwise ``"vertex_color"``.
            pts_cam: attribute tensor interpolated into ``xyz_map``.

        Returns:
            The dispatcher's return value; the dispatched closure itself
            produces a dict with keys ``"depth"``, ``"color"``, ``"rast_out"``
            and ``"xyz_map"``.
        """
        target = pos.device

        def render(ctx):
            textured = "tex" in mesh_tensors
            rast_out, _ = dr.rasterize(ctx, pos=pos, tri=tri, resolution=resolution)
            xyz_map, _ = dr.interpolate(pts_cam, rast_out, tri)

            if not textured:
                # No texture: shade from per-vertex colors.
                vertex_color = mesh_tensors["vertex_color"].to(target)
                color, _ = dr.interpolate(vertex_color, rast_out, tri)
            else:
                # Textured: interpolate UVs, then sample the texture.
                texc, _ = dr.interpolate(
                    mesh_tensors["uv"].to(target),
                    rast_out,
                    mesh_tensors["uv_idx"].to(target),
                )
                color = dr.texture(
                    mesh_tensors["tex"].to(target), texc, filter_mode="linear"
                )

            outputs = {}
            # Depth is component 2 along the last axis of the interpolated map.
            outputs["depth"] = xyz_map[..., 2]
            outputs["color"] = color
            outputs["rast_out"] = rast_out
            outputs["xyz_map"] = xyz_map
            return outputs

        return self.dispatcher(target, render)
At the same time, trying to get more logs with dr.set_log_level(0) in the single-GPU setup results in a deadlock.
The ultimate goal is to render in a distributed training setup, not necessarily with differentiation. Could you give some guidance on which things to check?
I am trying to debug the problem of ever-growing RAM after initializing a rasterizer context. Although no renderer functionality is used, the memory steadily grows in both single-GPU and nn.DataParallel environments:
Following a proposed solution to a similar issue at #23, the rendering code is executed by explicitly created threads:
and
At the same time, trying to get more logs with `dr.set_log_level(0)` in the single-GPU setup results in a deadlock.
The ultimate goal is to render in a distributed training setup, not necessarily with differentiation. Could you give some guidance on which things to check?