From e98e25b7a7b5f47182ca5431e43f183621953c12 Mon Sep 17 00:00:00 2001 From: Amadeusz Szymko Date: Mon, 6 Oct 2025 19:47:30 +0900 Subject: [PATCH 1/6] feat(docker): blackwell GPU compatibility Signed-off-by: Amadeusz Szymko --- Dockerfile | 14 +++++++------- projects/BEVFusion/setup.py | 1 + projects/TransFusion/setup.py | 2 ++ 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index a9fe84f5..66ebc081 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,11 +1,11 @@ -ARG PYTORCH="2.2.2" -ARG CUDA="12.1" -ARG CUDNN="8" +ARG PYTORCH="2.8.0" +ARG CUDA="12.9" +ARG CUDNN="9" FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel -ARG MMCV="2.1.0" -ARG MMENGINE="0.10.3" -ARG MMDET="3.2.0" +ARG MMCV="2.2.0" +ARG MMENGINE="0.10.7" +ARG MMDET="3.3.0" ARG MMDEPLOY="1.3.1" ARG MMDET3D="1.4.0" ARG MMPRETRAIN="1.2.0" @@ -13,7 +13,7 @@ ARG MMSEGMENTATION="1.2.2" ENV CUDA_HOME="/usr/local/cuda" \ FORCE_CUDA="1" \ - TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6 8.7 8.9+PTX" \ + TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6 8.7 8.9 9.0 12.0+PTX" \ TORCH_NVCC_FLAGS="-Xfatbin -compress-all" # Install apt dependencies for base library diff --git a/projects/BEVFusion/setup.py b/projects/BEVFusion/setup.py index 41d24eb0..837d1f53 100644 --- a/projects/BEVFusion/setup.py +++ b/projects/BEVFusion/setup.py @@ -22,6 +22,7 @@ def make_cuda_ext(name, module, sources, sources_cuda=[], extra_args=[], extra_i "-gencode=arch=compute_80,code=sm_80", "-gencode=arch=compute_86,code=sm_86", "-gencode=arch=compute_90,code=sm_90", + "-gencode=arch=compute_120,code=sm_120", ] sources += sources_cuda else: diff --git a/projects/TransFusion/setup.py b/projects/TransFusion/setup.py index 3c075c0c..d37a2a10 100644 --- a/projects/TransFusion/setup.py +++ b/projects/TransFusion/setup.py @@ -28,6 +28,8 @@ def make_cuda_ext( "-gencode=arch=compute_75,code=sm_75", "-gencode=arch=compute_80,code=sm_80", "-gencode=arch=compute_86,code=sm_86", + "-gencode=arch=compute_90,code=sm_90", + "-gencode=arch=compute_120,code=sm_120", ] sources += sources_cuda else: From 5b10dc538b86d0617b29f7f2075a76b4d6e88d2f Mon Sep 17 00:00:00 2001 From: Amadeusz Szymko Date: Tue, 7 Oct 2025 16:40:18 +0900 Subject: [PATCH 2/6] feat(docker): downgrade mmcv Signed-off-by: Amadeusz Szymko --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 66ebc081..a775bf4a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ ARG CUDA="12.9" ARG CUDNN="9" FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel -ARG MMCV="2.2.0" +ARG MMCV="2.1.0" ARG MMENGINE="0.10.7" ARG MMDET="3.3.0" ARG MMDEPLOY="1.3.1" From 6380eb460064d9b29e20424a937bb0627eb09f8c Mon Sep 17 00:00:00 2001 From: Amadeusz Szymko Date: Wed, 8 Oct 2025 14:06:23 +0900 Subject: [PATCH 3/6] fix(awml): checkpoint load Signed-off-by: Amadeusz Szymko --- autoware_ml/detection3d/runners/base_runner.py | 2 +- projects/BEVFusion/scripts/lean/funcs.py | 2 +- projects/BEVFusion/scripts/make_model.py | 2 +- projects/FRNet/deploy/torch_model.py | 2 +- tools/detection2d/deploy_yolox.py | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/autoware_ml/detection3d/runners/base_runner.py b/autoware_ml/detection3d/runners/base_runner.py index 58ab7fef..6aa9d3da 100644 --- a/autoware_ml/detection3d/runners/base_runner.py +++ b/autoware_ml/detection3d/runners/base_runner.py @@ -127,7 +127,7 @@ def load_verify_checkpoint(self, model: nn.Module): mean_before, variance_before = self._get_weight_statistics(model=model) # Load checkpoint - checkpoint_state_dict = torch.load(self._checkpoint_path, map_location=self._torch_device) + checkpoint_state_dict = torch.load(self._checkpoint_path, map_location=self._torch_device, weights_only=False) # Load model weights model.load_state_dict(checkpoint_state_dict["state_dict"]) diff --git a/projects/BEVFusion/scripts/lean/funcs.py b/projects/BEVFusion/scripts/lean/funcs.py index af326bd4..445aa4a0 100644 --- a/projects/BEVFusion/scripts/lean/funcs.py +++ b/projects/BEVFusion/scripts/lean/funcs.py @@ -104,7 +104,7 @@ def fuse_bn(conv, bn): def load_checkpoint(model, file, startsname=None): device = next(model.parameters()).device - ckpt = torch.load(file, map_location=device)["state_dict"] + ckpt = torch.load(file, map_location=device, weights_only=False)["state_dict"] new_ckpt = ckpt if startsname is not None: diff --git a/projects/BEVFusion/scripts/make_model.py b/projects/BEVFusion/scripts/make_model.py index d0908d90..92a85b94 100644 --- a/projects/BEVFusion/scripts/make_model.py +++ b/projects/BEVFusion/scripts/make_model.py @@ -521,7 +521,7 @@ def make_lidar_onnx(args, model, cfg, save_root, points, img): # make_ptq_model(args, model, cfg, dir_name) # Make onnx - data = torch.load(args.input_data) + data = torch.load(args.input_data, weights_only=False) img = data["img"].data[0].cuda() points = [i.cuda() for i in data["points"].data[0]] make_camera_onnx(args, model_, cfg, save_root, points, img) diff --git a/projects/FRNet/deploy/torch_model.py b/projects/FRNet/deploy/torch_model.py index 3da3aa4d..02e78a0e 100644 --- a/projects/FRNet/deploy/torch_model.py +++ b/projects/FRNet/deploy/torch_model.py @@ -23,7 +23,7 @@ def _build_model(self, model_cfg: dict, checkpoint_path: str) -> "FRNet": model_cfg["backbone"].update(deploy) model_cfg["decode_head"].update(deploy) model = MODELS.build(model_cfg) - model.load_state_dict(torch.load(checkpoint_path)["state_dict"]) + model.load_state_dict(torch.load(checkpoint_path, weights_only=False)["state_dict"]) model.eval() return model diff --git a/tools/detection2d/deploy_yolox.py b/tools/detection2d/deploy_yolox.py index 058ae994..7d702b3a 100644 --- a/tools/detection2d/deploy_yolox.py +++ b/tools/detection2d/deploy_yolox.py @@ -84,9 +84,9 @@ def get_class_num(mmdet_ckpt): if not os.path.isfile(official_ckpt_save_path): request.urlretrieve(url, official_ckpt_save_path) - official_ckpt = torch.load(official_ckpt_save_path) + official_ckpt = torch.load(official_ckpt_save_path, weights_only=False) - mmdet_ckpt = torch.load(autoware_ml_ckpt, map_location="cuda:0") + mmdet_ckpt = torch.load(autoware_ml_ckpt, map_location="cuda:0", weights_only=False) if "state_dict" in mmdet_ckpt.keys(): mmdet_ckpt = mmdet_ckpt["state_dict"] From 1307fcbc787a55afb811687b459d79885b8a2506 Mon Sep 17 00:00:00 2001 From: Amadeusz Szymko Date: Wed, 8 Oct 2025 15:31:03 +0900 Subject: [PATCH 4/6] fix(awml): patch torch.load for mmengine Signed-off-by: Amadeusz Szymko --- .patches/mmengine.patch | 38 ++++++++++++++++++++++++++++++++++++++ Dockerfile | 5 ++++- 2 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 .patches/mmengine.patch diff --git a/.patches/mmengine.patch b/.patches/mmengine.patch new file mode 100644 index 00000000..9f119bff --- /dev/null +++ b/.patches/mmengine.patch @@ -0,0 +1,38 @@ +--- /mmengine/runner/checkpoint.py ++++ /mmengine/runner/checkpoint.py.corrected +@@ -344,7 +344,7 @@ def load_from_local(filename, map_location): + filename = osp.expanduser(filename) + if not osp.isfile(filename): + raise FileNotFoundError(f'{filename} can not be found.') +- checkpoint = torch.load(filename, map_location=map_location) ++ checkpoint = torch.load(filename, map_location=map_location, weights_only=False) + return checkpoint + + +@@ -412,7 +412,7 @@ def load_from_pavi(filename, map_location=None): + with TemporaryDirectory() as tmp_dir: + downloaded_file = osp.join(tmp_dir, model.name) + model.download(downloaded_file) +- checkpoint = torch.load(downloaded_file, map_location=map_location) ++ checkpoint = torch.load(downloaded_file, map_location=map_location, weights_only=False) + return checkpoint + + +@@ -435,7 +435,7 @@ def load_from_ceph(filename, map_location=None, backend='petrel'): + file_backend = get_file_backend( + filename, backend_args={'backend': backend}) + with io.BytesIO(file_backend.get(filename)) as buffer: +- checkpoint = torch.load(buffer, map_location=map_location) ++ checkpoint = torch.load(buffer, map_location=map_location, weights_only=False) + return checkpoint + + +@@ -504,7 +504,7 @@ def load_from_openmmlab(filename, map_location=None): + filename = osp.join(_get_mmengine_home(), model_url) + if not osp.isfile(filename): + raise FileNotFoundError(f'{filename} can not be found.') +- checkpoint = torch.load(filename, map_location=map_location) ++ checkpoint = torch.load(filename, map_location=map_location, weights_only=False) + return checkpoint + + diff --git a/Dockerfile b/Dockerfile index a775bf4a..b717b9eb 100644 --- a/Dockerfile +++ b/Dockerfile @@ -69,10 +69,13 @@ RUN python3 -m pip --no-cache-dir install \ # NOTE(knzo25): this patch is needed to use numpy versions over 1.23.5 (version used in mmdet3d 1.4.0) # It can be safely deleted when mmdet3d updates the numpy version +# NOTE(amadeuszsz): patches for torch.load can be removed after mmlab's PyTorch 2.6+ support COPY .patches/mmdet3d.patch /tmp/mmdet3d.patch +COPY .patches/mmengine.patch /tmp/mmengine.patch RUN cd $(python -c "import site; print(site.getsitepackages()[0])") \ && git apply < /tmp/mmdet3d.patch \ - && rm -f /tmp/mmdet3d.patch \ + && git apply < /tmp/mmengine.patch \ + && rm -rf /tmp/* \ && cd / ENV WGPU_BACKEND=gl From 4ed4c399b6d91a175faeb1e5431f135ea22dd15e Mon Sep 17 00:00:00 2001 From: Amadeusz Szymko Date: Wed, 8 Oct 2025 15:39:08 +0900 Subject: [PATCH 5/6] chore(ci): ignore patches Signed-off-by: Amadeusz Szymko --- .pre-commit-config.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 42e1549c..31fc06e9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,6 +2,8 @@ ci: autofix_commit_msg: "ci(pre-commit): autofix" autoupdate_commit_msg: "ci(pre-commit): autoupdate" +exclude: ".patches/.*" + repos: - repo: https://github.com/pre-commit/pre-commit-hooks rev: v6.0.0 @@ -20,7 +22,7 @@ repos: rev: 25.9.0 hooks: - id: black - language_version: python3.10 + language_version: python3.11 - repo: https://github.com/pycqa/isort rev: 6.0.1 From aba8062267ddae3d0722656234d9c691630da5bb Mon Sep 17 00:00:00 2001 From: Amadeusz Szymko Date: Wed, 8 Oct 2025 15:55:57 +0900 Subject: [PATCH 6/6] fix(awml): patch torch.load for mmengine (2) Signed-off-by: Amadeusz Szymko --- .patches/mmengine.patch | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/.patches/mmengine.patch b/.patches/mmengine.patch index 9f119bff..096989a8 100644 --- a/.patches/mmengine.patch +++ b/.patches/mmengine.patch @@ -36,3 +36,24 @@ return checkpoint + +--- /mmengine/utils/dl_utils/hub.py ++++ /mmengine/utils/dl_utils/hub.py.corrected +@@ -48,7 +48,7 @@ if TORCH_VERSION != 'parrots' and digit_version(TORCH_VERSION) < digit_version( + f.extractall(model_dir) + extraced_name = members[0].filename + extracted_file = os.path.join(model_dir, extraced_name) +- return torch.load(extracted_file, map_location=map_location) ++ return torch.load(extracted_file, map_location=map_location, weights_only=False) + + def load_url(url, + model_dir=None, +@@ -114,7 +114,7 @@ if TORCH_VERSION != 'parrots' and digit_version(TORCH_VERSION) < digit_version( + return _legacy_zip_load(cached_file, model_dir, map_location) + + try: +- return torch.load(cached_file, map_location=map_location) ++ return torch.load(cached_file, map_location=map_location, weights_only=False) + except RuntimeError as error: + if digit_version(TORCH_VERSION) < digit_version('1.5.0'): + warnings.warn(