Skip to content

Commit

Permalink
Properly disable TBE SSD tests in OSS (pytorch#3548)
Browse files Browse the repository at this point in the history
Summary:
X-link: facebookresearch/FBGEMM#637

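- Properly disable TBE SSD tests in OSS: skip them with a new `running_in_oss` flag (exported via `test/tbe/common.py`) instead of `running_on_github`
- Extend ROCm test time limit from 20 to 40 minutes
- Fix `aarch64cuda` Nova runs (the Nova prescript now symlinks `libcuda.so` and `libnvidia-ml.so` into `/usr/local/lib`)
- Relax the low-precision tolerance in `backward_dense_test.py` from 5.0e-3 to 5.0e-2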

Pull Request resolved: pytorch#3548

Reviewed By: jianyuh

Differential Revision: D67879826

Pulled By: q10

fbshipit-source-id: 1ebf8a401b08876362a1d7f8bd29fbe3f8da2665
  • Loading branch information
q10 authored and facebook-github-bot committed Jan 7, 2025
1 parent 5c30f88 commit 61f3b15
Show file tree
Hide file tree
Showing 10 changed files with 35 additions and 44 deletions.
5 changes: 5 additions & 0 deletions .github/scripts/nova_prescript.bash
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,14 @@ if [[ $CU_VERSION = cu* ]]; then
CUDNN_INCLUDE_DIR="${CUDA_HOME}/include" \
CUDNN_LIBRARY="${CUDA_HOME}/lib64"

echo "[NOVA] -------- Finding libcuda.so -----------"
LIBCUDA_PATH=$(find /usr/local -type f -name libcuda.so)
print_exec ln "${LIBCUDA_PATH}" -s "/usr/local/lib/libcuda.so.1"

echo "[NOVA] -------- Finding NVML_LIB_PATH -----------"
if [[ ${NVML_LIB_PATH} == "" ]]; then
NVML_LIB_PATH=$(find "${CUDA_HOME}" -name libnvidia-ml.so) &&
ln "${NVML_LIB_PATH}" -s "/usr/local/lib/libnvidia-ml.so.1" &&
export NVML_LIB_PATH &&
echo "[NOVA] looking in ${CUDA_HOME}" ||
echo "[NOVA] libnvidia-ml.so not found in ${CUDA_HOME}";
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/fbgemm_gpu_ci_rocm.yml
Original file line number Diff line number Diff line change
Expand Up @@ -205,5 +205,5 @@ jobs:
run: . $PRELUDE; install_fbgemm_gpu_wheel $BUILD_ENV *.whl

- name: Test with PyTest
timeout-minutes: 20
timeout-minutes: 40
run: . $PRELUDE; test_all_fbgemm_gpu_modules $BUILD_ENV
8 changes: 7 additions & 1 deletion fbgemm_gpu/test/tbe/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,17 @@

if open_source:
# pyre-ignore[21]
from test_utils import gpu_unavailable, running_on_github, TEST_WITH_ROCM
from test_utils import (
gpu_unavailable,
running_in_oss,
running_on_github,
TEST_WITH_ROCM,
)
else:
torch.ops.load_library("//deeplearning/fbgemm/fbgemm_gpu:cumem_utils")
from fbgemm_gpu.test.test_utils import ( # noqa F401
gpu_unavailable,
running_in_oss,
running_on_github,
TEST_WITH_ROCM,
)
Expand Down
16 changes: 5 additions & 11 deletions fbgemm_gpu/test/tbe/ssd/kv_tensor_wrapper_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,29 +10,23 @@
import unittest
from unittest import TestCase

import fbgemm_gpu
import fbgemm_gpu # noqa E402
import torch
import torch.testing
from fbgemm_gpu.split_embedding_configs import SparseType
from fbgemm_gpu.utils.loader import load_torch_module
from hypothesis import given, settings, strategies as st, Verbosity

# pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`.
open_source: bool = getattr(fbgemm_gpu, "open_source", False)

if open_source:
from test_utils import running_on_github # @manual # pyre-ignore[21]
else:
from fbgemm_gpu.test.test_utils import ( # @manual=//deeplearning/fbgemm/fbgemm_gpu:test_utils
running_on_github,
)
from .. import common # noqa E402
from ..common import open_source, running_in_oss

if not open_source:
load_torch_module(
"//deeplearning/fbgemm/fbgemm_gpu:ssd_split_table_batched_embeddings",
)


@unittest.skipIf(*running_on_github)
@unittest.skipIf(*running_in_oss)
class KvTensorWrapperTest(TestCase):
# pyre-ignore[56]
@given(
Expand Down
4 changes: 2 additions & 2 deletions fbgemm_gpu/test/tbe/ssd/ssd_l2_cache_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from hypothesis import given, settings, Verbosity

from .. import common # noqa E402
from ..common import gpu_unavailable, running_on_github
from ..common import gpu_unavailable, running_in_oss

MAX_EXAMPLES = 20
default_st: Dict[str, Any] = {
Expand All @@ -42,7 +42,7 @@
}


@unittest.skipIf(*running_on_github)
@unittest.skipIf(*running_in_oss)
@unittest.skipIf(*gpu_unavailable)
class SSDCheckpointTest(unittest.TestCase):
def generate_fbgemm_ssd_tbe(
Expand Down
11 changes: 2 additions & 9 deletions fbgemm_gpu/test/tbe/ssd/ssd_split_tbe_inference_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,20 +33,13 @@
from hypothesis import given, settings, Verbosity

from .. import common # noqa E402
from ..common import open_source


if open_source:
# pyre-ignore[21]
from test_utils import gpu_unavailable, running_on_github
else:
from fbgemm_gpu.test.test_utils import gpu_unavailable, running_on_github
from ..common import gpu_unavailable, running_in_oss


MAX_EXAMPLES = 40


@unittest.skipIf(*running_on_github)
@unittest.skipIf(*running_in_oss)
@unittest.skipIf(*gpu_unavailable)
@unittest.skipIf(True, "Test is broken.")
class SSDIntNBitTableBatchedEmbeddingsTest(unittest.TestCase):
Expand Down
13 changes: 3 additions & 10 deletions fbgemm_gpu/test/tbe/ssd/ssd_split_tbe_training_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
# pyre-ignore-all-errors[3,6,56]

import unittest
from enum import Enum

from typing import Any, Dict, List, Optional, Tuple

Expand All @@ -24,15 +25,7 @@
from hypothesis import assume, given, settings, Verbosity

from .. import common # noqa E402
from ..common import gen_mixed_B_batch_sizes, open_source

if open_source:
# pyre-ignore[21]
from test_utils import gpu_unavailable, running_on_github
else:
from fbgemm_gpu.test.test_utils import gpu_unavailable, running_on_github

from enum import Enum
from ..common import gen_mixed_B_batch_sizes, gpu_unavailable, running_in_oss


MAX_EXAMPLES = 40
Expand Down Expand Up @@ -69,7 +62,7 @@ class FlushLocation(Enum):
ALL = 4


@unittest.skipIf(*running_on_github)
@unittest.skipIf(*running_in_oss)
@unittest.skipIf(*gpu_unavailable)
class SSDSplitTableBatchedEmbeddingsTest(unittest.TestCase):
def get_physical_table_arg_indices_(self, feature_table_map: List[int]):
Expand Down
10 changes: 2 additions & 8 deletions fbgemm_gpu/test/tbe/ssd/ssd_utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,13 @@
from hypothesis import given, settings, Verbosity

from .. import common # noqa E402
from ..common import open_source

if open_source:
# pyre-ignore[21]
from test_utils import gpu_unavailable, running_on_github
else:
from fbgemm_gpu.test.test_utils import gpu_unavailable, running_on_github
from ..common import gpu_unavailable, running_in_oss


MAX_EXAMPLES = 20


@unittest.skipIf(*running_on_github)
@unittest.skipIf(*running_in_oss)
@unittest.skipIf(*gpu_unavailable)
class SSDUtilsTest(unittest.TestCase):
def execute_masked_index_test(
Expand Down
4 changes: 2 additions & 2 deletions fbgemm_gpu/test/tbe/training/backward_dense_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ def test_backward_dense( # noqa C901
or output_dtype == SparseType.FP16
or output_dtype == SparseType.BF16
)
tol = 5.0e-3 if is_low_prec else 1.0e-5
tol = 5.0e-2 if is_low_prec else 1.0e-5
torch.testing.assert_close(
fc2.float(),
f.float(),
Expand All @@ -287,7 +287,7 @@ def test_backward_dense( # noqa C901
else:
goc = torch.cat(gos, dim=0)
fc2.backward(goc)
tol = 5.0e-3 if is_low_prec else 1.0e-4
tol = 5.0e-2 if is_low_prec else 1.0e-4
torch.testing.assert_close(
cc.weights.grad,
grad_weights,
Expand Down
6 changes: 6 additions & 0 deletions fbgemm_gpu/test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,12 @@
"Test is currently known to fail or hang when run in the GitHub runners",
)

# (condition, reason) pair intended to be unpacked into a skip decorator,
# e.g. `@unittest.skipIf(*running_in_oss)`.
# The condition is the value of `fbgemm_gpu.open_source`; it falls back to
# False when the module does not define that attribute.
running_in_oss: Tuple[bool, str] = (
# pyre-fixme[16]: Module `fbgemm_gpu` has no attribute `open_source`.
getattr(fbgemm_gpu, "open_source", False),
"Test is currently known to fail in OSS mode",
)

running_on_rocm: Tuple[bool, str] = (
TEST_WITH_ROCM,
"Test currently doesn't work on the ROCm stack",
Expand Down

0 comments on commit 61f3b15

Please sign in to comment.