diff --git a/.github/workflows/aws-batch-integration-tests.yaml b/.github/workflows/aws-batch-integration-tests.yaml index c86cb57d1..c3a8c0d05 100644 --- a/.github/workflows/aws-batch-integration-tests.yaml +++ b/.github/workflows/aws-batch-integration-tests.yaml @@ -16,7 +16,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v2 with: - python-version: 3.9 + python-version: "3.10" architecture: x64 - name: Checkout TorchX uses: actions/checkout@v2 diff --git a/.github/workflows/components-integration-tests.yaml b/.github/workflows/components-integration-tests.yaml index 3b2b1906a..debd1129f 100644 --- a/.github/workflows/components-integration-tests.yaml +++ b/.github/workflows/components-integration-tests.yaml @@ -35,7 +35,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v2 with: - python-version: 3.8 + python-version: "3.10" architecture: x64 - name: Checkout TorchX uses: actions/checkout@v2 diff --git a/.github/workflows/container.yaml b/.github/workflows/container.yaml index e22d0fd21..a51540f24 100644 --- a/.github/workflows/container.yaml +++ b/.github/workflows/container.yaml @@ -15,7 +15,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v2 with: - python-version: 3.8 + python-version: "3.10" architecture: x64 - name: Checkout TorchX uses: actions/checkout@v2 diff --git a/.github/workflows/gcp-batch-integration-tests.yaml b/.github/workflows/gcp-batch-integration-tests.yaml index fef83ce6d..2926e2c4b 100644 --- a/.github/workflows/gcp-batch-integration-tests.yaml +++ b/.github/workflows/gcp-batch-integration-tests.yaml @@ -17,7 +17,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v2 with: - python-version: 3.9 + python-version: "3.10" architecture: x64 - name: Checkout TorchX uses: actions/checkout@v2 diff --git a/.github/workflows/kfp-integration-tests.yaml b/.github/workflows/kfp-integration-tests.yaml index 633ccf880..3898d59e2 100644 --- a/.github/workflows/kfp-integration-tests.yaml +++ 
b/.github/workflows/kfp-integration-tests.yaml @@ -13,7 +13,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v2 with: - python-version: 3.8 + python-version: "3.10" architecture: x64 - name: Checkout TorchX uses: actions/checkout@v2 diff --git a/.github/workflows/nightly.yaml b/.github/workflows/nightly.yaml index e76b7d9a3..2c5aac8e4 100644 --- a/.github/workflows/nightly.yaml +++ b/.github/workflows/nightly.yaml @@ -12,7 +12,7 @@ jobs: - name: Setup Python 3.10 uses: actions/setup-python@v2 with: - python-version: '3.10' + python-version: "3.10" architecture: x64 - name: Checkout TorchX uses: actions/checkout@v2 @@ -22,7 +22,7 @@ jobs: pip install -e .[dev] pip install twine - name: Run tests - run: pytest --cov=./ --cov-report=xml + run: pytest --cov=./ --cov-report=xml - name: Push nightly env: PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }} diff --git a/.github/workflows/pyre.yaml b/.github/workflows/pyre.yaml index 583a2a96d..b5acbd597 100644 --- a/.github/workflows/pyre.yaml +++ b/.github/workflows/pyre.yaml @@ -21,7 +21,7 @@ jobs: run: | set -eux pip install -e .[dev] - + - name: Init Lint Runner lintrunner init - name: Run Pyre run: scripts/pyre.sh diff --git a/.github/workflows/python-unittests.yaml b/.github/workflows/python-unittests.yaml index 5d145715a..d817bc591 100644 --- a/.github/workflows/python-unittests.yaml +++ b/.github/workflows/python-unittests.yaml @@ -10,7 +10,7 @@ jobs: unittest: strategy: matrix: - python-version: [3.8, 3.9, "3.10", 3.11] + python-version: [3.9, "3.10", 3.11, 3.12] platform: ["linux.20_04.4x"] include: - python-version: 3.9 diff --git a/.github/workflows/slurm-local-integration-tests.yaml b/.github/workflows/slurm-local-integration-tests.yaml index e8ef0d821..6038a792a 100644 --- a/.github/workflows/slurm-local-integration-tests.yaml +++ b/.github/workflows/slurm-local-integration-tests.yaml @@ -19,7 +19,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v2 with: - python-version: 3.8 + python-version: "3.10" 
architecture: x64 - name: Checkout TorchX uses: actions/checkout@v2 diff --git a/dev-requirements.txt b/dev-requirements.txt index 7a9c131bc..adc00cab7 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -24,11 +24,11 @@ pytorch-lightning==2.3.1 tensorboard==2.14.0 sagemaker==2.224.4 torch-model-archiver>=0.4.2 -torch==2.2.1 -torchmetrics==0.10.3 +torch>=2.6.0 +torchmetrics==1.6.3 torchserve>=0.10.0 -torchtext==0.17.1 -torchvision==0.17.1 +torchtext==0.18.0 +torchvision==0.21.0 ts==0.5.1 ray[default] wheel diff --git a/torchx/distributed/__init__.py b/torchx/distributed/__init__.py index 64412e5b9..35324e137 100644 --- a/torchx/distributed/__init__.py +++ b/torchx/distributed/__init__.py @@ -83,9 +83,7 @@ def local_device() -> torch.device: if dist.is_initialized(): default_pg = _get_default_group() return ( - local_cuda_device() - if default_pg.options.backend == "nccl" - else torch.device("cpu") + local_cuda_device() if default_pg.name() == "nccl" else torch.device("cpu") ) else: return torch.device("cuda") if has_cuda_devices() else torch.device("cpu") diff --git a/torchx/examples/apps/lightning/model.py b/torchx/examples/apps/lightning/model.py index 4f4a6dcfb..76dde003d 100644 --- a/torchx/examples/apps/lightning/model.py +++ b/torchx/examples/apps/lightning/model.py @@ -23,7 +23,7 @@ import torch import torch.jit from torch.nn import functional as F -from torchmetrics import Accuracy +from torchmetrics.classification import MulticlassAccuracy from torchvision.models.resnet import BasicBlock, ResNet @@ -44,13 +44,12 @@ def __init__( # We use the torchvision resnet model with some small tweaks to match # TinyImageNet. 
- m = ResNet(BasicBlock, layer_sizes) + m = ResNet(BasicBlock, layer_sizes, num_classes=200) m.avgpool = torch.nn.AdaptiveAvgPool2d(1) - m.fc.out_features = 200 self.model: ResNet = m - self.train_acc = Accuracy() - self.val_acc = Accuracy() + self.train_acc = MulticlassAccuracy(num_classes=m.fc.out_features) + self.val_acc = MulticlassAccuracy(num_classes=m.fc.out_features) # pyre-fixme[14] def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -71,7 +70,7 @@ def validation_step( def _step( self, step_name: str, - acc_metric: Accuracy, + acc_metric: MulticlassAccuracy, batch: Tuple[torch.Tensor, torch.Tensor], batch_idx: int, ) -> torch.Tensor: diff --git a/torchx/examples/apps/lightning/train.py b/torchx/examples/apps/lightning/train.py index f1c340dca..b1509b019 100755 --- a/torchx/examples/apps/lightning/train.py +++ b/torchx/examples/apps/lightning/train.py @@ -60,6 +60,7 @@ import torch from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.loggers import TensorBoardLogger +from torch.distributed.elastic.multiprocessing import errors from torchx.examples.apps.lightning.data import ( create_random_data, download_data, @@ -85,7 +86,12 @@ def parse_args(argv: List[str]) -> argparse.Namespace: parser.add_argument( "--batch_size", type=int, default=32, help="batch size to use for training" ) - parser.add_argument("--num_samples", type=int, default=10, help="num_samples") + parser.add_argument( + "--num_samples", + type=int, + default=32, + help="number of samples in the dataset", + ) parser.add_argument( "--data_path", type=str, @@ -126,6 +132,7 @@ def get_model_checkpoint(args: argparse.Namespace) -> Optional[ModelCheckpoint]: ) +@errors.record def main(argv: List[str]) -> None: with tempfile.TemporaryDirectory() as tmpdir: args = parse_args(argv) @@ -138,7 +145,7 @@ def main(argv: List[str]) -> None: if not args.data_path: data_path = os.path.join(tmpdir, "data") os.makedirs(data_path) - create_random_data(data_path) + 
create_random_data(data_path, args.num_samples) else: data_path = download_data(args.data_path, tmpdir) diff --git a/torchx/runner/test/api_test.py b/torchx/runner/test/api_test.py index d488eb3f7..1f9804554 100644 --- a/torchx/runner/test/api_test.py +++ b/torchx/runner/test/api_test.py @@ -601,7 +601,7 @@ def test_get_schedulers(self, json_dumps_mock: MagicMock, _) -> None: ) app = AppDef("sleeper", roles=[role]) runner.run(app, scheduler="local") - local_sched_mock.submit.called_once_with(app, {}) + local_sched_mock.schedule.assert_called_once() def test_run_from_module(self, _: str) -> None: runner = get_runner(name="test_session") diff --git a/torchx/schedulers/test/ray_scheduler_test.py b/torchx/schedulers/test/ray_scheduler_test.py index 92482d817..230f7a97d 100644 --- a/torchx/schedulers/test/ray_scheduler_test.py +++ b/torchx/schedulers/test/ray_scheduler_test.py @@ -390,6 +390,12 @@ def test_nonmatching_address(self) -> None: ): _scheduler_with_client.submit(app=app, cfg={}) + def _assertDictContainsSubset(self, expected, actual, msg=None): + # NB: re-implements unittest.TestCase.assertDictContainsSubset() since it was removed in python-3.12 + for key, value in expected.items(): + self.assertIn(key, actual, msg) + self.assertEqual(actual[key], value, msg) + def test_client_with_headers(self) -> None: # This tests only one option for the client. Different versions may have more options available. 
headers = {"Authorization": "Bearer: token"} @@ -398,7 +404,7 @@ def test_client_with_headers(self) -> None: ) _scheduler_with_client = RayScheduler("client_session", ray_client) scheduler_client = _scheduler_with_client._get_ray_client() - self.assertDictContainsSubset(scheduler_client._headers, headers) + self._assertDictContainsSubset(scheduler_client._headers, headers) class RayClusterSetup: _instance = None # pyre-ignore @@ -606,7 +612,7 @@ def test_ray_driver_elasticity(self) -> None: # 3-3 teriminal = ( driver._step() - ) # pg 2 becomes availiable, but actor 2 shouldn't be executed + ) # pg 2 becomes available, but actor 2 shouldn't be executed self.assertEqual(teriminal, False) self.assertEqual(len(driver.active_tasks), 0) # actor1 should be finished self.assertEqual(driver.command_actors_count, 0)