diff --git a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp
index f21325cd0848f..b92963c27a054 100644
--- a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp
+++ b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp
@@ -7,6 +7,7 @@
 #include <ATen/NativeFunctions.h>
 #else
 #include <ATen/ops/empty.h>
+#include <ATen/ops/empty_like.h>
 #include <ATen/ops/miopen_batch_norm_backward_native.h>
 #include <ATen/ops/miopen_batch_norm_native.h>
 #endif
@@ -102,7 +103,7 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm(
     mode = miopenBNSpatial;
   }

-  auto output_t = at::empty(input->sizes(), input->options(), input->suggest_memory_format());
+  auto output_t = at::empty_like(input_t, input_t.options(), input_t.suggest_memory_format());
   TensorArg output{ output_t, "output", 0 };

   auto handle = getMiopenHandle();
@@ -170,22 +171,15 @@ std::tuple<Tensor, Tensor, Tensor> miopen_batch_norm_backward(
     const std::optional<Tensor>& save_var_t_opt,
     double epsilon) {
   // See [Note: hacky wrapper removal for optional tensor]
-  const Tensor& running_mean =
-      running_mean_opt.value_or(Tensor());
-  const Tensor& running_var =
-      running_var_opt.value_or(Tensor());
-  const Tensor& save_mean_t =
-      save_mean_t_opt.value_or(Tensor());
-  const Tensor& save_var_t =
-      save_var_t_opt.value_or(Tensor());
+  const Tensor& save_mean_t = save_mean_t_opt.value_or(Tensor());
+  const Tensor& save_var_t = save_var_t_opt.value_or(Tensor());

   auto grad_output_contig =
       grad_output_t.contiguous(input_t.suggest_memory_format());
-  TensorArg input{ input_t, "input", 1 },
-            grad_output{ grad_output_contig, "grad_output", 2 },
-            weight{ weight_t, "weight", 3 },
-            save_mean{ save_mean_t, "save_mean", 4 },
-            save_var{ save_var_t, "save_var", 5 };
+  TensorArg input{input_t, "input", 1},
+      grad_output{grad_output_contig, "grad_output", 2},
+      weight{weight_t, "weight", 3}, save_mean{save_mean_t, "save_mean", 4},
+      save_var{save_var_t, "save_var", 5};
   CheckedFrom c = "miopen_batch_norm_backward";

   checkAllDefined(c, {input, grad_output, weight, save_mean, save_var});
diff --git a/test/nn/test_convolution.py b/test/nn/test_convolution.py
index 2685a35ba5873..ef94df86310c8 100644
--- a/test/nn/test_convolution.py
+++ b/test/nn/test_convolution.py
@@ -30,7 +30,6 @@
     skipCUDAIfMiopen,
     skipCUDAIfNoCudnn,
     skipCUDAIfNoMiopen,
-    skipCUDAIfNotMiopenSuggestNHWC,
     skipCUDAIfRocm,
     skipMeta,
     skipMPS,
@@ -52,9 +51,7 @@
     parametrize as parametrize_test,
     run_tests,
     set_default_dtype,
-    skipIfNotMiopenSuggestNHWC,
     skipIfRocmArch,
-    skipIfRocmVersionLessThan,
     subtest,
     TEST_SCIPY,
     TEST_WITH_ROCM,
@@ -66,6 +63,7 @@

 if TEST_WITH_ROCM:
     os.environ["PYTORCH_MIOPEN_SUGGEST_NHWC"] = "1"
+    os.environ["PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM"] = "1"


 if TEST_SCIPY:
@@ -717,7 +715,6 @@ def test_ConvTranspose2d_half_cublas_gemm(self):
     # Almost identical to the above `test_Conv2d_naive_groups`
     @torch.backends.cudnn.flags(enabled=True, deterministic=True, benchmark=False)
     @tf32_on_and_off(0.001)
-    @unittest.skipIf(TEST_WITH_ROCM, "Skipped on ROCm, since it is failing on ROCm 5.7")
     def test_Conv2d_groups_nobias(self):
         dev_dtypes = [("cpu", torch.float)]
         if TEST_CUDA:
@@ -763,7 +760,6 @@ def test_Conv2d_groups_nobias(self):
     # and https://github.com/pytorch/pytorch/pull/18463#issuecomment-477001024
     @torch.backends.cudnn.flags(enabled=True, deterministic=True, benchmark=False)
     @tf32_on_and_off(0.001)
-    @unittest.skipIf(TEST_WITH_ROCM, "Skipped on ROCm, since it is failing on ROCm 5.7")
     def test_Conv2d_groups_nobias_v2(self):
         torch.manual_seed(123)
         dev_dtypes = [("cpu", torch.float)]
@@ -898,7 +894,6 @@ def test_conv_tbc(self):

     @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
     @unittest.skipIf(not TEST_CUDNN, "needs cudnn")
-    @skipIfNotMiopenSuggestNHWC
     def test_grouped_conv_cudnn_nhwc_support(self):
         # in order to catch the hols in grouped convolution in nhwc support for earlier cudnn version
         input = torch.randn((16, 16, 8, 8), dtype=torch.float16, device="cuda").to(
@@ -3147,7 +3142,6 @@ def test_conv_noncontig_weights_and_bias(self, device):

     @onlyCUDA
     @largeTensorTest("12GB")
-    @skipIfRocmVersionLessThan((6, 0))
     def test_conv_transposed_large(self, device):
         dtype = torch.half if self.device_type == "cuda" else torch.float
         conv = nn.ConvTranspose2d(1, 1, 1, 1, bias=False).to(device).to(dtype)
@@ -3191,7 +3185,6 @@ def test_conv_transposed_large(self, device):
         self.assertEqual(maxdiff3, 0)

     @onlyCUDA
-    @skipCUDAIfRocm
     @largeTensorTest("12GB")
     def test_conv_large(self, device):
         dtype = torch.half if self.device_type == "cuda" else torch.float
@@ -3224,7 +3217,6 @@ def test_conv_large(self, device):
         self.assertEqual(grad1, grad2, atol=5e-2, rtol=5e-3)

     @onlyCUDA
-    @skipCUDAIfRocm
     @largeTensorTest("20GB", "cpu")
     @largeTensorTest("60GB", "cuda")
     def test_conv_large_batch_1(self, device):
@@ -3372,7 +3364,6 @@ def test_ConvTranspose3d_size_1_kernel(self, device):
     @dtypes(torch.float)
     @torch.backends.cudnn.flags(enabled=True, deterministic=True, benchmark=False)
     @tf32_on_and_off(0.001)
-    @unittest.skipIf(TEST_WITH_ROCM, "Skipped on ROCm, since it is failing on ROCm 5.7")
     def test_Conv2d_naive_groups(self, device, dtype):
         # Check that grouped convolutions matches two half convolutions
         m = nn.Conv2d(4, 4, kernel_size=3, groups=2).to(device, dtype)
@@ -3641,19 +3632,21 @@ def helper(
         )

     @onlyCUDA
-    @skipCUDAIfNotMiopenSuggestNHWC
     @dtypes(torch.half, torch.float, torch.cfloat)
     def test_conv_cudnn_nhwc(self, device, dtype):
         def helper(n, c, h, w, out_channels, kernel_size, groups):
-            input = torch.randint(-3, 3, (n, c, h, w), dtype=dtype, device=device).to(
-                memory_format=torch.channels_last
-            )
+            # randint with dtype=torch.cfloat fails with
+            # RuntimeError: check_random_bounds handles only integral, floating-point and boolean types
+            # must create randint and randint_like using default int64, then cast to desired
+            input = torch.randint(
+                -3, 3, (n, c, h, w), dtype=torch.int64, device=device
+            ).to(dtype, memory_format=torch.channels_last)
             input.requires_grad_()
             conv = nn.Conv2d(c, out_channels, kernel_size, groups=groups).to(
                 device="cuda", dtype=dtype, memory_format=torch.channels_last
             )
             for p in conv.parameters():
-                p.data = torch.randint_like(p, -3, 3)
+                p.data = torch.randint_like(p, -3, 3, dtype=torch.int64).to(p.dtype)

             # use FP64 channels-first conv as reference
             ref_input = input.detach().clone().contiguous().double().requires_grad_()
@@ -3667,7 +3660,7 @@ def helper(n, c, h, w, out_channels, kernel_size, groups):
             out = conv(input)
             ref_out = ref_conv(ref_input)

-            grad = torch.randint_like(out, -3, 3)
+            grad = torch.randint_like(out, -3, 3, dtype=torch.int64).to(out.dtype)
             ref_grad = grad.detach().clone().double().contiguous()

             out.backward(grad)
@@ -3694,7 +3687,6 @@ def helper(n, c, h, w, out_channels, kernel_size, groups):
         helper(1, 16, 56, 56, out_channels=16, kernel_size=3, groups=16)

     @onlyCUDA
-    @skipCUDAIfRocm
     @dtypes(torch.half, torch.float)
     def test_conv_cudnn_ndhwc(self, device, dtype):
         def helper(n, c, d, h, w, out_channels, kernel_size, groups):
@@ -3824,7 +3816,6 @@ def _test_conv_cudnn_nhwc_nchw(self, layer, n, c, h, w, k, filter_size, device):
         )

     @onlyCUDA
-    @skipCUDAIfNotMiopenSuggestNHWC
     @tf32_on_and_off(0.05)
     def test_conv_cudnn_mismatch_memory_format(self, device):
         configs = [
@@ -3958,7 +3949,6 @@ def test_cudnn_convolution_add_relu(self, device, dtype):
         self.assertEqual(F.relu(conv2d_out + alpha * z), cudnn_out)

     @onlyCUDA
-    @skipCUDAIfRocm
     def test_convert_conv2d_weight_memory_format(self, device):
         input = torch.randint(1, 10, (2, 8, 4, 4), dtype=torch.float32, device=device)
         model = nn.Sequential(nn.Conv2d(8, 4, 3), nn.BatchNorm2d(4)).to(device).float()
@@ -3978,7 +3968,6 @@ def test_convert_conv2d_weight_memory_format(self, device):
         self.assertTrue(out.is_contiguous(memory_format=memory_format))

     @onlyCUDA
-    @skipCUDAIfRocm
     def test_convert_conv3d_weight_memory_format(self, device):
         input = torch.randint(
             1, 10, (2, 8, 4, 4, 4), dtype=torch.float32, device=device
diff --git a/test/test_nn.py b/test/test_nn.py
index e65f5d53147af..0a0d3d79567fa 100644
--- a/test/test_nn.py
+++ b/test/test_nn.py
@@ -61,6 +61,7 @@

 if TEST_WITH_ROCM:
     os.environ["PYTORCH_MIOPEN_SUGGEST_NHWC"] = "1"
+    os.environ["PYTORCH_MIOPEN_SUGGEST_NHWC_BATCHNORM"] = "1"

 # load_tests from common_utils is used to automatically filter tests for
 # sharding on sandcastle. This line silences flake warnings
@@ -3496,7 +3497,6 @@ def test_cudnn_forward_exception(self):
            self.assertRaisesRegex(RuntimeError, re.escape("input.size(-1) must be equal to input_size"), rnn, x_wrong)

     @unittest.skipIf(not TEST_CUDNN, 'CUDNN not available')
-    @skipIfRocm
     def test_cudnn_weight_format(self):
         rnns = [
             nn.LSTM(10, 20, batch_first=True),
@@ -3504,7 +3504,8 @@ def test_cudnn_weight_format(self):
             nn.GRU(10, 20, batch_first=True),
             nn.RNN(10, 20, batch_first=True)
         ]
-        first_warn = True
+        # ROCm RNN does not issue warning about single contig chunk of memory, so don't assert it
+        first_warn = False if torch.version.hip else True
         for rnn in rnns:
             rnn.cuda()
             input = torch.randn(5, 4, 10, requires_grad=True, device="cuda")
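For reference, a minimal standalone sketch (not part of the patch) of the randint workaround used in `test_conv_cudnn_nhwc` above: `torch.randint` and `torch.randint_like` reject complex dtypes such as `torch.cfloat`, so the test draws integers in the default `int64` dtype first and casts to the target dtype and memory format in a second step. The shape and bounds below are illustrative only.

```python
import torch

# torch.randint(..., dtype=torch.cfloat) raises
# "RuntimeError: check_random_bounds handles only integral, floating-point and boolean types",
# so draw int64 values first, then cast to the complex dtype and channels-last layout.
shape = (1, 16, 8, 8)  # illustrative N, C, H, W
x = torch.randint(-3, 3, shape, dtype=torch.int64)
x = x.to(torch.cfloat, memory_format=torch.channels_last)

assert x.dtype == torch.cfloat
assert x.is_contiguous(memory_format=torch.channels_last)

# The same pattern applies to randint_like: generate int64, then cast back
# to the original tensor's dtype.
w = torch.randn(16, 16, 3, 3)
w = torch.randint_like(w, -3, 3, dtype=torch.int64).to(w.dtype)
```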