diff --git a/packages/on_demand_video_decoder/data/pix_fmt_variants/h264_avc1_yuv420p.mp4 b/packages/on_demand_video_decoder/data/pix_fmt_variants/h264_avc1_yuv420p.mp4 new file mode 100644 index 0000000..7ca9653 Binary files /dev/null and b/packages/on_demand_video_decoder/data/pix_fmt_variants/h264_avc1_yuv420p.mp4 differ diff --git a/packages/on_demand_video_decoder/data/pix_fmt_variants/hevc_hev1_yuv420p.mp4 b/packages/on_demand_video_decoder/data/pix_fmt_variants/hevc_hev1_yuv420p.mp4 new file mode 100644 index 0000000..3106626 Binary files /dev/null and b/packages/on_demand_video_decoder/data/pix_fmt_variants/hevc_hev1_yuv420p.mp4 differ diff --git a/packages/on_demand_video_decoder/data/pix_fmt_variants/hevc_hev1_yuv420p10le.mp4 b/packages/on_demand_video_decoder/data/pix_fmt_variants/hevc_hev1_yuv420p10le.mp4 new file mode 100644 index 0000000..aa1f912 Binary files /dev/null and b/packages/on_demand_video_decoder/data/pix_fmt_variants/hevc_hev1_yuv420p10le.mp4 differ diff --git a/packages/on_demand_video_decoder/data/pix_fmt_variants/hevc_hvc1_yuv420p.mp4 b/packages/on_demand_video_decoder/data/pix_fmt_variants/hevc_hvc1_yuv420p.mp4 new file mode 100644 index 0000000..993c116 Binary files /dev/null and b/packages/on_demand_video_decoder/data/pix_fmt_variants/hevc_hvc1_yuv420p.mp4 differ diff --git a/packages/on_demand_video_decoder/data/pix_fmt_variants/hevc_hvc1_yuv420p10le.mp4 b/packages/on_demand_video_decoder/data/pix_fmt_variants/hevc_hvc1_yuv420p10le.mp4 new file mode 100644 index 0000000..c8a6f29 Binary files /dev/null and b/packages/on_demand_video_decoder/data/pix_fmt_variants/hevc_hvc1_yuv420p10le.mp4 differ diff --git a/packages/on_demand_video_decoder/ext_impl/src/VideoCodecSDKUtils/helper_classes/Utils/FFmpegDemuxer.h b/packages/on_demand_video_decoder/ext_impl/src/VideoCodecSDKUtils/helper_classes/Utils/FFmpegDemuxer.h index e6b715e..534fe20 100644 --- a/packages/on_demand_video_decoder/ext_impl/src/VideoCodecSDKUtils/helper_classes/Utils/FFmpegDemuxer.h +++ b/packages/on_demand_video_decoder/ext_impl/src/VideoCodecSDKUtils/helper_classes/Utils/FFmpegDemuxer.h @@ -23,6 +23,7 @@ extern "C" { #include #include #include +#include /* Explicitly include bsf.h when building against FFmpeg 4.3 (libavcodec 58.45.100) or later for backward compatibility */ #if LIBAVCODEC_VERSION_INT >= 3824484 #include @@ -193,6 +194,259 @@ static int64_t ffbufio_seek(void* opaque, int64_t offset, int whence) { /** * @brief libavformat wrapper class. Retrieves the elementary encoded stream from the container format. */ +// ===================================================================== +// SPS extradata parser — fallback for pixel format detection +// --------------------------------------------------------------------- +// `avformat_find_stream_info` usually fills `AVCodecParameters::format` +// through codec-side parsing. In some builds, pixel format information +// may remain unavailable in `codecpar->format`, which can cause later +// logic to fall back to a default chroma layout. +// +// The helpers in this namespace parse the SPS NAL stored in container +// extradata (HVCC for HEVC, avcC for H.264) and extract the minimum +// fields needed to infer pixel format, specifically `bit_depth_luma` +// and `chroma_format_idc`. +// +// This is a lightweight metadata parser: it reads stream parameter +// fields only and is used for format detection rather than frame +// reconstruction. +// ===================================================================== +namespace ffmpeg_demuxer_detail { + +struct BitReader { + const uint8_t* buf; + int byte; + int bit; + int size; +}; + +inline int br_read_bit(BitReader* b) { + if (b->byte >= b->size) return 0; + int v = (b->buf[b->byte] >> (7 - b->bit)) & 1; + if (++b->bit == 8) { b->bit = 0; b->byte++; } + return v; +} + +inline unsigned br_read_bits(BitReader* b, int n) { + unsigned v = 0; + while (n-- > 0) { v = (v << 1) | (unsigned)br_read_bit(b); } + return v; +} + +inline unsigned br_read_ue(BitReader* b) { + int zeros = 0; + while (zeros < 32 && b->byte < b->size && br_read_bit(b) == 0) zeros++; + return (1u << zeros) - 1 + br_read_bits(b, zeros); +} + +// Strip H.264/HEVC emulation prevention bytes (0x00 0x00 0x03 -> 0x00 0x00). +inline int rbsp_strip(const uint8_t* src, int n, uint8_t* dst, int dst_capacity) { + int o = 0; + for (int i = 0; i < n && o < dst_capacity; i++) { + if (i + 2 < n && src[i] == 0 && src[i+1] == 0 && src[i+2] == 3) { + if (o + 2 > dst_capacity) break; + dst[o++] = 0; dst[o++] = 0; i += 2; + } else { + dst[o++] = src[i]; + } + } + return o; +} + +// Locate an SPS NAL inside HEVC HVCC extradata. +inline bool find_hevc_sps(const uint8_t* ed, int n, + const uint8_t** sps_out, int* sps_len_out) { + if (n < 23 || ed[0] != 1) return false; // configurationVersion=1 expected + int p = 22; + int num_arrays = ed[p++]; + for (int a = 0; a < num_arrays && p < n; a++) { + if (p + 3 > n) return false; + int nal_type = ed[p] & 0x3F; + int num_nalus = (ed[p+1] << 8) | ed[p+2]; + p += 3; + for (int i = 0; i < num_nalus; i++) { + if (p + 2 > n) return false; + int nl = (ed[p] << 8) | ed[p+1]; + p += 2; + if (p + nl > n) return false; + if (nal_type == 33 /* HEVC NAL_SPS */) { + *sps_out = ed + p; + *sps_len_out = nl; + return true; + } + p += nl; + } + } + return false; +} + +// Locate the first SPS NAL inside H.264 avcC extradata. +inline bool find_h264_sps(const uint8_t* ed, int n, + const uint8_t** sps_out, int* sps_len_out) { + if (n < 7 || ed[0] != 1) return false; // configurationVersion=1 + int num_sps = ed[5] & 0x1F; + if (num_sps < 1) return false; + int p = 6; + if (p + 2 > n) return false; + int nl = (ed[p] << 8) | ed[p+1]; + p += 2; + if (p + nl > n) return false; + *sps_out = ed + p; + *sps_len_out = nl; + return true; +} + +// Parse a HEVC SPS RBSP for chroma_format_idc and bit_depth_luma. +inline bool parse_hevc_sps(const uint8_t* sps_nal, int len, + int* bit_depth_out, int* chroma_idc_out) { + if (len < 3) return false; + uint8_t buf[8192]; + int copy_len = len > (int)sizeof(buf) ? (int)sizeof(buf) : len; + int rl = rbsp_strip(sps_nal, copy_len, buf, (int)sizeof(buf)); + BitReader b{ buf, 0, 0, rl }; + + br_read_bits(&b, 16); // 2-byte HEVC NAL header + br_read_bits(&b, 4); // sps_video_parameter_set_id + int max_sub = (int)br_read_bits(&b, 3); // sps_max_sub_layers_minus1 + br_read_bits(&b, 1); // sps_temporal_id_nesting_flag + + // profile_tier_level (profilePresentFlag = 1) + br_read_bits(&b, 2 + 1 + 5); // general_profile_space/tier/idc + br_read_bits(&b, 32); // general_profile_compatibility_flag + br_read_bits(&b, 4); // 4 source-format flags + br_read_bits(&b, 43); // constraint flags + br_read_bits(&b, 1); // general_inbld_flag + br_read_bits(&b, 8); // general_level_idc + + int sub_p[8] = {0}, sub_l[8] = {0}; + for (int i = 0; i < max_sub; i++) { + sub_p[i] = br_read_bit(&b); + sub_l[i] = br_read_bit(&b); + } + if (max_sub > 0) { + for (int i = max_sub; i < 8; i++) br_read_bits(&b, 2); + } + for (int i = 0; i < max_sub; i++) { + if (sub_p[i]) { + br_read_bits(&b, 2 + 1 + 5); + br_read_bits(&b, 32); + br_read_bits(&b, 4); + br_read_bits(&b, 43); + br_read_bits(&b, 1); + } + if (sub_l[i]) br_read_bits(&b, 8); + } + + br_read_ue(&b); // sps_seq_parameter_set_id + *chroma_idc_out = (int)br_read_ue(&b); + if (*chroma_idc_out == 3) br_read_bit(&b); // separate_colour_plane_flag + br_read_ue(&b); // pic_width_in_luma_samples + br_read_ue(&b); // pic_height_in_luma_samples + if (br_read_bit(&b)) { // conformance_window_flag + br_read_ue(&b); br_read_ue(&b); + br_read_ue(&b); br_read_ue(&b); + } + *bit_depth_out = 8 + (int)br_read_ue(&b); + return true; +} + +// Parse an H.264 SPS RBSP. Baseline / Main profiles imply 4:2:0 8-bit. +inline bool parse_h264_sps(const uint8_t* sps_nal, int len, + int* bit_depth_out, int* chroma_idc_out) { + if (len < 4) return false; + uint8_t buf[4096]; + int copy_len = len > (int)sizeof(buf) ? (int)sizeof(buf) : len; + int rl = rbsp_strip(sps_nal, copy_len, buf, (int)sizeof(buf)); + BitReader b{ buf, 0, 0, rl }; + + br_read_bits(&b, 8); // 1-byte H.264 NAL header + int profile_idc = (int)br_read_bits(&b, 8); + br_read_bits(&b, 8); // constraint_set + reserved + br_read_bits(&b, 8); // level_idc + br_read_ue(&b); // seq_parameter_set_id + + // Only the high profiles carry chroma_format_idc / bit_depth fields. + // ITU-T H.264 (V14) 7.4.2.1.1. + static const int hi_profiles[] = { + 100, 110, 122, 244, 44, 83, 86, 118, 128, 138, 139, 134, 135 + }; + bool is_high = false; + for (size_t i = 0; i < sizeof(hi_profiles) / sizeof(hi_profiles[0]); i++) { + if (profile_idc == hi_profiles[i]) { is_high = true; break; } + } + if (!is_high) { + *chroma_idc_out = 1; // implied 4:2:0 + *bit_depth_out = 8; // implied 8-bit + return true; + } + + *chroma_idc_out = (int)br_read_ue(&b); + if (*chroma_idc_out == 3) br_read_bit(&b); // separate_colour_plane_flag + *bit_depth_out = 8 + (int)br_read_ue(&b); + return true; +} + +// Map (chroma_format_idc, bit_depth) to the AVPixelFormat enum values the +// existing eChromaFormat switch handles. Returns AV_PIX_FMT_NONE for +// combinations we do not synthesize; the existing default-branch fallback +// will then preserve current behavior. +inline AVPixelFormat pix_fmt_from_sps(int chroma_idc, int bit_depth) { + switch (chroma_idc) { + case 0: // monochrome + if (bit_depth == 8) return AV_PIX_FMT_GRAY8; + if (bit_depth == 10) return AV_PIX_FMT_GRAY10LE; + break; + case 1: // 4:2:0 + if (bit_depth == 8) return AV_PIX_FMT_YUV420P; + if (bit_depth == 10) return AV_PIX_FMT_YUV420P10LE; + if (bit_depth == 12) return AV_PIX_FMT_YUV420P12LE; + break; + case 3: // 4:4:4 + if (bit_depth == 8) return AV_PIX_FMT_YUV444P; + if (bit_depth == 10) return AV_PIX_FMT_YUV444P10LE; + if (bit_depth == 12) return AV_PIX_FMT_YUV444P12LE; + break; + default: + break; + } + return AV_PIX_FMT_NONE; +} + +// Top-level entry point. Returns AV_PIX_FMT_NONE when recovery is not +// possible (unsupported codec, missing extradata, malformed SPS, or an +// unmappable chroma/bit-depth combination). +inline AVPixelFormat recover_pix_fmt_from_extradata(const AVCodecParameters* cp) { + if (!cp || !cp->extradata || cp->extradata_size <= 0) { + return AV_PIX_FMT_NONE; + } + const uint8_t* sps = nullptr; + int sps_len = 0; + int bit_depth = 8; + int chroma_idc = 1; + + if (cp->codec_id == AV_CODEC_ID_HEVC) { + if (!find_hevc_sps(cp->extradata, cp->extradata_size, &sps, &sps_len)) { + return AV_PIX_FMT_NONE; + } + if (!parse_hevc_sps(sps, sps_len, &bit_depth, &chroma_idc)) { + return AV_PIX_FMT_NONE; + } + } else if (cp->codec_id == AV_CODEC_ID_H264) { + if (!find_h264_sps(cp->extradata, cp->extradata_size, &sps, &sps_len)) { + return AV_PIX_FMT_NONE; + } + if (!parse_h264_sps(sps, sps_len, &bit_depth, &chroma_idc)) { + return AV_PIX_FMT_NONE; + } + } else { + return AV_PIX_FMT_NONE; + } + + return pix_fmt_from_sps(chroma_idc, bit_depth); +} + +} // namespace ffmpeg_demuxer_detail + class FFmpegDemuxer { private: AVFormatContext* fmtc = NULL; @@ -329,6 +583,21 @@ class FFmpegDemuxer { eChromaFormat = (AVPixelFormat)fmtc->streams[iVideoStream]->codecpar->format; color_space = fmtc->streams[iVideoStream]->codecpar->color_space; color_range = fmtc->streams[iVideoStream]->codecpar->color_range; + + // FFmpeg builds without the HEVC / H.264 decoder cannot determine pix_fmt + // during stream-info probing. Fall back to parsing SPS extradata so the + // switch below sees the real format instead of taking the default branch. + if (eChromaFormat == AV_PIX_FMT_NONE) { + AVPixelFormat recovered = ffmpeg_demuxer_detail::recover_pix_fmt_from_extradata( + fmtc->streams[iVideoStream]->codecpar); + if (recovered != AV_PIX_FMT_NONE) { + const char* name = av_get_pix_fmt_name(recovered); + LOG(INFO) << "Recovered pix_fmt from SPS extradata: " + << (name ? name : "?"); + eChromaFormat = recovered; + } + } + switch (eChromaFormat) { case AV_PIX_FMT_YUV420P10LE: case AV_PIX_FMT_GRAY10LE: // monochrome is treated as 420 with chroma filled with 0x0 diff --git a/packages/on_demand_video_decoder/tests/test_pix_fmt_detection.py b/packages/on_demand_video_decoder/tests/test_pix_fmt_detection.py new file mode 100644 index 0000000..1b28ce8 --- /dev/null +++ b/packages/on_demand_video_decoder/tests/test_pix_fmt_detection.py @@ -0,0 +1,120 @@ +# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Pix_fmt detection coverage across the codec_tag / bit-depth matrix. + +Regression target: a previously-shipped patent-free FFmpeg build cannot populate +`AVCodecParameters::format` for HEVC / H.264 streams (no decoder is linked in to +probe the SPS), and the demuxer silently fell back to 8-bit yuv420p. For a real +10-bit HEVC stream this mis-sized the GPU output buffer by a factor of 2 and +crashed `DecodeFromGOP` with `CUDA_ERROR_INVALID_VALUE`. + +These tests drive `GetGOPList` + `DecodeFromGOP` over the matrix of stream +shapes the package supports on NVDEC (HEVC at both 8-bit and 10-bit with both +hev1 and hvc1 sample-entry tags, plus H.264 8-bit), and assert the decoded +planes carry the dtype implied by the stream's real bit-depth (uint8 for +8-bit, uint16 for 10-bit). This exercises the SPS-extradata fallback added to +FFmpegDemuxer.h when `codecpar->format == AV_PIX_FMT_NONE`. +""" + +import os +import pytest + +import accvlab.on_demand_video_decoder as nvc +import utils + +VARIANTS_DIR = os.path.join(utils.get_data_dir(), "pix_fmt_variants") + + +# Each variant lists the filename, the codec_tag carried by the container, the +# bit-depth the stream actually encodes, and the dtype that should appear on the +# decoded Y plane. All variants here are NVDEC-decodable on the GPUs this +# package targets — see the support matrix for codec/bit-depth coverage: +# https://developer.nvidia.com/video-encode-decode-support-matrix +VARIANTS = [ + ("hevc_hev1_yuv420p.mp4", "hev1", 8, "|u1"), + ("hevc_hev1_yuv420p10le.mp4", "hev1", 10, "|u2"), + ("hevc_hvc1_yuv420p.mp4", "hvc1", 8, "|u1"), + ("hevc_hvc1_yuv420p10le.mp4", "hvc1", 10, "|u2"), + ("h264_avc1_yuv420p.mp4", "avc1", 8, "|u1"), +] + + +def _video_path(name): + path = os.path.join(VARIANTS_DIR, name) + if not os.path.exists(path): + pytest.skip(f"test asset missing: {path}") + return path + + +@pytest.mark.parametrize( + "filename, codec_tag, bit_depth, expected_dtype", + VARIANTS, + ids=[v[0] for v in VARIANTS], +) +def test_decode_from_gop_round_trip(filename, codec_tag, bit_depth, expected_dtype): + """End-to-end: GetGOPList -> DecodeFromGOP must produce a plane of the + correct dtype for the stream's actual bit-depth.""" + path = _video_path(filename) + + demuxer = nvc.CreateGopDecoder(maxfiles=1, iGpu=0) + decoder = nvc.CreateGopDecoder(maxfiles=1, iGpu=0) + + gop_list = demuxer.GetGOPList([path], [0], useGOPCache=True) + assert gop_list, f"GetGOPList returned empty for {filename}" + gop_data, first_ids, gop_lens = gop_list[0] + assert gop_data.size > 0, f"GOP data is empty for {filename}" + assert first_ids == [0], f"unexpected first_ids={first_ids} for {filename}" + assert gop_lens and gop_lens[0] > 0, f"unexpected gop_lens={gop_lens} for {filename}" + + frames = decoder.DecodeFromGOP(gop_data, [path], [0]) + assert len(frames) == 1, f"expected 1 frame, got {len(frames)} for {filename}" + + planes = frames[0].cuda() + assert len(planes) >= 1, f"no planes returned for {filename}" + + cai = planes[0].__cuda_array_interface__ + assert cai["typestr"] == expected_dtype, ( + f"Y plane dtype mismatch for {filename}: got {cai['typestr']!r}, " + f"expected {expected_dtype!r} for {bit_depth}-bit" + ) + + expected_bytes_per_sample = 2 if bit_depth >= 10 else 1 + actual_bytes_per_sample = int(cai["typestr"][-1]) + assert actual_bytes_per_sample == expected_bytes_per_sample, ( + f"Y plane element size mismatch for {filename}: got " + f"{actual_bytes_per_sample}B, expected {expected_bytes_per_sample}B" + ) + + +@pytest.mark.parametrize( + "filename, codec_tag, bit_depth, expected_dtype", + VARIANTS, + ids=[v[0] for v in VARIANTS], +) +def test_decode_does_not_raise_invalid_value(filename, codec_tag, bit_depth, expected_dtype): + """Focused regression: the specific failure mode we fixed was + `CUDA_ERROR_INVALID_VALUE` thrown from DecodeFromGOP because the GPU buffer + was half the size NVDEC writes. Reproduce the exact call sequence the + customer used and assert it does not raise that error.""" + path = _video_path(filename) + demuxer = nvc.CreateGopDecoder(maxfiles=1, iGpu=0) + decoder = nvc.CreateGopDecoder(maxfiles=1, iGpu=0) + + gop_data, _, _ = demuxer.GetGOPList([path], [0], useGOPCache=True)[0] + # No prior RGB call to "prime" the GPU pool — exercise the raw YUV path + # directly, which was the broken path before the SPS fallback. + frames = decoder.DecodeFromGOP(gop_data, [path], [0]) + assert frames, f"DecodeFromGOP returned no frames for {filename}" diff --git a/packages/on_demand_video_decoder/tests/utils.py b/packages/on_demand_video_decoder/tests/utils.py index 22a37cf..6fc3730 100644 --- a/packages/on_demand_video_decoder/tests/utils.py +++ b/packages/on_demand_video_decoder/tests/utils.py @@ -141,7 +141,14 @@ def get_data_dir(): def select_random_clip(path_base): - subdirs = [d for d in os.listdir(path_base) if os.path.isdir(os.path.join(path_base, d))] + # Only consider sample_clip* subdirs as eligible for the general random-clip + # tests. Other data/ subdirs (e.g. pix_fmt_variants/) hold targeted fixtures + # whose contents may not be RGB-decodable on the runtime GPU. + subdirs = [ + d + for d in os.listdir(path_base) + if os.path.isdir(os.path.join(path_base, d)) and d.startswith("sample_clip") + ] if not subdirs: return None clip_dir = os.path.join(path_base, random.choice(subdirs))