Skip to content

Commit 2bc79fb

Browse files
committed
ConvertBits: uint to float, 32 pixels end safety
1 parent 58bc9f0 commit 2bc79fb

1 file changed

Lines changed: 41 additions & 4 deletions

File tree

avs_core/convert/intel/convert_bits_avx2.cpp

Lines changed: 41 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -752,7 +752,9 @@ void convert_uintN_to_float_avx2(const BYTE* srcp, BYTE* dstp, int src_rowsize,
752752
dst_pitch = dst_pitch / sizeof(float);
753753

754754
const int src_width = src_rowsize / sizeof(pixel_t);
755-
755+
// Write 32 floats (128 bytes) at once only when that is valid: scanline alignment is only 64 bytes (16 floats).
756+
// AviSynth guarantees 64-byte (16-float) scanline alignment, so writes are limited to the width rounded up to 16 floats.
757+
const int w16 = (src_width + 15) & ~15;
756758
//-----------------------
757759
bits_conv_constants d;
758760
get_bits_conv_constants(d, chroma, fulls, fulld, source_bitdepth, target_bitdepth);
@@ -763,20 +765,22 @@ void convert_uintN_to_float_avx2(const BYTE* srcp, BYTE* dstp, int src_rowsize,
763765

764766
for (int y = 0; y < src_height; y++)
765767
{
766-
for (int x = 0; x < src_width; x += 32) // process by 32 integers of 8bit, rows are 64 bytes aligned
768+
// Rows are 64-byte (16 float pixels) aligned, so we have a write constraint: never store past w16.
769+
int x;
770+
for (x = 0; x + 32 <= w16; x += 32)
767771
{
768772
__m256i src_0_32, src_1_32, src_2_32, src_3_32;
769773

770774
if constexpr (sizeof(pixel_t) == 1) // uint8_t
771775
{
772776
__m256i src_32 = _mm256_load_si256(reinterpret_cast<const __m256i*>(srcp0 + x));
773777
// unpack to 4x64bits
774-
__m256i src_0 = _mm256_permute4x64_epi64(src_32, 0);
778+
src_0_32 = _mm256_cvtepu8_epi32(_mm256_castsi256_si128(src_32)); // bytes 0-7
779+
775780
__m256i src_1 = _mm256_permute4x64_epi64(src_32, 1);
776781
__m256i src_2 = _mm256_permute4x64_epi64(src_32, 2);
777782
__m256i src_3 = _mm256_permute4x64_epi64(src_32, 3);
778783

779-
src_0_32 = _mm256_cvtepu8_epi32(_mm256_castsi256_si128(src_0));
780784
src_1_32 = _mm256_cvtepu8_epi32(_mm256_castsi256_si128(src_1));
781785
src_2_32 = _mm256_cvtepu8_epi32(_mm256_castsi256_si128(src_2));
782786
src_3_32 = _mm256_cvtepu8_epi32(_mm256_castsi256_si128(src_3));
@@ -808,6 +812,7 @@ void convert_uintN_to_float_avx2(const BYTE* srcp, BYTE* dstp, int src_rowsize,
808812
__m256 out_2_ps = _mm256_fmadd_ps(src_2_ps, m256_mul_factor, m256_dst_offset);
809813
__m256 out_3_ps = _mm256_fmadd_ps(src_3_ps, m256_mul_factor, m256_dst_offset);
810814

815+
// process 32 pixels, write 128 bytes
811816
_mm256_store_ps((dstp0 + x + 0), out_0_ps);
812817
_mm256_store_ps((dstp0 + x + 8), out_1_ps);
813818
_mm256_store_ps((dstp0 + x + 16), out_2_ps);
@@ -816,6 +821,38 @@ void convert_uintN_to_float_avx2(const BYTE* srcp, BYTE* dstp, int src_rowsize,
816821
// const float pixel = (srcp0[x] - d.src_offset_i) * d.mul_factor + d.dst_offset;
817822
// dstp0[x] = pixel; // no clamp
818823
}
824+
825+
// Tail: at most one block of 16 pixels remains (w16 is a multiple of 16, x a multiple of 32); it holds the last 1-16 valid pixels.
826+
if (x < w16) {
827+
__m256i src_0_32, src_1_32;
828+
829+
if constexpr (sizeof(pixel_t) == 1) // uint8_t
830+
{
831+
// __m128i is enough: 16 pixels of 8 bits
832+
__m128i src_16_bytes = _mm_load_si128(reinterpret_cast<const __m128i*>(srcp0 + x));
833+
src_0_32 = _mm256_cvtepu8_epi32(src_16_bytes); // 0-7
834+
src_1_32 = _mm256_cvtepu8_epi32(_mm_srli_si128(src_16_bytes, 8)); // 8-15, quick permuteless
835+
}
836+
else // uint16_t
837+
{
838+
__m256i src_16 = _mm256_load_si256(reinterpret_cast<const __m256i*>(srcp0 + x));
839+
src_0_32 = _mm256_cvtepu16_epi32(_mm256_castsi256_si128(src_16)); // 0-7
840+
src_1_32 = _mm256_cvtepu16_epi32(_mm256_extracti128_si256(src_16, 1)); // 8-15
841+
}
842+
843+
src_0_32 = _mm256_sub_epi32(src_0_32, m256_src_offset_epi32);
844+
src_1_32 = _mm256_sub_epi32(src_1_32, m256_src_offset_epi32);
845+
846+
__m256 out_0_ps = _mm256_fmadd_ps(_mm256_cvtepi32_ps(src_0_32), m256_mul_factor, m256_dst_offset);
847+
__m256 out_1_ps = _mm256_fmadd_ps(_mm256_cvtepi32_ps(src_1_32), m256_mul_factor, m256_dst_offset);
848+
849+
// 16 floats - 64 bytes always safe
850+
_mm256_store_ps((dstp0 + x + 0), out_0_ps);
851+
_mm256_store_ps((dstp0 + x + 8), out_1_ps);
852+
853+
// const float pixel = (srcp0[x] - d.src_offset_i) * d.mul_factor + d.dst_offset;
854+
// dstp0[x] = pixel; // no clamp
855+
}
819856
dstp0 += dst_pitch;
820857
srcp0 += src_pitch;
821858
}

0 commit comments

Comments
 (0)