@@ -752,7 +752,9 @@ void convert_uintN_to_float_avx2(const BYTE* srcp, BYTE* dstp, int src_rowsize,
752752 dst_pitch = dst_pitch / sizeof (float );
753753
754754 const int src_width = src_rowsize / sizeof (pixel_t );
755-
755+ // Write 32 floats (128 bytes) per iteration only while that stays inside the row; scanline alignment is only 64 bytes (16 floats).
756+ // Writing up to the next 16-float (64-byte) boundary is always allowed: that alignment is guaranteed by AviSynth.
757+ const int w16 = (src_width + 15 ) & ~15 ;
756758 // -----------------------
757759 bits_conv_constants d;
758760 get_bits_conv_constants (d, chroma, fulls, fulld, source_bitdepth, target_bitdepth);
@@ -763,20 +765,22 @@ void convert_uintN_to_float_avx2(const BYTE* srcp, BYTE* dstp, int src_rowsize,
763765
764766 for (int y = 0 ; y < src_height; y++)
765767 {
766- for (int x = 0 ; x < src_width; x += 32 ) // process by 32 integers of 8bit, rows are 64 bytes aligned
768+ // rows are 64-byte aligned (16 float pixels), so writes must not go past w16
769+ int x;
770+ for (x = 0 ; x + 32 <= w16; x += 32 )
767771 {
768772 __m256i src_0_32, src_1_32, src_2_32, src_3_32;
769773
770774 if constexpr (sizeof (pixel_t ) == 1 ) // uint8_t
771775 {
772776 __m256i src_32 = _mm256_load_si256 (reinterpret_cast <const __m256i*>(srcp0 + x));
773777 // unpack to 4x64bits
774- __m256i src_0 = _mm256_permute4x64_epi64 (src_32, 0 );
778+ src_0_32 = _mm256_cvtepu8_epi32 (_mm256_castsi256_si128 (src_32)); // bytes 0-7
779+
775780 __m256i src_1 = _mm256_permute4x64_epi64 (src_32, 1 );
776781 __m256i src_2 = _mm256_permute4x64_epi64 (src_32, 2 );
777782 __m256i src_3 = _mm256_permute4x64_epi64 (src_32, 3 );
778783
779- src_0_32 = _mm256_cvtepu8_epi32 (_mm256_castsi256_si128 (src_0));
780784 src_1_32 = _mm256_cvtepu8_epi32 (_mm256_castsi256_si128 (src_1));
781785 src_2_32 = _mm256_cvtepu8_epi32 (_mm256_castsi256_si128 (src_2));
782786 src_3_32 = _mm256_cvtepu8_epi32 (_mm256_castsi256_si128 (src_3));
@@ -808,6 +812,7 @@ void convert_uintN_to_float_avx2(const BYTE* srcp, BYTE* dstp, int src_rowsize,
808812 __m256 out_2_ps = _mm256_fmadd_ps (src_2_ps, m256_mul_factor, m256_dst_offset);
809813 __m256 out_3_ps = _mm256_fmadd_ps (src_3_ps, m256_mul_factor, m256_dst_offset);
810814
815+ // process 32 pixels, write 128 bytes
811816 _mm256_store_ps ((dstp0 + x + 0 ), out_0_ps);
812817 _mm256_store_ps ((dstp0 + x + 8 ), out_1_ps);
813818 _mm256_store_ps ((dstp0 + x + 16 ), out_2_ps);
@@ -816,6 +821,38 @@ void convert_uintN_to_float_avx2(const BYTE* srcp, BYTE* dstp, int src_rowsize,
816821 // const float pixel = (srcp0[x] - d.src_offset_i) * d.mul_factor + d.dst_offset;
817822 // dstp0[x] = pixel; // no clamp
818823 }
824+
825+ // tail (runs at most once): process the remaining 1-16 pixels as a single block of 16
826+ if (x < w16) {
827+ __m256i src_0_32, src_1_32;
828+
829+ if constexpr (sizeof (pixel_t ) == 1 ) // uint8_t
830+ {
831+ // _m128 enough, 16 pixels
832+ __m128i src_16_bytes = _mm_load_si128 (reinterpret_cast <const __m128i*>(srcp0 + x));
833+ src_0_32 = _mm256_cvtepu8_epi32 (src_16_bytes); // 0-7
834+ src_1_32 = _mm256_cvtepu8_epi32 (_mm_srli_si128 (src_16_bytes, 8 )); // 8-15, quick permuteless
835+ }
836+ else // uint16_t
837+ {
838+ __m256i src_16 = _mm256_load_si256 (reinterpret_cast <const __m256i*>(srcp0 + x));
839+ src_0_32 = _mm256_cvtepu16_epi32 (_mm256_castsi256_si128 (src_16)); // 0-7
840+ src_1_32 = _mm256_cvtepu16_epi32 (_mm256_extracti128_si256 (src_16, 1 )); // 8-15
841+ }
842+
843+ src_0_32 = _mm256_sub_epi32 (src_0_32, m256_src_offset_epi32);
844+ src_1_32 = _mm256_sub_epi32 (src_1_32, m256_src_offset_epi32);
845+
846+ __m256 out_0_ps = _mm256_fmadd_ps (_mm256_cvtepi32_ps (src_0_32), m256_mul_factor, m256_dst_offset);
847+ __m256 out_1_ps = _mm256_fmadd_ps (_mm256_cvtepi32_ps (src_1_32), m256_mul_factor, m256_dst_offset);
848+
849+ // 16 floats - 64 bytes always safe
850+ _mm256_store_ps ((dstp0 + x + 0 ), out_0_ps);
851+ _mm256_store_ps ((dstp0 + x + 8 ), out_1_ps);
852+
853+ // const float pixel = (srcp0[x] - d.src_offset_i) * d.mul_factor + d.dst_offset;
854+ // dstp0[x] = pixel; // no clamp
855+ }
819856 dstp0 += dst_pitch;
820857 srcp0 += src_pitch;
821858 }
0 commit comments