From 4eb6e7476bbb78a40583c59565d57ab6e87d2140 Mon Sep 17 00:00:00 2001 From: Jay Sorg Date: Thu, 11 Apr 2024 10:43:41 -0700 Subject: [PATCH 1/8] simd rfx yuv (cherry picked from commit b2f99d967c3b4eeaaacebb6681aa6a664a3aad7c) --- module/amd64/Makefile.am | 1 + .../a8r8g8b8_to_yuvalp_box_amd64_sse2.asm | 178 ++++++++++++++++++ module/amd64/funcs_amd64.h | 4 + module/rdp.h | 1 + module/rdpCapture.c | 95 ++++++---- module/rdpCapture.h | 4 + module/rdpSimd.c | 87 +++++++++ module/x86/Makefile.am | 1 + .../x86/a8r8g8b8_to_yuvalp_box_x86_sse2.asm | 171 +++++++++++++++++ module/x86/funcs_x86.h | 4 + 10 files changed, 505 insertions(+), 41 deletions(-) create mode 100644 module/amd64/a8r8g8b8_to_yuvalp_box_amd64_sse2.asm create mode 100644 module/x86/a8r8g8b8_to_yuvalp_box_x86_sse2.asm diff --git a/module/amd64/Makefile.am b/module/amd64/Makefile.am index cd2a0204..be4d928b 100644 --- a/module/amd64/Makefile.am +++ b/module/amd64/Makefile.am @@ -3,6 +3,7 @@ NAFLAGS += -DASM_ARCH_AMD64 ASMSOURCES = \ a8r8g8b8_to_a8b8g8r8_box_amd64_sse2.asm \ a8r8g8b8_to_nv12_box_amd64_sse2.asm \ + a8r8g8b8_to_yuvalp_box_amd64_sse2.asm \ cpuid_amd64.asm \ i420_to_rgb32_amd64_sse2.asm \ uyvy_to_rgb32_amd64_sse2.asm \ diff --git a/module/amd64/a8r8g8b8_to_yuvalp_box_amd64_sse2.asm b/module/amd64/a8r8g8b8_to_yuvalp_box_amd64_sse2.asm new file mode 100644 index 00000000..cfe9d6af --- /dev/null +++ b/module/amd64/a8r8g8b8_to_yuvalp_box_amd64_sse2.asm @@ -0,0 +1,178 @@ +; +;Copyright 2024 Jay Sorg +; +;Permission to use, copy, modify, distribute, and sell this software and its +;documentation for any purpose is hereby granted without fee, provided that +;the above copyright notice appear in all copies and that both that +;copyright notice and this permission notice appear in supporting +;documentation. +; +;The above copyright notice and this permission notice shall be included in +;all copies or substantial portions of the Software. 
+; +;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN +;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +; +;ARGB to YUVALP +;amd64 SSE2 +; +; notes +; address s8 should be aligned on 16 bytes, will be slower if not +; width must be multiple of 8 and > 0 +; height must be > 0 + +%include "common.asm" + +PREPARE_RODATA + cd255 times 4 dd 255 + cw128 times 8 dw 128 + cw77 times 8 dw 77 + cw150 times 8 dw 150 + cw29 times 8 dw 29 + cw43 times 8 dw 43 + cw85 times 8 dw 85 + cw107 times 8 dw 107 + cw21 times 8 dw 21 + +%define LS8 [rsp + 0] ; s8 +%define LSRC_STRIDE [rsp + 8] ; src_stride +%define LD8 [rsp + 16] ; d8 +%define LDST_STRIDE [rsp + 24] ; dst_stride +%define LWIDTH [rsp + 32] ; width +%define LHEIGHT [rsp + 40] ; height + +;The first six integer or pointer arguments are passed in registers +; RDI, RSI, RDX, RCX, R8, and R9 + +;int +;a8r8g8b8_to_yuvalp_box_amd64_sse2(const uint8_t *s8, int src_stride, +; uint8_t *d8, int dst_stride, +; int width, int height); +PROC a8r8g8b8_to_yuvalp_box_amd64_sse2 + push rbx + push rbp + sub rsp, 48 ; local vars, 48 bytes + + mov LS8, rdi ; s8 + mov LSRC_STRIDE, rsi ; src_stride + mov LD8, rdx ; d8 + mov LDST_STRIDE, rcx ; dst_stride + mov LWIDTH, r8 ; width + mov LHEIGHT, r9 ; height + + pxor xmm7, xmm7 + + mov ebx, LHEIGHT ; ebx = height + +row_loop1: + mov rsi, LS8 ; s8 + mov rdi, LD8 ; d8 + + mov ecx, LWIDTH ; ecx = width + shr ecx, 3 ; doing 8 pixels at a time + +loop1: + movdqu xmm0, [rsi] ; 4 pixels, 16 bytes + movdqa xmm1, xmm0 ; blue + pand xmm1, [lsym(cd255)] ; blue + movdqa xmm2, xmm0 ; green + psrld xmm2, 8 ; green + pand xmm2, [lsym(cd255)] ; green + movdqa xmm3, 
xmm0 ; red + psrld xmm3, 16 ; red + pand xmm3, [lsym(cd255)] ; red + movdqa xmm4, xmm0 ; alpha + psrld xmm4, 24 ; alpha + pand xmm4, [lsym(cd255)] ; alpha + + movdqu xmm0, [rsi + 16] ; 4 pixels, 16 bytes + movdqa xmm5, xmm0 ; alpha + psrld xmm5, 24 ; alpha + pand xmm5, [lsym(cd255)] ; alpha + packssdw xmm4, xmm5 ; xmm4 = 8 alphas + packuswb xmm4, xmm7 + movq [rdi + 3 * 64 * 64], xmm4 ; out 8 bytes aaaaaaaa + movdqa xmm4, xmm0 ; blue + pand xmm4, [lsym(cd255)] ; blue + movdqa xmm5, xmm0 ; green + psrld xmm5, 8 ; green + pand xmm5, [lsym(cd255)] ; green + movdqa xmm6, xmm0 ; red + psrld xmm6, 16 ; red + pand xmm6, [lsym(cd255)] ; red + + packssdw xmm1, xmm4 ; xmm1 = 8 blues + packssdw xmm2, xmm5 ; xmm2 = 8 greens + packssdw xmm3, xmm6 ; xmm3 = 8 reds + + ; _Y = (77 * _R + 150 * _G + 29 * _B) >> 8; + movdqa xmm4, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm6, xmm3 ; red + pmullw xmm4, [lsym(cw29)] + pmullw xmm5, [lsym(cw150)] + pmullw xmm6, [lsym(cw77)] + paddw xmm4, xmm5 + paddw xmm4, xmm6 + psrlw xmm4, 8 + packuswb xmm4, xmm7 + movq [rdi], xmm4 ; out 8 bytes yyyyyyyy + + ; _U = ((-43 * _R - 85 * _G + 128 * _B) >> 8) + 128; + movdqa xmm4, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm6, xmm3 ; red + pmullw xmm4, [lsym(cw128)] + pmullw xmm5, [lsym(cw85)] + pmullw xmm6, [lsym(cw43)] + psubw xmm4, xmm5 + psubw xmm4, xmm6 + psraw xmm4, 8 + paddw xmm4, [lsym(cw128)] + packuswb xmm4, xmm7 + movq [rdi + 1 * 64 * 64], xmm4 ; out 8 bytes uuuuuuuu + + ; _V = ((128 * _R - 107 * _G - 21 * _B) >> 8) + 128; + movdqa xmm6, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm4, xmm3 ; red + pmullw xmm4, [lsym(cw128)] + pmullw xmm5, [lsym(cw107)] + pmullw xmm6, [lsym(cw21)] + psubw xmm4, xmm5 + psubw xmm4, xmm6 + psraw xmm4, 8 + paddw xmm4, [lsym(cw128)] + packuswb xmm4, xmm7 + movq [rdi + 2 * 64 * 64], xmm4 ; out 8 bytes vvvvvvvv + + ; move right + lea rsi, [rsi + 32] + lea rdi, [rdi + 8] + + dec ecx + jnz loop1 + + ; update s8 + mov rax, LS8 ; s8 + add rax, 
LSRC_STRIDE ; s8 += src_stride + mov LS8, rax + + ; update d8 + mov rax, LD8 ; d8 + add rax, LDST_STRIDE ; d8 += dst_stride + mov LD8, rax + + dec ebx + jnz row_loop1 + + mov rax, 0 ; return value + add rsp, 48 ; local vars, 48 bytes + pop rbp + pop rbx + ret +END_OF_FILE diff --git a/module/amd64/funcs_amd64.h b/module/amd64/funcs_amd64.h index ae38c53b..3b54e2b2 100644 --- a/module/amd64/funcs_amd64.h +++ b/module/amd64/funcs_amd64.h @@ -43,6 +43,10 @@ a8r8g8b8_to_nv12_box_amd64_sse2(const uint8_t *s8, int src_stride, uint8_t *d8_y, int dst_stride_y, uint8_t *d8_uv, int dst_stride_uv, int width, int height); +int +a8r8g8b8_to_yuvalp_box_amd64_sse2(const uint8_t *s8, int src_stride, + uint8_t *d8, int dst_stride, + int width, int height); #endif diff --git a/module/rdp.h b/module/rdp.h index 844db424..f9e4ac61 100644 --- a/module/rdp.h +++ b/module/rdp.h @@ -297,6 +297,7 @@ struct _rdpRec copy_box_proc a8r8g8b8_to_a8b8g8r8_box; copy_box_dst2_proc a8r8g8b8_to_nv12_box; + copy_box_proc a8r8g8b8_to_yuvalp_box; /* multimon */ struct monitor_info minfo[16]; /* client monitor data */ diff --git a/module/rdpCapture.c b/module/rdpCapture.c index 334a4880..893917ce 100644 --- a/module/rdpCapture.c +++ b/module/rdpCapture.c @@ -124,24 +124,18 @@ rdpFillBox_yuvalp(int ax, int ay, /* 19595 38470 7471 -11071 -21736 32807 32756 -27429 -5327 */ -static int -rdpCopyBox_a8r8g8b8_to_yuvalp(int ax, int ay, - const uint8_t *src, int src_stride, - uint8_t *dst, int dst_stride, - BoxPtr rects, int num_rects) +int +a8r8g8b8_to_yuvalp_box(const uint8_t *s8, int src_stride, + uint8_t *d8, int dst_stride, + int width, int height) { - const uint8_t *s8; - uint8_t *d8; uint8_t *yptr; uint8_t *uptr; uint8_t *vptr; uint8_t *aptr; const uint32_t *s32; - int index; int jndex; int kndex; - int width; - int height; uint32_t pixel; uint8_t a; int r; @@ -150,6 +144,51 @@ rdpCopyBox_a8r8g8b8_to_yuvalp(int ax, int ay, int y; int u; int v; + + for (jndex = 0; jndex < height; jndex++) + { + s32 = (const 
uint32_t *) s8; + yptr = d8; + uptr = yptr + 64 * 64; + vptr = uptr + 64 * 64; + aptr = vptr + 64 * 64; + kndex = 0; + while (kndex < width) + { + pixel = *(s32++); + RGB_SPLIT(a, r, g, b, pixel); + y = (r * 19595 + g * 38470 + b * 7471) >> 16; + u = (r * -11071 + g * -21736 + b * 32807) >> 16; + v = (r * 32756 + g * -27429 + b * -5327) >> 16; + u = u + 128; + v = v + 128; + y = RDPCLAMP(y, 0, UCHAR_MAX); + u = RDPCLAMP(u, 0, UCHAR_MAX); + v = RDPCLAMP(v, 0, UCHAR_MAX); + *(yptr++) = y; + *(uptr++) = u; + *(vptr++) = v; + *(aptr++) = a; + kndex++; + } + d8 += dst_stride; + s8 += src_stride; + } + return 0; +} + +/******************************************************************************/ +static int +rdpCopyBox_a8r8g8b8_to_yuvalp(rdpClientCon *clientCon, int ax, int ay, + const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + BoxPtr rects, int num_rects) +{ + const uint8_t *s8; + uint8_t *d8; + int index; + int width; + int height; BoxPtr box; dst = dst + (ay << 8) * (dst_stride >> 8) + (ax << 8); @@ -162,35 +201,9 @@ rdpCopyBox_a8r8g8b8_to_yuvalp(int ax, int ay, d8 += box->x1 - ax; width = box->x2 - box->x1; height = box->y2 - box->y1; - for (jndex = 0; jndex < height; jndex++) - { - s32 = (const uint32_t *) s8; - yptr = d8; - uptr = yptr + 64 * 64; - vptr = uptr + 64 * 64; - aptr = vptr + 64 * 64; - kndex = 0; - while (kndex < width) - { - pixel = *(s32++); - RGB_SPLIT(a, r, g, b, pixel); - y = (r * 19595 + g * 38470 + b * 7471) >> 16; - u = (r * -11071 + g * -21736 + b * 32807) >> 16; - v = (r * 32756 + g * -27429 + b * -5327) >> 16; - u = u + 128; - v = v + 128; - y = RDPCLAMP(y, 0, UCHAR_MAX); - u = RDPCLAMP(u, 0, UCHAR_MAX); - v = RDPCLAMP(v, 0, UCHAR_MAX); - *(yptr++) = y; - *(uptr++) = u; - *(vptr++) = v; - *(aptr++) = a; - kndex++; - } - d8 += 64; - s8 += src_stride; - } + clientCon->dev->a8r8g8b8_to_yuvalp_box(s8, src_stride, + d8, 64, + width, height); } return 0; } @@ -946,7 +959,7 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr 
in_reg, BoxPtr *out_rects, rects = REGION_RECTS(&tile_reg); num_rects = REGION_NUM_RECTS(&tile_reg); crc = wyhash((const void*)rects, num_rects * sizeof(BoxRec), crc, _wyp); - rdpCopyBox_a8r8g8b8_to_yuvalp(x, y, + rdpCopyBox_a8r8g8b8_to_yuvalp(clientCon, x, y, src, src_stride, dst, dst_stride, rects, num_rects); @@ -975,7 +988,7 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, /* lazily only do this if hash wasn't identical */ if (rcode != rgnPART) { - rdpCopyBox_a8r8g8b8_to_yuvalp(x, y, + rdpCopyBox_a8r8g8b8_to_yuvalp(clientCon, x, y, src, src_stride, dst, dst_stride, &rect, 1); diff --git a/module/rdpCapture.h b/module/rdpCapture.h index 72a9336e..7b027eb1 100644 --- a/module/rdpCapture.h +++ b/module/rdpCapture.h @@ -48,5 +48,9 @@ a8r8g8b8_to_nv12_box(const uint8_t *s8, int src_stride, uint8_t *d8_y, int dst_stride_y, uint8_t *d8_uv, int dst_stride_uv, int width, int height); +extern _X_EXPORT int +a8r8g8b8_to_yuvalp_box(const uint8_t *s8, int src_stride, + uint8_t *d8, int dst_stride, + int width, int height); #endif diff --git a/module/rdpSimd.c b/module/rdpSimd.c index 49a3653e..05dd6a03 100644 --- a/module/rdpSimd.c +++ b/module/rdpSimd.c @@ -62,6 +62,90 @@ int g_simd_use_accel = 1; #define LLOGLN(_level, _args) \ do { if (_level < LOG_LEVEL) { ErrorF _args ; ErrorF("\n"); } } while (0) +#if SIMD_USE_ACCEL + +#if defined(__x86_64__) || defined(__AMD64__) || defined (_M_AMD64) +/*****************************************************************************/ +int +a8r8g8b8_to_yuvalp_box_amd64_sse2_wrap(const uint8_t *s8, int src_stride, + uint8_t *d8, int dst_stride, + int width, int height) +{ + int aligned_width; + int left_over_width; + int error; + + aligned_width = width & ~7; + left_over_width = width - aligned_width; + if (height > 0) + { + if (aligned_width > 0) + { + error = a8r8g8b8_to_yuvalp_box_amd64_sse2(s8, src_stride, + d8, dst_stride, + aligned_width, height); + if (error != 0) + { + return error; + } + } + if 
(left_over_width > 0) + { + error = a8r8g8b8_to_yuvalp_box(s8 + aligned_width * 4, src_stride, + d8 + aligned_width, dst_stride, + left_over_width, height); + if (error != 0) + { + return error; + } + } + } + return 0; +} +#endif + +#if defined(__x86__) || defined(_M_IX86) || defined(__i386__) +/*****************************************************************************/ +int +a8r8g8b8_to_yuvalp_box_x86_sse2_wrap(const uint8_t *s8, int src_stride, + uint8_t *d8, int dst_stride, + int width, int height) +{ + int aligned_width; + int left_over_width; + int error; + + aligned_width = width & ~7; + left_over_width = width - aligned_width; + if (height > 0) + { + if (aligned_width > 0) + { + error = a8r8g8b8_to_yuvalp_box_x86_sse2(s8, src_stride, + d8, dst_stride, + aligned_width, height); + if (error != 0) + { + return error; + } + } + if (left_over_width > 0) + { + error = a8r8g8b8_to_yuvalp_box(s8 + aligned_width * 4, src_stride, + d8 + aligned_width, dst_stride, + left_over_width, height); + if (error != 0) + { + return error; + } + } + } + return 0; +} +#endif + +#endif + /*****************************************************************************/ Bool rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn) @@ -77,6 +161,7 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn) dev->uyvy_to_rgb32 = UYVY_to_RGB32; dev->a8r8g8b8_to_a8b8g8r8_box = a8r8g8b8_to_a8b8g8r8_box; dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box; + dev->a8r8g8b8_to_yuvalp_box = a8r8g8b8_to_yuvalp_box; #if SIMD_USE_ACCEL if (g_simd_use_accel) { @@ -93,6 +178,7 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn) dev->uyvy_to_rgb32 = uyvy_to_rgb32_amd64_sse2; dev->a8r8g8b8_to_a8b8g8r8_box = a8r8g8b8_to_a8b8g8r8_box_amd64_sse2; dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box_amd64_sse2; + dev->a8r8g8b8_to_yuvalp_box = a8r8g8b8_to_yuvalp_box_amd64_sse2_wrap; LLOGLN(0, ("rdpSimdInit: sse2 amd64 yuv functions assigned")); } #elif defined(__x86__) || defined(_M_IX86) || defined(__i386__) @@ -108,6 
+194,7 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn) dev->uyvy_to_rgb32 = uyvy_to_rgb32_x86_sse2; dev->a8r8g8b8_to_a8b8g8r8_box = a8r8g8b8_to_a8b8g8r8_box_x86_sse2; dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box_x86_sse2; + dev->a8r8g8b8_to_yuvalp_box = a8r8g8b8_to_yuvalp_box_x86_sse2_wrap; LLOGLN(0, ("rdpSimdInit: sse2 x86 yuv functions assigned")); } #endif diff --git a/module/x86/Makefile.am b/module/x86/Makefile.am index ed106863..9539f8c0 100644 --- a/module/x86/Makefile.am +++ b/module/x86/Makefile.am @@ -3,6 +3,7 @@ NAFLAGS += -DASM_ARCH_I386 ASMSOURCES = \ a8r8g8b8_to_a8b8g8r8_box_x86_sse2.asm \ a8r8g8b8_to_nv12_box_x86_sse2.asm \ + a8r8g8b8_to_yuvalp_box_x86_sse2.asm \ cpuid_x86.asm \ i420_to_rgb32_x86_sse2.asm \ uyvy_to_rgb32_x86_sse2.asm \ diff --git a/module/x86/a8r8g8b8_to_yuvalp_box_x86_sse2.asm b/module/x86/a8r8g8b8_to_yuvalp_box_x86_sse2.asm new file mode 100644 index 00000000..cec02043 --- /dev/null +++ b/module/x86/a8r8g8b8_to_yuvalp_box_x86_sse2.asm @@ -0,0 +1,171 @@ +; +;Copyright 2024 Jay Sorg +; +;Permission to use, copy, modify, distribute, and sell this software and its +;documentation for any purpose is hereby granted without fee, provided that +;the above copyright notice appear in all copies and that both that +;copyright notice and this permission notice appear in supporting +;documentation. +; +;The above copyright notice and this permission notice shall be included in +;all copies or substantial portions of the Software. +; +;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN +;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+; +;ARGB to YUVALP +;x86 SSE2 +; +; notes +; address s8 should be aligned on 16 bytes, will be slower if not +; width must be multiple of 8 and > 0 +; height must be > 0 + +%include "common.asm" + +PREPARE_RODATA + cd255 times 4 dd 255 + cw128 times 8 dw 128 + cw77 times 8 dw 77 + cw150 times 8 dw 150 + cw29 times 8 dw 29 + cw43 times 8 dw 43 + cw85 times 8 dw 85 + cw107 times 8 dw 107 + cw21 times 8 dw 21 + +%define LS8 [esp + 20] ; s8 +%define LSRC_STRIDE [esp + 24] ; src_stride +%define LD8 [esp + 28] ; d8 +%define LDST_STRIDE [esp + 32] ; dst_stride +%define LWIDTH [esp + 36] ; width +%define LHEIGHT [esp + 40] ; height + +;int +;a8r8g8b8_to_yuvalp_box_x86_sse2(const uint8_t *s8, int src_stride, +; uint8_t *d8, int dst_stride, +; int width, int height); +PROC a8r8g8b8_to_yuvalp_box_x86_sse2 + push ebx + RETRIEVE_RODATA + push esi + push edi + push ebp + + pxor xmm7, xmm7 + + mov ebp, LHEIGHT ; ebp = height + +row_loop1: + mov esi, LS8 ; s8 + mov edi, LD8 ; d8 + + mov ecx, LWIDTH ; ecx = width + shr ecx, 3 ; doing 8 pixels at a time + +loop1: + movdqu xmm0, [esi] ; 4 pixels, 16 bytes + movdqa xmm1, xmm0 ; blue + pand xmm1, [lsym(cd255)] ; blue + movdqa xmm2, xmm0 ; green + psrld xmm2, 8 ; green + pand xmm2, [lsym(cd255)] ; green + movdqa xmm3, xmm0 ; red + psrld xmm3, 16 ; red + pand xmm3, [lsym(cd255)] ; red + movdqa xmm4, xmm0 ; alpha + psrld xmm4, 24 ; alpha + pand xmm4, [lsym(cd255)] ; alpha + + movdqu xmm0, [esi + 16] ; 4 pixels, 16 bytes + movdqa xmm5, xmm0 ; alpha + psrld xmm5, 24 ; alpha + pand xmm5, [lsym(cd255)] ; alpha + packssdw xmm4, xmm5 ; xmm4 = 8 alphas + packuswb xmm4, xmm7 + movq [edi + 3 * 64 * 64], xmm4 ; out 8 bytes aaaaaaaa + movdqa xmm4, xmm0 ; blue + pand xmm4, [lsym(cd255)] ; blue + movdqa xmm5, xmm0 ; green + psrld xmm5, 8 ; green + pand xmm5, [lsym(cd255)] ; green + movdqa xmm6, xmm0 ; red + psrld xmm6, 16 ; red + pand xmm6, [lsym(cd255)] ; red + + packssdw xmm1, xmm4 ; xmm1 = 8 blues + packssdw xmm2, xmm5 ; xmm2 = 8 greens + packssdw 
xmm3, xmm6 ; xmm3 = 8 reds + + ; _Y = (77 * _R + 150 * _G + 29 * _B) >> 8; + movdqa xmm4, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm6, xmm3 ; red + pmullw xmm4, [lsym(cw29)] + pmullw xmm5, [lsym(cw150)] + pmullw xmm6, [lsym(cw77)] + paddw xmm4, xmm5 + paddw xmm4, xmm6 + psrlw xmm4, 8 + packuswb xmm4, xmm7 + movq [edi], xmm4 ; out 8 bytes yyyyyyyy + + ; _U = ((-43 * _R - 85 * _G + 128 * _B) >> 8) + 128; + movdqa xmm4, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm6, xmm3 ; red + pmullw xmm4, [lsym(cw128)] + pmullw xmm5, [lsym(cw85)] + pmullw xmm6, [lsym(cw43)] + psubw xmm4, xmm5 + psubw xmm4, xmm6 + psraw xmm4, 8 + paddw xmm4, [lsym(cw128)] + packuswb xmm4, xmm7 + movq [edi + 1 * 64 * 64], xmm4 ; out 8 bytes uuuuuuuu + + ; _V = ((128 * _R - 107 * _G - 21 * _B) >> 8) + 128; + movdqa xmm6, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm4, xmm3 ; red + pmullw xmm4, [lsym(cw128)] + pmullw xmm5, [lsym(cw107)] + pmullw xmm6, [lsym(cw21)] + psubw xmm4, xmm5 + psubw xmm4, xmm6 + psraw xmm4, 8 + paddw xmm4, [lsym(cw128)] + packuswb xmm4, xmm7 + movq [edi + 2 * 64 * 64], xmm4 ; out 8 bytes vvvvvvvv + + ; move right + lea esi, [esi + 32] + lea edi, [edi + 8] + + dec ecx + jnz loop1 + + ; update s8 + mov eax, LS8 ; s8 + add eax, LSRC_STRIDE ; s8 += src_stride + mov LS8, eax + + ; update d8 + mov eax, LD8 ; d8 + add eax, LDST_STRIDE ; d8 += dst_stride + mov LD8, eax + + dec ebp + jnz row_loop1 + + mov eax, 0 ; return value + pop ebp + pop edi + pop esi + pop ebx + ret +END_OF_FILE diff --git a/module/x86/funcs_x86.h b/module/x86/funcs_x86.h index c70cc8cf..d1f3357d 100644 --- a/module/x86/funcs_x86.h +++ b/module/x86/funcs_x86.h @@ -43,6 +43,10 @@ a8r8g8b8_to_nv12_box_x86_sse2(const uint8_t *s8, int src_stride, uint8_t *d8_y, int dst_stride_y, uint8_t *d8_uv, int dst_stride_uv, int width, int height); +int +a8r8g8b8_to_yuvalp_box_x86_sse2(const uint8_t *s8, int src_stride, + uint8_t *d8, int dst_stride, + int width, int height); #endif From 
bd2fa880804de3e64a8c853a3bf96414b6d56458 Mon Sep 17 00:00:00 2001 From: Jay Sorg Date: Tue, 14 May 2024 00:52:03 -0700 Subject: [PATCH 2/8] x264 working (cherry picked from commit b80d38562ca7885a52c2f653049e7ae1afbd7295) --- module/amd64/Makefile.am | 1 + .../a8r8g8b8_to_nv12_709fr_box_amd64_sse2.asm | 304 ++++++++++++++++++ module/amd64/funcs_amd64.h | 5 + module/rdp.h | 1 + module/rdpCapture.c | 157 ++++++++- module/rdpCapture.h | 5 + module/rdpClientCon.c | 2 + module/rdpSimd.c | 178 +++++++++- module/x86/Makefile.am | 1 + .../a8r8g8b8_to_nv12_709fr_box_x86_sse2.asm | 300 +++++++++++++++++ module/x86/funcs_x86.h | 5 + 11 files changed, 952 insertions(+), 7 deletions(-) create mode 100644 module/amd64/a8r8g8b8_to_nv12_709fr_box_amd64_sse2.asm create mode 100644 module/x86/a8r8g8b8_to_nv12_709fr_box_x86_sse2.asm diff --git a/module/amd64/Makefile.am b/module/amd64/Makefile.am index be4d928b..ed2d7c63 100644 --- a/module/amd64/Makefile.am +++ b/module/amd64/Makefile.am @@ -3,6 +3,7 @@ NAFLAGS += -DASM_ARCH_AMD64 ASMSOURCES = \ a8r8g8b8_to_a8b8g8r8_box_amd64_sse2.asm \ a8r8g8b8_to_nv12_box_amd64_sse2.asm \ + a8r8g8b8_to_nv12_709fr_box_amd64_sse2.asm \ a8r8g8b8_to_yuvalp_box_amd64_sse2.asm \ cpuid_amd64.asm \ i420_to_rgb32_amd64_sse2.asm \ diff --git a/module/amd64/a8r8g8b8_to_nv12_709fr_box_amd64_sse2.asm b/module/amd64/a8r8g8b8_to_nv12_709fr_box_amd64_sse2.asm new file mode 100644 index 00000000..c18e9d6a --- /dev/null +++ b/module/amd64/a8r8g8b8_to_nv12_709fr_box_amd64_sse2.asm @@ -0,0 +1,304 @@ +; +;Copyright 2015 Jay Sorg +; +;Permission to use, copy, modify, distribute, and sell this software and its +;documentation for any purpose is hereby granted without fee, provided that +;the above copyright notice appear in all copies and that both that +;copyright notice and this permission notice appear in supporting +;documentation. +; +;The above copyright notice and this permission notice shall be included in +;all copies or substantial portions of the Software. 
+; +;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN +;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +; +;ARGB to NV12 709 full range +;amd64 SSE2 +; +; notes +; address s8 should be aligned on 16 bytes, will be slower if not +; width should be multiple of 8 and > 0 +; height should be even and > 0 + +%include "common.asm" + +PREPARE_RODATA + cd255 times 4 dd 255 + + cw255 times 8 dw 255 + cw128 times 8 dw 128 + cw54 times 8 dw 54 + cw183 times 8 dw 183 + cw18 times 8 dw 18 + cw29 times 8 dw 29 + cw99 times 8 dw 99 + cw116 times 8 dw 116 + cw12 times 8 dw 12 + cw2 times 8 dw 2 + +%define LS8 [rsp + 0] ; s8 +%define LSRC_STRIDE [rsp + 8] ; src_stride +%define LD8_Y [rsp + 16] ; d8_y +%define LDST_Y_STRIDE [rsp + 24] ; dst_stride_y +%define LD8_UV [rsp + 32] ; d8_uv +%define LDST_UV_STRIDE [rsp + 40] ; dst_stride_uv +%define LU1 [rsp + 48] ; first line U, 8 bytes +%define LV1 [rsp + 56] ; first line V, 8 bytes +%define LU2 [rsp + 64] ; second line U, 8 bytes +%define LV2 [rsp + 72] ; second line V, 8 bytes + +%define LWIDTH [rsp + 104] ; width +%define LHEIGHT [rsp + 112] ; height + +;The first six integer or pointer arguments are passed in registers +; RDI, RSI, RDX, RCX, R8, and R9 + +;int +;a8r8g8b8_to_nv12_709fr_box_amd64_sse2(const char *s8, int src_stride, +; char *d8_y, int dst_stride_y, +; char *d8_uv, int dst_stride_uv, +; int width, int height); +PROC a8r8g8b8_to_nv12_709fr_box_amd64_sse2 + push rbx + push rbp + sub rsp, 80 ; local vars, 80 bytes + + mov LS8, rdi ; s8 + mov LSRC_STRIDE, rsi ; src_stride + mov LD8_Y, rdx ; d8_y + mov LDST_Y_STRIDE, rcx ; dst_stride_y + mov LD8_UV, r8 ; d8_uv + mov 
LDST_UV_STRIDE, r9 ; dst_stride_uv + + pxor xmm7, xmm7 + + mov ebx, LHEIGHT ; ebx = height + shr ebx, 1 ; doing 2 lines at a time + +row_loop1: + mov rsi, LS8 ; s8 + mov rdi, LD8_Y ; d8_y + mov rdx, LD8_UV ; d8_uv + + mov ecx, LWIDTH ; ecx = width + shr ecx, 3 ; doing 8 pixels at a time + +loop1: + ; first line + movdqu xmm0, [rsi] ; 4 pixels, 16 bytes + movdqa xmm1, xmm0 ; blue + pand xmm1, [lsym(cd255)] ; blue + movdqa xmm2, xmm0 ; green + psrld xmm2, 8 ; green + pand xmm2, [lsym(cd255)] ; green + movdqa xmm3, xmm0 ; red + psrld xmm3, 16 ; red + pand xmm3, [lsym(cd255)] ; red + + movdqu xmm0, [rsi + 16] ; 4 pixels, 16 bytes + movdqa xmm4, xmm0 ; blue + pand xmm4, [lsym(cd255)] ; blue + movdqa xmm5, xmm0 ; green + psrld xmm5, 8 ; green + pand xmm5, [lsym(cd255)] ; green + movdqa xmm6, xmm0 ; red + psrld xmm6, 16 ; red + pand xmm6, [lsym(cd255)] ; red + + packssdw xmm1, xmm4 ; xmm1 = 8 blues + packssdw xmm2, xmm5 ; xmm2 = 8 greens + packssdw xmm3, xmm6 ; xmm3 = 8 reds + + ; _Y = (( 54 * _R + 183 * _G + 18 * _B) >> 8); + movdqa xmm4, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm6, xmm3 ; red + pmullw xmm4, [lsym(cw18)] + pmullw xmm5, [lsym(cw183)] + pmullw xmm6, [lsym(cw54)] + paddw xmm4, xmm5 + paddw xmm4, xmm6 + psrlw xmm4, 8 + packuswb xmm4, xmm7 + movq [rdi], xmm4 ; out 8 bytes yyyyyyyy + + ; _U = ((-29 * _R - 99 * _G + 128 * _B) >> 8) + 128; + movdqa xmm4, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm6, xmm3 ; red + pmullw xmm4, [lsym(cw128)] + pmullw xmm5, [lsym(cw99)] + pmullw xmm6, [lsym(cw29)] + psubw xmm4, xmm5 + psubw xmm4, xmm6 + psraw xmm4, 8 + paddw xmm4, [lsym(cw128)] + packuswb xmm4, xmm7 + movq LU1, xmm4 ; save for later + + ; _V = ((128 * _R - 116 * _G - 12 * _B) >> 8) + 128; + movdqa xmm6, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm4, xmm3 ; red + pmullw xmm4, [lsym(cw128)] + pmullw xmm5, [lsym(cw116)] + pmullw xmm6, [lsym(cw12)] + psubw xmm4, xmm5 + psubw xmm4, xmm6 + psraw xmm4, 8 + paddw xmm4, [lsym(cw128)] + packuswb 
xmm4, xmm7 + movq LV1, xmm4 ; save for later + + ; go down to second line + add rsi, LSRC_STRIDE + add rdi, LDST_Y_STRIDE + + ; second line + movdqu xmm0, [rsi] ; 4 pixels, 16 bytes + movdqa xmm1, xmm0 ; blue + pand xmm1, [lsym(cd255)] ; blue + movdqa xmm2, xmm0 ; green + psrld xmm2, 8 ; green + pand xmm2, [lsym(cd255)] ; green + movdqa xmm3, xmm0 ; red + psrld xmm3, 16 ; red + pand xmm3, [lsym(cd255)] ; red + + movdqu xmm0, [rsi + 16] ; 4 pixels, 16 bytes + movdqa xmm4, xmm0 ; blue + pand xmm4, [lsym(cd255)] ; blue + movdqa xmm5, xmm0 ; green + psrld xmm5, 8 ; green + pand xmm5, [lsym(cd255)] ; green + movdqa xmm6, xmm0 ; red + psrld xmm6, 16 ; red + pand xmm6, [lsym(cd255)] ; red + + packssdw xmm1, xmm4 ; xmm1 = 8 blues + packssdw xmm2, xmm5 ; xmm2 = 8 greens + packssdw xmm3, xmm6 ; xmm3 = 8 reds + + ; _Y = (( 54 * _R + 183 * _G + 18 * _B) >> 8); + movdqa xmm4, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm6, xmm3 ; red + pmullw xmm4, [lsym(cw18)] + pmullw xmm5, [lsym(cw183)] + pmullw xmm6, [lsym(cw54)] + paddw xmm4, xmm5 + paddw xmm4, xmm6 + psrlw xmm4, 8 + packuswb xmm4, xmm7 + movq [rdi], xmm4 ; out 8 bytes yyyyyyyy + + ; _U = ((-29 * _R - 99 * _G + 128 * _B) >> 8) + 128; + movdqa xmm4, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm6, xmm3 ; red + pmullw xmm4, [lsym(cw128)] + pmullw xmm5, [lsym(cw99)] + pmullw xmm6, [lsym(cw29)] + psubw xmm4, xmm5 + psubw xmm4, xmm6 + psraw xmm4, 8 + paddw xmm4, [lsym(cw128)] + packuswb xmm4, xmm7 + movq LU2, xmm4 ; save for later + + ; _V = ((128 * _R - 116 * _G - 12 * _B) >> 8) + 128; + movdqa xmm6, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm4, xmm3 ; red + pmullw xmm4, [lsym(cw128)] + pmullw xmm5, [lsym(cw116)] + pmullw xmm6, [lsym(cw12)] + psubw xmm4, xmm5 + psubw xmm4, xmm6 + psraw xmm4, 8 + paddw xmm4, [lsym(cw128)] + packuswb xmm4, xmm7 + movq LV2, xmm4 ; save for later + + ; uv add and divide(average) + movq mm1, LU1 ; u from first line + movq mm3, mm1 + pand mm1, [lsym(cw255)] + psrlw mm3, 8 + 
pand mm3, [lsym(cw255)] + paddw mm1, mm3 ; add + movq mm2, LU2 ; u from second line + movq mm3, mm2 + pand mm2, [lsym(cw255)] + paddw mm1, mm2 ; add + psrlw mm3, 8 + pand mm3, [lsym(cw255)] + paddw mm1, mm3 ; add + paddw mm1, [lsym(cw2)] ; add 2 + psrlw mm1, 2 ; div 4 + + movq mm2, LV1 ; v from first line + movq mm4, mm2 + pand mm2, [lsym(cw255)] + psrlw mm4, 8 + pand mm4, [lsym(cw255)] + paddw mm2, mm4 ; add + movq mm3, LV2 ; v from second line + movq mm4, mm3 + pand mm3, [lsym(cw255)] + paddw mm2, mm3 ; add + psrlw mm4, 8 + pand mm4, [lsym(cw255)] + paddw mm2, mm4 ; add + paddw mm2, [lsym(cw2)] ; add 2 + psrlw mm2, 2 ; div 4 + + packuswb mm1, mm1 + packuswb mm2, mm2 + + punpcklbw mm1, mm2 ; uv + movq [rdx], mm1 ; out 8 bytes uvuvuvuv + + ; go up to first line + sub rsi, LSRC_STRIDE + sub rdi, LDST_Y_STRIDE + + ; move right + lea rsi, [rsi + 32] + lea rdi, [rdi + 8] + lea rdx, [rdx + 8] + + dec ecx + jnz loop1 + + ; update s8 + mov rax, LS8 ; s8 + add rax, LSRC_STRIDE ; s8 += src_stride + add rax, LSRC_STRIDE ; s8 += src_stride + mov LS8, rax + + ; update d8_y + mov rax, LD8_Y ; d8_y + add rax, LDST_Y_STRIDE ; d8_y += dst_stride_y + add rax, LDST_Y_STRIDE ; d8_y += dst_stride_y + mov LD8_Y, rax + + ; update d8_uv + mov rax, LD8_UV ; d8_uv + add rax, LDST_UV_STRIDE ; d8_uv += dst_stride_uv + mov LD8_UV, rax + + dec ebx + jnz row_loop1 + + mov rax, 0 ; return value + add rsp, 80 ; local vars, 80 bytes + pop rbp + pop rbx + ret +END_OF_FILE diff --git a/module/amd64/funcs_amd64.h b/module/amd64/funcs_amd64.h index 3b54e2b2..9d746fdc 100644 --- a/module/amd64/funcs_amd64.h +++ b/module/amd64/funcs_amd64.h @@ -44,6 +44,11 @@ a8r8g8b8_to_nv12_box_amd64_sse2(const uint8_t *s8, int src_stride, uint8_t *d8_uv, int dst_stride_uv, int width, int height); int +a8r8g8b8_to_nv12_709fr_box_amd64_sse2(const uint8_t *s8, int src_stride, + uint8_t *d8_y, int dst_stride_y, + uint8_t *d8_uv, int dst_stride_uv, + int width, int height); +int a8r8g8b8_to_yuvalp_box_amd64_sse2(const 
uint8_t *s8, int src_stride, uint8_t *d8, int dst_stride, int width, int height); diff --git a/module/rdp.h b/module/rdp.h index f9e4ac61..9122bc92 100644 --- a/module/rdp.h +++ b/module/rdp.h @@ -297,6 +297,7 @@ struct _rdpRec copy_box_proc a8r8g8b8_to_a8b8g8r8_box; copy_box_dst2_proc a8r8g8b8_to_nv12_box; + copy_box_dst2_proc a8r8g8b8_to_nv12_709fr_box; copy_box_proc a8r8g8b8_to_yuvalp_box; /* multimon */ diff --git a/module/rdpCapture.c b/module/rdpCapture.c index 893917ce..f8b52db5 100644 --- a/module/rdpCapture.c +++ b/module/rdpCapture.c @@ -553,6 +553,103 @@ a8r8g8b8_to_nv12_box(const uint8_t *s8, int src_stride, return 0; } +/******************************************************************************/ +int +a8r8g8b8_to_nv12_709fr_box(const uint8_t *s8, int src_stride, + uint8_t *d8_y, int dst_stride_y, + uint8_t *d8_uv, int dst_stride_uv, + int width, int height) +{ + int index; + int jndex; + int R; + int G; + int B; + int Y; + int U; + int V; + int U_sum; + int V_sum; + int pixel; + const uint32_t *s32a; + const uint32_t *s32b; + uint8_t *d8ya; + uint8_t *d8yb; + uint8_t *d8uv; + + for (jndex = 0; jndex < height; jndex += 2) + { + s32a = (const uint32_t *) (s8 + src_stride * jndex); + s32b = (const uint32_t *) (s8 + src_stride * (jndex + 1)); + d8ya = d8_y + dst_stride_y * jndex; + d8yb = d8_y + dst_stride_y * (jndex + 1); + d8uv = d8_uv + dst_stride_uv * (jndex / 2); + for (index = 0; index < width; index += 2) + { + U_sum = 0; + V_sum = 0; + + pixel = s32a[0]; + s32a++; + R = (pixel >> 16) & 0xff; + G = (pixel >> 8) & 0xff; + B = (pixel >> 0) & 0xff; + Y = ( 54 * R + 183 * G + 18 * B) >> 8; + U = ((-29 * R - 99 * G + 128 * B) >> 8) + 128; + V = ((128 * R - 116 * G - 12 * B) >> 8) + 128; + d8ya[0] = RDPCLAMP(Y, 0, 255); + d8ya++; + U_sum += RDPCLAMP(U, 0, 255); + V_sum += RDPCLAMP(V, 0, 255); + + pixel = s32a[0]; + s32a++; + R = (pixel >> 16) & 0xff; + G = (pixel >> 8) & 0xff; + B = (pixel >> 0) & 0xff; + Y = ( 54 * R + 183 * G + 18 * B) >> 8; + U = 
((-29 * R - 99 * G + 128 * B) >> 8) + 128; + V = ((128 * R - 116 * G - 12 * B) >> 8) + 128; + d8ya[0] = RDPCLAMP(Y, 0, 255); + d8ya++; + U_sum += RDPCLAMP(U, 0, 255); + V_sum += RDPCLAMP(V, 0, 255); + + pixel = s32b[0]; + s32b++; + R = (pixel >> 16) & 0xff; + G = (pixel >> 8) & 0xff; + B = (pixel >> 0) & 0xff; + Y = ( 54 * R + 183 * G + 18 * B) >> 8; + U = ((-29 * R - 99 * G + 128 * B) >> 8) + 128; + V = ((128 * R - 116 * G - 12 * B) >> 8) + 128; + d8yb[0] = RDPCLAMP(Y, 0, 255); + d8yb++; + U_sum += RDPCLAMP(U, 0, 255); + V_sum += RDPCLAMP(V, 0, 255); + + pixel = s32b[0]; + s32b++; + R = (pixel >> 16) & 0xff; + G = (pixel >> 8) & 0xff; + B = (pixel >> 0) & 0xff; + Y = ( 54 * R + 183 * G + 18 * B) >> 8; + U = ((-29 * R - 99 * G + 128 * B) >> 8) + 128; + V = ((128 * R - 116 * G - 12 * B) >> 8) + 128; + d8yb[0] = RDPCLAMP(Y, 0, 255); + d8yb++; + U_sum += RDPCLAMP(U, 0, 255); + V_sum += RDPCLAMP(V, 0, 255); + + d8uv[0] = (U_sum + 2) / 4; + d8uv++; + d8uv[0] = (V_sum + 2) / 4; + d8uv++; + } + } + return 0; +} + /******************************************************************************/ /* copy rects with no error checking */ static int @@ -590,6 +687,44 @@ rdpCopyBox_a8r8g8b8_to_nv12(rdpClientCon *clientCon, return 0; } +/******************************************************************************/ +/* copy rects with no error checking */ +static int +rdpCopyBox_a8r8g8b8_to_nv12_709fr(rdpClientCon *clientCon, + const uint8_t *src, int src_stride, + int srcx, int srcy, + uint8_t *dst_y, int dst_stride_y, + uint8_t *dst_uv, int dst_stride_uv, + int dstx, int dsty, + BoxPtr rects, int num_rects) +{ + const uint8_t *s8; + uint8_t *d8_y; + uint8_t *d8_uv; + int index; + int width; + int height; + BoxPtr box; + + for (index = 0; index < num_rects; index++) + { + box = rects + index; + s8 = src + (box->y1 - srcy) * src_stride; + s8 += (box->x1 - srcx) * 4; + d8_y = dst_y + (box->y1 - dsty) * dst_stride_y; + d8_y += (box->x1 - dstx) * 1; + d8_uv = dst_uv + ((box->y1 - 
dsty) / 2) * dst_stride_uv; + d8_uv += (box->x1 - dstx) * 1; + width = box->x2 - box->x1; + height = box->y2 - box->y1; + clientCon->dev->a8r8g8b8_to_nv12_709fr_box(s8, src_stride, + d8_y, dst_stride_y, + d8_uv, dst_stride_uv, + width, height); + } + return 0; +} + /******************************************************************************/ static Bool isShmStatusActive(enum shared_memory_status status) { @@ -1081,6 +1216,17 @@ rdpCapture3(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, dst, dst_stride, 0, 0, *out_rects, num_rects); } + else if (dst_format == XRDP_nv12_709fr) + { + dst_uv = dst; + dst_uv += clientCon->cap_width * clientCon->cap_height; + rdpCopyBox_a8r8g8b8_to_nv12_709fr(clientCon, + src, src_stride, 0, 0, + dst, dst_stride, + dst_uv, dst_stride, + 0, 0, + *out_rects, num_rects); + } else if (dst_format == XRDP_nv12) { dst_uv = dst; @@ -1187,12 +1333,13 @@ rdpCapture(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, return rdpCapture0(clientCon, in_reg, out_rects, num_out_rects, id); case 1: return rdpCapture1(clientCon, in_reg, out_rects, num_out_rects, id); - case 2: - case 4: - /* used for remotefx capture */ + case 2: /* surface command RFX */ + /* FALLTHROUGH */ + case 4: /* GFX progressive */ return rdpCapture2(clientCon, in_reg, out_rects, num_out_rects, id); - case 3: - case 5: + case 3: /* surface command h264 */ + /* FALLTHROUGH */ + case 5: /* GFX h264 */ /* used for even align capture */ return rdpCapture3(clientCon, in_reg, out_rects, num_out_rects, id); default: diff --git a/module/rdpCapture.h b/module/rdpCapture.h index 7b027eb1..7e38508e 100644 --- a/module/rdpCapture.h +++ b/module/rdpCapture.h @@ -49,6 +49,11 @@ a8r8g8b8_to_nv12_box(const uint8_t *s8, int src_stride, uint8_t *d8_uv, int dst_stride_uv, int width, int height); extern _X_EXPORT int +a8r8g8b8_to_nv12_709fr_box(const uint8_t *s8, int src_stride, + uint8_t *d8_y, int dst_stride_y, + uint8_t *d8_uv, int dst_stride_uv, + int width, int 
height); +extern _X_EXPORT int a8r8g8b8_to_yuvalp_box(const uint8_t *s8, int src_stride, uint8_t *d8, int dst_stride, int width, int height); diff --git a/module/rdpClientCon.c b/module/rdpClientCon.c index a3366fa5..29c7c4c0 100644 --- a/module/rdpClientCon.c +++ b/module/rdpClientCon.c @@ -2625,6 +2625,8 @@ rdpClientConSendPaintRectShmFd(rdpPtr dev, rdpClientCon *clientCon, id->flags, id->left, id->top, id->width, id->height)); capture_code = clientCon->client_info.capture_code; + LLOGLN(10, ("rdpClientConSendPaintRectShmFd: capture_code %d", + capture_code)); num_rects_d = REGION_NUM_RECTS(dirtyReg); num_rects_c = numCopyRects; diff --git a/module/rdpSimd.c b/module/rdpSimd.c index 05dd6a03..9af4ecf9 100644 --- a/module/rdpSimd.c +++ b/module/rdpSimd.c @@ -65,6 +65,92 @@ int g_simd_use_accel = 1; #if SIMD_USE_ACCEL #if defined(__x86_64__) || defined(__AMD64__) || defined (_M_AMD64) +/******************************************************************************/ +static int +a8r8g8b8_to_nv12_box_amd64_sse2_wrap(const uint8_t *s8, int src_stride, + uint8_t *d8_y, int dst_stride_y, + uint8_t *d8_uv, int dst_stride_uv, + int width, int height) +{ + int aligned_width; + int left_over_width; + int error; + + aligned_width = width & ~7; + left_over_width = width - aligned_width; + if (height > 0) + { + if (aligned_width > 0) + { + error = a8r8g8b8_to_nv12_box_amd64_sse2(s8, src_stride, + d8_y, dst_stride_y, + d8_uv, dst_stride_uv, + aligned_width, height); + if (error != 0) + { + return error; + } + } + if (left_over_width > 0) + { + error = a8r8g8b8_to_nv12_box(s8 + aligned_width * 4, src_stride, + d8_y + aligned_width, dst_stride_y, + d8_uv + aligned_width, dst_stride_uv, + left_over_width, height); + if (error != 0) + { + return error; + } + } + } + return 0; +} + +/******************************************************************************/ +static int +a8r8g8b8_to_nv12_709fr_box_amd64_sse2_wrap(const uint8_t *s8, int src_stride, + uint8_t *d8_y, int 
dst_stride_y, + uint8_t *d8_uv, int dst_stride_uv, + int width, int height) +{ + int aligned_width; + int left_over_width; + int error; + + aligned_width = width & ~7; + left_over_width = width - aligned_width; + if (height > 0) + { + if (aligned_width > 0) + { + error = a8r8g8b8_to_nv12_709fr_box_amd64_sse2(s8, src_stride, + d8_y, dst_stride_y, + d8_uv, dst_stride_uv, + aligned_width, + height); + if (error != 0) + { + return error; + } + } + if (left_over_width > 0) + { + error = a8r8g8b8_to_nv12_709fr_box(s8 + aligned_width * 4, + src_stride, + d8_y + aligned_width, + dst_stride_y, + d8_uv + aligned_width, + dst_stride_uv, + left_over_width, height); + if (error != 0) + { + return error; + } + } + } + return 0; +} + /*****************************************************************************/ int a8r8g8b8_to_yuvalp_box_amd64_sse2_wrap(const uint8_t *s8, int src_stride, @@ -105,6 +191,91 @@ a8r8g8b8_to_yuvalp_box_amd64_sse2_wrap(const uint8_t *s8, int src_stride, #endif #if defined(__x86__) || defined(_M_IX86) || defined(__i386__) +/******************************************************************************/ +static int +a8r8g8b8_to_nv12_box_x86_sse2_wrap(const uint8_t *s8, int src_stride, + uint8_t *d8_y, int dst_stride_y, + uint8_t *d8_uv, int dst_stride_uv, + int width, int height) +{ + int aligned_width; + int left_over_width; + int error; + + aligned_width = width & ~7; + left_over_width = width - aligned_width; + if (height > 0) + { + if (aligned_width > 0) + { + error = a8r8g8b8_to_nv12_box_x86_sse2(s8, src_stride, + d8_y, dst_stride_y, + d8_uv, dst_stride_uv, + aligned_width, height); + if (error != 0) + { + return error; + } + } + if (left_over_width > 0) + { + error = a8r8g8b8_to_nv12_box(s8 + aligned_width * 4, src_stride, + d8_y + aligned_width, dst_stride_y, + d8_uv + aligned_width, dst_stride_uv, + left_over_width, height); + if (error != 0) + { + return error; + } + } + } + return 0; +} + 
+/******************************************************************************/ +static int +a8r8g8b8_to_nv12_709fr_box_x86_sse2_wrap(const uint8_t *s8, int src_stride, + uint8_t *d8_y, int dst_stride_y, + uint8_t *d8_uv, int dst_stride_uv, + int width, int height) +{ + int aligned_width; + int left_over_width; + int error; + + aligned_width = width & ~7; + left_over_width = width - aligned_width; + if (height > 0) + { + if (aligned_width > 0) + { + error = a8r8g8b8_to_nv12_709fr_box_x86_sse2(s8, src_stride, + d8_y, dst_stride_y, + d8_uv, dst_stride_uv, + aligned_width, height); + if (error != 0) + { + return error; + } + } + if (left_over_width > 0) + { + error = a8r8g8b8_to_nv12_709fr_box(s8 + awidaligned_widthth * 4, + src_stride, + d8_y + aligned_width, + dst_stride_y, + d8_uv + aligned_width, + dst_stride_uv, + left_over_width, height); + if (error != 0) + { + return error; + } + } + } + return 0; +} + /*****************************************************************************/ int a8r8g8b8_to_yuvalp_box_x86_sse2_wrap(const uint8_t *s8, int src_stride, @@ -161,6 +332,7 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn) dev->uyvy_to_rgb32 = UYVY_to_RGB32; dev->a8r8g8b8_to_a8b8g8r8_box = a8r8g8b8_to_a8b8g8r8_box; dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box; + dev->a8r8g8b8_to_nv12_709fr_box = a8r8g8b8_to_nv12_709fr_box; dev->a8r8g8b8_to_yuvalp_box = a8r8g8b8_to_yuvalp_box; #if SIMD_USE_ACCEL if (g_simd_use_accel) @@ -177,7 +349,8 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn) dev->yuy2_to_rgb32 = yuy2_to_rgb32_amd64_sse2; dev->uyvy_to_rgb32 = uyvy_to_rgb32_amd64_sse2; dev->a8r8g8b8_to_a8b8g8r8_box = a8r8g8b8_to_a8b8g8r8_box_amd64_sse2; - dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box_amd64_sse2; + dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box_amd64_sse2_wrap; + dev->a8r8g8b8_to_nv12_709fr_box = a8r8g8b8_to_nv12_709fr_box_amd64_sse2_wrap; dev->a8r8g8b8_to_yuvalp_box = a8r8g8b8_to_yuvalp_box_amd64_sse2_wrap; LLOGLN(0, ("rdpSimdInit: sse2 
amd64 yuv functions assigned")); } @@ -193,7 +366,8 @@ rdpSimdInit(ScreenPtr pScreen, ScrnInfoPtr pScrn) dev->yuy2_to_rgb32 = yuy2_to_rgb32_x86_sse2; dev->uyvy_to_rgb32 = uyvy_to_rgb32_x86_sse2; dev->a8r8g8b8_to_a8b8g8r8_box = a8r8g8b8_to_a8b8g8r8_box_x86_sse2; - dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box_x86_sse2; + dev->a8r8g8b8_to_nv12_box = a8r8g8b8_to_nv12_box_x86_sse2_wrap; + dev->a8r8g8b8_to_nv12_709fr_box = a8r8g8b8_to_nv12_709fr_box_x86_sse2_wrap; dev->a8r8g8b8_to_yuvalp_box = a8r8g8b8_to_yuvalp_box_x86_sse2_wrap; LLOGLN(0, ("rdpSimdInit: sse2 x86 yuv functions assigned")); } diff --git a/module/x86/Makefile.am b/module/x86/Makefile.am index 9539f8c0..92acda61 100644 --- a/module/x86/Makefile.am +++ b/module/x86/Makefile.am @@ -3,6 +3,7 @@ NAFLAGS += -DASM_ARCH_I386 ASMSOURCES = \ a8r8g8b8_to_a8b8g8r8_box_x86_sse2.asm \ a8r8g8b8_to_nv12_box_x86_sse2.asm \ + a8r8g8b8_to_nv12_709fr_box_x86_sse2.asm \ a8r8g8b8_to_yuvalp_box_x86_sse2.asm \ cpuid_x86.asm \ i420_to_rgb32_x86_sse2.asm \ diff --git a/module/x86/a8r8g8b8_to_nv12_709fr_box_x86_sse2.asm b/module/x86/a8r8g8b8_to_nv12_709fr_box_x86_sse2.asm new file mode 100644 index 00000000..262f1af3 --- /dev/null +++ b/module/x86/a8r8g8b8_to_nv12_709fr_box_x86_sse2.asm @@ -0,0 +1,300 @@ +; +;Copyright 2015 Jay Sorg +;Copyright 2017 mirabilos +; +;Permission to use, copy, modify, distribute, and sell this software and its +;documentation for any purpose is hereby granted without fee, provided that +;the above copyright notice appear in all copies and that both that +;copyright notice and this permission notice appear in supporting +;documentation. +; +;The above copyright notice and this permission notice shall be included in +;all copies or substantial portions of the Software. +; +;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN +;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +; +;ARGB to NV12 709 full range +;x86 SSE2 +; +; notes +; address s8 should be aligned on 16 bytes, will be slower if not +; width should be multiple of 8 and > 0 +; height should be even and > 0 + +%include "common.asm" + +PREPARE_RODATA + cd255 times 4 dd 255 + + cw255 times 8 dw 255 + cw128 times 8 dw 128 + cw54 times 8 dw 54 + cw183 times 8 dw 183 + cw18 times 8 dw 18 + cw29 times 8 dw 29 + cw99 times 8 dw 99 + cw116 times 8 dw 116 + cw12 times 8 dw 12 + cw2 times 8 dw 2 + +%define LU1 [esp + 0] ; first line U, 8 bytes +%define LV1 [esp + 8] ; first line V, 8 bytes +%define LU2 [esp + 16] ; second line U, 8 bytes +%define LV2 [esp + 24] ; second line V, 8 bytes + +%define LS8 [esp + 52] ; s8 +%define LSRC_STRIDE [esp + 56] ; src_stride +%define LD8_Y [esp + 60] ; d8_y +%define LDST_Y_STRIDE [esp + 64] ; dst_stride_y +%define LD8_UV [esp + 68] ; d8_uv +%define LDST_UV_STRIDE [esp + 72] ; dst_stride_uv +%define LWIDTH [esp + 76] ; width +%define LHEIGHT [esp + 80] ; height + +;int +;a8r8g8b8_to_nv12_709fr_box_x86_sse2(const char *s8, int src_stride, +; char *d8_y, int dst_stride_y, +; char *d8_uv, int dst_stride_uv, +; int width, int height); +PROC a8r8g8b8_to_nv12_709fr_box_x86_sse2 + push ebx + RETRIEVE_RODATA + push esi + push edi + push ebp + sub esp, 32 ; local vars, 32 bytes + + pxor xmm7, xmm7 + + mov ebp, LHEIGHT ; ebp = height + shr ebp, 1 ; doing 2 lines at a time + +row_loop1: + mov esi, LS8 ; s8 + mov edi, LD8_Y ; d8_y + mov edx, LD8_UV ; d8_uv + + mov ecx, LWIDTH ; ecx = width + shr ecx, 3 ; doing 8 pixels at a time + +loop1: + ; first line + movdqu xmm0, [esi] ; 4 pixels, 16 bytes + movdqa xmm1, xmm0 ; blue + pand xmm1, [lsym(cd255)] ; blue + movdqa xmm2, xmm0 ; green + psrld xmm2, 8 ; green + pand xmm2, 
[lsym(cd255)] ; green + movdqa xmm3, xmm0 ; red + psrld xmm3, 16 ; red + pand xmm3, [lsym(cd255)] ; red + + movdqu xmm0, [esi + 16] ; 4 pixels, 16 bytes + movdqa xmm4, xmm0 ; blue + pand xmm4, [lsym(cd255)] ; blue + movdqa xmm5, xmm0 ; green + psrld xmm5, 8 ; green + pand xmm5, [lsym(cd255)] ; green + movdqa xmm6, xmm0 ; red + psrld xmm6, 16 ; red + pand xmm6, [lsym(cd255)] ; red + + packssdw xmm1, xmm4 ; xmm1 = 8 blues + packssdw xmm2, xmm5 ; xmm2 = 8 greens + packssdw xmm3, xmm6 ; xmm3 = 8 reds + + ; _Y = ( 54 * _R + 183 * _G + 18 * _B) >> 8; + movdqa xmm4, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm6, xmm3 ; red + pmullw xmm4, [lsym(cw18)] + pmullw xmm5, [lsym(cw183)] + pmullw xmm6, [lsym(cw54)] + paddw xmm4, xmm5 + paddw xmm4, xmm6 + psrlw xmm4, 8 + packuswb xmm4, xmm7 + movq [edi], xmm4 ; out 8 bytes yyyyyyyy + + ; _U = ((-29 * _R - 99 * _G + 128 * _B) >> 8) + 128; + movdqa xmm4, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm6, xmm3 ; red + pmullw xmm4, [lsym(cw128)] + pmullw xmm5, [lsym(cw99)] + pmullw xmm6, [lsym(cw29)] + psubw xmm4, xmm5 + psubw xmm4, xmm6 + psraw xmm4, 8 + paddw xmm4, [lsym(cw128)] + packuswb xmm4, xmm7 + movq LU1, xmm4 ; save for later + + ; _V = ((128 * _R - 116 * _G - 12 * _B) >> 8) + 128; + movdqa xmm6, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm4, xmm3 ; red + pmullw xmm4, [lsym(cw128)] + pmullw xmm5, [lsym(cw116)] + pmullw xmm6, [lsym(cw12)] + psubw xmm4, xmm5 + psubw xmm4, xmm6 + psraw xmm4, 8 + paddw xmm4, [lsym(cw128)] + packuswb xmm4, xmm7 + movq LV1, xmm4 ; save for later + + ; go down to second line + add esi, LSRC_STRIDE + add edi, LDST_Y_STRIDE + + ; second line + movdqu xmm0, [esi] ; 4 pixels, 16 bytes + movdqa xmm1, xmm0 ; blue + pand xmm1, [lsym(cd255)] ; blue + movdqa xmm2, xmm0 ; green + psrld xmm2, 8 ; green + pand xmm2, [lsym(cd255)] ; green + movdqa xmm3, xmm0 ; red + psrld xmm3, 16 ; red + pand xmm3, [lsym(cd255)] ; red + + movdqu xmm0, [esi + 16] ; 4 pixels, 16 bytes + movdqa xmm4, xmm0
; blue + pand xmm4, [lsym(cd255)] ; blue + movdqa xmm5, xmm0 ; green + psrld xmm5, 8 ; green + pand xmm5, [lsym(cd255)] ; green + movdqa xmm6, xmm0 ; red + psrld xmm6, 16 ; red + pand xmm6, [lsym(cd255)] ; red + + packssdw xmm1, xmm4 ; xmm1 = 8 blues + packssdw xmm2, xmm5 ; xmm2 = 8 greens + packssdw xmm3, xmm6 ; xmm3 = 8 reds + + ; _Y = ( 54 * _R + 183 * _G + 18 * _B) >> 8; + movdqa xmm4, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm6, xmm3 ; red + pmullw xmm4, [lsym(cw18)] + pmullw xmm5, [lsym(cw183)] + pmullw xmm6, [lsym(cw54)] + paddw xmm4, xmm5 + paddw xmm4, xmm6 + psrlw xmm4, 8 + packuswb xmm4, xmm7 + movq [edi], xmm4 ; out 8 bytes yyyyyyyy + + ; _U = ((-29 * _R - 99 * _G + 128 * _B) >> 8) + 128; + movdqa xmm4, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm6, xmm3 ; red + pmullw xmm4, [lsym(cw128)] + pmullw xmm5, [lsym(cw99)] + pmullw xmm6, [lsym(cw29)] + psubw xmm4, xmm5 + psubw xmm4, xmm6 + psraw xmm4, 8 + paddw xmm4, [lsym(cw128)] + packuswb xmm4, xmm7 + movq LU2, xmm4 ; save for later + + ; _V = ((128 * _R - 116 * _G - 12 * _B) >> 8) + 128; + movdqa xmm6, xmm1 ; blue + movdqa xmm5, xmm2 ; green + movdqa xmm4, xmm3 ; red + pmullw xmm4, [lsym(cw128)] + pmullw xmm5, [lsym(cw116)] + pmullw xmm6, [lsym(cw12)] + psubw xmm4, xmm5 + psubw xmm4, xmm6 + psraw xmm4, 8 + paddw xmm4, [lsym(cw128)] + packuswb xmm4, xmm7 + movq LV2, xmm4 ; save for later + + ; uv add and divide(average); NOTE(review): MMX regs (mm1-mm4) are used from here on and no emms is issued before ret - confirm callers do not rely on x87 state afterwards + movq mm1, LU1 ; u from first line + movq mm3, mm1 + pand mm1, [lsym(cw255)] + psrlw mm3, 8 + pand mm3, [lsym(cw255)] + paddw mm1, mm3 ; add + movq mm2, LU2 ; u from second line + movq mm3, mm2 + pand mm2, [lsym(cw255)] + paddw mm1, mm2 ; add + psrlw mm3, 8 + pand mm3, [lsym(cw255)] + paddw mm1, mm3 ; add + paddw mm1, [lsym(cw2)] ; add 2 + psrlw mm1, 2 ; div 4 + + movq mm2, LV1 ; v from first line + movq mm4, mm2 + pand mm2, [lsym(cw255)] + psrlw mm4, 8 + pand mm4, [lsym(cw255)] + paddw mm2, mm4 ; add + movq mm3, LV2 ; v from second line + movq mm4, mm3 + pand mm3,
[lsym(cw255)] + paddw mm2, mm3 ; add + psrlw mm4, 8 + pand mm4, [lsym(cw255)] + paddw mm2, mm4 ; add + paddw mm2, [lsym(cw2)] ; add 2 + psrlw mm2, 2 ; div 4 + + packuswb mm1, mm1 + packuswb mm2, mm2 + + punpcklbw mm1, mm2 ; uv + movq [edx], mm1 ; out 8 bytes uvuvuvuv + + ; go up to first line + sub esi, LSRC_STRIDE + sub edi, LDST_Y_STRIDE + + ; move right + lea esi, [esi + 32] + lea edi, [edi + 8] + lea edx, [edx + 8] + + dec ecx + jnz loop1 + + ; update s8 + mov eax, LS8 ; s8 + add eax, LSRC_STRIDE ; s8 += src_stride + add eax, LSRC_STRIDE ; s8 += src_stride + mov LS8, eax + + ; update d8_y + mov eax, LD8_Y ; d8_y + add eax, LDST_Y_STRIDE ; d8_y += dst_stride_y + add eax, LDST_Y_STRIDE ; d8_y += dst_stride_y + mov LD8_Y, eax + + ; update d8_uv + mov eax, LD8_UV ; d8_uv + add eax, LDST_UV_STRIDE ; d8_uv += dst_stride_uv + mov LD8_UV, eax + + dec ebp + jnz row_loop1 + + mov eax, 0 ; return value + add esp, 32 ; local vars, 32 bytes + pop ebp + pop edi + pop esi + pop ebx + ret +END_OF_FILE diff --git a/module/x86/funcs_x86.h b/module/x86/funcs_x86.h index d1f3357d..a08834f8 100644 --- a/module/x86/funcs_x86.h +++ b/module/x86/funcs_x86.h @@ -44,6 +44,11 @@ a8r8g8b8_to_nv12_box_x86_sse2(const uint8_t *s8, int src_stride, uint8_t *d8_uv, int dst_stride_uv, int width, int height); int +a8r8g8b8_to_nv12_709fr_box_x86_sse2(const uint8_t *s8, int src_stride, + uint8_t *d8_y, int dst_stride_y, + uint8_t *d8_uv, int dst_stride_uv, + int width, int height); +int a8r8g8b8_to_yuvalp_box_x86_sse2(const uint8_t *s8, int src_stride, uint8_t *d8, int dst_stride, int width, int height); From 3932598ce77852702d91655f85a21e3b6299a644 Mon Sep 17 00:00:00 2001 From: Jay Sorg Date: Tue, 14 May 2024 01:36:32 -0700 Subject: [PATCH 3/8] fix typeo (cherry picked from commit 019c29e7bf587306da90cec1d5d166f9b03d9e03) --- module/rdpSimd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/rdpSimd.c b/module/rdpSimd.c index 9af4ecf9..59feb9da 100644 --- a/module/rdpSimd.c 
+++ b/module/rdpSimd.c @@ -260,7 +260,7 @@ a8r8g8b8_to_nv12_709fr_box_x86_sse2_wrap(const uint8_t *s8, int src_stride, } if (left_over_width > 0) { - error = a8r8g8b8_to_nv12_709fr_box(s8 + awidaligned_widthth * 4, + error = a8r8g8b8_to_nv12_709fr_box(s8 + aligned_width * 4, src_stride, d8_y + aligned_width, dst_stride_y, From f35c9a4be021785a16fac885e339640f9c0d3f86 Mon Sep 17 00:00:00 2001 From: Jay Sorg Date: Tue, 14 May 2024 14:18:57 -0700 Subject: [PATCH 4/8] multimon fixes (cherry picked from commit ad29d913e3df61eaf69185da264d60dc9f422dd2) --- module/rdpCapture.c | 101 ++++++++++++++++++++++++++++++++++++++------ 1 file changed, 88 insertions(+), 13 deletions(-) diff --git a/module/rdpCapture.c b/module/rdpCapture.c index f8b52db5..5f2a45ec 100644 --- a/module/rdpCapture.c +++ b/module/rdpCapture.c @@ -1216,17 +1216,6 @@ rdpCapture3(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, dst, dst_stride, 0, 0, *out_rects, num_rects); } - else if (dst_format == XRDP_nv12_709fr) - { - dst_uv = dst; - dst_uv += clientCon->cap_width * clientCon->cap_height; - rdpCopyBox_a8r8g8b8_to_nv12_709fr(clientCon, - src, src_stride, 0, 0, - dst, dst_stride, - dst_uv, dst_stride, - 0, 0, - *out_rects, num_rects); - } else if (dst_format == XRDP_nv12) { dst_uv = dst; @@ -1246,6 +1235,91 @@ rdpCapture3(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, return rv; } +/******************************************************************************/ +/* make out_rects always multiple of 2 width and height */ +static Bool +rdpCapture5(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, + int *num_out_rects, struct image_data *id) +{ + BoxPtr psrc_rects; + BoxRec rect; + int num_rects; + int index; + uint8_t *dst_uv; + Bool rv; + const uint8_t *src; + uint8_t *dst; + int src_stride; + int dst_stride; + int dst_format; + + LLOGLN(10, ("rdpCapture5:")); + + if (!isShmStatusActive(clientCon->shmemstatus)) + { + LLOGLN(0, ("rdpCapture5: WARNING -- Shared 
memory is not configured." + " Aborting capture!")); + return FALSE; + } + + rv = TRUE; + + rdpRegionTranslate(in_reg, -id->left, -id->top); + + num_rects = REGION_NUM_RECTS(in_reg); + psrc_rects = REGION_RECTS(in_reg); + + if (num_rects < 1) + { + return FALSE; + } + + *num_out_rects = num_rects; + + *out_rects = g_new(BoxRec, num_rects * 4); + index = 0; + while (index < num_rects) + { + rect = psrc_rects[index]; + LLOGLN(10, ("old x1 %d y1 %d x2 %d y2 %d", rect.x1, rect.x2, + rect.x2, rect.y2)); + rect.x1 -= rect.x1 & 1; + rect.y1 -= rect.y1 & 1; + rect.x2 += rect.x2 & 1; + rect.y2 += rect.y2 & 1; + LLOGLN(10, ("new x1 %d y1 %d x2 %d y2 %d", rect.x1, rect.x2, + rect.x2, rect.y2)); + (*out_rects)[index] = rect; + index++; + } + + src = id->pixels; + dst = id->shmem_pixels; + dst_format = clientCon->rdp_format; + src_stride = id->lineBytes; + dst_stride = id->width; + + src = src + src_stride * id->top + id->left * 4; + + if (dst_format == XRDP_nv12_709fr) + { + dst_uv = dst; + dst_uv += id->width * id->height; + rdpCopyBox_a8r8g8b8_to_nv12_709fr(clientCon, + src, src_stride, 0, 0, + dst, dst_stride, + dst_uv, dst_stride, + 0, 0, + *out_rects, num_rects); + } + else + { + LLOGLN(0, ("rdpCapture5: unimplemented color conversion")); + } + + return rv; +} + #if defined(XORGXRDP_GLAMOR) /******************************************************************************/ static int @@ -1338,10 +1412,11 @@ rdpCapture(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, case 4: /* GFX progressive */ return rdpCapture2(clientCon, in_reg, out_rects, num_out_rects, id); case 3: /* surface command h264 */ - /* FALLTHROUGH */ - case 5: /* GFX h264 */ /* used for even align capture */ return rdpCapture3(clientCon, in_reg, out_rects, num_out_rects, id); + case 5: /* GFX h264 */ + /* used for even align capture */ + return rdpCapture5(clientCon, in_reg, out_rects, num_out_rects, id); default: LLOGLN(0, ("rdpCapture: mode %d not implemented", mode)); break; From 
20d0c5455d7073b44a37b42b01ff877f12e256c8 Mon Sep 17 00:00:00 2001 From: Jay Sorg Date: Tue, 14 May 2024 23:31:23 -0700 Subject: [PATCH 5/8] change capture_code to enum, rename the capture functions to match (cherry picked from commit a8cfeacfa124412d884ee698505513c21598dd7d) --- module/rdpCapture.c | 86 +++++++++++++++++++++---------------------- module/rdpClientCon.c | 26 ++++++------- 2 files changed, 56 insertions(+), 56 deletions(-) diff --git a/module/rdpCapture.c b/module/rdpCapture.c index 5f2a45ec..5518a5e4 100644 --- a/module/rdpCapture.c +++ b/module/rdpCapture.c @@ -758,8 +758,8 @@ wyhash_rfx_tile(const uint8_t *src, int src_stride, int x, int y, uint64_t seed) /******************************************************************************/ static Bool -rdpCapture0(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, - int *num_out_rects, struct image_data *id) +rdpCaptureSimple(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, + int *num_out_rects, struct image_data *id) { BoxPtr psrc_rects; BoxRec rect; @@ -772,10 +772,10 @@ rdpCapture0(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, int dst_stride; int dst_format; - LLOGLN(10, ("rdpCapture0:")); + LLOGLN(10, ("rdpCaptureSimple:")); if (!isShmStatusActive(clientCon->shmemstatus)) { - LLOGLN(0, ("rdpCapture0: WARNING -- Shared memory is not configured." + LLOGLN(0, ("rdpCaptureSimple: WARNING -- Shared memory is not configured." 
" Aborting capture!")); return FALSE; } @@ -843,7 +843,7 @@ rdpCapture0(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, } else { - LLOGLN(0, ("rdpCapture0: unimplemented color conversion")); + LLOGLN(0, ("rdpCaptureSimple: unimplemented color conversion")); } return rv; } @@ -851,8 +851,8 @@ rdpCapture0(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, /******************************************************************************/ /* make out_rects always multiple of 16 width and height */ static Bool -rdpCapture1(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, - int *num_out_rects, struct image_data *id) +rdpCaptureSufA16(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, + int *num_out_rects, struct image_data *id) { BoxPtr psrc_rects; BoxRec rect; @@ -883,10 +883,10 @@ rdpCapture1(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, int dst_stride; int dst_format; - LLOGLN(10, ("rdpCapture1:")); + LLOGLN(10, ("rdpCaptureSufA16:")); if (!isShmStatusActive(clientCon->shmemstatus)) { - LLOGLN(0, ("rdpCapture1: WARNING -- Shared memory is not configured." + LLOGLN(0, ("rdpCaptureSufA16: WARNING -- Shared memory is not configured." 
" Aborting capture!")); return FALSE; } @@ -992,15 +992,15 @@ rdpCapture1(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, } else { - LLOGLN(0, ("rdpCapture1: unimplemented color conversion")); + LLOGLN(0, ("rdpCaptureSufA16: unimplemented color conversion")); } return rv; } /******************************************************************************/ static Bool -rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, - int *num_out_rects, struct image_data *id) +rdpCaptureGfxPro(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, + int *num_out_rects, struct image_data *id) { int x; int y; @@ -1022,11 +1022,11 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, int num_crcs; int mon_index; - LLOGLN(10, ("rdpCapture2:")); + LLOGLN(10, ("rdpCaptureGfxPro:")); if (!isShmStatusActive(clientCon->shmemstatus)) { - LLOGLN(0, ("rdpCapture2: WARNING -- Shared memory is not configured" + LLOGLN(0, ("rdpCaptureGfxPro: WARNING -- Shared memory is not configured" " for RFX. 
Aborting capture!")); return FALSE; } @@ -1052,7 +1052,7 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, num_crcs = crc_stride * ((id->height + 63) / 64); if (num_crcs != clientCon->num_rfx_crcs_alloc[mon_index]) { - LLOGLN(0, ("rdpCapture2: resize the crc list was %d now %d", + LLOGLN(0, ("rdpCaptureGfxPro: resize the crc list was %d now %d", clientCon->num_rfx_crcs_alloc[mon_index], num_crcs)); /* resize the crc list */ clientCon->num_rfx_crcs_alloc[mon_index] = num_crcs; @@ -1072,11 +1072,11 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, rect.x2 = rect.x1 + XRDP_RFX_ALIGN; rect.y2 = rect.y1 + XRDP_RFX_ALIGN; rcode = rdpRegionContainsRect(in_reg, &rect); - LLOGLN(10, ("rdpCapture2: rcode %d", rcode)); + LLOGLN(10, ("rdpCaptureGfxPro: rcode %d", rcode)); if (rcode == rgnOUT) { - LLOGLN(10, ("rdpCapture2: rgnOUT")); + LLOGLN(10, ("rdpCaptureGfxPro: rgnOUT")); rdpRegionInit(&tile_reg, &rect, 0); rdpRegionSubtract(in_reg, in_reg, &tile_reg); rdpRegionUninit(&tile_reg); @@ -1087,7 +1087,7 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, crc = WYHASH_SEED; if (rcode == rgnPART) { - LLOGLN(10, ("rdpCapture2: rgnPART")); + LLOGLN(10, ("rdpCaptureGfxPro: rgnPART")); rdpFillBox_yuvalp(x, y, dst, dst_stride); rdpRegionInit(&tile_reg, &rect, 0); rdpRegionIntersect(&tile_reg, in_reg, &tile_reg); @@ -1104,16 +1104,16 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, } else /* rgnIN */ { - LLOGLN(10, ("rdpCapture2: rgnIN")); + LLOGLN(10, ("rdpCaptureGfxPro: rgnIN")); crc = wyhash_rfx_tile(src, src_stride, x, y, crc); } crc_offset = (y / XRDP_RFX_ALIGN) * crc_stride + (x / XRDP_RFX_ALIGN); - LLOGLN(10, ("rdpCapture2: crc 0x%" PRIx64 " 0x%" PRIx64, + LLOGLN(10, ("rdpCaptureGfxPro: crc 0x%" PRIx64 " 0x%" PRIx64, crc, clientCon->rfx_crcs[mon_index][crc_offset])); if (crc == clientCon->rfx_crcs[mon_index][crc_offset]) { - LLOGLN(10, ("rdpCapture2: crc skip at x %d y %d", 
x, y)); + LLOGLN(10, ("rdpCaptureGfxPro: crc skip at x %d y %d", x, y)); rdpRegionInit(&tile_reg, &rect, 0); rdpRegionSubtract(in_reg, in_reg, &tile_reg); rdpRegionUninit(&tile_reg); @@ -1150,8 +1150,8 @@ rdpCapture2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, /******************************************************************************/ /* make out_rects always multiple of 2 width and height */ static Bool -rdpCapture3(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, - int *num_out_rects, struct image_data *id) +rdpCaptureSufA2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, + int *num_out_rects, struct image_data *id) { BoxPtr psrc_rects; BoxRec rect; @@ -1165,11 +1165,11 @@ rdpCapture3(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, int dst_stride; int dst_format; - LLOGLN(10, ("rdpCapture3:")); + LLOGLN(10, ("rdpCaptureSufA2:")); if (!isShmStatusActive(clientCon->shmemstatus)) { - LLOGLN(0, ("rdpCapture3: WARNING -- Shared memory is not configured." + LLOGLN(0, ("rdpCaptureSufA2: WARNING -- Shared memory is not configured." 
" Aborting capture!")); return FALSE; } @@ -1229,7 +1229,7 @@ rdpCapture3(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, } else { - LLOGLN(0, ("rdpCapture3: unimplemented color conversion")); + LLOGLN(0, ("rdpCaptureSufA2: unimplemented color conversion")); } return rv; @@ -1238,8 +1238,8 @@ rdpCapture3(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, /******************************************************************************/ /* make out_rects always multiple of 2 width and height */ static Bool -rdpCapture5(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, - int *num_out_rects, struct image_data *id) +rdpCaptureGfxA2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, + int *num_out_rects, struct image_data *id) { BoxPtr psrc_rects; BoxRec rect; @@ -1253,11 +1253,11 @@ rdpCapture5(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, int dst_stride; int dst_format; - LLOGLN(10, ("rdpCapture5:")); + LLOGLN(10, ("rdpCaptureGfxA2:")); if (!isShmStatusActive(clientCon->shmemstatus)) { - LLOGLN(0, ("rdpCapture5: WARNING -- Shared memory is not configured." + LLOGLN(0, ("rdpCaptureGfxA2: WARNING -- Shared memory is not configured." 
" Aborting capture!")); return FALSE; } @@ -1314,7 +1314,7 @@ rdpCapture5(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, } else { - LLOGLN(0, ("rdpCapture5: unimplemented color conversion")); + LLOGLN(0, ("rdpCaptureGfxA2: unimplemented color conversion")); } return rv; @@ -1386,7 +1386,7 @@ Bool rdpCapture(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, int *num_out_rects, struct image_data *id) { - int mode; + enum xrdp_capture_code mode; LLOGLN(10, ("rdpCapture:")); mode = clientCon->client_info.capture_code; @@ -1403,20 +1403,20 @@ rdpCapture(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, } switch (mode) { - case 0: - return rdpCapture0(clientCon, in_reg, out_rects, num_out_rects, id); - case 1: - return rdpCapture1(clientCon, in_reg, out_rects, num_out_rects, id); - case 2: /* surface command RFX */ + case CC_SIMPLE: + return rdpCaptureSimple(clientCon, in_reg, out_rects, num_out_rects, id); + case CC_SUF_A16: + return rdpCaptureSufA16(clientCon, in_reg, out_rects, num_out_rects, id); + case CC_SUF_RFX: /* surface command RFX */ /* FALLTHROUGH */ - case 4: /* GFX progressive */ - return rdpCapture2(clientCon, in_reg, out_rects, num_out_rects, id); - case 3: /* surface command h264 */ + case CC_GFX_PRO: /* GFX progressive */ + return rdpCaptureGfxPro(clientCon, in_reg, out_rects, num_out_rects, id); + case CC_SUF_A2: /* surface command h264 */ /* used for even align capture */ - return rdpCapture3(clientCon, in_reg, out_rects, num_out_rects, id); - case 5: /* GFX h264 */ + return rdpCaptureSufA2(clientCon, in_reg, out_rects, num_out_rects, id); + case CC_GFX_A2: /* GFX h264 */ /* used for even align capture */ - return rdpCapture5(clientCon, in_reg, out_rects, num_out_rects, id); + return rdpCaptureGfxA2(clientCon, in_reg, out_rects, num_out_rects, id); default: LLOGLN(0, ("rdpCapture: mode %d not implemented", mode)); break; diff --git a/module/rdpClientCon.c b/module/rdpClientCon.c index 29c7c4c0..8b3ac2fb 100644 
--- a/module/rdpClientCon.c +++ b/module/rdpClientCon.c @@ -776,8 +776,8 @@ rdpClientConResizeAllMemoryAreas(rdpPtr dev, rdpClientCon *clientCon) clientCon->rdp_height = height; /* Set the capture parameters */ - if ((clientCon->client_info.capture_code == 2) || /* RFX */ - (clientCon->client_info.capture_code == 4)) + if ((clientCon->client_info.capture_code == CC_SUF_RFX) || /* RFX */ + (clientCon->client_info.capture_code == CC_GFX_PRO)) { LLOGLN(0, ("rdpClientConProcessMsgClientInfo: got RFX capture")); /* RFX capture needs fixed-size rectangles */ @@ -793,8 +793,8 @@ rdpClientConResizeAllMemoryAreas(rdpPtr dev, rdpClientCon *clientCon) clientCon->cap_stride_bytes = clientCon->cap_width * 4; shmemstatus = SHM_RFX_ACTIVE_PENDING; } - else if ((clientCon->client_info.capture_code == 3) || /* H264 */ - (clientCon->client_info.capture_code == 5)) + else if ((clientCon->client_info.capture_code == CC_SUF_A2) || /* H264 */ + (clientCon->client_info.capture_code == CC_GFX_A2)) { LLOGLN(0, ("rdpClientConProcessMsgClientInfo: got H264 capture")); clientCon->cap_width = width; @@ -1011,12 +1011,12 @@ rdpSendMemoryAllocationComplete(rdpPtr dev, rdpClientCon *clientCon) switch (clientCon->client_info.capture_code) { - case 2: - case 4: + case CC_SUF_RFX: + case CC_GFX_PRO: alignment = XRDP_RFX_ALIGN; break; - case 3: - case 5: + case CC_SUF_A2: + case CC_GFX_A2: alignment = XRDP_H264_ALIGN; break; default: @@ -2608,7 +2608,7 @@ rdpClientConSendPaintRectShmFd(rdpPtr dev, rdpClientCon *clientCon, int num_rects_d; int num_rects_c; struct stream *s; - int capture_code; + enum xrdp_capture_code capture_code; int start_frame_bytes; int wiretosurface1_bytes; int wiretosurface2_bytes; @@ -2638,7 +2638,7 @@ rdpClientConSendPaintRectShmFd(rdpPtr dev, rdpClientCon *clientCon, rdpClientConBeginUpdate(dev, clientCon); - if (capture_code < 4) + if (capture_code < CC_GFX_PRO) { /* non gfx */ size = 2 + 2 + 2 + num_rects_d * 8 + 2 + num_rects_c * 8; @@ -2658,7 +2658,7 @@ 
rdpClientConSendPaintRectShmFd(rdpPtr dev, rdpClientCon *clientCon, out_uint32_le(s, clientCon->rect_id); out_uint32_le(s, id->shmem_bytes); out_uint32_le(s, id->shmem_offset); - if (capture_code == 2) /* rfx */ + if (capture_code == CC_SUF_RFX) /* rfx */ { out_uint16_le(s, id->left); out_uint16_le(s, id->top); @@ -2675,7 +2675,7 @@ rdpClientConSendPaintRectShmFd(rdpPtr dev, rdpClientCon *clientCon, rdpClientConSendPending(clientCon->dev, clientCon); g_sck_send_fd_set(clientCon->sck, "int", 4, &(id->shmem_fd), 1); } - else if (capture_code == 4) /* gfx pro rfx */ + else if (capture_code == CC_GFX_PRO) /* gfx pro rfx */ { start_frame_bytes = 8 + 8; wiretosurface2_bytes = 8 + 13 + @@ -2747,7 +2747,7 @@ rdpClientConSendPaintRectShmFd(rdpPtr dev, rdpClientCon *clientCon, out_uint32_le(s, 0); /* shmem_bytes */ } } - else if (capture_code == 5) /* gfx h264 */ + else if (capture_code == CC_GFX_A2) /* gfx h264 */ { start_frame_bytes = 8 + 8; wiretosurface1_bytes = 8 + 9 + From a12f232865532533cfa6350fdaca1979aaceca77 Mon Sep 17 00:00:00 2001 From: Jay Sorg Date: Sun, 19 May 2024 22:19:57 -0700 Subject: [PATCH 6/8] fix crash when odd monitor height (cherry picked from commit 7857c1e9acb67afbed81fac841d3c6acfac4f3f2) --- module/rdpCapture.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/module/rdpCapture.c b/module/rdpCapture.c index 5518a5e4..7591af47 100644 --- a/module/rdpCapture.c +++ b/module/rdpCapture.c @@ -1281,13 +1281,21 @@ rdpCaptureGfxA2(rdpClientCon *clientCon, RegionPtr in_reg, BoxPtr *out_rects, while (index < num_rects) { rect = psrc_rects[index]; - LLOGLN(10, ("old x1 %d y1 %d x2 %d y2 %d", rect.x1, rect.x2, + LLOGLN(10, ("old x1 %d y1 %d x2 %d y2 %d", rect.x1, rect.y1, rect.x2, rect.y2)); rect.x1 -= rect.x1 & 1; rect.y1 -= rect.y1 & 1; rect.x2 += rect.x2 & 1; rect.y2 += rect.y2 & 1; - LLOGLN(10, ("new x1 %d y1 %d x2 %d y2 %d", rect.x1, rect.x2, + if (rect.x2 > id->width) + { + rect.x2 = id->width & ~1; + } + if (rect.y2 > 
id->height) + { + rect.y2 = id->height & ~1; + } + LLOGLN(10, ("new x1 %d y1 %d x2 %d y2 %d", rect.x1, rect.y1, rect.x2, rect.y2)); (*out_rects)[index] = rect; index++; From 5af24ba7626700f7f70c239dce8aced9d8cef272 Mon Sep 17 00:00:00 2001 From: Koichiro Iwao Date: Sun, 18 Aug 2024 21:27:44 +0900 Subject: [PATCH 7/8] Perform CI test against xrdp v0.10-h264 branch --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a495a362..ffb33509 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -68,7 +68,7 @@ jobs: steps: - uses: actions/checkout@v3 - run: sudo scripts/install_xorgxrdp_build_dependencies_with_apt.sh ${{ matrix.arch }} --allow-downgrades --allow-remove-essential --allow-change-held-packages - - run: git clone --depth 1 --branch=v0.10 https://github.com/neutrinolabs/xrdp.git ${{ github.workspace}}/xrdp + - run: git clone --depth 1 --branch=v0.10-h264 https://github.com/neutrinolabs/xrdp.git ${{ github.workspace}}/xrdp - run: ./bootstrap - run: ./configure ${{ matrix.CONF_FLAGS }} - run: make CFLAGS="$CFLAGS -O2 -Wall -Wwrite-strings -Werror" From 9525b61213fe794396fc20cd5b801e597d8af15a Mon Sep 17 00:00:00 2001 From: Koichiro Iwao Date: Sun, 17 Nov 2024 23:56:58 +0900 Subject: [PATCH 8/8] Use per-codec frame capture interval passed from xrdp --- Note: unlike the original commit, the duplicated "clientCon->cap_width = width;" in the default case is dropped and "switch(" is spelled "switch ("; hunk line counts are adjusted accordingly and the index line is kept as generated. module/rdp.h | 2 + module/rdpClientCon.c | 90 ++++++++++++++++++++++++------------------- module/rdpClientCon.h | 2 + 3 files changed, 53 insertions(+), 41 deletions(-) diff --git a/module/rdp.h b/module/rdp.h index 9122bc92..534934e3 100644 --- a/module/rdp.h +++ b/module/rdp.h @@ -282,6 +282,8 @@ struct _rdpRec CARD32 last_event_time_ms; CARD32 last_wheel_time_ms; + CARD32 msFrameInterval; + int conNumber; struct _rdpCounts counts; diff --git a/module/rdpClientCon.c b/module/rdpClientCon.c index 8b3ac2fb..b98db64b 100644 --- a/module/rdpClientCon.c +++ b/module/rdpClientCon.c @@
-776,47 +776,56 @@ rdpClientConResizeAllMemoryAreas(rdpPtr dev, rdpClientCon *clientCon) clientCon->rdp_height = height; /* Set the capture parameters */ - if ((clientCon->client_info.capture_code == CC_SUF_RFX) || /* RFX */ - (clientCon->client_info.capture_code == CC_GFX_PRO)) + switch (clientCon->client_info.capture_code) { - LLOGLN(0, ("rdpClientConProcessMsgClientInfo: got RFX capture")); - /* RFX capture needs fixed-size rectangles */ - clientCon->cap_width = RDPALIGN(width, XRDP_RFX_ALIGN); - clientCon->cap_height = RDPALIGN(height, XRDP_RFX_ALIGN); - LLOGLN(0, (" cap_width %d cap_height %d", - clientCon->cap_width, clientCon->cap_height)); + case CC_SUF_RFX: /* RFX */ + case CC_GFX_PRO: + LLOGLN(0, ("rdpClientConProcessMsgClientInfo: got RFX capture")); + /* RFX capture needs fixed-size rectangles */ + clientCon->cap_width = RDPALIGN(width, XRDP_RFX_ALIGN); + clientCon->cap_height = RDPALIGN(height, XRDP_RFX_ALIGN); + LLOGLN(0, (" cap_width %d cap_height %d", + clientCon->cap_width, clientCon->cap_height)); - bytes = clientCon->cap_width * clientCon->cap_height * - clientCon->rdp_Bpp; + bytes = clientCon->cap_width * clientCon->cap_height * + clientCon->rdp_Bpp; - clientCon->shmem_lineBytes = clientCon->rdp_Bpp * clientCon->cap_width; - clientCon->cap_stride_bytes = clientCon->cap_width * 4; - shmemstatus = SHM_RFX_ACTIVE_PENDING; - } - else if ((clientCon->client_info.capture_code == CC_SUF_A2) || /* H264 */ - (clientCon->client_info.capture_code == CC_GFX_A2)) - { - LLOGLN(0, ("rdpClientConProcessMsgClientInfo: got H264 capture")); - clientCon->cap_width = width; - clientCon->cap_height = height; + clientCon->shmem_lineBytes = clientCon->rdp_Bpp * clientCon->cap_width; + clientCon->cap_stride_bytes = clientCon->cap_width * 4; + shmemstatus = SHM_RFX_ACTIVE_PENDING; - bytes = clientCon->cap_width * clientCon->cap_height * 2; + dev->msFrameInterval = clientCon->client_info.rfx_frame_interval; + break; + case CC_SUF_A2: /* H264 */ + case CC_GFX_A2: +
LLOGLN(0, ("rdpClientConProcessMsgClientInfo: got H264 capture")); + clientCon->cap_width = width; + clientCon->cap_height = height; - clientCon->shmem_lineBytes = clientCon->rdp_Bpp * clientCon->cap_width; - clientCon->cap_stride_bytes = clientCon->cap_width * 4; - shmemstatus = SHM_H264_ACTIVE_PENDING; - } - else - { - clientCon->cap_width = width; - clientCon->cap_height = height; + bytes = clientCon->cap_width * clientCon->cap_height * 2; - bytes = width * height * clientCon->rdp_Bpp; + clientCon->shmem_lineBytes = clientCon->rdp_Bpp * clientCon->cap_width; + clientCon->cap_stride_bytes = clientCon->cap_width * 4; + shmemstatus = SHM_H264_ACTIVE_PENDING; - clientCon->shmem_lineBytes = clientCon->rdp_Bpp * clientCon->cap_width; - clientCon->cap_stride_bytes = clientCon->cap_width * clientCon->rdp_Bpp; - shmemstatus = SHM_ACTIVE_PENDING; + dev->msFrameInterval = clientCon->client_info.h264_frame_interval; + break; + default: + LLOGLN(0, ("rdpClientConProcessMsgClientInfo: got normal capture")); + clientCon->cap_width = width; + clientCon->cap_height = height; + + bytes = width * height * clientCon->rdp_Bpp; + + clientCon->shmem_lineBytes = clientCon->rdp_Bpp * clientCon->cap_width; + clientCon->cap_stride_bytes = clientCon->cap_width * clientCon->rdp_Bpp; + shmemstatus = SHM_ACTIVE_PENDING; + + dev->msFrameInterval = clientCon->client_info.normal_frame_interval; + break; } + + LLOGLN(0, (" msFrameInterval %ld", (long)dev->msFrameInterval)); rdpClientConAllocateSharedMemory(clientCon, bytes); if (clientCon->client_info.capture_format != 0) @@ -2539,7 +2548,7 @@ rdpClientConScheduleDeferredUpdate(rdpPtr dev) { dev->sendUpdateScheduled = TRUE; dev->sendUpdateTimer = - TimerSet(dev->sendUpdateTimer, 0, 40, + TimerSet(dev->sendUpdateTimer, 0, dev->msFrameInterval, rdpClientConDeferredUpdateCallback, dev); } } @@ -2658,15 +2667,15 @@ rdpClientConSendPaintRectShmFd(rdpPtr dev, rdpClientCon *clientCon, out_uint32_le(s, clientCon->rect_id);
out_uint32_le(s, id->shmem_bytes); out_uint32_le(s, id->shmem_offset); - if (capture_code == CC_SUF_RFX) /* rfx */ - { + if (capture_code == CC_SUF_RFX) /* rfx */ + { out_uint16_le(s, id->left); out_uint16_le(s, id->top); out_uint16_le(s, id->width); out_uint16_le(s, id->height); - } - else - { + } + else + { out_uint16_le(s, 0); out_uint16_le(s, 0); out_uint16_le(s, clientCon->cap_width); @@ -2981,7 +2990,6 @@ rdpDeferredUpdateCallback(OsTimerPtr timer, CARD32 now, pointer arg) /******************************************************************************/ -#define MIN_MS_BETWEEN_FRAMES 40 #define MIN_MS_TO_WAIT_FOR_MORE_UPDATES 4 #define UPDATE_RETRY_TIMEOUT 200 // After this number of retries, give up and perform the capture anyway. This prevents an infinite loop. static void @@ -3000,7 +3008,7 @@ rdpScheduleDeferredUpdate(rdpClientCon *clientCon) for more changes before sending an update. Always waiting the longer delay would introduce unnecessarily much latency. */ msToWait = MIN_MS_TO_WAIT_FOR_MORE_UPDATES; - minNextUpdateTime = clientCon->lastUpdateTime + MIN_MS_BETWEEN_FRAMES; + minNextUpdateTime = clientCon->lastUpdateTime + clientCon->dev->msFrameInterval; /* the first check is to gracefully handle the infrequent case of the time wrapping around */ if(clientCon->lastUpdateTime < curTime && diff --git a/module/rdpClientCon.h b/module/rdpClientCon.h index 5ff1de21..b4c443cf 100644 --- a/module/rdpClientCon.h +++ b/module/rdpClientCon.h @@ -120,6 +120,8 @@ struct _rdpClientCon int updateScheduled; /* boolean */ int updateRetries; + CARD32 msFrameInterval; + RegionPtr dirtyRegion; int num_rfx_crcs_alloc[16];