From 0f175891a608108a458890be6194358e0234f09f Mon Sep 17 00:00:00 2001 From: Gabriel Wang Date: Thu, 14 May 2026 14:37:46 +0800 Subject: [PATCH] Add SVE2 SIMD Alpha-Blending Blitter (#15504) SVE/SVE2 is a new SIMD extension for AArch64. Compared to NEON, SVE/SVE2 brings the following benefits that are good for SDL projects: - Lane predication: we don't have to treat the tail part of a stride separately when the width is not an integer multiple of the hardware vector size - Although the performance is almost the same as NEON when the hardware vector size is 128 bits, when the hardware provides a longer vector size, e.g. 256, 512, ... 2048, we can enjoy a large performance gain without modifying the source code or recompiling the library. The functional correctness is validated in a dedicated [qemu project](https://github.com/GorgonMeducer/aarch64_qemu_mac_template/tree/SDL-SVE2-Acceleration-Validation). The performance is tested on [Radxa Orion 6 N](https://radxa.com/products/orion/o6n/), which provides 4x A720 and 4x A520 processors. Since the vector size is 128 bits, which is the same as NEON, the performance is almost the same as (or no worse than) the NEON acceleration. 
--- Android.mk | 1 + CMakeLists.txt | 36 + include/SDL3/SDL_cpuinfo.h | 12 + include/SDL3/SDL_intrin.h | 28 + include/build_config/SDL_build_config.h.cmake | 1 + include/build_config/SDL_build_config_ios.h | 3 + src/cpuinfo/SDL_cpuinfo.c | 33 + src/dynapi/SDL_dynapi.exports | 1 + src/dynapi/SDL_dynapi.sym | 1 + src/dynapi/SDL_dynapi_overrides.h | 1 + src/dynapi/SDL_dynapi_procs.h | 1 + src/video/SDL_blit_A.c | 28 + src/video/SDL_blit_N.c | 21 + src/video/arm/SDL_sve2_blit_A.c | 89 + src/video/arm/SDL_sve2_blit_A.h | 37 + src/video/arm/SDL_sve2_blit_N.c | 64 + src/video/arm/SDL_sve2_blit_N.h | 35 + src/video/arm/SDL_sve2_extension.h | 1142 ++++++++ src/video/arm/SDL_sve2_swizzle.h | 2375 +++++++++++++++++ src/video/arm/SDL_sve2_util.h | 206 ++ test/testplatform.c | 1 + 21 files changed, 4116 insertions(+) create mode 100644 src/video/arm/SDL_sve2_blit_A.c create mode 100644 src/video/arm/SDL_sve2_blit_A.h create mode 100644 src/video/arm/SDL_sve2_blit_N.c create mode 100644 src/video/arm/SDL_sve2_blit_N.h create mode 100644 src/video/arm/SDL_sve2_extension.h create mode 100644 src/video/arm/SDL_sve2_swizzle.h create mode 100644 src/video/arm/SDL_sve2_util.h diff --git a/Android.mk b/Android.mk index 2e3b11483c..d53bf403b1 100644 --- a/Android.mk +++ b/Android.mk @@ -84,6 +84,7 @@ LOCAL_SRC_FILES := \ $(wildcard $(LOCAL_PATH)/src/tray/*.c) \ $(wildcard $(LOCAL_PATH)/src/video/*.c) \ $(wildcard $(LOCAL_PATH)/src/video/android/*.c) \ + $(wildcard $(LOCAL_PATH)/src/video/arm/*.c) \ $(wildcard $(LOCAL_PATH)/src/video/yuv2rgb/*.c)) LOCAL_CFLAGS += -DGL_GLEXT_PROTOTYPES diff --git a/CMakeLists.txt b/CMakeLists.txt index e591a011c1..851e11add9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -314,6 +314,7 @@ dep_option(SDL_SSE4_2 "Use SSE4.2 assembly routines" ON "SDL_ASSEMB dep_option(SDL_MMX "Use MMX assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_X86 OR SDL_CPU_X64" OFF) dep_option(SDL_ALTIVEC "Use Altivec assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_POWERPC32 OR 
SDL_CPU_POWERPC64" OFF) dep_option(SDL_ARMNEON "Use NEON assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_ARM32 OR SDL_CPU_ARM64" OFF) +dep_option(SDL_ARMSVE2 "Use SVE2 assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_ARM64" OFF) dep_option(SDL_LSX "Use LSX assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_LOONGARCH64" OFF) dep_option(SDL_LASX "Use LASX assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_LOONGARCH64" OFF) @@ -939,6 +940,37 @@ if(SDL_ASSEMBLY) endif() endif() + if(SDL_ARMSVE2) + cmake_push_check_state() + string(APPEND CMAKE_REQUIRED_FLAGS " -march=armv8-a+sve2") + check_arm_source_compiles([==[ + #include + svuint32_t sve2_test(svuint32_t a, svuint32_t b) { + return svadd_u32_x(svptrue_b32(), a, b); + } + int main(int argc, char *argv[]) { + sve2_test(svdup_u32(0), svdup_u32(0)); + return 0; + }]==] COMPILER_SUPPORTS_ARMSVE2) + if(COMPILER_SUPPORTS_ARMSVE2) + set(HAVE_ARMSVE2 TRUE) + endif() + cmake_pop_check_state() + + if(HAVE_ARMSVE2) + sdl_sources( + "${SDL3_SOURCE_DIR}/src/video/arm/SDL_sve2_blit_A.c" + "${SDL3_SOURCE_DIR}/src/video/arm/SDL_sve2_blit_N.c" + ) + set_source_files_properties( + "${SDL3_SOURCE_DIR}/src/video/arm/SDL_sve2_blit_A.c" + "${SDL3_SOURCE_DIR}/src/video/arm/SDL_sve2_blit_N.c" + PROPERTIES + SKIP_PRECOMPILE_HEADERS ON + ) + endif() + endif() + if(USE_GCC OR USE_CLANG) # TODO: Those all seem to be quite GCC specific - needs to be # reworked for better compiler support @@ -1055,6 +1087,10 @@ if(NOT HAVE_ARMNEON) set(SDL_DISABLE_NEON 1) endif() +if(NOT HAVE_ARMSVE2) + set(SDL_DISABLE_SVE2 1) +endif() + set(SDL_DISABLE_ALLOCA 0) check_include_file("alloca.h" "HAVE_ALLOCA_H") if(MSVC) diff --git a/include/SDL3/SDL_cpuinfo.h b/include/SDL3/SDL_cpuinfo.h index 5669c2373d..765cadf287 100644 --- a/include/SDL3/SDL_cpuinfo.h +++ b/include/SDL3/SDL_cpuinfo.h @@ -281,6 +281,18 @@ extern SDL_DECLSPEC bool SDLCALL SDL_HasARMSIMD(void); */ extern SDL_DECLSPEC bool SDLCALL SDL_HasNEON(void); +/** + * Determine whether the CPU has SVE2 (Scalable Vector Extension 
2). + * + * This is only relevant on ARM64 Linux. On other platforms it always returns + * false. + * + * \returns true if the CPU has SVE2, false otherwise. + * + * \since This function is available since SDL 3.6.0. + */ +extern SDL_DECLSPEC bool SDLCALL SDL_HasSVE2(void); + /** * Determine whether the CPU has LSX (LOONGARCH SIMD) features. * diff --git a/include/SDL3/SDL_intrin.h b/include/SDL3/SDL_intrin.h index a2e968080c..ecd8192941 100644 --- a/include/SDL3/SDL_intrin.h +++ b/include/SDL3/SDL_intrin.h @@ -85,6 +85,16 @@ */ #define SDL_NEON_INTRINSICS 1 +/** + * Defined if (and only if) the compiler supports ARM SVE2 intrinsics. + * + * If this macro is defined, SDL will have already included `` + * as appropriate. + * + * \since This macro is available since SDL 3.6.0. + */ +#define SDL_SVE2_INTRINSICS 1 + /** * Defined if (and only if) the compiler supports PowerPC Altivec intrinsics. * @@ -237,6 +247,10 @@ _m_prefetch(void *__P) # define SDL_NEON_INTRINSICS 1 # include #endif +#if defined(__ARM_FEATURE_SVE2) && !defined(SDL_DISABLE_SVE2) +# define SDL_SVE2_INTRINSICS 1 +# include +#endif #else /* altivec.h redefining bool causes a number of problems, see bugs 3993 and 4392, so you need to explicitly define SDL_ENABLE_ALTIVEC to have it included. */ @@ -265,6 +279,20 @@ _m_prefetch(void *__P) # endif # endif #endif +#ifndef SDL_DISABLE_SVE2 +# if defined(SDL_PLATFORM_WINDOWS) +/* Visual Studio doesn't define __ARM_ARCH, but _M_ARM (if set, always 7), and _M_ARM64 (if set, always 1). */ +# if defined (_M_ARM64) && 0 /* Please only remove this 0 when MSVC releasing support for SVE2 officially. 
*/ +# define SDL_SVE2_INTRINSICS 1 +# include +# define __ARM_FEATURE_SVE2 1 /* Set __ARM_FEATURE_SVE2 so that it can be used elsewhere, at compile time */ +# define __ARM_ARCH 8 +# endif +# elif !defined(SDL_PLATFORM_MACOS) /* Apple has no AArch64 device supporting SVE2 */ +# define SDL_SVE2_INTRINSICS 1 +# include +# endif +#endif #endif /* compiler version */ #ifdef SDL_WIKI_DOCUMENTATION_SECTION diff --git a/include/build_config/SDL_build_config.h.cmake b/include/build_config/SDL_build_config.h.cmake index e7d0b34f42..2e0cdc21b4 100644 --- a/include/build_config/SDL_build_config.h.cmake +++ b/include/build_config/SDL_build_config.h.cmake @@ -625,6 +625,7 @@ typedef unsigned int uintptr_t; #cmakedefine SDL_DISABLE_LSX 1 #cmakedefine SDL_DISABLE_LASX 1 #cmakedefine SDL_DISABLE_NEON 1 +#cmakedefine SDL_DISABLE_SVE2 1 #ifdef SDL_PLATFORM_PRIVATE #include "SDL_end_config_private.h" diff --git a/include/build_config/SDL_build_config_ios.h b/include/build_config/SDL_build_config_ios.h index 308270b5a0..56f17f8b8f 100644 --- a/include/build_config/SDL_build_config_ios.h +++ b/include/build_config/SDL_build_config_ios.h @@ -226,4 +226,7 @@ /* Enable tray subsystem */ #define SDL_TRAY_DUMMY 1 +/* Disable ARM SVE2 intrinsics until we confirm they're available on all Apple mobile and TV hardware */ +#define SDL_DISABLE_SVE2 1 + #endif /* SDL_build_config_ios_h_ */ diff --git a/src/cpuinfo/SDL_cpuinfo.c b/src/cpuinfo/SDL_cpuinfo.c index 966a5ae79a..19daae4421 100644 --- a/src/cpuinfo/SDL_cpuinfo.c +++ b/src/cpuinfo/SDL_cpuinfo.c @@ -109,6 +109,7 @@ #define CPU_HAS_ARM_SIMD (1 << 11) #define CPU_HAS_LSX (1 << 12) #define CPU_HAS_LASX (1 << 13) +#define CPU_HAS_SVE2 (1 << 14) #define CPU_CFG2 0x2 #define CPU_CFG2_LSX (1 << 6) @@ -514,6 +515,27 @@ static int CPU_haveNEON(void) #endif } +#ifndef AT_HWCAP2 +#define AT_HWCAP2 26 +#endif +#ifndef HWCAP_SVE +#define HWCAP_SVE (1 << 22) +#endif +#ifndef HWCAP2_SVE2 +#define HWCAP2_SVE2 (1 << 1) +#endif + +static int 
CPU_haveSVE2(void) +{ +#if defined(__aarch64__) && \ + ((defined(SDL_PLATFORM_LINUX) && defined(HAVE_GETAUXVAL)) || defined(SDL_PLATFORM_ANDROID)) + return ((getauxval(AT_HWCAP2) & HWCAP2_SVE2) == HWCAP2_SVE2) + && ((getauxval(AT_HWCAP) & HWCAP_SVE) == HWCAP_SVE); +#else + return 0; +#endif +} + static int CPU_readCPUCFG(void) { uint32_t cfg2 = 0; @@ -960,6 +982,8 @@ static Uint32 SDLCALL SDL_CPUFeatureMaskFromHint(void) spot_mask = CPU_HAS_LSX; } else if (ref_string_equals("lasx", spot, end)) { spot_mask = CPU_HAS_LASX; + } else if (ref_string_equals("sve2", spot, end)) { + spot_mask = CPU_HAS_SVE2; } else { // Ignore unknown/incorrect cpu feature(s) continue; @@ -1036,6 +1060,10 @@ static Uint32 SDL_GetCPUFeatures(void) SDL_CPUFeatures |= CPU_HAS_LASX; SDL_SIMDAlignment = SDL_max(SDL_SIMDAlignment, 32); } + if (CPU_haveSVE2()) { + SDL_CPUFeatures |= CPU_HAS_SVE2; + SDL_SIMDAlignment = SDL_max(SDL_SIMDAlignment, 16); + } SDL_CPUFeatures &= SDL_CPUFeatureMaskFromHint(); } return SDL_CPUFeatures; @@ -1117,6 +1145,11 @@ bool SDL_HasLASX(void) return CPU_FEATURE_AVAILABLE(CPU_HAS_LASX); } +bool SDL_HasSVE2(void) +{ + return CPU_FEATURE_AVAILABLE(CPU_HAS_SVE2); +} + static int SDL_SystemRAM = 0; int SDL_GetSystemRAM(void) diff --git a/src/dynapi/SDL_dynapi.exports b/src/dynapi/SDL_dynapi.exports index 67600f2b7b..32e9fbff86 100644 --- a/src/dynapi/SDL_dynapi.exports +++ b/src/dynapi/SDL_dynapi.exports @@ -1287,3 +1287,4 @@ _SDL_GDKResumeRenderer _SDL_IsPhone _SDL_LoadJPG_IO _SDL_LoadJPG +_SDL_HasSVE2 diff --git a/src/dynapi/SDL_dynapi.sym b/src/dynapi/SDL_dynapi.sym index 3fdc470a33..ca1a1c97d9 100644 --- a/src/dynapi/SDL_dynapi.sym +++ b/src/dynapi/SDL_dynapi.sym @@ -1288,6 +1288,7 @@ SDL3_0.0.0 { SDL_IsPhone; SDL_LoadJPG_IO; SDL_LoadJPG; + SDL_HasSVE2; # extra symbols go here (don't modify this line) local: *; }; diff --git a/src/dynapi/SDL_dynapi_overrides.h b/src/dynapi/SDL_dynapi_overrides.h index 7b88affdc6..677768ff2f 100644 --- 
a/src/dynapi/SDL_dynapi_overrides.h +++ b/src/dynapi/SDL_dynapi_overrides.h @@ -1314,3 +1314,4 @@ #define SDL_IsPhone SDL_IsPhone_REAL #define SDL_LoadJPG_IO SDL_LoadJPG_IO_REAL #define SDL_LoadJPG SDL_LoadJPG_REAL +#define SDL_HasSVE2 SDL_HasSVE2_REAL diff --git a/src/dynapi/SDL_dynapi_procs.h b/src/dynapi/SDL_dynapi_procs.h index 24a5afad98..99899b346e 100644 --- a/src/dynapi/SDL_dynapi_procs.h +++ b/src/dynapi/SDL_dynapi_procs.h @@ -1322,3 +1322,4 @@ SDL_DYNAPI_PROC(void,SDL_GDKResumeRenderer,(SDL_Renderer *a),(a),) SDL_DYNAPI_PROC(bool,SDL_IsPhone,(void),(),return) SDL_DYNAPI_PROC(SDL_Surface*,SDL_LoadJPG_IO,(SDL_IOStream *a,bool b),(a,b),return) SDL_DYNAPI_PROC(SDL_Surface*,SDL_LoadJPG,(const char *a),(a),return) +SDL_DYNAPI_PROC(bool,SDL_HasSVE2,(void),(),return) diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c index f7a997f3b0..0dcd25d885 100644 --- a/src/video/SDL_blit_A.c +++ b/src/video/SDL_blit_A.c @@ -25,6 +25,10 @@ #include "SDL_pixels_c.h" #include "SDL_surface_c.h" +#if defined(SDL_SVE2_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64)) +#include "./arm/SDL_sve2_blit_A.h" +#endif + // Functions to perform alpha blended blitting // N->1 blending with per-surface alpha @@ -1477,6 +1481,17 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface) } case 2: +#if defined(SDL_SVE2_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64)) + if (SDL_HasSVE2()) { + if (sf->bytes_per_pixel == 4 && + df->bytes_per_pixel == 2 && + df->Rmask == 0x0000F800 && + df->Gmask == 0x000007E0 && + df->Bmask == 0x0000001F) { + return Blit8888to565PixelAlphaSwizzleSVE2; + } + } +#endif if (sf->bytes_per_pixel == 4 && sf->Amask == 0xff000000 && sf->Gmask == 0xff00 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) || (sf->Bmask == 0xff && df->Bmask == 0x1f))) { if (df->Gmask == 0x7e0) { return BlitARGBto565PixelAlpha; @@ -1504,6 +1519,19 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface) return 
Blit8888to8888PixelAlphaSwizzleLSX; } #endif +#if defined(SDL_SVE2_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64)) + if (SDL_HasSVE2() + /* NEON is faster than SVE2 when vector size is 128bit */ + #if defined(SDL_NEON_INTRINSICS) + && SDL_GetSVEVectorSize() > 128 + #endif + ) { + // To prevent "unused function" compiler warnings/errors + (void)Blit8888to8888PixelAlpha; + (void)Blit8888to8888PixelAlphaSwizzle; + return Blit8888to8888PixelAlphaSwizzleSVE2; + } +#endif #if defined(SDL_NEON_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64)) // To prevent "unused function" compiler warnings/errors (void)Blit8888to8888PixelAlpha; diff --git a/src/video/SDL_blit_N.c b/src/video/SDL_blit_N.c index 204c1addbd..b014d4233a 100644 --- a/src/video/SDL_blit_N.c +++ b/src/video/SDL_blit_N.c @@ -26,6 +26,10 @@ #include "SDL_surface_c.h" #include "SDL_blit_copy.h" +#if defined(SDL_SVE2_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64)) +#include "./arm/SDL_sve2_blit_N.h" +#endif + // General optimized routines that write char by char #define HAVE_FAST_WRITE_INT8 1 @@ -3117,10 +3121,27 @@ SDL_BlitFunc SDL_CalculateBlitN(SDL_Surface *surface) return Blit8888to8888PixelSwizzleSSE41; } #endif +#if defined(SDL_SVE2_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64)) + if (SDL_HasSVE2()) { + return Blit8888to8888PixelSwizzleSVE2; + } +#endif #if defined(SDL_NEON_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64)) return Blit8888to8888PixelSwizzleNEON; #endif } +#if defined(SDL_SVE2_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64)) + if (SDL_HasSVE2()) { + /* RGBA8888/ARGB8888/XRGB8888 -> RGB565 */ + if (srcfmt->bytes_per_pixel == 4 && + dstfmt->bytes_per_pixel == 2 && + dstfmt->Rmask == 0x0000F800 && + dstfmt->Gmask == 0x000007E0 && + dstfmt->Bmask == 0x0000001F) { + return Blit8888to565PixelSwizzleSVE2; + } + } 
+#endif blitfun = NULL; if (dstfmt->bits_per_pixel > 8) { diff --git a/src/video/arm/SDL_sve2_blit_A.c b/src/video/arm/SDL_sve2_blit_A.c new file mode 100644 index 0000000000..be029bcc70 --- /dev/null +++ b/src/video/arm/SDL_sve2_blit_A.c @@ -0,0 +1,89 @@ +/* + Simple DirectMedia Layer + Copyright (C) 1997-2026 Sam Lantinga + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. 
+*/ + +#include "SDL_sve2_blit_A.h" +#include <assert.h> + +#ifdef SDL_SVE2_INTRINSICS + +#undef sdl_sve_rgb32_blend_op_fill_alpha /* per-channel hook: the alpha channel is set to 0xFF, every other channel goes through sdl_sve_chn_blend_with_mask */ +#define sdl_sve_rgb32_blend_op_fill_alpha(ma_alpha_chn_idx) \ + do { /* single statement, so the hook stays safe under an unbraced if/else */ \ + if (sve_src_chn_idx == (ma_alpha_chn_idx)) { \ + sve_target_u16 = svdup_u16(0xFF); /* fill alpha */ \ + } else { \ + svuint16_t vMask = svget4(sve_source_u16x4, (ma_alpha_chn_idx)); \ + sve_target_u16 = sdl_sve_chn_blend_with_mask(sve_source_u16, \ + sve_target_u16, vMask); \ + } \ + } while (0) + +#undef sdl_sve_rgb32_blend_op_copy_alpha /* per-channel hook: the alpha channel blends a constant 0xFF, every other channel blends the source */ +#define sdl_sve_rgb32_blend_op_copy_alpha(ma_alpha_chn_idx) \ + do { \ + /* source channel ma_alpha_chn_idx is the mask handed to both blends */ \ + svuint16_t vMask = svget4(sve_source_u16x4, (ma_alpha_chn_idx)); \ + if (sve_src_chn_idx == (ma_alpha_chn_idx)) { \ + sve_target_u16 = sdl_sve_chn_blend_with_mask(svdup_u16(0xFF), \ + sve_target_u16, vMask); \ + } else { \ + sve_target_u16 = sdl_sve_chn_blend_with_mask(sve_source_u16, \ + sve_target_u16, vMask); \ + } \ + } while (0) + +#undef sdl_sve_rgb32_blend_to_rgb565_op /* per-channel hook: always blends the source into the target under the mask */ +#define sdl_sve_rgb32_blend_to_rgb565_op(ma_alpha_chn_idx) \ + do { \ + svuint16_t vMask = svget4(sve_source_u16x4, (ma_alpha_chn_idx)); \ + sve_target_u16 = sdl_sve_chn_blend_with_mask(sve_source_u16, \ + sve_target_u16, \ + vMask); \ + } while (0) + +#include "SDL_sve2_swizzle.h" + +/*-----------------------------------------------------------------------------* + * Swizzle Blend with Alpha. NOTE(review): the *_dispatcher helpers called below are not defined in this file; presumably SDL_sve2_swizzle.h provides them using the hooks above - confirm. * + *-----------------------------------------------------------------------------*/ +SDL_TARGETING("arch=armv8-a+sve2") +void Blit8888to8888PixelAlphaSwizzleSVE2(SDL_BlitInfo *info) +{ + const SDL_PixelFormatDetails *srcfmt = info->src_fmt; + assert(0 != srcfmt->Amask); /* the source format must carry an alpha mask */ + (void)srcfmt; /* keeps srcfmt referenced when NDEBUG removes the assert */ + + sdl_sve_8888_to_8888_swizzle_dispatcher(info); +} + +SDL_TARGETING("arch=armv8-a+sve2") +void Blit8888to565PixelAlphaSwizzleSVE2(SDL_BlitInfo *info) +{ + sdl_sve_rgb32_to_rgb565_swizzle_dispatcher(info); +} + +SDL_TARGETING("arch=armv8-a+sve2") +size_t SDL_GetSVEVectorSize(void) +{ + 
return svlen(svundef_u8()) * 8; /* svlen() of a u8 vector is its lane count, i.e. bytes per vector; x8 converts to bits */ +} + +#endif /* SDL_SVE2_INTRINSICS */ \ No newline at end of file diff --git a/src/video/arm/SDL_sve2_blit_A.h b/src/video/arm/SDL_sve2_blit_A.h new file mode 100644 index 0000000000..2a7e2b8149 --- /dev/null +++ b/src/video/arm/SDL_sve2_blit_A.h @@ -0,0 +1,37 @@ +/* + Simple DirectMedia Layer + Copyright (C) 1997-2026 Sam Lantinga + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. +*/ + +#ifndef SDL_sve2_blit_A_h_ +#define SDL_sve2_blit_A_h_ + +#include "../../SDL_internal.h" +#include "../SDL_blit.h" + +#ifdef SDL_SVE2_INTRINSICS + +void Blit8888to8888PixelAlphaSwizzleSVE2(SDL_BlitInfo *info); +void Blit8888to565PixelAlphaSwizzleSVE2(SDL_BlitInfo *info); + +size_t SDL_GetSVEVectorSize(void); /* SVE vector length in bits */ + +#endif /* SDL_SVE2_INTRINSICS */ + +#endif /* SDL_sve2_blit_A_h_ */ \ No newline at end of file diff --git a/src/video/arm/SDL_sve2_blit_N.c b/src/video/arm/SDL_sve2_blit_N.c new file mode 100644 index 0000000000..c6ae97e53b --- /dev/null +++ b/src/video/arm/SDL_sve2_blit_N.c @@ -0,0 +1,64 @@ +/* + Simple DirectMedia Layer + Copyright (C) 1997-2026 Sam Lantinga + + This software is provided 'as-is', without any express or implied + warranty. 
In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. +*/ + +#include "SDL_sve2_blit_N.h" +#include + +#ifdef SDL_SVE2_INTRINSICS + +#undef sdl_sve_rgb32_blend_op_fill_alpha /* per-channel hook without blending: the alpha channel gets 0xFF, every other channel copies the source */ +#define sdl_sve_rgb32_blend_op_fill_alpha(ma_alpha_chn_idx) \ + do { \ + if (sve_src_chn_idx == (ma_alpha_chn_idx)) { \ + /* fill alpha: overwrite the alpha channel with 0xFF */ \ + sve_target_u16 = svdup_u16(0xFF); \ + } else { \ + sve_target_u16 = sve_source_u16; \ + } \ + } while (0) + +#undef sdl_sve_rgb32_blend_op_copy_alpha /* per-channel hook: plain copy of the source channel; ma_alpha_chn_idx is unused */ +#define sdl_sve_rgb32_blend_op_copy_alpha(ma_alpha_chn_idx) \ + do { \ + sve_target_u16 = sve_source_u16; \ + } while (0) + +#undef sdl_sve_rgb32_blend_to_rgb565_op /* per-channel hook: plain copy of the source channel; ma_alpha_chn_idx is unused */ +#define sdl_sve_rgb32_blend_to_rgb565_op(ma_alpha_chn_idx) \ + do { \ + sve_target_u16 = sve_source_u16; \ + } while (0) + +#include "SDL_sve2_swizzle.h" /* NOTE(review): presumably defines the *_dispatcher helpers in terms of the hooks above - confirm */ + +SDL_TARGETING("arch=armv8-a+sve2") +void Blit8888to8888PixelSwizzleSVE2(SDL_BlitInfo *info) +{ + sdl_sve_8888_to_8888_swizzle_dispatcher(info); +} + +SDL_TARGETING("arch=armv8-a+sve2") +void Blit8888to565PixelSwizzleSVE2(SDL_BlitInfo *info) +{ + sdl_sve_rgb32_to_rgb565_swizzle_dispatcher(info); +} + +#endif /* SDL_SVE2_INTRINSICS */ \ No newline at end of file diff --git a/src/video/arm/SDL_sve2_blit_N.h b/src/video/arm/SDL_sve2_blit_N.h new file mode 100644 index 0000000000..3868de0dbb --- /dev/null 
+++ b/src/video/arm/SDL_sve2_blit_N.h @@ -0,0 +1,35 @@ +/* + Simple DirectMedia Layer + Copyright (C) 1997-2026 Sam Lantinga + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. +*/ + +#ifndef SDL_sve2_blit_N_h_ +#define SDL_sve2_blit_N_h_ + +#include "../../SDL_internal.h" +#include "../SDL_blit.h" + +#ifdef SDL_SVE2_INTRINSICS + +void Blit8888to8888PixelSwizzleSVE2(SDL_BlitInfo *info); /* defined in SDL_sve2_blit_N.c */ +void Blit8888to565PixelSwizzleSVE2(SDL_BlitInfo *info); /* defined in SDL_sve2_blit_N.c */ + +#endif /* SDL_SVE2_INTRINSICS */ + +#endif /* SDL_sve2_blit_N_h_ */ \ No newline at end of file diff --git a/src/video/arm/SDL_sve2_extension.h b/src/video/arm/SDL_sve2_extension.h new file mode 100644 index 0000000000..2f5a74a12b --- /dev/null +++ b/src/video/arm/SDL_sve2_extension.h @@ -0,0 +1,1142 @@ +/* + Simple DirectMedia Layer + Copyright (C) 1997-2026 Sam Lantinga + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. 
The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. +*/ + +#if !defined(SDL_SVE2_EXTENSION_H) //&& (defined(__ARM_FEATURE_SVE2) && __ARM_FEATURE_SVE2) +#define SDL_SVE2_EXTENSION_H + +#include "SDL_sve2_util.h" +#include +#include + +/*! + * \brief a wrapper for __attribute__((nonnull)) + */ +#ifndef ARM_NONNULL +#define ARM_NONNULL(...) __attribute__((nonnull(__VA_ARGS__))) +#endif + +#define svlenu8() svcntb_pat(SV_ALL) +#define svlenu16() (svcntb_pat(SV_ALL) / sizeof(uint16_t)) +#define svlenu32() (svcntb_pat(SV_ALL) / sizeof(uint32_t)) +#define svlenu64() (svcntb_pat(SV_ALL) / sizeof(uint64_t)) + +#define svlens8() svlenu8() +#define svlens16() svlenu16() +#define svlens32() svlenu32() +#define svlens64() svlenu64() + +#define sdl_sve_stride_loop_accc8888(ma_stride_size, ma_pred_name) \ + for (svbool_t ma_pred_name, *pTemp = &ma_pred_name; \ + pTemp != NULL; \ + pTemp = NULL) \ + for (size_t SVE_SAFE_NAME(n) = 0, \ + sve_iteration_advance = svlenu32() * 4; \ + ({ \ + ma_pred_name = svwhilelt_b8((int32_t)SVE_SAFE_NAME(n), \ + (int32_t)(ma_stride_size)); \ + SVE_SAFE_NAME(n) < (ma_stride_size); \ + }); \ + SVE_SAFE_NAME(n) += sve_iteration_advance) + +#define sdl_sve_stride_loop_rgb32(ma_stride_size, ma_pred_name) \ + sdl_sve_stride_loop_accc8888(ma_stride_size, ma_pred_name) + +#define sdl_sve_stride_loop_rgb16(ma_stride_size, ma_pred_name) \ + for (svbool_t ma_pred_name, *pTemp = &ma_pred_name; \ + pTemp != NULL; \ + pTemp = NULL) \ + for (size_t SVE_SAFE_NAME(n) = 0, \ + sve_iteration_advance = svlenu16(); \ + ({ \ + ma_pred_name = 
svwhilelt_b16((int32_t)SVE_SAFE_NAME(n), \ + (int32_t)(ma_stride_size)); \ + SVE_SAFE_NAME(n) < (ma_stride_size); \ + }); \ + SVE_SAFE_NAME(n) += sve_iteration_advance) + +#define sdl_sve_pixel_ccc_foreach_chn(ma_source_u16x3, \ + ma_target_u16x3, \ + ...) \ + do { \ + svuint16x3_t sve_source_u16x3 = ma_source_u16x3; \ + (void)sve_source_u16x3; \ + do { \ + const uint8_t sve_src_chn_idx = 0; \ + (void)sve_src_chn_idx; \ + svuint16_t sve_source_u16 = svget3((ma_source_u16x3), 0); \ + svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 0); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x3 = svset3(ma_target_u16x3, 0, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 1; \ + (void)sve_src_chn_idx; \ + svuint16_t sve_source_u16 = svget3((ma_source_u16x3), 1); \ + svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 1); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x3 = svset3(ma_target_u16x3, 1, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 2; \ + (void)sve_src_chn_idx; \ + svuint16_t sve_source_u16 = svget3((ma_source_u16x3), 2); \ + svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 2); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x3 = svset3(ma_target_u16x3, 2, sve_target_u16); \ + } while (0); \ + } while (0) + +#define sdl_sve_pixel_accc_foreach_chn012(ma_source_u16x4, \ + ma_target_u16x4, \ + ...) 
\ + do { \ + svuint16x4_t sve_source_u16x4 = ma_source_u16x4; \ + (void)sve_source_u16x4; \ + do { \ + const uint8_t sve_src_chn_idx = 0; \ + (void)sve_src_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 0); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 0); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 0, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 1; \ + (void)sve_src_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 1); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 1); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 1, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 2; \ + (void)sve_src_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 2); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 2); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 2, sve_target_u16); \ + } while (0); \ + } while (0) + +#define sdl_sve_pixel_accc_foreach_chn(ma_source_u16x4, \ + ma_target_u16x4, \ + ...) 
\ + do { \ + svuint16x4_t sve_source_u16x4 = ma_source_u16x4; \ + (void)sve_source_u16x4; \ + do { \ + const uint8_t sve_src_chn_idx = 0; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 0); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 0); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 0, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 1; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 1); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 1); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 1, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 2; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 2); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 2); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 2, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 3; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 3); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 3); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 3, sve_target_u16); \ + } while (0); \ + } while (0) + +#define sdl_sve_pixel_u16x4_foreach_chn(ma_source_u16x4, \ + ma_target_u16x4, \ + ...) 
\ + do { \ + svuint16x4_t sve_source_u16x4 = ma_source_u16x4; \ + (void)sve_source_u16x4; \ + do { \ + const uint8_t sve_src_chn_idx = 0; \ + (void)sve_src_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 0); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 0); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 0, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 1; \ + (void)sve_src_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 1); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 1); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 1, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 2; \ + (void)sve_src_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 2); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 2); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 2, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 3; \ + (void)sve_src_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 3); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 3); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 3, sve_target_u16); \ + } while (0); \ + } while (0) + +#define sdl_sve_pixel_u16x4_foreach_chn_src_dst_rev(ma_source_u16x4, \ + ma_target_u16x4, \ + ...) 
\ + do { \ + svuint16x4_t sve_source_u16x4 = ma_source_u16x4; \ + (void)sve_source_u16x4; \ + do { \ + const uint8_t sve_src_chn_idx = 0; \ + const uint8_t sve_dst_chn_idx = 3; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 0); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 3); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 3, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 1; \ + const uint8_t sve_dst_chn_idx = 2; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 1); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 2); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 2, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 2; \ + const uint8_t sve_dst_chn_idx = 1; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 2); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 1); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 1, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 3; \ + const uint8_t sve_dst_chn_idx = 0; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 3); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 0); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 0, sve_target_u16); \ + } while (0); \ + } while (0) + +#define sdl_sve_pixel_u16x4_foreach_chn_accc_ccca(ma_source_u16x4, \ + ma_target_u16x4, \ + ...) 
\
+    do { /* Expands the statements in __VA_ARGS__ once per channel pair, \
+          * here 0->1, 1->2, 2->3, 3->0 (source index -> target index). \
+          * Inside them sve_source_u16 / sve_target_u16 are the paired \
+          * sub-vectors and sve_src_chn_idx / sve_dst_chn_idx their indices. \
+          * sve_target_u16 is written back afterwards, so ma_target_u16x4 \
+          * must be assignable; both arguments are expanded several times. */ \
+        svuint16x4_t sve_source_u16x4 = ma_source_u16x4; \
+        (void)sve_source_u16x4; \
+        do { /* source channel 0 -> target channel 1 */ \
+            const uint8_t sve_src_chn_idx = 0; \
+            const uint8_t sve_dst_chn_idx = 1; \
+            (void)sve_src_chn_idx; \
+            (void)sve_dst_chn_idx; \
+            svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 0); \
+            svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 1); \
+            (void)sve_source_u16; \
+            (void)sve_target_u16; \
+            __VA_ARGS__ \
+            ma_target_u16x4 = svset4(ma_target_u16x4, 1, sve_target_u16); \
+        } while (0); \
+        do { /* source channel 1 -> target channel 2 */ \
+            const uint8_t sve_src_chn_idx = 1; \
+            const uint8_t sve_dst_chn_idx = 2; \
+            (void)sve_src_chn_idx; \
+            (void)sve_dst_chn_idx; \
+            svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 1); \
+            svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 2); \
+            (void)sve_source_u16; \
+            (void)sve_target_u16; \
+            __VA_ARGS__ \
+            ma_target_u16x4 = svset4(ma_target_u16x4, 2, sve_target_u16); \
+        } while (0); \
+        do { /* source channel 2 -> target channel 3 */ \
+            const uint8_t sve_src_chn_idx = 2; \
+            const uint8_t sve_dst_chn_idx = 3; \
+            (void)sve_src_chn_idx; \
+            (void)sve_dst_chn_idx; \
+            svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 2); \
+            svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 3); \
+            (void)sve_source_u16; \
+            (void)sve_target_u16; \
+            __VA_ARGS__ \
+            ma_target_u16x4 = svset4(ma_target_u16x4, 3, sve_target_u16); \
+        } while (0); \
+        do { /* source channel 3 -> target channel 0 */ \
+            const uint8_t sve_src_chn_idx = 3; \
+            const uint8_t sve_dst_chn_idx = 0; \
+            (void)sve_src_chn_idx; \
+            (void)sve_dst_chn_idx; \
+            svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 3); \
+            svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 0); \
+            (void)sve_source_u16; \
+            (void)sve_target_u16; \
+            __VA_ARGS__ \
+            ma_target_u16x4 = svset4(ma_target_u16x4, 0, sve_target_u16); \
+        } while (0); \
+    } while (0)
+
+#define sdl_sve_pixel_u16x4_foreach_chn_ccca_accc(ma_source_u16x4, \
+                                                  ma_target_u16x4, \
+                                                  ...)
\
+    do { /* Expands the statements in __VA_ARGS__ once per channel pair, \
+          * here 1->0, 2->1, 3->2, 0->3 (source index -> target index). \
+          * Inside them sve_source_u16 / sve_target_u16 are the paired \
+          * sub-vectors and sve_src_chn_idx / sve_dst_chn_idx their indices. \
+          * sve_target_u16 is written back afterwards, so ma_target_u16x4 \
+          * must be assignable; both arguments are expanded several times. */ \
+        svuint16x4_t sve_source_u16x4 = ma_source_u16x4; \
+        (void)sve_source_u16x4; \
+        do { /* source channel 1 -> target channel 0 */ \
+            const uint8_t sve_src_chn_idx = 1; \
+            const uint8_t sve_dst_chn_idx = 0; \
+            (void)sve_src_chn_idx; \
+            (void)sve_dst_chn_idx; \
+            svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 1); \
+            svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 0); \
+            (void)sve_source_u16; \
+            (void)sve_target_u16; \
+            __VA_ARGS__ \
+            ma_target_u16x4 = svset4(ma_target_u16x4, 0, sve_target_u16); \
+        } while (0); \
+        do { /* source channel 2 -> target channel 1 */ \
+            const uint8_t sve_src_chn_idx = 2; \
+            const uint8_t sve_dst_chn_idx = 1; \
+            (void)sve_src_chn_idx; \
+            (void)sve_dst_chn_idx; \
+            svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 2); \
+            svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 1); \
+            (void)sve_source_u16; \
+            (void)sve_target_u16; \
+            __VA_ARGS__ \
+            ma_target_u16x4 = svset4(ma_target_u16x4, 1, sve_target_u16); \
+        } while (0); \
+        do { /* source channel 3 -> target channel 2 */ \
+            const uint8_t sve_src_chn_idx = 3; \
+            const uint8_t sve_dst_chn_idx = 2; \
+            (void)sve_src_chn_idx; \
+            (void)sve_dst_chn_idx; \
+            svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 3); \
+            svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 2); \
+            (void)sve_source_u16; \
+            (void)sve_target_u16; \
+            __VA_ARGS__ \
+            ma_target_u16x4 = svset4(ma_target_u16x4, 2, sve_target_u16); \
+        } while (0); \
+        do { /* source channel 0 -> target channel 3 */ \
+            const uint8_t sve_src_chn_idx = 0; \
+            const uint8_t sve_dst_chn_idx = 3; \
+            (void)sve_src_chn_idx; \
+            (void)sve_dst_chn_idx; \
+            svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 0); \
+            svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 3); \
+            (void)sve_source_u16; \
+            (void)sve_target_u16; \
+            __VA_ARGS__ \
+            ma_target_u16x4 = svset4(ma_target_u16x4, 3, sve_target_u16); \
+        } while (0); \
+    } while (0)
+
+#define sdl_sve_pixel_u16x4_foreach_chn_a123_a321(ma_source_u16x4, \
+                                                  ma_target_u16x4, \
+                                                  ...)
\
+    do { /* Expands the statements in __VA_ARGS__ once per channel pair, \
+          * here 0->2, 1->1, 2->0, 3->3 (source index -> target index). \
+          * Inside them sve_source_u16 / sve_target_u16 are the paired \
+          * sub-vectors and sve_src_chn_idx / sve_dst_chn_idx their indices. \
+          * sve_target_u16 is written back afterwards, so ma_target_u16x4 \
+          * must be assignable; both arguments are expanded several times. */ \
+        svuint16x4_t sve_source_u16x4 = ma_source_u16x4; \
+        (void)sve_source_u16x4; \
+        do { /* source channel 0 -> target channel 2 */ \
+            const uint8_t sve_src_chn_idx = 0; \
+            const uint8_t sve_dst_chn_idx = 2; \
+            (void)sve_src_chn_idx; \
+            (void)sve_dst_chn_idx; \
+            svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 0); \
+            svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 2); \
+            (void)sve_source_u16; \
+            (void)sve_target_u16; \
+            __VA_ARGS__ \
+            ma_target_u16x4 = svset4(ma_target_u16x4, 2, sve_target_u16); \
+        } while (0); \
+        do { /* source channel 1 -> target channel 1 */ \
+            const uint8_t sve_src_chn_idx = 1; \
+            const uint8_t sve_dst_chn_idx = 1; \
+            (void)sve_src_chn_idx; \
+            (void)sve_dst_chn_idx; \
+            svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 1); \
+            svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 1); \
+            (void)sve_source_u16; \
+            (void)sve_target_u16; \
+            __VA_ARGS__ \
+            ma_target_u16x4 = svset4(ma_target_u16x4, 1, sve_target_u16); \
+        } while (0); \
+        do { /* source channel 2 -> target channel 0 */ \
+            const uint8_t sve_src_chn_idx = 2; \
+            const uint8_t sve_dst_chn_idx = 0; \
+            (void)sve_src_chn_idx; \
+            (void)sve_dst_chn_idx; \
+            svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 2); \
+            svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 0); \
+            (void)sve_source_u16; \
+            (void)sve_target_u16; \
+            __VA_ARGS__ \
+            ma_target_u16x4 = svset4(ma_target_u16x4, 0, sve_target_u16); \
+        } while (0); \
+        do { /* source channel 3 -> target channel 3 */ \
+            const uint8_t sve_src_chn_idx = 3; \
+            const uint8_t sve_dst_chn_idx = 3; \
+            (void)sve_src_chn_idx; \
+            (void)sve_dst_chn_idx; \
+            svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 3); \
+            svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 3); \
+            (void)sve_source_u16; \
+            (void)sve_target_u16; \
+            __VA_ARGS__ \
+            ma_target_u16x4 = svset4(ma_target_u16x4, 3, sve_target_u16); \
+        } while (0); \
+    } while (0)
+
+#define sdl_sve_pixel_u16x4_foreach_chn_123a_321a(ma_source_u16x4, \
+                                                  ma_target_u16x4, \
+                                                  ...)
\
+    do { /* Expands the statements in __VA_ARGS__ once per channel pair, \
+          * here 0->0, 1->3, 2->2, 3->1 (source index -> target index). \
+          * Inside them sve_source_u16 / sve_target_u16 are the paired \
+          * sub-vectors and sve_src_chn_idx / sve_dst_chn_idx their indices. \
+          * sve_target_u16 is written back afterwards, so ma_target_u16x4 \
+          * must be assignable; both arguments are expanded several times. */ \
+        svuint16x4_t sve_source_u16x4 = ma_source_u16x4; \
+        (void)sve_source_u16x4; \
+        do { /* source channel 0 -> target channel 0 */ \
+            const uint8_t sve_src_chn_idx = 0; \
+            const uint8_t sve_dst_chn_idx = 0; \
+            (void)sve_src_chn_idx; \
+            (void)sve_dst_chn_idx; \
+            svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 0); \
+            svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 0); \
+            (void)sve_source_u16; \
+            (void)sve_target_u16; \
+            __VA_ARGS__ \
+            ma_target_u16x4 = svset4(ma_target_u16x4, 0, sve_target_u16); \
+        } while (0); \
+        do { /* source channel 1 -> target channel 3 */ \
+            const uint8_t sve_src_chn_idx = 1; \
+            const uint8_t sve_dst_chn_idx = 3; \
+            (void)sve_src_chn_idx; \
+            (void)sve_dst_chn_idx; \
+            svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 1); \
+            svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 3); \
+            (void)sve_source_u16; \
+            (void)sve_target_u16; \
+            __VA_ARGS__ \
+            ma_target_u16x4 = svset4(ma_target_u16x4, 3, sve_target_u16); \
+        } while (0); \
+        do { /* source channel 2 -> target channel 2 */ \
+            const uint8_t sve_src_chn_idx = 2; \
+            const uint8_t sve_dst_chn_idx = 2; \
+            (void)sve_src_chn_idx; \
+            (void)sve_dst_chn_idx; \
+            svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 2); \
+            svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 2); \
+            (void)sve_source_u16; \
+            (void)sve_target_u16; \
+            __VA_ARGS__ \
+            ma_target_u16x4 = svset4(ma_target_u16x4, 2, sve_target_u16); \
+        } while (0); \
+        do { /* source channel 3 -> target channel 1 */ \
+            const uint8_t sve_src_chn_idx = 3; \
+            const uint8_t sve_dst_chn_idx = 1; \
+            (void)sve_src_chn_idx; \
+            (void)sve_dst_chn_idx; \
+            svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 3); \
+            svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 1); \
+            (void)sve_source_u16; \
+            (void)sve_target_u16; \
+            __VA_ARGS__ \
+            ma_target_u16x4 = svset4(ma_target_u16x4, 1, sve_target_u16); \
+        } while (0); \
+    } while (0)
+
+#define sdl_sve_pixel_u16x4_foreach_chn_argb_rgb565(ma_source_u16x4, \
+                                                    ma_target_u16x3, \
+                                                    ...)
\
+    do { /* Expands the statements in __VA_ARGS__ once per channel pair, \
+          * here 0->0, 1->1, 2->2 (source index of the 4-tuple -> target \
+          * index of the 3-tuple); source channel 3 is never visited. \
+          * Inside them sve_source_u16 / sve_target_u16 are the paired \
+          * sub-vectors and sve_src_chn_idx / sve_dst_chn_idx their indices. \
+          * sve_target_u16 is written back afterwards, so ma_target_u16x3 \
+          * must be assignable; both arguments are expanded several times. */ \
+        svuint16x4_t sve_source_u16x4 = ma_source_u16x4; \
+        (void)sve_source_u16x4; \
+        do { /* source channel 0 -> target channel 0 */ \
+            const uint8_t sve_src_chn_idx = 0; \
+            const uint8_t sve_dst_chn_idx = 0; \
+            (void)sve_src_chn_idx; \
+            (void)sve_dst_chn_idx; \
+            svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 0); \
+            svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 0); \
+            (void)sve_source_u16; \
+            (void)sve_target_u16; \
+            __VA_ARGS__ \
+            ma_target_u16x3 = svset3(ma_target_u16x3, 0, sve_target_u16); \
+        } while (0); \
+        do { /* source channel 1 -> target channel 1 */ \
+            const uint8_t sve_src_chn_idx = 1; \
+            const uint8_t sve_dst_chn_idx = 1; \
+            (void)sve_src_chn_idx; \
+            (void)sve_dst_chn_idx; \
+            svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 1); \
+            svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 1); \
+            (void)sve_source_u16; \
+            (void)sve_target_u16; \
+            __VA_ARGS__ \
+            ma_target_u16x3 = svset3(ma_target_u16x3, 1, sve_target_u16); \
+        } while (0); \
+        do { /* source channel 2 -> target channel 2 */ \
+            const uint8_t sve_src_chn_idx = 2; \
+            const uint8_t sve_dst_chn_idx = 2; \
+            (void)sve_src_chn_idx; \
+            (void)sve_dst_chn_idx; \
+            svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 2); \
+            svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 2); \
+            (void)sve_source_u16; \
+            (void)sve_target_u16; \
+            __VA_ARGS__ \
+            ma_target_u16x3 = svset3(ma_target_u16x3, 2, sve_target_u16); \
+        } while (0); \
+    } while (0)
+
+#define sdl_sve_pixel_u16x4_foreach_chn_rgba_rgb565(ma_source_u16x4, \
+                                                    ma_target_u16x3, \
+                                                    ...)
\
+    do { /* Expands the statements in __VA_ARGS__ once per channel pair, \
+          * here 1->0, 2->1, 3->2 (source index of the 4-tuple -> target \
+          * index of the 3-tuple); source channel 0 is never visited. \
+          * Inside them sve_source_u16 / sve_target_u16 are the paired \
+          * sub-vectors and sve_src_chn_idx / sve_dst_chn_idx their indices. \
+          * sve_target_u16 is written back afterwards, so ma_target_u16x3 \
+          * must be assignable; both arguments are expanded several times. */ \
+        svuint16x4_t sve_source_u16x4 = ma_source_u16x4; \
+        (void)sve_source_u16x4; \
+        do { /* source channel 1 -> target channel 0 */ \
+            const uint8_t sve_src_chn_idx = 1; \
+            const uint8_t sve_dst_chn_idx = 0; \
+            (void)sve_src_chn_idx; \
+            (void)sve_dst_chn_idx; \
+            svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 1); \
+            svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 0); \
+            (void)sve_source_u16; \
+            (void)sve_target_u16; \
+            __VA_ARGS__ \
+            ma_target_u16x3 = svset3(ma_target_u16x3, 0, sve_target_u16); \
+        } while (0); \
+        do { /* source channel 2 -> target channel 1 */ \
+            const uint8_t sve_src_chn_idx = 2; \
+            const uint8_t sve_dst_chn_idx = 1; \
+            (void)sve_src_chn_idx; \
+            (void)sve_dst_chn_idx; \
+            svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 2); \
+            svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 1); \
+            (void)sve_source_u16; \
+            (void)sve_target_u16; \
+            __VA_ARGS__ \
+            ma_target_u16x3 = svset3(ma_target_u16x3, 1, sve_target_u16); \
+        } while (0); \
+        do { /* source channel 3 -> target channel 2 */ \
+            const uint8_t sve_src_chn_idx = 3; \
+            const uint8_t sve_dst_chn_idx = 2; \
+            (void)sve_src_chn_idx; \
+            (void)sve_dst_chn_idx; \
+            svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 3); \
+            svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 2); \
+            (void)sve_source_u16; \
+            (void)sve_target_u16; \
+            __VA_ARGS__ \
+            ma_target_u16x3 = svset3(ma_target_u16x3, 2, sve_target_u16); \
+        } while (0); \
+    } while (0)
+
+#define sdl_sve_pixel_u16x4_foreach_chn_bgra_rgb565(ma_source_u16x4, \
+                                                    ma_target_u16x3, \
+                                                    ...)
\
+    do { /* Expands the statements in __VA_ARGS__ once per channel pair, \
+          * here 3->0, 2->1, 1->2 (source index of the 4-tuple -> target \
+          * index of the 3-tuple); source channel 0 is never visited. \
+          * Inside them sve_source_u16 / sve_target_u16 are the paired \
+          * sub-vectors and sve_src_chn_idx / sve_dst_chn_idx their indices. \
+          * sve_target_u16 is written back afterwards, so ma_target_u16x3 \
+          * must be assignable; both arguments are expanded several times. */ \
+        svuint16x4_t sve_source_u16x4 = ma_source_u16x4; \
+        (void)sve_source_u16x4; \
+        do { /* source channel 3 -> target channel 0 */ \
+            const uint8_t sve_src_chn_idx = 3; \
+            const uint8_t sve_dst_chn_idx = 0; \
+            (void)sve_src_chn_idx; \
+            (void)sve_dst_chn_idx; \
+            svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 3); \
+            svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 0); \
+            (void)sve_source_u16; \
+            (void)sve_target_u16; \
+            __VA_ARGS__ \
+            ma_target_u16x3 = svset3(ma_target_u16x3, 0, sve_target_u16); \
+        } while (0); \
+        do { /* source channel 2 -> target channel 1 */ \
+            const uint8_t sve_src_chn_idx = 2; \
+            const uint8_t sve_dst_chn_idx = 1; \
+            (void)sve_src_chn_idx; \
+            (void)sve_dst_chn_idx; \
+            svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 2); \
+            svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 1); \
+            (void)sve_source_u16; \
+            (void)sve_target_u16; \
+            __VA_ARGS__ \
+            ma_target_u16x3 = svset3(ma_target_u16x3, 1, sve_target_u16); \
+        } while (0); \
+        do { /* source channel 1 -> target channel 2 */ \
+            const uint8_t sve_src_chn_idx = 1; \
+            const uint8_t sve_dst_chn_idx = 2; \
+            (void)sve_src_chn_idx; \
+            (void)sve_dst_chn_idx; \
+            svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 1); \
+            svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 2); \
+            (void)sve_source_u16; \
+            (void)sve_target_u16; \
+            __VA_ARGS__ \
+            ma_target_u16x3 = svset3(ma_target_u16x3, 2, sve_target_u16); \
+        } while (0); \
+    } while (0)
+
+#define sdl_sve_pixel_u16x4_foreach_chn_abgr_rgb565(ma_source_u16x4, \
+                                                    ma_target_u16x3, \
+                                                    ...)
\ + do { \ + svuint16x4_t sve_source_u16x4 = ma_source_u16x4; \ + (void)sve_source_u16x4; \ + do { \ + const uint8_t sve_src_chn_idx = 2; \ + const uint8_t sve_dst_chn_idx = 0; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 2); \ + svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 0); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x3 = svset3(ma_target_u16x3, 0, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 1; \ + const uint8_t sve_dst_chn_idx = 1; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 1); \ + svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 1); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x3 = svset3(ma_target_u16x3, 1, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 0; \ + const uint8_t sve_dst_chn_idx = 2; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 0); \ + svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 2); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x3 = svset3(ma_target_u16x3, 2, sve_target_u16); \ + } while (0); \ + } while (0) + +SDL_TARGETING("arch=armv8-a+sve2") +static inline svuint16x3_t sdl_sve_rgb565_unpack(svuint16_t vPixels) +{ + svuint16_t vBlue = svand_n_u16_m(svptrue_b16(), vPixels, 0x1F); + svuint16_t vGreen = svand_n_u16_m(svptrue_b16(), vPixels, (0x3F << 5)); + svuint16_t vRed = svand_n_u16_m(svptrue_b16(), vPixels, (0x1F << 11)); + + return svcreate3_u16(svlsl_n_u16_m(svptrue_b16(), vBlue, 3), + svlsr_n_u16_m(svptrue_b16(), vGreen, 3), + svlsr_n_u16_m(svptrue_b16(), vRed, 8)); +} + +SDL_TARGETING("arch=armv8-a+sve2") +static inline svuint16_t sdl_sve_rgb565_pack(svuint16x3_t vRGB16x3) +{ + svuint16_t vRed = svlsr_n_u16_m(svptrue_b16(), 
svget3_u16(vRGB16x3, 0), 3); + svuint16_t vGreen = svlsl_n_u16_m(svptrue_b16(), + svand_n_u16_m(svptrue_b16(), + svget3_u16(vRGB16x3, 1), + (0x3F << 2)), + 3); + svuint16_t vBlue = svlsl_n_u16_m(svptrue_b16(), + svand_n_u16_m(svptrue_b16(), + svget3_u16(vRGB16x3, 2), + (0x1F << 3)), + 8); + + svuint16_t vPixel = svorr_u16_m(svptrue_b16(), vRed, vGreen); + return svorr_u16_m(svptrue_b16(), vPixel, vBlue); + + // return (svget3_u16(vRGB16x3, 0) >> 3) + // | ((svget3_u16(vRGB16x3, 1) & (0x3F << 2)) << 3) + // | ((svget3_u16(vRGB16x3, 2) & (0x1F << 3)) << 8); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(2, 3, 4) +static inline void svld3rgb565_u16(svbool_t vPredu8, + uint16_t *phwSource, + svuint16x3_t *pvLow, + svuint16x3_t *pvHigh) +{ + svuint8x2_t vInput8x2 = svld2_u8(vPredu8, (uint8_t *)phwSource); + + svuint16_t vLowByteLowHalf = svunpklo_u16(svget2_u8(vInput8x2, 0)); + svuint16_t vLowByteHighHalf = svunpkhi_u16(svget2_u8(vInput8x2, 0)); + + svuint16_t vHighByteLowHalf = svunpklo_u16(svget2_u8(vInput8x2, 1)); + svuint16_t vHighByteHighHalf = svunpkhi_u16(svget2_u8(vInput8x2, 1)); + + //*pvLow = sdl_sve_rgb565_unpack ( vLowByteLowHalf + // | (vHighByteLowHalf << 8)); + *pvLow = sdl_sve_rgb565_unpack( + svorr_u16_m(svptrue_b16(), + vLowByteLowHalf, + //(vHighByteLowHalf << 8) + svlsl_n_u16_m(svptrue_b16(), vHighByteLowHalf, 8))); + + //*pvHigh = sdl_sve_rgb565_unpack ( vLowByteHighHalf + // | (vHighByteHighHalf << 8)); + *pvHigh = sdl_sve_rgb565_unpack( + svorr_u16_m(svptrue_b16(), + vLowByteHighHalf, + //(vHighByteHighHalf << 8) + svlsl_n_u16_m(svptrue_b16(), vHighByteHighHalf, 8))); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(2) +static inline void svst3rgb565_u16(svbool_t vPredu8, + uint16_t *phwTarget, + svuint16x3_t vLow, + svuint16x3_t vHigh) +{ + svuint16_t vLowByteLowHalf = svundef_u16(); + svuint16_t vHighByteLowHalf = svundef_u16(); + + /* pack low half pixels */ + do { + svuint16_t vPixel = sdl_sve_rgb565_pack(vLow); + + // 
vLowByteLowHalf = vPixel & 0xFF; + vLowByteLowHalf = svand_n_u16_m(svptrue_b16(), vPixel, 0xFF); + + // vHighByteLowHalf = vPixel >> 8; + vHighByteLowHalf = svlsr_n_u16_m(svptrue_b16(), vPixel, 8); + } while (0); + + svuint16_t vLowByteHighHalf = svundef_u16(); + svuint16_t vHighByteHighHalf = svundef_u16(); + + /* pack high half pixels */ + do { + svuint16_t vPixel = sdl_sve_rgb565_pack(vHigh); + + // vLowByteHighHalf = vPixel & 0xFF; + vLowByteHighHalf = svand_n_u16_m(svptrue_b16(), vPixel, 0xFF); + + // vHighByteHighHalf = vPixel >> 8; + vHighByteHighHalf = svlsr_n_u16_m(svptrue_b16(), vPixel, 8); + } while (0); + + /* save rgb565 pixels */ + svuint8_t vLowByte = svuzp1_u8(svreinterpret_u8(vLowByteLowHalf), + svreinterpret_u8(vLowByteHighHalf)); + + svuint8_t vHighByte = svuzp1_u8(svreinterpret_u8(vHighByteLowHalf), + svreinterpret_u8(vHighByteHighHalf)); + + svst2_u8(vPredu8, (uint8_t *)phwTarget, svcreate2_u8(vLowByte, vHighByte)); +} + +#if defined(__GNUC__) && !defined(__clang__) +#define svld4ub_u16(ma_pred, \ + ma_src_ptr, \ + ma_svuint16x4_low_ptr, \ + ma_svuint16x4_high_ptr) \ + do { \ + svuint8x4_t vInput8x4 = svld4_u8((ma_pred), (ma_src_ptr)); \ + \ + *(ma_svuint16x4_low_ptr) = svset4_u16(*(ma_svuint16x4_low_ptr), 0, svunpklo_u16(svget4_u8(vInput8x4, 0))); \ + *(ma_svuint16x4_low_ptr) = svset4_u16(*(ma_svuint16x4_low_ptr), 1, svunpklo_u16(svget4_u8(vInput8x4, 1))); \ + *(ma_svuint16x4_low_ptr) = svset4_u16(*(ma_svuint16x4_low_ptr), 2, svunpklo_u16(svget4_u8(vInput8x4, 2))); \ + *(ma_svuint16x4_low_ptr) = svset4_u16(*(ma_svuint16x4_low_ptr), 3, svunpklo_u16(svget4_u8(vInput8x4, 3))); \ + \ + *(ma_svuint16x4_high_ptr) = svset4_u16(*(ma_svuint16x4_high_ptr), 0, svunpkhi_u16(svget4_u8(vInput8x4, 0))); \ + *(ma_svuint16x4_high_ptr) = svset4_u16(*(ma_svuint16x4_high_ptr), 1, svunpkhi_u16(svget4_u8(vInput8x4, 1))); \ + *(ma_svuint16x4_high_ptr) = svset4_u16(*(ma_svuint16x4_high_ptr), 2, svunpkhi_u16(svget4_u8(vInput8x4, 2))); \ + *(ma_svuint16x4_high_ptr) = 
svset4_u16(*(ma_svuint16x4_high_ptr), 3, svunpkhi_u16(svget4_u8(vInput8x4, 3))); \ + } while (0) + +#define svst4ub_u16(ma_pred, \ + ma_dst_ptr, \ + ma_svuint16x4_low, \ + ma_svuint16x4_high) \ + do { \ + svuint8_t vCH0u8 = svuzp1_u8(svreinterpret_u8(svget4_u16((ma_svuint16x4_low), 0)), \ + svreinterpret_u8(svget4_u16((ma_svuint16x4_high), 0))); \ + \ + svuint8_t vCH1u8 = svuzp1_u8(svreinterpret_u8(svget4_u16((ma_svuint16x4_low), 1)), \ + svreinterpret_u8(svget4_u16((ma_svuint16x4_high), 1))); \ + \ + svuint8_t vCH2u8 = svuzp1_u8(svreinterpret_u8(svget4_u16((ma_svuint16x4_low), 2)), \ + svreinterpret_u8(svget4_u16((ma_svuint16x4_high), 2))); \ + \ + svuint8_t vCH3u8 = svuzp1_u8(svreinterpret_u8(svget4_u16((ma_svuint16x4_low), 3)), \ + svreinterpret_u8(svget4_u16((ma_svuint16x4_high), 3))); \ + \ + svst4_u8((ma_pred), (ma_dst_ptr), svcreate4_u8(vCH0u8, vCH1u8, vCH2u8, vCH3u8)); \ + } while (0) +#else +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(2, 3, 4) +static inline void svld4ub_u16(svbool_t vPredu8, + uint8_t *pchSource, + svuint16x4_t *pvLow, + svuint16x4_t *pvHigh) +{ + svuint8x4_t vInput8x4 = svld4_u8(vPredu8, pchSource); + + *pvLow = svset4_u16(*pvLow, 0, svunpklo_u16(svget4_u8(vInput8x4, 0))); + *pvLow = svset4_u16(*pvLow, 1, svunpklo_u16(svget4_u8(vInput8x4, 1))); + *pvLow = svset4_u16(*pvLow, 2, svunpklo_u16(svget4_u8(vInput8x4, 2))); + *pvLow = svset4_u16(*pvLow, 3, svunpklo_u16(svget4_u8(vInput8x4, 3))); + + *pvHigh = svset4_u16(*pvHigh, 0, svunpkhi_u16(svget4_u8(vInput8x4, 0))); + *pvHigh = svset4_u16(*pvHigh, 1, svunpkhi_u16(svget4_u8(vInput8x4, 1))); + *pvHigh = svset4_u16(*pvHigh, 2, svunpkhi_u16(svget4_u8(vInput8x4, 2))); + *pvHigh = svset4_u16(*pvHigh, 3, svunpkhi_u16(svget4_u8(vInput8x4, 3))); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(2) +static inline void svst4ub_u16(svbool_t vPredu8, + uint8_t *pchTarget, + svuint16x4_t vLow, + svuint16x4_t vHigh) +{ + + svuint8_t vCH0u8 = svuzp1_u8(svreinterpret_u8(svget4_u16(vLow, 0)), + 
svreinterpret_u8(svget4_u16(vHigh, 0))); + + svuint8_t vCH1u8 = svuzp1_u8(svreinterpret_u8(svget4_u16(vLow, 1)), + svreinterpret_u8(svget4_u16(vHigh, 1))); + + svuint8_t vCH2u8 = svuzp1_u8(svreinterpret_u8(svget4_u16(vLow, 2)), + svreinterpret_u8(svget4_u16(vHigh, 2))); + + svuint8_t vCH3u8 = svuzp1_u8(svreinterpret_u8(svget4_u16(vLow, 3)), + svreinterpret_u8(svget4_u16(vHigh, 3))); + + svst4_u8(vPredu8, pchTarget, svcreate4_u8(vCH0u8, vCH1u8, vCH2u8, vCH3u8)); +} +#endif + +/*! \note the Element range of vMask is [0, 0xFF] + */ +SDL_TARGETING("arch=armv8-a+sve2") +static inline svuint16_t sdl_sve_chn_blend_with_mask(svuint16_t vSource, svuint16_t vTarget, svuint16_t vMask) +{ + // vTarget = vSource * vMask + vTarget * (255 - vMask); + svuint16_t vTemp0 = svmul_u16_m(svptrue_b16(), vSource, vMask); + vTemp0 = svmla_u16_m(svptrue_b16(), + vTemp0, + vTarget, + svsub_u16_m(svptrue_b16(), + svdup_u16(255), + vMask)); + + vTemp0 = svadd_n_u16_m(svptrue_b16(), vTemp0, 1); + + svuint16_t vTemp1 = svlsr_n_u16_m(svptrue_b16(), vTemp0, 8); + /* x += x >> 8 */ + vTemp0 = svadd_u16_m(svptrue_b16(), + vTemp0, + vTemp1); + + return svlsr_n_u16_m(svptrue_b16(), vTemp0, 8); // vTarget >> 8; +} + +/*! \note the hwOpacity range [0, 0x100] + */ +SDL_TARGETING("arch=armv8-a+sve2") +static inline svuint16_t sdl_sve_chn_blend_with_opacity(svuint16_t vSource, + svuint16_t vTarget, + uint16_t hwOpacity) +{ + // svuint16_t vOpacity = svdup_u16(hwOpacity); + // vTarget = vSource * vOpacity + vTarget * (256 - vOpacity); + + svuint16_t vTemp0 = svmul_n_u16_m(svptrue_b16(), vSource, hwOpacity); + svuint16_t vTemp1 = svmul_n_u16_m(svptrue_b16(), + vTarget, + 256 - hwOpacity); + vTarget = svadd_u16_m(svptrue_b16(), vTemp0, vTemp1); + + return svlsr_n_u16_m(svptrue_b16(), vTarget, 8); // vTarget >> 8; +} + +/*! 
\note the Element range of vMask is [0, 0xFF] + * \note the hwOpacity range [0, 0x100] + */ +SDL_TARGETING("arch=armv8-a+sve2") +static inline svuint16_t sdl_sve_chn_blend_with_mask_and_opacity(svuint16_t vSource, + svuint16_t vTarget, + svuint16_t vMask, + uint16_t hwOpacity) +{ + vMask = svsel(svcmpeq_n_u16(svptrue_b16(), vMask, 255), + svdup_u16(hwOpacity), + //(vMask * hwOpacity) >> 8, + svlsr_n_u16_m(svptrue_b16(), + svmul_n_u16_m(svptrue_b16(), vMask, hwOpacity), + 8)); + + // vTarget = vSource * vMask + vTarget * (256 - vMask); + svuint16_t vTemp0 = svmul_u16_m(svptrue_b16(), vSource, vMask); + svuint16_t vTemp1 = svmul_u16_m(svptrue_b16(), + vTarget, + svsub_u16_m(svptrue_b16(), + svdup_u16(256), + vMask)); + vTarget = svadd_u16_m(svptrue_b16(), vTemp0, vTemp1); + + return svlsr_n_u16_m(svptrue_b16(), vTarget, 8); // vTarget >> 8; +} + +/*! \note the Element range of vMask0/1 is [0, 0xFF] + */ +SDL_TARGETING("arch=armv8-a+sve2") +static inline svuint16_t sdl_sve_chn_blend_with_masks(svuint16_t vSource, + svuint16_t vTarget, + svuint16_t vMask0, + svuint16_t vMask1) +{ + vMask1 = svadd_u16_m(svcmpeq_n_u16(svptrue_b16(), vMask1, 255), + vMask1, + svdup_u16(1)); + + svuint16_t vMask = + svsel(svcmpge_n_u16(svptrue_b16(), vMask0, 255), + vMask1, + //(vMask0 * vMask1) >> 8, + svlsr_n_u16_m(svptrue_b16(), + svmul_u16_m(svptrue_b16(), vMask0, vMask1), + 8)); + + // vTarget = vSource * vMask + vTarget * (256 - vMask); + svuint16_t vTemp0 = svmul_u16_m(svptrue_b16(), vSource, vMask); + svuint16_t vTemp1 = svmul_u16_m(svptrue_b16(), + vTarget, + svsub_u16_m(svptrue_b16(), + svdup_u16(256), + vMask)); + vTarget = svadd_u16_m(svptrue_b16(), vTemp0, vTemp1); + + return svlsr_n_u16_m(svptrue_b16(), vTarget, 8); // vTarget >> 8; +} + +/*! 
\note the Element range of vMask0/1 is [0, 0xFF] + * \note the hwOpacity range [0, 0x100] + */ +SDL_TARGETING("arch=armv8-a+sve2") +static inline svuint16_t sdl_sve_chn_blend_with_masks_and_opacity( + svuint16_t vSource, + svuint16_t vTarget, + svuint16_t vMask0, + svuint16_t vMask1, + uint16_t hwOpacity) +{ + vMask0 = svadd_u16_m(svcmpeq_n_u16(svptrue_b16(), vMask0, 255), + vMask0, + svdup_u16(1)); + + svuint16_t vMask = + svsel(svcmpge_n_u16(svptrue_b16(), vMask1, 255), /* >= 255 */ + vMask0, + //(vMask0 * vMask1) >> 8 + svlsr_n_u16_m(svptrue_b16(), + svmul_u16_m(svptrue_b16(), vMask0, vMask1), + 8)); + + vMask = + svsel(svcmpge_n_u16(svptrue_b16(), vMask, 255), + svdup_u16(hwOpacity), + //(vMask * hwOpacity) >> 8, + svlsr_n_u16_m(svptrue_b16(), + svmul_n_u16_m(svptrue_b16(), vMask, hwOpacity), + 8)); + + // vTarget = vSource * vMask + vTarget * (256 - vMask); + svuint16_t vTemp0 = svmul_u16_m(svptrue_b16(), vSource, vMask); + svuint16_t vTemp1 = svmul_u16_m(svptrue_b16(), + vTarget, + svsub_u16_m(svptrue_b16(), + svdup_u16(256), + vMask)); + vTarget = svadd_u16_m(svptrue_b16(), vTemp0, vTemp1); + + return svlsr_n_u16_m(svptrue_b16(), vTarget, 8); // vTarget >> 8; +} + +/*! 
\note the Element range of vMask0/1 is [0, 0xFF] + */ +SDL_TARGETING("arch=armv8-a+sve2") +static inline svuint16_t sdl_sve_chn_blend_with_3masks(svuint16_t vSource, + svuint16_t vTarget, + svuint16_t vMask0, + svuint16_t vMask1, + svuint16_t vMask2) +{ + vMask0 = svadd_u16_m(svcmpeq_n_u16(svptrue_b16(), vMask0, 255), + vMask0, + svdup_u16(1)); + + svuint16_t vMask = + svsel(svcmpge_n_u16(svptrue_b16(), vMask1, 255), + vMask0, + //(vMask0 * vMask1) >> 8 + svlsr_n_u16_m(svptrue_b16(), + svmul_u16_m(svptrue_b16(), vMask0, vMask1), + 8)); + + vMask = + svsel(svcmpge_n_u16(svptrue_b16(), vMask2, 255), + vMask, + //(vMask * vMask2) >> 8 + svlsr_n_u16_m(svptrue_b16(), + svmul_u16_m(svptrue_b16(), vMask, vMask2), + 8)); + + // vTarget = vSource * vMask + vTarget * (256 - vMask); + svuint16_t vTemp0 = svmul_u16_m(svptrue_b16(), vSource, vMask); + svuint16_t vTemp1 = svmul_u16_m(svptrue_b16(), + vTarget, + svsub_u16_m(svptrue_b16(), + svdup_u16(256), + vMask)); + vTarget = svadd_u16_m(svptrue_b16(), vTemp0, vTemp1); + + return svlsr_n_u16_m(svptrue_b16(), vTarget, 8); // vTarget >> 8; +} + +/*! 
\note the Element range of vMask0/1 is [0, 0xFF] + * \note the hwOpacity range [0, 0x100] + */ +SDL_TARGETING("arch=armv8-a+sve2") +static inline svuint16_t sdl_sve_chn_blend_with_3masks_and_opacity( + svuint16_t vSource, + svuint16_t vTarget, + svuint16_t vMask0, + svuint16_t vMask1, + svuint16_t vMask2, + uint16_t hwOpacity) +{ + vMask0 = svadd_u16_m(svcmpeq_n_u16(svptrue_b16(), vMask0, 255), + vMask0, + svdup_u16(1)); + + svuint16_t vMask = + svsel(svcmpge_n_u16(svptrue_b16(), vMask1, 255), + vMask0, + //(vMask0 * vMask1) >> 8 + svlsr_n_u16_m(svptrue_b16(), + svmul_u16_m(svptrue_b16(), vMask0, vMask1), + 8)); + + vMask = + svsel(svcmpge_n_u16(svptrue_b16(), vMask2, 255), + vMask, + svlsr_n_u16_m(svptrue_b16(), + svmul_u16_m(svptrue_b16(), vMask, vMask2), + 8)); + //(vMask * vMask2) >> 8); + + vMask = + svsel(svcmpge_n_u16(svptrue_b16(), vMask, 255), + svdup_u16(hwOpacity), + //(vMask * hwOpacity) >> 8 + svlsr_n_u16_m(svptrue_b16(), + svmul_n_u16_m(svptrue_b16(), vMask, hwOpacity), + 8)); + + // vTarget = vSource * vMask + vTarget * (256 - vMask); + svuint16_t vTemp0 = svmul_u16_m(svptrue_b16(), vSource, vMask); + svuint16_t vTemp1 = svmul_u16_m(svptrue_b16(), + vTarget, + svsub_u16_m(svptrue_b16(), + svdup_u16(256), + vMask)); + vTarget = svadd_u16_m(svptrue_b16(), vTemp0, vTemp1); + + return svlsr_n_u16_m(svptrue_b16(), vTarget, 8); // vTarget >> 8; +} + +#endif /* SDL_SVE2_EXTENSION_H */ \ No newline at end of file diff --git a/src/video/arm/SDL_sve2_swizzle.h b/src/video/arm/SDL_sve2_swizzle.h new file mode 100644 index 0000000000..a2d6f978d2 --- /dev/null +++ b/src/video/arm/SDL_sve2_swizzle.h @@ -0,0 +1,2375 @@ +/* + Simple DirectMedia Layer + Copyright (C) 1997-2026 Sam Lantinga + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. 
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+*/
+
+#if !defined(SDL_SVE2_SWIZZLE_H) //&& (defined(__ARM_FEATURE_SVE2) && __ARM_FEATURE_SVE2)
+#define SDL_SVE2_SWIZZLE_H
+
+#include "SDL_sve2_extension.h"
+
+#define sdl_sve_rgb32_stride_impl(ma_sve_chn_iterator, ...) \
+    sdl_sve_stride_loop_rgb32(uStride, vTailPred) \
+    { /* Loop body for one stride of 32-bit pixels. It relies on names of \
+       * the expansion site: pwSource, pwTarget, uStride, plus vTailPred \
+       * and sve_iteration_advance, which presumably come from \
+       * sdl_sve_stride_loop_rgb32 (defined elsewhere). ma_sve_chn_iterator \
+       * is invoked on the low and on the high half of the lanes with \
+       * __VA_ARGS__ as the per-channel statements. */ \
+        \
+        svuint16x4_t vSourceLow16x4 = svundef4_u16(); \
+        svuint16x4_t vSourceHigh16x4 = svundef4_u16(); \
+        \
+        svuint16x4_t vTargetLow16x4 = svundef4_u16(); \
+        svuint16x4_t vTargetHigh16x4 = svundef4_u16(); \
+        /* load source and target, widened to 16 bits per channel */ \
+        svld4ub_u16(vTailPred, \
+                    (uint8_t *)pwSource, \
+                    &vSourceLow16x4, \
+                    &vSourceHigh16x4); \
+        \
+        svld4ub_u16(vTailPred, \
+                    (uint8_t *)pwTarget, \
+                    &vTargetLow16x4, \
+                    &vTargetHigh16x4); \
+        \
+        /* process low half */ \
+        ma_sve_chn_iterator(vSourceLow16x4, vTargetLow16x4, \
+                            __VA_ARGS__); \
+        \
+        /* process high half */ \
+        ma_sve_chn_iterator(vSourceHigh16x4, vTargetHigh16x4, \
+                            __VA_ARGS__); \
+        /* narrow back to bytes and write the target */ \
+        svst4ub_u16(vTailPred, \
+                    (uint8_t *)pwTarget, \
+                    vTargetLow16x4, \
+                    vTargetHigh16x4); \
+        \
+        pwSource += sve_iteration_advance; \
+        pwTarget += sve_iteration_advance; \
+    }
+
+#define sdl_sve_rgb32_no_alpha_stride_impl( \
+    ma_alpha_idx, \
+    ma_sve_chn_iterator, \
+    ...)
\
+    sdl_sve_stride_loop_rgb32(uStride, vTailPred) \
+    { /* Same flow as sdl_sve_rgb32_stride_impl, except that source \
+       * channel ma_alpha_idx is overwritten with 0xFF after the load. */ \
+        \
+        svuint16x4_t vSourceLow16x4 = svundef4_u16(); \
+        svuint16x4_t vSourceHigh16x4 = svundef4_u16(); \
+        \
+        svuint16x4_t vTargetLow16x4 = svundef4_u16(); \
+        svuint16x4_t vTargetHigh16x4 = svundef4_u16(); \
+        /* load source and target, widened to 16 bits per channel */ \
+        svld4ub_u16(vTailPred, \
+                    (uint8_t *)pwSource, \
+                    &vSourceLow16x4, \
+                    &vSourceHigh16x4); \
+        \
+        svld4ub_u16(vTailPred, \
+                    (uint8_t *)pwTarget, \
+                    &vTargetLow16x4, \
+                    &vTargetHigh16x4); \
+        /* the source carries no usable alpha: force it to 0xFF */ \
+        vSourceLow16x4 = svset4(vSourceLow16x4, \
+                                (ma_alpha_idx), \
+                                svdup_u16(0xFF)); \
+        vSourceHigh16x4 = svset4(vSourceHigh16x4, \
+                                 (ma_alpha_idx), \
+                                 svdup_u16(0xFF)); \
+        \
+        /* process low half */ \
+        ma_sve_chn_iterator(vSourceLow16x4, vTargetLow16x4, \
+                            __VA_ARGS__); \
+        \
+        /* process high half */ \
+        ma_sve_chn_iterator(vSourceHigh16x4, vTargetHigh16x4, \
+                            __VA_ARGS__); \
+        /* narrow back to bytes and write the target */ \
+        svst4ub_u16(vTailPred, \
+                    (uint8_t *)pwTarget, \
+                    vTargetLow16x4, \
+                    vTargetHigh16x4); \
+        \
+        pwSource += sve_iteration_advance; \
+        pwTarget += sve_iteration_advance; \
+    }
+
+#define sdl_sve_rgb32_to_rgb565_stride_impl(ma_sve_chn_iterator, ...)
\
+    sdl_sve_stride_loop_rgb32(uStride, vTailPred) \
+    { /* Loop body for one stride: 32-bit source pixels (pwSource) onto \
+       * 16-bit target pixels (phwTarget). Both pointers and uStride must \
+       * exist at the expansion site; vTailPred and sve_iteration_advance \
+       * presumably come from sdl_sve_stride_loop_rgb32 (defined elsewhere). */ \
+        \
+        svuint16x4_t vSourceLow16x4 = svundef4_u16(); \
+        svuint16x4_t vSourceHigh16x4 = svundef4_u16(); \
+        \
+        svuint16x3_t vTargetLow16x3 = svundef3_u16(); \
+        svuint16x3_t vTargetHigh16x3 = svundef3_u16(); \
+        /* source: four byte channels widened to 16 bits */ \
+        svld4ub_u16(vTailPred, \
+                    (uint8_t *)pwSource, \
+                    &vSourceLow16x4, \
+                    &vSourceHigh16x4); \
+        /* target: packed pixels unpacked into three channels */ \
+        svld3rgb565_u16(vTailPred, \
+                        phwTarget, \
+                        &vTargetLow16x3, \
+                        &vTargetHigh16x3); \
+        /* process low half */ \
+        ma_sve_chn_iterator(vSourceLow16x4, vTargetLow16x3, \
+                            __VA_ARGS__); \
+        /* process high half */ \
+        ma_sve_chn_iterator(vSourceHigh16x4, vTargetHigh16x3, \
+                            __VA_ARGS__); \
+        /* re-pack and write the target */ \
+        svst3rgb565_u16(vTailPred, \
+                        phwTarget, \
+                        vTargetLow16x3, \
+                        vTargetHigh16x3); \
+        \
+        pwSource += sve_iteration_advance; \
+        phwTarget += sve_iteration_advance; \
+    }
+
+#define sdl_sve_rgb32_no_alpha_to_rgb565_stride_impl( \
+    ma_alpha_idx, \
+    ma_sve_chn_iterator, \
+    ...) \
+    sdl_sve_stride_loop_rgb32(uStride, vTailPred) \
+    { /* Same flow as sdl_sve_rgb32_to_rgb565_stride_impl, except that \
+       * source channel ma_alpha_idx is overwritten with 0xFF after the load. */ \
+        \
+        svuint16x4_t vSourceLow16x4 = svundef4_u16(); \
+        svuint16x4_t vSourceHigh16x4 = svundef4_u16(); \
+        \
+        svuint16x3_t vTargetLow16x3 = svundef3_u16(); \
+        svuint16x3_t vTargetHigh16x3 = svundef3_u16(); \
+        /* source: four byte channels widened to 16 bits */ \
+        svld4ub_u16(vTailPred, \
+                    (uint8_t *)pwSource, \
+                    &vSourceLow16x4, \
+                    &vSourceHigh16x4); \
+        /* the source carries no usable alpha: force it to 0xFF */ \
+        vSourceLow16x4 = svset4(vSourceLow16x4, \
+                                (ma_alpha_idx), \
+                                svdup_u16(0xFF)); \
+        vSourceHigh16x4 = svset4(vSourceHigh16x4, \
+                                 (ma_alpha_idx), \
+                                 svdup_u16(0xFF)); \
+        /* target: packed pixels unpacked into three channels */ \
+        svld3rgb565_u16(vTailPred, \
+                        phwTarget, \
+                        &vTargetLow16x3, \
+                        &vTargetHigh16x3); \
+        /* process low half */ \
+        ma_sve_chn_iterator(vSourceLow16x4, vTargetLow16x3, \
+                            __VA_ARGS__); \
+        /* process high half */ \
+        ma_sve_chn_iterator(vSourceHigh16x4, vTargetHigh16x3, \
+                            __VA_ARGS__); \
+        /* re-pack and write the target */ \
+        svst3rgb565_u16(vTailPred, \
+                        phwTarget, \
+                        vTargetLow16x3, \
+                        vTargetHigh16x3); \
+        \
+        pwSource += sve_iteration_advance; \
+        phwTarget += sve_iteration_advance; \
+    }
+
+#ifndef sdl_sve_rgb32_blend_op_fill_alpha /* fallback: expands to nothing unless defined earlier */
+#define sdl_sve_rgb32_blend_op_fill_alpha(ma_alpha_chn_idx)
+#endif
+ /* Default for the copy-alpha per-channel hook: expands to nothing unless a definition already exists at this point. */ +#ifndef sdl_sve_rgb32_blend_op_copy_alpha +#define sdl_sve_rgb32_blend_op_copy_alpha(ma_alpha_chn_idx) +#endif + +/* + * Source: ACCC and CCCA. Each *_stride_* function handles uStride pixels of one row by expanding sdl_sve_rgb32_stride_impl (defined outside this chunk) with a channel iterator named after the source-to-target channel order and with the fill_alpha or copy_alpha hook. The literal 3 or 0 is passed as ma_alpha_chn_idx, presumably the tuple index of the alpha channel - TODO confirm. Each function without _stride_ in its name is a row loop around the matching kernel. + */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_accc_stride_blend_to_accc_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn, + + sdl_sve_rgb32_blend_op_fill_alpha(3); + + ); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_accc_stride_blend_to_accc_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn, + sdl_sve_rgb32_blend_op_copy_alpha(3);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_ccca_stride_blend_to_ccca_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn, + sdl_sve_rgb32_blend_op_fill_alpha(0);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_ccca_stride_blend_to_ccca_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn, + + sdl_sve_rgb32_blend_op_copy_alpha(0); + + ); +} + /* Row loop shared by every wrapper below: runs the stride kernel once per row with nWidth as the pixel count (converted from int to size_t), then steps both byte pointers by uSourceStride and uTargetStride. Relies on nHeight being non-negative, since the loop only stops when the counter reaches zero. */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_accc_blend_to_accc_fill_alpha( + uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_accc_stride_blend_to_accc_fill_alpha( + (uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void
sdl_sve_accc_blend_to_accc_copy_alpha( + uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_accc_stride_blend_to_accc_copy_alpha( + (uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_ccca_blend_to_ccca_fill_alpha( + uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_ccca_stride_blend_to_ccca_fill_alpha( + (uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_ccca_blend_to_ccca_copy_alpha( + uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_ccca_stride_blend_to_ccca_copy_alpha( + (uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + /* A123 to 321A family: src_dst_rev iterator, alpha index 3. */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_a123_stride_blend_to_321a_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn_src_dst_rev, + sdl_sve_rgb32_blend_op_fill_alpha(3); + + ); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_a123_stride_blend_to_321a_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn_src_dst_rev, + sdl_sve_rgb32_blend_op_copy_alpha(3); + + 
); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_a123_blend_to_321a_fill_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_a123_stride_blend_to_321a_fill_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_a123_blend_to_321a_copy_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_a123_stride_blend_to_321a_copy_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + /* 123A to A321 family: src_dst_rev iterator, alpha index 0. */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_123a_stride_blend_to_a321_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn_src_dst_rev, + sdl_sve_rgb32_blend_op_fill_alpha(0); + + ); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_123a_stride_blend_to_a321_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn_src_dst_rev, + sdl_sve_rgb32_blend_op_copy_alpha(0); + + ); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_123a_blend_to_a321_fill_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_123a_stride_blend_to_a321_fill_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + 
nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_123a_blend_to_a321_copy_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_123a_stride_blend_to_a321_copy_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + /* ACCC to CCCA family: accc_ccca iterator, alpha index 3. */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_accc_stride_blend_to_ccca_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn_accc_ccca, + sdl_sve_rgb32_blend_op_fill_alpha(3); + + ); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_accc_stride_blend_to_ccca_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn_accc_ccca, + sdl_sve_rgb32_blend_op_copy_alpha(3); + + ); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_accc_blend_to_ccca_fill_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_accc_stride_blend_to_ccca_fill_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_accc_blend_to_ccca_copy_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + 
sdl_sve_accc_stride_blend_to_ccca_copy_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + /* CCCA to ACCC family: ccca_accc iterator, alpha index 0. */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_ccca_stride_blend_to_accc_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn_ccca_accc, + sdl_sve_rgb32_blend_op_fill_alpha(0); + + ); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_ccca_stride_blend_to_accc_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn_ccca_accc, + sdl_sve_rgb32_blend_op_copy_alpha(0);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_ccca_blend_to_accc_fill_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_ccca_stride_blend_to_accc_fill_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_ccca_blend_to_accc_copy_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_ccca_stride_blend_to_accc_copy_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + /* A123 to A321 family: a123_a321 iterator, alpha index 3. */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_a123_stride_blend_to_a321_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + 
sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn_a123_a321, + sdl_sve_rgb32_blend_op_fill_alpha(3);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_a123_stride_blend_to_a321_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn_a123_a321, + sdl_sve_rgb32_blend_op_copy_alpha(3);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_a123_blend_to_a321_fill_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_a123_stride_blend_to_a321_fill_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_a123_blend_to_a321_copy_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_a123_stride_blend_to_a321_copy_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + /* 123A to 321A family: 123a_321a iterator, alpha index 0. */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_123a_stride_blend_to_321a_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn_123a_321a, + sdl_sve_rgb32_blend_op_fill_alpha(0);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_123a_stride_blend_to_321a_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn_123a_321a, + 
sdl_sve_rgb32_blend_op_copy_alpha(0);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_123a_blend_to_321a_fill_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_123a_stride_blend_to_321a_fill_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_123a_blend_to_321a_copy_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_123a_stride_blend_to_321a_copy_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +/* + * Source: XCCC and CCCX. These kernels expand sdl_sve_rgb32_no_alpha_stride_impl (defined outside this chunk) instead of sdl_sve_rgb32_stride_impl and pass one extra leading argument, the same index that is given to the alpha hook. Presumably that template forces the unused source channel to opaque before the iterator runs, as the RGB565 variant of the template does - TODO confirm. + */ + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_xccc_stride_blend_to_accc_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(3, + sdl_sve_pixel_u16x4_foreach_chn, + sdl_sve_rgb32_blend_op_fill_alpha(3);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_xccc_stride_blend_to_accc_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(3, + sdl_sve_pixel_u16x4_foreach_chn, + sdl_sve_rgb32_blend_op_copy_alpha(3);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_cccx_stride_blend_to_ccca_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(0, + sdl_sve_pixel_u16x4_foreach_chn, + 
sdl_sve_rgb32_blend_op_fill_alpha(0); + + ); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_cccx_stride_blend_to_ccca_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(0, + sdl_sve_pixel_u16x4_foreach_chn, + sdl_sve_rgb32_blend_op_copy_alpha(0);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_xccc_blend_to_accc_fill_alpha( + uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_xccc_stride_blend_to_accc_fill_alpha( + (uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_xccc_blend_to_accc_copy_alpha( + uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_xccc_stride_blend_to_accc_copy_alpha( + (uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_cccx_blend_to_ccca_fill_alpha( + uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_cccx_stride_blend_to_ccca_fill_alpha( + (uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_cccx_blend_to_ccca_copy_alpha( + uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + 
size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_cccx_stride_blend_to_ccca_copy_alpha( + (uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + /* X123 to 321A family: no-alpha template with index 3, src_dst_rev iterator. */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_x123_stride_blend_to_321a_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(3, + sdl_sve_pixel_u16x4_foreach_chn_src_dst_rev, + sdl_sve_rgb32_blend_op_fill_alpha(3);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_x123_stride_blend_to_321a_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(3, + sdl_sve_pixel_u16x4_foreach_chn_src_dst_rev, + sdl_sve_rgb32_blend_op_copy_alpha(3);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_x123_blend_to_321a_fill_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_x123_stride_blend_to_321a_fill_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_x123_blend_to_321a_copy_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_x123_stride_blend_to_321a_copy_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + /* 123X to A321 family: no-alpha template with index 0, src_dst_rev iterator. */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_123x_stride_blend_to_a321_fill_alpha( 
+ uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(0, + sdl_sve_pixel_u16x4_foreach_chn_src_dst_rev, + sdl_sve_rgb32_blend_op_fill_alpha(0);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_123x_stride_blend_to_a321_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(0, + sdl_sve_pixel_u16x4_foreach_chn_src_dst_rev, + sdl_sve_rgb32_blend_op_copy_alpha(0);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_123x_blend_to_a321_fill_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_123x_stride_blend_to_a321_fill_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_123x_blend_to_a321_copy_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_123x_stride_blend_to_a321_copy_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + /* XCCC to CCCA family: no-alpha template with index 3, accc_ccca iterator. */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_xccc_stride_blend_to_ccca_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(3, + sdl_sve_pixel_u16x4_foreach_chn_accc_ccca, + sdl_sve_rgb32_blend_op_fill_alpha(3);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_xccc_stride_blend_to_ccca_copy_alpha( + uint32_t *SDL_RESTRICT 
pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(3, + sdl_sve_pixel_u16x4_foreach_chn_accc_ccca, + sdl_sve_rgb32_blend_op_copy_alpha(3);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_xccc_blend_to_ccca_fill_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_xccc_stride_blend_to_ccca_fill_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_xccc_blend_to_ccca_copy_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_xccc_stride_blend_to_ccca_copy_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + /* CCCX to ACCC family: no-alpha template with index 0, ccca_accc iterator. */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_cccx_stride_blend_to_accc_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(0, + sdl_sve_pixel_u16x4_foreach_chn_ccca_accc, + sdl_sve_rgb32_blend_op_fill_alpha(0);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_cccx_stride_blend_to_accc_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(0, + sdl_sve_pixel_u16x4_foreach_chn_ccca_accc, + sdl_sve_rgb32_blend_op_copy_alpha(0);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_cccx_blend_to_accc_fill_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t 
*SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_cccx_stride_blend_to_accc_fill_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_cccx_blend_to_accc_copy_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_cccx_stride_blend_to_accc_copy_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + /* X123 to A321 family: no-alpha template with index 3, a123_a321 iterator. */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_x123_stride_blend_to_a321_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(3, + sdl_sve_pixel_u16x4_foreach_chn_a123_a321, + sdl_sve_rgb32_blend_op_fill_alpha(3);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_x123_stride_blend_to_a321_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(3, + sdl_sve_pixel_u16x4_foreach_chn_a123_a321, + sdl_sve_rgb32_blend_op_copy_alpha(3);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_x123_blend_to_a321_fill_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_x123_stride_blend_to_a321_fill_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void 
sdl_sve_x123_blend_to_a321_copy_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_x123_stride_blend_to_a321_copy_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + /* 123X to 321A family: no-alpha template with index 0, 123a_321a iterator. */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_123x_stride_blend_to_321a_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(0, + sdl_sve_pixel_u16x4_foreach_chn_123a_321a, + sdl_sve_rgb32_blend_op_fill_alpha(0);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_123x_stride_blend_to_321a_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + sdl_sve_rgb32_no_alpha_stride_impl(0, + sdl_sve_pixel_u16x4_foreach_chn_123a_321a, + sdl_sve_rgb32_blend_op_copy_alpha(0);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_123x_blend_to_321a_fill_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_123x_stride_blend_to_321a_fill_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_123x_blend_to_321a_copy_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_123x_stride_blend_to_321a_copy_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +}
+ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1) +static inline void sdl_sve_8888_to_8888_swizzle_dispatcher(SDL_BlitInfo *info) +{ + int width = info->dst_w; + int height = info->dst_h; + uint8_t *src = info->src; + int srcskip = info->src_skip; + uint8_t *dst = info->dst; + int dstskip = info->dst_skip; + + const SDL_PixelFormatDetails *srcfmt = info->src_fmt; + const SDL_PixelFormatDetails *dstfmt = info->dst_fmt; + + // Set up some basic variables + int srcbpp = srcfmt->bytes_per_pixel; + int dstbpp = dstfmt->bytes_per_pixel; + + assert((srcbpp == 4) && (dstbpp == 4)); + + bool fill_alpha = (!dstfmt->Amask); + + int srcstride = srcskip + srcbpp * width; + int dststride = dstskip + dstbpp * width; + + switch (srcfmt->format) { + case SDL_PIXELFORMAT_XRGB8888: + switch (dstfmt->format) { + case SDL_PIXELFORMAT_ARGB8888: + case SDL_PIXELFORMAT_XRGB8888: + if (fill_alpha) { + sdl_sve_xccc_blend_to_accc_fill_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_xccc_blend_to_accc_copy_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_RGBA8888: + case SDL_PIXELFORMAT_RGBX8888: + if (fill_alpha) { + sdl_sve_xccc_blend_to_ccca_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_xccc_blend_to_ccca_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_ABGR8888: + case SDL_PIXELFORMAT_XBGR8888: + if (fill_alpha) { + sdl_sve_x123_blend_to_a321_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_x123_blend_to_a321_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_BGRA8888: + case SDL_PIXELFORMAT_BGRX8888: + if (fill_alpha) { + sdl_sve_x123_blend_to_321a_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_x123_blend_to_321a_copy_alpha(src, + srcstride, + dst, + 
dststride, + width, + height); + } + break; + default: + assert(false); + break; + } + break; + + case SDL_PIXELFORMAT_ARGB8888: + switch (dstfmt->format) { + case SDL_PIXELFORMAT_ARGB8888: + case SDL_PIXELFORMAT_XRGB8888: + if (fill_alpha) { + sdl_sve_accc_blend_to_accc_fill_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_accc_blend_to_accc_copy_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_RGBA8888: + case SDL_PIXELFORMAT_RGBX8888: + if (fill_alpha) { + sdl_sve_accc_blend_to_ccca_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_accc_blend_to_ccca_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_ABGR8888: + case SDL_PIXELFORMAT_XBGR8888: + if (fill_alpha) { + sdl_sve_a123_blend_to_a321_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_a123_blend_to_a321_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_BGRA8888: + case SDL_PIXELFORMAT_BGRX8888: + if (fill_alpha) { + sdl_sve_a123_blend_to_321a_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_a123_blend_to_321a_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + default: + assert(false); + break; + } + break; + + case SDL_PIXELFORMAT_RGBX8888: + switch (dstfmt->format) { + case SDL_PIXELFORMAT_ARGB8888: + case SDL_PIXELFORMAT_XRGB8888: + if (fill_alpha) { + sdl_sve_cccx_blend_to_accc_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_cccx_blend_to_accc_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_RGBA8888: + case SDL_PIXELFORMAT_RGBX8888: + if (fill_alpha) { + sdl_sve_cccx_blend_to_ccca_fill_alpha( + src, + srcstride, + dst, + dststride, + width, + 
height); + } else { + sdl_sve_cccx_blend_to_ccca_copy_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_ABGR8888: + case SDL_PIXELFORMAT_XBGR8888: + if (fill_alpha) { + sdl_sve_123x_blend_to_a321_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_123x_blend_to_a321_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_BGRA8888: + case SDL_PIXELFORMAT_BGRX8888: + if (fill_alpha) { + sdl_sve_123x_blend_to_321a_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_123x_blend_to_321a_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + default: + assert(false); + break; + } + break; + + case SDL_PIXELFORMAT_RGBA8888: + switch (dstfmt->format) { + case SDL_PIXELFORMAT_ARGB8888: + case SDL_PIXELFORMAT_XRGB8888: + if (fill_alpha) { + sdl_sve_ccca_blend_to_accc_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_ccca_blend_to_accc_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_RGBA8888: + case SDL_PIXELFORMAT_RGBX8888: + if (fill_alpha) { + sdl_sve_ccca_blend_to_ccca_fill_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_ccca_blend_to_ccca_copy_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_ABGR8888: + case SDL_PIXELFORMAT_XBGR8888: + if (fill_alpha) { + sdl_sve_123a_blend_to_a321_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_123a_blend_to_a321_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_BGRA8888: + case SDL_PIXELFORMAT_BGRX8888: + if (fill_alpha) { + sdl_sve_123a_blend_to_321a_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + 
sdl_sve_123a_blend_to_321a_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + default: + assert(false); + break; + } + break; + + case SDL_PIXELFORMAT_XBGR8888: + switch (dstfmt->format) { + case SDL_PIXELFORMAT_ARGB8888: + case SDL_PIXELFORMAT_XRGB8888: + if (fill_alpha) { + sdl_sve_x123_blend_to_a321_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_x123_blend_to_a321_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_RGBA8888: + case SDL_PIXELFORMAT_RGBX8888: + if (fill_alpha) { + sdl_sve_x123_blend_to_321a_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_x123_blend_to_321a_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_ABGR8888: + case SDL_PIXELFORMAT_XBGR8888: + if (fill_alpha) { + sdl_sve_xccc_blend_to_accc_fill_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_xccc_blend_to_accc_copy_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_BGRA8888: + case SDL_PIXELFORMAT_BGRX8888: + if (fill_alpha) { + sdl_sve_xccc_blend_to_ccca_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_xccc_blend_to_ccca_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + default: + assert(false); + break; + } + break; + + case SDL_PIXELFORMAT_ABGR8888: + switch (dstfmt->format) { + case SDL_PIXELFORMAT_ARGB8888: + case SDL_PIXELFORMAT_XRGB8888: + if (fill_alpha) { + sdl_sve_a123_blend_to_a321_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_a123_blend_to_a321_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_RGBA8888: + case SDL_PIXELFORMAT_RGBX8888: + if (fill_alpha) { + 
sdl_sve_a123_blend_to_321a_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_a123_blend_to_321a_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_ABGR8888: + case SDL_PIXELFORMAT_XBGR8888: + if (fill_alpha) { + sdl_sve_accc_blend_to_accc_fill_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_accc_blend_to_accc_copy_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_BGRA8888: + case SDL_PIXELFORMAT_BGRX8888: + if (fill_alpha) { + sdl_sve_accc_blend_to_ccca_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_accc_blend_to_ccca_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + default: + assert(false); + break; + } + break; + + case SDL_PIXELFORMAT_BGRX8888: + switch (dstfmt->format) { + case SDL_PIXELFORMAT_ARGB8888: + case SDL_PIXELFORMAT_XRGB8888: + if (fill_alpha) { + sdl_sve_123x_blend_to_a321_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_123x_blend_to_a321_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_RGBA8888: + case SDL_PIXELFORMAT_RGBX8888: + if (fill_alpha) { + sdl_sve_123x_blend_to_321a_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_123x_blend_to_321a_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_ABGR8888: + case SDL_PIXELFORMAT_XBGR8888: + if (fill_alpha) { + sdl_sve_cccx_blend_to_accc_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_cccx_blend_to_accc_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_BGRA8888: + case SDL_PIXELFORMAT_BGRX8888: + if (fill_alpha) { + 
sdl_sve_cccx_blend_to_ccca_fill_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_cccx_blend_to_ccca_copy_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } + break; + default: + assert(false); + break; + } + break; + + case SDL_PIXELFORMAT_BGRA8888: + switch (dstfmt->format) { + case SDL_PIXELFORMAT_ARGB8888: + case SDL_PIXELFORMAT_XRGB8888: + if (fill_alpha) { + sdl_sve_123a_blend_to_a321_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_123a_blend_to_a321_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_RGBA8888: + case SDL_PIXELFORMAT_RGBX8888: + if (fill_alpha) { + sdl_sve_123a_blend_to_321a_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_123a_blend_to_321a_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_ABGR8888: + case SDL_PIXELFORMAT_XBGR8888: + if (fill_alpha) { + sdl_sve_ccca_blend_to_accc_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_ccca_blend_to_accc_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_BGRA8888: + case SDL_PIXELFORMAT_BGRX8888: + if (fill_alpha) { + sdl_sve_ccca_blend_to_ccca_fill_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_ccca_blend_to_ccca_copy_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } + break; + default: + assert(false); + break; + } + break; + + default: + assert(false); + break; + } +} +/* Fallback: if the including file has not defined a blend op, define one that expands to nothing; its alpha-channel-index argument is then ignored. */ +#ifndef sdl_sve_rgb32_blend_to_rgb565_op +#define sdl_sve_rgb32_blend_to_rgb565_op(ma_alpha_chn_idx) +#endif + +/* + * ACCC or CCCA + */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_argb8888_stride_blend_to_rgb565(uint32_t *SDL_RESTRICT pwSource, + uint16_t *SDL_RESTRICT phwTarget, + 
size_t uStride) +{ + sdl_sve_rgb32_to_rgb565_stride_impl( + sdl_sve_pixel_u16x4_foreach_chn_argb_rgb565, + { + sdl_sve_rgb32_blend_to_rgb565_op(3); + }); +} +/* Row loop for ARGB8888 sources: calls the ARGB8888 row kernel once per row, nHeight times. Both strides are added to uint8_t pointers, so they are byte counts. */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_argb8888_blend_to_rgb565(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + /* nWidth is forwarded as the row kernel's uStride argument (presumably a pixel count - confirm in the stride template). */ + sdl_sve_argb8888_stride_blend_to_rgb565((uint32_t *)pchSource, + (uint16_t *)pchTarget, + nWidth); + /* Step both byte pointers to the start of the next row. */ + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} +/* Row kernel for RGBA8888 sources: expands the shared stride template with the rgba channel iterator and passes 0 as the blend op's alpha channel index. */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_rgba8888_stride_blend_to_rgb565(uint32_t *SDL_RESTRICT pwSource, + uint16_t *SDL_RESTRICT phwTarget, + size_t uStride) +{ + sdl_sve_rgb32_to_rgb565_stride_impl( + sdl_sve_pixel_u16x4_foreach_chn_rgba_rgb565, + { + sdl_sve_rgb32_blend_to_rgb565_op(0); + }); +} +/* Row loop for RGBA8888 sources: one row-kernel call per row, then both byte pointers advance by their strides. */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_rgba8888_blend_to_rgb565(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_rgba8888_stride_blend_to_rgb565((uint32_t *)pchSource, + (uint16_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} +/* Row kernel for BGRA8888 sources: same stride template, bgra channel iterator, alpha channel index 0. */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_bgra8888_stride_blend_to_rgb565(uint32_t *SDL_RESTRICT pwSource, + uint16_t *SDL_RESTRICT phwTarget, + size_t uStride) +{ + sdl_sve_rgb32_to_rgb565_stride_impl( + sdl_sve_pixel_u16x4_foreach_chn_bgra_rgb565, + { + sdl_sve_rgb32_blend_to_rgb565_op(0); + }); +} +/* Row loop for BGRA8888 sources: one row-kernel call per row, nHeight times; strides are byte counts. */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_bgra8888_blend_to_rgb565(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int 
nHeight) +{ + while (nHeight--) { + /* nWidth is forwarded as the row kernel's uStride argument. */ + sdl_sve_bgra8888_stride_blend_to_rgb565((uint32_t *)pchSource, + (uint16_t *)pchTarget, + nWidth); + /* Step both byte pointers to the start of the next row. */ + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} +/* Row kernel for ABGR8888 sources: expands the shared stride template with the abgr channel iterator and passes 3 as the blend op's alpha channel index. */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_abgr8888_stride_blend_to_rgb565(uint32_t *SDL_RESTRICT pwSource, + uint16_t *SDL_RESTRICT phwTarget, + size_t uStride) +{ + sdl_sve_rgb32_to_rgb565_stride_impl( + sdl_sve_pixel_u16x4_foreach_chn_abgr_rgb565, + { + sdl_sve_rgb32_blend_to_rgb565_op(3); + }); +} +/* Row loop for ABGR8888 sources: one row-kernel call per row, nHeight times; strides are added to uint8_t pointers, so they are byte counts. */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_abgr8888_blend_to_rgb565(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_abgr8888_stride_blend_to_rgb565((uint32_t *)pchSource, + (uint16_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +/* + * XCCC or CCCX: the row kernels below use the no_alpha stride template, whose extra first argument repeats the index that is passed to the blend op + */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_xrgb8888_stride_blend_to_rgb565(uint32_t *SDL_RESTRICT pwSource, + uint16_t *SDL_RESTRICT phwTarget, + size_t uStride) +{ + sdl_sve_rgb32_no_alpha_to_rgb565_stride_impl( + 3, + sdl_sve_pixel_u16x4_foreach_chn_argb_rgb565, + { + sdl_sve_rgb32_blend_to_rgb565_op(3); + }); +} +/* Row loop for XRGB8888 sources: one row-kernel call per row, nHeight times; strides are byte counts. */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_xrgb8888_blend_to_rgb565(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_xrgb8888_stride_blend_to_rgb565((uint32_t *)pchSource, + (uint16_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} +/* Row kernel for RGBX8888 sources (no_alpha stride template; see the arguments that follow). */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_rgbx8888_stride_blend_to_rgb565(uint32_t *SDL_RESTRICT pwSource, + 
uint16_t *SDL_RESTRICT phwTarget, + size_t uStride) +{ + sdl_sve_rgb32_no_alpha_to_rgb565_stride_impl( + 0, + sdl_sve_pixel_u16x4_foreach_chn_rgba_rgb565, + { + sdl_sve_rgb32_blend_to_rgb565_op(0); + }); +} +/* Row loop for RGBX8888 sources: calls the RGBX8888 row kernel once per row, nHeight times. Both strides are added to uint8_t pointers, so they are byte counts. */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_rgbx8888_blend_to_rgb565(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + /* nWidth is forwarded as the row kernel's uStride argument (presumably a pixel count - confirm in the stride template). */ + sdl_sve_rgbx8888_stride_blend_to_rgb565((uint32_t *)pchSource, + (uint16_t *)pchTarget, + nWidth); + /* Step both byte pointers to the start of the next row. */ + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} +/* Row kernel for BGRX8888 sources: no_alpha stride template with index 0, the bgra channel iterator, and 0 passed to the blend op. */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_bgrx8888_stride_blend_to_rgb565(uint32_t *SDL_RESTRICT pwSource, + uint16_t *SDL_RESTRICT phwTarget, + size_t uStride) +{ + sdl_sve_rgb32_no_alpha_to_rgb565_stride_impl( + 0, + sdl_sve_pixel_u16x4_foreach_chn_bgra_rgb565, + { + sdl_sve_rgb32_blend_to_rgb565_op(0); + }); +} +/* Row loop for BGRX8888 sources: one row-kernel call per row, nHeight times; strides are byte counts. */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_bgrx8888_blend_to_rgb565(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_bgrx8888_stride_blend_to_rgb565((uint32_t *)pchSource, + (uint16_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} +/* Row kernel for XBGR8888 sources: no_alpha stride template with index 3, the abgr channel iterator, and 3 passed to the blend op. */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_xbgr8888_stride_blend_to_rgb565(uint32_t *SDL_RESTRICT pwSource, + uint16_t *SDL_RESTRICT phwTarget, + size_t uStride) +{ + sdl_sve_rgb32_no_alpha_to_rgb565_stride_impl( + 3, + sdl_sve_pixel_u16x4_foreach_chn_abgr_rgb565, + { + sdl_sve_rgb32_blend_to_rgb565_op(3); + }); +} +/* Row loop for XBGR8888 sources (parameter list and body follow). */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_xbgr8888_blend_to_rgb565(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + 
uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_xbgr8888_stride_blend_to_rgb565((uint32_t *)pchSource, + (uint16_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} +/* Dispatcher for 32-bit source to 16-bit destination blits: reads the geometry and pixel formats from info, derives the byte pitch of each surface, and calls the row-loop routine that matches the source format. */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1) +static inline void sdl_sve_rgb32_to_rgb565_swizzle_dispatcher(SDL_BlitInfo *info) +{ + int width = info->dst_w; + int height = info->dst_h; + uint8_t *src = info->src; + int srcskip = info->src_skip; + uint8_t *dst = info->dst; + int dstskip = info->dst_skip; + /* Pixel format descriptions; dstfmt is only consulted for its bytes_per_pixel. */ + const SDL_PixelFormatDetails *srcfmt = info->src_fmt; + const SDL_PixelFormatDetails *dstfmt = info->dst_fmt; + + // Set up some basic variables + int srcbpp = srcfmt->bytes_per_pixel; + int dstbpp = dstfmt->bytes_per_pixel; + /* Only 4-byte source pixels and 2-byte destination pixels are handled here. */ + assert(srcbpp == 4); + assert(dstbpp == 2); + /* Row pitch in bytes = the pixel bytes of one row plus the per-row skip. */ + int srcstride = srcskip + srcbpp * width; + int dststride = dstskip + dstbpp * width; + /* The routine is selected by the source format alone. */ + switch (srcfmt->format) { + case SDL_PIXELFORMAT_XRGB8888: + sdl_sve_xrgb8888_blend_to_rgb565(src, + srcstride, + dst, + dststride, + width, + height); + break; + + case SDL_PIXELFORMAT_ARGB8888: + sdl_sve_argb8888_blend_to_rgb565(src, + srcstride, + dst, + dststride, + width, + height); + break; + + case SDL_PIXELFORMAT_RGBX8888: + sdl_sve_rgbx8888_blend_to_rgb565(src, + srcstride, + dst, + dststride, + width, + height); + break; + + case SDL_PIXELFORMAT_RGBA8888: + sdl_sve_rgba8888_blend_to_rgb565(src, + srcstride, + dst, + dststride, + width, + height); + break; + + case SDL_PIXELFORMAT_XBGR8888: + sdl_sve_xbgr8888_blend_to_rgb565(src, + srcstride, + dst, + dststride, + width, + height); + break; + + case SDL_PIXELFORMAT_ABGR8888: + sdl_sve_abgr8888_blend_to_rgb565(src, + srcstride, + dst, + dststride, + width, + height); + break; + + case SDL_PIXELFORMAT_BGRX8888: + sdl_sve_bgrx8888_blend_to_rgb565(src, + srcstride, + dst, + dststride, + width, + height); + break; + + case SDL_PIXELFORMAT_BGRA8888: + 
sdl_sve_bgra8888_blend_to_rgb565(src, + srcstride, + dst, + dststride, + width, + height); + break; + /* Any other source format trips the assert; when asserts are compiled out the destination is left untouched. */ + default: + assert(false); + break; + } +} + +#endif /* SD_SVE2_SWIZZLE_H */ \ No newline at end of file diff --git a/src/video/arm/SDL_sve2_util.h b/src/video/arm/SDL_sve2_util.h new file mode 100644 index 0000000000..2a1602b432 --- /dev/null +++ b/src/video/arm/SDL_sve2_util.h @@ -0,0 +1,206 @@ +/* + Simple DirectMedia Layer + Copyright (C) 1997-2026 Sam Lantinga + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. 
+*/ + +#ifndef SDL_SVE2_UTIL_H +#define SDL_SVE2_UTIL_H +/* Clear any definitions of these helper names made before this header, so the ones below take effect. */ +#undef SVE_0_CONNECT2 +#undef SVE_0_CONNECT3 +#undef SVE_0_CONNECT4 +#undef SVE_0_CONNECT5 +#undef SVE_0_CONNECT6 +#undef SVE_0_CONNECT7 +#undef SVE_0_CONNECT8 +#undef SVE_0_CONNECT9 + +#undef SVE_CONNECT2 +#undef SVE_CONNECT3 +#undef SVE_CONNECT4 +#undef SVE_CONNECT5 +#undef SVE_CONNECT6 +#undef SVE_CONNECT7 +#undef SVE_CONNECT8 +#undef SVE_CONNECT9 +#undef ALT_SVE_CONNECT2 + +#undef SVE_SAFE_NAME + +#undef SVE_CONNECT +/* SVE_0_CONNECTn: paste n arguments together exactly as written; operands of ## are not macro-expanded first. */ +#define SVE_0_CONNECT2(ma_A, ma_B) ma_A##ma_B +#define SVE_0_CONNECT3(ma_A, ma_B, ma_C) ma_A##ma_B##ma_C +#define SVE_0_CONNECT4(ma_A, ma_B, ma_C, ma_D) ma_A##ma_B##ma_C##ma_D +#define SVE_0_CONNECT5(ma_A, ma_B, ma_C, ma_D, ma_E) \ + ma_A##ma_B##ma_C##ma_D##ma_E +#define SVE_0_CONNECT6(ma_A, ma_B, ma_C, ma_D, ma_E, ma_F) \ + ma_A##ma_B##ma_C##ma_D##ma_E##ma_F +#define SVE_0_CONNECT7(ma_A, ma_B, ma_C, ma_D, ma_E, ma_F, ma_G) \ + ma_A##ma_B##ma_C##ma_D##ma_E##ma_F##ma_G +#define SVE_0_CONNECT8(ma_A, ma_B, ma_C, ma_D, ma_E, ma_F, ma_G, ma_H) \ + ma_A##ma_B##ma_C##ma_D##ma_E##ma_F##ma_G##ma_H +#define SVE_0_CONNECT9(ma_A, ma_B, ma_C, ma_D, ma_E, ma_F, ma_G, ma_H, ma_I) \ + ma_A##ma_B##ma_C##ma_D##ma_E##ma_F##ma_G##ma_H##ma_I +/* SVE_CONNECTn and ALT_SVE_CONNECT2: forward to SVE_0_CONNECTn, so macro arguments are expanded before they are pasted. */ +#define ALT_SVE_CONNECT2(ma_A, ma_B) SVE_0_CONNECT2(ma_A, ma_B) +#define SVE_CONNECT2(ma_A, ma_B) SVE_0_CONNECT2(ma_A, ma_B) +#define SVE_CONNECT3(ma_A, ma_B, ma_C) SVE_0_CONNECT3(ma_A, ma_B, ma_C) +#define SVE_CONNECT4(ma_A, ma_B, ma_C, ma_D) \ + SVE_0_CONNECT4(ma_A, ma_B, ma_C, ma_D) +#define SVE_CONNECT5(ma_A, ma_B, ma_C, ma_D, ma_E) \ + SVE_0_CONNECT5(ma_A, ma_B, ma_C, ma_D, ma_E) +#define SVE_CONNECT6(ma_A, ma_B, ma_C, ma_D, ma_E, ma_F) \ + SVE_0_CONNECT6(ma_A, ma_B, ma_C, ma_D, ma_E, ma_F) +#define SVE_CONNECT7(ma_A, ma_B, ma_C, ma_D, ma_E, ma_F, ma_G) \ + SVE_0_CONNECT7(ma_A, ma_B, ma_C, ma_D, ma_E, ma_F, ma_G) +#define SVE_CONNECT8(ma_A, ma_B, ma_C, ma_D, ma_E, ma_F, ma_G, ma_H) \ + SVE_0_CONNECT8(ma_A, ma_B, ma_C, ma_D, ma_E, ma_F, ma_G, ma_H) +#define 
SVE_CONNECT9(ma_A, ma_B, ma_C, ma_D, ma_E, ma_F, ma_G, ma_H, ma_I) \ + SVE_0_CONNECT9(ma_A, ma_B, ma_C, ma_D, ma_E, ma_F, ma_G, ma_H, ma_I) +/* SVE_CONNECT(...): count the arguments and forward to the matching SVE_CONNECTn (n = 2..9 are defined). */ +#define SVE_CONNECT(...) \ + ALT_SVE_CONNECT2(SVE_CONNECT, \ + SVE_VA_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) +/* Argument counter for 0..16 arguments: the real arguments push the descending number list to the right, so the value landing on ma_N is the count. */ +#ifndef SVE_VA_NUM_ARGS_IMPL +#define SVE_VA_NUM_ARGS_IMPL(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, \ + _12, _13, _14, _15, _16, ma_N, ...) ma_N +#endif +/* Uses the GNU ", ##__VA_ARGS__" extension so that an empty argument list counts as 0. */ +#ifndef SVE_VA_NUM_ARGS +#define SVE_VA_NUM_ARGS(...) \ + SVE_VA_NUM_ARGS_IMPL(0, ##__VA_ARGS__, 16, 15, 14, 13, 12, 11, 10, 9, \ + 8, 7, 6, 5, 4, 3, 2, 1, 0) +#endif +/* SVE_SAFE_NAME(x): helper-local identifier ma_<x><line>. Every use inside one macro expansion sees the same __LINE__, so the name is stable within a helper and differs between helpers invoked on different lines. */ +#define SVE_SAFE_NAME(ma_NAME) SVE_CONNECT3(ma_, ma_NAME, __LINE__) + +/* ---------------------------------------------------------------------------* + * SVE Test Helper * + * ---------------------------------------------------------------------------*/ +/* SVT_PRINT_VECTOR(vec, element type, printf format): store the whole vector to a byte buffer and print it element by element. Each element is narrowed to int before printing, so the format must be an int conversion. */ +#define SVT_PRINT_VECTOR(ma_VECTOR, ma_ELEMENT_T, ma_FORMAT_STRING) \ + do { \ + size_t nElementCount = svcntb_pat(SV_ALL) / sizeof(ma_ELEMENT_T); \ + uint8_t SVE_SAFE_NAME(chVectorBuffer) \ + [nElementCount * sizeof(ma_ELEMENT_T)]; \ + /* size_t count: a 1024- or 2048-bit vector of bytes has 128 or 256 elements, which does not fit an 8-bit counter */ \ + svst1_u8(svptrue_b8(), \ + SVE_SAFE_NAME(chVectorBuffer), \ + svreinterpret_u8(ma_VECTOR)); \ + /* walk the stored bytes as ma_ELEMENT_T values */ \ + ma_ELEMENT_T *pElement = (ma_ELEMENT_T *)SVE_SAFE_NAME(chVectorBuffer); \ + printf("%s\t[", #ma_VECTOR); \ + do { \ + printf(ma_FORMAT_STRING "\t", (int)*pElement++); \ + } while (--nElementCount); \ + printf("]\r\n"); \ + } while (0) + +#define SVT_INIT_VECOTR(ma_VECTOR, ma_ELEMENT_T, ...) 
\ + do { \ + uint8_t SVE_SAFE_NAME(chVectorBuffer)[svcntb_pat(SV_ALL)]; \ + /* start from all-zero bytes, then overlay the listed initialisers, never copying more than the buffer holds; MIN is not defined in this header and must come from the includer */ \ + memset(SVE_SAFE_NAME(chVectorBuffer), /* This should NOT be SDL_memset() */ \ + 0, \ + sizeof(SVE_SAFE_NAME(chVectorBuffer))); \ + memcpy(SVE_SAFE_NAME(chVectorBuffer), /* This should NOT be SDL_memcpy() */ \ + (ma_ELEMENT_T[]){ __VA_ARGS__ }, \ + MIN(sizeof(SVE_SAFE_NAME(chVectorBuffer)), \ + sizeof((ma_ELEMENT_T[]){ __VA_ARGS__ }))); \ + /* load the whole buffer into ma_VECTOR with an all-true predicate */ \ + ma_VECTOR = svld1(svptrue_b8(), \ + (ma_ELEMENT_T *)SVE_SAFE_NAME(chVectorBuffer)); \ + } while (0) +/* SVT_INIT_PRED(pred, bytes...): build an svbool_t from raw bytes via a zeroed buffer. NOTE(review): the buffer holds svlen(svundef_u64()) bytes, presumably the predicate size (one bit per vector byte) - confirm against the ACLE. */ +#define SVT_INIT_PRED(ma_PREDICT, ...) \ + do { \ + uint8_t SVE_SAFE_NAME(chBuffer)[svlen(svundef_u64())]; \ + memset(SVE_SAFE_NAME(chBuffer), /* This should NOT be SDL_memset() */ \ + 0, \ + sizeof(SVE_SAFE_NAME(chBuffer))); \ + /* overlay the listed bytes, truncated to the buffer size */ \ + memcpy(SVE_SAFE_NAME(chBuffer), /* This should NOT be SDL_memcpy() */ \ + (uint8_t[]){ __VA_ARGS__ }, \ + MIN(sizeof(SVE_SAFE_NAME(chBuffer)), \ + sizeof((uint8_t[]){ __VA_ARGS__ }))); \ + /* read the byte buffer back as a predicate */ \ + ma_PREDICT = (*(svbool_t *)SVE_SAFE_NAME(chBuffer)); \ + } while (0) +/* SVT_PRINT_PRED(pred, element type): store the predicate into a zeroed uint16_t buffer and print True/False for the lowest bit of every sizeof(element type)-bit group, 16 predicate bits per outer iteration. */ +#define SVT_PRINT_PRED(ma_PREDICT, ma_TYPE_T) \ + do { \ + printf("%8s\t[", #ma_PREDICT); \ + uint16_t SVE_SAFE_NAME(hwBuffer)[svlen(svundef_u64()) / 2]; \ + memset(SVE_SAFE_NAME(hwBuffer), /* This should NOT be SDL_memset() */ \ + 0, \ + sizeof(SVE_SAFE_NAME(hwBuffer))); \ + *(volatile svbool_t *)SVE_SAFE_NAME(hwBuffer) = (ma_PREDICT); \ + /* nTotalBits starts at the u8 lane count; nElementBits is the group width in predicate bits */ \ + uint_fast16_t SVE_SAFE_NAME(nTotalBits) = svlen(svundef_u8()); \ + uint_fast8_t SVE_SAFE_NAME(nElementBits) = sizeof(ma_TYPE_T); \ + \ + uint16_t *phwPred = SVE_SAFE_NAME(hwBuffer); \ + do { \ + uint16_t hwPred = *phwPred++; \ + /* consume this 16-bit word one element group at a time */ \ + for (uint_fast8_t n = 0; \ + n < 16; \ + n += SVE_SAFE_NAME(nElementBits)) { \ + \ + if (hwPred & 0x01) { \ + printf("True "); \ + } else { \ + printf("False"); \ + } \ + printf("%*s\t", (int)sizeof(ma_TYPE_T) - 1, ""); \ + hwPred >>= SVE_SAFE_NAME(nElementBits); \ + } \ + \ + SVE_SAFE_NAME(nTotalBits) -= 16; \ + } while (SVE_SAFE_NAME(nTotalBits)); \ + \ + printf("]\r\n"); \ + } while 
(0) +/* SVT_PRINT_BUFFER(ptr, size in bytes, element type, printf format, elements per row): print size / sizeof(type) elements, starting a new tab-indented row after every ma_STRIDE elements. Prints only the header line when the element count is 0. The locals are not name-mangled, so do not pass arguments named pBuffer, nElementCount, nStrideSize or nLineCount. */ +#define SVT_PRINT_BUFFER(ma_BUFF_PTR, ma_SIZE, ma_TYPE_T, ma_FMT_STR, ma_STRIDE) \ + do { \ + ma_TYPE_T *pBuffer = (ma_TYPE_T *)(ma_BUFF_PTR); \ + size_t nElementCount = (ma_SIZE) / sizeof(ma_TYPE_T); \ + \ + size_t nStrideSize = (ma_STRIDE); \ + size_t nLineCount = 0; \ + \ + printf("%s\n\t", #ma_BUFF_PTR); \ + for (; nElementCount != 0; nElementCount--) { /* pre-test loop: a zero count must not wrap the size_t counter */ \ + \ + printf(ma_FMT_STR " ", *pBuffer++); \ + nLineCount++; \ + if (nLineCount >= nStrideSize) { \ + nLineCount = 0; \ + printf("\n\t"); \ + } \ + \ + } \ + printf("\n"); \ + \ + } while (0) + +#endif /* SDL_SVE2_UTIL_H */ \ No newline at end of file diff --git a/test/testplatform.c b/test/testplatform.c index 4e79f6326c..d42c72d10f 100644 --- a/test/testplatform.c +++ b/test/testplatform.c @@ -414,6 +414,7 @@ static int TestCPUInfo(bool verbose) SDL_Log("NEON %s", SDL_HasNEON() ? "detected" : "not detected"); SDL_Log("LSX %s", SDL_HasLSX() ? "detected" : "not detected"); SDL_Log("LASX %s", SDL_HasLASX() ? "detected" : "not detected"); + SDL_Log("SVE2 %s", SDL_HasSVE2() ? "detected" : "not detected"); SDL_Log("System RAM %d MB", SDL_GetSystemRAM()); SDL_Log("System memory page size %d bytes", SDL_GetSystemPageSize()); }