diff --git a/Android.mk b/Android.mk index 2e3b11483c..d53bf403b1 100644 --- a/Android.mk +++ b/Android.mk @@ -84,6 +84,7 @@ LOCAL_SRC_FILES := \ $(wildcard $(LOCAL_PATH)/src/tray/*.c) \ $(wildcard $(LOCAL_PATH)/src/video/*.c) \ $(wildcard $(LOCAL_PATH)/src/video/android/*.c) \ + $(wildcard $(LOCAL_PATH)/src/video/arm/*.c) \ $(wildcard $(LOCAL_PATH)/src/video/yuv2rgb/*.c)) LOCAL_CFLAGS += -DGL_GLEXT_PROTOTYPES diff --git a/CMakeLists.txt b/CMakeLists.txt index e591a011c1..851e11add9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -314,6 +314,7 @@ dep_option(SDL_SSE4_2 "Use SSE4.2 assembly routines" ON "SDL_ASSEMB dep_option(SDL_MMX "Use MMX assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_X86 OR SDL_CPU_X64" OFF) dep_option(SDL_ALTIVEC "Use Altivec assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_POWERPC32 OR SDL_CPU_POWERPC64" OFF) dep_option(SDL_ARMNEON "Use NEON assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_ARM32 OR SDL_CPU_ARM64" OFF) +dep_option(SDL_ARMSVE2 "Use SVE2 assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_ARM64" OFF) dep_option(SDL_LSX "Use LSX assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_LOONGARCH64" OFF) dep_option(SDL_LASX "Use LASX assembly routines" ON "SDL_ASSEMBLY;SDL_CPU_LOONGARCH64" OFF) @@ -939,6 +940,37 @@ if(SDL_ASSEMBLY) endif() endif() + if(SDL_ARMSVE2) + cmake_push_check_state() + string(APPEND CMAKE_REQUIRED_FLAGS " -march=armv8-a+sve2") + check_arm_source_compiles([==[ + #include <arm_sve.h> + svuint32_t sve2_test(svuint32_t a, svuint32_t b) { + return svadd_u32_x(svptrue_b32(), a, b); + } + int main(int argc, char *argv[]) { + sve2_test(svdup_u32(0), svdup_u32(0)); + return 0; + }]==] COMPILER_SUPPORTS_ARMSVE2) + if(COMPILER_SUPPORTS_ARMSVE2) + set(HAVE_ARMSVE2 TRUE) + endif() + cmake_pop_check_state() + + if(HAVE_ARMSVE2) + sdl_sources( + "${SDL3_SOURCE_DIR}/src/video/arm/SDL_sve2_blit_A.c" + "${SDL3_SOURCE_DIR}/src/video/arm/SDL_sve2_blit_N.c" + ) + set_source_files_properties( + "${SDL3_SOURCE_DIR}/src/video/arm/SDL_sve2_blit_A.c" +
"${SDL3_SOURCE_DIR}/src/video/arm/SDL_sve2_blit_N.c" + PROPERTIES + SKIP_PRECOMPILE_HEADERS ON + ) + endif() + endif() + if(USE_GCC OR USE_CLANG) # TODO: Those all seem to be quite GCC specific - needs to be # reworked for better compiler support @@ -1055,6 +1087,10 @@ if(NOT HAVE_ARMNEON) set(SDL_DISABLE_NEON 1) endif() +if(NOT HAVE_ARMSVE2) + set(SDL_DISABLE_SVE2 1) +endif() + set(SDL_DISABLE_ALLOCA 0) check_include_file("alloca.h" "HAVE_ALLOCA_H") if(MSVC) diff --git a/include/SDL3/SDL_cpuinfo.h b/include/SDL3/SDL_cpuinfo.h index 5669c2373d..765cadf287 100644 --- a/include/SDL3/SDL_cpuinfo.h +++ b/include/SDL3/SDL_cpuinfo.h @@ -281,6 +281,18 @@ extern SDL_DECLSPEC bool SDLCALL SDL_HasARMSIMD(void); */ extern SDL_DECLSPEC bool SDLCALL SDL_HasNEON(void); +/** + * Determine whether the CPU has SVE2 (Scalable Vector Extension 2). + * + * This is only relevant on ARM64 Linux and Android. On other platforms it always returns + * false. + * + * \returns true if the CPU has SVE2, false otherwise. + * + * \since This function is available since SDL 3.6.0. + */ +extern SDL_DECLSPEC bool SDLCALL SDL_HasSVE2(void); + /** * Determine whether the CPU has LSX (LOONGARCH SIMD) features. * diff --git a/include/SDL3/SDL_intrin.h b/include/SDL3/SDL_intrin.h index a2e968080c..ecd8192941 100644 --- a/include/SDL3/SDL_intrin.h +++ b/include/SDL3/SDL_intrin.h @@ -85,6 +85,16 @@ */ #define SDL_NEON_INTRINSICS 1 +/** + * Defined if (and only if) the compiler supports ARM SVE2 intrinsics. + * + * If this macro is defined, SDL will have already included `<arm_sve.h>` + * as appropriate. + * + * \since This macro is available since SDL 3.6.0. + */ +#define SDL_SVE2_INTRINSICS 1 + /** * Defined if (and only if) the compiler supports PowerPC Altivec intrinsics.
* @@ -237,6 +247,10 @@ _m_prefetch(void *__P) # define SDL_NEON_INTRINSICS 1 # include #endif +#if defined(__ARM_FEATURE_SVE2) && !defined(SDL_DISABLE_SVE2) +# define SDL_SVE2_INTRINSICS 1 +# include +#endif #else /* altivec.h redefining bool causes a number of problems, see bugs 3993 and 4392, so you need to explicitly define SDL_ENABLE_ALTIVEC to have it included. */ @@ -265,6 +279,20 @@ _m_prefetch(void *__P) # endif # endif #endif +#ifndef SDL_DISABLE_SVE2 +# if defined(SDL_PLATFORM_WINDOWS) +/* Visual Studio doesn't define __ARM_ARCH, but _M_ARM (if set, always 7), and _M_ARM64 (if set, always 1). */ +# if defined (_M_ARM64) && 0 /* Please only remove this 0 when MSVC releasing support for SVE2 officially. */ +# define SDL_SVE2_INTRINSICS 1 +# include +# define __ARM_FEATURE_SVE2 1 /* Set __ARM_FEATURE_SVE2 so that it can be used elsewhere, at compile time */ +# define __ARM_ARCH 8 +# endif +# elif !defined(SDL_PLATFORM_MACOS) /* Apple has no AArch64 device supporting SVE2 */ +# define SDL_SVE2_INTRINSICS 1 +# include +# endif +#endif #endif /* compiler version */ #ifdef SDL_WIKI_DOCUMENTATION_SECTION diff --git a/include/build_config/SDL_build_config.h.cmake b/include/build_config/SDL_build_config.h.cmake index e7d0b34f42..2e0cdc21b4 100644 --- a/include/build_config/SDL_build_config.h.cmake +++ b/include/build_config/SDL_build_config.h.cmake @@ -625,6 +625,7 @@ typedef unsigned int uintptr_t; #cmakedefine SDL_DISABLE_LSX 1 #cmakedefine SDL_DISABLE_LASX 1 #cmakedefine SDL_DISABLE_NEON 1 +#cmakedefine SDL_DISABLE_SVE2 1 #ifdef SDL_PLATFORM_PRIVATE #include "SDL_end_config_private.h" diff --git a/include/build_config/SDL_build_config_ios.h b/include/build_config/SDL_build_config_ios.h index 308270b5a0..56f17f8b8f 100644 --- a/include/build_config/SDL_build_config_ios.h +++ b/include/build_config/SDL_build_config_ios.h @@ -226,4 +226,7 @@ /* Enable tray subsystem */ #define SDL_TRAY_DUMMY 1 +/* Disable ARM SVE2 intrinsics until we confirm they're available on 
all Apple mobile and TV hardware */ +#define SDL_DISABLE_SVE2 1 + #endif /* SDL_build_config_ios_h_ */ diff --git a/src/cpuinfo/SDL_cpuinfo.c b/src/cpuinfo/SDL_cpuinfo.c index 966a5ae79a..19daae4421 100644 --- a/src/cpuinfo/SDL_cpuinfo.c +++ b/src/cpuinfo/SDL_cpuinfo.c @@ -109,6 +109,7 @@ #define CPU_HAS_ARM_SIMD (1 << 11) #define CPU_HAS_LSX (1 << 12) #define CPU_HAS_LASX (1 << 13) +#define CPU_HAS_SVE2 (1 << 14) #define CPU_CFG2 0x2 #define CPU_CFG2_LSX (1 << 6) @@ -514,6 +515,27 @@ static int CPU_haveNEON(void) #endif } +#ifndef AT_HWCAP2 +#define AT_HWCAP2 26 +#endif +#ifndef HWCAP_SVE +#define HWCAP_SVE (1 << 22) +#endif +#ifndef HWCAP2_SVE2 +#define HWCAP2_SVE2 (1 << 1) +#endif + +static int CPU_haveSVE2(void) +{ +#if defined(__aarch64__) && \ + ((defined(SDL_PLATFORM_LINUX) && defined(HAVE_GETAUXVAL)) || defined(SDL_PLATFORM_ANDROID)) + return ((getauxval(AT_HWCAP2) & HWCAP2_SVE2) == HWCAP2_SVE2) + && ((getauxval(AT_HWCAP) & HWCAP_SVE) == HWCAP_SVE); +#else + return 0; +#endif +} + static int CPU_readCPUCFG(void) { uint32_t cfg2 = 0; @@ -960,6 +982,8 @@ static Uint32 SDLCALL SDL_CPUFeatureMaskFromHint(void) spot_mask = CPU_HAS_LSX; } else if (ref_string_equals("lasx", spot, end)) { spot_mask = CPU_HAS_LASX; + } else if (ref_string_equals("sve2", spot, end)) { + spot_mask = CPU_HAS_SVE2; } else { // Ignore unknown/incorrect cpu feature(s) continue; @@ -1036,6 +1060,10 @@ static Uint32 SDL_GetCPUFeatures(void) SDL_CPUFeatures |= CPU_HAS_LASX; SDL_SIMDAlignment = SDL_max(SDL_SIMDAlignment, 32); } + if (CPU_haveSVE2()) { + SDL_CPUFeatures |= CPU_HAS_SVE2; + SDL_SIMDAlignment = SDL_max(SDL_SIMDAlignment, 16); + } SDL_CPUFeatures &= SDL_CPUFeatureMaskFromHint(); } return SDL_CPUFeatures; @@ -1117,6 +1145,11 @@ bool SDL_HasLASX(void) return CPU_FEATURE_AVAILABLE(CPU_HAS_LASX); } +bool SDL_HasSVE2(void) +{ + return CPU_FEATURE_AVAILABLE(CPU_HAS_SVE2); +} + static int SDL_SystemRAM = 0; int SDL_GetSystemRAM(void) diff --git a/src/dynapi/SDL_dynapi.exports 
b/src/dynapi/SDL_dynapi.exports index 67600f2b7b..32e9fbff86 100644 --- a/src/dynapi/SDL_dynapi.exports +++ b/src/dynapi/SDL_dynapi.exports @@ -1287,3 +1287,4 @@ _SDL_GDKResumeRenderer _SDL_IsPhone _SDL_LoadJPG_IO _SDL_LoadJPG +_SDL_HasSVE2 diff --git a/src/dynapi/SDL_dynapi.sym b/src/dynapi/SDL_dynapi.sym index 3fdc470a33..ca1a1c97d9 100644 --- a/src/dynapi/SDL_dynapi.sym +++ b/src/dynapi/SDL_dynapi.sym @@ -1288,6 +1288,7 @@ SDL3_0.0.0 { SDL_IsPhone; SDL_LoadJPG_IO; SDL_LoadJPG; + SDL_HasSVE2; # extra symbols go here (don't modify this line) local: *; }; diff --git a/src/dynapi/SDL_dynapi_overrides.h b/src/dynapi/SDL_dynapi_overrides.h index 7b88affdc6..677768ff2f 100644 --- a/src/dynapi/SDL_dynapi_overrides.h +++ b/src/dynapi/SDL_dynapi_overrides.h @@ -1314,3 +1314,4 @@ #define SDL_IsPhone SDL_IsPhone_REAL #define SDL_LoadJPG_IO SDL_LoadJPG_IO_REAL #define SDL_LoadJPG SDL_LoadJPG_REAL +#define SDL_HasSVE2 SDL_HasSVE2_REAL diff --git a/src/dynapi/SDL_dynapi_procs.h b/src/dynapi/SDL_dynapi_procs.h index 24a5afad98..99899b346e 100644 --- a/src/dynapi/SDL_dynapi_procs.h +++ b/src/dynapi/SDL_dynapi_procs.h @@ -1322,3 +1322,4 @@ SDL_DYNAPI_PROC(void,SDL_GDKResumeRenderer,(SDL_Renderer *a),(a),) SDL_DYNAPI_PROC(bool,SDL_IsPhone,(void),(),return) SDL_DYNAPI_PROC(SDL_Surface*,SDL_LoadJPG_IO,(SDL_IOStream *a,bool b),(a,b),return) SDL_DYNAPI_PROC(SDL_Surface*,SDL_LoadJPG,(const char *a),(a),return) +SDL_DYNAPI_PROC(bool,SDL_HasSVE2,(void),(),return) diff --git a/src/video/SDL_blit_A.c b/src/video/SDL_blit_A.c index f7a997f3b0..0dcd25d885 100644 --- a/src/video/SDL_blit_A.c +++ b/src/video/SDL_blit_A.c @@ -25,6 +25,10 @@ #include "SDL_pixels_c.h" #include "SDL_surface_c.h" +#if defined(SDL_SVE2_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64)) +#include "./arm/SDL_sve2_blit_A.h" +#endif + // Functions to perform alpha blended blitting // N->1 blending with per-surface alpha @@ -1477,6 +1481,17 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface 
*surface) } case 2: +#if defined(SDL_SVE2_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64)) + if (SDL_HasSVE2()) { + if (sf->bytes_per_pixel == 4 && + df->bytes_per_pixel == 2 && + df->Rmask == 0x0000F800 && + df->Gmask == 0x000007E0 && + df->Bmask == 0x0000001F) { + return Blit8888to565PixelAlphaSwizzleSVE2; + } + } +#endif if (sf->bytes_per_pixel == 4 && sf->Amask == 0xff000000 && sf->Gmask == 0xff00 && ((sf->Rmask == 0xff && df->Rmask == 0x1f) || (sf->Bmask == 0xff && df->Bmask == 0x1f))) { if (df->Gmask == 0x7e0) { return BlitARGBto565PixelAlpha; @@ -1504,6 +1519,19 @@ SDL_BlitFunc SDL_CalculateBlitA(SDL_Surface *surface) return Blit8888to8888PixelAlphaSwizzleLSX; } #endif +#if defined(SDL_SVE2_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64)) + if (SDL_HasSVE2() + /* NEON is faster than SVE2 when vector size is 128bit */ + #if defined(SDL_NEON_INTRINSICS) + && SDL_GetSVEVectorSize() > 128 + #endif + ) { + // To prevent "unused function" compiler warnings/errors + (void)Blit8888to8888PixelAlpha; + (void)Blit8888to8888PixelAlphaSwizzle; + return Blit8888to8888PixelAlphaSwizzleSVE2; + } +#endif #if defined(SDL_NEON_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64)) // To prevent "unused function" compiler warnings/errors (void)Blit8888to8888PixelAlpha; diff --git a/src/video/SDL_blit_N.c b/src/video/SDL_blit_N.c index 204c1addbd..b014d4233a 100644 --- a/src/video/SDL_blit_N.c +++ b/src/video/SDL_blit_N.c @@ -26,6 +26,10 @@ #include "SDL_surface_c.h" #include "SDL_blit_copy.h" +#if defined(SDL_SVE2_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64)) +#include "./arm/SDL_sve2_blit_N.h" +#endif + // General optimized routines that write char by char #define HAVE_FAST_WRITE_INT8 1 @@ -3117,10 +3121,27 @@ SDL_BlitFunc SDL_CalculateBlitN(SDL_Surface *surface) return Blit8888to8888PixelSwizzleSSE41; } #endif +#if defined(SDL_SVE2_INTRINSICS) && 
(__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64)) + if (SDL_HasSVE2()) { + return Blit8888to8888PixelSwizzleSVE2; + } +#endif #if defined(SDL_NEON_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64)) return Blit8888to8888PixelSwizzleNEON; #endif } +#if defined(SDL_SVE2_INTRINSICS) && (__ARM_ARCH >= 8) && (defined(__aarch64__) || defined(_M_ARM64)) + if (SDL_HasSVE2()) { + /* RGBA8888/ARGB8888/XRGB8888 -> RGB565 */ + if (srcfmt->bytes_per_pixel == 4 && + dstfmt->bytes_per_pixel == 2 && + dstfmt->Rmask == 0x0000F800 && + dstfmt->Gmask == 0x000007E0 && + dstfmt->Bmask == 0x0000001F) { + return Blit8888to565PixelSwizzleSVE2; + } + } +#endif blitfun = NULL; if (dstfmt->bits_per_pixel > 8) { diff --git a/src/video/arm/SDL_sve2_blit_A.c b/src/video/arm/SDL_sve2_blit_A.c new file mode 100644 index 0000000000..be029bcc70 --- /dev/null +++ b/src/video/arm/SDL_sve2_blit_A.c @@ -0,0 +1,89 @@ +/* + Simple DirectMedia Layer + Copyright (C) 1997-2026 Sam Lantinga <slouken@libsdl.org> + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution.
+*/ + +#include "SDL_sve2_blit_A.h" +#include <assert.h> + +#ifdef SDL_SVE2_INTRINSICS + +#undef sdl_sve_rgb32_blend_op_fill_alpha +#define sdl_sve_rgb32_blend_op_fill_alpha(ma_alpha_chn_idx) \ + do { if (sve_src_chn_idx == (ma_alpha_chn_idx)) { \ + /* fill alpha */ \ + sve_target_u16 = svdup_u16(0xFF); \ + } else { \ + svuint16_t vMask = svget4(sve_source_u16x4, (ma_alpha_chn_idx)); \ + sve_target_u16 = sdl_sve_chn_blend_with_mask(sve_source_u16, \ + sve_target_u16, \ + vMask); \ + } } while (0) + +#undef sdl_sve_rgb32_blend_op_copy_alpha +#define sdl_sve_rgb32_blend_op_copy_alpha(ma_alpha_chn_idx) \ + do { if (sve_src_chn_idx == (ma_alpha_chn_idx)) { \ + svuint16_t vMask = svget4(sve_source_u16x4, (ma_alpha_chn_idx)); \ + sve_target_u16 = sdl_sve_chn_blend_with_mask(svdup_u16(0xFF), \ + sve_target_u16, \ + vMask); \ + } else { \ + svuint16_t vMask = svget4(sve_source_u16x4, (ma_alpha_chn_idx)); \ + sve_target_u16 = sdl_sve_chn_blend_with_mask(sve_source_u16, \ + sve_target_u16, \ + vMask); \ + } } while (0) + +#undef sdl_sve_rgb32_blend_to_rgb565_op +#define sdl_sve_rgb32_blend_to_rgb565_op(ma_alpha_chn_idx) \ + do { \ + svuint16_t vMask = svget4(sve_source_u16x4, (ma_alpha_chn_idx)); \ + sve_target_u16 = sdl_sve_chn_blend_with_mask(sve_source_u16, \ + sve_target_u16, \ + vMask); \ + } while (0) + +#include "SDL_sve2_swizzle.h" + +/*-----------------------------------------------------------------------------* + * Swizzle Blend with Alpha * + *-----------------------------------------------------------------------------*/ +SDL_TARGETING("arch=armv8-a+sve2") +void Blit8888to8888PixelAlphaSwizzleSVE2(SDL_BlitInfo *info) +{ + const SDL_PixelFormatDetails *srcfmt = info->src_fmt; + assert(0 != srcfmt->Amask); + (void)srcfmt; + + sdl_sve_8888_to_8888_swizzle_dispatcher(info); +} + +SDL_TARGETING("arch=armv8-a+sve2") +void Blit8888to565PixelAlphaSwizzleSVE2(SDL_BlitInfo *info) +{ + sdl_sve_rgb32_to_rgb565_swizzle_dispatcher(info); +} + +SDL_TARGETING("arch=armv8-a+sve2") +size_t SDL_GetSVEVectorSize(void) +{ +
return svlen(svundef_u8()) * 8; +} + +#endif /* SDL_SVE2_INTRINSICS */ diff --git a/src/video/arm/SDL_sve2_blit_A.h b/src/video/arm/SDL_sve2_blit_A.h new file mode 100644 index 0000000000..2a7e2b8149 --- /dev/null +++ b/src/video/arm/SDL_sve2_blit_A.h @@ -0,0 +1,37 @@ +/* + Simple DirectMedia Layer + Copyright (C) 1997-2026 Sam Lantinga <slouken@libsdl.org> + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. +*/ + +#ifndef SDL_sve2_blit_A_h_ +#define SDL_sve2_blit_A_h_ + +#include "../../SDL_internal.h" +#include "../SDL_blit.h" + +#ifdef SDL_SVE2_INTRINSICS + +void Blit8888to8888PixelAlphaSwizzleSVE2(SDL_BlitInfo *info); +void Blit8888to565PixelAlphaSwizzleSVE2(SDL_BlitInfo *info); + +size_t SDL_GetSVEVectorSize(void); + +#endif /* SDL_SVE2_INTRINSICS */ + +#endif /* SDL_sve2_blit_A_h_ */ diff --git a/src/video/arm/SDL_sve2_blit_N.c b/src/video/arm/SDL_sve2_blit_N.c new file mode 100644 index 0000000000..c6ae97e53b --- /dev/null +++ b/src/video/arm/SDL_sve2_blit_N.c @@ -0,0 +1,64 @@ +/* + Simple DirectMedia Layer + Copyright (C) 1997-2026 Sam Lantinga <slouken@libsdl.org> + + This software is provided 'as-is', without any express or implied + warranty.
In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. +*/ + +#include "SDL_sve2_blit_N.h" +#include <assert.h> + +#ifdef SDL_SVE2_INTRINSICS + +#undef sdl_sve_rgb32_blend_op_fill_alpha +#define sdl_sve_rgb32_blend_op_fill_alpha(ma_alpha_chn_idx) \ + do { \ + if (sve_src_chn_idx == (ma_alpha_chn_idx)) { \ + /* fill alpha */ \ + sve_target_u16 = svdup_u16(0xFF); \ + } else { \ + sve_target_u16 = sve_source_u16; \ + } \ + } while (0) + +#undef sdl_sve_rgb32_blend_op_copy_alpha +#define sdl_sve_rgb32_blend_op_copy_alpha(ma_alpha_chn_idx) \ + do { \ + sve_target_u16 = sve_source_u16; \ + } while (0) + +#undef sdl_sve_rgb32_blend_to_rgb565_op +#define sdl_sve_rgb32_blend_to_rgb565_op(ma_alpha_chn_idx) \ + do { \ + sve_target_u16 = sve_source_u16; \ + } while (0) + +#include "SDL_sve2_swizzle.h" + +SDL_TARGETING("arch=armv8-a+sve2") +void Blit8888to8888PixelSwizzleSVE2(SDL_BlitInfo *info) +{ + sdl_sve_8888_to_8888_swizzle_dispatcher(info); +} + +SDL_TARGETING("arch=armv8-a+sve2") +void Blit8888to565PixelSwizzleSVE2(SDL_BlitInfo *info) +{ + sdl_sve_rgb32_to_rgb565_swizzle_dispatcher(info); +} + +#endif /* SDL_SVE2_INTRINSICS */ diff --git a/src/video/arm/SDL_sve2_blit_N.h b/src/video/arm/SDL_sve2_blit_N.h new file mode 100644 index 0000000000..3868de0dbb --- /dev/null
+++ b/src/video/arm/SDL_sve2_blit_N.h @@ -0,0 +1,35 @@ +/* + Simple DirectMedia Layer + Copyright (C) 1997-2026 Sam Lantinga <slouken@libsdl.org> + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. +*/ + +#ifndef SDL_sve2_blit_N_h_ +#define SDL_sve2_blit_N_h_ + +#include "../../SDL_internal.h" +#include "../SDL_blit.h" + +#ifdef SDL_SVE2_INTRINSICS + +void Blit8888to8888PixelSwizzleSVE2(SDL_BlitInfo *info); +void Blit8888to565PixelSwizzleSVE2(SDL_BlitInfo *info); + +#endif /* SDL_SVE2_INTRINSICS */ + +#endif /* SDL_sve2_blit_N_h_ */ diff --git a/src/video/arm/SDL_sve2_extension.h b/src/video/arm/SDL_sve2_extension.h new file mode 100644 index 0000000000..2f5a74a12b --- /dev/null +++ b/src/video/arm/SDL_sve2_extension.h @@ -0,0 +1,1142 @@ +/* + Simple DirectMedia Layer + Copyright (C) 1997-2026 Sam Lantinga <slouken@libsdl.org> + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1.
The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. +*/ + +#if !defined(SDL_SVE2_EXTENSION_H) //&& (defined(__ARM_FEATURE_SVE2) && __ARM_FEATURE_SVE2) +#define SDL_SVE2_EXTENSION_H + +#include "SDL_sve2_util.h" +#include <arm_sve.h> +#include <stdint.h> + +/*! + * \brief a wrapper for __attribute__((nonnull)) + */ +#ifndef ARM_NONNULL +#define ARM_NONNULL(...) __attribute__((nonnull(__VA_ARGS__))) +#endif + +#define svlenu8() svcntb_pat(SV_ALL) +#define svlenu16() (svcntb_pat(SV_ALL) / sizeof(uint16_t)) +#define svlenu32() (svcntb_pat(SV_ALL) / sizeof(uint32_t)) +#define svlenu64() (svcntb_pat(SV_ALL) / sizeof(uint64_t)) + +#define svlens8() svlenu8() +#define svlens16() svlenu16() +#define svlens32() svlenu32() +#define svlens64() svlenu64() + +#define sdl_sve_stride_loop_accc8888(ma_stride_size, ma_pred_name) \ + for (svbool_t ma_pred_name, *pTemp = &ma_pred_name; \ + pTemp != NULL; \ + pTemp = NULL) \ + for (size_t SVE_SAFE_NAME(n) = 0, \ + sve_iteration_advance = svlenu32() * 4; \ + ({ \ + ma_pred_name = svwhilelt_b8((int32_t)SVE_SAFE_NAME(n), \ + (int32_t)(ma_stride_size)); \ + SVE_SAFE_NAME(n) < (ma_stride_size); \ + }); \ + SVE_SAFE_NAME(n) += sve_iteration_advance) + +#define sdl_sve_stride_loop_rgb32(ma_stride_size, ma_pred_name) \ + sdl_sve_stride_loop_accc8888(ma_stride_size, ma_pred_name) + +#define sdl_sve_stride_loop_rgb16(ma_stride_size, ma_pred_name) \ + for (svbool_t ma_pred_name, *pTemp = &ma_pred_name; \ + pTemp != NULL; \ + pTemp = NULL) \ + for (size_t SVE_SAFE_NAME(n) = 0, \ + sve_iteration_advance = svlenu16(); \ + ({ \ + ma_pred_name =
svwhilelt_b16((int32_t)SVE_SAFE_NAME(n), \ + (int32_t)(ma_stride_size)); \ + SVE_SAFE_NAME(n) < (ma_stride_size); \ + }); \ + SVE_SAFE_NAME(n) += sve_iteration_advance) + +#define sdl_sve_pixel_ccc_foreach_chn(ma_source_u16x3, \ + ma_target_u16x3, \ + ...) \ + do { \ + svuint16x3_t sve_source_u16x3 = ma_source_u16x3; \ + (void)sve_source_u16x3; \ + do { \ + const uint8_t sve_src_chn_idx = 0; \ + (void)sve_src_chn_idx; \ + svuint16_t sve_source_u16 = svget3((ma_source_u16x3), 0); \ + svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 0); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x3 = svset3(ma_target_u16x3, 0, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 1; \ + (void)sve_src_chn_idx; \ + svuint16_t sve_source_u16 = svget3((ma_source_u16x3), 1); \ + svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 1); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x3 = svset3(ma_target_u16x3, 1, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 2; \ + (void)sve_src_chn_idx; \ + svuint16_t sve_source_u16 = svget3((ma_source_u16x3), 2); \ + svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 2); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x3 = svset3(ma_target_u16x3, 2, sve_target_u16); \ + } while (0); \ + } while (0) + +#define sdl_sve_pixel_accc_foreach_chn012(ma_source_u16x4, \ + ma_target_u16x4, \ + ...) 
\ + do { \ + svuint16x4_t sve_source_u16x4 = ma_source_u16x4; \ + (void)sve_source_u16x4; \ + do { \ + const uint8_t sve_src_chn_idx = 0; \ + (void)sve_src_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 0); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 0); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 0, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 1; \ + (void)sve_src_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 1); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 1); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 1, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 2; \ + (void)sve_src_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 2); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 2); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 2, sve_target_u16); \ + } while (0); \ + } while (0) + +#define sdl_sve_pixel_accc_foreach_chn(ma_source_u16x4, \ + ma_target_u16x4, \ + ...) 
\ + do { \ + svuint16x4_t sve_source_u16x4 = ma_source_u16x4; \ + (void)sve_source_u16x4; \ + do { \ + const uint8_t sve_src_chn_idx = 0; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 0); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 0); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 0, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 1; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 1); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 1); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 1, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 2; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 2); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 2); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 2, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 3; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 3); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 3); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 3, sve_target_u16); \ + } while (0); \ + } while (0) + +#define sdl_sve_pixel_u16x4_foreach_chn(ma_source_u16x4, \ + ma_target_u16x4, \ + ...) 
\ + do { \ + svuint16x4_t sve_source_u16x4 = ma_source_u16x4; \ + (void)sve_source_u16x4; \ + do { \ + const uint8_t sve_src_chn_idx = 0; \ + (void)sve_src_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 0); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 0); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 0, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 1; \ + (void)sve_src_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 1); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 1); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 1, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 2; \ + (void)sve_src_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 2); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 2); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 2, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 3; \ + (void)sve_src_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 3); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 3); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 3, sve_target_u16); \ + } while (0); \ + } while (0) + +#define sdl_sve_pixel_u16x4_foreach_chn_src_dst_rev(ma_source_u16x4, \ + ma_target_u16x4, \ + ...) 
\ + do { \ + svuint16x4_t sve_source_u16x4 = ma_source_u16x4; \ + (void)sve_source_u16x4; \ + do { \ + const uint8_t sve_src_chn_idx = 0; \ + const uint8_t sve_dst_chn_idx = 3; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 0); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 3); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 3, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 1; \ + const uint8_t sve_dst_chn_idx = 2; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 1); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 2); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 2, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 2; \ + const uint8_t sve_dst_chn_idx = 1; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 2); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 1); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 1, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 3; \ + const uint8_t sve_dst_chn_idx = 0; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 3); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 0); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 0, sve_target_u16); \ + } while (0); \ + } while (0) + +#define sdl_sve_pixel_u16x4_foreach_chn_accc_ccca(ma_source_u16x4, \ + ma_target_u16x4, \ + ...) 
\ + do { \ + svuint16x4_t sve_source_u16x4 = ma_source_u16x4; \ + (void)sve_source_u16x4; \ + do { \ + const uint8_t sve_src_chn_idx = 0; \ + const uint8_t sve_dst_chn_idx = 1; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 0); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 1); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 1, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 1; \ + const uint8_t sve_dst_chn_idx = 2; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 1); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 2); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 2, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 2; \ + const uint8_t sve_dst_chn_idx = 3; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 2); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 3); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 3, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 3; \ + const uint8_t sve_dst_chn_idx = 0; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 3); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 0); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 0, sve_target_u16); \ + } while (0); \ + } while (0) + +#define sdl_sve_pixel_u16x4_foreach_chn_ccca_accc(ma_source_u16x4, \ + ma_target_u16x4, \ + ...) 
\ + do { \ + svuint16x4_t sve_source_u16x4 = ma_source_u16x4; \ + (void)sve_source_u16x4; \ + do { \ + const uint8_t sve_src_chn_idx = 1; \ + const uint8_t sve_dst_chn_idx = 0; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 1); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 0); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 0, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 2; \ + const uint8_t sve_dst_chn_idx = 1; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 2); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 1); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 1, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 3; \ + const uint8_t sve_dst_chn_idx = 2; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 3); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 2); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 2, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 0; \ + const uint8_t sve_dst_chn_idx = 3; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 0); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 3); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 3, sve_target_u16); \ + } while (0); \ + } while (0) + +#define sdl_sve_pixel_u16x4_foreach_chn_a123_a321(ma_source_u16x4, \ + ma_target_u16x4, \ + ...) 
\ + do { \ + svuint16x4_t sve_source_u16x4 = ma_source_u16x4; \ + (void)sve_source_u16x4; \ + do { \ + const uint8_t sve_src_chn_idx = 0; \ + const uint8_t sve_dst_chn_idx = 2; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 0); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 2); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 2, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 1; \ + const uint8_t sve_dst_chn_idx = 1; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 1); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 1); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 1, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 2; \ + const uint8_t sve_dst_chn_idx = 0; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 2); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 0); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 0, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 3; \ + const uint8_t sve_dst_chn_idx = 3; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 3); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 3); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 3, sve_target_u16); \ + } while (0); \ + } while (0) + +#define sdl_sve_pixel_u16x4_foreach_chn_123a_321a(ma_source_u16x4, \ + ma_target_u16x4, \ + ...) 
\ + do { \ + svuint16x4_t sve_source_u16x4 = ma_source_u16x4; \ + (void)sve_source_u16x4; \ + do { \ + const uint8_t sve_src_chn_idx = 0; \ + const uint8_t sve_dst_chn_idx = 0; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 0); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 0); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 0, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 1; \ + const uint8_t sve_dst_chn_idx = 3; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 1); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 3); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 3, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 2; \ + const uint8_t sve_dst_chn_idx = 2; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 2); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 2); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 2, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 3; \ + const uint8_t sve_dst_chn_idx = 1; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 3); \ + svuint16_t sve_target_u16 = svget4((ma_target_u16x4), 1); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x4 = svset4(ma_target_u16x4, 1, sve_target_u16); \ + } while (0); \ + } while (0) + +#define sdl_sve_pixel_u16x4_foreach_chn_argb_rgb565(ma_source_u16x4, \ + ma_target_u16x3, \ + ...) 
\ + do { \ + svuint16x4_t sve_source_u16x4 = ma_source_u16x4; \ + (void)sve_source_u16x4; \ + do { \ + const uint8_t sve_src_chn_idx = 0; \ + const uint8_t sve_dst_chn_idx = 0; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 0); \ + svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 0); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x3 = svset3(ma_target_u16x3, 0, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 1; \ + const uint8_t sve_dst_chn_idx = 1; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 1); \ + svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 1); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x3 = svset3(ma_target_u16x3, 1, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 2; \ + const uint8_t sve_dst_chn_idx = 2; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 2); \ + svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 2); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x3 = svset3(ma_target_u16x3, 2, sve_target_u16); \ + } while (0); \ + } while (0) + +#define sdl_sve_pixel_u16x4_foreach_chn_rgba_rgb565(ma_source_u16x4, \ + ma_target_u16x3, \ + ...) 
\ + do { \ + svuint16x4_t sve_source_u16x4 = ma_source_u16x4; \ + (void)sve_source_u16x4; \ + do { \ + const uint8_t sve_src_chn_idx = 1; \ + const uint8_t sve_dst_chn_idx = 0; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 1); \ + svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 0); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x3 = svset3(ma_target_u16x3, 0, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 2; \ + const uint8_t sve_dst_chn_idx = 1; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 2); \ + svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 1); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x3 = svset3(ma_target_u16x3, 1, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 3; \ + const uint8_t sve_dst_chn_idx = 2; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 3); \ + svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 2); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x3 = svset3(ma_target_u16x3, 2, sve_target_u16); \ + } while (0); \ + } while (0) + +#define sdl_sve_pixel_u16x4_foreach_chn_bgra_rgb565(ma_source_u16x4, \ + ma_target_u16x3, \ + ...) 
\ + do { \ + svuint16x4_t sve_source_u16x4 = ma_source_u16x4; \ + (void)sve_source_u16x4; \ + do { \ + const uint8_t sve_src_chn_idx = 3; \ + const uint8_t sve_dst_chn_idx = 0; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 3); \ + svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 0); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x3 = svset3(ma_target_u16x3, 0, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 2; \ + const uint8_t sve_dst_chn_idx = 1; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 2); \ + svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 1); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x3 = svset3(ma_target_u16x3, 1, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 1; \ + const uint8_t sve_dst_chn_idx = 2; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 1); \ + svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 2); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x3 = svset3(ma_target_u16x3, 2, sve_target_u16); \ + } while (0); \ + } while (0) + +#define sdl_sve_pixel_u16x4_foreach_chn_abgr_rgb565(ma_source_u16x4, \ + ma_target_u16x3, \ + ...) 
\ + do { \ + svuint16x4_t sve_source_u16x4 = ma_source_u16x4; \ + (void)sve_source_u16x4; \ + do { \ + const uint8_t sve_src_chn_idx = 2; \ + const uint8_t sve_dst_chn_idx = 0; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 2); \ + svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 0); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x3 = svset3(ma_target_u16x3, 0, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 1; \ + const uint8_t sve_dst_chn_idx = 1; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 1); \ + svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 1); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x3 = svset3(ma_target_u16x3, 1, sve_target_u16); \ + } while (0); \ + do { \ + const uint8_t sve_src_chn_idx = 0; \ + const uint8_t sve_dst_chn_idx = 2; \ + (void)sve_src_chn_idx; \ + (void)sve_dst_chn_idx; \ + svuint16_t sve_source_u16 = svget4((ma_source_u16x4), 0); \ + svuint16_t sve_target_u16 = svget3((ma_target_u16x3), 2); \ + (void)sve_source_u16; \ + (void)sve_target_u16; \ + __VA_ARGS__ \ + ma_target_u16x3 = svset3(ma_target_u16x3, 2, sve_target_u16); \ + } while (0); \ + } while (0) + +SDL_TARGETING("arch=armv8-a+sve2") +static inline svuint16x3_t sdl_sve_rgb565_unpack(svuint16_t vPixels) +{ + svuint16_t vBlue = svand_n_u16_m(svptrue_b16(), vPixels, 0x1F); + svuint16_t vGreen = svand_n_u16_m(svptrue_b16(), vPixels, (0x3F << 5)); + svuint16_t vRed = svand_n_u16_m(svptrue_b16(), vPixels, (0x1F << 11)); + + return svcreate3_u16(svlsl_n_u16_m(svptrue_b16(), vBlue, 3), + svlsr_n_u16_m(svptrue_b16(), vGreen, 3), + svlsr_n_u16_m(svptrue_b16(), vRed, 8)); +} + +SDL_TARGETING("arch=armv8-a+sve2") +static inline svuint16_t sdl_sve_rgb565_pack(svuint16x3_t vRGB16x3) +{ + svuint16_t vRed = svlsr_n_u16_m(svptrue_b16(), 
svget3_u16(vRGB16x3, 0), 3); + svuint16_t vGreen = svlsl_n_u16_m(svptrue_b16(), + svand_n_u16_m(svptrue_b16(), + svget3_u16(vRGB16x3, 1), + (0x3F << 2)), + 3); + svuint16_t vBlue = svlsl_n_u16_m(svptrue_b16(), + svand_n_u16_m(svptrue_b16(), + svget3_u16(vRGB16x3, 2), + (0x1F << 3)), + 8); + + svuint16_t vPixel = svorr_u16_m(svptrue_b16(), vRed, vGreen); + return svorr_u16_m(svptrue_b16(), vPixel, vBlue); + + // return (svget3_u16(vRGB16x3, 0) >> 3) + // | ((svget3_u16(vRGB16x3, 1) & (0x3F << 2)) << 3) + // | ((svget3_u16(vRGB16x3, 2) & (0x1F << 3)) << 8); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(2, 3, 4) +static inline void svld3rgb565_u16(svbool_t vPredu8, + uint16_t *phwSource, + svuint16x3_t *pvLow, + svuint16x3_t *pvHigh) +{ + svuint8x2_t vInput8x2 = svld2_u8(vPredu8, (uint8_t *)phwSource); + + svuint16_t vLowByteLowHalf = svunpklo_u16(svget2_u8(vInput8x2, 0)); + svuint16_t vLowByteHighHalf = svunpkhi_u16(svget2_u8(vInput8x2, 0)); + + svuint16_t vHighByteLowHalf = svunpklo_u16(svget2_u8(vInput8x2, 1)); + svuint16_t vHighByteHighHalf = svunpkhi_u16(svget2_u8(vInput8x2, 1)); + + //*pvLow = sdl_sve_rgb565_unpack ( vLowByteLowHalf + // | (vHighByteLowHalf << 8)); + *pvLow = sdl_sve_rgb565_unpack( + svorr_u16_m(svptrue_b16(), + vLowByteLowHalf, + //(vHighByteLowHalf << 8) + svlsl_n_u16_m(svptrue_b16(), vHighByteLowHalf, 8))); + + //*pvHigh = sdl_sve_rgb565_unpack ( vLowByteHighHalf + // | (vHighByteHighHalf << 8)); + *pvHigh = sdl_sve_rgb565_unpack( + svorr_u16_m(svptrue_b16(), + vLowByteHighHalf, + //(vHighByteHighHalf << 8) + svlsl_n_u16_m(svptrue_b16(), vHighByteHighHalf, 8))); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(2) +static inline void svst3rgb565_u16(svbool_t vPredu8, + uint16_t *phwTarget, + svuint16x3_t vLow, + svuint16x3_t vHigh) +{ + svuint16_t vLowByteLowHalf = svundef_u16(); + svuint16_t vHighByteLowHalf = svundef_u16(); + + /* pack low half pixels */ + do { + svuint16_t vPixel = sdl_sve_rgb565_pack(vLow); + + // 
vLowByteLowHalf = vPixel & 0xFF; + vLowByteLowHalf = svand_n_u16_m(svptrue_b16(), vPixel, 0xFF); + + // vHighByteLowHalf = vPixel >> 8; + vHighByteLowHalf = svlsr_n_u16_m(svptrue_b16(), vPixel, 8); + } while (0); + + svuint16_t vLowByteHighHalf = svundef_u16(); + svuint16_t vHighByteHighHalf = svundef_u16(); + + /* pack high half pixels */ + do { + svuint16_t vPixel = sdl_sve_rgb565_pack(vHigh); + + // vLowByteHighHalf = vPixel & 0xFF; + vLowByteHighHalf = svand_n_u16_m(svptrue_b16(), vPixel, 0xFF); + + // vHighByteHighHalf = vPixel >> 8; + vHighByteHighHalf = svlsr_n_u16_m(svptrue_b16(), vPixel, 8); + } while (0); + + /* save rgb565 pixels */ + svuint8_t vLowByte = svuzp1_u8(svreinterpret_u8(vLowByteLowHalf), + svreinterpret_u8(vLowByteHighHalf)); + + svuint8_t vHighByte = svuzp1_u8(svreinterpret_u8(vHighByteLowHalf), + svreinterpret_u8(vHighByteHighHalf)); + + svst2_u8(vPredu8, (uint8_t *)phwTarget, svcreate2_u8(vLowByte, vHighByte)); +} + +#if defined(__GNUC__) && !defined(__clang__) +#define svld4ub_u16(ma_pred, \ + ma_src_ptr, \ + ma_svuint16x4_low_ptr, \ + ma_svuint16x4_high_ptr) \ + do { \ + svuint8x4_t vInput8x4 = svld4_u8((ma_pred), (ma_src_ptr)); \ + \ + *(ma_svuint16x4_low_ptr) = svset4_u16(*(ma_svuint16x4_low_ptr), 0, svunpklo_u16(svget4_u8(vInput8x4, 0))); \ + *(ma_svuint16x4_low_ptr) = svset4_u16(*(ma_svuint16x4_low_ptr), 1, svunpklo_u16(svget4_u8(vInput8x4, 1))); \ + *(ma_svuint16x4_low_ptr) = svset4_u16(*(ma_svuint16x4_low_ptr), 2, svunpklo_u16(svget4_u8(vInput8x4, 2))); \ + *(ma_svuint16x4_low_ptr) = svset4_u16(*(ma_svuint16x4_low_ptr), 3, svunpklo_u16(svget4_u8(vInput8x4, 3))); \ + \ + *(ma_svuint16x4_high_ptr) = svset4_u16(*(ma_svuint16x4_high_ptr), 0, svunpkhi_u16(svget4_u8(vInput8x4, 0))); \ + *(ma_svuint16x4_high_ptr) = svset4_u16(*(ma_svuint16x4_high_ptr), 1, svunpkhi_u16(svget4_u8(vInput8x4, 1))); \ + *(ma_svuint16x4_high_ptr) = svset4_u16(*(ma_svuint16x4_high_ptr), 2, svunpkhi_u16(svget4_u8(vInput8x4, 2))); \ + *(ma_svuint16x4_high_ptr) = 
svset4_u16(*(ma_svuint16x4_high_ptr), 3, svunpkhi_u16(svget4_u8(vInput8x4, 3))); \ + } while (0) + +#define svst4ub_u16(ma_pred, \ + ma_dst_ptr, \ + ma_svuint16x4_low, \ + ma_svuint16x4_high) \ + do { \ + svuint8_t vCH0u8 = svuzp1_u8(svreinterpret_u8(svget4_u16((ma_svuint16x4_low), 0)), \ + svreinterpret_u8(svget4_u16((ma_svuint16x4_high), 0))); \ + \ + svuint8_t vCH1u8 = svuzp1_u8(svreinterpret_u8(svget4_u16((ma_svuint16x4_low), 1)), \ + svreinterpret_u8(svget4_u16((ma_svuint16x4_high), 1))); \ + \ + svuint8_t vCH2u8 = svuzp1_u8(svreinterpret_u8(svget4_u16((ma_svuint16x4_low), 2)), \ + svreinterpret_u8(svget4_u16((ma_svuint16x4_high), 2))); \ + \ + svuint8_t vCH3u8 = svuzp1_u8(svreinterpret_u8(svget4_u16((ma_svuint16x4_low), 3)), \ + svreinterpret_u8(svget4_u16((ma_svuint16x4_high), 3))); \ + \ + svst4_u8((ma_pred), (ma_dst_ptr), svcreate4_u8(vCH0u8, vCH1u8, vCH2u8, vCH3u8)); \ + } while (0) +#else +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(2, 3, 4) +static inline void svld4ub_u16(svbool_t vPredu8, + uint8_t *pchSource, + svuint16x4_t *pvLow, + svuint16x4_t *pvHigh) +{ + svuint8x4_t vInput8x4 = svld4_u8(vPredu8, pchSource); + + *pvLow = svset4_u16(*pvLow, 0, svunpklo_u16(svget4_u8(vInput8x4, 0))); + *pvLow = svset4_u16(*pvLow, 1, svunpklo_u16(svget4_u8(vInput8x4, 1))); + *pvLow = svset4_u16(*pvLow, 2, svunpklo_u16(svget4_u8(vInput8x4, 2))); + *pvLow = svset4_u16(*pvLow, 3, svunpklo_u16(svget4_u8(vInput8x4, 3))); + + *pvHigh = svset4_u16(*pvHigh, 0, svunpkhi_u16(svget4_u8(vInput8x4, 0))); + *pvHigh = svset4_u16(*pvHigh, 1, svunpkhi_u16(svget4_u8(vInput8x4, 1))); + *pvHigh = svset4_u16(*pvHigh, 2, svunpkhi_u16(svget4_u8(vInput8x4, 2))); + *pvHigh = svset4_u16(*pvHigh, 3, svunpkhi_u16(svget4_u8(vInput8x4, 3))); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(2) +static inline void svst4ub_u16(svbool_t vPredu8, + uint8_t *pchTarget, + svuint16x4_t vLow, + svuint16x4_t vHigh) +{ + + svuint8_t vCH0u8 = svuzp1_u8(svreinterpret_u8(svget4_u16(vLow, 0)), + 
svreinterpret_u8(svget4_u16(vHigh, 0))); + + svuint8_t vCH1u8 = svuzp1_u8(svreinterpret_u8(svget4_u16(vLow, 1)), + svreinterpret_u8(svget4_u16(vHigh, 1))); + + svuint8_t vCH2u8 = svuzp1_u8(svreinterpret_u8(svget4_u16(vLow, 2)), + svreinterpret_u8(svget4_u16(vHigh, 2))); + + svuint8_t vCH3u8 = svuzp1_u8(svreinterpret_u8(svget4_u16(vLow, 3)), + svreinterpret_u8(svget4_u16(vHigh, 3))); + + svst4_u8(vPredu8, pchTarget, svcreate4_u8(vCH0u8, vCH1u8, vCH2u8, vCH3u8)); +} +#endif + +/*! \note the Element range of vMask is [0, 0xFF] + */ +SDL_TARGETING("arch=armv8-a+sve2") +static inline svuint16_t sdl_sve_chn_blend_with_mask(svuint16_t vSource, svuint16_t vTarget, svuint16_t vMask) +{ + // vTarget = vSource * vMask + vTarget * (255 - vMask); + svuint16_t vTemp0 = svmul_u16_m(svptrue_b16(), vSource, vMask); + vTemp0 = svmla_u16_m(svptrue_b16(), + vTemp0, + vTarget, + svsub_u16_m(svptrue_b16(), + svdup_u16(255), + vMask)); + + vTemp0 = svadd_n_u16_m(svptrue_b16(), vTemp0, 1); + + svuint16_t vTemp1 = svlsr_n_u16_m(svptrue_b16(), vTemp0, 8); + /* x += x >> 8 */ + vTemp0 = svadd_u16_m(svptrue_b16(), + vTemp0, + vTemp1); + + return svlsr_n_u16_m(svptrue_b16(), vTemp0, 8); // vTarget >> 8; +} + +/*! \note the hwOpacity range [0, 0x100] + */ +SDL_TARGETING("arch=armv8-a+sve2") +static inline svuint16_t sdl_sve_chn_blend_with_opacity(svuint16_t vSource, + svuint16_t vTarget, + uint16_t hwOpacity) +{ + // svuint16_t vOpacity = svdup_u16(hwOpacity); + // vTarget = vSource * vOpacity + vTarget * (256 - vOpacity); + + svuint16_t vTemp0 = svmul_n_u16_m(svptrue_b16(), vSource, hwOpacity); + svuint16_t vTemp1 = svmul_n_u16_m(svptrue_b16(), + vTarget, + 256 - hwOpacity); + vTarget = svadd_u16_m(svptrue_b16(), vTemp0, vTemp1); + + return svlsr_n_u16_m(svptrue_b16(), vTarget, 8); // vTarget >> 8; +} + +/*! 
\note the Element range of vMask is [0, 0xFF] + * \note the hwOpacity range [0, 0x100] + */ +SDL_TARGETING("arch=armv8-a+sve2") +static inline svuint16_t sdl_sve_chn_blend_with_mask_and_opacity(svuint16_t vSource, + svuint16_t vTarget, + svuint16_t vMask, + uint16_t hwOpacity) +{ + vMask = svsel(svcmpeq_n_u16(svptrue_b16(), vMask, 255), + svdup_u16(hwOpacity), + //(vMask * hwOpacity) >> 8, + svlsr_n_u16_m(svptrue_b16(), + svmul_n_u16_m(svptrue_b16(), vMask, hwOpacity), + 8)); + + // vTarget = vSource * vMask + vTarget * (256 - vMask); + svuint16_t vTemp0 = svmul_u16_m(svptrue_b16(), vSource, vMask); + svuint16_t vTemp1 = svmul_u16_m(svptrue_b16(), + vTarget, + svsub_u16_m(svptrue_b16(), + svdup_u16(256), + vMask)); + vTarget = svadd_u16_m(svptrue_b16(), vTemp0, vTemp1); + + return svlsr_n_u16_m(svptrue_b16(), vTarget, 8); // vTarget >> 8; +} + +/*! \note the Element range of vMask0/1 is [0, 0xFF] + */ +SDL_TARGETING("arch=armv8-a+sve2") +static inline svuint16_t sdl_sve_chn_blend_with_masks(svuint16_t vSource, + svuint16_t vTarget, + svuint16_t vMask0, + svuint16_t vMask1) +{ + vMask1 = svadd_u16_m(svcmpeq_n_u16(svptrue_b16(), vMask1, 255), + vMask1, + svdup_u16(1)); + + svuint16_t vMask = + svsel(svcmpge_n_u16(svptrue_b16(), vMask0, 255), + vMask1, + //(vMask0 * vMask1) >> 8, + svlsr_n_u16_m(svptrue_b16(), + svmul_u16_m(svptrue_b16(), vMask0, vMask1), + 8)); + + // vTarget = vSource * vMask + vTarget * (256 - vMask); + svuint16_t vTemp0 = svmul_u16_m(svptrue_b16(), vSource, vMask); + svuint16_t vTemp1 = svmul_u16_m(svptrue_b16(), + vTarget, + svsub_u16_m(svptrue_b16(), + svdup_u16(256), + vMask)); + vTarget = svadd_u16_m(svptrue_b16(), vTemp0, vTemp1); + + return svlsr_n_u16_m(svptrue_b16(), vTarget, 8); // vTarget >> 8; +} + +/*! 
\note the Element range of vMask0/1 is [0, 0xFF] + * \note the hwOpacity range [0, 0x100] + */ +SDL_TARGETING("arch=armv8-a+sve2") +static inline svuint16_t sdl_sve_chn_blend_with_masks_and_opacity( + svuint16_t vSource, + svuint16_t vTarget, + svuint16_t vMask0, + svuint16_t vMask1, + uint16_t hwOpacity) +{ + vMask0 = svadd_u16_m(svcmpeq_n_u16(svptrue_b16(), vMask0, 255), + vMask0, + svdup_u16(1)); + + svuint16_t vMask = + svsel(svcmpge_n_u16(svptrue_b16(), vMask1, 255), /* >= 255 */ + vMask0, + //(vMask0 * vMask1) >> 8 + svlsr_n_u16_m(svptrue_b16(), + svmul_u16_m(svptrue_b16(), vMask0, vMask1), + 8)); + + vMask = + svsel(svcmpge_n_u16(svptrue_b16(), vMask, 255), + svdup_u16(hwOpacity), + //(vMask * hwOpacity) >> 8, + svlsr_n_u16_m(svptrue_b16(), + svmul_n_u16_m(svptrue_b16(), vMask, hwOpacity), + 8)); + + // vTarget = vSource * vMask + vTarget * (256 - vMask); + svuint16_t vTemp0 = svmul_u16_m(svptrue_b16(), vSource, vMask); + svuint16_t vTemp1 = svmul_u16_m(svptrue_b16(), + vTarget, + svsub_u16_m(svptrue_b16(), + svdup_u16(256), + vMask)); + vTarget = svadd_u16_m(svptrue_b16(), vTemp0, vTemp1); + + return svlsr_n_u16_m(svptrue_b16(), vTarget, 8); // vTarget >> 8; +} + +/*! 
\note the Element range of vMask0/1 is [0, 0xFF] + */ +SDL_TARGETING("arch=armv8-a+sve2") +static inline svuint16_t sdl_sve_chn_blend_with_3masks(svuint16_t vSource, + svuint16_t vTarget, + svuint16_t vMask0, + svuint16_t vMask1, + svuint16_t vMask2) +{ + vMask0 = svadd_u16_m(svcmpeq_n_u16(svptrue_b16(), vMask0, 255), + vMask0, + svdup_u16(1)); + + svuint16_t vMask = + svsel(svcmpge_n_u16(svptrue_b16(), vMask1, 255), + vMask0, + //(vMask0 * vMask1) >> 8 + svlsr_n_u16_m(svptrue_b16(), + svmul_u16_m(svptrue_b16(), vMask0, vMask1), + 8)); + + vMask = + svsel(svcmpge_n_u16(svptrue_b16(), vMask2, 255), + vMask, + //(vMask * vMask2) >> 8 + svlsr_n_u16_m(svptrue_b16(), + svmul_u16_m(svptrue_b16(), vMask, vMask2), + 8)); + + // vTarget = vSource * vMask + vTarget * (256 - vMask); + svuint16_t vTemp0 = svmul_u16_m(svptrue_b16(), vSource, vMask); + svuint16_t vTemp1 = svmul_u16_m(svptrue_b16(), + vTarget, + svsub_u16_m(svptrue_b16(), + svdup_u16(256), + vMask)); + vTarget = svadd_u16_m(svptrue_b16(), vTemp0, vTemp1); + + return svlsr_n_u16_m(svptrue_b16(), vTarget, 8); // vTarget >> 8; +} + +/*! 
\brief Blend one channel vector through three masks and a global opacity.
+ *
+ * vMask0 elements equal to 255 are promoted to 256, the three masks are
+ * folded into one weight, the weight is scaled by hwOpacity, and the result is
+ * (vSource * weight + vTarget * (256 - weight)) >> 8.
+ *
+ * \note the Element range of vMask0/1/2 is [0, 0xFF]
+ * \note the hwOpacity range [0, 0x100]
+ */
+SDL_TARGETING("arch=armv8-a+sve2")
+static inline svuint16_t sdl_sve_chn_blend_with_3masks_and_opacity(
+    svuint16_t vSource,
+    svuint16_t vTarget,
+    svuint16_t vMask0,
+    svuint16_t vMask1,
+    svuint16_t vMask2,
+    uint16_t hwOpacity)
+{
+    vMask0 = svadd_u16_m(svcmpeq_n_u16(svptrue_b16(), vMask0, 255),
+                         vMask0,
+                         svdup_u16(1));
+
+    svuint16_t vMask =
+        svsel(svcmpge_n_u16(svptrue_b16(), vMask1, 255),
+              vMask0,
+              // otherwise: (vMask0 * vMask1) >> 8
+              svlsr_n_u16_m(svptrue_b16(),
+                            svmul_u16_m(svptrue_b16(), vMask0, vMask1),
+                            8));
+
+    vMask =
+        svsel(svcmpge_n_u16(svptrue_b16(), vMask2, 255),
+              vMask,
+              svlsr_n_u16_m(svptrue_b16(),
+                            svmul_u16_m(svptrue_b16(), vMask, vMask2),
+                            8));
+    // the fallback operand above is (vMask * vMask2) >> 8
+
+    vMask =
+        svsel(svcmpge_n_u16(svptrue_b16(), vMask, 255),
+              svdup_u16(hwOpacity),
+              // otherwise: (vMask * hwOpacity) >> 8
+              svlsr_n_u16_m(svptrue_b16(),
+                            svmul_n_u16_m(svptrue_b16(), vMask, hwOpacity),
+                            8));
+
+    // vTarget = vSource * vMask + vTarget * (256 - vMask);
+    svuint16_t vTemp0 = svmul_u16_m(svptrue_b16(), vSource, vMask);
+    svuint16_t vTemp1 = svmul_u16_m(svptrue_b16(),
+                                    vTarget,
+                                    svsub_u16_m(svptrue_b16(),
+                                                svdup_u16(256),
+                                                vMask));
+    vTarget = svadd_u16_m(svptrue_b16(), vTemp0, vTemp1);
+
+    return svlsr_n_u16_m(svptrue_b16(), vTarget, 8); // vTarget >> 8;
+}
+
+#endif /* SDL_SVE2_EXTENSION_H */
\ No newline at end of file
diff --git a/src/video/arm/SDL_sve2_swizzle.h b/src/video/arm/SDL_sve2_swizzle.h
new file mode 100644
index 0000000000..a2d6f978d2
--- /dev/null
+++ b/src/video/arm/SDL_sve2_swizzle.h
@@ -0,0 +1,2375 @@
+/*
+  Simple DirectMedia Layer
+  Copyright (C) 1997-2026 Sam Lantinga
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+*/
+
+#if !defined(SD_SVE2_SWIZZLE_H) //&& (defined(__ARM_FEATURE_SVE2) && __ARM_FEATURE_SVE2) -- NOTE(review): guard is spelled SD_, not SDL_; the sibling header uses SDL_SVE2_EXTENSION_H
+#define SD_SVE2_SWIZZLE_H
+
+#include "SDL_sve2_extension.h"
+
+/*
+ * Body of a 32-bit-pixel row blend, written as a macro so that the channel
+ * iterator and the per-channel statements can be injected.
+ *
+ * ma_sve_chn_iterator  macro invoked as
+ *                      ma_sve_chn_iterator(source_u16x4, target_u16x4, ...)
+ *                      once for the low half and once for the high half of the
+ *                      loaded pixels.
+ * ...                  statements forwarded unchanged to the iterator.
+ *
+ * The expansion refers to pwSource, pwTarget and uStride, which must exist in
+ * the enclosing scope, and to vTailPred and sve_iteration_advance, which are
+ * presumably introduced by sdl_sve_stride_loop_rgb32 -- TODO confirm against
+ * its definition.
+ */
+#define sdl_sve_rgb32_stride_impl(ma_sve_chn_iterator, ...)                  \
+    sdl_sve_stride_loop_rgb32(uStride, vTailPred)                            \
+    {                                                                        \
+                                                                             \
+        svuint16x4_t vSourceLow16x4 = svundef4_u16();                        \
+        svuint16x4_t vSourceHigh16x4 = svundef4_u16();                       \
+                                                                             \
+        svuint16x4_t vTargetLow16x4 = svundef4_u16();                        \
+        svuint16x4_t vTargetHigh16x4 = svundef4_u16();                       \
+                                                                             \
+        svld4ub_u16(vTailPred,                                               \
+                    (uint8_t *)pwSource,                                     \
+                    &vSourceLow16x4,                                         \
+                    &vSourceHigh16x4);                                       \
+                                                                             \
+        svld4ub_u16(vTailPred,                                               \
+                    (uint8_t *)pwTarget,                                     \
+                    &vTargetLow16x4,                                         \
+                    &vTargetHigh16x4);                                       \
+                                                                             \
+        /* process low half */                                               \
+        ma_sve_chn_iterator(vSourceLow16x4, vTargetLow16x4,                  \
+                            __VA_ARGS__);                                    \
+                                                                             \
+        /* process high half */                                              \
+        ma_sve_chn_iterator(vSourceHigh16x4, vTargetHigh16x4,                \
+                            __VA_ARGS__);                                    \
+                                                                             \
+        svst4ub_u16(vTailPred,                                               \
+                    (uint8_t *)pwTarget,                                     \
+                    vTargetLow16x4,                                          \
+                    vTargetHigh16x4);                                        \
+                                                                             \
+        pwSource += sve_iteration_advance;                                   \
+        pwTarget += sve_iteration_advance;                                   \
+    }
+
+#define sdl_sve_rgb32_no_alpha_stride_impl(                                  \
+    ma_alpha_idx,                                                            \
+    ma_sve_chn_iterator,                                                     \
+    ...) 
\ + sdl_sve_stride_loop_rgb32(uStride, vTailPred) \ + { \ + \ + svuint16x4_t vSourceLow16x4 = svundef4_u16(); \ + svuint16x4_t vSourceHigh16x4 = svundef4_u16(); \ + \ + svuint16x4_t vTargetLow16x4 = svundef4_u16(); \ + svuint16x4_t vTargetHigh16x4 = svundef4_u16(); \ + \ + svld4ub_u16(vTailPred, \ + (uint8_t *)pwSource, \ + &vSourceLow16x4, \ + &vSourceHigh16x4); \ + \ + svld4ub_u16(vTailPred, \ + (uint8_t *)pwTarget, \ + &vTargetLow16x4, \ + &vTargetHigh16x4); \ + \ + vSourceLow16x4 = svset4(vSourceLow16x4, \ + (ma_alpha_idx), \ + svdup_u16(0xFF)); \ + vSourceHigh16x4 = svset4(vSourceHigh16x4, \ + (ma_alpha_idx), \ + svdup_u16(0xFF)); \ + \ + /* process low half */ \ + ma_sve_chn_iterator(vSourceLow16x4, vTargetLow16x4, \ + __VA_ARGS__); \ + \ + /* process high half */ \ + ma_sve_chn_iterator(vSourceHigh16x4, vTargetHigh16x4, \ + __VA_ARGS__); \ + \ + svst4ub_u16(vTailPred, \ + (uint8_t *)pwTarget, \ + vTargetLow16x4, \ + vTargetHigh16x4); \ + \ + pwSource += sve_iteration_advance; \ + pwTarget += sve_iteration_advance; \ + } + +#define sdl_sve_rgb32_to_rgb565_stride_impl(ma_sve_chn_iterator, ...) 
\ + sdl_sve_stride_loop_rgb32(uStride, vTailPred) \ + { \ + \ + svuint16x4_t vSourceLow16x4 = svundef4_u16(); \ + svuint16x4_t vSourceHigh16x4 = svundef4_u16(); \ + \ + svuint16x3_t vTargetLow16x3 = svundef3_u16(); \ + svuint16x3_t vTargetHigh16x3 = svundef3_u16(); \ + \ + svld4ub_u16(vTailPred, \ + (uint8_t *)pwSource, \ + &vSourceLow16x4, \ + &vSourceHigh16x4); \ + \ + svld3rgb565_u16(vTailPred, \ + phwTarget, \ + &vTargetLow16x3, \ + &vTargetHigh16x3); \ + \ + ma_sve_chn_iterator(vSourceLow16x4, vTargetLow16x3, \ + __VA_ARGS__); \ + \ + ma_sve_chn_iterator(vSourceHigh16x4, vTargetHigh16x3, \ + __VA_ARGS__); \ + \ + svst3rgb565_u16(vTailPred, \ + phwTarget, \ + vTargetLow16x3, \ + vTargetHigh16x3); \ + \ + pwSource += sve_iteration_advance; \ + phwTarget += sve_iteration_advance; \ + } + +#define sdl_sve_rgb32_no_alpha_to_rgb565_stride_impl( \ + ma_alpha_idx, \ + ma_sve_chn_iterator, \ + ...) \ + sdl_sve_stride_loop_rgb32(uStride, vTailPred) \ + { \ + \ + svuint16x4_t vSourceLow16x4 = svundef4_u16(); \ + svuint16x4_t vSourceHigh16x4 = svundef4_u16(); \ + \ + svuint16x3_t vTargetLow16x3 = svundef3_u16(); \ + svuint16x3_t vTargetHigh16x3 = svundef3_u16(); \ + \ + svld4ub_u16(vTailPred, \ + (uint8_t *)pwSource, \ + &vSourceLow16x4, \ + &vSourceHigh16x4); \ + \ + vSourceLow16x4 = svset4(vSourceLow16x4, \ + (ma_alpha_idx), \ + svdup_u16(0xFF)); \ + vSourceHigh16x4 = svset4(vSourceHigh16x4, \ + (ma_alpha_idx), \ + svdup_u16(0xFF)); \ + \ + svld3rgb565_u16(vTailPred, \ + phwTarget, \ + &vTargetLow16x3, \ + &vTargetHigh16x3); \ + \ + ma_sve_chn_iterator(vSourceLow16x4, vTargetLow16x3, \ + __VA_ARGS__); \ + \ + ma_sve_chn_iterator(vSourceHigh16x4, vTargetHigh16x3, \ + __VA_ARGS__); \ + \ + svst3rgb565_u16(vTailPred, \ + phwTarget, \ + vTargetLow16x3, \ + vTargetHigh16x3); \ + \ + pwSource += sve_iteration_advance; \ + phwTarget += sve_iteration_advance; \ + } + +#ifndef sdl_sve_rgb32_blend_op_fill_alpha +#define sdl_sve_rgb32_blend_op_fill_alpha(ma_alpha_chn_idx) +#endif 
+
+#ifndef sdl_sve_rgb32_blend_op_copy_alpha
+#define sdl_sve_rgb32_blend_op_copy_alpha(ma_alpha_chn_idx)
+#endif
+
+/*
+ * Source: ACCC and CCCA
+ */
+
+/*
+ * Blend one row of pixels from pwSource onto pwTarget, applying
+ * sdl_sve_rgb32_blend_op_fill_alpha(3) to each channel pair visited by
+ * sdl_sve_pixel_u16x4_foreach_chn.
+ *
+ * uStride is the row length handed to the stride loop; the 2-D wrappers in
+ * this file pass the blit width (presumably a pixel count -- TODO confirm
+ * against sdl_sve_stride_loop_rgb32).
+ */
+SDL_TARGETING("arch=armv8-a+sve2")
+ARM_NONNULL(1, 2)
+static inline void sdl_sve_accc_stride_blend_to_accc_fill_alpha(
+    uint32_t *SDL_RESTRICT pwSource,
+    uint32_t *SDL_RESTRICT pwTarget,
+    size_t uStride)
+{
+    sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn,
+                              sdl_sve_rgb32_blend_op_fill_alpha(3););
+}
+
+/*
+ * Same row blend as above, using sdl_sve_rgb32_blend_op_copy_alpha(3) as the
+ * per-channel operation.
+ */
+SDL_TARGETING("arch=armv8-a+sve2")
+ARM_NONNULL(1, 2)
+static inline void sdl_sve_accc_stride_blend_to_accc_copy_alpha(
+    uint32_t *SDL_RESTRICT pwSource,
+    uint32_t *SDL_RESTRICT pwTarget,
+    size_t uStride)
+{
+    sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn,
+                              sdl_sve_rgb32_blend_op_copy_alpha(3););
+}
+
+/*
+ * Row blend using sdl_sve_rgb32_blend_op_fill_alpha(0) as the per-channel
+ * operation.
+ */
+SDL_TARGETING("arch=armv8-a+sve2")
+ARM_NONNULL(1, 2)
+static inline void sdl_sve_ccca_stride_blend_to_ccca_fill_alpha(
+    uint32_t *SDL_RESTRICT pwSource,
+    uint32_t *SDL_RESTRICT pwTarget,
+    size_t uStride)
+{
+    sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn,
+                              sdl_sve_rgb32_blend_op_fill_alpha(0););
+}
+
+/*
+ * Row blend using sdl_sve_rgb32_blend_op_copy_alpha(0) as the per-channel
+ * operation.
+ */
+SDL_TARGETING("arch=armv8-a+sve2")
+ARM_NONNULL(1, 2)
+static inline void sdl_sve_ccca_stride_blend_to_ccca_copy_alpha(
+    uint32_t *SDL_RESTRICT pwSource,
+    uint32_t *SDL_RESTRICT pwTarget,
+    size_t uStride)
+{
+    sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn,
+                              sdl_sve_rgb32_blend_op_copy_alpha(0););
+}
+
+/*
+ * Blend an nWidth x nHeight rectangle row by row with
+ * sdl_sve_accc_stride_blend_to_accc_fill_alpha.
+ *
+ * pchSource / pchTarget   first row of the source / target rectangle.
+ * uSourceStride / uTargetStride
+ *                         byte offset added to the respective pointer after
+ *                         each row.
+ * nWidth                  row length passed to the stride function.
+ * nHeight                 number of rows.
+ *
+ * Returns without touching memory when nWidth or nHeight is zero or negative.
+ */
+SDL_TARGETING("arch=armv8-a+sve2")
+ARM_NONNULL(1, 3)
+static inline void sdl_sve_accc_blend_to_accc_fill_alpha(
+    uint8_t *SDL_RESTRICT pchSource,
+    size_t uSourceStride,
+    uint8_t *SDL_RESTRICT pchTarget,
+    size_t uTargetStride,
+    int nWidth,
+    int nHeight)
+{
+    /* A negative nHeight would keep `nHeight--` non-zero and a negative
+     * nWidth would wrap to a huge size_t row length, so reject both here. */
+    if (nWidth <= 0 || nHeight <= 0) {
+        return;
+    }
+
+    for (int nRow = 0; nRow < nHeight; nRow++) {
+        sdl_sve_accc_stride_blend_to_accc_fill_alpha(
+            (uint32_t *)pchSource,
+            (uint32_t *)pchTarget,
+            (size_t)nWidth);
+
+        pchSource += uSourceStride;
+        pchTarget += uTargetStride;
+    }
+}
+
+SDL_TARGETING("arch=armv8-a+sve2")
+ARM_NONNULL(1, 3)
+static inline void 
sdl_sve_accc_blend_to_accc_copy_alpha( + uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_accc_stride_blend_to_accc_copy_alpha( + (uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_ccca_blend_to_ccca_fill_alpha( + uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_ccca_stride_blend_to_ccca_fill_alpha( + (uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_ccca_blend_to_ccca_copy_alpha( + uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_ccca_stride_blend_to_ccca_copy_alpha( + (uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_a123_stride_blend_to_321a_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn_src_dst_rev, + sdl_sve_rgb32_blend_op_fill_alpha(3); + + ); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_a123_stride_blend_to_321a_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn_src_dst_rev, + sdl_sve_rgb32_blend_op_copy_alpha(3); + + 
); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_a123_blend_to_321a_fill_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_a123_stride_blend_to_321a_fill_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_a123_blend_to_321a_copy_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_a123_stride_blend_to_321a_copy_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_123a_stride_blend_to_a321_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn_src_dst_rev, + sdl_sve_rgb32_blend_op_fill_alpha(0); + + ); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_123a_stride_blend_to_a321_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn_src_dst_rev, + sdl_sve_rgb32_blend_op_copy_alpha(0); + + ); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_123a_blend_to_a321_fill_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_123a_stride_blend_to_a321_fill_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + 
nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_123a_blend_to_a321_copy_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_123a_stride_blend_to_a321_copy_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_accc_stride_blend_to_ccca_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn_accc_ccca, + sdl_sve_rgb32_blend_op_fill_alpha(3); + + ); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_accc_stride_blend_to_ccca_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn_accc_ccca, + sdl_sve_rgb32_blend_op_copy_alpha(3); + + ); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_accc_blend_to_ccca_fill_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_accc_stride_blend_to_ccca_fill_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_accc_blend_to_ccca_copy_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + 
sdl_sve_accc_stride_blend_to_ccca_copy_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_ccca_stride_blend_to_accc_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn_ccca_accc, + sdl_sve_rgb32_blend_op_fill_alpha(0); + + ); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_ccca_stride_blend_to_accc_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn_ccca_accc, + sdl_sve_rgb32_blend_op_copy_alpha(0);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_ccca_blend_to_accc_fill_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_ccca_stride_blend_to_accc_fill_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_ccca_blend_to_accc_copy_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_ccca_stride_blend_to_accc_copy_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_a123_stride_blend_to_a321_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + 
sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn_a123_a321, + sdl_sve_rgb32_blend_op_fill_alpha(3);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_a123_stride_blend_to_a321_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn_a123_a321, + sdl_sve_rgb32_blend_op_copy_alpha(3);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_a123_blend_to_a321_fill_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_a123_stride_blend_to_a321_fill_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_a123_blend_to_a321_copy_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_a123_stride_blend_to_a321_copy_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_123a_stride_blend_to_321a_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn_123a_321a, + sdl_sve_rgb32_blend_op_fill_alpha(0);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_123a_stride_blend_to_321a_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + sdl_sve_rgb32_stride_impl(sdl_sve_pixel_u16x4_foreach_chn_123a_321a, + 
sdl_sve_rgb32_blend_op_copy_alpha(0);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_123a_blend_to_321a_fill_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_123a_stride_blend_to_321a_fill_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_123a_blend_to_321a_copy_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_123a_stride_blend_to_321a_copy_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +/* + * Source: XCCC and CCCX + */ + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_xccc_stride_blend_to_accc_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(3, + sdl_sve_pixel_u16x4_foreach_chn, + sdl_sve_rgb32_blend_op_fill_alpha(3);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_xccc_stride_blend_to_accc_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(3, + sdl_sve_pixel_u16x4_foreach_chn, + sdl_sve_rgb32_blend_op_copy_alpha(3);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_cccx_stride_blend_to_ccca_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(0, + sdl_sve_pixel_u16x4_foreach_chn, + 
sdl_sve_rgb32_blend_op_fill_alpha(0); + + ); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_cccx_stride_blend_to_ccca_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(0, + sdl_sve_pixel_u16x4_foreach_chn, + sdl_sve_rgb32_blend_op_copy_alpha(0);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_xccc_blend_to_accc_fill_alpha( + uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_xccc_stride_blend_to_accc_fill_alpha( + (uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_xccc_blend_to_accc_copy_alpha( + uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_xccc_stride_blend_to_accc_copy_alpha( + (uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_cccx_blend_to_ccca_fill_alpha( + uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_cccx_stride_blend_to_ccca_fill_alpha( + (uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_cccx_blend_to_ccca_copy_alpha( + uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + 
size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_cccx_stride_blend_to_ccca_copy_alpha( + (uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_x123_stride_blend_to_321a_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(3, + sdl_sve_pixel_u16x4_foreach_chn_src_dst_rev, + sdl_sve_rgb32_blend_op_fill_alpha(3);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_x123_stride_blend_to_321a_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(3, + sdl_sve_pixel_u16x4_foreach_chn_src_dst_rev, + sdl_sve_rgb32_blend_op_copy_alpha(3);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_x123_blend_to_321a_fill_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_x123_stride_blend_to_321a_fill_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_x123_blend_to_321a_copy_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_x123_stride_blend_to_321a_copy_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_123x_stride_blend_to_a321_fill_alpha( 
+ uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(0, + sdl_sve_pixel_u16x4_foreach_chn_src_dst_rev, + sdl_sve_rgb32_blend_op_fill_alpha(0);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_123x_stride_blend_to_a321_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(0, + sdl_sve_pixel_u16x4_foreach_chn_src_dst_rev, + sdl_sve_rgb32_blend_op_copy_alpha(0);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_123x_blend_to_a321_fill_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_123x_stride_blend_to_a321_fill_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_123x_blend_to_a321_copy_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_123x_stride_blend_to_a321_copy_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_xccc_stride_blend_to_ccca_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(3, + sdl_sve_pixel_u16x4_foreach_chn_accc_ccca, + sdl_sve_rgb32_blend_op_fill_alpha(3);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_xccc_stride_blend_to_ccca_copy_alpha( + uint32_t *SDL_RESTRICT 
pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(3, + sdl_sve_pixel_u16x4_foreach_chn_accc_ccca, + sdl_sve_rgb32_blend_op_copy_alpha(3);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_xccc_blend_to_ccca_fill_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_xccc_stride_blend_to_ccca_fill_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_xccc_blend_to_ccca_copy_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_xccc_stride_blend_to_ccca_copy_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_cccx_stride_blend_to_accc_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(0, + sdl_sve_pixel_u16x4_foreach_chn_ccca_accc, + sdl_sve_rgb32_blend_op_fill_alpha(0);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_cccx_stride_blend_to_accc_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(0, + sdl_sve_pixel_u16x4_foreach_chn_ccca_accc, + sdl_sve_rgb32_blend_op_copy_alpha(0);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_cccx_blend_to_accc_fill_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t 
*SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_cccx_stride_blend_to_accc_fill_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_cccx_blend_to_accc_copy_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_cccx_stride_blend_to_accc_copy_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_x123_stride_blend_to_a321_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(3, + sdl_sve_pixel_u16x4_foreach_chn_a123_a321, + sdl_sve_rgb32_blend_op_fill_alpha(3);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_x123_stride_blend_to_a321_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(3, + sdl_sve_pixel_u16x4_foreach_chn_a123_a321, + sdl_sve_rgb32_blend_op_copy_alpha(3);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_x123_blend_to_a321_fill_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_x123_stride_blend_to_a321_fill_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void 
sdl_sve_x123_blend_to_a321_copy_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_x123_stride_blend_to_a321_copy_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_123x_stride_blend_to_321a_fill_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + + sdl_sve_rgb32_no_alpha_stride_impl(0, + sdl_sve_pixel_u16x4_foreach_chn_123a_321a, + sdl_sve_rgb32_blend_op_fill_alpha(0);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_123x_stride_blend_to_321a_copy_alpha( + uint32_t *SDL_RESTRICT pwSource, + uint32_t *SDL_RESTRICT pwTarget, + size_t uStride) +{ + sdl_sve_rgb32_no_alpha_stride_impl(0, + sdl_sve_pixel_u16x4_foreach_chn_123a_321a, + sdl_sve_rgb32_blend_op_copy_alpha(0);); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_123x_blend_to_321a_fill_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_123x_stride_blend_to_321a_fill_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_123x_blend_to_321a_copy_alpha(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_123x_stride_blend_to_321a_copy_alpha((uint32_t *)pchSource, + (uint32_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} 
+ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1) +static inline void sdl_sve_8888_to_8888_swizzle_dispatcher(SDL_BlitInfo *info) +{ + int width = info->dst_w; + int height = info->dst_h; + uint8_t *src = info->src; + int srcskip = info->src_skip; + uint8_t *dst = info->dst; + int dstskip = info->dst_skip; + + const SDL_PixelFormatDetails *srcfmt = info->src_fmt; + const SDL_PixelFormatDetails *dstfmt = info->dst_fmt; + + // Set up some basic variables + int srcbpp = srcfmt->bytes_per_pixel; + int dstbpp = dstfmt->bytes_per_pixel; + + assert((srcbpp == 4) && (dstbpp == 4)); + + bool fill_alpha = (!dstfmt->Amask); + + int srcstride = srcskip + srcbpp * width; + int dststride = dstskip + dstbpp * width; + + switch (srcfmt->format) { + case SDL_PIXELFORMAT_XRGB8888: + switch (dstfmt->format) { + case SDL_PIXELFORMAT_ARGB8888: + case SDL_PIXELFORMAT_XRGB8888: + if (fill_alpha) { + sdl_sve_xccc_blend_to_accc_fill_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_xccc_blend_to_accc_copy_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_RGBA8888: + case SDL_PIXELFORMAT_RGBX8888: + if (fill_alpha) { + sdl_sve_xccc_blend_to_ccca_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_xccc_blend_to_ccca_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_ABGR8888: + case SDL_PIXELFORMAT_XBGR8888: + if (fill_alpha) { + sdl_sve_x123_blend_to_a321_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_x123_blend_to_a321_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_BGRA8888: + case SDL_PIXELFORMAT_BGRX8888: + if (fill_alpha) { + sdl_sve_x123_blend_to_321a_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_x123_blend_to_321a_copy_alpha(src, + srcstride, + dst, + 
dststride, + width, + height); + } + break; + default: + assert(false); + break; + } + break; + + case SDL_PIXELFORMAT_ARGB8888: + switch (dstfmt->format) { + case SDL_PIXELFORMAT_ARGB8888: + case SDL_PIXELFORMAT_XRGB8888: + if (fill_alpha) { + sdl_sve_accc_blend_to_accc_fill_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_accc_blend_to_accc_copy_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_RGBA8888: + case SDL_PIXELFORMAT_RGBX8888: + if (fill_alpha) { + sdl_sve_accc_blend_to_ccca_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_accc_blend_to_ccca_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_ABGR8888: + case SDL_PIXELFORMAT_XBGR8888: + if (fill_alpha) { + sdl_sve_a123_blend_to_a321_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_a123_blend_to_a321_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_BGRA8888: + case SDL_PIXELFORMAT_BGRX8888: + if (fill_alpha) { + sdl_sve_a123_blend_to_321a_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_a123_blend_to_321a_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + default: + assert(false); + break; + } + break; + + case SDL_PIXELFORMAT_RGBX8888: + switch (dstfmt->format) { + case SDL_PIXELFORMAT_ARGB8888: + case SDL_PIXELFORMAT_XRGB8888: + if (fill_alpha) { + sdl_sve_cccx_blend_to_accc_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_cccx_blend_to_accc_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_RGBA8888: + case SDL_PIXELFORMAT_RGBX8888: + if (fill_alpha) { + sdl_sve_cccx_blend_to_ccca_fill_alpha( + src, + srcstride, + dst, + dststride, + width, + 
height); + } else { + sdl_sve_cccx_blend_to_ccca_copy_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_ABGR8888: + case SDL_PIXELFORMAT_XBGR8888: + if (fill_alpha) { + sdl_sve_123x_blend_to_a321_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_123x_blend_to_a321_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_BGRA8888: + case SDL_PIXELFORMAT_BGRX8888: + if (fill_alpha) { + sdl_sve_123x_blend_to_321a_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_123x_blend_to_321a_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + default: + assert(false); + break; + } + break; + + case SDL_PIXELFORMAT_RGBA8888: + switch (dstfmt->format) { + case SDL_PIXELFORMAT_ARGB8888: + case SDL_PIXELFORMAT_XRGB8888: + if (fill_alpha) { + sdl_sve_ccca_blend_to_accc_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_ccca_blend_to_accc_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_RGBA8888: + case SDL_PIXELFORMAT_RGBX8888: + if (fill_alpha) { + sdl_sve_ccca_blend_to_ccca_fill_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_ccca_blend_to_ccca_copy_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_ABGR8888: + case SDL_PIXELFORMAT_XBGR8888: + if (fill_alpha) { + sdl_sve_123a_blend_to_a321_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_123a_blend_to_a321_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_BGRA8888: + case SDL_PIXELFORMAT_BGRX8888: + if (fill_alpha) { + sdl_sve_123a_blend_to_321a_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + 
sdl_sve_123a_blend_to_321a_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + default: + assert(false); + break; + } + break; + + case SDL_PIXELFORMAT_XBGR8888: + switch (dstfmt->format) { + case SDL_PIXELFORMAT_ARGB8888: + case SDL_PIXELFORMAT_XRGB8888: + if (fill_alpha) { + sdl_sve_x123_blend_to_a321_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_x123_blend_to_a321_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_RGBA8888: + case SDL_PIXELFORMAT_RGBX8888: + if (fill_alpha) { + sdl_sve_x123_blend_to_321a_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_x123_blend_to_321a_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_ABGR8888: + case SDL_PIXELFORMAT_XBGR8888: + if (fill_alpha) { + sdl_sve_xccc_blend_to_accc_fill_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_xccc_blend_to_accc_copy_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_BGRA8888: + case SDL_PIXELFORMAT_BGRX8888: + if (fill_alpha) { + sdl_sve_xccc_blend_to_ccca_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_xccc_blend_to_ccca_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + default: + assert(false); + break; + } + break; + + case SDL_PIXELFORMAT_ABGR8888: + switch (dstfmt->format) { + case SDL_PIXELFORMAT_ARGB8888: + case SDL_PIXELFORMAT_XRGB8888: + if (fill_alpha) { + sdl_sve_a123_blend_to_a321_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_a123_blend_to_a321_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_RGBA8888: + case SDL_PIXELFORMAT_RGBX8888: + if (fill_alpha) { + 
sdl_sve_a123_blend_to_321a_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_a123_blend_to_321a_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_ABGR8888: + case SDL_PIXELFORMAT_XBGR8888: + if (fill_alpha) { + sdl_sve_accc_blend_to_accc_fill_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_accc_blend_to_accc_copy_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_BGRA8888: + case SDL_PIXELFORMAT_BGRX8888: + if (fill_alpha) { + sdl_sve_accc_blend_to_ccca_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_accc_blend_to_ccca_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + default: + assert(false); + break; + } + break; + + case SDL_PIXELFORMAT_BGRX8888: + switch (dstfmt->format) { + case SDL_PIXELFORMAT_ARGB8888: + case SDL_PIXELFORMAT_XRGB8888: + if (fill_alpha) { + sdl_sve_123x_blend_to_a321_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_123x_blend_to_a321_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_RGBA8888: + case SDL_PIXELFORMAT_RGBX8888: + if (fill_alpha) { + sdl_sve_123x_blend_to_321a_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_123x_blend_to_321a_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_ABGR8888: + case SDL_PIXELFORMAT_XBGR8888: + if (fill_alpha) { + sdl_sve_cccx_blend_to_accc_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_cccx_blend_to_accc_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_BGRA8888: + case SDL_PIXELFORMAT_BGRX8888: + if (fill_alpha) { + 
sdl_sve_cccx_blend_to_ccca_fill_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_cccx_blend_to_ccca_copy_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } + break; + default: + assert(false); + break; + } + break; + + case SDL_PIXELFORMAT_BGRA8888: + switch (dstfmt->format) { + case SDL_PIXELFORMAT_ARGB8888: + case SDL_PIXELFORMAT_XRGB8888: + if (fill_alpha) { + sdl_sve_123a_blend_to_a321_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_123a_blend_to_a321_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_RGBA8888: + case SDL_PIXELFORMAT_RGBX8888: + if (fill_alpha) { + sdl_sve_123a_blend_to_321a_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_123a_blend_to_321a_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_ABGR8888: + case SDL_PIXELFORMAT_XBGR8888: + if (fill_alpha) { + sdl_sve_ccca_blend_to_accc_fill_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_ccca_blend_to_accc_copy_alpha(src, + srcstride, + dst, + dststride, + width, + height); + } + break; + + case SDL_PIXELFORMAT_BGRA8888: + case SDL_PIXELFORMAT_BGRX8888: + if (fill_alpha) { + sdl_sve_ccca_blend_to_ccca_fill_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } else { + sdl_sve_ccca_blend_to_ccca_copy_alpha( + src, + srcstride, + dst, + dststride, + width, + height); + } + break; + default: + assert(false); + break; + } + break; + + default: + assert(false); + break; + } +} + +#ifndef sdl_sve_rgb32_blend_to_rgb565_op +#define sdl_sve_rgb32_blend_to_rgb565_op(ma_alpha_chn_idx) +#endif + +/* + * ACCC or CCCA + */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_argb8888_stride_blend_to_rgb565(uint32_t *SDL_RESTRICT pwSource, + uint16_t *SDL_RESTRICT phwTarget, + 
size_t uStride) +{ + sdl_sve_rgb32_to_rgb565_stride_impl( + sdl_sve_pixel_u16x4_foreach_chn_argb_rgb565, + { + sdl_sve_rgb32_blend_to_rgb565_op(3); + }); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_argb8888_blend_to_rgb565(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_argb8888_stride_blend_to_rgb565((uint32_t *)pchSource, + (uint16_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_rgba8888_stride_blend_to_rgb565(uint32_t *SDL_RESTRICT pwSource, + uint16_t *SDL_RESTRICT phwTarget, + size_t uStride) +{ + sdl_sve_rgb32_to_rgb565_stride_impl( + sdl_sve_pixel_u16x4_foreach_chn_rgba_rgb565, + { + sdl_sve_rgb32_blend_to_rgb565_op(0); + }); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_rgba8888_blend_to_rgb565(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_rgba8888_stride_blend_to_rgb565((uint32_t *)pchSource, + (uint16_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_bgra8888_stride_blend_to_rgb565(uint32_t *SDL_RESTRICT pwSource, + uint16_t *SDL_RESTRICT phwTarget, + size_t uStride) +{ + sdl_sve_rgb32_to_rgb565_stride_impl( + sdl_sve_pixel_u16x4_foreach_chn_bgra_rgb565, + { + sdl_sve_rgb32_blend_to_rgb565_op(0); + }); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_bgra8888_blend_to_rgb565(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int 
nHeight) +{ + while (nHeight--) { + + sdl_sve_bgra8888_stride_blend_to_rgb565((uint32_t *)pchSource, + (uint16_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_abgr8888_stride_blend_to_rgb565(uint32_t *SDL_RESTRICT pwSource, + uint16_t *SDL_RESTRICT phwTarget, + size_t uStride) +{ + sdl_sve_rgb32_to_rgb565_stride_impl( + sdl_sve_pixel_u16x4_foreach_chn_abgr_rgb565, + { + sdl_sve_rgb32_blend_to_rgb565_op(3); + }); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_abgr8888_blend_to_rgb565(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_abgr8888_stride_blend_to_rgb565((uint32_t *)pchSource, + (uint16_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +/* + * XCCC or CCCX + */ +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_xrgb8888_stride_blend_to_rgb565(uint32_t *SDL_RESTRICT pwSource, + uint16_t *SDL_RESTRICT phwTarget, + size_t uStride) +{ + sdl_sve_rgb32_no_alpha_to_rgb565_stride_impl( + 3, + sdl_sve_pixel_u16x4_foreach_chn_argb_rgb565, + { + sdl_sve_rgb32_blend_to_rgb565_op(3); + }); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_xrgb8888_blend_to_rgb565(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_xrgb8888_stride_blend_to_rgb565((uint32_t *)pchSource, + (uint16_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_rgbx8888_stride_blend_to_rgb565(uint32_t *SDL_RESTRICT pwSource, + 
uint16_t *SDL_RESTRICT phwTarget, + size_t uStride) +{ + sdl_sve_rgb32_no_alpha_to_rgb565_stride_impl( + 0, + sdl_sve_pixel_u16x4_foreach_chn_rgba_rgb565, + { + sdl_sve_rgb32_blend_to_rgb565_op(0); + }); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_rgbx8888_blend_to_rgb565(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_rgbx8888_stride_blend_to_rgb565((uint32_t *)pchSource, + (uint16_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_bgrx8888_stride_blend_to_rgb565(uint32_t *SDL_RESTRICT pwSource, + uint16_t *SDL_RESTRICT phwTarget, + size_t uStride) +{ + sdl_sve_rgb32_no_alpha_to_rgb565_stride_impl( + 0, + sdl_sve_pixel_u16x4_foreach_chn_bgra_rgb565, + { + sdl_sve_rgb32_blend_to_rgb565_op(0); + }); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_bgrx8888_blend_to_rgb565(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_bgrx8888_stride_blend_to_rgb565((uint32_t *)pchSource, + (uint16_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 2) +static inline void sdl_sve_xbgr8888_stride_blend_to_rgb565(uint32_t *SDL_RESTRICT pwSource, + uint16_t *SDL_RESTRICT phwTarget, + size_t uStride) +{ + sdl_sve_rgb32_no_alpha_to_rgb565_stride_impl( + 3, + sdl_sve_pixel_u16x4_foreach_chn_abgr_rgb565, + { + sdl_sve_rgb32_blend_to_rgb565_op(3); + }); +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1, 3) +static inline void sdl_sve_xbgr8888_blend_to_rgb565(uint8_t *SDL_RESTRICT pchSource, + size_t uSourceStride, + 
uint8_t *SDL_RESTRICT pchTarget, + size_t uTargetStride, + int nWidth, + int nHeight) +{ + while (nHeight--) { + + sdl_sve_xbgr8888_stride_blend_to_rgb565((uint32_t *)pchSource, + (uint16_t *)pchTarget, + nWidth); + + pchSource += uSourceStride; + pchTarget += uTargetStride; + } +} + +SDL_TARGETING("arch=armv8-a+sve2") +ARM_NONNULL(1) +static inline void sdl_sve_rgb32_to_rgb565_swizzle_dispatcher(SDL_BlitInfo *info) +{ + int width = info->dst_w; + int height = info->dst_h; + uint8_t *src = info->src; + int srcskip = info->src_skip; + uint8_t *dst = info->dst; + int dstskip = info->dst_skip; + + const SDL_PixelFormatDetails *srcfmt = info->src_fmt; + const SDL_PixelFormatDetails *dstfmt = info->dst_fmt; + + // Set up some basic variables + int srcbpp = srcfmt->bytes_per_pixel; + int dstbpp = dstfmt->bytes_per_pixel; + + assert(srcbpp == 4); + assert(dstbpp == 2); + + int srcstride = srcskip + srcbpp * width; + int dststride = dstskip + dstbpp * width; + + switch (srcfmt->format) { + case SDL_PIXELFORMAT_XRGB8888: + sdl_sve_xrgb8888_blend_to_rgb565(src, + srcstride, + dst, + dststride, + width, + height); + break; + + case SDL_PIXELFORMAT_ARGB8888: + sdl_sve_argb8888_blend_to_rgb565(src, + srcstride, + dst, + dststride, + width, + height); + break; + + case SDL_PIXELFORMAT_RGBX8888: + sdl_sve_rgbx8888_blend_to_rgb565(src, + srcstride, + dst, + dststride, + width, + height); + break; + + case SDL_PIXELFORMAT_RGBA8888: + sdl_sve_rgba8888_blend_to_rgb565(src, + srcstride, + dst, + dststride, + width, + height); + break; + + case SDL_PIXELFORMAT_XBGR8888: + sdl_sve_xbgr8888_blend_to_rgb565(src, + srcstride, + dst, + dststride, + width, + height); + break; + + case SDL_PIXELFORMAT_ABGR8888: + sdl_sve_abgr8888_blend_to_rgb565(src, + srcstride, + dst, + dststride, + width, + height); + break; + + case SDL_PIXELFORMAT_BGRX8888: + sdl_sve_bgrx8888_blend_to_rgb565(src, + srcstride, + dst, + dststride, + width, + height); + break; + + case SDL_PIXELFORMAT_BGRA8888: + 
sdl_sve_bgra8888_blend_to_rgb565(src, + srcstride, + dst, + dststride, + width, + height); + break; + + default: + assert(false); + break; + } +} + +#endif /* SD_SVE2_SWIZZLE_H */ \ No newline at end of file diff --git a/src/video/arm/SDL_sve2_util.h b/src/video/arm/SDL_sve2_util.h new file mode 100644 index 0000000000..2a1602b432 --- /dev/null +++ b/src/video/arm/SDL_sve2_util.h @@ -0,0 +1,206 @@ +/* + Simple DirectMedia Layer + Copyright (C) 1997-2026 Sam Lantinga + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. 
+*/ + +#ifndef SDL_SVE2_UTIL_H +#define SDL_SVE2_UTIL_H + +#undef SVE_0_CONNECT2 +#undef SVE_0_CONNECT3 +#undef SVE_0_CONNECT4 +#undef SVE_0_CONNECT5 +#undef SVE_0_CONNECT6 +#undef SVE_0_CONNECT7 +#undef SVE_0_CONNECT8 +#undef SVE_0_CONNECT9 + +#undef SVE_CONNECT2 +#undef SVE_CONNECT3 +#undef SVE_CONNECT4 +#undef SVE_CONNECT5 +#undef SVE_CONNECT6 +#undef SVE_CONNECT7 +#undef SVE_CONNECT8 +#undef SVE_CONNECT9 +#undef ALT_SVE_CONNECT2 + +#undef SVE_SAFE_NAME + +#undef SVE_CONNECT + +#define SVE_0_CONNECT2(ma_A, ma_B) ma_A##ma_B +#define SVE_0_CONNECT3(ma_A, ma_B, ma_C) ma_A##ma_B##ma_C +#define SVE_0_CONNECT4(ma_A, ma_B, ma_C, ma_D) ma_A##ma_B##ma_C##ma_D +#define SVE_0_CONNECT5(ma_A, ma_B, ma_C, ma_D, ma_E) \ + ma_A##ma_B##ma_C##ma_D##ma_E +#define SVE_0_CONNECT6(ma_A, ma_B, ma_C, ma_D, ma_E, ma_F) \ + ma_A##ma_B##ma_C##ma_D##ma_E##ma_F +#define SVE_0_CONNECT7(ma_A, ma_B, ma_C, ma_D, ma_E, ma_F, ma_G) \ + ma_A##ma_B##ma_C##ma_D##ma_E##ma_F##ma_G +#define SVE_0_CONNECT8(ma_A, ma_B, ma_C, ma_D, ma_E, ma_F, ma_G, ma_H) \ + ma_A##ma_B##ma_C##ma_D##ma_E##ma_F##ma_G##ma_H +#define SVE_0_CONNECT9(ma_A, ma_B, ma_C, ma_D, ma_E, ma_F, ma_G, ma_H, ma_I) \ + ma_A##ma_B##ma_C##ma_D##ma_E##ma_F##ma_G##ma_H##ma_I + +#define ALT_SVE_CONNECT2(ma_A, ma_B) SVE_0_CONNECT2(ma_A, ma_B) +#define SVE_CONNECT2(ma_A, ma_B) SVE_0_CONNECT2(ma_A, ma_B) +#define SVE_CONNECT3(ma_A, ma_B, ma_C) SVE_0_CONNECT3(ma_A, ma_B, ma_C) +#define SVE_CONNECT4(ma_A, ma_B, ma_C, ma_D) \ + SVE_0_CONNECT4(ma_A, ma_B, ma_C, ma_D) +#define SVE_CONNECT5(ma_A, ma_B, ma_C, ma_D, ma_E) \ + SVE_0_CONNECT5(ma_A, ma_B, ma_C, ma_D, ma_E) +#define SVE_CONNECT6(ma_A, ma_B, ma_C, ma_D, ma_E, ma_F) \ + SVE_0_CONNECT6(ma_A, ma_B, ma_C, ma_D, ma_E, ma_F) +#define SVE_CONNECT7(ma_A, ma_B, ma_C, ma_D, ma_E, ma_F, ma_G) \ + SVE_0_CONNECT7(ma_A, ma_B, ma_C, ma_D, ma_E, ma_F, ma_G) +#define SVE_CONNECT8(ma_A, ma_B, ma_C, ma_D, ma_E, ma_F, ma_G, ma_H) \ + SVE_0_CONNECT8(ma_A, ma_B, ma_C, ma_D, ma_E, ma_F, ma_G, ma_H) +#define 
SVE_CONNECT9(ma_A, ma_B, ma_C, ma_D, ma_E, ma_F, ma_G, ma_H, ma_I) \ + SVE_0_CONNECT9(ma_A, ma_B, ma_C, ma_D, ma_E, ma_F, ma_G, ma_H, ma_I) + +#define SVE_CONNECT(...) \ + ALT_SVE_CONNECT2(SVE_CONNECT, \ + SVE_VA_NUM_ARGS(__VA_ARGS__))(__VA_ARGS__) + +#ifndef SVE_VA_NUM_ARGS_IMPL +#define SVE_VA_NUM_ARGS_IMPL(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, \ + _12, _13, _14, _15, _16, ma_N, ...) ma_N +#endif + +#ifndef SVE_VA_NUM_ARGS +#define SVE_VA_NUM_ARGS(...) \ + SVE_VA_NUM_ARGS_IMPL(0, ##__VA_ARGS__, 16, 15, 14, 13, 12, 11, 10, 9, \ + 8, 7, 6, 5, 4, 3, 2, 1, 0) +#endif + +#define SVE_SAFE_NAME(ma_NAME) SVE_CONNECT3(ma_, ma_NAME, ma_LINEma_) + +/* ---------------------------------------------------------------------------* + * SVE Test Helper * + * ---------------------------------------------------------------------------*/ + +#define SVT_PRINT_VECTOR(ma_VECOTOR, ma_ELEMENT_T, ma_FORMAT_STRING) \ + do { \ + int_fast8_t nElementCount = svcntb_pat(SV_ALL) / sizeof(ma_ELEMENT_T); \ + uint8_t SVE_SAFE_NAME(chVectorBuffer) \ + [nElementCount * sizeof(ma_ELEMENT_T)]; \ + \ + svst1_u8(svptrue_b8(), \ + SVE_SAFE_NAME(chVectorBuffer), \ + svreinterpret_u8(ma_VECOTOR)); \ + \ + ma_ELEMENT_T *pElement = (ma_ELEMENT_T *)SVE_SAFE_NAME(chVectorBuffer); \ + printf("%s\t[", #ma_VECOTOR); \ + do { \ + printf(ma_FORMAT_STRING "\t", (int)*pElement++); \ + } while (--nElementCount); \ + printf("]\r\n"); \ + } while (0) + +#define SVT_INIT_VECOTR(ma_VECTOR, ma_ELEMENT_T, ...) 
\ + do { \ + uint8_t SVE_SAFE_NAME(chVectorBuffer)[svcntb_pat(SV_ALL)]; \ + \ + memset(SVE_SAFE_NAME(chVectorBuffer), /* This should NOT be SDL_memset() */ \ + 0, \ + sizeof(SVE_SAFE_NAME(chVectorBuffer))); \ + memcpy(SVE_SAFE_NAME(chVectorBuffer), /* This should NOT be SDL_memcpy() */ \ + (ma_ELEMENT_T[]){ __VA_ARGS__ }, \ + MIN(sizeof(SVE_SAFE_NAME(chVectorBuffer)), \ + sizeof((ma_ELEMENT_T[]){ __VA_ARGS__ }))); \ + \ + ma_VECTOR = svld1(svptrue_b8(), \ + (ma_ELEMENT_T *)SVE_SAFE_NAME(chVectorBuffer)); \ + } while (0) + +#define SVT_INIT_PRED(ma_PREDICT, ...) \ + do { \ + uint8_t SVE_SAFE_NAME(chBuffer)[svlen(svundef_u64())]; \ + memset(SVE_SAFE_NAME(chBuffer), /* This should NOT be SDL_memset() */ \ + 0, \ + sizeof(SVE_SAFE_NAME(chBuffer))); \ + \ + memcpy(SVE_SAFE_NAME(chBuffer), /* This should NOT be SDL_memcpy() */ \ + (uint8_t[]){ __VA_ARGS__ }, \ + MIN(sizeof(SVE_SAFE_NAME(chBuffer)), \ + sizeof((uint8_t[]){ __VA_ARGS__ }))); \ + \ + ma_PREDICT = (*(svbool_t *)SVE_SAFE_NAME(chBuffer)); \ + } while (0) + +#define SVT_PRINT_PRED(ma_PREDICT, ma_TYPE_T) \ + do { \ + printf("%8s\t[", #ma_PREDICT); \ + uint16_t SVE_SAFE_NAME(hwBuffer)[svlen(svundef_u64()) / 2]; \ + memset(SVE_SAFE_NAME(hwBuffer), /* This should NOT be SDL_memset() */ \ + 0, \ + sizeof(SVE_SAFE_NAME(hwBuffer))); \ + *(volatile svbool_t *)SVE_SAFE_NAME(hwBuffer) = (ma_PREDICT); \ + \ + uint_fast16_t SVE_SAFE_NAME(nTotalBits) = svlen(svundef_u8()); \ + uint_fast8_t SVE_SAFE_NAME(nElementBits) = sizeof(ma_TYPE_T); \ + \ + uint16_t *phwPred = SVE_SAFE_NAME(hwBuffer); \ + do { \ + uint16_t hwPred = *phwPred++; \ + \ + for (uint_fast8_t n = 0; \ + n < 16; \ + n += SVE_SAFE_NAME(nElementBits)) { \ + \ + if (hwPred & 0x01) { \ + printf("True "); \ + } else { \ + printf("False"); \ + } \ + printf("%*s\t", (int)sizeof(ma_TYPE_T) - 1, ""); \ + hwPred >>= SVE_SAFE_NAME(nElementBits); \ + } \ + \ + SVE_SAFE_NAME(nTotalBits) -= 16; \ + } while (SVE_SAFE_NAME(nTotalBits)); \ + \ + printf("]\r\n"); \ + } while 
(0) + +#define SVT_PRINT_BUFFER(ma_BUFF_PTR, ma_SIZE, ma_TYPE_T, ma_FMT_STR, ma_STRIDE) \ + do { \ + ma_TYPE_T *pBuffer = (ma_TYPE_T *)ma_BUFF_PTR; \ + size_t nElementCount = (ma_SIZE) / sizeof(ma_TYPE_T); \ + \ + size_t nStrideSize = (ma_STRIDE); \ + size_t nLineCount = 0; \ + \ + printf("%s\n\t", #ma_BUFF_PTR); \ + do { \ + \ + printf(ma_FMT_STR " ", *pBuffer++); \ + nLineCount++; \ + if (nLineCount >= nStrideSize) { \ + nLineCount = 0; \ + printf("\n\t"); \ + } \ + \ + } while (--nElementCount); \ + printf("\n"); \ + \ + } while (0) + +#endif /* SDL_SVE2_UTIL_H */ \ No newline at end of file diff --git a/test/testplatform.c b/test/testplatform.c index 4e79f6326c..d42c72d10f 100644 --- a/test/testplatform.c +++ b/test/testplatform.c @@ -414,6 +414,7 @@ static int TestCPUInfo(bool verbose) SDL_Log("NEON %s", SDL_HasNEON() ? "detected" : "not detected"); SDL_Log("LSX %s", SDL_HasLSX() ? "detected" : "not detected"); SDL_Log("LASX %s", SDL_HasLASX() ? "detected" : "not detected"); + SDL_Log("SVE2 %s", SDL_HasSVE2() ? "detected" : "not detected"); SDL_Log("System RAM %d MB", SDL_GetSystemRAM()); SDL_Log("System memory page size %d bytes", SDL_GetSystemPageSize()); }