diff --git a/src/video/arm/SDL_sve2_extension.h b/src/video/arm/SDL_sve2_extension.h index b9db084bba..24c46105d7 100644 --- a/src/video/arm/SDL_sve2_extension.h +++ b/src/video/arm/SDL_sve2_extension.h @@ -907,7 +907,8 @@ static inline svuint16_t sdl_sve_chn_blend_with_mask(svuint16_t vSource, svuint16_t vMask) { // vTarget = vSource * vMask + vTarget * (255 - vMask); - svuint16_t vTemp0 = svmul_u16_m(svptrue_b16(), vSource, vMask); + svuint16_t vTemp0 = svdup_u16(1); + vTemp0 = svmla_u16_m(svptrue_b16(), vTemp0, vSource, vMask); vTemp0 = svmla_u16_m(svptrue_b16(), vTemp0, vTarget, @@ -915,17 +916,13 @@ static inline svuint16_t sdl_sve_chn_blend_with_mask(svuint16_t vSource, svdup_u16(255), vMask)); - vTemp0 = svadd_n_u16_m(svptrue_b16(), vTemp0, 1); - - svuint16_t vTemp1 = svlsr_n_u16_m(svptrue_b16(), vTemp0, 8); /* x += x >> 8 */ - vTemp0 = svadd_u16_m(svptrue_b16(), - vTemp0, - vTemp1); - - return svlsr_n_u16_m(svptrue_b16(), vTemp0, 8); // vTarget >> 8; + return svreinterpret_u16_u8( + svaddhnb_u16(vTemp0, + svlsr_n_u16_m(svptrue_b16(), + vTemp0, + 8))); } - /*! \note the Element range of vMask is [0, 0xFF] */ SDL_TARGETING("arch=armv8-a+sve2") @@ -968,15 +965,15 @@ static inline svuint16_t sdl_sve_chn_blend_with_opacity(svuint16_t vSource, */ SDL_TARGETING("arch=armv8-a+sve2") static inline svuint16_t sdl_sve_chn_blend_with_opacity_fast(svuint16_t vSource, - svuint16_t vTarget, - uint16_t hwOpacity) + svuint16_t vTarget, + uint16_t hwOpacity) { // vTarget = vSource * vMask + vTarget * (255 - vMask); svuint16_t vTemp0 = svmul_n_u16_m(svptrue_b16(), vSource, hwOpacity); vTemp0 = svmla_n_u16_m(svptrue_b16(), - vTemp0, - vTarget, - 256 - hwOpacity); + vTemp0, + vTarget, + 256 - hwOpacity); return svlsr_n_u16_m(svptrue_b16(), vTemp0, 8); // vTarget >> 8; }