[x265] [PATCH 4/6] asm-primitives.cpp: Delete dequant_scaling SVE2 implementation

Mon Jan 6 17:17:21 UTC 2025

The implementation makes use of Neon instructions, which only operate on
the low 128 bits of an SVE register, meaning that for longer vectors the
function gives incorrect output. We could maintain the kernel and only
use it if the vector length is 128 bits; however, it appears to give no
benefit in this case, so simply remove it instead.
---
 source/common/aarch64/asm-primitives.cpp |  3 +-
 source/common/aarch64/pixel-util-sve2.S  | 79 ------------------------
 2 files changed, 1 insertion(+), 81 deletions(-)

diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 478b6943a..c1f41cb3c 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -969,8 +969,7 @@ void setupSve2Primitives(EncoderPrimitives &p)
     p.scale1D_128to64[NONALIGNED] = PFX(scale1D_128to64_sve2);
     p.scale1D_128to64[ALIGNED] = PFX(scale1D_128to64_sve2);
 
-    // dequant_scaling
-    p.dequant_scaling = PFX(dequant_scaling_sve2);
+    // dequant_normal
     p.dequant_normal  = PFX(dequant_normal_sve2);
 
     // ssim_4x4x2_core
diff --git a/source/common/aarch64/pixel-util-sve2.S b/source/common/aarch64/pixel-util-sve2.S
index b2b4d24c1..5b872f437 100644
--- a/source/common/aarch64/pixel-util-sve2.S
+++ b/source/common/aarch64/pixel-util-sve2.S
@@ -867,85 +867,6 @@ function PFX(scale1D_128to64_sve2)
     ret
 endfunc
 
-/***** dequant_scaling*****/
-// void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift)
-function PFX(dequant_scaling_sve2)
-    ptrue           p0.h, vl8
-    add             x5, x5, #4              // shift + 4
-    lsr             x3, x3, #3              // num / 8
-    cmp             x5, x4
-    blt             .dequant_skip_sve2
-
-    mov             x12, #1
-    sub             x6, x5, x4          // shift - per
-    sub             x6, x6, #1          // shift - per - 1
-    lsl             x6, x12, x6         // 1 << shift - per - 1 (add)
-    mov             z0.s, w6
-    sub             x7, x4, x5          // per - shift
-    mov             z3.s, w7
-
-.dequant_loop1_sve2:
-    ld1h            {z19.h}, p0/z, [x0]
-    ld1w            {z2.s}, p0/z, [x1]
-    add             x1, x1, #16
-    ld1w            {z20.s}, p0/z, [x1]
-    add             x0, x0, #16
-    add             x1, x1, #16
-
-    sub             x3, x3, #1
-    sunpklo         z1.s, z19.h
-    sunpkhi         z19.s, z19.h
-
-    mul             z1.s, z1.s, z2.s // quantCoef * deQuantCoef
-    mul             z19.s, z19.s, z20.s
-    add             z1.s, z1.s, z0.s // quantCoef * deQuantCoef + add
-    add             z19.s, z19.s, z0.s
-
-    // No equivalent instructions in SVE2 for sshl
-    // as sqshl has double latency
-    sshl            v1.4s, v1.4s, v3.4s
-    sshl            v19.4s, v19.4s, v3.4s
-
-    sqxtnb          z16.h, z1.s
-    sqxtnb          z17.h, z19.s
-    st1h            {z16.s}, p0, [x2]
-    st1h            {z17.s}, p0, [x2, #1, mul vl]
-    add             x2, x2, #16
-    cbnz            x3, .dequant_loop1_sve2
-    ret
-
-.dequant_skip_sve2:
-    sub             x6, x4, x5          // per - shift
-    mov             z0.h, w6
-
-.dequant_loop2_sve2:
-    ld1h            {z19.h}, p0/z, [x0]
-    ld1w            {z2.s}, p0/z, [x1]
-    add             x1, x1, #16
-    ld1w            {z20.s}, p0/z, [x1]
-    add             x0, x0, #16
-    add             x1, x1, #16
-
-
-    sub             x3, x3, #1
-    sunpklo         z1.s, z19.h
-    sunpkhi         z19.s, z19.h
-
-    mul             z1.s, z1.s, z2.s // quantCoef * deQuantCoef
-    mul             z19.s, z19.s, z20.s
-
-    // Keeping NEON instructions here in order to have
-    // one sqshl later
-    sqxtn           v16.4h, v1.4s       // x265_clip3
-    sqxtn2          v16.8h, v19.4s
-
-    sqshl           z16.h, p0/m, z16.h, z0.h // coefQ << per - shift
-    st1h            {z16.h}, p0, [x2]
-    add             x2, x2, #16
-    cbnz            x3, .dequant_loop2_sve2
-    ret
-endfunc
-
 // void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
 function PFX(dequant_normal_sve2)
     lsr             w2, w2, #4              // num / 16
-- 
2.34.1

-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0004-asm-primitives.cpp-Delete-dequant_scaling-SVE2-imple.patch
Type: text/x-diff
Size: 4720 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20250106/d42c9ad7/attachment-0001.patch>