[x265] [PATCH 06/11] AArch64: Enable existing SSE_SS SVE impl for SBD
Gerda Zsejke More
gerdazsejke.more at arm.com
Tue Dec 10 16:02:32 UTC 2024
The existing high bit depth (HBD) SSE_SS SVE implementation is suitable
for standard bit depth (SBD) as well, since sse_ss operates on int16_t
residuals at either bit depth, so enable it for the SBD build too.

Delete the existing SBD SSE_SS SVE2 implementation so that the SVE
implementation becomes the default on SVE2-capable platforms. The SVE
implementation is up to 55% faster than the SVE2 implementation.
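
For context, sse_ss computes the sum of squared differences between two
int16_t residual blocks, which is why one implementation can serve both
builds. A minimal scalar sketch of the primitive's contract (the name
sse_ss_ref and the 64-bit accumulator are illustrative, not x265 source;
strides are in elements, matching the `lsl #1` row advance in the
assembly):

    #include <stdint.h>

    /* Reference model for the pixel_sse_ss_NxN primitives: sum of
     * squared differences between two int16_t residual blocks.
     * Strides are in int16_t elements, not bytes. */
    static uint64_t sse_ss_ref(const int16_t *pix1, intptr_t stride1,
                               const int16_t *pix2, intptr_t stride2,
                               int width, int height)
    {
        uint64_t sum = 0;
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                int32_t d = pix1[x] - pix2[x]; /* residual difference */
                sum += (int64_t)d * d;  /* square may exceed 32 bits */
            }
            pix1 += stride1;
            pix2 += stride2;
        }
        return sum;
    }

The deleted SVE2 routines below compute the same reduction: smullb/smullt
square and widen the even/odd difference lanes, smlalb/smlalt accumulate
them into 32-bit lanes, and uaddv reduces the two accumulators at the end.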
Change-Id: Ib7f5e731b2007ebd6e967cfb1a5ffbeb845dac22
---
source/common/aarch64/asm-primitives.cpp | 21 +-
source/common/aarch64/ssd-a-sve.S | 4 +-
source/common/aarch64/ssd-a-sve2.S | 449 -----------------------
3 files changed, 9 insertions(+), 465 deletions(-)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 72aa1cf3d..d920adeb4 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -836,6 +836,13 @@ void setupSvePrimitives(EncoderPrimitives &p)
p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32x32_sve);
p.cu[BLOCK_64x64].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_64x64_sve);
+ // sse_ss
+ p.cu[BLOCK_4x4].sse_ss = PFX(pixel_sse_ss_4x4_sve);
+ p.cu[BLOCK_8x8].sse_ss = PFX(pixel_sse_ss_8x8_sve);
+ p.cu[BLOCK_16x16].sse_ss = PFX(pixel_sse_ss_16x16_sve);
+ p.cu[BLOCK_32x32].sse_ss = PFX(pixel_sse_ss_32x32_sve);
+ p.cu[BLOCK_64x64].sse_ss = PFX(pixel_sse_ss_64x64_sve);
+
#if !HIGH_BIT_DEPTH
p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sub_ps = PFX(pixel_sub_ps_8x16_sve);
@@ -872,13 +879,6 @@ void setupSvePrimitives(EncoderPrimitives &p)
p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = PFX(pixel_sse_pp_8x16_sve);
p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = PFX(pixel_sse_pp_16x32_sve);
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = PFX(pixel_sse_pp_32x64_sve);
-
- // sse_ss
- p.cu[BLOCK_4x4].sse_ss = PFX(pixel_sse_ss_4x4_sve);
- p.cu[BLOCK_8x8].sse_ss = PFX(pixel_sse_ss_8x8_sve);
- p.cu[BLOCK_16x16].sse_ss = PFX(pixel_sse_ss_16x16_sve);
- p.cu[BLOCK_32x32].sse_ss = PFX(pixel_sse_ss_32x32_sve);
- p.cu[BLOCK_64x64].sse_ss = PFX(pixel_sse_ss_64x64_sve);
#endif // !HIGH_BIT_DEPTH
}
#endif // defined(HAVE_SVE2) || defined(HAVE_SVE)
@@ -899,13 +899,6 @@ void setupSve2Primitives(EncoderPrimitives &p)
CHROMA_422_PU_CAN_USE_SVE2(addAvg[NONALIGNED], addAvg);
CHROMA_422_PU_CAN_USE_SVE2(addAvg[ALIGNED], addAvg);
- // sse_ss
- p.cu[BLOCK_4x4].sse_ss = PFX(pixel_sse_ss_4x4_sve2);
- p.cu[BLOCK_8x8].sse_ss = PFX(pixel_sse_ss_8x8_sve2);
- p.cu[BLOCK_16x16].sse_ss = PFX(pixel_sse_ss_16x16_sve2);
- p.cu[BLOCK_32x32].sse_ss = PFX(pixel_sse_ss_32x32_sve2);
- p.cu[BLOCK_64x64].sse_ss = PFX(pixel_sse_ss_64x64_sve2);
-
// ssd_s
p.cu[BLOCK_4x4].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_4x4_sve2);
p.cu[BLOCK_8x8].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_8x8_sve2);
diff --git a/source/common/aarch64/ssd-a-sve.S b/source/common/aarch64/ssd-a-sve.S
index ac0ee710d..c1f745947 100644
--- a/source/common/aarch64/ssd-a-sve.S
+++ b/source/common/aarch64/ssd-a-sve.S
@@ -213,6 +213,8 @@ function PFX(pixel_sse_pp_64x64_sve)
ret
endfunc
+#endif // HIGH_BIT_DEPTH
+
.macro SSE_SS_4x2
ldr d16, [x0]
ldr d17, [x2]
@@ -367,5 +369,3 @@ function PFX(pixel_sse_ss_64x64_sve)
fmov x0, d0
ret
endfunc
-
-#endif // HIGH_BIT_DEPTH
diff --git a/source/common/aarch64/ssd-a-sve2.S b/source/common/aarch64/ssd-a-sve2.S
index b3e84b69b..fe3c0d893 100644
--- a/source/common/aarch64/ssd-a-sve2.S
+++ b/source/common/aarch64/ssd-a-sve2.S
@@ -36,455 +36,6 @@
.text
-function PFX(pixel_sse_ss_4x4_sve2)
- ptrue p0.b, vl8
- ld1b {z16.b}, p0/z, [x0]
- ld1b {z17.b}, p0/z, [x2]
- add x0, x0, x1, lsl #1
- add x2, x2, x3, lsl #1
- sub z1.h, z16.h, z17.h
- smullb z3.s, z1.h, z1.h
- smullt z4.s, z1.h, z1.h
-.rept 3
- ld1b {z16.b}, p0/z, [x0]
- ld1b {z17.b}, p0/z, [x2]
- add x0, x0, x1, lsl #1
- add x2, x2, x3, lsl #1
- sub z1.h, z16.h, z17.h
- smlalb z3.s, z1.h, z1.h
- smlalt z4.s, z1.h, z1.h
-.endr
- uaddv d3, p0, z3.s
- fmov w0, s3
- uaddv d4, p0, z4.s
- fmov w1, s4
- add w0, w0, w1
- ret
-endfunc
-
-function PFX(pixel_sse_ss_8x8_sve2)
- ptrue p0.b, vl16
- ld1b {z16.b}, p0/z, [x0]
- ld1b {z17.b}, p0/z, [x2]
- add x0, x0, x1, lsl #1
- add x2, x2, x3, lsl #1
- sub z1.h, z16.h, z17.h
- smullb z3.s, z1.h, z1.h
- smullt z4.s, z1.h, z1.h
-.rept 7
- ld1b {z16.b}, p0/z, [x0]
- ld1b {z17.b}, p0/z, [x2]
- add x0, x0, x1, lsl #1
- add x2, x2, x3, lsl #1
- sub z1.h, z16.h, z17.h
- smlalb z3.s, z1.h, z1.h
- smlalt z4.s, z1.h, z1.h
-.endr
- uaddv d3, p0, z3.s
- fmov w0, s3
- uaddv d4, p0, z4.s
- fmov w1, s4
- add w0, w0, w1
- ret
-endfunc
-
-function PFX(pixel_sse_ss_16x16_sve2)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_pixel_sse_ss_16x16
- ptrue p0.b, vl16
- ld1b {z16.b}, p0/z, [x0]
- ld1b {z17.b}, p0/z, [x0, #1, mul vl]
- ld1b {z18.b}, p0/z, [x2]
- ld1b {z19.b}, p0/z, [x2, #1, mul vl]
- add x0, x0, x1, lsl #1
- add x2, x2, x3, lsl #1
- sub z1.h, z16.h, z18.h
- sub z2.h, z17.h, z19.h
- smullb z3.s, z1.h, z1.h
- smullt z4.s, z1.h, z1.h
- smlalb z3.s, z2.h, z2.h
- smlalt z4.s, z2.h, z2.h
-.rept 15
- ld1b {z16.b}, p0/z, [x0]
- ld1b {z17.b}, p0/z, [x0, #1, mul vl]
- ld1b {z18.b}, p0/z, [x2]
- ld1b {z19.b}, p0/z, [x2, #1, mul vl]
- add x0, x0, x1, lsl #1
- add x2, x2, x3, lsl #1
- sub z1.h, z16.h, z18.h
- sub z2.h, z17.h, z19.h
- smlalb z3.s, z1.h, z1.h
- smlalt z4.s, z1.h, z1.h
- smlalb z3.s, z2.h, z2.h
- smlalt z4.s, z2.h, z2.h
-.endr
- uaddv d3, p0, z3.s
- fmov w0, s3
- uaddv d4, p0, z4.s
- fmov w1, s4
- add w0, w0, w1
- ret
-.vl_gt_16_pixel_sse_ss_16x16:
- ptrue p0.b, vl32
- ld1b {z16.b}, p0/z, [x0]
- ld1b {z18.b}, p0/z, [x2]
- add x0, x0, x1, lsl #1
- add x2, x2, x3, lsl #1
- sub z1.h, z16.h, z18.h
- smullb z3.s, z1.h, z1.h
- smullt z4.s, z1.h, z1.h
-.rept 15
- ld1b {z16.b}, p0/z, [x0]
- ld1b {z18.b}, p0/z, [x2]
- add x0, x0, x1, lsl #1
- add x2, x2, x3, lsl #1
- sub z1.h, z16.h, z18.h
- smlalb z3.s, z1.h, z1.h
- smlalt z4.s, z1.h, z1.h
-.endr
- uaddv d3, p0, z3.s
- fmov w0, s3
- uaddv d4, p0, z4.s
- fmov w1, s4
- add w0, w0, w1
- ret
-endfunc
-
-function PFX(pixel_sse_ss_32x32_sve2)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_pixel_sse_ss_32x32
- ptrue p0.b, vl16
- ld1b {z16.b}, p0/z, [x0]
- ld1b {z17.b}, p0/z, [x0, #1, mul vl]
- ld1b {z18.b}, p0/z, [x0, #2, mul vl]
- ld1b {z19.b}, p0/z, [x0, #3, mul vl]
- ld1b {z20.b}, p0/z, [x2]
- ld1b {z21.b}, p0/z, [x2, #1, mul vl]
- ld1b {z22.b}, p0/z, [x2, #2, mul vl]
- ld1b {z23.b}, p0/z, [x2, #3, mul vl]
- add x0, x0, x1, lsl #1
- add x2, x2, x3, lsl #1
- sub z1.h, z16.h, z20.h
- sub z2.h, z17.h, z21.h
- sub z3.h, z18.h, z22.h
- sub z4.h, z19.h, z23.h
- smullb z5.s, z1.h, z1.h
- smullt z6.s, z1.h, z1.h
- smlalb z5.s, z2.h, z2.h
- smlalt z6.s, z2.h, z2.h
- smlalb z5.s, z3.h, z3.h
- smlalt z6.s, z3.h, z3.h
- smlalb z5.s, z4.h, z4.h
- smlalt z6.s, z4.h, z4.h
-.rept 31
- ld1b {z16.b}, p0/z, [x0]
- ld1b {z17.b}, p0/z, [x0, #1, mul vl]
- ld1b {z18.b}, p0/z, [x0, #2, mul vl]
- ld1b {z19.b}, p0/z, [x0, #3, mul vl]
- ld1b {z20.b}, p0/z, [x2]
- ld1b {z21.b}, p0/z, [x2, #1, mul vl]
- ld1b {z22.b}, p0/z, [x2, #2, mul vl]
- ld1b {z23.b}, p0/z, [x2, #3, mul vl]
- add x0, x0, x1, lsl #1
- add x2, x2, x3, lsl #1
- sub z1.h, z16.h, z20.h
- sub z2.h, z17.h, z21.h
- sub z3.h, z18.h, z22.h
- sub z4.h, z19.h, z23.h
- smlalb z5.s, z1.h, z1.h
- smlalt z6.s, z1.h, z1.h
- smlalb z5.s, z2.h, z2.h
- smlalt z6.s, z2.h, z2.h
- smlalb z5.s, z3.h, z3.h
- smlalt z6.s, z3.h, z3.h
- smlalb z5.s, z4.h, z4.h
- smlalt z6.s, z4.h, z4.h
-.endr
- uaddv d3, p0, z5.s
- fmov w0, s3
- uaddv d4, p0, z6.s
- fmov w1, s4
- add w0, w0, w1
- ret
-.vl_gt_16_pixel_sse_ss_32x32:
- cmp x9, #48
- bgt .vl_gt_48_pixel_sse_ss_32x32
- ptrue p0.b, vl32
- ld1b {z16.b}, p0/z, [x0]
- ld1b {z17.b}, p0/z, [x0, #1, mul vl]
- ld1b {z20.b}, p0/z, [x2]
- ld1b {z21.b}, p0/z, [x2, #1, mul vl]
- add x0, x0, x1, lsl #1
- add x2, x2, x3, lsl #1
- sub z1.h, z16.h, z20.h
- sub z2.h, z17.h, z21.h
- smullb z5.s, z1.h, z1.h
- smullt z6.s, z1.h, z1.h
- smlalb z5.s, z2.h, z2.h
- smlalt z6.s, z2.h, z2.h
-.rept 31
- ld1b {z16.b}, p0/z, [x0]
- ld1b {z17.b}, p0/z, [x0, #1, mul vl]
- ld1b {z20.b}, p0/z, [x2]
- ld1b {z21.b}, p0/z, [x2, #1, mul vl]
- add x0, x0, x1, lsl #1
- add x2, x2, x3, lsl #1
- sub z1.h, z16.h, z20.h
- sub z2.h, z17.h, z21.h
- smlalb z5.s, z1.h, z1.h
- smlalt z6.s, z1.h, z1.h
- smlalb z5.s, z2.h, z2.h
- smlalt z6.s, z2.h, z2.h
-.endr
- uaddv d3, p0, z5.s
- fmov w0, s3
- uaddv d4, p0, z6.s
- fmov w1, s4
- add w0, w0, w1
- ret
-.vl_gt_48_pixel_sse_ss_32x32:
- ptrue p0.b, vl64
- ld1b {z16.b}, p0/z, [x0]
- ld1b {z20.b}, p0/z, [x2]
- add x0, x0, x1, lsl #1
- add x2, x2, x3, lsl #1
- sub z1.h, z16.h, z20.h
- smullb z5.s, z1.h, z1.h
- smullt z6.s, z1.h, z1.h
-.rept 31
- ld1b {z16.b}, p0/z, [x0]
- ld1b {z20.b}, p0/z, [x2]
- add x0, x0, x1, lsl #1
- add x2, x2, x3, lsl #1
- sub z1.h, z16.h, z20.h
- smlalb z5.s, z1.h, z1.h
- smlalt z6.s, z1.h, z1.h
-.endr
- uaddv d3, p0, z5.s
- fmov w0, s3
- uaddv d4, p0, z6.s
- fmov w1, s4
- add w0, w0, w1
- ret
-endfunc
-
-function PFX(pixel_sse_ss_64x64_sve2)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_pixel_sse_ss_64x64
- ptrue p0.b, vl16
- ld1b {z24.b}, p0/z, [x0]
- ld1b {z25.b}, p0/z, [x0, #1, mul vl]
- ld1b {z26.b}, p0/z, [x0, #2, mul vl]
- ld1b {z27.b}, p0/z, [x0, #3, mul vl]
- ld1b {z28.b}, p0/z, [x2]
- ld1b {z29.b}, p0/z, [x2, #1, mul vl]
- ld1b {z30.b}, p0/z, [x2, #2, mul vl]
- ld1b {z31.b}, p0/z, [x2, #3, mul vl]
- sub z0.h, z24.h, z28.h
- sub z1.h, z25.h, z29.h
- sub z2.h, z26.h, z30.h
- sub z3.h, z27.h, z31.h
- smullb z5.s, z0.h, z0.h
- smullt z6.s, z0.h, z0.h
- smlalb z5.s, z1.h, z1.h
- smlalt z6.s, z1.h, z1.h
- smlalb z5.s, z2.h, z2.h
- smlalt z6.s, z2.h, z2.h
- smlalb z5.s, z3.h, z3.h
- smlalt z6.s, z3.h, z3.h
- ld1b {z24.b}, p0/z, [x0, #4, mul vl]
- ld1b {z25.b}, p0/z, [x0, #5, mul vl]
- ld1b {z26.b}, p0/z, [x0, #6, mul vl]
- ld1b {z27.b}, p0/z, [x0, #7, mul vl]
- ld1b {z28.b}, p0/z, [x2, #4, mul vl]
- ld1b {z29.b}, p0/z, [x2, #5, mul vl]
- ld1b {z30.b}, p0/z, [x2, #6, mul vl]
- ld1b {z31.b}, p0/z, [x2, #7, mul vl]
- sub z0.h, z24.h, z28.h
- sub z1.h, z25.h, z29.h
- sub z2.h, z26.h, z30.h
- sub z3.h, z27.h, z31.h
- smlalb z5.s, z0.h, z0.h
- smlalt z6.s, z0.h, z0.h
- smlalb z5.s, z1.h, z1.h
- smlalt z6.s, z1.h, z1.h
- smlalb z5.s, z2.h, z2.h
- smlalt z6.s, z2.h, z2.h
- smlalb z5.s, z3.h, z3.h
- smlalt z6.s, z3.h, z3.h
- add x0, x0, x1, lsl #1
- add x2, x2, x3, lsl #1
-.rept 63
- ld1b {z24.b}, p0/z, [x0]
- ld1b {z25.b}, p0/z, [x0, #1, mul vl]
- ld1b {z26.b}, p0/z, [x0, #2, mul vl]
- ld1b {z27.b}, p0/z, [x0, #3, mul vl]
- ld1b {z28.b}, p0/z, [x2]
- ld1b {z29.b}, p0/z, [x2, #1, mul vl]
- ld1b {z30.b}, p0/z, [x2, #2, mul vl]
- ld1b {z31.b}, p0/z, [x2, #3, mul vl]
- sub z0.h, z24.h, z28.h
- sub z1.h, z25.h, z29.h
- sub z2.h, z26.h, z30.h
- sub z3.h, z27.h, z31.h
- smlalb z5.s, z0.h, z0.h
- smlalt z6.s, z0.h, z0.h
- smlalb z5.s, z1.h, z1.h
- smlalt z6.s, z1.h, z1.h
- smlalb z5.s, z2.h, z2.h
- smlalt z6.s, z2.h, z2.h
- smlalb z5.s, z3.h, z3.h
- smlalt z6.s, z3.h, z3.h
- ld1b {z24.b}, p0/z, [x0, #4, mul vl]
- ld1b {z25.b}, p0/z, [x0, #5, mul vl]
- ld1b {z26.b}, p0/z, [x0, #6, mul vl]
- ld1b {z27.b}, p0/z, [x0, #7, mul vl]
- ld1b {z28.b}, p0/z, [x2, #4, mul vl]
- ld1b {z29.b}, p0/z, [x2, #5, mul vl]
- ld1b {z30.b}, p0/z, [x2, #6, mul vl]
- ld1b {z31.b}, p0/z, [x2, #7, mul vl]
- sub z0.h, z24.h, z28.h
- sub z1.h, z25.h, z29.h
- sub z2.h, z26.h, z30.h
- sub z3.h, z27.h, z31.h
- smlalb z5.s, z0.h, z0.h
- smlalt z6.s, z0.h, z0.h
- smlalb z5.s, z1.h, z1.h
- smlalt z6.s, z1.h, z1.h
- smlalb z5.s, z2.h, z2.h
- smlalt z6.s, z2.h, z2.h
- smlalb z5.s, z3.h, z3.h
- smlalt z6.s, z3.h, z3.h
- add x0, x0, x1, lsl #1
- add x2, x2, x3, lsl #1
-.endr
- uaddv d3, p0, z5.s
- fmov w0, s3
- uaddv d4, p0, z6.s
- fmov w1, s4
- add w0, w0, w1
- ret
-.vl_gt_16_pixel_sse_ss_64x64:
- cmp x9, #48
- bgt .vl_gt_48_pixel_sse_ss_64x64
- ptrue p0.b, vl32
- ld1b {z24.b}, p0/z, [x0]
- ld1b {z25.b}, p0/z, [x0, #1, mul vl]
- ld1b {z28.b}, p0/z, [x2]
- ld1b {z29.b}, p0/z, [x2, #1, mul vl]
- sub z0.h, z24.h, z28.h
- sub z1.h, z25.h, z29.h
- smullb z5.s, z0.h, z0.h
- smullt z6.s, z0.h, z0.h
- smlalb z5.s, z1.h, z1.h
- smlalt z6.s, z1.h, z1.h
- ld1b {z24.b}, p0/z, [x0, #1, mul vl]
- ld1b {z25.b}, p0/z, [x0, #2, mul vl]
- ld1b {z28.b}, p0/z, [x2, #1, mul vl]
- ld1b {z29.b}, p0/z, [x2, #2, mul vl]
- sub z0.h, z24.h, z28.h
- sub z1.h, z25.h, z29.h
- smlalb z5.s, z0.h, z0.h
- smlalt z6.s, z0.h, z0.h
- smlalb z5.s, z1.h, z1.h
- smlalt z6.s, z1.h, z1.h
- add x0, x0, x1, lsl #1
- add x2, x2, x3, lsl #1
-.rept 63
- ld1b {z24.b}, p0/z, [x0]
- ld1b {z25.b}, p0/z, [x0, #1, mul vl]
- ld1b {z28.b}, p0/z, [x2]
- ld1b {z29.b}, p0/z, [x2, #1, mul vl]
- sub z0.h, z24.h, z28.h
- sub z1.h, z25.h, z29.h
- smlalb z5.s, z0.h, z0.h
- smlalt z6.s, z0.h, z0.h
- smlalb z5.s, z1.h, z1.h
- smlalt z6.s, z1.h, z1.h
- ld1b {z24.b}, p0/z, [x0, #1, mul vl]
- ld1b {z25.b}, p0/z, [x0, #2, mul vl]
- ld1b {z28.b}, p0/z, [x2, #1, mul vl]
- ld1b {z29.b}, p0/z, [x2, #2, mul vl]
- sub z0.h, z24.h, z28.h
- sub z1.h, z25.h, z29.h
- smlalb z5.s, z0.h, z0.h
- smlalt z6.s, z0.h, z0.h
- smlalb z5.s, z1.h, z1.h
- smlalt z6.s, z1.h, z1.h
- add x0, x0, x1, lsl #1
- add x2, x2, x3, lsl #1
-.endr
- uaddv d3, p0, z5.s
- fmov w0, s3
- uaddv d4, p0, z6.s
- fmov w1, s4
- add w0, w0, w1
- ret
-.vl_gt_48_pixel_sse_ss_64x64:
- cmp x9, #112
- bgt .vl_gt_112_pixel_sse_ss_64x64
- ptrue p0.b, vl64
- ld1b {z24.b}, p0/z, [x0]
- ld1b {z28.b}, p0/z, [x2]
- sub z0.h, z24.h, z28.h
- smullb z5.s, z0.h, z0.h
- smullt z6.s, z0.h, z0.h
- ld1b {z24.b}, p0/z, [x0, #1, mul vl]
- ld1b {z28.b}, p0/z, [x2, #1, mul vl]
- sub z0.h, z24.h, z28.h
- smlalb z5.s, z0.h, z0.h
- smlalt z6.s, z0.h, z0.h
- add x0, x0, x1, lsl #1
- add x2, x2, x3, lsl #1
-.rept 63
- ld1b {z24.b}, p0/z, [x0]
- ld1b {z28.b}, p0/z, [x2]
- sub z0.h, z24.h, z28.h
- smlalb z5.s, z0.h, z0.h
- smlalt z6.s, z0.h, z0.h
- ld1b {z24.b}, p0/z, [x0, #1, mul vl]
- ld1b {z28.b}, p0/z, [x2, #1, mul vl]
- sub z0.h, z24.h, z28.h
- smlalb z5.s, z0.h, z0.h
- smlalt z6.s, z0.h, z0.h
- add x0, x0, x1, lsl #1
- add x2, x2, x3, lsl #1
-.endr
- uaddv d3, p0, z5.s
- fmov w0, s3
- uaddv d4, p0, z6.s
- fmov w1, s4
- add w0, w0, w1
- ret
-.vl_gt_112_pixel_sse_ss_64x64:
- ptrue p0.b, vl128
- ld1b {z24.b}, p0/z, [x0]
- ld1b {z28.b}, p0/z, [x2]
- sub z0.h, z24.h, z28.h
- smullb z5.s, z0.h, z0.h
- smullt z6.s, z0.h, z0.h
- add x0, x0, x1, lsl #1
- add x2, x2, x3, lsl #1
-.rept 63
- ld1b {z24.b}, p0/z, [x0]
- ld1b {z28.b}, p0/z, [x2]
- sub z0.h, z24.h, z28.h
- smlalb z5.s, z0.h, z0.h
- smlalt z6.s, z0.h, z0.h
- add x0, x0, x1, lsl #1
- add x2, x2, x3, lsl #1
-.endr
- uaddv d3, p0, z5.s
- fmov w0, s3
- uaddv d4, p0, z6.s
- fmov w1, s4
- add w0, w0, w1
- ret
-endfunc
-
function PFX(pixel_ssd_s_4x4_sve2)
ptrue p0.b, vl8
ld1b {z16.b}, p0/z, [x0]
--
2.39.5 (Apple Git-154)