[x265] [PATCH 1/4] AArch64: Remove SVE assembly implementation of quant
Hari Limaye
hari.limaye at arm.com
Mon Aug 12 21:15:09 UTC 2024
Remove the SVE assembly implementation of the quant primitive, since it
provides no performance improvement over the Neon implementation.
---
source/common/aarch64/asm-primitives.cpp | 3 --
source/common/aarch64/fun-decls.h | 2 -
source/common/aarch64/pixel-util-sve.S | 57 ------------------------
3 files changed, 62 deletions(-)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 32d75ee35..33c2e5864 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -1030,9 +1030,6 @@ void setupSvePrimitives(EncoderPrimitives &p)
p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_sve);
p.chroma[X265_CSP_I420].cu[BLOCK_8x8].sa8d = PFX(pixel_satd_4x4_sve);
#endif
-
- // quant
- p.quant = PFX(quant_sve);
}
#endif
diff --git a/source/common/aarch64/fun-decls.h b/source/common/aarch64/fun-decls.h
index 363725fef..09ec1755a 100644
--- a/source/common/aarch64/fun-decls.h
+++ b/source/common/aarch64/fun-decls.h
@@ -266,8 +266,6 @@ int PFX(pixel_satd_32x16_sve(const pixel* pix1, intptr_t stride_pix1, const pixe
int PFX(pixel_satd_32x32_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
int PFX(pixel_satd_64x48_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
-uint32_t PFX(quant_sve)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
-
void PFX(dequant_scaling_sve2(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift));
void PFX(dequant_normal_sve2(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift));
diff --git a/source/common/aarch64/pixel-util-sve.S b/source/common/aarch64/pixel-util-sve.S
index c1d6b4129..3d073d42e 100644
--- a/source/common/aarch64/pixel-util-sve.S
+++ b/source/common/aarch64/pixel-util-sve.S
@@ -314,60 +314,3 @@ function PFX(pixel_satd_64x48_sve)
mov x0, x7
ret x10
endfunc
-
-/********* ssim ***********/
-// uint32_t quant_c(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
-// No need to fully use sve instructions for this function
-function PFX(quant_sve)
- mov w9, #1
- lsl w9, w9, w4
- mov z0.s, w9
- neg w9, w4
- mov z1.s, w9
- add w9, w9, #8
- mov z2.s, w9
- mov z3.s, w5
-
- lsr w6, w6, #2
- eor z4.d, z4.d, z4.d
- eor w10, w10, w10
- eor z17.d, z17.d, z17.d
-
-.Loop_quant_sve:
- ld1 {v18.4h}, [x0], #8
- ld1 {v7.4s}, [x1], #16
- sxtl v6.4s, v18.4h
-
- cmlt v5.4s, v6.4s, #0
-
- abs v6.4s, v6.4s
-
-
- mul v6.4s, v6.4s, v7.4s
-
- add v7.4s, v6.4s, v3.4s
- sshl v7.4s, v7.4s, v1.4s
-
- mls v6.4s, v7.4s, v0.s[0]
- sshl v16.4s, v6.4s, v2.4s
- st1 {v16.4s}, [x2], #16
-
- // numsig
- cmeq v16.4s, v7.4s, v17.4s
- add v4.4s, v4.4s, v16.4s
- add w10, w10, #4
-
- // level *= sign
- eor z16.d, z7.d, z5.d
- sub v16.4s, v16.4s, v5.4s
- sqxtn v5.4h, v16.4s
- st1 {v5.4h}, [x3], #8
-
- subs w6, w6, #1
- b.ne .Loop_quant_sve
-
- addv s4, v4.4s
- mov w9, v4.s[0]
- add w0, w10, w9
- ret
-endfunc
--
2.42.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-AArch64-Remove-SVE-assembly-implementation-of-quant.patch
Type: text/x-patch
Size: 4130 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240812/b7dedc24/attachment.bin>
More information about the x265-devel mailing list