[x265] [PATCH] AArch64: Optimize and clean up addAvg Neon and SVE2 functions
Li Zhang
li.zhang2 at arm.com
Thu Jun 19 18:48:38 UTC 2025
Extend the Neon intrinsics implementation to support all block sizes and
optimize it to use rounding-shift-and-accumulate instead of separate
widening, add, and shift steps. Also unroll the loops for larger block
sizes to enable the compiler to emit LDP and STP instructions.
Delete the Neon and SVE2 assembly implementations as they are 1-2x
slower than the Neon intrinsics implementation.
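
For reference, the core of the 8-bit path change is replacing the widen/add/shift/narrow
sequence with a single saturating rounding narrowing shift, folding the offset in after the
narrow. A minimal standalone sketch of both forms for one 8-lane chunk follows (it assumes
X265_DEPTH == 8, so shiftNum == 7 and offset == 2 * IF_INTERNAL_OFFS == 16384; the helper
names and macros are illustrative only, not part of the patch):

    #include <arm_neon.h>

    #define SHIFT_NUM 7           /* IF_INTERNAL_PREC + 1 - X265_DEPTH for 8-bit */
    #define OFFSET    (2 * 8192)  /* 2 * IF_INTERNAL_OFFS */

    /* Old form: widen to 32-bit, add the pre-rounded offset, shift, then narrow twice. */
    static inline uint8x8_t add_avg_widen(int16x8_t in0, int16x8_t in1)
    {
        const int32x4_t addon = vdupq_n_s32((1 << (SHIFT_NUM - 1)) + OFFSET);
        int32x4_t lo = vaddl_s16(vget_low_s16(in0), vget_low_s16(in1));
        int32x4_t hi = vaddl_high_s16(in0, in1);
        lo = vshrq_n_s32(vaddq_s32(lo, addon), SHIFT_NUM);
        hi = vshrq_n_s32(vaddq_s32(hi, addon), SHIFT_NUM);
        int16x8_t t = vuzp1q_s16(vreinterpretq_s16_s32(lo), vreinterpretq_s16_s32(hi));
        return vqmovun_s16(t);
    }

    /* New form: one saturating rounding narrowing shift on the 16-bit sum, then the
     * constant (offset >> shift) == 128 is added back in the 8-bit domain, mirroring
     * the sqrshrnb + add #0x80 pattern of the deleted assembly. */
    static inline uint8x8_t add_avg_rshift(int16x8_t in0, int16x8_t in1)
    {
        const uint8x8_t addon = vdup_n_u8((uint8_t)(OFFSET >> SHIFT_NUM));
        int8x8_t sum = vqrshrn_n_s16(vaddq_s16(in0, in1), SHIFT_NUM);
        return vadd_u8(vreinterpret_u8_s8(sum), addon);
    }

For high bit depth the same idea uses vrsraq_n_s16 to accumulate the rounded shift directly
onto the pre-shifted offset, followed by a min/max clamp, as in the new pixel-prim.cpp code
below. The patch also applies this per-vector kernel to two adjacent vectors per row for the
16-wide and larger blocks, which is what lets the compiler merge the loads and stores into
LDP/STP pairs.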
---
source/common/aarch64/asm-primitives.cpp | 16 -
source/common/aarch64/mc-a-sve2.S | 606 -----------------------
source/common/aarch64/mc-a.S | 341 -------------
source/common/aarch64/mem-neon.h | 20 +
source/common/aarch64/pixel-prim.cpp | 207 +++++---
5 files changed, 169 insertions(+), 1021 deletions(-)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 5ce9352bd..e1fc8e82a 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -462,14 +462,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
ALL_LUMA_PU(pixelavg_pp[NONALIGNED], pixel_avg_pp, neon);
ALL_LUMA_PU(pixelavg_pp[ALIGNED], pixel_avg_pp, neon);
- // addAvg
- ALL_LUMA_PU(addAvg[NONALIGNED], addAvg, neon);
- ALL_LUMA_PU(addAvg[ALIGNED], addAvg, neon);
- ALL_CHROMA_420_PU(addAvg[NONALIGNED], addAvg, neon);
- ALL_CHROMA_422_PU(addAvg[NONALIGNED], addAvg, neon);
- ALL_CHROMA_420_PU(addAvg[ALIGNED], addAvg, neon);
- ALL_CHROMA_422_PU(addAvg[ALIGNED], addAvg, neon);
-
// pixel_var
p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_neon);
p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_neon);
@@ -635,14 +627,6 @@ void setupSve2Primitives(EncoderPrimitives &p)
LUMA_PU_MULTIPLE_ARCHS_3(pixelavg_pp[NONALIGNED], pixel_avg_pp, sve2);
LUMA_PU_MULTIPLE_ARCHS_3(pixelavg_pp[ALIGNED], pixel_avg_pp, sve2);
- // addAvg
- LUMA_PU_CAN_USE_SVE2(addAvg[NONALIGNED], addAvg);
- LUMA_PU_CAN_USE_SVE2(addAvg[ALIGNED], addAvg);
- CHROMA_420_PU_MULTIPLE_ARCHS(addAvg[NONALIGNED], addAvg, sve2);
- CHROMA_420_PU_MULTIPLE_ARCHS(addAvg[ALIGNED], addAvg, sve2);
- CHROMA_422_PU_CAN_USE_SVE2(addAvg[NONALIGNED], addAvg);
- CHROMA_422_PU_CAN_USE_SVE2(addAvg[ALIGNED], addAvg);
-
// pixel_var
p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_sve2);
p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_sve2);
diff --git a/source/common/aarch64/mc-a-sve2.S b/source/common/aarch64/mc-a-sve2.S
index 00fb0048f..fc0a6f3e8 100644
--- a/source/common/aarch64/mc-a-sve2.S
+++ b/source/common/aarch64/mc-a-sve2.S
@@ -298,609 +298,3 @@ pixel_avg_pp_64xN_sve2 16
pixel_avg_pp_64xN_sve2 32
pixel_avg_pp_64xN_sve2 48
pixel_avg_pp_64xN_sve2 64
-
-// void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
-
-.macro addAvg_2xN_sve2 h
-function PFX(addAvg_2x\h\()_sve2)
- ptrue p0.s, vl2
- ptrue p1.h, vl4
- ptrue p2.h, vl2
-.rept \h / 2
- ld1rw {z0.s}, p0/z, [x0]
- ld1rw {z1.s}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- ld1rw {z2.s}, p0/z, [x0]
- ld1rw {z3.s}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p1/m, z0.h, z1.h
- add z2.h, p1/m, z2.h, z3.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- sqrshrnb z2.b, z2.h, #7
- add z2.b, z2.b, #0x80
- st1b {z0.h}, p2, [x2]
- add x2, x2, x5
- st1b {z2.h}, p2, [x2]
- add x2, x2, x5
-.endr
- ret
-endfunc
-.endm
-
-addAvg_2xN_sve2 4
-addAvg_2xN_sve2 8
-addAvg_2xN_sve2 16
-
-.macro addAvg_6xN_sve2 h
-function PFX(addAvg_6x\h\()_sve2)
- mov w12, #\h / 2
- ptrue p0.b, vl16
- ptrue p2.h, vl6
-.Loop_sve2_addavg_6x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- ld1b {z2.b}, p0/z, [x0]
- ld1b {z3.b}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z1.h
- add z2.h, p0/m, z2.h, z3.h
- sqrshrnb z0.b, z0.h, #7
- sqrshrnb z2.b, z2.h, #7
- add z0.b, z0.b, #0x80
- add z2.b, z2.b, #0x80
- st1b {z0.h}, p2, [x2]
- add x2, x2, x5
- st1b {z2.h}, p2, [x2]
- add x2, x2, x5
- cbnz w12, .Loop_sve2_addavg_6x\h
- ret
-endfunc
-.endm
-
-addAvg_6xN_sve2 8
-addAvg_6xN_sve2 16
-
-.macro addAvg_8xN_sve2 h
-function PFX(addAvg_8x\h\()_sve2)
- ptrue p0.b, vl16
-.rept \h / 2
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- ld1b {z2.b}, p0/z, [x0]
- ld1b {z3.b}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z1.h
- add z2.h, p0/m, z2.h, z3.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- sqrshrnb z2.b, z2.h, #7
- add z2.b, z2.b, #0x80
- st1b {z0.h}, p0, [x2]
- add x2, x2, x5
- st1b {z2.h}, p0, [x2]
- add x2, x2, x5
-.endr
- ret
-endfunc
-.endm
-
-.macro addAvg_8xN1_sve2 h
-function PFX(addAvg_8x\h\()_sve2)
- mov w12, #\h / 2
- ptrue p0.b, vl16
-.Loop_sve2_addavg_8x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- ld1b {z2.b}, p0/z, [x0]
- ld1b {z3.b}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z1.h
- add z2.h, p0/m, z2.h, z3.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- sqrshrnb z2.b, z2.h, #7
- add z2.b, z2.b, #0x80
- st1b {z0.h}, p0, [x2]
- add x2, x2, x5
- st1b {z2.h}, p0, [x2]
- add x2, x2, x5
- cbnz w12, .Loop_sve2_addavg_8x\h
- ret
-endfunc
-.endm
-
-addAvg_8xN_sve2 2
-addAvg_8xN_sve2 4
-addAvg_8xN_sve2 6
-addAvg_8xN_sve2 8
-addAvg_8xN_sve2 12
-addAvg_8xN_sve2 16
-addAvg_8xN1_sve2 32
-addAvg_8xN1_sve2 64
-
-.macro addAvg_12xN_sve2 h
-function PFX(addAvg_12x\h\()_sve2)
- mov w12, #\h
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_addAvg_12x\h
- ptrue p0.b, vl16
- ptrue p1.b, vl8
-.Loop_sve2_addavg_12x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x1]
- ld1b {z2.b}, p1/z, [x0, #1, mul vl]
- ld1b {z3.b}, p1/z, [x1, #1, mul vl]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z1.h
- add z2.h, p1/m, z2.h, z3.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- sqrshrnb z2.b, z2.h, #7
- add z2.b, z2.b, #0x80
- st1b {z0.h}, p0, [x2]
- st1b {z2.h}, p1, [x2, #1, mul vl]
- add x2, x2, x5
- cbnz w12, .Loop_sve2_addavg_12x\h
- ret
-.vl_gt_16_addAvg_12x\h\():
- mov x10, #24
- mov x11, #0
- whilelt p0.b, x11, x10
-.Loop_sve2_gt_16_addavg_12x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z1.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- sqrshrnb z2.b, z2.h, #7
- add z2.b, z2.b, #0x80
- st1b {z0.h}, p0, [x2]
- add x2, x2, x5
- cbnz w12, .Loop_sve2_gt_16_addavg_12x\h
- ret
-endfunc
-.endm
-
-addAvg_12xN_sve2 16
-addAvg_12xN_sve2 32
-
-.macro addAvg_16xN_sve2 h
-function PFX(addAvg_16x\h\()_sve2)
- mov w12, #\h
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_addAvg_16x\h
- ptrue p0.b, vl16
-.Loop_eq_16_sve2_addavg_16x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x1]
- ld1b {z2.b}, p0/z, [x0, #1, mul vl]
- ld1b {z3.b}, p0/z, [x1, #1, mul vl]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z1.h
- add z2.h, p0/m, z2.h, z3.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- sqrshrnb z2.b, z2.h, #7
- add z2.b, z2.b, #0x80
- st1b {z0.h}, p0, [x2]
- st1b {z2.h}, p0, [x2, #1, mul vl]
- add x2, x2, x5
- cbnz w12, .Loop_eq_16_sve2_addavg_16x\h
- ret
-.vl_gt_16_addAvg_16x\h\():
- ptrue p0.b, vl32
-.Loop_gt_16_sve2_addavg_16x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z1.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- st1b {z0.h}, p0, [x2]
- add x2, x2, x5
- cbnz w12, .Loop_gt_16_sve2_addavg_16x\h
- ret
-endfunc
-.endm
-
-addAvg_16xN_sve2 4
-addAvg_16xN_sve2 8
-addAvg_16xN_sve2 12
-addAvg_16xN_sve2 16
-addAvg_16xN_sve2 24
-addAvg_16xN_sve2 32
-addAvg_16xN_sve2 64
-
-.macro addAvg_24xN_sve2 h
-function PFX(addAvg_24x\h\()_sve2)
- mov w12, #\h
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_addAvg_24x\h
- addAvg_start
-.Loop_eq_16_sve2_addavg_24x\h\():
- sub w12, w12, #1
- ld1 {v0.16b-v2.16b}, [x0], x3
- ld1 {v3.16b-v5.16b}, [x1], x4
- addavg_1 v0, v3
- addavg_1 v1, v4
- addavg_1 v2, v5
- sqxtun v0.8b, v0.8h
- sqxtun v1.8b, v1.8h
- sqxtun v2.8b, v2.8h
- st1 {v0.8b-v2.8b}, [x2], x5
- cbnz w12, .Loop_eq_16_sve2_addavg_24x\h
- ret
-.vl_gt_16_addAvg_24x\h\():
- cmp x9, #48
- bgt .vl_gt_48_addAvg_24x\h
- ptrue p0.b, vl32
- ptrue p1.b, vl16
-.Loop_gt_16_sve2_addavg_24x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p1/z, [x0, #1, mul vl]
- ld1b {z2.b}, p0/z, [x1]
- ld1b {z3.b}, p1/z, [x1, #1, mul vl]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z2.h
- add z1.h, p1/m, z1.h, z3.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- sqrshrnb z1.b, z1.h, #7
- add z1.b, z1.b, #0x80
- st1b {z0.h}, p0, [x2]
- st1b {z1.h}, p1, [x2, #1, mul vl]
- add x2, x2, x5
- cbnz w12, .Loop_gt_16_sve2_addavg_24x\h
- ret
-.vl_gt_48_addAvg_24x\h\():
- mov x10, #48
- mov x11, #0
- whilelt p0.b, x11, x10
-.Loop_gt_48_sve2_addavg_24x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z2.b}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z2.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- st1b {z0.h}, p0, [x2]
- add x2, x2, x5
- cbnz w12, .Loop_gt_48_sve2_addavg_24x\h
- ret
-endfunc
-.endm
-
-addAvg_24xN_sve2 32
-addAvg_24xN_sve2 64
-
-.macro addAvg_32xN_sve2 h
-function PFX(addAvg_32x\h\()_sve2)
- mov w12, #\h
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_addAvg_32x\h
- ptrue p0.b, vl16
-.Loop_eq_16_sve2_addavg_32x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x0, #1, mul vl]
- ld1b {z2.b}, p0/z, [x0, #2, mul vl]
- ld1b {z3.b}, p0/z, [x0, #3, mul vl]
- ld1b {z4.b}, p0/z, [x1]
- ld1b {z5.b}, p0/z, [x1, #1, mul vl]
- ld1b {z6.b}, p0/z, [x1, #2, mul vl]
- ld1b {z7.b}, p0/z, [x1, #3, mul vl]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z4.h
- add z1.h, p0/m, z1.h, z5.h
- add z2.h, p0/m, z2.h, z6.h
- add z3.h, p0/m, z3.h, z7.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- sqrshrnb z1.b, z1.h, #7
- add z1.b, z1.b, #0x80
- sqrshrnb z2.b, z2.h, #7
- add z2.b, z2.b, #0x80
- sqrshrnb z3.b, z3.h, #7
- add z3.b, z3.b, #0x80
- st1b {z0.h}, p0, [x2]
- st1b {z1.h}, p0, [x2, #1, mul vl]
- st1b {z2.h}, p0, [x2, #2, mul vl]
- st1b {z3.h}, p0, [x2, #3, mul vl]
- add x2, x2, x5
- cbnz w12, .Loop_eq_16_sve2_addavg_32x\h
- ret
-.vl_gt_16_addAvg_32x\h\():
- cmp x9, #48
- bgt .vl_gt_48_addAvg_32x\h
- ptrue p0.b, vl32
-.Loop_gt_eq_32_sve2_addavg_32x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x0, #1, mul vl]
- ld1b {z2.b}, p0/z, [x1]
- ld1b {z3.b}, p0/z, [x1, #1, mul vl]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z2.h
- add z1.h, p0/m, z1.h, z3.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- sqrshrnb z1.b, z1.h, #7
- add z1.b, z1.b, #0x80
- st1b {z0.h}, p0, [x2]
- st1b {z1.h}, p0, [x2, #1, mul vl]
- add x2, x2, x5
- cbnz w12, .Loop_gt_eq_32_sve2_addavg_32x\h
- ret
-.vl_gt_48_addAvg_32x\h\():
- ptrue p0.b, vl64
-.Loop_eq_64_sve2_addavg_32x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z1.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- st1b {z0.h}, p0, [x2]
- add x2, x2, x5
- cbnz w12, .Loop_eq_64_sve2_addavg_32x\h
- ret
-endfunc
-.endm
-
-addAvg_32xN_sve2 8
-addAvg_32xN_sve2 16
-addAvg_32xN_sve2 24
-addAvg_32xN_sve2 32
-addAvg_32xN_sve2 48
-addAvg_32xN_sve2 64
-
-function PFX(addAvg_48x64_sve2)
- mov w12, #64
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_addAvg_48x64
- addAvg_start
- sub x3, x3, #64
- sub x4, x4, #64
-.Loop_eq_16_sve2_addavg_48x64:
- sub w12, w12, #1
- ld1 {v0.8h-v3.8h}, [x0], #64
- ld1 {v4.8h-v7.8h}, [x1], #64
- ld1 {v20.8h-v21.8h}, [x0], x3
- ld1 {v22.8h-v23.8h}, [x1], x4
- addavg_1 v0, v4
- addavg_1 v1, v5
- addavg_1 v2, v6
- addavg_1 v3, v7
- addavg_1 v20, v22
- addavg_1 v21, v23
- sqxtun v0.8b, v0.8h
- sqxtun2 v0.16b, v1.8h
- sqxtun v1.8b, v2.8h
- sqxtun2 v1.16b, v3.8h
- sqxtun v2.8b, v20.8h
- sqxtun2 v2.16b, v21.8h
- st1 {v0.16b-v2.16b}, [x2], x5
- cbnz w12, .Loop_eq_16_sve2_addavg_48x64
- ret
-.vl_gt_16_addAvg_48x64:
- cmp x9, #48
- bgt .vl_gt_48_addAvg_48x64
- ptrue p0.b, vl32
-.Loop_gt_eq_32_sve2_addavg_48x64:
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x0, #1, mul vl]
- ld1b {z2.b}, p0/z, [x0, #2, mul vl]
- ld1b {z4.b}, p0/z, [x1]
- ld1b {z5.b}, p0/z, [x1, #1, mul vl]
- ld1b {z6.b}, p0/z, [x1, #2, mul vl]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z4.h
- add z1.h, p0/m, z1.h, z5.h
- add z2.h, p0/m, z2.h, z6.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- sqrshrnb z1.b, z1.h, #7
- add z1.b, z1.b, #0x80
- sqrshrnb z2.b, z2.h, #7
- add z2.b, z2.b, #0x80
- st1b {z0.h}, p0, [x2]
- st1b {z1.h}, p0, [x2, #1, mul vl]
- st1b {z2.h}, p0, [x2, #2, mul vl]
- add x2, x2, x5
- cbnz w12, .Loop_gt_eq_32_sve2_addavg_48x64
- ret
-.vl_gt_48_addAvg_48x64:
- cmp x9, #112
- bgt .vl_gt_112_addAvg_48x64
- ptrue p0.b, vl64
- ptrue p1.b, vl32
-.Loop_gt_48_sve2_addavg_48x64:
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p1/z, [x0, #1, mul vl]
- ld1b {z4.b}, p0/z, [x1]
- ld1b {z5.b}, p1/z, [x1, #1, mul vl]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z4.h
- add z1.h, p1/m, z1.h, z5.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- sqrshrnb z1.b, z1.h, #7
- add z1.b, z1.b, #0x80
- st1b {z0.h}, p0, [x2]
- st1b {z1.h}, p1, [x2, #1, mul vl]
- add x2, x2, x5
- cbnz w12, .Loop_gt_48_sve2_addavg_48x64
- ret
-.vl_gt_112_addAvg_48x64:
- mov x10, #96
- mov x11, #0
- whilelt p0.b, x11, x10
-.Loop_gt_112_sve2_addavg_48x64:
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z4.b}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z4.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- st1b {z0.h}, p0, [x2]
- add x2, x2, x5
- cbnz w12, .Loop_gt_112_sve2_addavg_48x64
- ret
-endfunc
-
-.macro addAvg_64xN_sve2 h
-function PFX(addAvg_64x\h\()_sve2)
- mov w12, #\h
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_addAvg_64x\h
- addAvg_start
- sub x3, x3, #64
- sub x4, x4, #64
-.Loop_eq_16_sve2_addavg_64x\h\():
- sub w12, w12, #1
- ld1 {v0.8h-v3.8h}, [x0], #64
- ld1 {v4.8h-v7.8h}, [x1], #64
- ld1 {v20.8h-v23.8h}, [x0], x3
- ld1 {v24.8h-v27.8h}, [x1], x4
- addavg_1 v0, v4
- addavg_1 v1, v5
- addavg_1 v2, v6
- addavg_1 v3, v7
- addavg_1 v20, v24
- addavg_1 v21, v25
- addavg_1 v22, v26
- addavg_1 v23, v27
- sqxtun v0.8b, v0.8h
- sqxtun2 v0.16b, v1.8h
- sqxtun v1.8b, v2.8h
- sqxtun2 v1.16b, v3.8h
- sqxtun v2.8b, v20.8h
- sqxtun2 v2.16b, v21.8h
- sqxtun v3.8b, v22.8h
- sqxtun2 v3.16b, v23.8h
- st1 {v0.16b-v3.16b}, [x2], x5
- cbnz w12, .Loop_eq_16_sve2_addavg_64x\h
- ret
-.vl_gt_16_addAvg_64x\h\():
- cmp x9, #48
- bgt .vl_gt_48_addAvg_64x\h
- ptrue p0.b, vl32
-.Loop_gt_eq_32_sve2_addavg_64x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x0, #1, mul vl]
- ld1b {z2.b}, p0/z, [x0, #2, mul vl]
- ld1b {z3.b}, p0/z, [x0, #3, mul vl]
- ld1b {z4.b}, p0/z, [x1]
- ld1b {z5.b}, p0/z, [x1, #1, mul vl]
- ld1b {z6.b}, p0/z, [x1, #2, mul vl]
- ld1b {z7.b}, p0/z, [x1, #3, mul vl]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z4.h
- add z1.h, p0/m, z1.h, z5.h
- add z2.h, p0/m, z2.h, z6.h
- add z3.h, p0/m, z3.h, z7.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- sqrshrnb z1.b, z1.h, #7
- add z1.b, z1.b, #0x80
- sqrshrnb z2.b, z2.h, #7
- add z2.b, z2.b, #0x80
- sqrshrnb z3.b, z3.h, #7
- add z3.b, z3.b, #0x80
- st1b {z0.h}, p0, [x2]
- st1b {z1.h}, p0, [x2, #1, mul vl]
- st1b {z2.h}, p0, [x2, #2, mul vl]
- st1b {z3.h}, p0, [x2, #3, mul vl]
- add x2, x2, x5
- cbnz w12, .Loop_gt_eq_32_sve2_addavg_64x\h
- ret
-.vl_gt_48_addAvg_64x\h\():
- cmp x9, #112
- bgt .vl_gt_112_addAvg_64x\h
- ptrue p0.b, vl64
-.Loop_gt_eq_48_sve2_addavg_64x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x0, #1, mul vl]
- ld1b {z4.b}, p0/z, [x1]
- ld1b {z5.b}, p0/z, [x1, #1, mul vl]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z4.h
- add z1.h, p0/m, z1.h, z5.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- sqrshrnb z1.b, z1.h, #7
- add z1.b, z1.b, #0x80
- st1b {z0.h}, p0, [x2]
- st1b {z1.h}, p0, [x2, #1, mul vl]
- add x2, x2, x5
- cbnz w12, .Loop_gt_eq_48_sve2_addavg_64x\h
- ret
-.vl_gt_112_addAvg_64x\h\():
- ptrue p0.b, vl128
-.Loop_gt_eq_128_sve2_addavg_64x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z4.b}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z4.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- st1b {z0.h}, p0, [x2]
- add x2, x2, x5
- cbnz w12, .Loop_gt_eq_128_sve2_addavg_64x\h
- ret
-endfunc
-.endm
-
-addAvg_64xN_sve2 16
-addAvg_64xN_sve2 32
-addAvg_64xN_sve2 48
-addAvg_64xN_sve2 64
diff --git a/source/common/aarch64/mc-a.S b/source/common/aarch64/mc-a.S
index 130bf1a4a..876228473 100644
--- a/source/common/aarch64/mc-a.S
+++ b/source/common/aarch64/mc-a.S
@@ -214,344 +214,3 @@ pixel_avg_pp_64xN_neon 16
pixel_avg_pp_64xN_neon 32
pixel_avg_pp_64xN_neon 48
pixel_avg_pp_64xN_neon 64
-
-// void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
-.macro addAvg_2xN h
-function PFX(addAvg_2x\h\()_neon)
- addAvg_start
-.rept \h / 2
- ldr w10, [x0]
- ldr w11, [x1]
- add x0, x0, x3
- add x1, x1, x4
- ldr w12, [x0]
- ldr w13, [x1]
- add x0, x0, x3
- add x1, x1, x4
- dup v0.2s, w10
- dup v1.2s, w11
- dup v2.2s, w12
- dup v3.2s, w13
- add v0.4h, v0.4h, v1.4h
- add v2.4h, v2.4h, v3.4h
- saddl v0.4s, v0.4h, v30.4h
- saddl v2.4s, v2.4h, v30.4h
- shrn v0.4h, v0.4s, #7
- shrn2 v0.8h, v2.4s, #7
- sqxtun v0.8b, v0.8h
- st1 {v0.h}[0], [x2], x5
- st1 {v0.h}[2], [x2], x5
-.endr
- ret
-endfunc
-.endm
-
-addAvg_2xN 4
-addAvg_2xN 8
-addAvg_2xN 16
-
-.macro addAvg_4xN h
-function PFX(addAvg_4x\h\()_neon)
- addAvg_start
-.rept \h / 2
- ld1 {v0.8b}, [x0], x3
- ld1 {v1.8b}, [x1], x4
- ld1 {v2.8b}, [x0], x3
- ld1 {v3.8b}, [x1], x4
- add v0.4h, v0.4h, v1.4h
- add v2.4h, v2.4h, v3.4h
- saddl v0.4s, v0.4h, v30.4h
- saddl v2.4s, v2.4h, v30.4h
- shrn v0.4h, v0.4s, #7
- shrn2 v0.8h, v2.4s, #7
- sqxtun v0.8b, v0.8h
- st1 {v0.s}[0], [x2], x5
- st1 {v0.s}[1], [x2], x5
-.endr
- ret
-endfunc
-.endm
-
-addAvg_4xN 2
-addAvg_4xN 4
-addAvg_4xN 8
-addAvg_4xN 16
-addAvg_4xN 32
-
-.macro addAvg_6xN h
-function PFX(addAvg_6x\h\()_neon)
- addAvg_start
- mov w12, #\h / 2
- sub x5, x5, #4
-.Loop_addavg_6x\h:
- sub w12, w12, #1
- ld1 {v0.16b}, [x0], x3
- ld1 {v1.16b}, [x1], x4
- ld1 {v2.16b}, [x0], x3
- ld1 {v3.16b}, [x1], x4
- add v0.8h, v0.8h, v1.8h
- add v2.8h, v2.8h, v3.8h
- saddl v16.4s, v0.4h, v30.4h
- saddl2 v17.4s, v0.8h, v30.8h
- saddl v18.4s, v2.4h, v30.4h
- saddl2 v19.4s, v2.8h, v30.8h
- shrn v0.4h, v16.4s, #7
- shrn2 v0.8h, v17.4s, #7
- shrn v1.4h, v18.4s, #7
- shrn2 v1.8h, v19.4s, #7
- sqxtun v0.8b, v0.8h
- sqxtun v1.8b, v1.8h
- str s0, [x2], #4
- st1 {v0.h}[2], [x2], x5
- str s1, [x2], #4
- st1 {v1.h}[2], [x2], x5
- cbnz w12, .Loop_addavg_6x\h
- ret
-endfunc
-.endm
-
-addAvg_6xN 8
-addAvg_6xN 16
-
-.macro addAvg_8xN h
-function PFX(addAvg_8x\h\()_neon)
- addAvg_start
-.rept \h / 2
- ld1 {v0.16b}, [x0], x3
- ld1 {v1.16b}, [x1], x4
- ld1 {v2.16b}, [x0], x3
- ld1 {v3.16b}, [x1], x4
- add v0.8h, v0.8h, v1.8h
- add v2.8h, v2.8h, v3.8h
- saddl v16.4s, v0.4h, v30.4h
- saddl2 v17.4s, v0.8h, v30.8h
- saddl v18.4s, v2.4h, v30.4h
- saddl2 v19.4s, v2.8h, v30.8h
- shrn v0.4h, v16.4s, #7
- shrn2 v0.8h, v17.4s, #7
- shrn v1.4h, v18.4s, #7
- shrn2 v1.8h, v19.4s, #7
- sqxtun v0.8b, v0.8h
- sqxtun v1.8b, v1.8h
- st1 {v0.8b}, [x2], x5
- st1 {v1.8b}, [x2], x5
-.endr
- ret
-endfunc
-.endm
-
-.macro addAvg_8xN1 h
-function PFX(addAvg_8x\h\()_neon)
- addAvg_start
- mov w12, #\h / 2
-.Loop_addavg_8x\h:
- sub w12, w12, #1
- ld1 {v0.16b}, [x0], x3
- ld1 {v1.16b}, [x1], x4
- ld1 {v2.16b}, [x0], x3
- ld1 {v3.16b}, [x1], x4
- add v0.8h, v0.8h, v1.8h
- add v2.8h, v2.8h, v3.8h
- saddl v16.4s, v0.4h, v30.4h
- saddl2 v17.4s, v0.8h, v30.8h
- saddl v18.4s, v2.4h, v30.4h
- saddl2 v19.4s, v2.8h, v30.8h
- shrn v0.4h, v16.4s, #7
- shrn2 v0.8h, v17.4s, #7
- shrn v1.4h, v18.4s, #7
- shrn2 v1.8h, v19.4s, #7
- sqxtun v0.8b, v0.8h
- sqxtun v1.8b, v1.8h
- st1 {v0.8b}, [x2], x5
- st1 {v1.8b}, [x2], x5
- cbnz w12, .Loop_addavg_8x\h
- ret
-endfunc
-.endm
-
-addAvg_8xN 2
-addAvg_8xN 4
-addAvg_8xN 6
-addAvg_8xN 8
-addAvg_8xN 12
-addAvg_8xN 16
-addAvg_8xN1 32
-addAvg_8xN1 64
-
-.macro addAvg_12xN h
-function PFX(addAvg_12x\h\()_neon)
- addAvg_start
- sub x3, x3, #16
- sub x4, x4, #16
- sub x5, x5, #8
- mov w12, #\h
-.Loop_addAvg_12X\h\():
- sub w12, w12, #1
- ld1 {v0.16b}, [x0], #16
- ld1 {v1.16b}, [x1], #16
- ld1 {v2.8b}, [x0], x3
- ld1 {v3.8b}, [x1], x4
- add v0.8h, v0.8h, v1.8h
- add v2.4h, v2.4h, v3.4h
- saddl v16.4s, v0.4h, v30.4h
- saddl2 v17.4s, v0.8h, v30.8h
- saddl v18.4s, v2.4h, v30.4h
- shrn v0.4h, v16.4s, #7
- shrn2 v0.8h, v17.4s, #7
- shrn v1.4h, v18.4s, #7
- sqxtun v0.8b, v0.8h
- sqxtun v1.8b, v1.8h
- st1 {v0.8b}, [x2], #8
- st1 {v1.s}[0], [x2], x5
- cbnz w12, .Loop_addAvg_12X\h
- ret
-endfunc
-.endm
-
-addAvg_12xN 16
-addAvg_12xN 32
-
-.macro addAvg_16xN h
-function PFX(addAvg_16x\h\()_neon)
- addAvg_start
- mov w12, #\h
-.Loop_addavg_16x\h:
- sub w12, w12, #1
- ld1 {v0.8h-v1.8h}, [x0], x3
- ld1 {v2.8h-v3.8h}, [x1], x4
- addavg_1 v0, v2
- addavg_1 v1, v3
- sqxtun v0.8b, v0.8h
- sqxtun2 v0.16b, v1.8h
- st1 {v0.16b}, [x2], x5
- cbnz w12, .Loop_addavg_16x\h
- ret
-endfunc
-.endm
-
-addAvg_16xN 4
-addAvg_16xN 8
-addAvg_16xN 12
-addAvg_16xN 16
-addAvg_16xN 24
-addAvg_16xN 32
-addAvg_16xN 64
-
-.macro addAvg_24xN h
-function PFX(addAvg_24x\h\()_neon)
- addAvg_start
- mov w12, #\h
-.Loop_addavg_24x\h\():
- sub w12, w12, #1
- ld1 {v0.16b-v2.16b}, [x0], x3
- ld1 {v3.16b-v5.16b}, [x1], x4
- addavg_1 v0, v3
- addavg_1 v1, v4
- addavg_1 v2, v5
- sqxtun v0.8b, v0.8h
- sqxtun v1.8b, v1.8h
- sqxtun v2.8b, v2.8h
- st1 {v0.8b-v2.8b}, [x2], x5
- cbnz w12, .Loop_addavg_24x\h
- ret
-endfunc
-.endm
-
-addAvg_24xN 32
-addAvg_24xN 64
-
-.macro addAvg_32xN h
-function PFX(addAvg_32x\h\()_neon)
- addAvg_start
- mov w12, #\h
-.Loop_addavg_32x\h\():
- sub w12, w12, #1
- ld1 {v0.8h-v3.8h}, [x0], x3
- ld1 {v4.8h-v7.8h}, [x1], x4
- addavg_1 v0, v4
- addavg_1 v1, v5
- addavg_1 v2, v6
- addavg_1 v3, v7
- sqxtun v0.8b, v0.8h
- sqxtun v1.8b, v1.8h
- sqxtun v2.8b, v2.8h
- sqxtun v3.8b, v3.8h
- st1 {v0.8b-v3.8b}, [x2], x5
- cbnz w12, .Loop_addavg_32x\h
- ret
-endfunc
-.endm
-
-addAvg_32xN 8
-addAvg_32xN 16
-addAvg_32xN 24
-addAvg_32xN 32
-addAvg_32xN 48
-addAvg_32xN 64
-
-function PFX(addAvg_48x64_neon)
- addAvg_start
- sub x3, x3, #64
- sub x4, x4, #64
- mov w12, #64
-.Loop_addavg_48x64:
- sub w12, w12, #1
- ld1 {v0.8h-v3.8h}, [x0], #64
- ld1 {v4.8h-v7.8h}, [x1], #64
- ld1 {v20.8h-v21.8h}, [x0], x3
- ld1 {v22.8h-v23.8h}, [x1], x4
- addavg_1 v0, v4
- addavg_1 v1, v5
- addavg_1 v2, v6
- addavg_1 v3, v7
- addavg_1 v20, v22
- addavg_1 v21, v23
- sqxtun v0.8b, v0.8h
- sqxtun2 v0.16b, v1.8h
- sqxtun v1.8b, v2.8h
- sqxtun2 v1.16b, v3.8h
- sqxtun v2.8b, v20.8h
- sqxtun2 v2.16b, v21.8h
- st1 {v0.16b-v2.16b}, [x2], x5
- cbnz w12, .Loop_addavg_48x64
- ret
-endfunc
-
-.macro addAvg_64xN h
-function PFX(addAvg_64x\h\()_neon)
- addAvg_start
- mov w12, #\h
- sub x3, x3, #64
- sub x4, x4, #64
-.Loop_addavg_64x\h\():
- sub w12, w12, #1
- ld1 {v0.8h-v3.8h}, [x0], #64
- ld1 {v4.8h-v7.8h}, [x1], #64
- ld1 {v20.8h-v23.8h}, [x0], x3
- ld1 {v24.8h-v27.8h}, [x1], x4
- addavg_1 v0, v4
- addavg_1 v1, v5
- addavg_1 v2, v6
- addavg_1 v3, v7
- addavg_1 v20, v24
- addavg_1 v21, v25
- addavg_1 v22, v26
- addavg_1 v23, v27
- sqxtun v0.8b, v0.8h
- sqxtun2 v0.16b, v1.8h
- sqxtun v1.8b, v2.8h
- sqxtun2 v1.16b, v3.8h
- sqxtun v2.8b, v20.8h
- sqxtun2 v2.16b, v21.8h
- sqxtun v3.8b, v22.8h
- sqxtun2 v3.16b, v23.8h
- st1 {v0.16b-v3.16b}, [x2], x5
- cbnz w12, .Loop_addavg_64x\h
- ret
-endfunc
-.endm
-
-addAvg_64xN 16
-addAvg_64xN 32
-addAvg_64xN 48
-addAvg_64xN 64
diff --git a/source/common/aarch64/mem-neon.h b/source/common/aarch64/mem-neon.h
index 8bd5fbee9..27d72b70c 100644
--- a/source/common/aarch64/mem-neon.h
+++ b/source/common/aarch64/mem-neon.h
@@ -74,6 +74,26 @@ static void inline store_u8x4x1(uint8_t *d, const uint8x8_t s)
vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(s), 0);
}
+// Store 2 bytes from the low half of a uint8x8_t.
+static void inline store_u8x2x1(uint8_t *d, const uint8x8_t s)
+{
+ vst1_lane_u16((uint16_t *)d, vreinterpret_u16_u8(s), 0);
+}
+
+// Load 2 int16_t into an int16x8_t.
+static inline int16x8_t load_s16x2x1(const int16_t *p)
+{
+ int32x4_t ret = vld1q_lane_s32((const int32_t *)p, vdupq_n_s32(0), 0);
+
+ return vreinterpretq_s16_s32(ret);
+}
+
+// Store 2 uint16_t from the low half of a uint16x8_t.
+static inline void store_u16x2x1(const uint16_t *d, const uint16x8_t s)
+{
+ vst1q_lane_u32((uint32_t *)d, vreinterpretq_u32_u16(s), 0);
+}
+
// Store N blocks of 32-bits from (N / 2) D-Registers.
template<int N>
static void inline store_u8x4_strided_xN(uint8_t *d, intptr_t stride,
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index f4df6786e..ef7861284 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -1145,49 +1145,138 @@ void pixel_add_ps_neon(pixel *a, intptr_t dstride, const pixel *b0, const int16_
}
}
-template<int bx, int by>
-void addAvg_neon(const int16_t *src0, const int16_t *src1, pixel *dst, intptr_t src0Stride, intptr_t src1Stride,
- intptr_t dstStride)
+template<int width, int height>
+void addAvg_neon(const int16_t *src0, const int16_t *src1, pixel *dst,
+ intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
{
-
const int shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
- const int offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;
+ const int offset = 2 * IF_INTERNAL_OFFS;
- const int32x4_t addon = vdupq_n_s32(offset);
- for (int y = 0; y < by; y++)
+#if HIGH_BIT_DEPTH
+ const int16x8_t addon = vdupq_n_s16(offset >> shiftNum);
+
+ for (int h = 0; h < height; h++)
{
- int x = 0;
+ int w = 0;
+ for (; w + 16 <= width; w += 16)
+ {
+ int16x8_t s0[2], s1[2];
+ load_s16x8xn<2>(src0 + w, 8, s0);
+ load_s16x8xn<2>(src1 + w, 8, s1);
- for (; (x + 8) <= bx; x += 8)
+ int16x8_t d0_lo = vrsraq_n_s16(addon, vaddq_s16(s0[0], s1[0]), shiftNum);
+ int16x8_t d0_hi = vrsraq_n_s16(addon, vaddq_s16(s0[1], s1[1]), shiftNum);
+
+ d0_lo = vminq_s16(d0_lo, vdupq_n_s16((1 << X265_DEPTH) - 1));
+ d0_lo = vmaxq_s16(d0_lo, vdupq_n_s16(0));
+ d0_hi = vminq_s16(d0_hi, vdupq_n_s16((1 << X265_DEPTH) - 1));
+ d0_hi = vmaxq_s16(d0_hi, vdupq_n_s16(0));
+
+ vst1q_u16(dst + w, vreinterpretq_u16_s16(d0_lo));
+ vst1q_u16(dst + w + 8, vreinterpretq_u16_s16(d0_hi));
+ }
+ if (width & 8)
{
- int16x8_t in0 = vld1q_s16(src0 + x);
- int16x8_t in1 = vld1q_s16(src1 + x);
- int32x4_t t1 = vaddl_s16(vget_low_s16(in0), vget_low_s16(in1));
- int32x4_t t2 = vaddl_high_s16(in0, in1);
- t1 = vaddq_s32(t1, addon);
- t2 = vaddq_s32(t2, addon);
- t1 = vshrq_n_s32(t1, shiftNum);
- t2 = vshrq_n_s32(t2, shiftNum);
- int16x8_t t = vuzp1q_s16(vreinterpretq_s16_s32(t1),
- vreinterpretq_s16_s32(t2));
-#if HIGH_BIT_DEPTH
- t = vminq_s16(t, vdupq_n_s16((1 << X265_DEPTH) - 1));
- t = vmaxq_s16(t, vdupq_n_s16(0));
- vst1q_u16(dst + x, vreinterpretq_u16_s16(t));
-#else
- vst1_u8(dst + x, vqmovun_s16(t));
-#endif
+ int16x8_t s0 = vld1q_s16(src0 + w);
+ int16x8_t s1 = vld1q_s16(src1 + w);
+
+ int16x8_t d0 = vrsraq_n_s16(addon, vaddq_s16(s0, s1), shiftNum);
+ d0 = vminq_s16(d0, vdupq_n_s16((1 << X265_DEPTH) - 1));
+ d0 = vmaxq_s16(d0, vdupq_n_s16(0));
+
+ vst1q_u16(dst + w, vreinterpretq_u16_s16(d0));
+
+ w += 8;
+ }
+ if (width & 4)
+ {
+ int16x4_t s0 = vld1_s16(src0 + w);
+ int16x4_t s1 = vld1_s16(src1 + w);
+
+ int16x4_t d0 = vrsra_n_s16(vget_low_s16(addon), vadd_s16(s0, s1), shiftNum);
+ d0 = vmin_s16(d0, vdup_n_s16((1 << X265_DEPTH) - 1));
+ d0 = vmax_s16(d0, vdup_n_s16(0));
+
+ vst1_u16(dst + w, vreinterpret_u16_s16(d0));
+
+ w += 4;
+ }
+ if (width & 2)
+ {
+ int16x8_t s0 = load_s16x2x1(src0 + w);
+ int16x8_t s1 = load_s16x2x1(src1 + w);
+
+ int16x8_t d0 = vrsraq_n_s16(addon, vaddq_s16(s0, s1), shiftNum);
+ d0 = vminq_s16(d0, vdupq_n_s16((1 << X265_DEPTH) - 1));
+ d0 = vmaxq_s16(d0, vdupq_n_s16(0));
+
+ store_u16x2x1(dst + w, vreinterpretq_u16_s16(d0));
+ }
+
+ src0 += src0Stride;
+ src1 += src1Stride;
+ dst += dstStride;
+ }
+#else // !HIGH_BIT_DEPTH
+ const uint8x8_t addon = vdup_n_u8(offset >> shiftNum);
+
+ for (int h = 0; h < height; h++)
+ {
+ int w = 0;
+ for (; w + 16 <= width; w += 16)
+ {
+ int16x8_t s0[2], s1[2];
+ load_s16x8xn<2>(src0 + w, 8, s0);
+ load_s16x8xn<2>(src1 + w, 8, s1);
+
+ int8x8_t sum01_s8_lo = vqrshrn_n_s16(vaddq_s16(s0[0], s1[0]), shiftNum);
+ int8x8_t sum01_s8_hi = vqrshrn_n_s16(vaddq_s16(s0[1], s1[1]), shiftNum);
+ uint8x8_t d0_lo = vadd_u8(vreinterpret_u8_s8(sum01_s8_lo), addon);
+ uint8x8_t d0_hi = vadd_u8(vreinterpret_u8_s8(sum01_s8_hi), addon);
+
+ vst1_u8(dst + w, d0_lo);
+ vst1_u8(dst + w + 8, d0_hi);
+ }
+ if (width & 8)
+ {
+ int16x8_t s0 = vld1q_s16(src0 + w);
+ int16x8_t s1 = vld1q_s16(src1 + w);
+
+ int8x8_t sum01_s8 = vqrshrn_n_s16(vaddq_s16(s0, s1), shiftNum);
+ uint8x8_t d0 = vadd_u8(vreinterpret_u8_s8(sum01_s8), addon);
+
+ vst1_u8(dst + w, d0);
+
+ w += 8;
}
- for (; x < bx; x += 2)
+ if (width & 4)
{
- dst[x + 0] = x265_clip((src0[x + 0] + src1[x + 0] + offset) >> shiftNum);
- dst[x + 1] = x265_clip((src0[x + 1] + src1[x + 1] + offset) >> shiftNum);
+ int16x8_t s0 = vcombine_s16(vld1_s16(src0 + w), vdup_n_s16(0));
+ int16x8_t s1 = vcombine_s16(vld1_s16(src1 + w), vdup_n_s16(0));
+
+ int8x8_t sum01_s8 = vqrshrn_n_s16(vaddq_s16(s0, s1), shiftNum);
+ uint8x8_t d0 = vadd_u8(vreinterpret_u8_s8(sum01_s8), addon);
+
+ store_u8x4x1(dst + w, d0);
+
+ w += 4;
+ }
+ if (width & 2)
+ {
+ int16x8_t s0 = load_s16x2x1(src0 + w);
+ int16x8_t s1 = load_s16x2x1(src1 + w);
+
+ int8x8_t sum01_s8 = vqrshrn_n_s16(vaddq_s16(s0, s1), shiftNum);
+ uint8x8_t d0 = vadd_u8(vreinterpret_u8_s8(sum01_s8), addon);
+
+ store_u8x2x1(dst + w, d0);
}
src0 += src0Stride;
src1 += src1Stride;
dst += dstStride;
}
+#endif
}
void planecopy_cp_neon(const uint8_t *src, intptr_t srcStride, pixel *dst,
@@ -2057,29 +2146,30 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
- CHROMA_PU_420(4, 4);
- CHROMA_PU_420(8, 8);
- CHROMA_PU_420(16, 16);
- CHROMA_PU_420(32, 32);
+ CHROMA_PU_420(2, 4);
+ CHROMA_PU_420(2, 8);
CHROMA_PU_420(4, 2);
- CHROMA_PU_420(8, 4);
+ CHROMA_PU_420(4, 4);
CHROMA_PU_420(4, 8);
- CHROMA_PU_420(8, 6);
CHROMA_PU_420(6, 8);
+ CHROMA_PU_420(4, 16);
CHROMA_PU_420(8, 2);
- CHROMA_PU_420(2, 8);
- CHROMA_PU_420(16, 8);
- CHROMA_PU_420(8, 16);
- CHROMA_PU_420(16, 12);
+ CHROMA_PU_420(8, 4);
+ CHROMA_PU_420(8, 6);
+ CHROMA_PU_420(8, 8);
+ CHROMA_PU_420(8, 16);
+ CHROMA_PU_420(8, 32);
CHROMA_PU_420(12, 16);
CHROMA_PU_420(16, 4);
- CHROMA_PU_420(4, 16);
- CHROMA_PU_420(32, 16);
+ CHROMA_PU_420(16, 8);
+ CHROMA_PU_420(16, 12);
+ CHROMA_PU_420(16, 16);
CHROMA_PU_420(16, 32);
- CHROMA_PU_420(32, 24);
CHROMA_PU_420(24, 32);
CHROMA_PU_420(32, 8);
- CHROMA_PU_420(8, 32);
+ CHROMA_PU_420(32, 16);
+ CHROMA_PU_420(32, 24);
+ CHROMA_PU_420(32, 32);
@@ -2161,30 +2251,31 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
- CHROMA_PU_422(4, 8);
- CHROMA_PU_422(8, 16);
- CHROMA_PU_422(16, 32);
- CHROMA_PU_422(32, 64);
- CHROMA_PU_422(4, 4);
+ CHROMA_PU_422(2, 4);
CHROMA_PU_422(2, 8);
- CHROMA_PU_422(8, 8);
+ CHROMA_PU_422(2, 16);
+ CHROMA_PU_422(4, 4);
+ CHROMA_PU_422(4, 8);
CHROMA_PU_422(4, 16);
- CHROMA_PU_422(8, 12);
- CHROMA_PU_422(6, 16);
+ CHROMA_PU_422(4, 32);
CHROMA_PU_422(8, 4);
- CHROMA_PU_422(2, 16);
- CHROMA_PU_422(16, 16);
+ CHROMA_PU_422(8, 8);
+ CHROMA_PU_422(8, 12);
+ CHROMA_PU_422(8, 16);
CHROMA_PU_422(8, 32);
- CHROMA_PU_422(16, 24);
+ CHROMA_PU_422(8, 64);
+ CHROMA_PU_422(6, 16);
CHROMA_PU_422(12, 32);
CHROMA_PU_422(16, 8);
- CHROMA_PU_422(4, 32);
- CHROMA_PU_422(32, 32);
+ CHROMA_PU_422(16, 16);
+ CHROMA_PU_422(16, 24);
+ CHROMA_PU_422(16, 32);
CHROMA_PU_422(16, 64);
- CHROMA_PU_422(32, 48);
CHROMA_PU_422(24, 64);
CHROMA_PU_422(32, 16);
- CHROMA_PU_422(8, 64);
+ CHROMA_PU_422(32, 32);
+ CHROMA_PU_422(32, 48);
+ CHROMA_PU_422(32, 64);
p.chroma[X265_CSP_I422].pu[CHROMA_422_2x4].satd = NULL;
--
2.39.5 (Apple Git-154)
-------------- next part --------------
>From 5af250c195d7b118b3d61c3922c67ec2bfe85afb Mon Sep 17 00:00:00 2001
Message-Id: <5af250c195d7b118b3d61c3922c67ec2bfe85afb.1750358721.git.li.zhang2 at arm.com>
From: Li Zhang <li.zhang2 at arm.com>
Date: Thu, 29 May 2025 18:10:32 +0200
Subject: [PATCH] AArch64: Optimize and clean up addAvg Neon and SVE2 functions
Extend the neon intrinsics implementation to support all block sizes and
optimize it to use rounding-shift-and-accumulate instead of separate
widening, add, and shift steps. Also unroll the loops for larger block
sizes to enable the compiler to emit LDP and STP instructions.
Delete the Neon and SVE2 assembly implementations as they are 1-2x
slower than Neon intrinsics implementation.
---
source/common/aarch64/asm-primitives.cpp | 16 -
source/common/aarch64/mc-a-sve2.S | 606 -----------------------
source/common/aarch64/mc-a.S | 341 -------------
source/common/aarch64/mem-neon.h | 20 +
source/common/aarch64/pixel-prim.cpp | 207 +++++---
5 files changed, 169 insertions(+), 1021 deletions(-)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 5ce9352bd..e1fc8e82a 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -462,14 +462,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
ALL_LUMA_PU(pixelavg_pp[NONALIGNED], pixel_avg_pp, neon);
ALL_LUMA_PU(pixelavg_pp[ALIGNED], pixel_avg_pp, neon);
- // addAvg
- ALL_LUMA_PU(addAvg[NONALIGNED], addAvg, neon);
- ALL_LUMA_PU(addAvg[ALIGNED], addAvg, neon);
- ALL_CHROMA_420_PU(addAvg[NONALIGNED], addAvg, neon);
- ALL_CHROMA_422_PU(addAvg[NONALIGNED], addAvg, neon);
- ALL_CHROMA_420_PU(addAvg[ALIGNED], addAvg, neon);
- ALL_CHROMA_422_PU(addAvg[ALIGNED], addAvg, neon);
-
// pixel_var
p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_neon);
p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_neon);
@@ -635,14 +627,6 @@ void setupSve2Primitives(EncoderPrimitives &p)
LUMA_PU_MULTIPLE_ARCHS_3(pixelavg_pp[NONALIGNED], pixel_avg_pp, sve2);
LUMA_PU_MULTIPLE_ARCHS_3(pixelavg_pp[ALIGNED], pixel_avg_pp, sve2);
- // addAvg
- LUMA_PU_CAN_USE_SVE2(addAvg[NONALIGNED], addAvg);
- LUMA_PU_CAN_USE_SVE2(addAvg[ALIGNED], addAvg);
- CHROMA_420_PU_MULTIPLE_ARCHS(addAvg[NONALIGNED], addAvg, sve2);
- CHROMA_420_PU_MULTIPLE_ARCHS(addAvg[ALIGNED], addAvg, sve2);
- CHROMA_422_PU_CAN_USE_SVE2(addAvg[NONALIGNED], addAvg);
- CHROMA_422_PU_CAN_USE_SVE2(addAvg[ALIGNED], addAvg);
-
// pixel_var
p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_sve2);
p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_sve2);
diff --git a/source/common/aarch64/mc-a-sve2.S b/source/common/aarch64/mc-a-sve2.S
index 00fb0048f..fc0a6f3e8 100644
--- a/source/common/aarch64/mc-a-sve2.S
+++ b/source/common/aarch64/mc-a-sve2.S
@@ -298,609 +298,3 @@ pixel_avg_pp_64xN_sve2 16
pixel_avg_pp_64xN_sve2 32
pixel_avg_pp_64xN_sve2 48
pixel_avg_pp_64xN_sve2 64
-
-// void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
-
-.macro addAvg_2xN_sve2 h
-function PFX(addAvg_2x\h\()_sve2)
- ptrue p0.s, vl2
- ptrue p1.h, vl4
- ptrue p2.h, vl2
-.rept \h / 2
- ld1rw {z0.s}, p0/z, [x0]
- ld1rw {z1.s}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- ld1rw {z2.s}, p0/z, [x0]
- ld1rw {z3.s}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p1/m, z0.h, z1.h
- add z2.h, p1/m, z2.h, z3.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- sqrshrnb z2.b, z2.h, #7
- add z2.b, z2.b, #0x80
- st1b {z0.h}, p2, [x2]
- add x2, x2, x5
- st1b {z2.h}, p2, [x2]
- add x2, x2, x5
-.endr
- ret
-endfunc
-.endm
-
-addAvg_2xN_sve2 4
-addAvg_2xN_sve2 8
-addAvg_2xN_sve2 16
-
-.macro addAvg_6xN_sve2 h
-function PFX(addAvg_6x\h\()_sve2)
- mov w12, #\h / 2
- ptrue p0.b, vl16
- ptrue p2.h, vl6
-.Loop_sve2_addavg_6x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- ld1b {z2.b}, p0/z, [x0]
- ld1b {z3.b}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z1.h
- add z2.h, p0/m, z2.h, z3.h
- sqrshrnb z0.b, z0.h, #7
- sqrshrnb z2.b, z2.h, #7
- add z0.b, z0.b, #0x80
- add z2.b, z2.b, #0x80
- st1b {z0.h}, p2, [x2]
- add x2, x2, x5
- st1b {z2.h}, p2, [x2]
- add x2, x2, x5
- cbnz w12, .Loop_sve2_addavg_6x\h
- ret
-endfunc
-.endm
-
-addAvg_6xN_sve2 8
-addAvg_6xN_sve2 16
-
-.macro addAvg_8xN_sve2 h
-function PFX(addAvg_8x\h\()_sve2)
- ptrue p0.b, vl16
-.rept \h / 2
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- ld1b {z2.b}, p0/z, [x0]
- ld1b {z3.b}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z1.h
- add z2.h, p0/m, z2.h, z3.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- sqrshrnb z2.b, z2.h, #7
- add z2.b, z2.b, #0x80
- st1b {z0.h}, p0, [x2]
- add x2, x2, x5
- st1b {z2.h}, p0, [x2]
- add x2, x2, x5
-.endr
- ret
-endfunc
-.endm
-
-.macro addAvg_8xN1_sve2 h
-function PFX(addAvg_8x\h\()_sve2)
- mov w12, #\h / 2
- ptrue p0.b, vl16
-.Loop_sve2_addavg_8x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- ld1b {z2.b}, p0/z, [x0]
- ld1b {z3.b}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z1.h
- add z2.h, p0/m, z2.h, z3.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- sqrshrnb z2.b, z2.h, #7
- add z2.b, z2.b, #0x80
- st1b {z0.h}, p0, [x2]
- add x2, x2, x5
- st1b {z2.h}, p0, [x2]
- add x2, x2, x5
- cbnz w12, .Loop_sve2_addavg_8x\h
- ret
-endfunc
-.endm
-
-addAvg_8xN_sve2 2
-addAvg_8xN_sve2 4
-addAvg_8xN_sve2 6
-addAvg_8xN_sve2 8
-addAvg_8xN_sve2 12
-addAvg_8xN_sve2 16
-addAvg_8xN1_sve2 32
-addAvg_8xN1_sve2 64
-
-.macro addAvg_12xN_sve2 h
-function PFX(addAvg_12x\h\()_sve2)
- mov w12, #\h
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_addAvg_12x\h
- ptrue p0.b, vl16
- ptrue p1.b, vl8
-.Loop_sve2_addavg_12x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x1]
- ld1b {z2.b}, p1/z, [x0, #1, mul vl]
- ld1b {z3.b}, p1/z, [x1, #1, mul vl]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z1.h
- add z2.h, p1/m, z2.h, z3.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- sqrshrnb z2.b, z2.h, #7
- add z2.b, z2.b, #0x80
- st1b {z0.h}, p0, [x2]
- st1b {z2.h}, p1, [x2, #1, mul vl]
- add x2, x2, x5
- cbnz w12, .Loop_sve2_addavg_12x\h
- ret
-.vl_gt_16_addAvg_12x\h\():
- mov x10, #24
- mov x11, #0
- whilelt p0.b, x11, x10
-.Loop_sve2_gt_16_addavg_12x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z1.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- sqrshrnb z2.b, z2.h, #7
- add z2.b, z2.b, #0x80
- st1b {z0.h}, p0, [x2]
- add x2, x2, x5
- cbnz w12, .Loop_sve2_gt_16_addavg_12x\h
- ret
-endfunc
-.endm
-
-addAvg_12xN_sve2 16
-addAvg_12xN_sve2 32
-
-.macro addAvg_16xN_sve2 h
-function PFX(addAvg_16x\h\()_sve2)
- mov w12, #\h
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_addAvg_16x\h
- ptrue p0.b, vl16
-.Loop_eq_16_sve2_addavg_16x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x1]
- ld1b {z2.b}, p0/z, [x0, #1, mul vl]
- ld1b {z3.b}, p0/z, [x1, #1, mul vl]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z1.h
- add z2.h, p0/m, z2.h, z3.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- sqrshrnb z2.b, z2.h, #7
- add z2.b, z2.b, #0x80
- st1b {z0.h}, p0, [x2]
- st1b {z2.h}, p0, [x2, #1, mul vl]
- add x2, x2, x5
- cbnz w12, .Loop_eq_16_sve2_addavg_16x\h
- ret
-.vl_gt_16_addAvg_16x\h\():
- ptrue p0.b, vl32
-.Loop_gt_16_sve2_addavg_16x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z1.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- st1b {z0.h}, p0, [x2]
- add x2, x2, x5
- cbnz w12, .Loop_gt_16_sve2_addavg_16x\h
- ret
-endfunc
-.endm
-
-addAvg_16xN_sve2 4
-addAvg_16xN_sve2 8
-addAvg_16xN_sve2 12
-addAvg_16xN_sve2 16
-addAvg_16xN_sve2 24
-addAvg_16xN_sve2 32
-addAvg_16xN_sve2 64
-
-.macro addAvg_24xN_sve2 h
-function PFX(addAvg_24x\h\()_sve2)
- mov w12, #\h
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_addAvg_24x\h
- addAvg_start
-.Loop_eq_16_sve2_addavg_24x\h\():
- sub w12, w12, #1
- ld1 {v0.16b-v2.16b}, [x0], x3
- ld1 {v3.16b-v5.16b}, [x1], x4
- addavg_1 v0, v3
- addavg_1 v1, v4
- addavg_1 v2, v5
- sqxtun v0.8b, v0.8h
- sqxtun v1.8b, v1.8h
- sqxtun v2.8b, v2.8h
- st1 {v0.8b-v2.8b}, [x2], x5
- cbnz w12, .Loop_eq_16_sve2_addavg_24x\h
- ret
-.vl_gt_16_addAvg_24x\h\():
- cmp x9, #48
- bgt .vl_gt_48_addAvg_24x\h
- ptrue p0.b, vl32
- ptrue p1.b, vl16
-.Loop_gt_16_sve2_addavg_24x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p1/z, [x0, #1, mul vl]
- ld1b {z2.b}, p0/z, [x1]
- ld1b {z3.b}, p1/z, [x1, #1, mul vl]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z2.h
- add z1.h, p1/m, z1.h, z3.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- sqrshrnb z1.b, z1.h, #7
- add z1.b, z1.b, #0x80
- st1b {z0.h}, p0, [x2]
- st1b {z1.h}, p1, [x2, #1, mul vl]
- add x2, x2, x5
- cbnz w12, .Loop_gt_16_sve2_addavg_24x\h
- ret
-.vl_gt_48_addAvg_24x\h\():
- mov x10, #48
- mov x11, #0
- whilelt p0.b, x11, x10
-.Loop_gt_48_sve2_addavg_24x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z2.b}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z2.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- st1b {z0.h}, p0, [x2]
- add x2, x2, x5
- cbnz w12, .Loop_gt_48_sve2_addavg_24x\h
- ret
-endfunc
-.endm
-
-addAvg_24xN_sve2 32
-addAvg_24xN_sve2 64
-
-.macro addAvg_32xN_sve2 h
-function PFX(addAvg_32x\h\()_sve2)
- mov w12, #\h
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_addAvg_32x\h
- ptrue p0.b, vl16
-.Loop_eq_16_sve2_addavg_32x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x0, #1, mul vl]
- ld1b {z2.b}, p0/z, [x0, #2, mul vl]
- ld1b {z3.b}, p0/z, [x0, #3, mul vl]
- ld1b {z4.b}, p0/z, [x1]
- ld1b {z5.b}, p0/z, [x1, #1, mul vl]
- ld1b {z6.b}, p0/z, [x1, #2, mul vl]
- ld1b {z7.b}, p0/z, [x1, #3, mul vl]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z4.h
- add z1.h, p0/m, z1.h, z5.h
- add z2.h, p0/m, z2.h, z6.h
- add z3.h, p0/m, z3.h, z7.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- sqrshrnb z1.b, z1.h, #7
- add z1.b, z1.b, #0x80
- sqrshrnb z2.b, z2.h, #7
- add z2.b, z2.b, #0x80
- sqrshrnb z3.b, z3.h, #7
- add z3.b, z3.b, #0x80
- st1b {z0.h}, p0, [x2]
- st1b {z1.h}, p0, [x2, #1, mul vl]
- st1b {z2.h}, p0, [x2, #2, mul vl]
- st1b {z3.h}, p0, [x2, #3, mul vl]
- add x2, x2, x5
- cbnz w12, .Loop_eq_16_sve2_addavg_32x\h
- ret
-.vl_gt_16_addAvg_32x\h\():
- cmp x9, #48
- bgt .vl_gt_48_addAvg_32x\h
- ptrue p0.b, vl32
-.Loop_gt_eq_32_sve2_addavg_32x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x0, #1, mul vl]
- ld1b {z2.b}, p0/z, [x1]
- ld1b {z3.b}, p0/z, [x1, #1, mul vl]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z2.h
- add z1.h, p0/m, z1.h, z3.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- sqrshrnb z1.b, z1.h, #7
- add z1.b, z1.b, #0x80
- st1b {z0.h}, p0, [x2]
- st1b {z1.h}, p0, [x2, #1, mul vl]
- add x2, x2, x5
- cbnz w12, .Loop_gt_eq_32_sve2_addavg_32x\h
- ret
-.vl_gt_48_addAvg_32x\h\():
- ptrue p0.b, vl64
-.Loop_eq_64_sve2_addavg_32x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z1.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- st1b {z0.h}, p0, [x2]
- add x2, x2, x5
- cbnz w12, .Loop_eq_64_sve2_addavg_32x\h
- ret
-endfunc
-.endm
-
-addAvg_32xN_sve2 8
-addAvg_32xN_sve2 16
-addAvg_32xN_sve2 24
-addAvg_32xN_sve2 32
-addAvg_32xN_sve2 48
-addAvg_32xN_sve2 64
-
-function PFX(addAvg_48x64_sve2)
- mov w12, #64
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_addAvg_48x64
- addAvg_start
- sub x3, x3, #64
- sub x4, x4, #64
-.Loop_eq_16_sve2_addavg_48x64:
- sub w12, w12, #1
- ld1 {v0.8h-v3.8h}, [x0], #64
- ld1 {v4.8h-v7.8h}, [x1], #64
- ld1 {v20.8h-v21.8h}, [x0], x3
- ld1 {v22.8h-v23.8h}, [x1], x4
- addavg_1 v0, v4
- addavg_1 v1, v5
- addavg_1 v2, v6
- addavg_1 v3, v7
- addavg_1 v20, v22
- addavg_1 v21, v23
- sqxtun v0.8b, v0.8h
- sqxtun2 v0.16b, v1.8h
- sqxtun v1.8b, v2.8h
- sqxtun2 v1.16b, v3.8h
- sqxtun v2.8b, v20.8h
- sqxtun2 v2.16b, v21.8h
- st1 {v0.16b-v2.16b}, [x2], x5
- cbnz w12, .Loop_eq_16_sve2_addavg_48x64
- ret
-.vl_gt_16_addAvg_48x64:
- cmp x9, #48
- bgt .vl_gt_48_addAvg_48x64
- ptrue p0.b, vl32
-.Loop_gt_eq_32_sve2_addavg_48x64:
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x0, #1, mul vl]
- ld1b {z2.b}, p0/z, [x0, #2, mul vl]
- ld1b {z4.b}, p0/z, [x1]
- ld1b {z5.b}, p0/z, [x1, #1, mul vl]
- ld1b {z6.b}, p0/z, [x1, #2, mul vl]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z4.h
- add z1.h, p0/m, z1.h, z5.h
- add z2.h, p0/m, z2.h, z6.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- sqrshrnb z1.b, z1.h, #7
- add z1.b, z1.b, #0x80
- sqrshrnb z2.b, z2.h, #7
- add z2.b, z2.b, #0x80
- st1b {z0.h}, p0, [x2]
- st1b {z1.h}, p0, [x2, #1, mul vl]
- st1b {z2.h}, p0, [x2, #2, mul vl]
- add x2, x2, x5
- cbnz w12, .Loop_gt_eq_32_sve2_addavg_48x64
- ret
-.vl_gt_48_addAvg_48x64:
- cmp x9, #112
- bgt .vl_gt_112_addAvg_48x64
- ptrue p0.b, vl64
- ptrue p1.b, vl32
-.Loop_gt_48_sve2_addavg_48x64:
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p1/z, [x0, #1, mul vl]
- ld1b {z4.b}, p0/z, [x1]
- ld1b {z5.b}, p1/z, [x1, #1, mul vl]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z4.h
- add z1.h, p1/m, z1.h, z5.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- sqrshrnb z1.b, z1.h, #7
- add z1.b, z1.b, #0x80
- st1b {z0.h}, p0, [x2]
- st1b {z1.h}, p1, [x2, #1, mul vl]
- add x2, x2, x5
- cbnz w12, .Loop_gt_48_sve2_addavg_48x64
- ret
-.vl_gt_112_addAvg_48x64:
- mov x10, #96
- mov x11, #0
- whilelt p0.b, x11, x10
-.Loop_gt_112_sve2_addavg_48x64:
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z4.b}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z4.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- st1b {z0.h}, p0, [x2]
- add x2, x2, x5
- cbnz w12, .Loop_gt_112_sve2_addavg_48x64
- ret
-endfunc
-
-.macro addAvg_64xN_sve2 h
-function PFX(addAvg_64x\h\()_sve2)
- mov w12, #\h
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_addAvg_64x\h
- addAvg_start
- sub x3, x3, #64
- sub x4, x4, #64
-.Loop_eq_16_sve2_addavg_64x\h\():
- sub w12, w12, #1
- ld1 {v0.8h-v3.8h}, [x0], #64
- ld1 {v4.8h-v7.8h}, [x1], #64
- ld1 {v20.8h-v23.8h}, [x0], x3
- ld1 {v24.8h-v27.8h}, [x1], x4
- addavg_1 v0, v4
- addavg_1 v1, v5
- addavg_1 v2, v6
- addavg_1 v3, v7
- addavg_1 v20, v24
- addavg_1 v21, v25
- addavg_1 v22, v26
- addavg_1 v23, v27
- sqxtun v0.8b, v0.8h
- sqxtun2 v0.16b, v1.8h
- sqxtun v1.8b, v2.8h
- sqxtun2 v1.16b, v3.8h
- sqxtun v2.8b, v20.8h
- sqxtun2 v2.16b, v21.8h
- sqxtun v3.8b, v22.8h
- sqxtun2 v3.16b, v23.8h
- st1 {v0.16b-v3.16b}, [x2], x5
- cbnz w12, .Loop_eq_16_sve2_addavg_64x\h
- ret
-.vl_gt_16_addAvg_64x\h\():
- cmp x9, #48
- bgt .vl_gt_48_addAvg_64x\h
- ptrue p0.b, vl32
-.Loop_gt_eq_32_sve2_addavg_64x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x0, #1, mul vl]
- ld1b {z2.b}, p0/z, [x0, #2, mul vl]
- ld1b {z3.b}, p0/z, [x0, #3, mul vl]
- ld1b {z4.b}, p0/z, [x1]
- ld1b {z5.b}, p0/z, [x1, #1, mul vl]
- ld1b {z6.b}, p0/z, [x1, #2, mul vl]
- ld1b {z7.b}, p0/z, [x1, #3, mul vl]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z4.h
- add z1.h, p0/m, z1.h, z5.h
- add z2.h, p0/m, z2.h, z6.h
- add z3.h, p0/m, z3.h, z7.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- sqrshrnb z1.b, z1.h, #7
- add z1.b, z1.b, #0x80
- sqrshrnb z2.b, z2.h, #7
- add z2.b, z2.b, #0x80
- sqrshrnb z3.b, z3.h, #7
- add z3.b, z3.b, #0x80
- st1b {z0.h}, p0, [x2]
- st1b {z1.h}, p0, [x2, #1, mul vl]
- st1b {z2.h}, p0, [x2, #2, mul vl]
- st1b {z3.h}, p0, [x2, #3, mul vl]
- add x2, x2, x5
- cbnz w12, .Loop_gt_eq_32_sve2_addavg_64x\h
- ret
-.vl_gt_48_addAvg_64x\h\():
- cmp x9, #112
- bgt .vl_gt_112_addAvg_64x\h
- ptrue p0.b, vl64
-.Loop_gt_eq_48_sve2_addavg_64x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z1.b}, p0/z, [x0, #1, mul vl]
- ld1b {z4.b}, p0/z, [x1]
- ld1b {z5.b}, p0/z, [x1, #1, mul vl]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z4.h
- add z1.h, p0/m, z1.h, z5.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- sqrshrnb z1.b, z1.h, #7
- add z1.b, z1.b, #0x80
- st1b {z0.h}, p0, [x2]
- st1b {z1.h}, p0, [x2, #1, mul vl]
- add x2, x2, x5
- cbnz w12, .Loop_gt_eq_48_sve2_addavg_64x\h
- ret
-.vl_gt_112_addAvg_64x\h\():
- ptrue p0.b, vl128
-.Loop_gt_eq_128_sve2_addavg_64x\h\():
- sub w12, w12, #1
- ld1b {z0.b}, p0/z, [x0]
- ld1b {z4.b}, p0/z, [x1]
- add x0, x0, x3, lsl #1
- add x1, x1, x4, lsl #1
- add z0.h, p0/m, z0.h, z4.h
- sqrshrnb z0.b, z0.h, #7
- add z0.b, z0.b, #0x80
- st1b {z0.h}, p0, [x2]
- add x2, x2, x5
- cbnz w12, .Loop_gt_eq_128_sve2_addavg_64x\h
- ret
-endfunc
-.endm
-
-addAvg_64xN_sve2 16
-addAvg_64xN_sve2 32
-addAvg_64xN_sve2 48
-addAvg_64xN_sve2 64
diff --git a/source/common/aarch64/mc-a.S b/source/common/aarch64/mc-a.S
index 130bf1a4a..876228473 100644
--- a/source/common/aarch64/mc-a.S
+++ b/source/common/aarch64/mc-a.S
@@ -214,344 +214,3 @@ pixel_avg_pp_64xN_neon 16
pixel_avg_pp_64xN_neon 32
pixel_avg_pp_64xN_neon 48
pixel_avg_pp_64xN_neon 64
-
-// void addAvg(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
-.macro addAvg_2xN h
-function PFX(addAvg_2x\h\()_neon)
- addAvg_start
-.rept \h / 2
- ldr w10, [x0]
- ldr w11, [x1]
- add x0, x0, x3
- add x1, x1, x4
- ldr w12, [x0]
- ldr w13, [x1]
- add x0, x0, x3
- add x1, x1, x4
- dup v0.2s, w10
- dup v1.2s, w11
- dup v2.2s, w12
- dup v3.2s, w13
- add v0.4h, v0.4h, v1.4h
- add v2.4h, v2.4h, v3.4h
- saddl v0.4s, v0.4h, v30.4h
- saddl v2.4s, v2.4h, v30.4h
- shrn v0.4h, v0.4s, #7
- shrn2 v0.8h, v2.4s, #7
- sqxtun v0.8b, v0.8h
- st1 {v0.h}[0], [x2], x5
- st1 {v0.h}[2], [x2], x5
-.endr
- ret
-endfunc
-.endm
-
-addAvg_2xN 4
-addAvg_2xN 8
-addAvg_2xN 16
-
-.macro addAvg_4xN h
-function PFX(addAvg_4x\h\()_neon)
- addAvg_start
-.rept \h / 2
- ld1 {v0.8b}, [x0], x3
- ld1 {v1.8b}, [x1], x4
- ld1 {v2.8b}, [x0], x3
- ld1 {v3.8b}, [x1], x4
- add v0.4h, v0.4h, v1.4h
- add v2.4h, v2.4h, v3.4h
- saddl v0.4s, v0.4h, v30.4h
- saddl v2.4s, v2.4h, v30.4h
- shrn v0.4h, v0.4s, #7
- shrn2 v0.8h, v2.4s, #7
- sqxtun v0.8b, v0.8h
- st1 {v0.s}[0], [x2], x5
- st1 {v0.s}[1], [x2], x5
-.endr
- ret
-endfunc
-.endm
-
-addAvg_4xN 2
-addAvg_4xN 4
-addAvg_4xN 8
-addAvg_4xN 16
-addAvg_4xN 32
-
-.macro addAvg_6xN h
-function PFX(addAvg_6x\h\()_neon)
- addAvg_start
- mov w12, #\h / 2
- sub x5, x5, #4
-.Loop_addavg_6x\h:
- sub w12, w12, #1
- ld1 {v0.16b}, [x0], x3
- ld1 {v1.16b}, [x1], x4
- ld1 {v2.16b}, [x0], x3
- ld1 {v3.16b}, [x1], x4
- add v0.8h, v0.8h, v1.8h
- add v2.8h, v2.8h, v3.8h
- saddl v16.4s, v0.4h, v30.4h
- saddl2 v17.4s, v0.8h, v30.8h
- saddl v18.4s, v2.4h, v30.4h
- saddl2 v19.4s, v2.8h, v30.8h
- shrn v0.4h, v16.4s, #7
- shrn2 v0.8h, v17.4s, #7
- shrn v1.4h, v18.4s, #7
- shrn2 v1.8h, v19.4s, #7
- sqxtun v0.8b, v0.8h
- sqxtun v1.8b, v1.8h
- str s0, [x2], #4
- st1 {v0.h}[2], [x2], x5
- str s1, [x2], #4
- st1 {v1.h}[2], [x2], x5
- cbnz w12, .Loop_addavg_6x\h
- ret
-endfunc
-.endm
-
-addAvg_6xN 8
-addAvg_6xN 16
-
-.macro addAvg_8xN h
-function PFX(addAvg_8x\h\()_neon)
- addAvg_start
-.rept \h / 2
- ld1 {v0.16b}, [x0], x3
- ld1 {v1.16b}, [x1], x4
- ld1 {v2.16b}, [x0], x3
- ld1 {v3.16b}, [x1], x4
- add v0.8h, v0.8h, v1.8h
- add v2.8h, v2.8h, v3.8h
- saddl v16.4s, v0.4h, v30.4h
- saddl2 v17.4s, v0.8h, v30.8h
- saddl v18.4s, v2.4h, v30.4h
- saddl2 v19.4s, v2.8h, v30.8h
- shrn v0.4h, v16.4s, #7
- shrn2 v0.8h, v17.4s, #7
- shrn v1.4h, v18.4s, #7
- shrn2 v1.8h, v19.4s, #7
- sqxtun v0.8b, v0.8h
- sqxtun v1.8b, v1.8h
- st1 {v0.8b}, [x2], x5
- st1 {v1.8b}, [x2], x5
-.endr
- ret
-endfunc
-.endm
-
-.macro addAvg_8xN1 h
-function PFX(addAvg_8x\h\()_neon)
- addAvg_start
- mov w12, #\h / 2
-.Loop_addavg_8x\h:
- sub w12, w12, #1
- ld1 {v0.16b}, [x0], x3
- ld1 {v1.16b}, [x1], x4
- ld1 {v2.16b}, [x0], x3
- ld1 {v3.16b}, [x1], x4
- add v0.8h, v0.8h, v1.8h
- add v2.8h, v2.8h, v3.8h
- saddl v16.4s, v0.4h, v30.4h
- saddl2 v17.4s, v0.8h, v30.8h
- saddl v18.4s, v2.4h, v30.4h
- saddl2 v19.4s, v2.8h, v30.8h
- shrn v0.4h, v16.4s, #7
- shrn2 v0.8h, v17.4s, #7
- shrn v1.4h, v18.4s, #7
- shrn2 v1.8h, v19.4s, #7
- sqxtun v0.8b, v0.8h
- sqxtun v1.8b, v1.8h
- st1 {v0.8b}, [x2], x5
- st1 {v1.8b}, [x2], x5
- cbnz w12, .Loop_addavg_8x\h
- ret
-endfunc
-.endm
-
-addAvg_8xN 2
-addAvg_8xN 4
-addAvg_8xN 6
-addAvg_8xN 8
-addAvg_8xN 12
-addAvg_8xN 16
-addAvg_8xN1 32
-addAvg_8xN1 64
-
-.macro addAvg_12xN h
-function PFX(addAvg_12x\h\()_neon)
- addAvg_start
- sub x3, x3, #16
- sub x4, x4, #16
- sub x5, x5, #8
- mov w12, #\h
-.Loop_addAvg_12X\h\():
- sub w12, w12, #1
- ld1 {v0.16b}, [x0], #16
- ld1 {v1.16b}, [x1], #16
- ld1 {v2.8b}, [x0], x3
- ld1 {v3.8b}, [x1], x4
- add v0.8h, v0.8h, v1.8h
- add v2.4h, v2.4h, v3.4h
- saddl v16.4s, v0.4h, v30.4h
- saddl2 v17.4s, v0.8h, v30.8h
- saddl v18.4s, v2.4h, v30.4h
- shrn v0.4h, v16.4s, #7
- shrn2 v0.8h, v17.4s, #7
- shrn v1.4h, v18.4s, #7
- sqxtun v0.8b, v0.8h
- sqxtun v1.8b, v1.8h
- st1 {v0.8b}, [x2], #8
- st1 {v1.s}[0], [x2], x5
- cbnz w12, .Loop_addAvg_12X\h
- ret
-endfunc
-.endm
-
-addAvg_12xN 16
-addAvg_12xN 32
-
-.macro addAvg_16xN h
-function PFX(addAvg_16x\h\()_neon)
- addAvg_start
- mov w12, #\h
-.Loop_addavg_16x\h:
- sub w12, w12, #1
- ld1 {v0.8h-v1.8h}, [x0], x3
- ld1 {v2.8h-v3.8h}, [x1], x4
- addavg_1 v0, v2
- addavg_1 v1, v3
- sqxtun v0.8b, v0.8h
- sqxtun2 v0.16b, v1.8h
- st1 {v0.16b}, [x2], x5
- cbnz w12, .Loop_addavg_16x\h
- ret
-endfunc
-.endm
-
-addAvg_16xN 4
-addAvg_16xN 8
-addAvg_16xN 12
-addAvg_16xN 16
-addAvg_16xN 24
-addAvg_16xN 32
-addAvg_16xN 64
-
-.macro addAvg_24xN h
-function PFX(addAvg_24x\h\()_neon)
- addAvg_start
- mov w12, #\h
-.Loop_addavg_24x\h\():
- sub w12, w12, #1
- ld1 {v0.16b-v2.16b}, [x0], x3
- ld1 {v3.16b-v5.16b}, [x1], x4
- addavg_1 v0, v3
- addavg_1 v1, v4
- addavg_1 v2, v5
- sqxtun v0.8b, v0.8h
- sqxtun v1.8b, v1.8h
- sqxtun v2.8b, v2.8h
- st1 {v0.8b-v2.8b}, [x2], x5
- cbnz w12, .Loop_addavg_24x\h
- ret
-endfunc
-.endm
-
-addAvg_24xN 32
-addAvg_24xN 64
-
-.macro addAvg_32xN h
-function PFX(addAvg_32x\h\()_neon)
- addAvg_start
- mov w12, #\h
-.Loop_addavg_32x\h\():
- sub w12, w12, #1
- ld1 {v0.8h-v3.8h}, [x0], x3
- ld1 {v4.8h-v7.8h}, [x1], x4
- addavg_1 v0, v4
- addavg_1 v1, v5
- addavg_1 v2, v6
- addavg_1 v3, v7
- sqxtun v0.8b, v0.8h
- sqxtun v1.8b, v1.8h
- sqxtun v2.8b, v2.8h
- sqxtun v3.8b, v3.8h
- st1 {v0.8b-v3.8b}, [x2], x5
- cbnz w12, .Loop_addavg_32x\h
- ret
-endfunc
-.endm
-
-addAvg_32xN 8
-addAvg_32xN 16
-addAvg_32xN 24
-addAvg_32xN 32
-addAvg_32xN 48
-addAvg_32xN 64
-
-function PFX(addAvg_48x64_neon)
- addAvg_start
- sub x3, x3, #64
- sub x4, x4, #64
- mov w12, #64
-.Loop_addavg_48x64:
- sub w12, w12, #1
- ld1 {v0.8h-v3.8h}, [x0], #64
- ld1 {v4.8h-v7.8h}, [x1], #64
- ld1 {v20.8h-v21.8h}, [x0], x3
- ld1 {v22.8h-v23.8h}, [x1], x4
- addavg_1 v0, v4
- addavg_1 v1, v5
- addavg_1 v2, v6
- addavg_1 v3, v7
- addavg_1 v20, v22
- addavg_1 v21, v23
- sqxtun v0.8b, v0.8h
- sqxtun2 v0.16b, v1.8h
- sqxtun v1.8b, v2.8h
- sqxtun2 v1.16b, v3.8h
- sqxtun v2.8b, v20.8h
- sqxtun2 v2.16b, v21.8h
- st1 {v0.16b-v2.16b}, [x2], x5
- cbnz w12, .Loop_addavg_48x64
- ret
-endfunc
-
-.macro addAvg_64xN h
-function PFX(addAvg_64x\h\()_neon)
- addAvg_start
- mov w12, #\h
- sub x3, x3, #64
- sub x4, x4, #64
-.Loop_addavg_64x\h\():
- sub w12, w12, #1
- ld1 {v0.8h-v3.8h}, [x0], #64
- ld1 {v4.8h-v7.8h}, [x1], #64
- ld1 {v20.8h-v23.8h}, [x0], x3
- ld1 {v24.8h-v27.8h}, [x1], x4
- addavg_1 v0, v4
- addavg_1 v1, v5
- addavg_1 v2, v6
- addavg_1 v3, v7
- addavg_1 v20, v24
- addavg_1 v21, v25
- addavg_1 v22, v26
- addavg_1 v23, v27
- sqxtun v0.8b, v0.8h
- sqxtun2 v0.16b, v1.8h
- sqxtun v1.8b, v2.8h
- sqxtun2 v1.16b, v3.8h
- sqxtun v2.8b, v20.8h
- sqxtun2 v2.16b, v21.8h
- sqxtun v3.8b, v22.8h
- sqxtun2 v3.16b, v23.8h
- st1 {v0.16b-v3.16b}, [x2], x5
- cbnz w12, .Loop_addavg_64x\h
- ret
-endfunc
-.endm
-
-addAvg_64xN 16
-addAvg_64xN 32
-addAvg_64xN 48
-addAvg_64xN 64
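For reference, the widening add (saddl), narrowing shift (shrn) and saturating narrow (sqxtun) steps in the assembly deleted above fold into a single saturating rounding narrow followed by a byte-wide bias add on the intrinsics side. A minimal sketch for an 8-bit build, where shiftNum is 7 and the folded offset is 0x80; process8_sketch is an illustrative name, not something defined by this patch:

#include <arm_neon.h>
#include <stdint.h>

// Minimal sketch of the pattern that replaces the deleted assembly above:
// one row of 8 averaged coefficients, 8-bit build (shiftNum == 7, folded
// offset == 0x80). process8_sketch is an illustrative name only.
static inline void process8_sketch(const int16_t *src0, const int16_t *src1,
                                   uint8_t *dst)
{
    int16x8_t s0 = vld1q_s16(src0);
    int16x8_t s1 = vld1q_s16(src1);

    // Saturating rounding shift-right-narrow replaces the separate
    // saddl/saddl2 + shrn/shrn2 + sqxtun steps.
    int8x8_t sum = vqrshrn_n_s16(vaddq_s16(s0, s1), 7);

    // Re-apply the folded offset (2 * IF_INTERNAL_OFFS >> shiftNum == 0x80).
    uint8x8_t d0 = vadd_u8(vreinterpret_u8_s8(sum), vdup_n_u8(0x80));
    vst1_u8(dst, d0);
}

Adding the bias after the saturating narrow gives the same clamp to [0, 255] as adding the full offset before the shift, because the narrow saturates to [-128, 127] first.
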
diff --git a/source/common/aarch64/mem-neon.h b/source/common/aarch64/mem-neon.h
index 8bd5fbee9..27d72b70c 100644
--- a/source/common/aarch64/mem-neon.h
+++ b/source/common/aarch64/mem-neon.h
@@ -74,6 +74,26 @@ static void inline store_u8x4x1(uint8_t *d, const uint8x8_t s)
vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(s), 0);
}
+// Store 2 bytes from the low half of a uint8x8_t.
+static void inline store_u8x2x1(uint8_t *d, const uint8x8_t s)
+{
+ vst1_lane_u16((uint16_t *)d, vreinterpret_u16_u8(s), 0);
+}
+
+// Load 2 int16_t into an int16x8_t.
+static inline int16x8_t load_s16x2x1(const int16_t *p)
+{
+ int32x4_t ret = vld1q_lane_s32((const int32_t *)p, vdupq_n_s32(0), 0);
+
+ return vreinterpretq_s16_s32(ret);
+}
+
+// Store 2 uint16_t from the low half of a uint16x8_t.
+static inline void store_u16x2x1(const uint16_t *d, const uint16x8_t s)
+{
+ vst1q_lane_u32((uint32_t *)d, vreinterpretq_u32_u16(s), 0);
+}
+
// Store N blocks of 32-bits from (N / 2) D-Registers.
template<int N>
static void inline store_u8x4_strided_xN(uint8_t *d, intptr_t stride,
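The three helpers added here serve the 2-wide chroma partitions that the intrinsics path now covers. A self-contained round-trip sketch of the same lane tricks; load_s16x2, store_u8x2 and copy2_clamped are local names for illustration, not the header's identifiers:

#include <arm_neon.h>
#include <stdint.h>

// Round-trip sketch of the 2-lane helpers: two int16_t loaded into lane 0
// of a 128-bit register, two bytes stored back out. The names below are
// local to this sketch; the real helpers live in mem-neon.h.
static inline int16x8_t load_s16x2(const int16_t *p)
{
    int32x4_t r = vld1q_lane_s32((const int32_t *)p, vdupq_n_s32(0), 0);
    return vreinterpretq_s16_s32(r);
}

static inline void store_u8x2(uint8_t *d, uint8x8_t s)
{
    vst1_lane_u16((uint16_t *)d, vreinterpret_u16_u8(s), 0);
}

// Example: narrow the low two coefficients of a row to 8-bit pixels.
static inline void copy2_clamped(const int16_t *src, uint8_t *dst)
{
    int16x8_t v = load_s16x2(src);
    store_u8x2(dst, vqmovun_s16(v)); // saturating narrow to unsigned 8-bit
}

Keeping each 2-element access as a single lane load or store avoids falling back to scalar code for the 2xN partitions.
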
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index f4df6786e..ef7861284 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -1145,49 +1145,138 @@ void pixel_add_ps_neon(pixel *a, intptr_t dstride, const pixel *b0, const int16_
}
}
-template<int bx, int by>
-void addAvg_neon(const int16_t *src0, const int16_t *src1, pixel *dst, intptr_t src0Stride, intptr_t src1Stride,
- intptr_t dstStride)
+template<int width, int height>
+void addAvg_neon(const int16_t *src0, const int16_t *src1, pixel *dst,
+ intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
{
-
const int shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
- const int offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;
+ const int offset = 2 * IF_INTERNAL_OFFS;
- const int32x4_t addon = vdupq_n_s32(offset);
- for (int y = 0; y < by; y++)
+#if HIGH_BIT_DEPTH
+ const int16x8_t addon = vdupq_n_s16(offset >> shiftNum);
+
+ for (int h = 0; h < height; h++)
{
- int x = 0;
+ int w = 0;
+ for (; w + 16 <= width; w += 16)
+ {
+ int16x8_t s0[2], s1[2];
+ load_s16x8xn<2>(src0 + w, 8, s0);
+ load_s16x8xn<2>(src1 + w, 8, s1);
- for (; (x + 8) <= bx; x += 8)
+ int16x8_t d0_lo = vrsraq_n_s16(addon, vaddq_s16(s0[0], s1[0]), shiftNum);
+ int16x8_t d0_hi = vrsraq_n_s16(addon, vaddq_s16(s0[1], s1[1]), shiftNum);
+
+ d0_lo = vminq_s16(d0_lo, vdupq_n_s16((1 << X265_DEPTH) - 1));
+ d0_lo = vmaxq_s16(d0_lo, vdupq_n_s16(0));
+ d0_hi = vminq_s16(d0_hi, vdupq_n_s16((1 << X265_DEPTH) - 1));
+ d0_hi = vmaxq_s16(d0_hi, vdupq_n_s16(0));
+
+ vst1q_u16(dst + w, vreinterpretq_u16_s16(d0_lo));
+ vst1q_u16(dst + w + 8, vreinterpretq_u16_s16(d0_hi));
+ }
+ if (width & 8)
{
- int16x8_t in0 = vld1q_s16(src0 + x);
- int16x8_t in1 = vld1q_s16(src1 + x);
- int32x4_t t1 = vaddl_s16(vget_low_s16(in0), vget_low_s16(in1));
- int32x4_t t2 = vaddl_high_s16(in0, in1);
- t1 = vaddq_s32(t1, addon);
- t2 = vaddq_s32(t2, addon);
- t1 = vshrq_n_s32(t1, shiftNum);
- t2 = vshrq_n_s32(t2, shiftNum);
- int16x8_t t = vuzp1q_s16(vreinterpretq_s16_s32(t1),
- vreinterpretq_s16_s32(t2));
-#if HIGH_BIT_DEPTH
- t = vminq_s16(t, vdupq_n_s16((1 << X265_DEPTH) - 1));
- t = vmaxq_s16(t, vdupq_n_s16(0));
- vst1q_u16(dst + x, vreinterpretq_u16_s16(t));
-#else
- vst1_u8(dst + x, vqmovun_s16(t));
-#endif
+ int16x8_t s0 = vld1q_s16(src0 + w);
+ int16x8_t s1 = vld1q_s16(src1 + w);
+
+ int16x8_t d0 = vrsraq_n_s16(addon, vaddq_s16(s0, s1), shiftNum);
+ d0 = vminq_s16(d0, vdupq_n_s16((1 << X265_DEPTH) - 1));
+ d0 = vmaxq_s16(d0, vdupq_n_s16(0));
+
+ vst1q_u16(dst + w, vreinterpretq_u16_s16(d0));
+
+ w += 8;
+ }
+ if (width & 4)
+ {
+ int16x4_t s0 = vld1_s16(src0 + w);
+ int16x4_t s1 = vld1_s16(src1 + w);
+
+ int16x4_t d0 = vrsra_n_s16(vget_low_s16(addon), vadd_s16(s0, s1), shiftNum);
+ d0 = vmin_s16(d0, vdup_n_s16((1 << X265_DEPTH) - 1));
+ d0 = vmax_s16(d0, vdup_n_s16(0));
+
+ vst1_u16(dst + w, vreinterpret_u16_s16(d0));
+
+ w += 4;
+ }
+ if (width & 2)
+ {
+ int16x8_t s0 = load_s16x2x1(src0 + w);
+ int16x8_t s1 = load_s16x2x1(src1 + w);
+
+ int16x8_t d0 = vrsraq_n_s16(addon, vaddq_s16(s0, s1), shiftNum);
+ d0 = vminq_s16(d0, vdupq_n_s16((1 << X265_DEPTH) - 1));
+ d0 = vmaxq_s16(d0, vdupq_n_s16(0));
+
+ store_u16x2x1(dst + w, vreinterpretq_u16_s16(d0));
+ }
+
+ src0 += src0Stride;
+ src1 += src1Stride;
+ dst += dstStride;
+ }
+#else // !HIGH_BIT_DEPTH
+ const uint8x8_t addon = vdup_n_u8(offset >> shiftNum);
+
+ for (int h = 0; h < height; h++)
+ {
+ int w = 0;
+ for (; w + 16 <= width; w += 16)
+ {
+ int16x8_t s0[2], s1[2];
+ load_s16x8xn<2>(src0 + w, 8, s0);
+ load_s16x8xn<2>(src1 + w, 8, s1);
+
+ int8x8_t sum01_s8_lo = vqrshrn_n_s16(vaddq_s16(s0[0], s1[0]), shiftNum);
+ int8x8_t sum01_s8_hi = vqrshrn_n_s16(vaddq_s16(s0[1], s1[1]), shiftNum);
+ uint8x8_t d0_lo = vadd_u8(vreinterpret_u8_s8(sum01_s8_lo), addon);
+ uint8x8_t d0_hi = vadd_u8(vreinterpret_u8_s8(sum01_s8_hi), addon);
+
+ vst1_u8(dst + w, d0_lo);
+ vst1_u8(dst + w + 8, d0_hi);
+ }
+ if (width & 8)
+ {
+ int16x8_t s0 = vld1q_s16(src0 + w);
+ int16x8_t s1 = vld1q_s16(src1 + w);
+
+ int8x8_t sum01_s8 = vqrshrn_n_s16(vaddq_s16(s0, s1), shiftNum);
+ uint8x8_t d0 = vadd_u8(vreinterpret_u8_s8(sum01_s8), addon);
+
+ vst1_u8(dst + w, d0);
+
+ w += 8;
}
- for (; x < bx; x += 2)
+ if (width & 4)
{
- dst[x + 0] = x265_clip((src0[x + 0] + src1[x + 0] + offset) >> shiftNum);
- dst[x + 1] = x265_clip((src0[x + 1] + src1[x + 1] + offset) >> shiftNum);
+ int16x8_t s0 = vcombine_s16(vld1_s16(src0 + w), vdup_n_s16(0));
+ int16x8_t s1 = vcombine_s16(vld1_s16(src1 + w), vdup_n_s16(0));
+
+ int8x8_t sum01_s8 = vqrshrn_n_s16(vaddq_s16(s0, s1), shiftNum);
+ uint8x8_t d0 = vadd_u8(vreinterpret_u8_s8(sum01_s8), addon);
+
+ store_u8x4x1(dst + w, d0);
+
+ w += 4;
+ }
+ if (width & 2)
+ {
+ int16x8_t s0 = load_s16x2x1(src0 + w);
+ int16x8_t s1 = load_s16x2x1(src1 + w);
+
+ int8x8_t sum01_s8 = vqrshrn_n_s16(vaddq_s16(s0, s1), shiftNum);
+ uint8x8_t d0 = vadd_u8(vreinterpret_u8_s8(sum01_s8), addon);
+
+ store_u8x2x1(dst + w, d0);
}
src0 += src0Stride;
src1 += src1Stride;
dst += dstStride;
}
+#endif
}
void planecopy_cp_neon(const uint8_t *src, intptr_t srcStride, pixel *dst,
@@ -2057,29 +2146,30 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
- CHROMA_PU_420(4, 4);
- CHROMA_PU_420(8, 8);
- CHROMA_PU_420(16, 16);
- CHROMA_PU_420(32, 32);
+ CHROMA_PU_420(2, 4);
+ CHROMA_PU_420(2, 8);
CHROMA_PU_420(4, 2);
- CHROMA_PU_420(8, 4);
+ CHROMA_PU_420(4, 4);
CHROMA_PU_420(4, 8);
- CHROMA_PU_420(8, 6);
CHROMA_PU_420(6, 8);
+ CHROMA_PU_420(4, 16);
CHROMA_PU_420(8, 2);
- CHROMA_PU_420(2, 8);
- CHROMA_PU_420(16, 8);
- CHROMA_PU_420(8, 16);
- CHROMA_PU_420(16, 12);
+ CHROMA_PU_420(8, 4);
+ CHROMA_PU_420(8, 6);
+ CHROMA_PU_420(8, 8);
+ CHROMA_PU_420(8, 16);
+ CHROMA_PU_420(8, 32);
CHROMA_PU_420(12, 16);
CHROMA_PU_420(16, 4);
- CHROMA_PU_420(4, 16);
- CHROMA_PU_420(32, 16);
+ CHROMA_PU_420(16, 8);
+ CHROMA_PU_420(16, 12);
+ CHROMA_PU_420(16, 16);
CHROMA_PU_420(16, 32);
- CHROMA_PU_420(32, 24);
CHROMA_PU_420(24, 32);
CHROMA_PU_420(32, 8);
- CHROMA_PU_420(8, 32);
+ CHROMA_PU_420(32, 16);
+ CHROMA_PU_420(32, 24);
+ CHROMA_PU_420(32, 32);
@@ -2161,30 +2251,31 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
- CHROMA_PU_422(4, 8);
- CHROMA_PU_422(8, 16);
- CHROMA_PU_422(16, 32);
- CHROMA_PU_422(32, 64);
- CHROMA_PU_422(4, 4);
+ CHROMA_PU_422(2, 4);
CHROMA_PU_422(2, 8);
- CHROMA_PU_422(8, 8);
+ CHROMA_PU_422(2, 16);
+ CHROMA_PU_422(4, 4);
+ CHROMA_PU_422(4, 8);
CHROMA_PU_422(4, 16);
- CHROMA_PU_422(8, 12);
- CHROMA_PU_422(6, 16);
+ CHROMA_PU_422(4, 32);
CHROMA_PU_422(8, 4);
- CHROMA_PU_422(2, 16);
- CHROMA_PU_422(16, 16);
+ CHROMA_PU_422(8, 8);
+ CHROMA_PU_422(8, 12);
+ CHROMA_PU_422(8, 16);
CHROMA_PU_422(8, 32);
- CHROMA_PU_422(16, 24);
+ CHROMA_PU_422(8, 64);
+ CHROMA_PU_422(6, 16);
CHROMA_PU_422(12, 32);
CHROMA_PU_422(16, 8);
- CHROMA_PU_422(4, 32);
- CHROMA_PU_422(32, 32);
+ CHROMA_PU_422(16, 16);
+ CHROMA_PU_422(16, 24);
+ CHROMA_PU_422(16, 32);
CHROMA_PU_422(16, 64);
- CHROMA_PU_422(32, 48);
CHROMA_PU_422(24, 64);
CHROMA_PU_422(32, 16);
- CHROMA_PU_422(8, 64);
+ CHROMA_PU_422(32, 32);
+ CHROMA_PU_422(32, 48);
+ CHROMA_PU_422(32, 64);
p.chroma[X265_CSP_I422].pu[CHROMA_422_2x4].satd = NULL;
--
2.39.5 (Apple Git-154)
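
One detail worth spelling out: the rewrite drops the rounding term from offset and relies on the rounding forms vrsraq_n_s16 / vqrshrn_n_s16 instead, applying 2 * IF_INTERNAL_OFFS >> shiftNum as a separate bias. That is only exact because 2 * IF_INTERNAL_OFFS is a multiple of 1 << shiftNum. A scalar sanity check of the folding, with the constants written out locally rather than pulled from the x265 headers:

#include <assert.h>
#include <stdint.h>

// Scalar sanity check for the offset folding used by the new addAvg_neon,
// written for an 8-bit build (IF_INTERNAL_PREC == 14, X265_DEPTH == 8).
// All names below are local to this sketch, not taken from x265 headers.
enum { PREC = 14, DEPTH = 8 };
enum { OFFS = 1 << (PREC - 1), SHIFT = PREC + 1 - DEPTH };

static uint8_t clip8(int v) { return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; }

// Old formulation: rounding term folded into the offset, then a plain shift.
static uint8_t add_avg_ref(int16_t a, int16_t b)
{
    return clip8((a + b + 2 * OFFS + (1 << (SHIFT - 1))) >> SHIFT);
}

// New formulation: rounding shift first, then the bias 2 * OFFS >> SHIFT.
// (The Neon code also saturates the narrow to [-128, 127] before the bias,
// which maps onto the same final [0, 255] clamp.)
static uint8_t add_avg_new(int16_t a, int16_t b)
{
    int rounded = (a + b + (1 << (SHIFT - 1))) >> SHIFT;
    return clip8(rounded + ((2 * OFFS) >> SHIFT));
}

int main(void)
{
    for (int a = -8192; a <= 8191; a += 7)
        for (int b = -8192; b <= 8191; b += 13)
            assert(add_avg_ref((int16_t)a, (int16_t)b) ==
                   add_avg_new((int16_t)a, (int16_t)b));
    return 0;
}

The same folding argument covers the high-bit-depth path, where the bias is supplied as the accumulator operand of vrsraq_n_s16 rather than added after a narrow.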