[x265] [PATCH] AArch64: Optimize and clean up add_ps Neon and SVE2 functions
Li Zhang
li.zhang2 at arm.com
Thu Jun 12 10:21:00 UTC 2025
Optimize the standard bit-depth Neon intrinsics implementation to use
ADDW instead of UXTL and ADD. Also unroll the Neon intrinsics
implementations to enable the use of LDP and STP. Implement Neon
intrinsics for block sizes of width 4.
Delete the Neon and SVE2 assembly implementations as they are slower
than the Neon intrinsics implementation.
---
source/common/aarch64/asm-primitives.cpp | 64 -----
source/common/aarch64/pixel-prim.cpp | 104 +++++++--
source/common/aarch64/pixel-util-sve2.S | 286 -----------------------
source/common/aarch64/pixel-util.S | 183 ---------------
4 files changed, 79 insertions(+), 558 deletions(-)
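As a minimal standalone sketch (not x265 code) of the widening-add change described in
the commit message, the two helpers below contrast the old UXTL+ADD sequence with the
ADDW form used in the 8-bit pixel_add_ps_neon path. The function names and the 8-pixel
granularity are illustrative assumptions only; as in the patch, clipping is handled by
the saturating narrow (SQXTUN).

#include <arm_neon.h>
#include <stdint.h>

// Old shape: widen the 8-bit pixels first (UXTL), then add (ADD).
static inline uint8x8_t add_ps_8px_uxtl_add(const uint8_t *src0, const int16_t *src1)
{
    int16x8_t s0  = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(src0))); // UXTL
    int16x8_t sum = vaddq_s16(s0, vld1q_s16(src1));                 // ADD
    return vqmovun_s16(sum);                                        // SQXTUN
}

// New shape: fold the widen into the add with a widening add (ADDW),
// saving one instruction per 8 pixels.
static inline uint8x8_t add_ps_8px_addw(const uint8_t *src0, const int16_t *src1)
{
    uint16x8_t sum = vaddw_u8(vreinterpretq_u16_s16(vld1q_s16(src1)), // ADDW
                              vld1_u8(src0));
    return vqmovun_s16(vreinterpretq_s16_u16(sum));                   // SQXTUN
}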
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 5ce9352bd..f6203c857 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -504,38 +504,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sub_ps = PFX(pixel_sub_ps_16x32_neon);
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_neon);
- // pixel_add_ps
- p.cu[BLOCK_4x4].add_ps[NONALIGNED] = PFX(pixel_add_ps_4x4_neon);
- p.cu[BLOCK_8x8].add_ps[NONALIGNED] = PFX(pixel_add_ps_8x8_neon);
- p.cu[BLOCK_16x16].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x16_neon);
- p.cu[BLOCK_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_neon);
- p.cu[BLOCK_64x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_64x64_neon);
-
- p.cu[BLOCK_4x4].add_ps[ALIGNED] = PFX(pixel_add_ps_4x4_neon);
- p.cu[BLOCK_8x8].add_ps[ALIGNED] = PFX(pixel_add_ps_8x8_neon);
- p.cu[BLOCK_16x16].add_ps[ALIGNED] = PFX(pixel_add_ps_16x16_neon);
- p.cu[BLOCK_32x32].add_ps[ALIGNED] = PFX(pixel_add_ps_32x32_neon);
- p.cu[BLOCK_64x64].add_ps[ALIGNED] = PFX(pixel_add_ps_64x64_neon);
-
- // chroma add_ps
- p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps[NONALIGNED] = PFX(pixel_add_ps_4x4_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps[NONALIGNED] = PFX(pixel_add_ps_8x8_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x16_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps[NONALIGNED] = PFX(pixel_add_ps_4x8_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps[NONALIGNED] = PFX(pixel_add_ps_8x16_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x32_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x64_neon);
-
- p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps[ALIGNED] = PFX(pixel_add_ps_4x4_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps[ALIGNED] = PFX(pixel_add_ps_8x8_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps[ALIGNED] = PFX(pixel_add_ps_16x16_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[ALIGNED] = PFX(pixel_add_ps_32x32_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps[ALIGNED] = PFX(pixel_add_ps_4x8_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps[ALIGNED] = PFX(pixel_add_ps_8x16_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps[ALIGNED] = PFX(pixel_add_ps_16x32_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[ALIGNED] = PFX(pixel_add_ps_32x64_neon);
-
//scale2D_64to32
p.scale2D_64to32 = PFX(scale2D_64to32_neon);
@@ -664,38 +632,6 @@ void setupSve2Primitives(EncoderPrimitives &p)
p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_sve2);
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_sve2);
- // pixel_add_ps
- p.cu[BLOCK_4x4].add_ps[NONALIGNED] = PFX(pixel_add_ps_4x4_sve2);
- p.cu[BLOCK_8x8].add_ps[NONALIGNED] = PFX(pixel_add_ps_8x8_sve2);
- p.cu[BLOCK_16x16].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x16_sve2);
- p.cu[BLOCK_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_sve2);
- p.cu[BLOCK_64x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_64x64_sve2);
-
- p.cu[BLOCK_4x4].add_ps[ALIGNED] = PFX(pixel_add_ps_4x4_sve2);
- p.cu[BLOCK_8x8].add_ps[ALIGNED] = PFX(pixel_add_ps_8x8_sve2);
- p.cu[BLOCK_16x16].add_ps[ALIGNED] = PFX(pixel_add_ps_16x16_sve2);
- p.cu[BLOCK_32x32].add_ps[ALIGNED] = PFX(pixel_add_ps_32x32_sve2);
- p.cu[BLOCK_64x64].add_ps[ALIGNED] = PFX(pixel_add_ps_64x64_sve2);
-
- // chroma add_ps
- p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps[NONALIGNED] = PFX(pixel_add_ps_4x4_sve2);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps[NONALIGNED] = PFX(pixel_add_ps_8x8_sve2);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x16_sve2);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_sve2);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps[NONALIGNED] = PFX(pixel_add_ps_4x8_sve2);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps[NONALIGNED] = PFX(pixel_add_ps_8x16_sve2);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x32_sve2);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x64_sve2);
-
- p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps[ALIGNED] = PFX(pixel_add_ps_4x4_sve2);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps[ALIGNED] = PFX(pixel_add_ps_8x8_sve2);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps[ALIGNED] = PFX(pixel_add_ps_16x16_sve2);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[ALIGNED] = PFX(pixel_add_ps_32x32_sve2);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps[ALIGNED] = PFX(pixel_add_ps_4x8_sve2);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps[ALIGNED] = PFX(pixel_add_ps_8x16_sve2);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps[ALIGNED] = PFX(pixel_add_ps_16x32_sve2);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[ALIGNED] = PFX(pixel_add_ps_32x64_sve2);
-
// scale1D_128to64
p.scale1D_128to64[NONALIGNED] = PFX(scale1D_128to64_sve2);
p.scale1D_128to64[ALIGNED] = PFX(scale1D_128to64_sve2);
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index f4df6786e..851105dd8 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -1110,38 +1110,92 @@ void pixel_sub_ps_neon(int16_t *a, intptr_t dstride, const pixel *b0, const pixe
}
}
-template<int bx, int by>
-void pixel_add_ps_neon(pixel *a, intptr_t dstride, const pixel *b0, const int16_t *b1, intptr_t sstride0,
- intptr_t sstride1)
+template<int width, int height>
+void pixel_add_ps_neon(pixel *dst, intptr_t dstride, const pixel *src0,
+ const int16_t *src1, intptr_t sstride0, intptr_t sstride1)
{
- for (int y = 0; y < by; y++)
+ for (int h = 0; h < height; h++)
{
- int x = 0;
- for (; (x + 8) <= bx; x += 8)
- {
- int16x8_t t;
- int16x8_t b1e = vld1q_s16(b1 + x);
- int16x8_t b0e;
#if HIGH_BIT_DEPTH
- b0e = vreinterpretq_s16_u16(vld1q_u16(b0 + x));
- t = vaddq_s16(b0e, b1e);
- t = vminq_s16(t, vdupq_n_s16((1 << X265_DEPTH) - 1));
- t = vmaxq_s16(t, vdupq_n_s16(0));
- vst1q_u16(a + x, vreinterpretq_u16_s16(t));
-#else
- b0e = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(b0 + x)));
- t = vaddq_s16(b0e, b1e);
- vst1_u8(a + x, vqmovun_s16(t));
-#endif
+ for (int w = 0; w + 16 <= width; w += 16)
+ {
+ uint16x8_t s0_lo = vld1q_u16(src0 + w);
+ uint16x8_t s0_hi = vld1q_u16(src0 + w + 8);
+ int16x8_t s1_lo = vld1q_s16(src1 + w);
+ int16x8_t s1_hi = vld1q_s16(src1 + w + 8);
+
+ int16x8_t sum_lo = vaddq_s16(vreinterpretq_s16_u16(s0_lo), s1_lo);
+ int16x8_t sum_hi = vaddq_s16(vreinterpretq_s16_u16(s0_hi), s1_hi);
+ sum_lo = vminq_s16(sum_lo, vdupq_n_s16((1 << X265_DEPTH) - 1));
+ sum_lo = vmaxq_s16(sum_lo, vdupq_n_s16(0));
+ sum_hi = vminq_s16(sum_hi, vdupq_n_s16((1 << X265_DEPTH) - 1));
+ sum_hi = vmaxq_s16(sum_hi, vdupq_n_s16(0));
+
+ vst1q_u16(dst + w, vreinterpretq_u16_s16(sum_lo));
+ vst1q_u16(dst + w + 8, vreinterpretq_u16_s16(sum_hi));
+ }
+ if (width == 8)
+ {
+ uint16x8_t s0 = vld1q_u16(src0);
+ int16x8_t s1 = vld1q_s16(src1);
+
+ int16x8_t sum = vaddq_s16(vreinterpretq_s16_u16(s0), s1);
+ sum = vminq_s16(sum, vdupq_n_s16((1 << X265_DEPTH) - 1));
+ sum = vmaxq_s16(sum, vdupq_n_s16(0));
+
+ vst1q_u16(dst, vreinterpretq_u16_s16(sum));
}
- for (; x < bx; x++)
+ if (width == 4)
{
- a[x] = (int16_t)x265_clip(b0[x] + b1[x]);
+ int16x4_t s1 = vld1_s16(src1);
+ uint16x4_t s0 = vld1_u16(src0);
+
+ int16x4_t sum = vadd_s16(vreinterpret_s16_u16(s0), s1);
+ sum = vmin_s16(sum, vdup_n_s16((1 << X265_DEPTH) - 1));
+ sum = vmax_s16(sum, vdup_n_s16(0));
+
+ vst1_u16(dst, vreinterpret_u16_s16(sum));
}
+#else // !HIGH_BIT_DEPTH
+ for (int w = 0; w + 16 <= width; w += 16)
+ {
+ uint8x16_t s0 = vld1q_u8(src0 + w);
+ int16x8_t s1_lo = vld1q_s16(src1 + w);
+ int16x8_t s1_hi = vld1q_s16(src1 + w + 8);
- b0 += sstride0;
- b1 += sstride1;
- a += dstride;
+ uint16x8_t sum_lo = vaddw_u8(vreinterpretq_u16_s16(s1_lo), vget_low_u8(s0));
+ uint16x8_t sum_hi = vaddw_u8(vreinterpretq_u16_s16(s1_hi), vget_high_u8(s0));
+ uint8x8_t d0_lo = vqmovun_s16(vreinterpretq_s16_u16(sum_lo));
+ uint8x8_t d0_hi = vqmovun_s16(vreinterpretq_s16_u16(sum_hi));
+
+ vst1_u8(dst + w, d0_lo);
+ vst1_u8(dst + w + 8, d0_hi);
+ }
+ if (width == 8)
+ {
+ uint8x8_t s0 = vld1_u8(src0);
+ int16x8_t s1 = vld1q_s16(src1);
+
+ uint16x8_t sum = vaddw_u8(vreinterpretq_u16_s16(s1), s0);
+ uint8x8_t d0 = vqmovun_s16(vreinterpretq_s16_u16(sum));
+
+ vst1_u8(dst, d0);
+ }
+ if (width == 4)
+ {
+ uint8x8_t s0 = load_u8x4x1(src0);
+ int16x8_t s1 = vcombine_s16(vld1_s16(src1), vdup_n_s16(0));
+
+ uint16x8_t sum = vaddw_u8(vreinterpretq_u16_s16(s1), s0);
+ uint8x8_t d0 = vqmovun_s16(vreinterpretq_s16_u16(sum));
+
+ store_u8x4x1(dst, d0);
+ }
+#endif
+
+ src0 += sstride0;
+ src1 += sstride1;
+ dst += dstride;
}
}
diff --git a/source/common/aarch64/pixel-util-sve2.S b/source/common/aarch64/pixel-util-sve2.S
index 56a2253ea..257bcd7aa 100644
--- a/source/common/aarch64/pixel-util-sve2.S
+++ b/source/common/aarch64/pixel-util-sve2.S
@@ -531,292 +531,6 @@ function PFX(pixel_sub_ps_32x64_sve2)
ret
endfunc
-function PFX(pixel_add_ps_4x4_sve2)
- ptrue p0.h, vl8
- ptrue p1.h, vl4
-.rept 4
- ld1b {z0.h}, p0/z, [x2]
- ld1h {z2.h}, p1/z, [x3]
- add x2, x2, x4
- add x3, x3, x5, lsl #1
- add z4.h, z0.h, z2.h
- sqxtunb z4.b, z4.h
- st1b {z4.h}, p1, [x0]
- add x0, x0, x1
-.endr
- ret
-endfunc
-
-function PFX(pixel_add_ps_8x8_sve2)
- ptrue p0.h, vl8
-.rept 8
- ld1b {z0.h}, p0/z, [x2]
- ld1h {z2.h}, p0/z, [x3]
- add x2, x2, x4
- add x3, x3, x5, lsl #1
- add z4.h, z0.h, z2.h
- sqxtunb z4.b, z4.h
- st1b {z4.h}, p0, [x0]
- add x0, x0, x1
-.endr
- ret
-endfunc
-
-.macro pixel_add_ps_16xN_sve2 h
-function PFX(pixel_add_ps_16x\h\()_sve2)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_pixel_add_ps_16x\h
- ptrue p0.b, vl16
-.rept \h
- ld1b {z0.h}, p0/z, [x2]
- ld1b {z1.h}, p0/z, [x2, #1, mul vl]
- ld1h {z2.h}, p0/z, [x3]
- ld1h {z3.h}, p0/z, [x3, #1, mul vl]
- add x2, x2, x4
- add x3, x3, x5, lsl #1
- add z24.h, z0.h, z2.h
- add z25.h, z1.h, z3.h
- sqxtunb z6.b, z24.h
- sqxtunb z7.b, z25.h
- st1b {z6.h}, p0, [x0]
- st1b {z7.h}, p0, [x0, #1, mul vl]
- add x0, x0, x1
-.endr
- ret
-.vl_gt_16_pixel_add_ps_16x\h\():
- ptrue p0.b, vl32
-.rept \h
- ld1b {z0.h}, p0/z, [x2]
- ld1h {z2.h}, p0/z, [x3]
- add x2, x2, x4
- add x3, x3, x5, lsl #1
- add z24.h, z0.h, z2.h
- sqxtunb z6.b, z24.h
- st1b {z6.h}, p0, [x0]
- add x0, x0, x1
-.endr
- ret
-endfunc
-.endm
-
-pixel_add_ps_16xN_sve2 16
-pixel_add_ps_16xN_sve2 32
-
-.macro pixel_add_ps_32xN_sve2 h
- function PFX(pixel_add_ps_32x\h\()_sve2)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_pixel_add_ps_32x\h
- lsl x5, x5, #1
- mov w12, #\h / 4
-.Loop_add_ps__sve2_32x\h\():
- sub w12, w12, #1
-.rept 4
- ld1 {v0.16b-v1.16b}, [x2], x4
- ld1 {v16.8h-v19.8h}, [x3], x5
- uxtl v4.8h, v0.8b
- uxtl2 v5.8h, v0.16b
- uxtl v6.8h, v1.8b
- uxtl2 v7.8h, v1.16b
- add v24.8h, v4.8h, v16.8h
- add v25.8h, v5.8h, v17.8h
- add v26.8h, v6.8h, v18.8h
- add v27.8h, v7.8h, v19.8h
- sqxtun v4.8b, v24.8h
- sqxtun2 v4.16b, v25.8h
- sqxtun v5.8b, v26.8h
- sqxtun2 v5.16b, v27.8h
- st1 {v4.16b-v5.16b}, [x0], x1
-.endr
- cbnz w12, .Loop_add_ps__sve2_32x\h
- ret
-.vl_gt_16_pixel_add_ps_32x\h\():
- cmp x9, #48
- bgt .vl_gt_48_pixel_add_ps_32x\h
- ptrue p0.b, vl32
-.rept \h
- ld1b {z0.h}, p0/z, [x2]
- ld1b {z1.h}, p0/z, [x2, #1, mul vl]
- ld1h {z4.h}, p0/z, [x3]
- ld1h {z5.h}, p0/z, [x3, #1, mul vl]
- add x2, x2, x4
- add x3, x3, x5, lsl #1
- add z24.h, z0.h, z4.h
- add z25.h, z1.h, z5.h
- sqxtunb z6.b, z24.h
- sqxtunb z7.b, z25.h
- st1b {z6.h}, p0, [x0]
- st1b {z7.h}, p0, [x0, #1, mul vl]
- add x0, x0, x1
-.endr
- ret
-.vl_gt_48_pixel_add_ps_32x\h\():
- ptrue p0.b, vl64
-.rept \h
- ld1b {z0.h}, p0/z, [x2]
- ld1h {z4.h}, p0/z, [x3]
- add x2, x2, x4
- add x3, x3, x5, lsl #1
- add z24.h, z0.h, z4.h
- sqxtunb z6.b, z24.h
- st1b {z6.h}, p0, [x0]
- add x0, x0, x1
-.endr
- ret
-endfunc
-.endm
-
-pixel_add_ps_32xN_sve2 32
-pixel_add_ps_32xN_sve2 64
-
-function PFX(pixel_add_ps_64x64_sve2)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_pixel_add_ps_64x64
- ptrue p0.b, vl16
-.rept 64
- ld1b {z0.h}, p0/z, [x2]
- ld1b {z1.h}, p0/z, [x2, #1, mul vl]
- ld1b {z2.h}, p0/z, [x2, #2, mul vl]
- ld1b {z3.h}, p0/z, [x2, #3, mul vl]
- ld1b {z4.h}, p0/z, [x2, #4 ,mul vl]
- ld1b {z5.h}, p0/z, [x2, #5, mul vl]
- ld1b {z6.h}, p0/z, [x2, #6, mul vl]
- ld1b {z7.h}, p0/z, [x2, #7, mul vl]
- ld1h {z8.h}, p0/z, [x3]
- ld1h {z9.h}, p0/z, [x3, #1, mul vl]
- ld1h {z10.h}, p0/z, [x3, #2, mul vl]
- ld1h {z11.h}, p0/z, [x3, #3, mul vl]
- ld1h {z12.h}, p0/z, [x3, #4, mul vl]
- ld1h {z13.h}, p0/z, [x3, #5, mul vl]
- ld1h {z14.h}, p0/z, [x3, #6, mul vl]
- ld1h {z15.h}, p0/z, [x3, #7, mul vl]
- add x2, x2, x4
- add x3, x3, x5, lsl #1
- add z24.h, z0.h, z8.h
- add z25.h, z1.h, z9.h
- add z26.h, z2.h, z10.h
- add z27.h, z3.h, z11.h
- add z28.h, z4.h, z12.h
- add z29.h, z5.h, z13.h
- add z30.h, z6.h, z14.h
- add z31.h, z7.h, z15.h
- sqxtunb z6.b, z24.h
- sqxtunb z7.b, z25.h
- sqxtunb z8.b, z26.h
- sqxtunb z9.b, z27.h
- sqxtunb z10.b, z28.h
- sqxtunb z11.b, z29.h
- sqxtunb z12.b, z30.h
- sqxtunb z13.b, z31.h
- st1b {z6.h}, p0, [x0]
- st1b {z7.h}, p0, [x0, #1, mul vl]
- st1b {z8.h}, p0, [x0, #2, mul vl]
- st1b {z9.h}, p0, [x0, #3, mul vl]
- st1b {z10.h}, p0, [x0, #4, mul vl]
- st1b {z11.h}, p0, [x0, #5, mul vl]
- st1b {z12.h}, p0, [x0, #6, mul vl]
- st1b {z13.h}, p0, [x0, #7, mul vl]
- add x0, x0, x1
-.endr
- ret
-.vl_gt_16_pixel_add_ps_64x64:
- cmp x9, #48
- bgt .vl_gt_48_pixel_add_ps_64x64
- ptrue p0.b, vl32
-.rept 64
- ld1b {z0.h}, p0/z, [x2]
- ld1b {z1.h}, p0/z, [x2, #1, mul vl]
- ld1b {z2.h}, p0/z, [x2, #2, mul vl]
- ld1b {z3.h}, p0/z, [x2, #3, mul vl]
- ld1h {z8.h}, p0/z, [x3]
- ld1h {z9.h}, p0/z, [x3, #1, mul vl]
- ld1h {z10.h}, p0/z, [x3, #2, mul vl]
- ld1h {z11.h}, p0/z, [x3, #3, mul vl]
- add x2, x2, x4
- add x3, x3, x5, lsl #1
- add z24.h, z0.h, z8.h
- add z25.h, z1.h, z9.h
- add z26.h, z2.h, z10.h
- add z27.h, z3.h, z11.h
- sqxtunb z6.b, z24.h
- sqxtunb z7.b, z25.h
- sqxtunb z8.b, z26.h
- sqxtunb z9.b, z27.h
- st1b {z6.h}, p0, [x0]
- st1b {z7.h}, p0, [x0, #1, mul vl]
- st1b {z8.h}, p0, [x0, #2, mul vl]
- st1b {z9.h}, p0, [x0, #3, mul vl]
- add x0, x0, x1
-.endr
- ret
-.vl_gt_48_pixel_add_ps_64x64:
- cmp x9, #112
- bgt .vl_gt_112_pixel_add_ps_64x64
- ptrue p0.b, vl64
-.rept 64
- ld1b {z0.h}, p0/z, [x2]
- ld1b {z1.h}, p0/z, [x2, #1, mul vl]
- ld1h {z8.h}, p0/z, [x3]
- ld1h {z9.h}, p0/z, [x3, #1, mul vl]
- add x2, x2, x4
- add x3, x3, x5, lsl #1
- add z24.h, z0.h, z8.h
- add z25.h, z1.h, z9.h
- sqxtunb z6.b, z24.h
- sqxtunb z7.b, z25.h
- st1b {z6.h}, p0, [x0]
- st1b {z7.h}, p0, [x0, #1, mul vl]
- add x0, x0, x1
-.endr
- ret
-.vl_gt_112_pixel_add_ps_64x64:
- ptrue p0.b, vl128
-.rept 64
- ld1b {z0.h}, p0/z, [x2]
- ld1h {z8.h}, p0/z, [x3]
- add x2, x2, x4
- add x3, x3, x5, lsl #1
- add z24.h, z0.h, z8.h
- sqxtunb z6.b, z24.h
- st1b {z6.h}, p0, [x0]
- add x0, x0, x1
-.endr
- ret
-endfunc
-
-// Chroma add_ps
-function PFX(pixel_add_ps_4x8_sve2)
- ptrue p0.h,vl4
-.rept 8
- ld1b {z0.h}, p0/z, [x2]
- ld1h {z2.h}, p0/z, [x3]
- add x2, x2, x4
- add x3, x3, x5, lsl #1
- add z4.h, z0.h, z2.h
- sqxtunb z4.b, z4.h
- st1b {z4.h}, p0, [x0]
- add x0, x0, x1
-.endr
- ret
-endfunc
-
-function PFX(pixel_add_ps_8x16_sve2)
- ptrue p0.h,vl8
-.rept 16
- ld1b {z0.h}, p0/z, [x2]
- ld1h {z2.h}, p0/z, [x3]
- add x2, x2, x4
- add x3, x3, x5, lsl #1
- add z4.h, z0.h, z2.h
- sqxtunb z4.b, z4.h
- st1b {z4.h}, p0, [x0]
- add x0, x0, x1
-.endr
- ret
-endfunc
-
// void scale1D_128to64(pixel *dst, const pixel *src)
function PFX(scale1D_128to64_sve2)
rdvl x9, #1
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index 480278e5e..0751e0e7c 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -340,189 +340,6 @@ function PFX(pixel_sub_ps_32x64_neon)
ret
endfunc
-// void x265_pixel_add_ps_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
-function PFX(pixel_add_ps_4x4_neon)
- lsl x5, x5, #1
-.rept 2
- ld1 {v0.8b}, [x2], x4
- ld1 {v1.8b}, [x2], x4
- ld1 {v2.4h}, [x3], x5
- ld1 {v3.4h}, [x3], x5
- uxtl v0.8h, v0.8b
- uxtl v1.8h, v1.8b
- add v4.8h, v0.8h, v2.8h
- add v5.8h, v1.8h, v3.8h
- sqxtun v4.8b, v4.8h
- sqxtun v5.8b, v5.8h
- st1 {v4.s}[0], [x0], x1
- st1 {v5.s}[0], [x0], x1
-.endr
- ret
-endfunc
-
-function PFX(pixel_add_ps_8x8_neon)
- lsl x5, x5, #1
-.rept 4
- ld1 {v0.8b}, [x2], x4
- ld1 {v1.8b}, [x2], x4
- ld1 {v2.8h}, [x3], x5
- ld1 {v3.8h}, [x3], x5
- uxtl v0.8h, v0.8b
- uxtl v1.8h, v1.8b
- add v4.8h, v0.8h, v2.8h
- add v5.8h, v1.8h, v3.8h
- sqxtun v4.8b, v4.8h
- sqxtun v5.8b, v5.8h
- st1 {v4.8b}, [x0], x1
- st1 {v5.8b}, [x0], x1
-.endr
- ret
-endfunc
-
-.macro pixel_add_ps_16xN_neon h
-function PFX(pixel_add_ps_16x\h\()_neon)
- lsl x5, x5, #1
- mov w12, #\h / 8
-.Loop_add_ps_16x\h\():
- sub w12, w12, #1
-.rept 4
- ld1 {v0.16b}, [x2], x4
- ld1 {v1.16b}, [x2], x4
- ld1 {v16.8h-v17.8h}, [x3], x5
- ld1 {v18.8h-v19.8h}, [x3], x5
- uxtl v4.8h, v0.8b
- uxtl2 v5.8h, v0.16b
- uxtl v6.8h, v1.8b
- uxtl2 v7.8h, v1.16b
- add v24.8h, v4.8h, v16.8h
- add v25.8h, v5.8h, v17.8h
- add v26.8h, v6.8h, v18.8h
- add v27.8h, v7.8h, v19.8h
- sqxtun v4.8b, v24.8h
- sqxtun2 v4.16b, v25.8h
- sqxtun v5.8b, v26.8h
- sqxtun2 v5.16b, v27.8h
- st1 {v4.16b}, [x0], x1
- st1 {v5.16b}, [x0], x1
-.endr
- cbnz w12, .Loop_add_ps_16x\h
- ret
-endfunc
-.endm
-
-pixel_add_ps_16xN_neon 16
-pixel_add_ps_16xN_neon 32
-
-.macro pixel_add_ps_32xN_neon h
- function PFX(pixel_add_ps_32x\h\()_neon)
- lsl x5, x5, #1
- mov w12, #\h / 4
-.Loop_add_ps_32x\h\():
- sub w12, w12, #1
-.rept 4
- ld1 {v0.16b-v1.16b}, [x2], x4
- ld1 {v16.8h-v19.8h}, [x3], x5
- uxtl v4.8h, v0.8b
- uxtl2 v5.8h, v0.16b
- uxtl v6.8h, v1.8b
- uxtl2 v7.8h, v1.16b
- add v24.8h, v4.8h, v16.8h
- add v25.8h, v5.8h, v17.8h
- add v26.8h, v6.8h, v18.8h
- add v27.8h, v7.8h, v19.8h
- sqxtun v4.8b, v24.8h
- sqxtun2 v4.16b, v25.8h
- sqxtun v5.8b, v26.8h
- sqxtun2 v5.16b, v27.8h
- st1 {v4.16b-v5.16b}, [x0], x1
-.endr
- cbnz w12, .Loop_add_ps_32x\h
- ret
-endfunc
-.endm
-
-pixel_add_ps_32xN_neon 32
-pixel_add_ps_32xN_neon 64
-
-function PFX(pixel_add_ps_64x64_neon)
- lsl x5, x5, #1
- sub x5, x5, #64
- mov w12, #32
-.Loop_add_ps_64x64:
- sub w12, w12, #1
-.rept 2
- ld1 {v0.16b-v3.16b}, [x2], x4
- ld1 {v16.8h-v19.8h}, [x3], #64
- ld1 {v20.8h-v23.8h}, [x3], x5
- uxtl v4.8h, v0.8b
- uxtl2 v5.8h, v0.16b
- uxtl v6.8h, v1.8b
- uxtl2 v7.8h, v1.16b
- uxtl v24.8h, v2.8b
- uxtl2 v25.8h, v2.16b
- uxtl v26.8h, v3.8b
- uxtl2 v27.8h, v3.16b
- add v0.8h, v4.8h, v16.8h
- add v1.8h, v5.8h, v17.8h
- add v2.8h, v6.8h, v18.8h
- add v3.8h, v7.8h, v19.8h
- add v4.8h, v24.8h, v20.8h
- add v5.8h, v25.8h, v21.8h
- add v6.8h, v26.8h, v22.8h
- add v7.8h, v27.8h, v23.8h
- sqxtun v0.8b, v0.8h
- sqxtun2 v0.16b, v1.8h
- sqxtun v1.8b, v2.8h
- sqxtun2 v1.16b, v3.8h
- sqxtun v2.8b, v4.8h
- sqxtun2 v2.16b, v5.8h
- sqxtun v3.8b, v6.8h
- sqxtun2 v3.16b, v7.8h
- st1 {v0.16b-v3.16b}, [x0], x1
-.endr
- cbnz w12, .Loop_add_ps_64x64
- ret
-endfunc
-
-// Chroma add_ps
-function PFX(pixel_add_ps_4x8_neon)
- lsl x5, x5, #1
-.rept 4
- ld1 {v0.8b}, [x2], x4
- ld1 {v1.8b}, [x2], x4
- ld1 {v2.4h}, [x3], x5
- ld1 {v3.4h}, [x3], x5
- uxtl v0.8h, v0.8b
- uxtl v1.8h, v1.8b
- add v4.8h, v0.8h, v2.8h
- add v5.8h, v1.8h, v3.8h
- sqxtun v4.8b, v4.8h
- sqxtun v5.8b, v5.8h
- st1 {v4.s}[0], [x0], x1
- st1 {v5.s}[0], [x0], x1
-.endr
- ret
-endfunc
-
-function PFX(pixel_add_ps_8x16_neon)
- lsl x5, x5, #1
-.rept 8
- ld1 {v0.8b}, [x2], x4
- ld1 {v1.8b}, [x2], x4
- ld1 {v2.8h}, [x3], x5
- ld1 {v3.8h}, [x3], x5
- uxtl v0.8h, v0.8b
- uxtl v1.8h, v1.8b
- add v4.8h, v0.8h, v2.8h
- add v5.8h, v1.8h, v3.8h
- sqxtun v4.8b, v4.8h
- sqxtun v5.8b, v5.8h
- st1 {v4.8b}, [x0], x1
- st1 {v5.8b}, [x0], x1
-.endr
- ret
-endfunc
-
// void scale1D_128to64(pixel *dst, const pixel *src)
function PFX(scale1D_128to64_neon)
.rept 2
--
2.39.5 (Apple Git-154)