[x265] [PATCH 1/4] AArch64: Optimize and clean up SBD pixel_var functions
Li Zhang
li.zhang2 at arm.com
Tue Jun 17 18:22:42 UTC 2025
Optimize the standard bit-depth pixel_var Neon intrinsics implementation
for every block size. Improve block sizes >= 16 by unrolling and doing
the widening and reduction of accumulators only at the end. Extend the
intrinsics implementation to support 4x4 blocks.
Delete the Neon and SVE2 assembly implementations as they are now slower
than the Neon intrinsics implementation.
---
source/common/aarch64/asm-primitives.cpp | 12 --
source/common/aarch64/fun-decls.h | 10 --
source/common/aarch64/pixel-prim.cpp | 80 ++++++---
source/common/aarch64/pixel-util-common.S | 27 ---
source/common/aarch64/pixel-util-sve2.S | 195 ----------------------
source/common/aarch64/pixel-util.S | 61 -------
6 files changed, 61 insertions(+), 324 deletions(-)
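[Editorial note, not part of the patch: the following is a minimal sketch of the accumulation pattern the new intrinsics use, as described in the commit message — keep narrow per-lane accumulators inside the loop and do the widening and horizontal reduction only once at the end. The function name and the fixed 16-pixel width are illustrative assumptions, not code from the patch.]

```cpp
#include <arm_neon.h>
#include <stdint.h>

// Sketch: sum and sum-of-squares for a 16-wide column of `height` rows,
// packed as (sqr << 32) | sum, matching the pixel_var return convention.
static uint64_t var_acc_16xH_sketch(const uint8_t *pix, intptr_t stride, int height)
{
    // 16-bit sum lanes and 32-bit square lanes are wide enough for the
    // block heights used here (<= 64 rows of 8-bit pixels).
    uint16x8_t sum = vdupq_n_u16(0);
    uint32x4_t sqr = vdupq_n_u32(0);

    for (int h = 0; h < height; h++)
    {
        uint8x16_t s = vld1q_u8(pix);

        // Pairwise-accumulate 16 u8 pixels into the 8 u16 sum lanes.
        sum = vpadalq_u8(sum, s);

        // Square in u16, then pairwise-accumulate into the u32 lanes.
        uint16x8_t sq_lo = vmull_u8(vget_low_u8(s), vget_low_u8(s));
        uint16x8_t sq_hi = vmull_u8(vget_high_u8(s), vget_high_u8(s));
        sqr = vpadalq_u16(sqr, sq_lo);
        sqr = vpadalq_u16(sqr, sq_hi);

        pix += stride;
    }

    // Single horizontal reduction at the end: sum in the low 32 bits,
    // sum of squares in the high 32 bits.
    return vaddlvq_u16(sum) + (vaddlvq_u32(sqr) << 32);
}
```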
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 5ce9352bd..d8b0beb8f 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -470,12 +470,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
ALL_CHROMA_420_PU(addAvg[ALIGNED], addAvg, neon);
ALL_CHROMA_422_PU(addAvg[ALIGNED], addAvg, neon);
- // pixel_var
- p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_neon);
- p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_neon);
- p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_neon);
- p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_neon);
-
// calc_Residual
p.cu[BLOCK_4x4].calcresidual[NONALIGNED] = PFX(getResidual4_neon);
p.cu[BLOCK_8x8].calcresidual[NONALIGNED] = PFX(getResidual8_neon);
@@ -643,12 +637,6 @@ void setupSve2Primitives(EncoderPrimitives &p)
CHROMA_422_PU_CAN_USE_SVE2(addAvg[NONALIGNED], addAvg);
CHROMA_422_PU_CAN_USE_SVE2(addAvg[ALIGNED], addAvg);
- // pixel_var
- p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_sve2);
- p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_sve2);
- p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_sve2);
- p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_sve2);
-
// calc_Residual
p.cu[BLOCK_16x16].calcresidual[NONALIGNED] = PFX(getResidual16_sve2);
p.cu[BLOCK_32x32].calcresidual[NONALIGNED] = PFX(getResidual32_sve2);
diff --git a/source/common/aarch64/fun-decls.h b/source/common/aarch64/fun-decls.h
index 12383b573..9db5abfe4 100644
--- a/source/common/aarch64/fun-decls.h
+++ b/source/common/aarch64/fun-decls.h
@@ -162,11 +162,6 @@ FUNCDEF_PU_MULT_16(void, sad_x3, neon_dotprod, const pixel*, const pixel*, const
FUNCDEF_PU_MULT_16(void, sad_x4, neon_dotprod, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*);
FUNCDEF_PU(sse_t, pixel_sse_pp, neon_dotprod, const pixel*, intptr_t, const pixel*, intptr_t);
-uint64_t PFX(pixel_var_8x8_neon(const pixel* pix, intptr_t stride));
-uint64_t PFX(pixel_var_16x16_neon(const pixel* pix, intptr_t stride));
-uint64_t PFX(pixel_var_32x32_neon(const pixel* pix, intptr_t stride));
-uint64_t PFX(pixel_var_64x64_neon(const pixel* pix, intptr_t stride));
-
void PFX(getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
void PFX(getResidual8_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
void PFX(getResidual16_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
@@ -230,11 +225,6 @@ void PFX(weight_sp_neon)(const int16_t* src, pixel* dst, intptr_t srcStride, int
int PFX(scanPosLast_neon)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
uint32_t PFX(costCoeffNxN_neon)(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
-uint64_t PFX(pixel_var_8x8_sve2(const pixel* pix, intptr_t stride));
-uint64_t PFX(pixel_var_16x16_sve2(const pixel* pix, intptr_t stride));
-uint64_t PFX(pixel_var_32x32_sve2(const pixel* pix, intptr_t stride));
-uint64_t PFX(pixel_var_64x64_sve2(const pixel* pix, intptr_t stride));
-
void PFX(getResidual16_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
void PFX(getResidual32_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index f4df6786e..d1ddec6a1 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -1479,34 +1479,75 @@ void cpy1Dto2D_shr_neon(int16_t* dst, const int16_t* src, intptr_t dstStride, in
template<int size>
uint64_t pixel_var_neon(const uint8_t *pix, intptr_t i_stride)
{
- uint32_t sum = 0, sqr = 0;
+ if (size >= 16)
+ {
+ uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+ uint32x4_t sqr[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ for (int h = 0; h < size; h += 2)
+ {
+ for (int w = 0; w + 16 <= size; w += 16)
+ {
+ uint8x16_t s[2];
+ load_u8x16xn<2>(pix + w, i_stride, s);
+
+ sum[0] = vpadalq_u8(sum[0], s[0]);
+ sum[1] = vpadalq_u8(sum[1], s[1]);
- uint32x4_t vsqr = vdupq_n_u32(0);
+ uint16x8_t sqr_lo = vmull_u8(vget_low_u8(s[0]), vget_low_u8(s[0]));
+ uint16x8_t sqr_hi = vmull_u8(vget_high_u8(s[0]), vget_high_u8(s[0]));
+ sqr[0] = vpadalq_u16(sqr[0], sqr_lo);
+ sqr[0] = vpadalq_u16(sqr[0], sqr_hi);
- for (int y = 0; y < size; y++)
+ sqr_lo = vmull_u8(vget_low_u8(s[1]), vget_low_u8(s[1]));
+ sqr_hi = vmull_u8(vget_high_u8(s[1]), vget_high_u8(s[1]));
+ sqr[1] = vpadalq_u16(sqr[1], sqr_lo);
+ sqr[1] = vpadalq_u16(sqr[1], sqr_hi);
+ }
+
+ pix += 2 * i_stride;
+ }
+
+ uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+ sqr[0] = vaddq_u32(sqr[0], sqr[1]);
+
+ return vaddvq_u32(sum_u32) + (vaddlvq_u32(sqr[0]) << 32);
+ }
+ if (size == 8)
{
- int x = 0;
- uint16x8_t vsum = vdupq_n_u16(0);
- for (; (x + 8) <= size; x += 8)
+ uint16x8_t sum = vdupq_n_u16(0);
+ uint32x4_t sqr = vdupq_n_u32(0);
+
+ for (int h = 0; h < size; ++h)
{
- uint16x8_t in;
- in = vmovl_u8(vld1_u8(pix + x));
- vsum = vaddq_u16(vsum, in);
- vsqr = vmlal_u16(vsqr, vget_low_u16(in), vget_low_u16(in));
- vsqr = vmlal_high_u16(vsqr, in, in);
+ uint8x8_t s = vld1_u8(pix);
+
+ sum = vaddw_u8(sum, s);
+ sqr = vpadalq_u16(sqr, vmull_u8(s, s));
+
+ pix += i_stride;
}
- for (; x < size; x++)
+
+ return vaddvq_u16(sum) + (vaddlvq_u32(sqr) << 32);
+ }
+ if (size == 4)
+ {
+ uint16x8_t sum = vdupq_n_u16(0);
+ uint32x4_t sqr = vdupq_n_u32(0);
+
+ for (int h = 0; h < size; h += 2)
{
- sum += pix[x];
- sqr += pix[x] * pix[x];
- }
+ uint8x8_t s = load_u8x4x2(pix, i_stride);
- sum += vaddvq_u16(vsum);
+ sum = vaddw_u8(sum, s);
+ sqr = vpadalq_u16(sqr, vmull_u8(s, s));
+
+ pix += 2 * i_stride;
+ }
- pix += i_stride;
+ return vaddvq_u16(sum) + (vaddlvq_u32(sqr) << 32);
}
- sqr += vaddvq_u32(vsqr);
- return sum + ((uint64_t)sqr << 32);
}
template<int blockSize>
@@ -2028,6 +2069,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
LUMA_CU(64, 64);
#if !(HIGH_BIT_DEPTH)
+ p.cu[BLOCK_4x4].var = pixel_var_neon<4>;
p.cu[BLOCK_8x8].var = pixel_var_neon<8>;
p.cu[BLOCK_16x16].var = pixel_var_neon<16>;
p.cu[BLOCK_32x32].var = pixel_var_neon<32>;
diff --git a/source/common/aarch64/pixel-util-common.S b/source/common/aarch64/pixel-util-common.S
index db2a63ff5..284ea0aae 100644
--- a/source/common/aarch64/pixel-util-common.S
+++ b/source/common/aarch64/pixel-util-common.S
@@ -34,33 +34,6 @@
.align 4
-.macro pixel_var_start
- movi v0.16b, #0
- movi v1.16b, #0
- movi v2.16b, #0
- movi v3.16b, #0
-.endm
-
-.macro pixel_var_1 v
- uaddw v0.8h, v0.8h, \v\().8b
- umull v30.8h, \v\().8b, \v\().8b
- uaddw2 v1.8h, v1.8h, \v\().16b
- umull2 v31.8h, \v\().16b, \v\().16b
- uadalp v2.4s, v30.8h
- uadalp v3.4s, v31.8h
-.endm
-
-.macro pixel_var_end
- uaddlv s0, v0.8h
- uaddlv s1, v1.8h
- add v2.4s, v2.4s, v3.4s
- fadd s0, s0, s1
- uaddlv d2, v2.4s
- fmov w0, s0
- fmov x2, d2
- orr x0, x0, x2, lsl #32
-.endm
-
.macro ssimDist_start
movi v0.16b, #0
movi v1.16b, #0
diff --git a/source/common/aarch64/pixel-util-sve2.S b/source/common/aarch64/pixel-util-sve2.S
index 56a2253ea..eee60b425 100644
--- a/source/common/aarch64/pixel-util-sve2.S
+++ b/source/common/aarch64/pixel-util-sve2.S
@@ -36,201 +36,6 @@
.text
-// uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
-function PFX(pixel_var_8x8_sve2)
- ptrue p0.h, vl8
- ld1b {z0.h}, p0/z, [x0]
- add x0, x0, x1
- mul z31.h, z0.h, z0.h
- uaddlp v1.4s, v31.8h
-.rept 7
- ld1b {z4.h}, p0/z, [x0]
- add x0, x0, x1
- add z0.h, z0.h, z4.h
- mul z31.h, z4.h, z4.h
- uadalp z1.s, p0/m, z31.h
-.endr
- uaddlv s0, v0.8h
- uaddlv d1, v1.4s
- fmov w0, s0
- fmov x1, d1
- orr x0, x0, x1, lsl #32
- ret
-endfunc
-
-function PFX(pixel_var_16x16_sve2)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_pixel_var_16x16
- pixel_var_start
- mov w12, #16
-.Loop_var_16_sve2:
- sub w12, w12, #1
- ld1 {v4.16b}, [x0], x1
- pixel_var_1 v4
- cbnz w12, .Loop_var_16_sve2
- pixel_var_end
- ret
-.vl_gt_16_pixel_var_16x16:
- ptrue p0.h, vl16
- mov z0.d, #0
- mov z1.d, #0
-.rept 16
- ld1b {z4.h}, p0/z, [x0]
- add x0, x0, x1
- add z0.h, z0.h, z4.h
- mul z30.h, z4.h, z4.h
- uadalp z1.s, p0/m, z30.h
-.endr
- uaddv d0, p0, z0.h
- uaddv d1, p0, z1.s
- fmov w0, s0
- fmov x1, d1
- orr x0, x0, x1, lsl #32
- ret
-endfunc
-
-function PFX(pixel_var_32x32_sve2)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_pixel_var_32x32
- pixel_var_start
- mov w12, #32
-.Loop_var_32_sve2:
- sub w12, w12, #1
- ld1 {v4.16b-v5.16b}, [x0], x1
- pixel_var_1 v4
- pixel_var_1 v5
- cbnz w12, .Loop_var_32_sve2
- pixel_var_end
- ret
-.vl_gt_16_pixel_var_32x32:
- cmp x9, #48
- bgt .vl_gt_48_pixel_var_32x32
- ptrue p0.b, vl32
- mov z0.d, #0
- mov z1.d, #0
-.rept 32
- ld1b {z4.b}, p0/z, [x0]
- add x0, x0, x1
- uaddwb z0.h, z0.h, z4.b
- uaddwt z0.h, z0.h, z4.b
- umullb z28.h, z4.b, z4.b
- umullt z29.h, z4.b, z4.b
- uadalp z1.s, p0/m, z28.h
- uadalp z1.s, p0/m, z29.h
-.endr
- uaddv d0, p0, z0.h
- uaddv d1, p0, z1.s
- fmov w0, s0
- fmov x1, d1
- orr x0, x0, x1, lsl #32
- ret
-.vl_gt_48_pixel_var_32x32:
- ptrue p0.h, vl32
- mov z0.d, #0
- mov z1.d, #0
-.rept 32
- ld1b {z4.h}, p0/z, [x0]
- add x0, x0, x1
- add z0.h, z0.h, z4.h
- mul z28.h, z4.h, z4.h
- uadalp z1.s, p0/m, z28.h
-.endr
- uaddv d0, p0, z0.h
- uaddv d1, p0, z1.s
- fmov w0, s0
- fmov x1, d1
- orr x0, x0, x1, lsl #32
- ret
-endfunc
-
-function PFX(pixel_var_64x64_sve2)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_pixel_var_64x64
- pixel_var_start
- mov w12, #64
-.Loop_var_64_sve2:
- sub w12, w12, #1
- ld1 {v4.16b-v7.16b}, [x0], x1
- pixel_var_1 v4
- pixel_var_1 v5
- pixel_var_1 v6
- pixel_var_1 v7
- cbnz w12, .Loop_var_64_sve2
- pixel_var_end
- ret
-.vl_gt_16_pixel_var_64x64:
- cmp x9, #48
- bgt .vl_gt_48_pixel_var_64x64
- ptrue p0.b, vl32
- mov z0.d, #0
- mov z2.d, #0
-.rept 64
- ld1b {z4.b}, p0/z, [x0]
- ld1b {z5.b}, p0/z, [x0, #1, mul vl]
- add x0, x0, x1
- uaddwb z0.h, z0.h, z4.b
- uaddwt z0.h, z0.h, z4.b
- uaddwb z0.h, z0.h, z5.b
- uaddwt z0.h, z0.h, z5.b
- umullb z24.h, z4.b, z4.b
- umullt z25.h, z4.b, z4.b
- umullb z26.h, z5.b, z5.b
- umullt z27.h, z5.b, z5.b
- uadalp z2.s, p0/m, z24.h
- uadalp z2.s, p0/m, z25.h
- uadalp z2.s, p0/m, z26.h
- uadalp z2.s, p0/m, z27.h
-.endr
- uaddv d0, p0, z0.h
- uaddv d1, p0, z2.s
- fmov w0, s0
- fmov x1, d1
- orr x0, x0, x1, lsl #32
- ret
-.vl_gt_48_pixel_var_64x64:
- cmp x9, #112
- bgt .vl_gt_112_pixel_var_64x64
- ptrue p0.b, vl64
- mov z0.d, #0
- mov z2.d, #0
-.rept 64
- ld1b {z4.b}, p0/z, [x0]
- add x0, x0, x1
- uaddwb z0.h, z0.h, z4.b
- uaddwt z0.h, z0.h, z4.b
- umullb z24.h, z4.b, z4.b
- umullt z25.h, z4.b, z4.b
- uadalp z2.s, p0/m, z24.h
- uadalp z2.s, p0/m, z25.h
-.endr
- uaddv d0, p0, z0.h
- uaddv d1, p0, z2.s
- fmov w0, s0
- fmov x1, d1
- orr x0, x0, x1, lsl #32
- ret
-.vl_gt_112_pixel_var_64x64:
- ptrue p0.h, vl64
- mov z0.d, #0
- mov z1.d, #0
-.rept 64
- ld1b {z4.h}, p0/z, [x0]
- add x0, x0, x1
- add z0.h, z0.h, z4.h
- mul z24.h, z4.h, z4.h
- uadalp z1.s, p0/m, z24.h
-.endr
- uaddv d0, p0, z0.h
- uaddv d1, p0, z1.s
- fmov w0, s0
- fmov x1, d1
- orr x0, x0, x1, lsl #32
- ret
-endfunc
-
function PFX(getResidual16_sve2)
rdvl x9, #1
cmp x9, #16
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index 480278e5e..48bc32778 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -36,67 +36,6 @@
.text
-// uint64_t pixel_var(const pixel* pix, intptr_t i_stride)
-function PFX(pixel_var_8x8_neon)
- ld1 {v4.8b}, [x0], x1 // pix[x]
- uxtl v0.8h, v4.8b // sum = pix[x]
- umull v1.8h, v4.8b, v4.8b
- uaddlp v1.4s, v1.8h // sqr = pix[x] * pix[x]
-
-.rept 7
- ld1 {v4.8b}, [x0], x1 // pix[x]
- umull v31.8h, v4.8b, v4.8b
- uaddw v0.8h, v0.8h, v4.8b // sum += pix[x]
- uadalp v1.4s, v31.8h // sqr += pix[x] * pix[x]
-.endr
- uaddlv s0, v0.8h
- uaddlv d1, v1.4s
- fmov w0, s0
- fmov x1, d1
- orr x0, x0, x1, lsl #32 // return sum + ((uint64_t)sqr << 32);
- ret
-endfunc
-
-function PFX(pixel_var_16x16_neon)
- pixel_var_start
- mov w12, #16
-.Loop_var_16:
- sub w12, w12, #1
- ld1 {v4.16b}, [x0], x1
- pixel_var_1 v4
- cbnz w12, .Loop_var_16
- pixel_var_end
- ret
-endfunc
-
-function PFX(pixel_var_32x32_neon)
- pixel_var_start
- mov w12, #32
-.Loop_var_32:
- sub w12, w12, #1
- ld1 {v4.16b-v5.16b}, [x0], x1
- pixel_var_1 v4
- pixel_var_1 v5
- cbnz w12, .Loop_var_32
- pixel_var_end
- ret
-endfunc
-
-function PFX(pixel_var_64x64_neon)
- pixel_var_start
- mov w12, #64
-.Loop_var_64:
- sub w12, w12, #1
- ld1 {v4.16b-v7.16b}, [x0], x1
- pixel_var_1 v4
- pixel_var_1 v5
- pixel_var_1 v6
- pixel_var_1 v7
- cbnz w12, .Loop_var_64
- pixel_var_end
- ret
-endfunc
-
// void getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride)
function PFX(getResidual4_neon)
lsl x4, x3, #1
--
2.39.5 (Apple Git-154)