[x265] [PATCH 2/8] AArch64: Optimize blockcopy_ps Neon intrinsics implementation
Li Zhang
li.zhang2 at arm.com
Mon May 19 16:42:17 UTC 2025
Unroll the blockcopy_ps_neon intrinsics implementation to enable use of
LDP and STP instructions.
Delete the Neon and SVE assembly implementations of these kernels as
they are no faster than the unrolled intrinsics and only serve to
increase binary size.
Co-authored-by: Jonathan Wright <jonathan.wright at arm.com>
---
source/common/aarch64/asm-primitives.cpp | 30 ---
source/common/aarch64/blockcopy8-sve.S | 239 -----------------------
source/common/aarch64/blockcopy8.S | 153 ---------------
source/common/aarch64/pixel-prim.cpp | 85 ++++++--
4 files changed, 67 insertions(+), 440 deletions(-)
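
Not part of the patch: a minimal standalone sketch of the widening copy
used on the 16-wide path in pixel-prim.cpp below, assuming little-endian
AArch64 (the helper name is illustrative, not from the x265 tree).
Zipping the u8 source with a zero vector zero-extends both halves to u16
in one step, and the paired _x2 store encourages the compiler to emit STP:

#include <arm_neon.h>
#include <stdint.h>

// Copy 16 u8 pixels to 16 s16 values (one row chunk of the copy).
static inline void copy16_u8_to_s16(int16_t *dst, const uint8_t *src)
{
    uint8x16_t s = vld1q_u8(src);                 // 16 source pixels
    // Interleaving with zero gives {s0,0,s1,0,...}, which is the
    // u8 -> u16 zero-extension on little-endian targets.
    uint8x16x2_t t = vzipq_u8(s, vdupq_n_u8(0));
    int16x8x2_t d;
    d.val[0] = vreinterpretq_s16_u8(t.val[0]);    // pixels 0-7 as s16
    d.val[1] = vreinterpretq_s16_u8(t.val[1]);    // pixels 8-15 as s16
    vst1q_s16_x2(dst, d);                         // 32-byte store, STP-friendly
}

Unrolled across a row, the adjacent 32-byte loads and stores are what
lets the compiler fuse them into LDP/STP pairs.
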
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 2c6911d8b..463da8319 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -411,13 +411,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_neon);
p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_neon);
- // Blockcopy_ps
- p.cu[BLOCK_4x4].copy_ps = PFX(blockcopy_ps_4x4_neon);
- p.cu[BLOCK_8x8].copy_ps = PFX(blockcopy_ps_8x8_neon);
- p.cu[BLOCK_16x16].copy_ps = PFX(blockcopy_ps_16x16_neon);
- p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_neon);
- p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_neon);
-
// Blockcopy_sp
p.cu[BLOCK_4x4].copy_sp = PFX(blockcopy_sp_4x4_neon);
p.cu[BLOCK_8x8].copy_sp = PFX(blockcopy_sp_8x8_neon);
@@ -435,16 +428,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_ss = PFX(blockcopy_ss_16x32_neon);
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_neon);
- // chroma blockcopy_ps
- p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_ps = PFX(blockcopy_ps_4x4_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_ps = PFX(blockcopy_ps_8x8_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_ps = PFX(blockcopy_ps_16x16_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_ps = PFX(blockcopy_ps_4x8_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_ps = PFX(blockcopy_ps_8x16_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_ps = PFX(blockcopy_ps_16x32_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_neon);
-
// chroma blockcopy_sp
p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_sp = PFX(blockcopy_sp_4x4_neon);
p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_sp = PFX(blockcopy_sp_8x8_neon);
@@ -678,11 +661,6 @@ void setupSvePrimitives(EncoderPrimitives &p)
p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_sve);
p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_sve);
- // Blockcopy_ps
- p.cu[BLOCK_16x16].copy_ps = PFX(blockcopy_ps_16x16_sve);
- p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_sve);
- p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_sve);
-
// Blockcopy_sp
p.cu[BLOCK_4x4].copy_sp = PFX(blockcopy_sp_4x4_sve);
p.cu[BLOCK_8x8].copy_sp = PFX(blockcopy_sp_8x8_sve);
@@ -695,14 +673,6 @@ void setupSvePrimitives(EncoderPrimitives &p)
p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_ss = PFX(blockcopy_ss_16x32_sve);
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_sve);
- // chroma blockcopy_ps
- p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_ps = PFX(blockcopy_ps_16x16_sve);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_ps = PFX(blockcopy_ps_4x8_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_ps = PFX(blockcopy_ps_8x16_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_ps = PFX(blockcopy_ps_16x32_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_sve);
-
// chroma blockcopy_sp
p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_sp = PFX(blockcopy_sp_4x4_sve);
p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_sp = PFX(blockcopy_sp_8x8_sve);
diff --git a/source/common/aarch64/blockcopy8-sve.S b/source/common/aarch64/blockcopy8-sve.S
index e80722654..9f9406e6e 100644
--- a/source/common/aarch64/blockcopy8-sve.S
+++ b/source/common/aarch64/blockcopy8-sve.S
@@ -166,147 +166,6 @@ function PFX(blockcopy_sp_32x32_sve)
ret
endfunc
-function PFX(blockcopy_ps_16x16_sve)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_blockcopy_ps_16_16
- lsl x1, x1, #1
-.rept 8
- ld1 {v4.16b}, [x2], x3
- ld1 {v5.16b}, [x2], x3
- uxtl v0.8h, v4.8b
- uxtl2 v1.8h, v4.16b
- uxtl v2.8h, v5.8b
- uxtl2 v3.8h, v5.16b
- st1 {v0.8h-v1.8h}, [x0], x1
- st1 {v2.8h-v3.8h}, [x0], x1
-.endr
- ret
-.vl_gt_16_blockcopy_ps_16_16:
- ptrue p0.b, vl32
-.rept 16
- ld1b {z1.h}, p0/z, [x2]
- st1h {z1.h}, p0, [x0]
- add x0, x0, x1, lsl #1
- add x2, x2, x3
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_ps_32x32_sve)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_blockcopy_ps_32_32
- lsl x1, x1, #1
- mov w12, #4
-.Loop_cps32_sve:
- sub w12, w12, #1
-.rept 4
- ld1 {v16.16b-v17.16b}, [x2], x3
- ld1 {v18.16b-v19.16b}, [x2], x3
- uxtl v0.8h, v16.8b
- uxtl2 v1.8h, v16.16b
- uxtl v2.8h, v17.8b
- uxtl2 v3.8h, v17.16b
- uxtl v4.8h, v18.8b
- uxtl2 v5.8h, v18.16b
- uxtl v6.8h, v19.8b
- uxtl2 v7.8h, v19.16b
- st1 {v0.8h-v3.8h}, [x0], x1
- st1 {v4.8h-v7.8h}, [x0], x1
-.endr
- cbnz w12, .Loop_cps32_sve
- ret
-.vl_gt_16_blockcopy_ps_32_32:
- cmp x9, #48
- bgt .vl_gt_48_blockcopy_ps_32_32
- ptrue p0.b, vl32
-.rept 32
- ld1b {z2.h}, p0/z, [x2]
- ld1b {z3.h}, p0/z, [x2, #1, mul vl]
- st1h {z2.h}, p0, [x0]
- st1h {z3.h}, p0, [x0, #1, mul vl]
- add x0, x0, x1, lsl #1
- add x2, x2, x3
-.endr
- ret
-.vl_gt_48_blockcopy_ps_32_32:
- ptrue p0.b, vl64
-.rept 32
- ld1b {z2.h}, p0/z, [x2]
- st1h {z2.h}, p0, [x0]
- add x0, x0, x1, lsl #1
- add x2, x2, x3
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_ps_64x64_sve)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_blockcopy_ps_64_64
- lsl x1, x1, #1
- sub x1, x1, #64
- mov w12, #16
-.Loop_cps64_sve:
- sub w12, w12, #1
-.rept 4
- ld1 {v16.16b-v19.16b}, [x2], x3
- uxtl v0.8h, v16.8b
- uxtl2 v1.8h, v16.16b
- uxtl v2.8h, v17.8b
- uxtl2 v3.8h, v17.16b
- uxtl v4.8h, v18.8b
- uxtl2 v5.8h, v18.16b
- uxtl v6.8h, v19.8b
- uxtl2 v7.8h, v19.16b
- st1 {v0.8h-v3.8h}, [x0], #64
- st1 {v4.8h-v7.8h}, [x0], x1
-.endr
- cbnz w12, .Loop_cps64_sve
- ret
-.vl_gt_16_blockcopy_ps_64_64:
- cmp x9, #48
- bgt .vl_gt_48_blockcopy_ps_64_64
- ptrue p0.b, vl32
-.rept 64
- ld1b {z4.h}, p0/z, [x2]
- ld1b {z5.h}, p0/z, [x2, #1, mul vl]
- ld1b {z6.h}, p0/z, [x2, #2, mul vl]
- ld1b {z7.h}, p0/z, [x2, #3, mul vl]
- st1h {z4.h}, p0, [x0]
- st1h {z5.h}, p0, [x0, #1, mul vl]
- st1h {z6.h}, p0, [x0, #2, mul vl]
- st1h {z7.h}, p0, [x0, #3, mul vl]
- add x0, x0, x1, lsl #1
- add x2, x2, x3
-.endr
- ret
-.vl_gt_48_blockcopy_ps_64_64:
- cmp x9, #112
- bgt .vl_gt_112_blockcopy_ps_64_64
- ptrue p0.b, vl64
-.rept 64
- ld1b {z4.h}, p0/z, [x2]
- ld1b {z5.h}, p0/z, [x2, #1, mul vl]
- st1h {z4.h}, p0, [x0]
- st1h {z5.h}, p0, [x0, #1, mul vl]
- add x0, x0, x1, lsl #1
- add x2, x2, x3
-.endr
- ret
-.vl_gt_112_blockcopy_ps_64_64:
- ptrue p0.b, vl128
-.rept 64
- ld1b {z4.h}, p0/z, [x2]
- st1h {z4.h}, p0, [x0]
- add x0, x0, x1, lsl #1
- add x2, x2, x3
-.endr
- ret
-
-endfunc
-
function PFX(blockcopy_ss_16x16_sve)
rdvl x9, #1
cmp x9, #16
@@ -514,104 +373,6 @@ function PFX(blockcopy_ss_32x64_sve)
ret
endfunc
-// chroma blockcopy_ps
-function PFX(blockcopy_ps_4x8_sve)
- ptrue p0.h, vl4
-.rept 8
- ld1b {z0.h}, p0/z, [x2]
- st1h {z0.h}, p0, [x0]
- add x0, x0, x1, lsl #1
- add x2, x2, x3
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_ps_8x16_sve)
- ptrue p0.h, vl8
-.rept 16
- ld1b {z0.h}, p0/z, [x2]
- st1h {z0.h}, p0, [x0]
- add x0, x0, x1, lsl #1
- add x2, x2, x3
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_ps_16x32_sve)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_blockcopy_ps_16_32
- lsl x1, x1, #1
-.rept 16
- ld1 {v4.16b}, [x2], x3
- ld1 {v5.16b}, [x2], x3
- uxtl v0.8h, v4.8b
- uxtl2 v1.8h, v4.16b
- uxtl v2.8h, v5.8b
- uxtl2 v3.8h, v5.16b
- st1 {v0.8h-v1.8h}, [x0], x1
- st1 {v2.8h-v3.8h}, [x0], x1
-.endr
- ret
-.vl_gt_16_blockcopy_ps_16_32:
- ptrue p0.b, vl32
-.rept 32
- ld1b {z1.h}, p0/z, [x2]
- st1h {z1.h}, p0, [x0]
- add x0, x0, x1, lsl #1
- add x2, x2, x3
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_ps_32x64_sve)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_blockcopy_ps_32_64
- lsl x1, x1, #1
- mov w12, #8
-.Loop_cps32x64_sve:
- sub w12, w12, #1
-.rept 4
- ld1 {v16.16b-v17.16b}, [x2], x3
- ld1 {v18.16b-v19.16b}, [x2], x3
- uxtl v0.8h, v16.8b
- uxtl2 v1.8h, v16.16b
- uxtl v2.8h, v17.8b
- uxtl2 v3.8h, v17.16b
- uxtl v4.8h, v18.8b
- uxtl2 v5.8h, v18.16b
- uxtl v6.8h, v19.8b
- uxtl2 v7.8h, v19.16b
- st1 {v0.8h-v3.8h}, [x0], x1
- st1 {v4.8h-v7.8h}, [x0], x1
-.endr
- cbnz w12, .Loop_cps32x64_sve
- ret
-.vl_gt_16_blockcopy_ps_32_64:
- cmp x9, #48
- bgt .vl_gt_48_blockcopy_ps_32_64
- ptrue p0.b, vl32
-.rept 64
- ld1b {z2.h}, p0/z, [x2]
- ld1b {z3.h}, p0/z, [x2, #1, mul vl]
- st1h {z2.h}, p0, [x0]
- st1h {z3.h}, p0, [x0, #1, mul vl]
- add x0, x0, x1, lsl #1
- add x2, x2, x3
-.endr
- ret
-.vl_gt_48_blockcopy_ps_32_64:
- ptrue p0.b, vl64
-.rept 64
- ld1b {z2.h}, p0/z, [x2]
- st1h {z2.h}, p0, [x0]
- add x0, x0, x1, lsl #1
- add x2, x2, x3
-.endr
- ret
-endfunc
-
// chroma blockcopy_sp
function PFX(blockcopy_sp_4x8_sve)
ptrue p0.h, vl4
diff --git a/source/common/aarch64/blockcopy8.S b/source/common/aarch64/blockcopy8.S
index d466f8ea8..11685d254 100644
--- a/source/common/aarch64/blockcopy8.S
+++ b/source/common/aarch64/blockcopy8.S
@@ -123,94 +123,6 @@ function PFX(blockcopy_sp_64x64_neon)
ret
endfunc
-// void blockcopy_ps(int16_t* a, intptr_t stridea, const pixel* b, intptr_t strideb)
-function PFX(blockcopy_ps_4x4_neon)
- lsl x1, x1, #1
-.rept 2
- ld1 {v0.8b}, [x2], x3
- ld1 {v1.8b}, [x2], x3
- uxtl v0.8h, v0.8b
- uxtl v1.8h, v1.8b
- st1 {v0.4h}, [x0], x1
- st1 {v1.4h}, [x0], x1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_ps_8x8_neon)
- lsl x1, x1, #1
-.rept 4
- ld1 {v0.8b}, [x2], x3
- ld1 {v1.8b}, [x2], x3
- uxtl v0.8h, v0.8b
- uxtl v1.8h, v1.8b
- st1 {v0.8h}, [x0], x1
- st1 {v1.8h}, [x0], x1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_ps_16x16_neon)
- lsl x1, x1, #1
-.rept 8
- ld1 {v4.16b}, [x2], x3
- ld1 {v5.16b}, [x2], x3
- uxtl v0.8h, v4.8b
- uxtl2 v1.8h, v4.16b
- uxtl v2.8h, v5.8b
- uxtl2 v3.8h, v5.16b
- st1 {v0.8h-v1.8h}, [x0], x1
- st1 {v2.8h-v3.8h}, [x0], x1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_ps_32x32_neon)
- lsl x1, x1, #1
- mov w12, #4
-.Loop_cps32:
- sub w12, w12, #1
-.rept 4
- ld1 {v16.16b-v17.16b}, [x2], x3
- ld1 {v18.16b-v19.16b}, [x2], x3
- uxtl v0.8h, v16.8b
- uxtl2 v1.8h, v16.16b
- uxtl v2.8h, v17.8b
- uxtl2 v3.8h, v17.16b
- uxtl v4.8h, v18.8b
- uxtl2 v5.8h, v18.16b
- uxtl v6.8h, v19.8b
- uxtl2 v7.8h, v19.16b
- st1 {v0.8h-v3.8h}, [x0], x1
- st1 {v4.8h-v7.8h}, [x0], x1
-.endr
- cbnz w12, .Loop_cps32
- ret
-endfunc
-
-function PFX(blockcopy_ps_64x64_neon)
- lsl x1, x1, #1
- sub x1, x1, #64
- mov w12, #16
-.Loop_cps64:
- sub w12, w12, #1
-.rept 4
- ld1 {v16.16b-v19.16b}, [x2], x3
- uxtl v0.8h, v16.8b
- uxtl2 v1.8h, v16.16b
- uxtl v2.8h, v17.8b
- uxtl2 v3.8h, v17.16b
- uxtl v4.8h, v18.8b
- uxtl2 v5.8h, v18.16b
- uxtl v6.8h, v19.8b
- uxtl2 v7.8h, v19.16b
- st1 {v0.8h-v3.8h}, [x0], #64
- st1 {v4.8h-v7.8h}, [x0], x1
-.endr
- cbnz w12, .Loop_cps64
- ret
-endfunc
-
// void x265_blockcopy_ss(int16_t* a, intptr_t stridea, const int16_t* b, intptr_t strideb)
function PFX(blockcopy_ss_4x4_neon)
lsl x1, x1, #1
@@ -331,71 +243,6 @@ function PFX(blockcopy_ss_32x64_neon)
ret
endfunc
-// chroma blockcopy_ps
-function PFX(blockcopy_ps_4x8_neon)
- lsl x1, x1, #1
-.rept 4
- ld1 {v0.8b}, [x2], x3
- ld1 {v1.8b}, [x2], x3
- uxtl v0.8h, v0.8b
- uxtl v1.8h, v1.8b
- st1 {v0.4h}, [x0], x1
- st1 {v1.4h}, [x0], x1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_ps_8x16_neon)
- lsl x1, x1, #1
-.rept 8
- ld1 {v0.8b}, [x2], x3
- ld1 {v1.8b}, [x2], x3
- uxtl v0.8h, v0.8b
- uxtl v1.8h, v1.8b
- st1 {v0.8h}, [x0], x1
- st1 {v1.8h}, [x0], x1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_ps_16x32_neon)
- lsl x1, x1, #1
-.rept 16
- ld1 {v4.16b}, [x2], x3
- ld1 {v5.16b}, [x2], x3
- uxtl v0.8h, v4.8b
- uxtl2 v1.8h, v4.16b
- uxtl v2.8h, v5.8b
- uxtl2 v3.8h, v5.16b
- st1 {v0.8h-v1.8h}, [x0], x1
- st1 {v2.8h-v3.8h}, [x0], x1
-.endr
- ret
-endfunc
-
-function PFX(blockcopy_ps_32x64_neon)
- lsl x1, x1, #1
- mov w12, #8
-.Loop_cps32x64:
- sub w12, w12, #1
-.rept 4
- ld1 {v16.16b-v17.16b}, [x2], x3
- ld1 {v18.16b-v19.16b}, [x2], x3
- uxtl v0.8h, v16.8b
- uxtl2 v1.8h, v16.16b
- uxtl v2.8h, v17.8b
- uxtl2 v3.8h, v17.16b
- uxtl v4.8h, v18.8b
- uxtl2 v5.8h, v18.16b
- uxtl v6.8h, v19.8b
- uxtl2 v7.8h, v19.16b
- st1 {v0.8h-v3.8h}, [x0], x1
- st1 {v4.8h-v7.8h}, [x0], x1
-.endr
- cbnz w12, .Loop_cps32x64
- ret
-endfunc
-
// chroma blockcopy_sp
function PFX(blockcopy_sp_4x8_neon)
lsl x3, x3, #1
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index 9afd9f913..80678a827 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -916,31 +916,42 @@ int sad_pp_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intp
}
-template<int bx, int by>
-void blockcopy_ps_neon(int16_t *a, intptr_t stridea, const pixel *b, intptr_t strideb)
+#if !HIGH_BIT_DEPTH
+template<int width, int height>
+void blockcopy_ps_neon(int16_t *dst, intptr_t dst_stride, const pixel *src,
+ intptr_t src_stride)
{
- for (int y = 0; y < by; y++)
+ for (int h = 0; h < height; h++)
{
- int x = 0;
- for (; (x + 8) <= bx; x += 8)
+ int w = 0;
+ for (; w + 16 <= width; w += 16)
{
-#if HIGH_BIT_DEPTH
- vst1q_s16(a + x, vreinterpretq_s16_u16(vld1q_u16(b + x)));
-#else
- int16x8_t in = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(b + x)));
- vst1q_s16(a + x, in);
-#endif
+ uint8x16_t s = vld1q_u8(src + w);
+ uint8x16x2_t t = vzipq_u8(s, vdupq_n_u8(0));
+ int16x8x2_t s_s16;
+ s_s16.val[0] = vreinterpretq_s16_u8(t.val[0]);
+ s_s16.val[1] = vreinterpretq_s16_u8(t.val[1]);
+ vst1q_s16_x2(dst + w, s_s16);
}
- for (; x < bx; x++)
+ if (width & 8)
{
- a[x] = (int16_t)b[x];
+ uint8x8_t s = vld1_u8(src + w);
+ uint16x8_t s_u16 = vmovl_u8(s);
+ vst1q_s16(dst + w, vreinterpretq_s16_u16(s_u16));
+ w += 8;
+ }
+ if (width & 4)
+ {
+ uint8x8_t s = load_u8x4x1(src + w);
+ uint16x4_t s_u16 = vget_low_u16(vmovl_u8(s));
+ vst1_s16(dst + w, vreinterpret_s16_u16(s_u16));
}
- a += stridea;
- b += strideb;
+ dst += dst_stride;
+ src += src_stride;
}
}
-
+#endif // !HIGH_BIT_DEPTH
template<int width, int height>
void blockcopy_pp_neon(pixel *dst, intptr_t dst_stride, const pixel *src,
@@ -1758,6 +1769,18 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.pu[LUMA_ ## W ## x ## H].pixelavg_pp[ALIGNED] = pixelavg_pp_neon<W, H>;
#endif // !(HIGH_BIT_DEPTH)
+#if HIGH_BIT_DEPTH
+#define LUMA_CU(W, H) \
+ p.cu[BLOCK_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
+ p.cu[BLOCK_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
+ p.cu[BLOCK_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>; \
+ p.cu[BLOCK_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
+ p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shl = cpy2Dto1D_shl_neon<W>; \
+ p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[NONALIGNED] = cpy1Dto2D_shl_neon<W>; \
+ p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[ALIGNED] = cpy1Dto2D_shl_neon<W>; \
+ p.cu[BLOCK_ ## W ## x ## H].psy_cost_pp = psyCost_pp_neon<BLOCK_ ## W ## x ## H>; \
+ p.cu[BLOCK_ ## W ## x ## H].transpose = transpose_neon<W>;
+#else // !HIGH_BIT_DEPTH
#define LUMA_CU(W, H) \
p.cu[BLOCK_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
@@ -1770,7 +1793,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.cu[BLOCK_ ## W ## x ## H].cpy1Dto2D_shl[ALIGNED] = cpy1Dto2D_shl_neon<W>; \
p.cu[BLOCK_ ## W ## x ## H].psy_cost_pp = psyCost_pp_neon<BLOCK_ ## W ## x ## H>; \
p.cu[BLOCK_ ## W ## x ## H].transpose = transpose_neon<W>;
-
+#endif // HIGH_BIT_DEPTH
LUMA_PU_S(4, 4);
LUMA_PU_S(8, 8);
@@ -1920,7 +1943,19 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = satd8_neon<32, 24>;
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = satd8_neon<32, 32>;
+#if HIGH_BIT_DEPTH
+#define CHROMA_CU_420(W, H) \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
+#define CHROMA_CU_S_420(W, H) \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
+#else // !HIGH_BIT_DEPTH
#define CHROMA_CU_420(W, H) \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
@@ -1934,7 +1969,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
p.chroma[X265_CSP_I420].cu[BLOCK_420_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
-
+#endif // HIGH_BIT_DEPTH
CHROMA_CU_S_420(4, 4)
CHROMA_CU_420(8, 8)
@@ -2008,6 +2043,19 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = satd4_neon<12, 32>;
+#if HIGH_BIT_DEPTH
+#define CHROMA_CU_422(W, H) \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
+
+#define CHROMA_CU_S_422(W, H) \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
+#else // !HIGH_BIT_DEPTH
#define CHROMA_CU_422(W, H) \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
@@ -2021,6 +2069,7 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
p.chroma[X265_CSP_I422].cu[BLOCK_422_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>;
+#endif // HIGH_BIT_DEPTH
CHROMA_CU_S_422(4, 8)
--
2.39.5 (Apple Git-154)