[x265] [PATCH] AArch64: Clean up blockfill Neon and SVE functions
Li Zhang
li.zhang2 at arm.com
Wed May 28 14:27:09 UTC 2025
Use the Neon intrinsics implementation of blockfill_s for all block
sizes. Unroll the Neon intrinsics implementation so that the compiler
emits STP (store pair) instructions. Delete the Neon and SVE assembly
implementations of blockfill_s, as they are no faster and only serve to
increase binary size.
---
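For reference, a minimal sketch of the scalar semantics that blockfill_s
provides (the helper name below is illustrative only and not part of x265;
it matches the scalar tail loop of the previous intrinsics implementation):

    // Fill a size x size block of int16_t with val; dstride is the row
    // stride in elements, so row y starts at dst + y * dstride.
    static void blockfill_s_ref(int16_t *dst, intptr_t dstride, int16_t val,
                                int size)
    {
        for (int y = 0; y < size; y++)
            for (int x = 0; x < size; x++)
                dst[y * dstride + x] = val;
    }
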
source/common/CMakeLists.txt | 2 +-
source/common/aarch64/asm-primitives.cpp | 8 ---
source/common/aarch64/blockcopy8-sve.S | 70 ------------------------
source/common/aarch64/blockcopy8.S | 56 -------------------
source/common/aarch64/pixel-prim.cpp | 49 ++++++++---------
5 files changed, 25 insertions(+), 160 deletions(-)
delete mode 100644 source/common/aarch64/blockcopy8-sve.S
diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index 7eb40fb05..405ec0b2d 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -114,7 +114,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
# Add Arm assembly files here.
set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S blockcopy8.S ssd-a.S ssd-a-common.S intrapred.S dct.S)
set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S ssd-neon-dotprod.S)
- set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S)
+ set(A_SRCS_SVE asm-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S)
set(A_SRCS_SVE2 mc-a-sve2.S pixel-util-sve2.S)
set(VEC_PRIMITIVES)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 536af1d5d..5ce9352bd 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -404,10 +404,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
ALL_CHROMA_444_PU(p2s[NONALIGNED], filterPixelToShort, neon);
ALL_LUMA_PU(convert_p2s[NONALIGNED], filterPixelToShort, neon);
- // Block_fill
- ALL_LUMA_TU(blockfill_s[ALIGNED], blockfill_s, neon);
- ALL_LUMA_TU(blockfill_s[NONALIGNED], blockfill_s, neon);
-
// copy_count
p.cu[BLOCK_4x4].copy_cnt = PFX(copy_cnt_4_neon);
p.cu[BLOCK_8x8].copy_cnt = PFX(copy_cnt_8_neon);
@@ -589,10 +585,6 @@ void setupSvePrimitives(EncoderPrimitives &p)
CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(p2s[NONALIGNED]);
LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(convert_p2s[NONALIGNED]);
- // Block_fill
- LUMA_TU_CAN_USE_SVE(blockfill_s[ALIGNED], blockfill_s);
- LUMA_TU_CAN_USE_SVE(blockfill_s[NONALIGNED], blockfill_s);
-
// sse_ss
p.cu[BLOCK_4x4].sse_ss = PFX(pixel_sse_ss_4x4_sve);
p.cu[BLOCK_8x8].sse_ss = PFX(pixel_sse_ss_8x8_sve);
diff --git a/source/common/aarch64/blockcopy8-sve.S b/source/common/aarch64/blockcopy8-sve.S
deleted file mode 100644
index 401167038..000000000
--- a/source/common/aarch64/blockcopy8-sve.S
+++ /dev/null
@@ -1,70 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2022-2023 MulticoreWare, Inc
- *
- * Authors: David Chen <david.chen at myais.com.cn>
-
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com.
- *****************************************************************************/
-
-#include "asm-sve.S"
-
-.arch armv8-a+sve
-
-#ifdef __APPLE__
-.section __RODATA,__rodata
-#else
-.section .rodata
-#endif
-
-.align 4
-
-.text
-
-function PFX(blockfill_s_32x32_sve)
- rdvl x9, #1
- cmp x9, #16
- bgt .vl_gt_16_blockfill_s_32_32
- dup v0.8h, w2
- mov v1.16b, v0.16b
- mov v2.16b, v0.16b
- mov v3.16b, v0.16b
- lsl x1, x1, #1
-.rept 32
- st1 {v0.8h-v3.8h}, [x0], x1
-.endr
- ret
-.vl_gt_16_blockfill_s_32_32:
- cmp x9, #48
- bgt .vl_gt_48_blockfill_s_32_32
- dup z0.h, w2
- ptrue p0.h, vl16
-.rept 32
- st1h {z0.h}, p0, [x0]
- st1h {z0.h}, p0, [x0, #1, mul vl]
- add x0, x0, x1, lsl #1
-.endr
- ret
-.vl_gt_48_blockfill_s_32_32:
- dup z0.h, w2
- ptrue p0.h, vl32
-.rept 32
- st1h {z0.h}, p0, [x0]
- add x0, x0, x1, lsl #1
-.endr
- ret
-endfunc
diff --git a/source/common/aarch64/blockcopy8.S b/source/common/aarch64/blockcopy8.S
index 00b49df4d..0c30ed614 100644
--- a/source/common/aarch64/blockcopy8.S
+++ b/source/common/aarch64/blockcopy8.S
@@ -33,62 +33,6 @@
.text
-// void x265_blockfill_s_neon(int16_t* dst, intptr_t dstride, int16_t val)
-function PFX(blockfill_s_4x4_neon)
- dup v0.4h, w2
- lsl x1, x1, #1
-.rept 4
- st1 {v0.4h}, [x0], x1
-.endr
- ret
-endfunc
-
-function PFX(blockfill_s_8x8_neon)
- dup v0.8h, w2
- lsl x1, x1, #1
-.rept 8
- st1 {v0.8h}, [x0], x1
-.endr
- ret
-endfunc
-
-function PFX(blockfill_s_16x16_neon)
- dup v0.8h, w2
- mov v1.16b, v0.16b
- lsl x1, x1, #1
-.rept 16
- stp q0, q1, [x0]
- add x0, x0, x1
-.endr
- ret
-endfunc
-
-function PFX(blockfill_s_32x32_neon)
- dup v0.8h, w2
- mov v1.16b, v0.16b
- mov v2.16b, v0.16b
- mov v3.16b, v0.16b
- lsl x1, x1, #1
-.rept 32
- st1 {v0.8h-v3.8h}, [x0], x1
-.endr
- ret
-endfunc
-
-function PFX(blockfill_s_64x64_neon)
- dup v0.8h, w2
- mov v1.16b, v0.16b
- mov v2.16b, v0.16b
- mov v3.16b, v0.16b
- lsl x1, x1, #1
- sub x1, x1, #64
-.rept 64
- st1 {v0.8h-v3.8h}, [x0], #64
- st1 {v0.8h-v3.8h}, [x0], x1
-.endr
- ret
-endfunc
-
// uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t resiStride)
function PFX(copy_cnt_4_neon)
lsl x2, x2, #1
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index 575c9cab8..f4df6786e 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -819,24 +819,6 @@ static inline int pixel_sa8d_16x16_neon(const uint8_t *pix1, intptr_t stride_pix
#endif // HIGH_BIT_DEPTH
-template<int size>
-void blockfill_s_neon(int16_t *dst, intptr_t dstride, int16_t val)
-{
- for (int y = 0; y < size; y++)
- {
- int x = 0;
- int16x8_t v = vdupq_n_s16(val);
- for (; (x + 8) <= size; x += 8)
- {
- vst1q_s16(dst + y * dstride + x, v);
- }
- for (; x < size; x++)
- {
- dst[y * dstride + x] = val;
- }
- }
-}
-
template<int lx, int ly>
int sad_pp_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
{
@@ -915,6 +897,26 @@ int sad_pp_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intp
return sum;
}
+template<int size>
+void blockfill_s_neon(int16_t *dst, intptr_t dstride, int16_t val)
+{
+ for (int h = 0; h < size; h++)
+ {
+ for (int w = 0; w + 16 <= size; w += 16)
+ {
+ vst1q_s16(dst + h * dstride + w, vdupq_n_s16(val));
+ vst1q_s16(dst + h * dstride + w + 8, vdupq_n_s16(val));
+ }
+ if (size == 8)
+ {
+ vst1q_s16(dst + h * dstride, vdupq_n_s16(val));
+ }
+ if (size == 4)
+ {
+ vst1_s16(dst + h * dstride, vdup_n_s16(val));
+ }
+ }
+}
#if !HIGH_BIT_DEPTH
template<int width, int height>
@@ -1937,6 +1939,8 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.cu[BLOCK_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>; \
+ p.cu[BLOCK_ ## W ## x ## H].blockfill_s[NONALIGNED] = blockfill_s_neon<W>; \
+ p.cu[BLOCK_ ## W ## x ## H].blockfill_s[ALIGNED] = blockfill_s_neon<W>; \
p.cu[BLOCK_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shl = cpy2Dto1D_shl_neon<W>; \
@@ -1949,6 +1953,8 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.cu[BLOCK_ ## W ## x ## H].sub_ps = pixel_sub_ps_neon<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].add_ps[NONALIGNED] = pixel_add_ps_neon<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>; \
+ p.cu[BLOCK_ ## W ## x ## H].blockfill_s[NONALIGNED] = blockfill_s_neon<W>; \
+ p.cu[BLOCK_ ## W ## x ## H].blockfill_s[ALIGNED] = blockfill_s_neon<W>; \
p.cu[BLOCK_ ## W ## x ## H].copy_pp = blockcopy_pp_neon<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].copy_ps = blockcopy_ps_neon<W, H>; \
p.cu[BLOCK_ ## W ## x ## H].copy_ss = blockcopy_ss_neon<W, H>; \
@@ -2028,13 +2034,6 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
p.cu[BLOCK_64x64].var = pixel_var_neon<64>;
#endif // !(HIGH_BIT_DEPTH)
- p.cu[BLOCK_16x16].blockfill_s[NONALIGNED] = blockfill_s_neon<16>;
- p.cu[BLOCK_16x16].blockfill_s[ALIGNED] = blockfill_s_neon<16>;
- p.cu[BLOCK_32x32].blockfill_s[NONALIGNED] = blockfill_s_neon<32>;
- p.cu[BLOCK_32x32].blockfill_s[ALIGNED] = blockfill_s_neon<32>;
- p.cu[BLOCK_64x64].blockfill_s[NONALIGNED] = blockfill_s_neon<64>;
- p.cu[BLOCK_64x64].blockfill_s[ALIGNED] = blockfill_s_neon<64>;
-
p.cu[BLOCK_4x4].calcresidual[NONALIGNED] = getResidual_neon<4>;
p.cu[BLOCK_4x4].calcresidual[ALIGNED] = getResidual_neon<4>;
--
2.39.5 (Apple Git-154)