[x265] [PATCH] AArch64: Clean up blockfill Neon and SVE2 functions

Li Zhang li.zhang2 at arm.com
Wed May 28 14:27:09 UTC 2025


Use the Neon intrinsics implementation of blockfill_s for all block
sizes. Unroll the Neon intrinsics implementation so that the compiler
emits STP instructions. Delete the Neon and SVE assembly
implementations of blockfill_s, as they are no faster and only serve
to increase binary size.
---
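Note: for illustration, a minimal standalone sketch of the unrolled
fill for the 16x16 case is included below. It is not the patch code
itself (the real change is the templated blockfill_s_neon added to
pixel-prim.cpp in this diff) and the function name is hypothetical.
Writing the two halves of each 32-byte row as adjacent 128-bit stores
lets the compiler merge them into a single STP of a Q-register pair.

#include <arm_neon.h>
#include <stdint.h>

// Sketch only: fill a 16x16 block of int16_t with val.
// The two back-to-back vst1q_s16 stores per row are typically
// combined by the compiler into one "stp q, q, [x0]".
static void blockfill_s_16x16_sketch(int16_t *dst, intptr_t dstride,
                                     int16_t val)
{
    const int16x8_t v = vdupq_n_s16(val);
    for (int h = 0; h < 16; h++)
    {
        vst1q_s16(dst + h * dstride, v);     // first 8 lanes of the row
        vst1q_s16(dst + h * dstride + 8, v); // second 8 lanes of the row
    }
}

The same pattern extends to the 32- and 64-wide blocks by stepping the
inner offset in units of 16 lanes, which is what the templated version
in the diff below does.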
 source/common/CMakeLists.txt             |  2 +-
 source/common/aarch64/asm-primitives.cpp |  8 ---
 source/common/aarch64/blockcopy8-sve.S   | 70 ------------------------
 source/common/aarch64/blockcopy8.S       | 56 -------------------
 source/common/aarch64/pixel-prim.cpp     | 49 ++++++++---------
 5 files changed, 25 insertions(+), 160 deletions(-)
 delete mode 100644 source/common/aarch64/blockcopy8-sve.S

diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index 7eb40fb05..405ec0b2d 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -114,7 +114,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
     # Add Arm assembly files here.
     set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S blockcopy8.S ssd-a.S ssd-a-common.S intrapred.S dct.S)
     set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S ssd-neon-dotprod.S)
-    set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S)
+    set(A_SRCS_SVE asm-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S)
     set(A_SRCS_SVE2 mc-a-sve2.S pixel-util-sve2.S)
     set(VEC_PRIMITIVES)
 
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 536af1d5d..5ce9352bd 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -404,10 +404,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
     ALL_CHROMA_444_PU(p2s[NONALIGNED], filterPixelToShort, neon);
     ALL_LUMA_PU(convert_p2s[NONALIGNED], filterPixelToShort, neon);
 
-    // Block_fill
-    ALL_LUMA_TU(blockfill_s[ALIGNED], blockfill_s, neon);
-    ALL_LUMA_TU(blockfill_s[NONALIGNED], blockfill_s, neon);
-
     // copy_count
     p.cu[BLOCK_4x4].copy_cnt     = PFX(copy_cnt_4_neon);
     p.cu[BLOCK_8x8].copy_cnt     = PFX(copy_cnt_8_neon);
@@ -589,10 +585,6 @@ void setupSvePrimitives(EncoderPrimitives &p)
     CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(p2s[NONALIGNED]);
     LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(convert_p2s[NONALIGNED]);
 
-    // Block_fill
-    LUMA_TU_CAN_USE_SVE(blockfill_s[ALIGNED], blockfill_s);
-    LUMA_TU_CAN_USE_SVE(blockfill_s[NONALIGNED], blockfill_s);
-
     // sse_ss
     p.cu[BLOCK_4x4].sse_ss   = PFX(pixel_sse_ss_4x4_sve);
     p.cu[BLOCK_8x8].sse_ss   = PFX(pixel_sse_ss_8x8_sve);
diff --git a/source/common/aarch64/blockcopy8-sve.S b/source/common/aarch64/blockcopy8-sve.S
deleted file mode 100644
index 401167038..000000000
--- a/source/common/aarch64/blockcopy8-sve.S
+++ /dev/null
@@ -1,70 +0,0 @@
-/*****************************************************************************
- * Copyright (C) 2022-2023 MulticoreWare, Inc
- *
- * Authors: David Chen <david.chen at myais.com.cn>
- 
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
- *
- * This program is also available under a commercial proprietary license.
- * For more information, contact us at license @ x265.com.
- *****************************************************************************/
-
-#include "asm-sve.S"
-
-.arch armv8-a+sve
-
-#ifdef __APPLE__
-.section __RODATA,__rodata
-#else
-.section .rodata
-#endif
-
-.align 4
-
-.text
-
-function PFX(blockfill_s_32x32_sve)
-    rdvl            x9, #1
-    cmp             x9, #16
-    bgt             .vl_gt_16_blockfill_s_32_32
-    dup             v0.8h, w2
-    mov             v1.16b, v0.16b
-    mov             v2.16b, v0.16b
-    mov             v3.16b, v0.16b
-    lsl             x1, x1, #1
-.rept 32
-    st1             {v0.8h-v3.8h}, [x0], x1
-.endr
-    ret
-.vl_gt_16_blockfill_s_32_32:
-    cmp             x9, #48
-    bgt             .vl_gt_48_blockfill_s_32_32
-    dup             z0.h, w2
-    ptrue           p0.h, vl16
-.rept 32
-    st1h            {z0.h}, p0, [x0]
-    st1h            {z0.h}, p0, [x0, #1, mul vl]
-    add             x0, x0, x1, lsl #1
-.endr
-    ret
-.vl_gt_48_blockfill_s_32_32:
-    dup             z0.h, w2
-    ptrue           p0.h, vl32
-.rept 32
-    st1h            {z0.h}, p0, [x0]
-    add             x0, x0, x1, lsl #1
-.endr
-    ret
-endfunc
diff --git a/source/common/aarch64/blockcopy8.S b/source/common/aarch64/blockcopy8.S
index 00b49df4d..0c30ed614 100644
--- a/source/common/aarch64/blockcopy8.S
+++ b/source/common/aarch64/blockcopy8.S
@@ -33,62 +33,6 @@
 
 .text
 
-// void x265_blockfill_s_neon(int16_t* dst, intptr_t dstride, int16_t val)
-function PFX(blockfill_s_4x4_neon)
-    dup             v0.4h, w2
-    lsl             x1, x1, #1
-.rept 4
-    st1             {v0.4h}, [x0], x1
-.endr
-    ret
-endfunc
-
-function PFX(blockfill_s_8x8_neon)
-    dup             v0.8h, w2
-    lsl             x1, x1, #1
-.rept 8
-    st1             {v0.8h}, [x0], x1
-.endr
-    ret
-endfunc
-
-function PFX(blockfill_s_16x16_neon)
-    dup             v0.8h, w2
-    mov             v1.16b, v0.16b
-    lsl             x1, x1, #1
-.rept 16
-    stp             q0, q1, [x0]
-    add             x0, x0, x1
-.endr
-    ret
-endfunc
-
-function PFX(blockfill_s_32x32_neon)
-    dup             v0.8h, w2
-    mov             v1.16b, v0.16b
-    mov             v2.16b, v0.16b
-    mov             v3.16b, v0.16b
-    lsl             x1, x1, #1
-.rept 32
-    st1             {v0.8h-v3.8h}, [x0], x1
-.endr
-    ret
-endfunc
-
-function PFX(blockfill_s_64x64_neon)
-    dup             v0.8h, w2
-    mov             v1.16b, v0.16b
-    mov             v2.16b, v0.16b
-    mov             v3.16b, v0.16b
-    lsl             x1, x1, #1
-    sub             x1, x1, #64
-.rept 64
-    st1             {v0.8h-v3.8h}, [x0], #64
-    st1             {v0.8h-v3.8h}, [x0], x1
-.endr
-    ret
-endfunc
-
 // uint32_t copy_count(int16_t* coeff, const int16_t* residual, intptr_t resiStride)
 function PFX(copy_cnt_4_neon)
     lsl             x2, x2, #1
diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index 575c9cab8..f4df6786e 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -819,24 +819,6 @@ static inline int pixel_sa8d_16x16_neon(const uint8_t *pix1, intptr_t stride_pix
 
 #endif // HIGH_BIT_DEPTH
 
-template<int size>
-void blockfill_s_neon(int16_t *dst, intptr_t dstride, int16_t val)
-{
-    for (int y = 0; y < size; y++)
-    {
-        int x = 0;
-        int16x8_t v = vdupq_n_s16(val);
-        for (; (x + 8) <= size; x += 8)
-        {
-            vst1q_s16(dst + y * dstride + x, v);
-        }
-        for (; x < size; x++)
-        {
-            dst[y * dstride + x] = val;
-        }
-    }
-}
-
 template<int lx, int ly>
 int sad_pp_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intptr_t stride_pix2)
 {
@@ -915,6 +897,26 @@ int sad_pp_neon(const pixel *pix1, intptr_t stride_pix1, const pixel *pix2, intp
     return sum;
 }
 
+template<int size>
+void blockfill_s_neon(int16_t *dst, intptr_t dstride, int16_t val)
+{
+    for (int h = 0; h < size; h++)
+    {
+        for (int w = 0; w + 16 <= size; w += 16)
+        {
+            vst1q_s16(dst + h * dstride + w, vdupq_n_s16(val));
+            vst1q_s16(dst + h * dstride + w + 8, vdupq_n_s16(val));
+        }
+        if (size == 8)
+        {
+            vst1q_s16(dst + h * dstride, vdupq_n_s16(val));
+        }
+        if (size == 4)
+        {
+            vst1_s16(dst + h * dstride, vdup_n_s16(val));
+        }
+    }
+}
 
 #if !HIGH_BIT_DEPTH
 template<int width, int height>
@@ -1937,6 +1939,8 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
     p.cu[BLOCK_ ## W ## x ## H].sub_ps        = pixel_sub_ps_neon<W, H>; \
     p.cu[BLOCK_ ## W ## x ## H].add_ps[NONALIGNED]    = pixel_add_ps_neon<W, H>; \
     p.cu[BLOCK_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>; \
+    p.cu[BLOCK_ ## W ## x ## H].blockfill_s[NONALIGNED] = blockfill_s_neon<W>; \
+    p.cu[BLOCK_ ## W ## x ## H].blockfill_s[ALIGNED]    = blockfill_s_neon<W>; \
     p.cu[BLOCK_ ## W ## x ## H].copy_pp       = blockcopy_pp_neon<W, H>; \
     p.cu[BLOCK_ ## W ## x ## H].copy_ss       = blockcopy_ss_neon<W, H>; \
     p.cu[BLOCK_ ## W ## x ## H].cpy2Dto1D_shl = cpy2Dto1D_shl_neon<W>; \
@@ -1949,6 +1953,8 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
     p.cu[BLOCK_ ## W ## x ## H].sub_ps        = pixel_sub_ps_neon<W, H>; \
     p.cu[BLOCK_ ## W ## x ## H].add_ps[NONALIGNED]    = pixel_add_ps_neon<W, H>; \
     p.cu[BLOCK_ ## W ## x ## H].add_ps[ALIGNED] = pixel_add_ps_neon<W, H>; \
+    p.cu[BLOCK_ ## W ## x ## H].blockfill_s[NONALIGNED] = blockfill_s_neon<W>; \
+    p.cu[BLOCK_ ## W ## x ## H].blockfill_s[ALIGNED]    = blockfill_s_neon<W>; \
     p.cu[BLOCK_ ## W ## x ## H].copy_pp       = blockcopy_pp_neon<W, H>; \
     p.cu[BLOCK_ ## W ## x ## H].copy_ps       = blockcopy_ps_neon<W, H>; \
     p.cu[BLOCK_ ## W ## x ## H].copy_ss       = blockcopy_ss_neon<W, H>; \
@@ -2028,13 +2034,6 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p)
     p.cu[BLOCK_64x64].var = pixel_var_neon<64>;
 #endif // !(HIGH_BIT_DEPTH)
 
-    p.cu[BLOCK_16x16].blockfill_s[NONALIGNED] = blockfill_s_neon<16>;
-    p.cu[BLOCK_16x16].blockfill_s[ALIGNED]    = blockfill_s_neon<16>;
-    p.cu[BLOCK_32x32].blockfill_s[NONALIGNED] = blockfill_s_neon<32>;
-    p.cu[BLOCK_32x32].blockfill_s[ALIGNED]    = blockfill_s_neon<32>;
-    p.cu[BLOCK_64x64].blockfill_s[NONALIGNED] = blockfill_s_neon<64>;
-    p.cu[BLOCK_64x64].blockfill_s[ALIGNED]    = blockfill_s_neon<64>;
-
 
     p.cu[BLOCK_4x4].calcresidual[NONALIGNED]    = getResidual_neon<4>;
     p.cu[BLOCK_4x4].calcresidual[ALIGNED]       = getResidual_neon<4>;
-- 
2.39.5 (Apple Git-154)
