[x264-devel] [Git][videolan/x264][master] 8 commits: Create Common NEON dct-a Macros

Martin Storsjö (@mstorsjo) gitlab at videolan.org
Tue Nov 28 10:04:28 UTC 2023



Martin Storsjö pushed to branch master at VideoLAN / x264


Commits:
b6190c6f by David Chen at 2023-11-18T08:42:48+02:00
Create Common NEON dct-a Macros

Place the NEON dct-a macros that are also intended to be used by the
SVE/SVE2 functions in a common file.
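
As a rough sketch of the resulting layout (file names as in this series),
the NEON file and the new SVE/SVE2 files simply pull the shared macros in
via the preprocessor:

    // common/aarch64/dct-a.S (NEON) and dct-a-sve.S / dct-a-sve2.S (SVE/SVE2)
    #include "asm.S"
    #include "dct-a-common.S"   // shared NEON macros, e.g. DCT_1D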

- - - - -
5c382660 by David Chen at 2023-11-20T08:03:51+02:00
Improve dct-a.S Performance by Using SVE/SVE2

Improve the performance of the NEON functions in aarch64/dct-a.S by
using the SVE/SVE2 instruction set. The affected functions are listed
below together with the measured performance numbers.

Command executed: ./checkasm8 --bench=sub
Testbed: Alibaba g8y instance based on Yitian 710 CPU
Results:
sub4x4_dct_c: 528
sub4x4_dct_neon: 322
sub4x4_dct_sve: 247

Command executed: ./checkasm8 --bench=sub
Testbed: AWS Graviton3
Results:
sub4x4_dct_c: 562
sub4x4_dct_neon: 376
sub4x4_dct_sve: 255

Command executed: ./checkasm8 --bench=add
Testbed: Alibaba g8y instance based on Yitian 710 CPU
Results:
add4x4_idct_c: 698
add4x4_idct_neon: 386
add4x4_idct_sve2: 345

Command executed: ./checkasm8 --bench=zigzag
Testbed: Alibaba g8y instance based on Yitian 710 CPU
Results:
zigzag_interleave_8x8_cavlc_frame_c: 582
zigzag_interleave_8x8_cavlc_frame_neon: 273
zigzag_interleave_8x8_cavlc_frame_sve: 257

Command executed: ./checkasm8 --bench=zigzag
Testbed: AWS Graviton3
Results:
zigzag_interleave_8x8_cavlc_frame_c: 587
zigzag_interleave_8x8_cavlc_frame_neon: 257
zigzag_interleave_8x8_cavlc_frame_sve: 249
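
The core SVE idea in these kernels (a condensed sketch based on the
sub4x4_dct diff below, not a complete function) is that predicated loads
widen the 8-bit pixels into 16-bit lanes on the way in, so the separate
widening step of the NEON code disappears:

    // NEON (roughly): load 4 bytes per row, widen while subtracting
    ld1         {v0.s}[0], [x1], x3
    ld1         {v1.s}[0], [x2], x4
    usubl       v16.8h, v0.8b, v1.8b

    // SVE: a 4-lane predicate lets ld1b zero-extend straight into .h lanes,
    // so a plain 16-bit subtract is enough (SVE loads have no post-index,
    // hence the separate pointer adds in the real code)
    ptrue       p0.h, vl4
    ld1b        {z0.h}, p0/z, [x1]
    ld1b        {z1.h}, p0/z, [x2]
    sub         v16.4h, v0.4h, v1.4h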

- - - - -
37949a99 by David Chen at 2023-11-20T08:03:53+02:00
Create Common NEON deblock-a Macros

Place the NEON deblock-a macros that are also intended to be used by the
SVE/SVE2 functions in a common file.

- - - - -
5ad5e5d8 by David Chen at 2023-11-20T08:03:54+02:00
Improve deblock-a.S Performance by Using SVE/SVE2

Improve the performance of the NEON functions in aarch64/deblock-a.S by
using the SVE/SVE2 instruction set. The affected functions are listed
below together with the measured performance numbers.

Command executed: ./checkasm8 --bench=deblock
Testbed: Alibaba g8y instance based on Yitian 710 CPU
Results:
deblock_chroma[1]_c: 735
deblock_chroma[1]_neon: 427
deblock_chroma[1]_sve: 353

Command executed: ./checkasm8 --bench=deblock
Testbed: AWS Graviton3
Results:
deblock_chroma[1]_c: 719
deblock_chroma[1]_neon: 442
deblock_chroma[1]_sve: 345
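
The chroma deblock gain comes mostly from expressing the alpha/beta tests
as SVE predicates; a sketch taken from deblock_v_chroma_sve in the diff
below (the filtering arithmetic in between is unchanged from NEON):

    ptrue           p0.b, vl16
    cmphi           p1.b, p0/z, z22.b, z26.b   // alpha > |p0 - q0|  (z22 = alpha)
    cmphi           p2.b, p0/z, z22.b, z28.b   // beta  > |p1 - p0|  (z22 re-dup'ed with beta)
    cmphi           p3.b, p0/z, z22.b, z30.b   // beta  > |q1 - q0|
    and             p1.b, p0/z, p1.b, p2.b     // combine all three conditions
    and             p1.b, p0/z, p1.b, p3.b
    ...
    st1b            {z16.b}, p1, [x0]          // predicated store: only lanes that
                                               // pass the filter test are written back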

- - - - -
21a788f1 by David Chen at 2023-11-23T08:24:13+02:00
Create Common NEON mc-a Macros and Functions

Place the NEON mc-a macros and functions that are also intended to be
used by the SVE/SVE2 functions in a common file.

- - - - -
06dcf3f9 by David Chen at 2023-11-23T08:24:16+02:00
Improve mc-a.S Performance by Using SVE/SVE2

Improve the performance of the NEON functions in aarch64/mc-a.S by
using the SVE/SVE2 instruction set. The affected functions are listed
below together with the measured performance numbers.

Command executed: ./checkasm8 --bench=avg
Testbed: Alibaba g8y instance based on Yitian 710 CPU
Results:
avg_4x2_c: 274
avg_4x2_neon: 215
avg_4x2_sve: 171
avg_4x4_c: 461
avg_4x4_neon: 343
avg_4x4_sve: 225
avg_4x8_c: 806
avg_4x8_neon: 619
avg_4x8_sve: 334
avg_4x16_c: 1523
avg_4x16_neon: 1168
avg_4x16_sve: 558

Command executed: ./checkasm8 --bench=avg
Testbed: AWS Graviton3
Results:
avg_4x2_c: 267
avg_4x2_neon: 213
avg_4x2_sve: 167
avg_4x4_c: 467
avg_4x4_neon: 350
avg_4x4_sve: 221
avg_4x8_c: 784
avg_4x8_neon: 624
avg_4x8_sve: 302
avg_4x16_c: 1445
avg_4x16_neon: 1182
avg_4x16_sve: 485
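
For the narrow 4xN averages the win again comes from widening loads; a
condensed sketch of the weighted path (add_add variant) from
pixel_avg_weight_w4_*_sve in the diff below:

    ptrue       p0.b, vl8                // first 4 .h lanes active for the ld1b loads
    dup         v30.8h, w6               // weight
    dup         v31.8h, w7               // 64 - weight (negated in the other variants)
    ld1b        {z0.h}, p0/z, [x2]       // 4 src1 pixels, widened to 16 bit on load
    ld1b        {z1.h}, p0/z, [x4]       // 4 src2 pixels
    mul         v4.8h, v0.8h, v30.8h     // plain 16-bit mul/mla instead of umull/umlal
    mla         v4.8h, v1.8h, v31.8h
    sqrshrun    v0.8b, v4.8h, #6         // (w1*a + w2*b + 32) >> 6, saturated to 8 bit
    st1         {v0.s}[0], [x0], x1      // write one 4-pixel row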

- - - - -
0ac52d29 by David Chen at 2023-11-23T08:26:53+02:00
Create Common NEON pixel-a Macros and Constants

Place the NEON pixel-a macros and constants that are also intended to be
used by the SVE/SVE2 functions in a common file.

- - - - -
c1c9931d by David Chen at 2023-11-23T19:01:29+02:00
Improve pixel-a.S Performance by Using SVE/SVE2

Improve the performance of the NEON functions in aarch64/pixel-a.S by
using the SVE/SVE2 instruction set. The affected functions are listed
below together with the measured performance numbers.

Command executed: ./checkasm8 --bench=ssd
Testbed: Alibaba g8y instance based on Yitian 710 CPU
Results:
ssd_4x4_c: 235
ssd_4x4_neon: 226
ssd_4x4_sve: 151
ssd_4x8_c: 409
ssd_4x8_neon: 363
ssd_4x8_sve: 201
ssd_4x16_c: 781
ssd_4x16_neon: 653
ssd_4x16_sve: 313
ssd_8x4_c: 402
ssd_8x4_neon: 192
ssd_8x4_sve: 192
ssd_8x8_c: 728
ssd_8x8_neon: 275
ssd_8x8_sve: 275

Command executed: ./checkasm10 --bench=ssd
Testbed: Alibaba g8y instance based on Yitian 710 CPU
Results:
ssd_4x4_c: 256
ssd_4x4_neon: 226
ssd_4x4_sve: 153
ssd_4x8_c: 460
ssd_4x8_neon: 369
ssd_4x8_sve: 215
ssd_4x16_c: 852
ssd_4x16_neon: 651
ssd_4x16_sve: 340

Command executed: ./checkasm8 --bench=ssd
Testbed: AWS Graviton3
Results:
ssd_4x4_c: 295
ssd_4x4_neon: 288
ssd_4x4_sve: 228
ssd_4x8_c: 454
ssd_4x8_neon: 431
ssd_4x8_sve: 294
ssd_4x16_c: 779
ssd_4x16_neon: 631
ssd_4x16_sve: 438
ssd_8x4_c: 463
ssd_8x4_neon: 247
ssd_8x4_sve: 246
ssd_8x8_c: 781
ssd_8x8_neon: 413
ssd_8x8_sve: 353

Command executed: ./checkasm10 --bench=ssd
Testbed: AWS Graviton3
Results:
ssd_4x4_c: 322
ssd_4x4_neon: 335
ssd_4x4_sve: 240
ssd_4x8_c: 522
ssd_4x8_neon: 448
ssd_4x8_sve: 294
ssd_4x16_c: 832
ssd_4x16_neon: 603
ssd_4x16_sve: 440

Command executed: ./checkasm8 --bench=sa8d
Testbed: Alibaba g8y instance based on Yitian 710 CPU
Results:
sa8d_8x8_c: 2103
sa8d_8x8_neon: 619
sa8d_8x8_sve: 617

Command executed: ./checkasm8 --bench=sa8d
Testbed: AWS Graviton3
Results:
sa8d_8x8_c: 2021
sa8d_8x8_neon: 597
sa8d_8x8_sve: 580

Command executed: ./checkasm8 --bench=var
Testbed: Alibaba g8y instance based on Yitian 710 CPU
Results:
var_8x8_c: 595
var_8x8_neon: 262
var_8x8_sve: 262
var_8x16_c: 1193
var_8x16_neon: 435
var_8x16_sve: 419

Command executed: ./checkasm8 --bench=var
Testbed: AWS Graviton3
Results:
var_8x8_c: 616
var_8x8_neon: 229
var_8x8_sve: 222
var_8x16_c: 1207
var_8x16_neon: 399
var_8x16_sve: 389

Command executed: ./checkasm8 --bench=hadamard_ac
Testbed: Alibaba g8y instance based on Yitian 710 CPU
Results:
hadamard_ac_8x8_c: 2330
hadamard_ac_8x8_neon: 635
hadamard_ac_8x8_sve: 635
hadamard_ac_8x16_c: 4500
hadamard_ac_8x16_neon: 1152
hadamard_ac_8x16_sve: 1151
hadamard_ac_16x8_c: 4499
hadamard_ac_16x8_neon: 1151
hadamard_ac_16x8_sve: 1150
hadamard_ac_16x16_c: 8812
hadamard_ac_16x16_neon: 2187
hadamard_ac_16x16_sve: 2186

Command executed: ./checkasm8 --bench=hadamard_ac
Testbed: AWS Graviton3
Results:
hadamard_ac_8x8_c: 2266
hadamard_ac_8x8_neon: 517
hadamard_ac_8x8_sve: 513
hadamard_ac_8x16_c: 4444
hadamard_ac_8x16_neon: 867
hadamard_ac_8x16_sve: 849
hadamard_ac_16x8_c: 4443
hadamard_ac_16x8_neon: 880
hadamard_ac_16x8_sve: 868
hadamard_ac_16x16_c: 8595
hadamard_ac_16x16_neon: 1656
hadamard_ac_16x16_sve: 1622
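
Most of these kernels follow the same widening-load recipe; as one
example, the 8-bit 4-wide SSD inner step from the diff below (the 10-bit
variant does the same with ld1h into .s lanes):

    ptrue       p0.h, vl4
    ld1b        {z16.h}, p0/z, [x0]      // 4 pixels, zero-extended to 16 bit on load
    ld1b        {z17.h}, p0/z, [x2]
    add         x0, x0, x1               // advance row pointers
    add         x2, x2, x3
    sub         v2.4h, v16.4h, v17.4h    // difference fits comfortably in 16 bit
    smlal       v0.4s, v2.4h, v2.4h      // accumulate squared differences in 32 bit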

- - - - -


21 changed files:

- Makefile
- + common/aarch64/dct-a-common.S
- + common/aarch64/dct-a-sve.S
- + common/aarch64/dct-a-sve2.S
- common/aarch64/dct-a.S
- common/aarch64/dct.h
- + common/aarch64/deblock-a-common.S
- + common/aarch64/deblock-a-sve.S
- common/aarch64/deblock-a.S
- common/aarch64/deblock.h
- + common/aarch64/mc-a-common.S
- + common/aarch64/mc-a-sve.S
- common/aarch64/mc-a.S
- common/aarch64/mc-c.c
- + common/aarch64/pixel-a-common.S
- + common/aarch64/pixel-a-sve.S
- common/aarch64/pixel-a.S
- common/aarch64/pixel.h
- common/dct.c
- common/deblock.c
- common/pixel.c


Changes:

=====================================
Makefile
=====================================
@@ -160,7 +160,7 @@ endif
 OBJCHK += tools/checkasm-arm.o
 endif
 
-# AArch64 NEON optims
+# AArch64 NEON and SVE/SVE2 optims
 ifeq ($(SYS_ARCH),AARCH64)
 SRCASM_X  = common/aarch64/bitstream-a.S \
             common/aarch64/cabac-a.S \
@@ -170,6 +170,15 @@ SRCASM_X  = common/aarch64/bitstream-a.S \
             common/aarch64/pixel-a.S \
             common/aarch64/predict-a.S \
             common/aarch64/quant-a.S
+ifneq ($(findstring HAVE_SVE 1, $(CONFIG)),)
+SRCASM_X += common/aarch64/dct-a-sve.S \
+            common/aarch64/deblock-a-sve.S \
+            common/aarch64/mc-a-sve.S \
+            common/aarch64/pixel-a-sve.S
+endif
+ifneq ($(findstring HAVE_SVE2 1, $(CONFIG)),)
+SRCASM_X += common/aarch64/dct-a-sve2.S
+endif
 SRCS_X   += common/aarch64/asm-offsets.c \
             common/aarch64/mc-c.c \
             common/aarch64/predict-c.c


=====================================
common/aarch64/dct-a-common.S
=====================================
@@ -0,0 +1,40 @@
+/****************************************************************************
+ * dct-a-common.S: aarch64 transform and zigzag
+ *****************************************************************************
+ * Copyright (C) 2009-2023 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *          Janne Grunau <janne-x264 at jannau.net>
+ *          David Chen   <david.chen at myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+// This file contains the NEON macros that are intended to be used by
+// the SVE/SVE2 functions as well
+
+.macro DCT_1D v0 v1 v2 v3 v4 v5 v6 v7
+    SUMSUB_AB   \v1, \v6, \v5, \v6
+    SUMSUB_AB   \v3, \v7, \v4, \v7
+    add         \v0, \v3, \v1
+    add         \v4, \v7, \v7
+    add         \v5, \v6, \v6
+    sub         \v2, \v3, \v1
+    add         \v1, \v4, \v6
+    sub         \v3, \v7, \v5
+.endm


=====================================
common/aarch64/dct-a-sve.S
=====================================
@@ -0,0 +1,88 @@
+/****************************************************************************
+ * dct-a-sve.S: aarch64 transform and zigzag
+ *****************************************************************************
+ * Copyright (C) 2009-2023 x264 project
+ *
+ * Authors: David Chen <david.chen at myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+#include "dct-a-common.S"
+
+.arch armv8-a+sve
+
+function sub4x4_dct_sve, export=1
+    mov         x3, #FENC_STRIDE
+    mov         x4, #FDEC_STRIDE
+    ptrue       p0.h, vl4
+    ld1b        {z0.h}, p0/z, [x1]
+    add         x1, x1, x3
+    ld1b        {z1.h}, p0/z, [x2]
+    add         x2, x2, x4
+    ld1b        {z2.h}, p0/z, [x1]
+    add         x1, x1, x3
+    sub         v16.4h, v0.4h, v1.4h
+    ld1b        {z3.h}, p0/z, [x2]
+    add         x2, x2, x4
+    ld1b        {z4.h}, p0/z, [x1]
+    add         x1, x1, x3
+    sub         v17.4h, v2.4h, v3.4h
+    ld1b        {z5.h}, p0/z, [x2]
+    add         x2, x2, x4
+    ld1b        {z6.h}, p0/z, [x1]
+    sub         v18.4h, v4.4h, v5.4h
+    ld1b        {z7.h}, p0/z, [x2]
+    sub         v19.4h, v6.4h, v7.4h
+
+    DCT_1D      v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h
+    transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7
+    DCT_1D      v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
+    st1         {v4.4h,v5.4h,v6.4h,v7.4h}, [x0]
+    ret
+endfunc
+
+function zigzag_interleave_8x8_cavlc_sve, export=1
+    mov         z31.s, #1
+    ptrue       p2.s, vl2
+    ld4         {v0.8h,v1.8h,v2.8h,v3.8h}, [x1],  #64
+    ld4         {v4.8h,v5.8h,v6.8h,v7.8h}, [x1],  #64
+    umax        v16.8h, v0.8h,  v4.8h
+    umax        v17.8h, v1.8h,  v5.8h
+    umax        v18.8h, v2.8h,  v6.8h
+    umax        v19.8h, v3.8h,  v7.8h
+    st1         {v0.8h}, [x0],  #16
+    st1         {v4.8h}, [x0],  #16
+    umaxp       v16.8h, v16.8h, v17.8h
+    umaxp       v18.8h, v18.8h, v19.8h
+    st1         {v1.8h}, [x0],  #16
+    st1         {v5.8h}, [x0],  #16
+    umaxp       v16.8h, v16.8h, v18.8h
+    st1         {v2.8h}, [x0],  #16
+    st1         {v6.8h}, [x0],  #16
+    cmhs        v16.4s, v16.4s, v31.4s
+    st1         {v3.8h}, [x0],  #16
+    and         v16.16b, v16.16b, v31.16b
+    st1         {v7.8h}, [x0],  #16
+    st1b        {z16.s}, p2, [x2]
+    add         x2, x2, #8
+    mov         v16.d[0], v16.d[1]
+    st1b        {z16.s}, p2, [x2]
+    ret
+endfunc


=====================================
common/aarch64/dct-a-sve2.S
=====================================
@@ -0,0 +1,89 @@
+/****************************************************************************
+ * dct-a-sve2.S: aarch64 transform and zigzag
+ *****************************************************************************
+ * Copyright (C) 2009-2023 x264 project
+ *
+ * Authors: David Chen <david.chen at myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+#include "dct-a-common.S"
+
+.arch armv8-a+sve+sve2
+
+function add4x4_idct_sve2, export=1
+    mov         x2, #FDEC_STRIDE
+    mov         x11, x0
+    ptrue       p0.h, vl8
+    ptrue       p1.h, vl4
+    ld1         {v0.8h, v1.8h}, [x1]
+
+    SUMSUB_AB   v4.8h, v5.8h, v0.8h, v1.8h
+
+    sshr        v7.8h, v0.8h, #1
+    sshr        v6.8h, v1.8h, #1
+    sub         v7.8h, v7.8h, v1.8h
+    add         v6.8h, v6.8h, v0.8h
+    mov         v7.d[0], v7.d[1]
+    mov         v6.d[0], v6.d[1]
+    ld1b        {z28.h}, p0/z, [x11]
+    add         x11, x11, x2
+    SUMSUB_AB   v0.8h, v2.8h, v4.8h, v6.8h
+    SUMSUB_AB   v1.8h, v3.8h, v5.8h, v7.8h
+
+    transpose4x4.h v0, v1, v3, v2, v16, v17, v18, v19
+
+    SUMSUB_AB   v4.4h, v5.4h, v0.4h, v3.4h
+
+    sshr        v7.4h, v1.4h, #1
+    sshr        v6.4h, v2.4h, #1
+    sub         v7.4h, v7.4h, v2.4h
+    add         v6.4h, v6.4h, v1.4h
+    ld1b        {z29.h}, p0/z, [x11]
+    add         x11, x11, x2
+    SUMSUB_AB   v0.4h, v2.4h, v4.4h, v6.4h
+    SUMSUB_AB   v1.4h, v3.4h, v5.4h, v7.4h
+
+    srshr       z0.h, p1/m, z0.h, #6
+    srshr       z1.h, p1/m, z1.h, #6
+    ld1b        {z31.h}, p0/z, [x11]
+    add         x11, x11, x2
+    srshr       z2.h, p1/m, z2.h, #6
+    srshr       z3.h, p1/m, z3.h, #6
+    ld1b        {z30.h}, p0/z, [x11]
+
+    add         v0.8h, v0.8h, v28.8h
+    add         v1.8h, v1.8h, v29.8h
+    add         v2.8h, v2.8h, v30.8h
+    add         v3.8h, v3.8h, v31.8h
+    sqxtunb     z0.b, z0.h
+    sqxtunb     z1.b, z1.h
+    sqxtunb     z2.b, z2.h
+    sqxtunb     z3.b, z3.h
+
+    st1b        {z0.h}, p1, [x0]
+    add         x0, x0, x2
+    st1b        {z1.h}, p1, [x0]
+    add         x0, x0, x2
+    st1b        {z3.h}, p1, [x0]
+    add         x0, x0, x2
+    st1b        {z2.h}, p1, [x0]
+    ret
+endfunc


=====================================
common/aarch64/dct-a.S
=====================================
@@ -25,6 +25,7 @@
  *****************************************************************************/
 
 #include "asm.S"
+#include "dct-a-common.S"
 
 const scan4x4_frame, align=4
 .byte    0,1,   8,9,   2,3,   4,5
@@ -120,17 +121,6 @@ function idct4x4dc_neon, export=1
     ret
 endfunc
 
-.macro DCT_1D v0 v1 v2 v3 v4 v5 v6 v7
-    SUMSUB_AB   \v1, \v6, \v5, \v6
-    SUMSUB_AB   \v3, \v7, \v4, \v7
-    add         \v0, \v3, \v1
-    add         \v4, \v7, \v7
-    add         \v5, \v6, \v6
-    sub         \v2, \v3, \v1
-    add         \v1, \v4, \v6
-    sub         \v3, \v7, \v5
-.endm
-
 function sub4x4_dct_neon, export=1
     mov         x3, #FENC_STRIDE
     mov         x4, #FDEC_STRIDE


=====================================
common/aarch64/dct.h
=====================================
@@ -91,4 +91,13 @@ int x264_zigzag_sub_8x8_frame_neon( dctcoef level[16], const pixel *p_src, pixel
 #define x264_zigzag_interleave_8x8_cavlc_neon x264_template(zigzag_interleave_8x8_cavlc_neon)
 void x264_zigzag_interleave_8x8_cavlc_neon( dctcoef *dst, dctcoef *src, uint8_t *nnz );
 
+#define x264_sub4x4_dct_sve x264_template(sub4x4_dct_sve)
+void x264_sub4x4_dct_sve( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
+
+#define x264_add4x4_idct_sve2 x264_template(add4x4_idct_sve2)
+void x264_add4x4_idct_sve2( uint8_t *p_dst, int16_t dct[16] );
+
+#define x264_zigzag_interleave_8x8_cavlc_sve x264_template(zigzag_interleave_8x8_cavlc_sve)
+void x264_zigzag_interleave_8x8_cavlc_sve( dctcoef *dst, dctcoef *src, uint8_t *nnz );
+
 #endif


=====================================
common/aarch64/deblock-a-common.S
=====================================
@@ -0,0 +1,43 @@
+/*****************************************************************************
+ * deblock-a-common.S: aarch64 deblocking
+ *****************************************************************************
+ * Copyright (C) 2009-2023 x264 project
+ *
+ * Authors: Mans Rullgard <mans at mansr.com>
+ *          Janne Grunau <janne-x264 at jannau.net>
+ *          David Chen <david.chen at myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+// This file contains the NEON macros that are intended to be used by
+// the SVE/SVE2 functions as well
+
+.macro h264_loop_filter_start
+    cmp             w2,  #0
+    ldr             w6,  [x4]
+    ccmp            w3,  #0, #0, ne
+    mov             v24.s[0], w6
+    and             w8,  w6,  w6,  lsl #16
+    b.eq            1f
+    ands            w8,  w8,  w8,  lsl #8
+    b.ge            2f
+1:
+    ret
+2:
+.endm


=====================================
common/aarch64/deblock-a-sve.S
=====================================
@@ -0,0 +1,98 @@
+/*****************************************************************************
+ * deblock-a-sve.S: aarch64 deblocking
+ *****************************************************************************
+ * Copyright (C) 2009-2023 x264 project
+ *
+ * Authors: David Chen <david.chen at myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+#include "deblock-a-common.S"
+
+.arch armv8-a+sve
+
+.macro h264_loop_filter_chroma_sve
+    ptrue           p0.b, vl16
+
+    dup             v22.16b, w2              // alpha
+    uxtl            v24.8h,  v24.8b
+    uabd            v26.16b, v16.16b, v0.16b   // abs(p0 - q0)
+    uxtl            v4.8h,   v0.8b
+    uxtl2           v5.8h,   v0.16b
+    uabd            v28.16b, v18.16b, v16.16b  // abs(p1 - p0)
+    usubw           v4.8h,   v4.8h,   v16.8b
+    usubw2          v5.8h,   v5.8h,   v16.16b
+    sli             v24.8h,  v24.8h,  #8
+    shl             v4.8h,   v4.8h,   #2
+    shl             v5.8h,   v5.8h,   #2
+    uabd            v30.16b, v2.16b,  v0.16b   // abs(q1 - q0)
+    uxtl            v24.4s,  v24.4h
+    uaddw           v4.8h,   v4.8h,   v18.8b
+    uaddw2          v5.8h,   v5.8h,   v18.16b
+
+    cmphi           p1.b, p0/z, z22.b, z26.b
+    usubw           v4.8h,   v4.8h,   v2.8b
+    usubw2          v5.8h,   v5.8h,   v2.16b
+    sli             v24.4s,  v24.4s,  #16
+    dup             v22.16b, w3              // beta
+    rshrn           v4.8b,   v4.8h,   #3
+    rshrn2          v4.16b,  v5.8h,   #3
+    cmphi           p2.b, p0/z, z22.b, z28.b
+    cmphi           p3.b, p0/z, z22.b, z30.b
+    smin            v4.16b,  v4.16b,  v24.16b
+    neg             v25.16b, v24.16b
+    and             p1.b, p0/z, p1.b, p2.b
+    smax            v4.16b,  v4.16b,  v25.16b
+    and             p1.b, p0/z, p1.b, p3.b
+    uxtl            v22.8h,  v0.8b
+    uxtl2           v23.8h,  v0.16b
+
+    uxtl            v28.8h,  v16.8b
+    uxtl2           v29.8h,  v16.16b
+    saddw           v28.8h,  v28.8h,  v4.8b
+    saddw2          v29.8h,  v29.8h,  v4.16b
+    ssubw           v22.8h,  v22.8h,  v4.8b
+    ssubw2          v23.8h,  v23.8h,  v4.16b
+    sqxtun          v16.8b,  v28.8h
+    sqxtun          v0.8b,   v22.8h
+    sqxtun2         v16.16b, v29.8h
+    sqxtun2         v0.16b,  v23.8h
+.endm
+
+function deblock_v_chroma_sve, export=1
+    h264_loop_filter_start
+
+    sub             x0,  x0,  x1, lsl #1
+    // No performance improvement if sve load is used. So, continue using
+    // NEON load here
+    ld1             {v18.16b}, [x0], x1
+    ld1             {v16.16b}, [x0], x1
+    ld1             {v0.16b},  [x0], x1
+    ld1             {v2.16b},  [x0]
+
+    h264_loop_filter_chroma_sve
+
+    sub             x0,  x0,  x1, lsl #1
+    st1b            {z16.b}, p1, [x0]
+    add             x0, x0, x1
+    st1b            {z0.b}, p1, [x0]
+
+    ret
+endfunc


=====================================
common/aarch64/deblock-a.S
=====================================
@@ -25,20 +25,7 @@
  *****************************************************************************/
 
 #include "asm.S"
-
-.macro h264_loop_filter_start
-    cmp             w2,  #0
-    ldr             w6,  [x4]
-    ccmp            w3,  #0, #0, ne
-    mov             v24.s[0], w6
-    and             w8,  w6,  w6,  lsl #16
-    b.eq            1f
-    ands            w8,  w8,  w8,  lsl #8
-    b.ge            2f
-1:
-    ret
-2:
-.endm
+#include "deblock-a-common.S"
 
 .macro h264_loop_filter_luma
     dup             v22.16b, w2                     // alpha


=====================================
common/aarch64/deblock.h
=====================================
@@ -55,4 +55,7 @@ void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, i
 #define x264_deblock_v_luma_intra_neon x264_template(deblock_v_luma_intra_neon)
 void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
 
+#define x264_deblock_v_chroma_sve x264_template(deblock_v_chroma_sve)
+void x264_deblock_v_chroma_sve( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+
 #endif


=====================================
common/aarch64/mc-a-common.S
=====================================
@@ -0,0 +1,66 @@
+/****************************************************************************
+ * mc-a-common.S: aarch64 motion compensation
+ *****************************************************************************
+ * Copyright (C) 2009-2023 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *          Janne Grunau <janne-x264 at jannau.net>
+ *          Mans Rullgard <mans at mansr.com>
+ *          Stefan Groenroos <stefan.gronroos at gmail.com>
+ *          David Chen   <david.chen at myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+// This file contains the NEON macros and functions that are intended to be used by
+// the SVE/SVE2 functions as well
+
+#if BIT_DEPTH == 8
+
+// 0 < weight < 64
+.macro load_weights_add_add
+    mov         w6,  w6
+.endm
+
+// weight > 64
+.macro load_weights_add_sub
+    neg         w7,  w7
+.endm
+
+// weight < 0
+.macro load_weights_sub_add
+    neg         w6,  w6
+.endm
+
+function pixel_avg_w4_neon
+1:  subs        w9,  w9,  #2
+    ld1         {v0.s}[0], [x2], x3
+    ld1         {v2.s}[0], [x4], x5
+    urhadd      v0.8b,  v0.8b,  v2.8b
+    ld1         {v1.s}[0], [x2], x3
+    ld1         {v3.s}[0], [x4], x5
+    urhadd      v1.8b,  v1.8b,  v3.8b
+    st1         {v0.s}[0], [x0], x1
+    st1         {v1.s}[0], [x0], x1
+    b.gt        1b
+    ret
+endfunc
+
+#else // BIT_DEPTH == 10
+
+#endif


=====================================
common/aarch64/mc-a-sve.S
=====================================
@@ -0,0 +1,108 @@
+/*****************************************************************************
+ * mc-a-sve.S: aarch64 motion compensation
+ *****************************************************************************
+ * Copyright (C) 2009-2023 x264 project
+ *
+ * Authors: David Chen <david.chen at myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+#include "mc-a-common.S"
+
+.arch armv8-a+sve
+
+#if BIT_DEPTH == 8
+
+// void pixel_avg( uint8_t *dst,  intptr_t dst_stride,
+//                 uint8_t *src1, intptr_t src1_stride,
+//                 uint8_t *src2, intptr_t src2_stride, int weight );
+.macro AVGH_SVE w h
+function pixel_avg_\w\()x\h\()_sve, export=1
+    mov         w10, #64
+    cmp         w6,  #32
+    mov         w9, #\h
+    b.eq        pixel_avg_w\w\()_neon
+    subs        w7,  w10,  w6
+    b.lt        pixel_avg_weight_w\w\()_add_sub_sve     // weight > 64
+    cmp         w6,  #0
+    b.ge        pixel_avg_weight_w\w\()_add_add_sve
+    b           pixel_avg_weight_w\w\()_sub_add_sve     // weight < 0
+endfunc
+.endm
+
+AVGH_SVE  4, 2
+AVGH_SVE  4, 4
+AVGH_SVE  4, 8
+AVGH_SVE  4, 16
+
+// 0 < weight < 64
+.macro weight_add_add_sve dst, s1, s2, h=
+    mul         \dst, \s1, v30.8h
+    mla         \dst, \s2, v31.8h
+.endm
+
+// weight > 64
+.macro weight_add_sub_sve dst, s1, s2, h=
+    mul         \dst, \s1, v30.8h
+    mls         \dst, \s2, v31.8h
+.endm
+
+// weight < 0
+.macro weight_sub_add_sve dst, s1, s2, h=
+    mul         \dst, \s2, v31.8h
+    mls         \dst, \s1, v30.8h
+.endm
+
+.macro AVG_WEIGHT_SVE ext
+function pixel_avg_weight_w4_\ext\()_sve
+    load_weights_\ext
+    ptrue       p0.b, vl8
+    dup         v30.8h, w6
+    dup         v31.8h, w7
+1:  // height loop
+    subs        w9,  w9,  #2
+    ld1b        {z0.h}, p0/z, [x2]
+    add         x2, x2, x3
+    ld1b        {z1.h}, p0/z, [x4]
+    add         x4, x4, x5
+    weight_\ext\()_sve v4.8h,  v0.8h,  v1.8h
+    ld1b        {z2.h}, p0/z, [x2]
+    add         x2, x2, x3
+    ld1b        {z3.h}, p0/z, [x4]
+    add         x4, x4, x5
+
+    sqrshrun    v0.8b,  v4.8h,  #6
+    weight_\ext\()_sve v5.8h,  v2.8h,  v3.8h
+    st1         {v0.s}[0], [x0], x1
+    sqrshrun    v1.8b,  v5.8h,  #6
+    st1         {v1.s}[0], [x0], x1
+    b.gt        1b
+    ret
+endfunc
+.endm
+
+AVG_WEIGHT_SVE add_add
+AVG_WEIGHT_SVE add_sub
+AVG_WEIGHT_SVE sub_add
+
+#else // BIT_DEPTH == 10
+
+
+#endif


=====================================
common/aarch64/mc-a.S
=====================================
@@ -27,6 +27,7 @@
  *****************************************************************************/
 
 #include "asm.S"
+#include "mc-a-common.S"
 
 // note: prefetch stuff assumes 64-byte cacheline
 
@@ -327,9 +328,6 @@ AVGH 16, 8
 AVGH 16, 16
 
 // 0 < weight < 64
-.macro load_weights_add_add
-    mov         w6,  w6
-.endm
 .macro weight_add_add dst, s1, s2, h=
 .ifc \h, 2
     umull2      \dst, \s1, v30.16b
@@ -341,9 +339,6 @@ AVGH 16, 16
 .endm
 
 // weight > 64
-.macro load_weights_add_sub
-    neg         w7,  w7
-.endm
 .macro weight_add_sub dst, s1, s2, h=
 .ifc \h, 2
     umull2      \dst, \s1, v30.16b
@@ -355,9 +350,6 @@ AVGH 16, 16
 .endm
 
 // weight < 0
-.macro load_weights_sub_add
-    neg         w6,  w6
-.endm
 .macro weight_sub_add dst, s1, s2, h=
 .ifc \h, 2
     umull2      \dst, \s2, v31.16b
@@ -448,20 +440,6 @@ AVG_WEIGHT add_add
 AVG_WEIGHT add_sub
 AVG_WEIGHT sub_add
 
-function pixel_avg_w4_neon
-1:  subs        w9,  w9,  #2
-    ld1         {v0.s}[0], [x2], x3
-    ld1         {v2.s}[0], [x4], x5
-    urhadd      v0.8b,  v0.8b,  v2.8b
-    ld1         {v1.s}[0], [x2], x3
-    ld1         {v3.s}[0], [x4], x5
-    urhadd      v1.8b,  v1.8b,  v3.8b
-    st1         {v0.s}[0], [x0], x1
-    st1         {v1.s}[0], [x0], x1
-    b.gt        1b
-    ret
-endfunc
-
 function pixel_avg_w8_neon
 1:  subs        w9,  w9,  #4
     ld1         {v0.8b}, [x2], x3


=====================================
common/aarch64/mc-c.c
=====================================
@@ -58,6 +58,15 @@ void x264_pixel_avg_4x4_neon  ( pixel *, intptr_t, pixel *, intptr_t, pixel *, i
 #define x264_pixel_avg_4x2_neon x264_template(pixel_avg_4x2_neon)
 void x264_pixel_avg_4x2_neon  ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
 
+#define x264_pixel_avg_4x16_sve x264_template(pixel_avg_4x16_sve)
+void x264_pixel_avg_4x16_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
+#define x264_pixel_avg_4x8_sve x264_template(pixel_avg_4x8_sve)
+void x264_pixel_avg_4x8_sve  ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
+#define x264_pixel_avg_4x4_sve x264_template(pixel_avg_4x4_sve)
+void x264_pixel_avg_4x4_sve  ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
+#define x264_pixel_avg_4x2_sve x264_template(pixel_avg_4x2_sve)
+void x264_pixel_avg_4x2_sve  ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
+
 #define x264_pixel_avg2_w4_neon x264_template(pixel_avg2_w4_neon)
 void x264_pixel_avg2_w4_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, int );
 #define x264_pixel_avg2_w8_neon x264_template(pixel_avg2_w8_neon)
@@ -278,64 +287,70 @@ void x264_mc_init_aarch64( uint32_t cpu, x264_mc_functions_t *pf )
         pf->prefetch_ref      = x264_prefetch_ref_aarch64;
     }
 
-    if( !(cpu&X264_CPU_NEON) )
-        return;
-
-    pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
-    pf->mbtree_propagate_list = mbtree_propagate_list_neon;
-    pf->mbtree_fix8_pack      = x264_mbtree_fix8_pack_neon;
-    pf->mbtree_fix8_unpack    = x264_mbtree_fix8_unpack_neon;
-
-    pf->memcpy_aligned  = x264_memcpy_aligned_neon;
-    pf->memzero_aligned = x264_memzero_aligned_neon;
-
-    pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
-    pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_neon;
-    pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_neon;
-    pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_neon;
-    pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_neon;
-    pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_neon;
-    pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_neon;
-    pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_neon;
-    pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_neon;
-
-    pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;
-    pf->copy[PIXEL_16x16]    = x264_mc_copy_w16_neon;
-    pf->copy[PIXEL_8x8]      = x264_mc_copy_w8_neon;
-    pf->copy[PIXEL_4x4]      = x264_mc_copy_w4_neon;
-
-    pf->weight       = mc_wtab_neon;
-    pf->offsetadd    = mc_offsetadd_wtab_neon;
-    pf->offsetsub    = mc_offsetsub_wtab_neon;
-    pf->weight_cache = weight_cache_neon;
-
-    pf->mc_chroma = x264_mc_chroma_neon;
-    pf->mc_luma = mc_luma_neon;
-    pf->get_ref = get_ref_neon;
-
-    pf->integral_init4h = x264_integral_init4h_neon;
-    pf->integral_init8h = x264_integral_init8h_neon;
-    pf->integral_init4v = x264_integral_init4v_neon;
-    pf->integral_init8v = x264_integral_init8v_neon;
-
-    pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
-
-    pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
-    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;
-
-    pf->store_interleave_chroma       = x264_store_interleave_chroma_neon;
-
-    pf->plane_copy                  = plane_copy_neon;
-    pf->plane_copy_swap             = plane_copy_swap_neon;
-    pf->plane_copy_deinterleave     = x264_plane_copy_deinterleave_neon;
-    pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
-    pf->plane_copy_interleave       = plane_copy_interleave_neon;
-
-    pf->hpel_filter = x264_hpel_filter_neon;
+    if( cpu&X264_CPU_NEON )
+    {
+        pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
+        pf->mbtree_propagate_list = mbtree_propagate_list_neon;
+        pf->mbtree_fix8_pack      = x264_mbtree_fix8_pack_neon;
+        pf->mbtree_fix8_unpack    = x264_mbtree_fix8_unpack_neon;
+
+        pf->memcpy_aligned  = x264_memcpy_aligned_neon;
+        pf->memzero_aligned = x264_memzero_aligned_neon;
+
+        pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
+        pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_neon;
+        pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_neon;
+        pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_neon;
+        pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_neon;
+        pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_neon;
+        pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_neon;
+        pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_neon;
+        pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_neon;
+
+        pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;
+        pf->copy[PIXEL_16x16]    = x264_mc_copy_w16_neon;
+        pf->copy[PIXEL_8x8]      = x264_mc_copy_w8_neon;
+        pf->copy[PIXEL_4x4]      = x264_mc_copy_w4_neon;
+
+        pf->weight       = mc_wtab_neon;
+        pf->offsetadd    = mc_offsetadd_wtab_neon;
+        pf->offsetsub    = mc_offsetsub_wtab_neon;
+        pf->weight_cache = weight_cache_neon;
+
+        pf->mc_chroma = x264_mc_chroma_neon;
+        pf->mc_luma = mc_luma_neon;
+        pf->get_ref = get_ref_neon;
+
+        pf->integral_init4h = x264_integral_init4h_neon;
+        pf->integral_init8h = x264_integral_init8h_neon;
+        pf->integral_init4v = x264_integral_init4v_neon;
+        pf->integral_init8v = x264_integral_init8v_neon;
+
+        pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
+
+        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
+        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;
+
+        pf->store_interleave_chroma       = x264_store_interleave_chroma_neon;
+
+        pf->plane_copy                  = plane_copy_neon;
+        pf->plane_copy_swap             = plane_copy_swap_neon;
+        pf->plane_copy_deinterleave     = x264_plane_copy_deinterleave_neon;
+        pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
+        pf->plane_copy_interleave       = plane_copy_interleave_neon;
+
+        pf->hpel_filter = x264_hpel_filter_neon;
+    }
 
 #if !HIGH_BIT_DEPTH
-
-
-
+#if HAVE_SVE
+    if( cpu&X264_CPU_SVE )
+    {
+        pf->avg[PIXEL_4x16]  = x264_pixel_avg_4x16_sve;
+        pf->avg[PIXEL_4x8]   = x264_pixel_avg_4x8_sve;
+        pf->avg[PIXEL_4x4]   = x264_pixel_avg_4x4_sve;
+        pf->avg[PIXEL_4x2]   = x264_pixel_avg_4x2_sve;
+    }
+#endif
 #endif // !HIGH_BIT_DEPTH
 }


=====================================
common/aarch64/pixel-a-common.S
=====================================
@@ -0,0 +1,44 @@
+/****************************************************************************
+ * pixel-a-common.S: aarch64 pixel metrics
+ *****************************************************************************
+ * Copyright (C) 2009-2023 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *          Janne Grunau <janne-x264 at jannau.net>
+ *          David Chen   <david.chen at myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+// This file contains the NEON macros and constants that are intended to be used by
+// the SVE/SVE2 functions as well
+
+const mask_ac_4_8
+.short 0, -1, -1, -1,  0, -1, -1, -1
+.short 0, -1, -1, -1, -1, -1, -1, -1
+endconst
+
+.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
+    SUMSUB_AB   \s1, \d1, \a, \b
+    SUMSUB_AB   \s2, \d2, \c, \d
+.endm
+
+.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
+    SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
+    SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
+.endm


=====================================
common/aarch64/pixel-a-sve.S
=====================================
@@ -0,0 +1,523 @@
+/*****************************************************************************
+ * pixel-a-sve.S: aarch64 pixel metrics
+ *****************************************************************************
+ * Copyright (C) 2009-2023 x264 project
+ *
+ * Authors: David Chen <david.chen at myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+#include "pixel-a-common.S"
+
+.arch armv8-a+sve
+
+#if BIT_DEPTH == 8
+
+.macro SSD_START_SVE_4
+    ptrue       p0.h, vl4
+    ld1b        {z16.h}, p0/z, [x0]
+    ld1b        {z17.h}, p0/z, [x2]
+    add         x0, x0, x1
+    add         x2, x2, x3
+    sub         v2.4h, v16.4h, v17.4h
+    ld1b        {z16.h}, p0/z, [x0]
+    ld1b        {z17.h}, p0/z, [x2]
+    add         x0, x0, x1
+    add         x2, x2, x3
+    smull       v0.4s,  v2.4h,   v2.4h
+.endm
+
+.macro SSD_SVE_4
+    sub         v2.4h, v16.4h, v17.4h
+    ld1b        {z16.h}, p0/z, [x0]
+    ld1b        {z17.h}, p0/z, [x2]
+    add         x0, x0, x1
+    add         x2, x2, x3
+    smlal       v0.4s,  v2.4h,   v2.4h
+.endm
+
+.macro SSD_END_SVE_4
+    sub         v2.4h, v16.4h, v17.4h
+    smlal       v0.4s,  v2.4h,   v2.4h
+.endm
+
+.macro SSD_START_SVE_8
+    ptrue       p0.h, vl8
+    ld1b        {z16.h}, p0/z, [x0]
+    ld1b        {z17.h}, p0/z, [x2]
+    add         x0, x0, x1
+    add         x2, x2, x3
+    sub         v2.8h, v16.8h, v17.8h
+    ld1b        {z16.h}, p0/z, [x0]
+    smull       v0.4s,  v2.4h,   v2.4h
+    ld1b        {z17.h}, p0/z, [x2]
+    smlal2      v0.4s,  v2.8h,   v2.8h
+    add         x0, x0, x1
+    add         x2, x2, x3
+.endm
+
+.macro SSD_SVE_8
+    sub         v2.8h, v16.8h, v17.8h
+    ld1b        {z16.h}, p0/z, [x0]
+    smlal       v0.4s,  v2.4h,   v2.4h
+    ld1b        {z17.h}, p0/z, [x2]
+    smlal2      v0.4s,  v2.8h,   v2.8h
+    add         x0, x0, x1
+    add         x2, x2, x3
+.endm
+
+.macro SSD_END_SVE_8
+    sub         v2.8h,  v16.8h,  v17.8h
+    smlal       v0.4s,  v2.4h,   v2.4h
+    smlal2      v0.4s,  v2.8h,   v2.8h
+.endm
+
+.macro SSD_FUNC_SVE w h
+function pixel_ssd_\w\()x\h\()_sve, export=1
+    SSD_START_SVE_\w
+.rept \h-2
+    SSD_SVE_\w
+.endr
+    SSD_END_SVE_\w
+
+    addv        s0,  v0.4s
+    mov         w0,  v0.s[0]
+    ret
+endfunc
+.endm
+
+.macro load_diff_fly_sve_8x8
+    ld1b        {z1.h}, p0/z, [x2]
+    ld1b        {z0.h}, p0/z, [x0]
+    add         x2, x2, x3
+    add         x0, x0, x1
+    ld1b        {z3.h}, p0/z, [x2]
+    ld1b        {z2.h}, p0/z, [x0]
+    add         x2, x2, x3
+    add         x0, x0, x1
+    sub         v16.8h, v0.8h,  v1.8h
+    sub         v17.8h, v2.8h,  v3.8h
+    ld1b        {z5.h}, p0/z, [x2]
+    ld1b        {z4.h}, p0/z, [x0]
+    add         x2, x2, x3
+    add         x0, x0, x1
+    ld1b        {z7.h}, p0/z, [x2]
+    ld1b        {z6.h}, p0/z, [x0]
+    add         x2, x2, x3
+    add         x0, x0, x1
+    sub         v18.8h, v4.8h,  v5.8h
+    sub         v19.8h, v6.8h,  v7.8h
+    ld1b        {z1.h}, p0/z, [x2]
+    ld1b        {z0.h}, p0/z, [x0]
+    add         x2, x2, x3
+    add         x0, x0, x1
+    ld1b        {z3.h}, p0/z, [x2]
+    ld1b        {z2.h}, p0/z, [x0]
+    add         x2, x2, x3
+    add         x0, x0, x1
+    sub         v20.8h, v0.8h,  v1.8h
+    sub         v21.8h, v2.8h,  v3.8h
+    ld1b        {z5.h}, p0/z, [x2]
+    ld1b        {z4.h}, p0/z, [x0]
+    add         x2, x2, x3
+    add         x0, x0, x1
+    ld1b        {z7.h}, p0/z, [x2]
+    ld1b        {z6.h}, p0/z, [x0]
+    add         x2, x2, x3
+    add         x0, x0, x1
+
+    SUMSUB_AB   v0.8h,  v1.8h,  v16.8h, v17.8h
+    SUMSUB_AB   v2.8h,  v3.8h,  v18.8h, v19.8h
+
+    sub         v22.8h, v4.8h,  v5.8h
+    sub         v23.8h, v6.8h,  v7.8h
+.endm
+
+.macro pixel_var_sve_8 h
+function pixel_var_8x\h\()_sve, export=1
+    ptrue           p0.h, vl8
+    ld1b            {z16.h}, p0/z, [x0]
+    add             x0, x0, x1
+    ld1b            {z17.h}, p0/z, [x0]
+    add             x0, x0, x1
+    mov             x2,  \h - 4
+    mul             v1.8h,  v16.8h, v16.8h
+    mul             v2.8h,  v17.8h, v17.8h
+    add             v0.8h,  v16.8h,  v17.8h
+    ld1b            {z18.h}, p0/z, [x0]
+    add             x0, x0, x1
+    uaddlp          v1.4s,  v1.8h
+    uaddlp          v2.4s,  v2.8h
+    ld1b            {z19.h}, p0/z, [x0]
+    add             x0, x0, x1
+
+1:  subs            x2,  x2,  #4
+    add             v0.8h,  v0.8h,  v18.8h
+    mul             v24.8h, v18.8h, v18.8h
+    ld1b            {z20.h}, p0/z, [x0]
+    add             x0, x0, x1
+    add             v0.8h,  v0.8h,  v19.8h
+    mul             v25.8h, v19.8h, v19.8h
+    uadalp          v1.4s,  v24.8h
+    ld1b            {z21.h}, p0/z, [x0]
+    add             x0, x0, x1
+    add             v0.8h,  v0.8h,  v20.8h
+    mul             v26.8h, v20.8h, v20.8h
+    uadalp          v2.4s,  v25.8h
+    ld1b            {z18.h}, p0/z, [x0]
+    add             x0, x0, x1
+    add             v0.8h,  v0.8h,  v21.8h
+    mul             v27.8h, v21.8h, v21.8h
+    uadalp          v1.4s,  v26.8h
+    ld1b            {z19.h}, p0/z, [x0]
+    add             x0, x0, x1
+    uadalp          v2.4s,  v27.8h
+    b.gt            1b
+
+    add             v0.8h,  v0.8h,  v18.8h
+    mul             v28.8h, v18.8h, v18.8h
+    add             v0.8h,  v0.8h,  v19.8h
+    mul             v29.8h, v19.8h, v19.8h
+    uadalp          v1.4s,  v28.8h
+    uadalp          v2.4s,  v29.8h
+
+    b               var_end
+endfunc
+.endm
+
+function var_end
+    add             v1.4s,  v1.4s,  v2.4s
+    uaddlv          s0,  v0.8h
+    uaddlv          d1,  v1.4s
+    mov             w0,  v0.s[0]
+    mov             x1,  v1.d[0]
+    orr             x0,  x0,  x1,  lsl #32
+    ret
+endfunc
+
+.macro SUMSUBL_AB_SVE  sum, sub, a, b
+    add         \sum,  \a,  \b
+    sub         \sub,  \a,  \b
+.endm
+
+function pixel_sa8d_8x8_sve, export=1
+    ptrue       p0.h, vl8
+    mov         x4,  x30
+    bl          pixel_sa8d_8x8_sve
+    add         v0.8h,  v0.8h,  v1.8h
+    uaddlv      s0,  v0.8h
+    mov         w0,  v0.s[0]
+    add         w0,  w0,  #1
+    lsr         w0,  w0,  #1
+    ret         x4
+endfunc
+
+.macro sa8d_satd_sve_8x8 satd=
+function pixel_sa8d_\satd\()8x8_sve
+    load_diff_fly_sve_8x8
+
+    SUMSUB_AB   v16.8h, v18.8h, v0.8h,  v2.8h
+    SUMSUB_AB   v17.8h, v19.8h, v1.8h,  v3.8h
+
+    HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h,  v1.8h, v2.8h, v3.8h
+.ifc \satd, satd_
+    transpose   v0.8h,  v1.8h,  v16.8h, v17.8h
+    transpose   v2.8h,  v3.8h,  v18.8h, v19.8h
+    transpose   v4.8h,  v5.8h,  v20.8h, v21.8h
+    transpose   v6.8h,  v7.8h,  v22.8h, v23.8h
+
+    SUMSUB_AB   v24.8h, v25.8h, v0.8h,  v1.8h
+    SUMSUB_AB   v26.8h, v27.8h, v2.8h,  v3.8h
+    SUMSUB_AB   v0.8h,  v1.8h,  v4.8h,  v5.8h
+    SUMSUB_AB   v2.8h,  v3.8h,  v6.8h,  v7.8h
+
+    transpose   v4.4s,  v6.4s,  v24.4s, v26.4s
+    transpose   v5.4s,  v7.4s,  v25.4s, v27.4s
+    transpose   v24.4s, v26.4s, v0.4s,  v2.4s
+    transpose   v25.4s, v27.4s, v1.4s,  v3.4s
+
+    abs         v0.8h,  v4.8h
+    abs         v1.8h,  v5.8h
+    abs         v2.8h,  v6.8h
+    abs         v3.8h,  v7.8h
+    abs         v4.8h,  v24.8h
+    abs         v5.8h,  v25.8h
+    abs         v6.8h,  v26.8h
+    abs         v7.8h,  v27.8h
+
+    umax        v0.8h,  v0.8h,  v2.8h
+    umax        v1.8h,  v1.8h,  v3.8h
+    umax        v2.8h,  v4.8h,  v6.8h
+    umax        v3.8h,  v5.8h,  v7.8h
+
+    add         v26.8h, v0.8h,  v1.8h
+    add         v27.8h, v2.8h,  v3.8h
+.endif
+
+    SUMSUB_AB   v0.8h,  v16.8h, v16.8h, v20.8h
+    SUMSUB_AB   v1.8h,  v17.8h, v17.8h, v21.8h
+    SUMSUB_AB   v2.8h,  v18.8h, v18.8h, v22.8h
+    SUMSUB_AB   v3.8h,  v19.8h, v19.8h, v23.8h
+
+    transpose   v20.8h, v21.8h, v16.8h, v17.8h
+    transpose   v4.8h,  v5.8h,  v0.8h,  v1.8h
+    transpose   v22.8h, v23.8h, v18.8h, v19.8h
+    transpose   v6.8h,  v7.8h,  v2.8h,  v3.8h
+
+    SUMSUB_AB   v2.8h,  v3.8h,  v20.8h, v21.8h
+    SUMSUB_AB   v24.8h, v25.8h, v4.8h,  v5.8h
+    SUMSUB_AB   v0.8h,  v1.8h,  v22.8h, v23.8h
+    SUMSUB_AB   v4.8h,  v5.8h,  v6.8h,  v7.8h
+
+    transpose   v20.4s, v22.4s, v2.4s,  v0.4s
+    transpose   v21.4s, v23.4s, v3.4s,  v1.4s
+    transpose   v16.4s, v18.4s, v24.4s, v4.4s
+    transpose   v17.4s, v19.4s, v25.4s, v5.4s
+
+    SUMSUB_AB   v0.8h,  v2.8h,  v20.8h, v22.8h
+    SUMSUB_AB   v1.8h,  v3.8h,  v21.8h, v23.8h
+    SUMSUB_AB   v4.8h,  v6.8h,  v16.8h, v18.8h
+    SUMSUB_AB   v5.8h,  v7.8h,  v17.8h, v19.8h
+
+    transpose   v16.2d, v20.2d,  v0.2d,  v4.2d
+    transpose   v17.2d, v21.2d,  v1.2d,  v5.2d
+    transpose   v18.2d, v22.2d,  v2.2d,  v6.2d
+    transpose   v19.2d, v23.2d,  v3.2d,  v7.2d
+
+    abs         v16.8h, v16.8h
+    abs         v20.8h, v20.8h
+    abs         v17.8h, v17.8h
+    abs         v21.8h, v21.8h
+    abs         v18.8h, v18.8h
+    abs         v22.8h, v22.8h
+    abs         v19.8h, v19.8h
+    abs         v23.8h, v23.8h
+
+    umax        v16.8h, v16.8h, v20.8h
+    umax        v17.8h, v17.8h, v21.8h
+    umax        v18.8h, v18.8h, v22.8h
+    umax        v19.8h, v19.8h, v23.8h
+
+    add         v0.8h,  v16.8h, v17.8h
+    add         v1.8h,  v18.8h, v19.8h
+
+    ret
+endfunc
+.endm
+
+.macro HADAMARD_AC_SVE w h
+function pixel_hadamard_ac_\w\()x\h\()_sve, export=1
+    ptrue       p0.h, vl8
+    movrel      x5, mask_ac_4_8
+    mov         x4,  x30
+    ld1         {v30.8h,v31.8h}, [x5]
+    movi        v28.16b, #0
+    movi        v29.16b, #0
+
+    bl          hadamard_ac_8x8_sve
+.if \h > 8
+    bl          hadamard_ac_8x8_sve
+.endif
+.if \w > 8
+    sub         x0,  x0,  x1,  lsl #3
+    add         x0,  x0,  #8
+    bl          hadamard_ac_8x8_sve
+.endif
+.if \w * \h == 256
+    sub         x0,  x0,  x1,  lsl #4
+    bl          hadamard_ac_8x8_sve
+.endif
+
+    addv        s1,  v29.4s
+    addv        s0,  v28.4s
+    mov         w1,  v1.s[0]
+    mov         w0,  v0.s[0]
+    lsr         w1,  w1,  #2
+    lsr         w0,  w0,  #1
+    orr         x0,  x0,  x1, lsl #32
+    ret         x4
+endfunc
+.endm
+
+// v28: satd  v29: sa8d  v30: mask_ac4  v31: mask_ac8
+function hadamard_ac_8x8_sve
+    ld1b        {z16.h}, p0/z, [x0]
+    add         x0, x0, x1
+    ld1b        {z17.h}, p0/z, [x0]
+    add         x0, x0, x1
+    ld1b        {z18.h}, p0/z, [x0]
+    add         x0, x0, x1
+    ld1b        {z19.h}, p0/z, [x0]
+    add         x0, x0, x1
+    SUMSUBL_AB_SVE  v0.8h,  v1.8h, v16.8h, v17.8h
+    ld1b        {z20.h}, p0/z, [x0]
+    add         x0, x0, x1
+    ld1b        {z21.h}, p0/z, [x0]
+    add         x0, x0, x1
+    SUMSUBL_AB_SVE  v2.8h,  v3.8h, v18.8h, v19.8h
+    ld1b        {z22.h}, p0/z, [x0]
+    add         x0, x0, x1
+    ld1b        {z23.h}, p0/z, [x0]
+    add         x0, x0, x1
+    SUMSUBL_AB_SVE  v4.8h,  v5.8h, v20.8h, v21.8h
+    SUMSUBL_AB_SVE  v6.8h,  v7.8h, v22.8h, v23.8h
+
+    SUMSUB_ABCD v16.8h, v18.8h, v17.8h, v19.8h, v0.8h,  v2.8h,  v1.8h,  v3.8h
+    SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h,  v6.8h,  v5.8h,  v7.8h
+
+    transpose   v0.8h,  v1.8h,  v16.8h,  v17.8h
+    transpose   v2.8h,  v3.8h,  v18.8h,  v19.8h
+    transpose   v4.8h,  v5.8h,  v20.8h,  v21.8h
+    transpose   v6.8h,  v7.8h,  v22.8h,  v23.8h
+
+    SUMSUB_AB   v16.8h, v17.8h, v0.8h,  v1.8h
+    SUMSUB_AB   v18.8h, v19.8h, v2.8h,  v3.8h
+    SUMSUB_AB   v20.8h, v21.8h, v4.8h,  v5.8h
+    SUMSUB_AB   v22.8h, v23.8h, v6.8h,  v7.8h
+
+    transpose   v0.4s,  v2.4s,  v16.4s, v18.4s
+    transpose   v1.4s,  v3.4s,  v17.4s, v19.4s
+    transpose   v4.4s,  v6.4s,  v20.4s, v22.4s
+    transpose   v5.4s,  v7.4s,  v21.4s, v23.4s
+
+    SUMSUB_AB   v16.8h, v18.8h, v0.8h,  v2.8h
+    SUMSUB_AB   v17.8h, v19.8h, v1.8h,  v3.8h
+    SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h,  v6.8h,  v5.8h,  v7.8h
+
+    abs         v0.8h,  v16.8h
+    abs         v4.8h,  v20.8h
+    abs         v1.8h,  v17.8h
+    abs         v5.8h,  v21.8h
+    abs         v2.8h,  v18.8h
+    abs         v6.8h,  v22.8h
+    abs         v3.8h,  v19.8h
+    abs         v7.8h,  v23.8h
+
+    add         v0.8h,  v0.8h,  v4.8h
+    add         v1.8h,  v1.8h,  v5.8h
+    and         v0.16b, v0.16b, v30.16b
+    add         v2.8h,  v2.8h,  v6.8h
+    add         v3.8h,  v3.8h,  v7.8h
+    add         v0.8h,  v0.8h,  v2.8h
+    add         v1.8h,  v1.8h,  v3.8h
+    uadalp      v28.4s, v0.8h
+    uadalp      v28.4s, v1.8h
+
+    SUMSUB_AB   v6.8h,  v7.8h,  v23.8h, v19.8h
+    SUMSUB_AB   v4.8h,  v5.8h,  v22.8h, v18.8h
+    SUMSUB_AB   v2.8h,  v3.8h,  v21.8h, v17.8h
+    SUMSUB_AB   v1.8h,  v0.8h,  v16.8h,  v20.8h
+
+    transpose   v16.2d, v17.2d,  v6.2d,  v7.2d
+    transpose   v18.2d, v19.2d,  v4.2d,  v5.2d
+    transpose   v20.2d, v21.2d,  v2.2d,  v3.2d
+
+    abs         v16.8h,  v16.8h
+    abs         v17.8h,  v17.8h
+    abs         v18.8h,  v18.8h
+    abs         v19.8h,  v19.8h
+    abs         v20.8h,  v20.8h
+    abs         v21.8h,  v21.8h
+
+    transpose   v7.2d,  v6.2d,  v1.2d,  v0.2d
+
+    umax        v3.8h,  v16.8h,  v17.8h
+    umax        v2.8h,  v18.8h,  v19.8h
+    umax        v1.8h,  v20.8h,  v21.8h
+
+    SUMSUB_AB   v4.8h,  v5.8h,  v7.8h,  v6.8h
+
+    add         v2.8h,  v2.8h,  v3.8h
+    add         v2.8h,  v2.8h,  v1.8h
+    and         v4.16b, v4.16b, v31.16b
+    add         v2.8h,  v2.8h,  v2.8h
+    abs         v5.8h,  v5.8h
+    abs         v4.8h,  v4.8h
+    add         v2.8h,  v2.8h,  v5.8h
+    add         v2.8h,  v2.8h,  v4.8h
+    uadalp      v29.4s, v2.8h
+    ret
+endfunc
+
+SSD_FUNC_SVE   4, 4
+SSD_FUNC_SVE   4, 8
+SSD_FUNC_SVE   4, 16
+SSD_FUNC_SVE   8, 4
+SSD_FUNC_SVE   8, 8
+
+pixel_var_sve_8  8
+pixel_var_sve_8 16
+
+sa8d_satd_sve_8x8
+
+HADAMARD_AC_SVE  8, 8
+HADAMARD_AC_SVE  8, 16
+HADAMARD_AC_SVE 16, 8
+HADAMARD_AC_SVE 16, 16
+
+#else /* BIT_DEPTH == 10 */
+
+.macro SSD_START_SVE_4
+    ptrue       p0.s, vl4
+    ld1h        {z16.s}, p0/z, [x0]
+    ld1h        {z17.s}, p0/z, [x2]
+    add         x0, x0, x1, lsl #1
+    add         x2, x2, x3, lsl #1
+    sub         v2.4s, v16.4s, v17.4s
+    ld1h        {z16.s}, p0/z, [x0]
+    ld1h        {z17.s}, p0/z, [x2]
+    add         x0, x0, x1, lsl #1
+    add         x2, x2, x3, lsl #1
+    mul         v0.4s, v2.4s, v2.4s
+.endm
+
+.macro SSD_SVE_4
+    sub         v2.4s, v16.4s, v17.4s
+    ld1h        {z16.s}, p0/z, [x0]
+    ld1h        {z17.s}, p0/z, [x2]
+    add         x0, x0, x1, lsl #1
+    add         x2, x2, x3, lsl #1
+    mla         v0.4s, v2.4s, v2.4s
+.endm
+
+.macro SSD_END_SVE_4
+    sub         v2.4s, v16.4s, v17.4s
+    mla         v0.4s,  v2.4s, v2.4s
+.endm
+
+.macro SSD_FUNC_SVE w h
+function pixel_ssd_\w\()x\h\()_sve, export=1
+    SSD_START_SVE_\w
+.rept \h-2
+    SSD_SVE_\w
+.endr
+    SSD_END_SVE_\w
+
+    addv        s0, v0.4s
+    fmov        w0, s0
+    ret
+endfunc
+.endm
+
+SSD_FUNC_SVE   4, 4
+SSD_FUNC_SVE   4, 8
+SSD_FUNC_SVE   4, 16
+
+#endif /* BIT_DEPTH == 8 */
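
For readers skimming the 10-bit SSD path above: ptrue p0.s, vl4 builds a predicate covering exactly four 32-bit lanes, and ld1h {z16.s}, p0/z, [x0] then loads four 16-bit pixels zero-extended straight into 32-bit lanes, so the subtract and multiply-accumulate can run as ordinary .4s arithmetic with no separate widening step. Each generated pixel_ssd_<w>x<h>_sve routine returns the usual sum of squared differences; a minimal C reference sketch (plain loop, hypothetical ssd_ref helper, not the project's own code):

    /* Reference for what pixel_ssd_<w>x<h>_sve returns: the sum of squared
     * differences over a w x h block.  In the 10-bit build "pixel" is
     * uint16_t and strides are counted in pixels.  ssd_ref is a hypothetical
     * name, for illustration only. */
    #include <stdint.h>

    typedef uint16_t pixel;

    static int ssd_ref( const pixel *pix1, intptr_t stride1,
                        const pixel *pix2, intptr_t stride2,
                        int w, int h )
    {
        int sum = 0;
        for( int y = 0; y < h; y++ )
        {
            for( int x = 0; x < w; x++ )
            {
                int d = pix1[x] - pix2[x];
                sum += d * d;
            }
            pix1 += stride1;
            pix2 += stride2;
        }
        return sum;
    }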


=====================================
common/aarch64/pixel-a.S
=====================================
@@ -25,6 +25,7 @@
  *****************************************************************************/
 
 #include "asm.S"
+#include "pixel-a-common.S"
 
 const mask
 .rept 16
@@ -35,26 +36,11 @@ const mask
 .endr
 endconst
 
-const mask_ac_4_8
-.short 0, -1, -1, -1,  0, -1, -1, -1
-.short 0, -1, -1, -1, -1, -1, -1, -1
-endconst
-
 .macro SUMSUBL_AB   sum, sub, a, b
     uaddl       \sum, \a, \b
     usubl       \sub, \a, \b
 .endm
 
-.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
-    SUMSUB_AB   \s1, \d1, \a, \b
-    SUMSUB_AB   \s2, \d2, \c, \d
-.endm
-
-.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
-    SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
-    SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
-.endm
-
 #if BIT_DEPTH == 8
 
 .macro SAD_START_4


=====================================
common/aarch64/pixel.h
=====================================
@@ -65,6 +65,11 @@
 #define x264_pixel_ssd_8x16_neon x264_template(pixel_ssd_8x16_neon)
 #define x264_pixel_ssd_8x4_neon x264_template(pixel_ssd_8x4_neon)
 #define x264_pixel_ssd_8x8_neon x264_template(pixel_ssd_8x8_neon)
+#define x264_pixel_ssd_4x16_sve x264_template(pixel_ssd_4x16_sve)
+#define x264_pixel_ssd_4x4_sve x264_template(pixel_ssd_4x4_sve)
+#define x264_pixel_ssd_4x8_sve x264_template(pixel_ssd_4x8_sve)
+#define x264_pixel_ssd_8x4_sve x264_template(pixel_ssd_8x4_sve)
+#define x264_pixel_ssd_8x8_sve x264_template(pixel_ssd_8x8_sve)
 #define DECL_PIXELS( ret, name, suffix, args ) \
     ret x264_pixel_##name##_16x16_##suffix args;\
     ret x264_pixel_##name##_16x8_##suffix args;\
@@ -73,10 +78,18 @@
     ret x264_pixel_##name##_8x4_##suffix args;\
     ret x264_pixel_##name##_4x16_##suffix args;\
     ret x264_pixel_##name##_4x8_##suffix args;\
-    ret x264_pixel_##name##_4x4_##suffix args;\
+    ret x264_pixel_##name##_4x4_##suffix args;
+#define DECL_PIXELS_SSD_SVE( ret, args ) \
+    ret x264_pixel_ssd_8x8_sve args;\
+    ret x264_pixel_ssd_8x4_sve args;\
+    ret x264_pixel_ssd_4x16_sve args;\
+    ret x264_pixel_ssd_4x8_sve args;\
+    ret x264_pixel_ssd_4x4_sve args;
 
 #define DECL_X1( name, suffix ) \
     DECL_PIXELS( int, name, suffix, ( pixel *, intptr_t, pixel *, intptr_t ) )
+#define DECL_X1_SSD_SVE( ) \
+    DECL_PIXELS_SSD_SVE( int, ( pixel *, intptr_t, pixel *, intptr_t ) )
 
 #define DECL_X4( name, suffix ) \
     DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )\
@@ -86,6 +99,7 @@ DECL_X1( sad, neon )
 DECL_X4( sad, neon )
 DECL_X1( satd, neon )
 DECL_X1( ssd, neon )
+DECL_X1_SSD_SVE( )
 
 
 #define x264_pixel_ssd_nv12_core_neon x264_template(pixel_ssd_nv12_core_neon)
@@ -100,6 +114,8 @@ int x264_pixel_sa8d_8x8_neon  ( pixel *, intptr_t, pixel *, intptr_t );
 int x264_pixel_sa8d_16x16_neon( pixel *, intptr_t, pixel *, intptr_t );
 #define x264_pixel_sa8d_satd_16x16_neon x264_template(pixel_sa8d_satd_16x16_neon)
 uint64_t x264_pixel_sa8d_satd_16x16_neon( pixel *, intptr_t, pixel *, intptr_t );
+#define x264_pixel_sa8d_8x8_sve x264_template(pixel_sa8d_8x8_sve)
+int x264_pixel_sa8d_8x8_sve  ( pixel *, intptr_t, pixel *, intptr_t );
 
 #define x264_pixel_var_8x8_neon x264_template(pixel_var_8x8_neon)
 uint64_t x264_pixel_var_8x8_neon  ( pixel *, intptr_t );
@@ -111,6 +127,11 @@ uint64_t x264_pixel_var_16x16_neon( pixel *, intptr_t );
 int x264_pixel_var2_8x8_neon ( pixel *, pixel *, int * );
 #define x264_pixel_var2_8x16_neon x264_template(pixel_var2_8x16_neon)
 int x264_pixel_var2_8x16_neon( pixel *, pixel *, int * );
+#define x264_pixel_var_8x8_sve x264_template(pixel_var_8x8_sve)
+uint64_t x264_pixel_var_8x8_sve  ( pixel *, intptr_t );
+#define x264_pixel_var_8x16_sve x264_template(pixel_var_8x16_sve)
+uint64_t x264_pixel_var_8x16_sve ( pixel *, intptr_t );
+
 
 #define x264_pixel_hadamard_ac_8x8_neon x264_template(pixel_hadamard_ac_8x8_neon)
 uint64_t x264_pixel_hadamard_ac_8x8_neon  ( pixel *, intptr_t );
@@ -120,6 +141,15 @@ uint64_t x264_pixel_hadamard_ac_8x16_neon ( pixel *, intptr_t );
 uint64_t x264_pixel_hadamard_ac_16x8_neon ( pixel *, intptr_t );
 #define x264_pixel_hadamard_ac_16x16_neon x264_template(pixel_hadamard_ac_16x16_neon)
 uint64_t x264_pixel_hadamard_ac_16x16_neon( pixel *, intptr_t );
+#define x264_pixel_hadamard_ac_8x8_sve x264_template(pixel_hadamard_ac_8x8_sve)
+uint64_t x264_pixel_hadamard_ac_8x8_sve  ( pixel *, intptr_t );
+#define x264_pixel_hadamard_ac_8x16_sve x264_template(pixel_hadamard_ac_8x16_sve)
+uint64_t x264_pixel_hadamard_ac_8x16_sve ( pixel *, intptr_t );
+#define x264_pixel_hadamard_ac_16x8_sve x264_template(pixel_hadamard_ac_16x8_sve)
+uint64_t x264_pixel_hadamard_ac_16x8_sve ( pixel *, intptr_t );
+#define x264_pixel_hadamard_ac_16x16_sve x264_template(pixel_hadamard_ac_16x16_sve)
+uint64_t x264_pixel_hadamard_ac_16x16_sve( pixel *, intptr_t );
+
 
 #define x264_pixel_ssim_4x4x2_core_neon x264_template(pixel_ssim_4x4x2_core_neon)
 void x264_pixel_ssim_4x4x2_core_neon( const pixel *, intptr_t,
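
After preprocessing, the DECL_X1_SSD_SVE() line added above reduces to five plain prototypes (x264_template() additionally mangles each name per bit depth); roughly:

    int x264_pixel_ssd_8x8_sve ( pixel *, intptr_t, pixel *, intptr_t );
    int x264_pixel_ssd_8x4_sve ( pixel *, intptr_t, pixel *, intptr_t );
    int x264_pixel_ssd_4x16_sve( pixel *, intptr_t, pixel *, intptr_t );
    int x264_pixel_ssd_4x8_sve ( pixel *, intptr_t, pixel *, intptr_t );
    int x264_pixel_ssd_4x4_sve ( pixel *, intptr_t, pixel *, intptr_t );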


=====================================
common/dct.c
=====================================
@@ -707,6 +707,18 @@ void x264_dct_init( uint32_t cpu, x264_dct_function_t *dctf )
         dctf->add16x16_idct8= x264_add16x16_idct8_neon;
         dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_neon;
     }
+#if HAVE_SVE
+    if ( cpu&X264_CPU_SVE )
+    {
+        dctf->sub4x4_dct    = x264_sub4x4_dct_sve;
+    }
+#endif
+#if HAVE_SVE2
+    if ( cpu&X264_CPU_SVE2 )
+    {
+        dctf->add4x4_idct   = x264_add4x4_idct_sve2;
+    }
+#endif
 #endif
 
 #if HAVE_MSA
@@ -1105,6 +1117,12 @@ void x264_zigzag_init( uint32_t cpu, x264_zigzag_function_t *pf_progressive, x26
         pf_interlaced->interleave_8x8_cavlc =
         pf_progressive->interleave_8x8_cavlc =  x264_zigzag_interleave_8x8_cavlc_neon;
     }
+#if HAVE_SVE
+    if( cpu&X264_CPU_SVE )
+    {
+        pf_progressive->interleave_8x8_cavlc =  x264_zigzag_interleave_8x8_cavlc_sve;
+    }
+#endif
 #endif // HAVE_AARCH64
 
 #if HAVE_ALTIVEC
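
The dct.c change follows x264's usual layered dispatch: the NEON pointers are installed first, and the SVE/SVE2 blocks overwrite individual entries only when both the build option (HAVE_SVE/HAVE_SVE2) and the runtime cpu flag are set. A trimmed, illustrative sketch of the resulting control flow (not the full init function; it assumes x264's common headers for the types and flags):

    void dct_init_sketch( uint32_t cpu, x264_dct_function_t *dctf )
    {
        if( cpu&X264_CPU_NEON )
            dctf->sub4x4_dct = x264_sub4x4_dct_neon;    /* NEON baseline */
    #if HAVE_SVE
        if( cpu&X264_CPU_SVE )
            dctf->sub4x4_dct = x264_sub4x4_dct_sve;     /* replaces the NEON pointer */
    #endif
    #if HAVE_SVE2
        if( cpu&X264_CPU_SVE2 )
            dctf->add4x4_idct = x264_add4x4_idct_sve2;  /* SVE2-only entry */
    #endif
    }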


=====================================
common/deblock.c
=====================================
@@ -803,6 +803,12 @@ void x264_deblock_init( uint32_t cpu, x264_deblock_function_t *pf, int b_mbaff )
         pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_neon;
         pf->deblock_strength     = x264_deblock_strength_neon;
     }
+#if HAVE_SVE
+    if ( cpu&X264_CPU_SVE )
+    {
+        pf->deblock_chroma[1] = x264_deblock_v_chroma_sve;
+    }
+#endif
 #endif
 
 #if HAVE_MSA


=====================================
common/pixel.c
=====================================
@@ -829,12 +829,32 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
 #define INIT8_NAME( name1, name2, cpu ) \
     INIT7_NAME( name1, name2, cpu ) \
     pixf->name1[PIXEL_4x16]  = x264_pixel_##name2##_4x16##cpu;
+#if HAVE_SVE
+#define INIT7_NAME_SVE_SSD_10BIT( ) \
+    pixf->ssd[PIXEL_4x4]   = x264_pixel_ssd_4x4_sve; \
+    pixf->ssd[PIXEL_4x8]   = x264_pixel_ssd_4x8_sve;
+#endif
+#if HAVE_SVE
+#define INIT8_NAME_SVE_SSD( ) \
+    pixf->ssd[PIXEL_8x8]   = x264_pixel_ssd_8x8_sve; \
+    pixf->ssd[PIXEL_8x4]   = x264_pixel_ssd_8x4_sve; \
+    pixf->ssd[PIXEL_4x8]   = x264_pixel_ssd_4x8_sve; \
+    pixf->ssd[PIXEL_4x4]   = x264_pixel_ssd_4x4_sve; \
+    pixf->ssd[PIXEL_4x16]  = x264_pixel_ssd_4x16_sve;
+#define INIT8_NAME_SVE_SSD_10BIT() \
+    INIT7_NAME_SVE_SSD_10BIT() \
+    pixf->ssd[PIXEL_4x16]  = x264_pixel_ssd_4x16_sve;
+#endif
 #define INIT2( name, cpu ) INIT2_NAME( name, name, cpu )
 #define INIT4( name, cpu ) INIT4_NAME( name, name, cpu )
 #define INIT5( name, cpu ) INIT5_NAME( name, name, cpu )
 #define INIT6( name, cpu ) INIT6_NAME( name, name, cpu )
 #define INIT7( name, cpu ) INIT7_NAME( name, name, cpu )
 #define INIT8( name, cpu ) INIT8_NAME( name, name, cpu )
+#if HAVE_SVE
+#define INIT8_SVE_SSD( ) INIT8_NAME_SVE_SSD( )
+#define INIT8_SVE_SSD_10BIT( ) INIT8_NAME_SVE_SSD_10BIT( )
+#endif
 
 #define INIT_ADS( cpu ) \
     pixf->ads[PIXEL_16x16] = x264_pixel_ads4##cpu;\
@@ -1086,6 +1106,12 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
         pixf->ssim_4x4x2_core    = x264_pixel_ssim_4x4x2_core_neon;
         pixf->ssim_end4 = x264_pixel_ssim_end4_neon;
     }
+#if HAVE_SVE
+    if( cpu&X264_CPU_SVE )
+    {
+        INIT8_SVE_SSD_10BIT();
+    }
+#endif
 #endif // HAVE_AARCH64
 
 #else // !HIGH_BIT_DEPTH
@@ -1499,6 +1525,18 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
         pixf->ssim_4x4x2_core   = x264_pixel_ssim_4x4x2_core_neon;
         pixf->ssim_end4         = x264_pixel_ssim_end4_neon;
     }
+#if HAVE_SVE
+    if( cpu&X264_CPU_SVE )
+    {
+        INIT8_SVE_SSD( );
+        INIT4( hadamard_ac, _sve );
+
+        pixf->sa8d[PIXEL_8x8]   = x264_pixel_sa8d_8x8_sve;
+
+        pixf->var[PIXEL_8x8]    = x264_pixel_var_8x8_sve;
+        pixf->var[PIXEL_8x16]   = x264_pixel_var_8x16_sve;
+    }
+#endif
 #endif // HAVE_AARCH64
 
 #if HAVE_MSA
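
Once the INIT* macros are expanded, the 8-bit SVE branch above amounts to plain function-pointer assignments; approximately (x264_template() name mangling omitted):

    /* Approximate expansion of INIT8_SVE_SSD( ) and INIT4( hadamard_ac, _sve )
     * inside the X264_CPU_SVE branch above. */
    pixf->ssd[PIXEL_8x8]   = x264_pixel_ssd_8x8_sve;
    pixf->ssd[PIXEL_8x4]   = x264_pixel_ssd_8x4_sve;
    pixf->ssd[PIXEL_4x8]   = x264_pixel_ssd_4x8_sve;
    pixf->ssd[PIXEL_4x4]   = x264_pixel_ssd_4x4_sve;
    pixf->ssd[PIXEL_4x16]  = x264_pixel_ssd_4x16_sve;

    pixf->hadamard_ac[PIXEL_16x16] = x264_pixel_hadamard_ac_16x16_sve;
    pixf->hadamard_ac[PIXEL_16x8]  = x264_pixel_hadamard_ac_16x8_sve;
    pixf->hadamard_ac[PIXEL_8x16]  = x264_pixel_hadamard_ac_8x16_sve;
    pixf->hadamard_ac[PIXEL_8x8]   = x264_pixel_hadamard_ac_8x8_sve;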



View it on GitLab: https://code.videolan.org/videolan/x264/-/compare/c196240409e4d7c01b47448d93b1f9683aaa7cf7...c1c9931dc87289b8aeba78150467f17bdb97d019
