[x264-devel] [Git][videolan/x264][master] 8 commits: Create Common NEON dct-a Macros
Martin Storsjö (@mstorsjo)
gitlab at videolan.org
Tue Nov 28 10:04:28 UTC 2023
Martin Storsjö pushed to branch master at VideoLAN / x264
Commits:
b6190c6f by David Chen at 2023-11-18T08:42:48+02:00
Create Common NEON dct-a Macros
Move the NEON dct-a macros that are also intended to be
used by the SVE/SVE2 functions into a common file.
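For orientation, a minimal sketch of the layout this series adopts (the
macro body below is illustrative and not taken from the patch; the
include order and .arch directive mirror the new dct-a-sve.S):

    // dct-a-common.S: shared macro definitions only, no .arch directive
    .macro EXAMPLE_SUMSUB sum, sub, a, b    // illustrative macro name
        add     \sum, \a, \b
        sub     \sub, \a, \b
    .endm

    // dct-a-sve.S: pull in the shared macros, then raise the target arch
    #include "asm.S"
    #include "dct-a-common.S"
    .arch armv8-a+sve

The NEON file (dct-a.S) includes the same common file, so one macro
definition serves both code paths.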
- - - - -
5c382660 by David Chen at 2023-11-20T08:03:51+02:00
Improve dct-a.S Performance by Using SVE/SVE2
Improve the performance of the NEON functions in aarch64/dct-a.S
by using the SVE/SVE2 instruction set. The affected functions are
listed below together with the measured checkasm numbers (lower is
better); a short worked example of reading them follows the results.
Command executed: ./checkasm8 --bench=sub
Testbed: Alibaba g8y instance based on Yitian 710 CPU
Results:
sub4x4_dct_c: 528
sub4x4_dct_neon: 322
sub4x4_dct_sve: 247
Command executed: ./checkasm8 --bench=sub
Testbed: AWS Graviton3
Results:
sub4x4_dct_c: 562
sub4x4_dct_neon: 376
sub4x4_dct_sve: 255
Command executed: ./checkasm8 --bench=add
Testbed: Alibaba g8y instance based on Yitian 710 CPU
Results:
add4x4_idct_c: 698
add4x4_idct_neon: 386
add4x4_idct_sve2: 345
Command executed: ./checkasm8 --bench=zigzag
Testbed: Alibaba g8y instance based on Yitian 710 CPU
Results:
zigzag_interleave_8x8_cavlc_frame_c: 582
zigzag_interleave_8x8_cavlc_frame_neon: 273
zigzag_interleave_8x8_cavlc_frame_sve: 257
Command executed: ./checkasm8 --bench=zigzag
Testbed: AWS Graviton3
Results:
zigzag_interleave_8x8_cavlc_frame_c: 587
zigzag_interleave_8x8_cavlc_frame_neon: 257
zigzag_interleave_8x8_cavlc_frame_sve: 249
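As a reading aid (plain arithmetic on the numbers above, assuming the
usual checkasm convention that smaller means faster): on the Yitian 710,
sub4x4_dct drops from 528 (C) and 322 (NEON) to 247 with SVE, i.e.
528 / 247 ~= 2.14x vs. C and 322 / 247 ~= 1.30x vs. NEON, or roughly a
23% reduction relative to the NEON figure. The other tables read the
same way.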
- - - - -
37949a99 by David Chen at 2023-11-20T08:03:53+02:00
Create Common NEON deblock-a Macros
Move the NEON deblock-a macros that are also intended to be
used by the SVE/SVE2 functions into a common file.
- - - - -
5ad5e5d8 by David Chen at 2023-11-20T08:03:54+02:00
Improve deblock-a.S Performance by Using SVE/SVE2
Improve the performance of the NEON functions in aarch64/deblock-a.S
by using the SVE/SVE2 instruction set. The affected functions are
listed below together with the measured checkasm numbers (lower is better).
Command executed: ./checkasm8 --bench=deblock
Testbed: Alibaba g8y instance based on Yitian 710 CPU
Results:
deblock_chroma[1]_c: 735
deblock_chroma[1]_neon: 427
deblock_chroma[1]_sve: 353
Command executed: ./checkasm8 --bench=deblock
Testbed: AWS Graviton3
Results:
deblock_chroma[1]_c: 719
deblock_chroma[1]_neon: 442
deblock_chroma[1]_sve: 345
- - - - -
21a788f1 by David Chen at 2023-11-23T08:24:13+02:00
Create Common NEON mc-a Macros and Functions
Move the NEON mc-a macros and functions that are also intended
to be used by the SVE/SVE2 functions into a common file.
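A note on the "functions" part, sketched from the diff below (symbol
names as in the patch): since both mc-a.S and mc-a-sve.S include
mc-a-common.S, a non-exported helper placed there is assembled into
each including object, and the SVE wrappers can branch to it directly
for the plain 50/50 average case:

    // mc-a-common.S: shared, non-exported helper (body unchanged)
    function pixel_avg_w4_neon
        ...
    endfunc

    // mc-a-sve.S: weight == 32 falls back to the shared NEON helper
        b.eq    pixel_avg_w4_neon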
- - - - -
06dcf3f9 by David Chen at 2023-11-23T08:24:16+02:00
Improve mc-a.S Performance by Using SVE/SVE2
Improve the performance of the NEON functions in aarch64/mc-a.S
by using the SVE/SVE2 instruction set. The affected functions are
listed below together with the measured checkasm numbers (lower is better).
Command executed: ./checkasm8 --bench=avg
Testbed: Alibaba g8y instance based on Yitian 710 CPU
Results:
avg_4x2_c: 274
avg_4x2_neon: 215
avg_4x2_sve: 171
avg_4x4_c: 461
avg_4x4_neon: 343
avg_4x4_sve: 225
avg_4x8_c: 806
avg_4x8_neon: 619
avg_4x8_sve: 334
avg_4x16_c: 1523
avg_4x16_neon: 1168
avg_4x16_sve: 558
Command executed: ./checkasm8 --bench=avg
Testbed: AWS Graviton3
Results:
avg_4x2_c: 267
avg_4x2_neon: 213
avg_4x2_sve: 167
avg_4x4_c: 467
avg_4x4_neon: 350
avg_4x4_sve: 221
avg_4x8_c: 784
avg_4x8_neon: 624
avg_4x8_sve: 302
avg_4x16_c: 1445
avg_4x16_neon: 1182
avg_4x16_sve: 485
- - - - -
0ac52d29 by David Chen at 2023-11-23T08:26:53+02:00
Create Common NEON pixel-a Macros and Constants
Move the NEON pixel-a macros and constants that are also intended
to be used by the SVE/SVE2 functions into a common file.
- - - - -
c1c9931d by David Chen at 2023-11-23T19:01:29+02:00
Improve pixel-a.S Performance by Using SVE/SVE2
Improve the performance of the NEON functions in aarch64/pixel-a.S
by using the SVE/SVE2 instruction set. The affected functions are
listed below together with the measured checkasm numbers (lower is better).
Command executed: ./checkasm8 --bench=ssd
Testbed: Alibaba g8y instance based on Yitian 710 CPU
Results:
ssd_4x4_c: 235
ssd_4x4_neon: 226
ssd_4x4_sve: 151
ssd_4x8_c: 409
ssd_4x8_neon: 363
ssd_4x8_sve: 201
ssd_4x16_c: 781
ssd_4x16_neon: 653
ssd_4x16_sve: 313
ssd_8x4_c: 402
ssd_8x4_neon: 192
ssd_8x4_sve: 192
ssd_8x8_c: 728
ssd_8x8_neon: 275
ssd_8x8_sve: 275
Command executed: ./checkasm10 --bench=ssd
Testbed: Alibaba g8y instance based on Yitian 710 CPU
Results:
ssd_4x4_c: 256
ssd_4x4_neon: 226
ssd_4x4_sve: 153
ssd_4x8_c: 460
ssd_4x8_neon: 369
ssd_4x8_sve: 215
ssd_4x16_c: 852
ssd_4x16_neon: 651
ssd_4x16_sve: 340
Command executed: ./checkasm8 --bench=ssd
Testbed: AWS Graviton3
Results:
ssd_4x4_c: 295
ssd_4x4_neon: 288
ssd_4x4_sve: 228
ssd_4x8_c: 454
ssd_4x8_neon: 431
ssd_4x8_sve: 294
ssd_4x16_c: 779
ssd_4x16_neon: 631
ssd_4x16_sve: 438
ssd_8x4_c: 463
ssd_8x4_neon: 247
ssd_8x4_sve: 246
ssd_8x8_c: 781
ssd_8x8_neon: 413
ssd_8x8_sve: 353
Command executed: ./checkasm10 --bench=ssd
Testbed: AWS Graviton3
Results:
ssd_4x4_c: 322
ssd_4x4_neon: 335
ssd_4x4_sve: 240
ssd_4x8_c: 522
ssd_4x8_neon: 448
ssd_4x8_sve: 294
ssd_4x16_c: 832
ssd_4x16_neon: 603
ssd_4x16_sve: 440
Command executed: ./checkasm8 --bench=sa8d
Testbed: Alibaba g8y instance based on Yitian 710 CPU
Results:
sa8d_8x8_c: 2103
sa8d_8x8_neon: 619
sa8d_8x8_sve: 617
Command executed: ./checkasm8 --bench=sa8d
Testbed: AWS Graviton3
Results:
sa8d_8x8_c: 2021
sa8d_8x8_neon: 597
sa8d_8x8_sve: 580
Command executed: ./checkasm8 --bench=var
Testbed: Alibaba g8y instance based on Yitian 710 CPU
Results:
var_8x8_c: 595
var_8x8_neon: 262
var_8x8_sve: 262
var_8x16_c: 1193
var_8x16_neon: 435
var_8x16_sve: 419
Command executed: ./checkasm8 --bench=var
Testbed: AWS Graviton3
Results:
var_8x8_c: 616
var_8x8_neon: 229
var_8x8_sve: 222
var_8x16_c: 1207
var_8x16_neon: 399
var_8x16_sve: 389
Command executed: ./checkasm8 --bench=hadamard_ac
Testbed: Alibaba g8y instance based on Yitian 710 CPU
Results:
hadamard_ac_8x8_c: 2330
hadamard_ac_8x8_neon: 635
hadamard_ac_8x8_sve: 635
hadamard_ac_8x16_c: 4500
hadamard_ac_8x16_neon: 1152
hadamard_ac_8x16_sve: 1151
hadamard_ac_16x8_c: 4499
hadamard_ac_16x8_neon: 1151
hadamard_ac_16x8_sve: 1150
hadamard_ac_16x16_c: 8812
hadamard_ac_16x16_neon: 2187
hadamard_ac_16x16_sve: 2186
Command executed: ./checkasm8 --bench=hadamard_ac
Testbed: AWS Graviton3
Results:
hadamard_ac_8x8_c: 2266
hadamard_ac_8x8_neon: 517
hadamard_ac_8x8_sve: 513
hadamard_ac_8x16_c: 4444
hadamard_ac_8x16_neon: 867
hadamard_ac_8x16_sve: 849
hadamard_ac_16x8_c: 4443
hadamard_ac_16x8_neon: 880
hadamard_ac_16x8_sve: 868
hadamard_ac_16x16_c: 8595
hadamard_ac_16x16_neon: 1656
hadamard_ac_16x16_sve: 1622
- - - - -
21 changed files:
- Makefile
- + common/aarch64/dct-a-common.S
- + common/aarch64/dct-a-sve.S
- + common/aarch64/dct-a-sve2.S
- common/aarch64/dct-a.S
- common/aarch64/dct.h
- + common/aarch64/deblock-a-common.S
- + common/aarch64/deblock-a-sve.S
- common/aarch64/deblock-a.S
- common/aarch64/deblock.h
- + common/aarch64/mc-a-common.S
- + common/aarch64/mc-a-sve.S
- common/aarch64/mc-a.S
- common/aarch64/mc-c.c
- + common/aarch64/pixel-a-common.S
- + common/aarch64/pixel-a-sve.S
- common/aarch64/pixel-a.S
- common/aarch64/pixel.h
- common/dct.c
- common/deblock.c
- common/pixel.c
Changes:
=====================================
Makefile
=====================================
@@ -160,7 +160,7 @@ endif
OBJCHK += tools/checkasm-arm.o
endif
-# AArch64 NEON optims
+# AArch64 NEON and SVE/SVE2 optims
ifeq ($(SYS_ARCH),AARCH64)
SRCASM_X = common/aarch64/bitstream-a.S \
common/aarch64/cabac-a.S \
@@ -170,6 +170,15 @@ SRCASM_X = common/aarch64/bitstream-a.S \
common/aarch64/pixel-a.S \
common/aarch64/predict-a.S \
common/aarch64/quant-a.S
+ifneq ($(findstring HAVE_SVE 1, $(CONFIG)),)
+SRCASM_X += common/aarch64/dct-a-sve.S \
+ common/aarch64/deblock-a-sve.S \
+ common/aarch64/mc-a-sve.S \
+ common/aarch64/pixel-a-sve.S
+endif
+ifneq ($(findstring HAVE_SVE2 1, $(CONFIG)),)
+SRCASM_X += common/aarch64/dct-a-sve2.S
+endif
SRCS_X += common/aarch64/asm-offsets.c \
common/aarch64/mc-c.c \
common/aarch64/predict-c.c
=====================================
common/aarch64/dct-a-common.S
=====================================
@@ -0,0 +1,40 @@
+/****************************************************************************
+ * dct-a-common.S: aarch64 transform and zigzag
+ *****************************************************************************
+ * Copyright (C) 2009-2023 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ * Janne Grunau <janne-x264 at jannau.net>
+ * David Chen <david.chen at myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+// This file contains the NEON macros that are intended to be used by
+// the SVE/SVE2 functions as well
+
+.macro DCT_1D v0 v1 v2 v3 v4 v5 v6 v7
+ SUMSUB_AB \v1, \v6, \v5, \v6
+ SUMSUB_AB \v3, \v7, \v4, \v7
+ add \v0, \v3, \v1
+ add \v4, \v7, \v7
+ add \v5, \v6, \v6
+ sub \v2, \v3, \v1
+ add \v1, \v4, \v6
+ sub \v3, \v7, \v5
+.endm
=====================================
common/aarch64/dct-a-sve.S
=====================================
@@ -0,0 +1,88 @@
+/****************************************************************************
+ * dct-a-sve.S: aarch64 transform and zigzag
+ *****************************************************************************
+ * Copyright (C) 2009-2023 x264 project
+ *
+ * Authors: David Chen <david.chen at myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+#include "dct-a-common.S"
+
+.arch armv8-a+sve
+
+function sub4x4_dct_sve, export=1
+ mov x3, #FENC_STRIDE
+ mov x4, #FDEC_STRIDE
+ ptrue p0.h, vl4
+ ld1b {z0.h}, p0/z, [x1]
+ add x1, x1, x3
+ ld1b {z1.h}, p0/z, [x2]
+ add x2, x2, x4
+ ld1b {z2.h}, p0/z, [x1]
+ add x1, x1, x3
+ sub v16.4h, v0.4h, v1.4h
+ ld1b {z3.h}, p0/z, [x2]
+ add x2, x2, x4
+ ld1b {z4.h}, p0/z, [x1]
+ add x1, x1, x3
+ sub v17.4h, v2.4h, v3.4h
+ ld1b {z5.h}, p0/z, [x2]
+ add x2, x2, x4
+ ld1b {z6.h}, p0/z, [x1]
+ sub v18.4h, v4.4h, v5.4h
+ ld1b {z7.h}, p0/z, [x2]
+ sub v19.4h, v6.4h, v7.4h
+
+ DCT_1D v0.4h, v1.4h, v2.4h, v3.4h, v16.4h, v17.4h, v18.4h, v19.4h
+ transpose4x4.h v0, v1, v2, v3, v4, v5, v6, v7
+ DCT_1D v4.4h, v5.4h, v6.4h, v7.4h, v0.4h, v1.4h, v2.4h, v3.4h
+ st1 {v4.4h,v5.4h,v6.4h,v7.4h}, [x0]
+ ret
+endfunc
+
+function zigzag_interleave_8x8_cavlc_sve, export=1
+ mov z31.s, #1
+ ptrue p2.s, vl2
+ ld4 {v0.8h,v1.8h,v2.8h,v3.8h}, [x1], #64
+ ld4 {v4.8h,v5.8h,v6.8h,v7.8h}, [x1], #64
+ umax v16.8h, v0.8h, v4.8h
+ umax v17.8h, v1.8h, v5.8h
+ umax v18.8h, v2.8h, v6.8h
+ umax v19.8h, v3.8h, v7.8h
+ st1 {v0.8h}, [x0], #16
+ st1 {v4.8h}, [x0], #16
+ umaxp v16.8h, v16.8h, v17.8h
+ umaxp v18.8h, v18.8h, v19.8h
+ st1 {v1.8h}, [x0], #16
+ st1 {v5.8h}, [x0], #16
+ umaxp v16.8h, v16.8h, v18.8h
+ st1 {v2.8h}, [x0], #16
+ st1 {v6.8h}, [x0], #16
+ cmhs v16.4s, v16.4s, v31.4s
+ st1 {v3.8h}, [x0], #16
+ and v16.16b, v16.16b, v31.16b
+ st1 {v7.8h}, [x0], #16
+ st1b {z16.s}, p2, [x2]
+ add x2, x2, #8
+ mov v16.d[0], v16.d[1]
+ st1b {z16.s}, p2, [x2]
+ ret
+endfunc
=====================================
common/aarch64/dct-a-sve2.S
=====================================
@@ -0,0 +1,89 @@
+/****************************************************************************
+ * dct-a-sve2.S: aarch64 transform and zigzag
+ *****************************************************************************
+ * Copyright (C) 2009-2023 x264 project
+ *
+ * Authors: David Chen <david.chen at myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+#include "dct-a-common.S"
+
+.arch armv8-a+sve+sve2
+
+function add4x4_idct_sve2, export=1
+ mov x2, #FDEC_STRIDE
+ mov x11, x0
+ ptrue p0.h, vl8
+ ptrue p1.h, vl4
+ ld1 {v0.8h, v1.8h}, [x1]
+
+ SUMSUB_AB v4.8h, v5.8h, v0.8h, v1.8h
+
+ sshr v7.8h, v0.8h, #1
+ sshr v6.8h, v1.8h, #1
+ sub v7.8h, v7.8h, v1.8h
+ add v6.8h, v6.8h, v0.8h
+ mov v7.d[0], v7.d[1]
+ mov v6.d[0], v6.d[1]
+ ld1b {z28.h}, p0/z, [x11]
+ add x11, x11, x2
+ SUMSUB_AB v0.8h, v2.8h, v4.8h, v6.8h
+ SUMSUB_AB v1.8h, v3.8h, v5.8h, v7.8h
+
+ transpose4x4.h v0, v1, v3, v2, v16, v17, v18, v19
+
+ SUMSUB_AB v4.4h, v5.4h, v0.4h, v3.4h
+
+ sshr v7.4h, v1.4h, #1
+ sshr v6.4h, v2.4h, #1
+ sub v7.4h, v7.4h, v2.4h
+ add v6.4h, v6.4h, v1.4h
+ ld1b {z29.h}, p0/z, [x11]
+ add x11, x11, x2
+ SUMSUB_AB v0.4h, v2.4h, v4.4h, v6.4h
+ SUMSUB_AB v1.4h, v3.4h, v5.4h, v7.4h
+
+ srshr z0.h, p1/m, z0.h, #6
+ srshr z1.h, p1/m, z1.h, #6
+ ld1b {z31.h}, p0/z, [x11]
+ add x11, x11, x2
+ srshr z2.h, p1/m, z2.h, #6
+ srshr z3.h, p1/m, z3.h, #6
+ ld1b {z30.h}, p0/z, [x11]
+
+ add v0.8h, v0.8h, v28.8h
+ add v1.8h, v1.8h, v29.8h
+ add v2.8h, v2.8h, v30.8h
+ add v3.8h, v3.8h, v31.8h
+ sqxtunb z0.b, z0.h
+ sqxtunb z1.b, z1.h
+ sqxtunb z2.b, z2.h
+ sqxtunb z3.b, z3.h
+
+ st1b {z0.h}, p1, [x0]
+ add x0, x0, x2
+ st1b {z1.h}, p1, [x0]
+ add x0, x0, x2
+ st1b {z3.h}, p1, [x0]
+ add x0, x0, x2
+ st1b {z2.h}, p1, [x0]
+ ret
+endfunc
=====================================
common/aarch64/dct-a.S
=====================================
@@ -25,6 +25,7 @@
*****************************************************************************/
#include "asm.S"
+#include "dct-a-common.S"
const scan4x4_frame, align=4
.byte 0,1, 8,9, 2,3, 4,5
@@ -120,17 +121,6 @@ function idct4x4dc_neon, export=1
ret
endfunc
-.macro DCT_1D v0 v1 v2 v3 v4 v5 v6 v7
- SUMSUB_AB \v1, \v6, \v5, \v6
- SUMSUB_AB \v3, \v7, \v4, \v7
- add \v0, \v3, \v1
- add \v4, \v7, \v7
- add \v5, \v6, \v6
- sub \v2, \v3, \v1
- add \v1, \v4, \v6
- sub \v3, \v7, \v5
-.endm
-
function sub4x4_dct_neon, export=1
mov x3, #FENC_STRIDE
mov x4, #FDEC_STRIDE
=====================================
common/aarch64/dct.h
=====================================
@@ -91,4 +91,13 @@ int x264_zigzag_sub_8x8_frame_neon( dctcoef level[16], const pixel *p_src, pixel
#define x264_zigzag_interleave_8x8_cavlc_neon x264_template(zigzag_interleave_8x8_cavlc_neon)
void x264_zigzag_interleave_8x8_cavlc_neon( dctcoef *dst, dctcoef *src, uint8_t *nnz );
+#define x264_sub4x4_dct_sve x264_template(sub4x4_dct_sve)
+void x264_sub4x4_dct_sve( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
+
+#define x264_add4x4_idct_sve2 x264_template(add4x4_idct_sve2)
+void x264_add4x4_idct_sve2( uint8_t *p_dst, int16_t dct[16] );
+
+#define x264_zigzag_interleave_8x8_cavlc_sve x264_template(zigzag_interleave_8x8_cavlc_sve)
+void x264_zigzag_interleave_8x8_cavlc_sve( dctcoef *dst, dctcoef *src, uint8_t *nnz );
+
#endif
=====================================
common/aarch64/deblock-a-common.S
=====================================
@@ -0,0 +1,43 @@
+/*****************************************************************************
+ * deblock-a-common.S: aarch64 deblocking
+ *****************************************************************************
+ * Copyright (C) 2009-2023 x264 project
+ *
+ * Authors: Mans Rullgard <mans at mansr.com>
+ * Janne Grunau <janne-x264 at jannau.net>
+ * David Chen <david.chen at myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+// This file contains the NEON macros that are intended to be used by
+// the SVE/SVE2 functions as well
+
+.macro h264_loop_filter_start
+ cmp w2, #0
+ ldr w6, [x4]
+ ccmp w3, #0, #0, ne
+ mov v24.s[0], w6
+ and w8, w6, w6, lsl #16
+ b.eq 1f
+ ands w8, w8, w8, lsl #8
+ b.ge 2f
+1:
+ ret
+2:
+.endm
=====================================
common/aarch64/deblock-a-sve.S
=====================================
@@ -0,0 +1,98 @@
+/*****************************************************************************
+ * deblock-a-sve.S: aarch64 deblocking
+ *****************************************************************************
+ * Copyright (C) 2009-2023 x264 project
+ *
+ * Authors: David Chen <david.chen at myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+#include "deblock-a-common.S"
+
+.arch armv8-a+sve
+
+.macro h264_loop_filter_chroma_sve
+ ptrue p0.b, vl16
+
+ dup v22.16b, w2 // alpha
+ uxtl v24.8h, v24.8b
+ uabd v26.16b, v16.16b, v0.16b // abs(p0 - q0)
+ uxtl v4.8h, v0.8b
+ uxtl2 v5.8h, v0.16b
+ uabd v28.16b, v18.16b, v16.16b // abs(p1 - p0)
+ usubw v4.8h, v4.8h, v16.8b
+ usubw2 v5.8h, v5.8h, v16.16b
+ sli v24.8h, v24.8h, #8
+ shl v4.8h, v4.8h, #2
+ shl v5.8h, v5.8h, #2
+ uabd v30.16b, v2.16b, v0.16b // abs(q1 - q0)
+ uxtl v24.4s, v24.4h
+ uaddw v4.8h, v4.8h, v18.8b
+ uaddw2 v5.8h, v5.8h, v18.16b
+
+ cmphi p1.b, p0/z, z22.b, z26.b
+ usubw v4.8h, v4.8h, v2.8b
+ usubw2 v5.8h, v5.8h, v2.16b
+ sli v24.4s, v24.4s, #16
+ dup v22.16b, w3 // beta
+ rshrn v4.8b, v4.8h, #3
+ rshrn2 v4.16b, v5.8h, #3
+ cmphi p2.b, p0/z, z22.b, z28.b
+ cmphi p3.b, p0/z, z22.b, z30.b
+ smin v4.16b, v4.16b, v24.16b
+ neg v25.16b, v24.16b
+ and p1.b, p0/z, p1.b, p2.b
+ smax v4.16b, v4.16b, v25.16b
+ and p1.b, p0/z, p1.b, p3.b
+ uxtl v22.8h, v0.8b
+ uxtl2 v23.8h, v0.16b
+
+ uxtl v28.8h, v16.8b
+ uxtl2 v29.8h, v16.16b
+ saddw v28.8h, v28.8h, v4.8b
+ saddw2 v29.8h, v29.8h, v4.16b
+ ssubw v22.8h, v22.8h, v4.8b
+ ssubw2 v23.8h, v23.8h, v4.16b
+ sqxtun v16.8b, v28.8h
+ sqxtun v0.8b, v22.8h
+ sqxtun2 v16.16b, v29.8h
+ sqxtun2 v0.16b, v23.8h
+.endm
+
+function deblock_v_chroma_sve, export=1
+ h264_loop_filter_start
+
+ sub x0, x0, x1, lsl #1
+ // No performance improvement if sve load is used. So, continue using
+ // NEON load here
+ ld1 {v18.16b}, [x0], x1
+ ld1 {v16.16b}, [x0], x1
+ ld1 {v0.16b}, [x0], x1
+ ld1 {v2.16b}, [x0]
+
+ h264_loop_filter_chroma_sve
+
+ sub x0, x0, x1, lsl #1
+ st1b {z16.b}, p1, [x0]
+ add x0, x0, x1
+ st1b {z0.b}, p1, [x0]
+
+ ret
+endfunc
=====================================
common/aarch64/deblock-a.S
=====================================
@@ -25,20 +25,7 @@
*****************************************************************************/
#include "asm.S"
-
-.macro h264_loop_filter_start
- cmp w2, #0
- ldr w6, [x4]
- ccmp w3, #0, #0, ne
- mov v24.s[0], w6
- and w8, w6, w6, lsl #16
- b.eq 1f
- ands w8, w8, w8, lsl #8
- b.ge 2f
-1:
- ret
-2:
-.endm
+#include "deblock-a-common.S"
.macro h264_loop_filter_luma
dup v22.16b, w2 // alpha
=====================================
common/aarch64/deblock.h
=====================================
@@ -55,4 +55,7 @@ void x264_deblock_h_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, i
#define x264_deblock_v_luma_intra_neon x264_template(deblock_v_luma_intra_neon)
void x264_deblock_v_luma_intra_neon( uint8_t *pix, intptr_t stride, int alpha, int beta );
+#define x264_deblock_v_chroma_sve x264_template(deblock_v_chroma_sve)
+void x264_deblock_v_chroma_sve( uint8_t *pix, intptr_t stride, int alpha, int beta, int8_t *tc0 );
+
#endif
=====================================
common/aarch64/mc-a-common.S
=====================================
@@ -0,0 +1,66 @@
+/****************************************************************************
+ * mc-a-common.S: aarch64 motion compensation
+ *****************************************************************************
+ * Copyright (C) 2009-2023 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ * Janne Grunau <janne-x264 at jannau.net>
+ * Mans Rullgard <mans at mansr.com>
+ * Stefan Groenroos <stefan.gronroos at gmail.com>
+ * David Chen <david.chen at myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+// This file contains the NEON macros and functions that are intended to be used by
+// the SVE/SVE2 functions as well
+
+#if BIT_DEPTH == 8
+
+// 0 < weight < 64
+.macro load_weights_add_add
+ mov w6, w6
+.endm
+
+// weight > 64
+.macro load_weights_add_sub
+ neg w7, w7
+.endm
+
+// weight < 0
+.macro load_weights_sub_add
+ neg w6, w6
+.endm
+
+function pixel_avg_w4_neon
+1: subs w9, w9, #2
+ ld1 {v0.s}[0], [x2], x3
+ ld1 {v2.s}[0], [x4], x5
+ urhadd v0.8b, v0.8b, v2.8b
+ ld1 {v1.s}[0], [x2], x3
+ ld1 {v3.s}[0], [x4], x5
+ urhadd v1.8b, v1.8b, v3.8b
+ st1 {v0.s}[0], [x0], x1
+ st1 {v1.s}[0], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+#else // BIT_DEPTH == 10
+
+#endif
=====================================
common/aarch64/mc-a-sve.S
=====================================
@@ -0,0 +1,108 @@
+/*****************************************************************************
+ * mc-a-sve.S: aarch64 motion compensation
+ *****************************************************************************
+ * Copyright (C) 2009-2023 x264 project
+ *
+ * Authors: David Chen <david.chen at myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+#include "mc-a-common.S"
+
+.arch armv8-a+sve
+
+#if BIT_DEPTH == 8
+
+// void pixel_avg( uint8_t *dst, intptr_t dst_stride,
+// uint8_t *src1, intptr_t src1_stride,
+// uint8_t *src2, intptr_t src2_stride, int weight );
+.macro AVGH_SVE w h
+function pixel_avg_\w\()x\h\()_sve, export=1
+ mov w10, #64
+ cmp w6, #32
+ mov w9, #\h
+ b.eq pixel_avg_w\w\()_neon
+ subs w7, w10, w6
+ b.lt pixel_avg_weight_w\w\()_add_sub_sve // weight > 64
+ cmp w6, #0
+ b.ge pixel_avg_weight_w\w\()_add_add_sve
+ b pixel_avg_weight_w\w\()_sub_add_sve // weight < 0
+endfunc
+.endm
+
+AVGH_SVE 4, 2
+AVGH_SVE 4, 4
+AVGH_SVE 4, 8
+AVGH_SVE 4, 16
+
+// 0 < weight < 64
+.macro weight_add_add_sve dst, s1, s2, h=
+ mul \dst, \s1, v30.8h
+ mla \dst, \s2, v31.8h
+.endm
+
+// weight > 64
+.macro weight_add_sub_sve dst, s1, s2, h=
+ mul \dst, \s1, v30.8h
+ mls \dst, \s2, v31.8h
+.endm
+
+// weight < 0
+.macro weight_sub_add_sve dst, s1, s2, h=
+ mul \dst, \s2, v31.8h
+ mls \dst, \s1, v30.8h
+.endm
+
+.macro AVG_WEIGHT_SVE ext
+function pixel_avg_weight_w4_\ext\()_sve
+ load_weights_\ext
+ ptrue p0.b, vl8
+ dup v30.8h, w6
+ dup v31.8h, w7
+1: // height loop
+ subs w9, w9, #2
+ ld1b {z0.h}, p0/z, [x2]
+ add x2, x2, x3
+ ld1b {z1.h}, p0/z, [x4]
+ add x4, x4, x5
+ weight_\ext\()_sve v4.8h, v0.8h, v1.8h
+ ld1b {z2.h}, p0/z, [x2]
+ add x2, x2, x3
+ ld1b {z3.h}, p0/z, [x4]
+ add x4, x4, x5
+
+ sqrshrun v0.8b, v4.8h, #6
+ weight_\ext\()_sve v5.8h, v2.8h, v3.8h
+ st1 {v0.s}[0], [x0], x1
+ sqrshrun v1.8b, v5.8h, #6
+ st1 {v1.s}[0], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+.endm
+
+AVG_WEIGHT_SVE add_add
+AVG_WEIGHT_SVE add_sub
+AVG_WEIGHT_SVE sub_add
+
+#else // BIT_DEPTH == 10
+
+
+#endif
=====================================
common/aarch64/mc-a.S
=====================================
@@ -27,6 +27,7 @@
*****************************************************************************/
#include "asm.S"
+#include "mc-a-common.S"
// note: prefetch stuff assumes 64-byte cacheline
@@ -327,9 +328,6 @@ AVGH 16, 8
AVGH 16, 16
// 0 < weight < 64
-.macro load_weights_add_add
- mov w6, w6
-.endm
.macro weight_add_add dst, s1, s2, h=
.ifc \h, 2
umull2 \dst, \s1, v30.16b
@@ -341,9 +339,6 @@ AVGH 16, 16
.endm
// weight > 64
-.macro load_weights_add_sub
- neg w7, w7
-.endm
.macro weight_add_sub dst, s1, s2, h=
.ifc \h, 2
umull2 \dst, \s1, v30.16b
@@ -355,9 +350,6 @@ AVGH 16, 16
.endm
// weight < 0
-.macro load_weights_sub_add
- neg w6, w6
-.endm
.macro weight_sub_add dst, s1, s2, h=
.ifc \h, 2
umull2 \dst, \s2, v31.16b
@@ -448,20 +440,6 @@ AVG_WEIGHT add_add
AVG_WEIGHT add_sub
AVG_WEIGHT sub_add
-function pixel_avg_w4_neon
-1: subs w9, w9, #2
- ld1 {v0.s}[0], [x2], x3
- ld1 {v2.s}[0], [x4], x5
- urhadd v0.8b, v0.8b, v2.8b
- ld1 {v1.s}[0], [x2], x3
- ld1 {v3.s}[0], [x4], x5
- urhadd v1.8b, v1.8b, v3.8b
- st1 {v0.s}[0], [x0], x1
- st1 {v1.s}[0], [x0], x1
- b.gt 1b
- ret
-endfunc
-
function pixel_avg_w8_neon
1: subs w9, w9, #4
ld1 {v0.8b}, [x2], x3
=====================================
common/aarch64/mc-c.c
=====================================
@@ -58,6 +58,15 @@ void x264_pixel_avg_4x4_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, i
#define x264_pixel_avg_4x2_neon x264_template(pixel_avg_4x2_neon)
void x264_pixel_avg_4x2_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
+#define x264_pixel_avg_4x16_sve x264_template(pixel_avg_4x16_sve)
+void x264_pixel_avg_4x16_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
+#define x264_pixel_avg_4x8_sve x264_template(pixel_avg_4x8_sve)
+void x264_pixel_avg_4x8_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
+#define x264_pixel_avg_4x4_sve x264_template(pixel_avg_4x4_sve)
+void x264_pixel_avg_4x4_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
+#define x264_pixel_avg_4x2_sve x264_template(pixel_avg_4x2_sve)
+void x264_pixel_avg_4x2_sve ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int );
+
#define x264_pixel_avg2_w4_neon x264_template(pixel_avg2_w4_neon)
void x264_pixel_avg2_w4_neon ( pixel *, intptr_t, pixel *, intptr_t, pixel *, int );
#define x264_pixel_avg2_w8_neon x264_template(pixel_avg2_w8_neon)
@@ -278,64 +287,70 @@ void x264_mc_init_aarch64( uint32_t cpu, x264_mc_functions_t *pf )
pf->prefetch_ref = x264_prefetch_ref_aarch64;
}
- if( !(cpu&X264_CPU_NEON) )
- return;
-
- pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
- pf->mbtree_propagate_list = mbtree_propagate_list_neon;
- pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_neon;
- pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_neon;
-
- pf->memcpy_aligned = x264_memcpy_aligned_neon;
- pf->memzero_aligned = x264_memzero_aligned_neon;
-
- pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
- pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_neon;
- pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_neon;
- pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_neon;
- pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_neon;
- pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_neon;
- pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_neon;
- pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_neon;
- pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_neon;
-
- pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;
- pf->copy[PIXEL_16x16] = x264_mc_copy_w16_neon;
- pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon;
- pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon;
-
- pf->weight = mc_wtab_neon;
- pf->offsetadd = mc_offsetadd_wtab_neon;
- pf->offsetsub = mc_offsetsub_wtab_neon;
- pf->weight_cache = weight_cache_neon;
-
- pf->mc_chroma = x264_mc_chroma_neon;
- pf->mc_luma = mc_luma_neon;
- pf->get_ref = get_ref_neon;
-
- pf->integral_init4h = x264_integral_init4h_neon;
- pf->integral_init8h = x264_integral_init8h_neon;
- pf->integral_init4v = x264_integral_init4v_neon;
- pf->integral_init8v = x264_integral_init8v_neon;
-
- pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
-
- pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
- pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;
-
- pf->store_interleave_chroma = x264_store_interleave_chroma_neon;
-
- pf->plane_copy = plane_copy_neon;
- pf->plane_copy_swap = plane_copy_swap_neon;
- pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon;
- pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
- pf->plane_copy_interleave = plane_copy_interleave_neon;
-
- pf->hpel_filter = x264_hpel_filter_neon;
+ if( cpu&X264_CPU_NEON )
+ {
+ pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_neon;
+ pf->mbtree_propagate_list = mbtree_propagate_list_neon;
+ pf->mbtree_fix8_pack = x264_mbtree_fix8_pack_neon;
+ pf->mbtree_fix8_unpack = x264_mbtree_fix8_unpack_neon;
+
+ pf->memcpy_aligned = x264_memcpy_aligned_neon;
+ pf->memzero_aligned = x264_memzero_aligned_neon;
+
+ pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
+ pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_neon;
+ pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_neon;
+ pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_neon;
+ pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_neon;
+ pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_neon;
+ pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_neon;
+ pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_neon;
+ pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_neon;
+
+ pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;
+ pf->copy[PIXEL_16x16] = x264_mc_copy_w16_neon;
+ pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon;
+ pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon;
+
+ pf->weight = mc_wtab_neon;
+ pf->offsetadd = mc_offsetadd_wtab_neon;
+ pf->offsetsub = mc_offsetsub_wtab_neon;
+ pf->weight_cache = weight_cache_neon;
+
+ pf->mc_chroma = x264_mc_chroma_neon;
+ pf->mc_luma = mc_luma_neon;
+ pf->get_ref = get_ref_neon;
+
+ pf->integral_init4h = x264_integral_init4h_neon;
+ pf->integral_init8h = x264_integral_init8h_neon;
+ pf->integral_init4v = x264_integral_init4v_neon;
+ pf->integral_init8v = x264_integral_init8v_neon;
+
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
+
+ pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
+ pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;
+
+ pf->store_interleave_chroma = x264_store_interleave_chroma_neon;
+
+ pf->plane_copy = plane_copy_neon;
+ pf->plane_copy_swap = plane_copy_swap_neon;
+ pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon;
+ pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
+ pf->plane_copy_interleave = plane_copy_interleave_neon;
+
+ pf->hpel_filter = x264_hpel_filter_neon;
+ }
#if !HIGH_BIT_DEPTH
-
-
-
+#if HAVE_SVE
+ if( cpu&X264_CPU_SVE )
+ {
+ pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_sve;
+ pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_sve;
+ pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_sve;
+ pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_sve;
+ }
+#endif
#endif // !HIGH_BIT_DEPTH
}
=====================================
common/aarch64/pixel-a-common.S
=====================================
@@ -0,0 +1,44 @@
+/****************************************************************************
+ * pixel-a-common.S: aarch64 pixel metrics
+ *****************************************************************************
+ * Copyright (C) 2009-2023 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ * Janne Grunau <janne-x264 at jannau.net>
+ * David Chen <david.chen at myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+// This file contains the NEON macros and constants that are intended to be used by
+// the SVE/SVE2 functions as well
+
+const mask_ac_4_8
+.short 0, -1, -1, -1, 0, -1, -1, -1
+.short 0, -1, -1, -1, -1, -1, -1, -1
+endconst
+
+.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
+ SUMSUB_AB \s1, \d1, \a, \b
+ SUMSUB_AB \s2, \d2, \c, \d
+.endm
+
+.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
+ SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
+ SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
+.endm
=====================================
common/aarch64/pixel-a-sve.S
=====================================
@@ -0,0 +1,523 @@
+/*****************************************************************************
+ * pixel-a-sve.S: aarch64 pixel metrics
+ *****************************************************************************
+ * Copyright (C) 2009-2023 x264 project
+ *
+ * Authors: David Chen <david.chen at myais.com.cn>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+#include "pixel-a-common.S"
+
+.arch armv8-a+sve
+
+#if BIT_DEPTH == 8
+
+.macro SSD_START_SVE_4
+ ptrue p0.h, vl4
+ ld1b {z16.h}, p0/z, [x0]
+ ld1b {z17.h}, p0/z, [x2]
+ add x0, x0, x1
+ add x2, x2, x3
+ sub v2.4h, v16.4h, v17.4h
+ ld1b {z16.h}, p0/z, [x0]
+ ld1b {z17.h}, p0/z, [x2]
+ add x0, x0, x1
+ add x2, x2, x3
+ smull v0.4s, v2.4h, v2.4h
+.endm
+
+.macro SSD_SVE_4
+ sub v2.4h, v16.4h, v17.4h
+ ld1b {z16.h}, p0/z, [x0]
+ ld1b {z17.h}, p0/z, [x2]
+ add x0, x0, x1
+ add x2, x2, x3
+ smlal v0.4s, v2.4h, v2.4h
+.endm
+
+.macro SSD_END_SVE_4
+ sub v2.4h, v16.4h, v17.4h
+ smlal v0.4s, v2.4h, v2.4h
+.endm
+
+.macro SSD_START_SVE_8
+ ptrue p0.h, vl8
+ ld1b {z16.h}, p0/z, [x0]
+ ld1b {z17.h}, p0/z, [x2]
+ add x0, x0, x1
+ add x2, x2, x3
+ sub v2.8h, v16.8h, v17.8h
+ ld1b {z16.h}, p0/z, [x0]
+ smull v0.4s, v2.4h, v2.4h
+ ld1b {z17.h}, p0/z, [x2]
+ smlal2 v0.4s, v2.8h, v2.8h
+ add x0, x0, x1
+ add x2, x2, x3
+.endm
+
+.macro SSD_SVE_8
+ sub v2.8h, v16.8h, v17.8h
+ ld1b {z16.h}, p0/z, [x0]
+ smlal v0.4s, v2.4h, v2.4h
+ ld1b {z17.h}, p0/z, [x2]
+ smlal2 v0.4s, v2.8h, v2.8h
+ add x0, x0, x1
+ add x2, x2, x3
+.endm
+
+.macro SSD_END_SVE_8
+ sub v2.8h, v16.8h, v17.8h
+ smlal v0.4s, v2.4h, v2.4h
+ smlal2 v0.4s, v2.8h, v2.8h
+.endm
+
+.macro SSD_FUNC_SVE w h
+function pixel_ssd_\w\()x\h\()_sve, export=1
+ SSD_START_SVE_\w
+.rept \h-2
+ SSD_SVE_\w
+.endr
+ SSD_END_SVE_\w
+
+ addv s0, v0.4s
+ mov w0, v0.s[0]
+ ret
+endfunc
+.endm
+
+.macro load_diff_fly_sve_8x8
+ ld1b {z1.h}, p0/z, [x2]
+ ld1b {z0.h}, p0/z, [x0]
+ add x2, x2, x3
+ add x0, x0, x1
+ ld1b {z3.h}, p0/z, [x2]
+ ld1b {z2.h}, p0/z, [x0]
+ add x2, x2, x3
+ add x0, x0, x1
+ sub v16.8h, v0.8h, v1.8h
+ sub v17.8h, v2.8h, v3.8h
+ ld1b {z5.h}, p0/z, [x2]
+ ld1b {z4.h}, p0/z, [x0]
+ add x2, x2, x3
+ add x0, x0, x1
+ ld1b {z7.h}, p0/z, [x2]
+ ld1b {z6.h}, p0/z, [x0]
+ add x2, x2, x3
+ add x0, x0, x1
+ sub v18.8h, v4.8h, v5.8h
+ sub v19.8h, v6.8h, v7.8h
+ ld1b {z1.h}, p0/z, [x2]
+ ld1b {z0.h}, p0/z, [x0]
+ add x2, x2, x3
+ add x0, x0, x1
+ ld1b {z3.h}, p0/z, [x2]
+ ld1b {z2.h}, p0/z, [x0]
+ add x2, x2, x3
+ add x0, x0, x1
+ sub v20.8h, v0.8h, v1.8h
+ sub v21.8h, v2.8h, v3.8h
+ ld1b {z5.h}, p0/z, [x2]
+ ld1b {z4.h}, p0/z, [x0]
+ add x2, x2, x3
+ add x0, x0, x1
+ ld1b {z7.h}, p0/z, [x2]
+ ld1b {z6.h}, p0/z, [x0]
+ add x2, x2, x3
+ add x0, x0, x1
+
+ SUMSUB_AB v0.8h, v1.8h, v16.8h, v17.8h
+ SUMSUB_AB v2.8h, v3.8h, v18.8h, v19.8h
+
+ sub v22.8h, v4.8h, v5.8h
+ sub v23.8h, v6.8h, v7.8h
+.endm
+
+.macro pixel_var_sve_8 h
+function pixel_var_8x\h\()_sve, export=1
+ ptrue p0.h, vl8
+ ld1b {z16.h}, p0/z, [x0]
+ add x0, x0, x1
+ ld1b {z17.h}, p0/z, [x0]
+ add x0, x0, x1
+ mov x2, \h - 4
+ mul v1.8h, v16.8h, v16.8h
+ mul v2.8h, v17.8h, v17.8h
+ add v0.8h, v16.8h, v17.8h
+ ld1b {z18.h}, p0/z, [x0]
+ add x0, x0, x1
+ uaddlp v1.4s, v1.8h
+ uaddlp v2.4s, v2.8h
+ ld1b {z19.h}, p0/z, [x0]
+ add x0, x0, x1
+
+1: subs x2, x2, #4
+ add v0.8h, v0.8h, v18.8h
+ mul v24.8h, v18.8h, v18.8h
+ ld1b {z20.h}, p0/z, [x0]
+ add x0, x0, x1
+ add v0.8h, v0.8h, v19.8h
+ mul v25.8h, v19.8h, v19.8h
+ uadalp v1.4s, v24.8h
+ ld1b {z21.h}, p0/z, [x0]
+ add x0, x0, x1
+ add v0.8h, v0.8h, v20.8h
+ mul v26.8h, v20.8h, v20.8h
+ uadalp v2.4s, v25.8h
+ ld1b {z18.h}, p0/z, [x0]
+ add x0, x0, x1
+ add v0.8h, v0.8h, v21.8h
+ mul v27.8h, v21.8h, v21.8h
+ uadalp v1.4s, v26.8h
+ ld1b {z19.h}, p0/z, [x0]
+ add x0, x0, x1
+ uadalp v2.4s, v27.8h
+ b.gt 1b
+
+ add v0.8h, v0.8h, v18.8h
+ mul v28.8h, v18.8h, v18.8h
+ add v0.8h, v0.8h, v19.8h
+ mul v29.8h, v19.8h, v19.8h
+ uadalp v1.4s, v28.8h
+ uadalp v2.4s, v29.8h
+
+ b var_end
+endfunc
+.endm
+
+function var_end
+ add v1.4s, v1.4s, v2.4s
+ uaddlv s0, v0.8h
+ uaddlv d1, v1.4s
+ mov w0, v0.s[0]
+ mov x1, v1.d[0]
+ orr x0, x0, x1, lsl #32
+ ret
+endfunc
+
+.macro SUMSUBL_AB_SVE sum, sub, a, b
+ add \sum, \a, \b
+ sub \sub, \a, \b
+.endm
+
+function pixel_sa8d_8x8_sve, export=1
+ ptrue p0.h, vl8
+ mov x4, x30
+ bl pixel_sa8d_8x8_sve
+ add v0.8h, v0.8h, v1.8h
+ uaddlv s0, v0.8h
+ mov w0, v0.s[0]
+ add w0, w0, #1
+ lsr w0, w0, #1
+ ret x4
+endfunc
+
+.macro sa8d_satd_sve_8x8 satd=
+function pixel_sa8d_\satd\()8x8_sve
+ load_diff_fly_sve_8x8
+
+ SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h
+ SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h
+
+ HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
+.ifc \satd, satd_
+ transpose v0.8h, v1.8h, v16.8h, v17.8h
+ transpose v2.8h, v3.8h, v18.8h, v19.8h
+ transpose v4.8h, v5.8h, v20.8h, v21.8h
+ transpose v6.8h, v7.8h, v22.8h, v23.8h
+
+ SUMSUB_AB v24.8h, v25.8h, v0.8h, v1.8h
+ SUMSUB_AB v26.8h, v27.8h, v2.8h, v3.8h
+ SUMSUB_AB v0.8h, v1.8h, v4.8h, v5.8h
+ SUMSUB_AB v2.8h, v3.8h, v6.8h, v7.8h
+
+ transpose v4.4s, v6.4s, v24.4s, v26.4s
+ transpose v5.4s, v7.4s, v25.4s, v27.4s
+ transpose v24.4s, v26.4s, v0.4s, v2.4s
+ transpose v25.4s, v27.4s, v1.4s, v3.4s
+
+ abs v0.8h, v4.8h
+ abs v1.8h, v5.8h
+ abs v2.8h, v6.8h
+ abs v3.8h, v7.8h
+ abs v4.8h, v24.8h
+ abs v5.8h, v25.8h
+ abs v6.8h, v26.8h
+ abs v7.8h, v27.8h
+
+ umax v0.8h, v0.8h, v2.8h
+ umax v1.8h, v1.8h, v3.8h
+ umax v2.8h, v4.8h, v6.8h
+ umax v3.8h, v5.8h, v7.8h
+
+ add v26.8h, v0.8h, v1.8h
+ add v27.8h, v2.8h, v3.8h
+.endif
+
+ SUMSUB_AB v0.8h, v16.8h, v16.8h, v20.8h
+ SUMSUB_AB v1.8h, v17.8h, v17.8h, v21.8h
+ SUMSUB_AB v2.8h, v18.8h, v18.8h, v22.8h
+ SUMSUB_AB v3.8h, v19.8h, v19.8h, v23.8h
+
+ transpose v20.8h, v21.8h, v16.8h, v17.8h
+ transpose v4.8h, v5.8h, v0.8h, v1.8h
+ transpose v22.8h, v23.8h, v18.8h, v19.8h
+ transpose v6.8h, v7.8h, v2.8h, v3.8h
+
+ SUMSUB_AB v2.8h, v3.8h, v20.8h, v21.8h
+ SUMSUB_AB v24.8h, v25.8h, v4.8h, v5.8h
+ SUMSUB_AB v0.8h, v1.8h, v22.8h, v23.8h
+ SUMSUB_AB v4.8h, v5.8h, v6.8h, v7.8h
+
+ transpose v20.4s, v22.4s, v2.4s, v0.4s
+ transpose v21.4s, v23.4s, v3.4s, v1.4s
+ transpose v16.4s, v18.4s, v24.4s, v4.4s
+ transpose v17.4s, v19.4s, v25.4s, v5.4s
+
+ SUMSUB_AB v0.8h, v2.8h, v20.8h, v22.8h
+ SUMSUB_AB v1.8h, v3.8h, v21.8h, v23.8h
+ SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h
+ SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h
+
+ transpose v16.2d, v20.2d, v0.2d, v4.2d
+ transpose v17.2d, v21.2d, v1.2d, v5.2d
+ transpose v18.2d, v22.2d, v2.2d, v6.2d
+ transpose v19.2d, v23.2d, v3.2d, v7.2d
+
+ abs v16.8h, v16.8h
+ abs v20.8h, v20.8h
+ abs v17.8h, v17.8h
+ abs v21.8h, v21.8h
+ abs v18.8h, v18.8h
+ abs v22.8h, v22.8h
+ abs v19.8h, v19.8h
+ abs v23.8h, v23.8h
+
+ umax v16.8h, v16.8h, v20.8h
+ umax v17.8h, v17.8h, v21.8h
+ umax v18.8h, v18.8h, v22.8h
+ umax v19.8h, v19.8h, v23.8h
+
+ add v0.8h, v16.8h, v17.8h
+ add v1.8h, v18.8h, v19.8h
+
+ ret
+endfunc
+.endm
+
+.macro HADAMARD_AC_SVE w h
+function pixel_hadamard_ac_\w\()x\h\()_sve, export=1
+ ptrue p0.h, vl8
+ movrel x5, mask_ac_4_8
+ mov x4, x30
+ ld1 {v30.8h,v31.8h}, [x5]
+ movi v28.16b, #0
+ movi v29.16b, #0
+
+ bl hadamard_ac_8x8_sve
+.if \h > 8
+ bl hadamard_ac_8x8_sve
+.endif
+.if \w > 8
+ sub x0, x0, x1, lsl #3
+ add x0, x0, #8
+ bl hadamard_ac_8x8_sve
+.endif
+.if \w * \h == 256
+ sub x0, x0, x1, lsl #4
+ bl hadamard_ac_8x8_sve
+.endif
+
+ addv s1, v29.4s
+ addv s0, v28.4s
+ mov w1, v1.s[0]
+ mov w0, v0.s[0]
+ lsr w1, w1, #2
+ lsr w0, w0, #1
+ orr x0, x0, x1, lsl #32
+ ret x4
+endfunc
+.endm
+
+// v28: satd v29: sa8d v30: mask_ac4 v31: mask_ac8
+function hadamard_ac_8x8_sve
+ ld1b {z16.h}, p0/z, [x0]
+ add x0, x0, x1
+ ld1b {z17.h}, p0/z, [x0]
+ add x0, x0, x1
+ ld1b {z18.h}, p0/z, [x0]
+ add x0, x0, x1
+ ld1b {z19.h}, p0/z, [x0]
+ add x0, x0, x1
+ SUMSUBL_AB_SVE v0.8h, v1.8h, v16.8h, v17.8h
+ ld1b {z20.h}, p0/z, [x0]
+ add x0, x0, x1
+ ld1b {z21.h}, p0/z, [x0]
+ add x0, x0, x1
+ SUMSUBL_AB_SVE v2.8h, v3.8h, v18.8h, v19.8h
+ ld1b {z22.h}, p0/z, [x0]
+ add x0, x0, x1
+ ld1b {z23.h}, p0/z, [x0]
+ add x0, x0, x1
+ SUMSUBL_AB_SVE v4.8h, v5.8h, v20.8h, v21.8h
+ SUMSUBL_AB_SVE v6.8h, v7.8h, v22.8h, v23.8h
+
+ SUMSUB_ABCD v16.8h, v18.8h, v17.8h, v19.8h, v0.8h, v2.8h, v1.8h, v3.8h
+ SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h
+
+ transpose v0.8h, v1.8h, v16.8h, v17.8h
+ transpose v2.8h, v3.8h, v18.8h, v19.8h
+ transpose v4.8h, v5.8h, v20.8h, v21.8h
+ transpose v6.8h, v7.8h, v22.8h, v23.8h
+
+ SUMSUB_AB v16.8h, v17.8h, v0.8h, v1.8h
+ SUMSUB_AB v18.8h, v19.8h, v2.8h, v3.8h
+ SUMSUB_AB v20.8h, v21.8h, v4.8h, v5.8h
+ SUMSUB_AB v22.8h, v23.8h, v6.8h, v7.8h
+
+ transpose v0.4s, v2.4s, v16.4s, v18.4s
+ transpose v1.4s, v3.4s, v17.4s, v19.4s
+ transpose v4.4s, v6.4s, v20.4s, v22.4s
+ transpose v5.4s, v7.4s, v21.4s, v23.4s
+
+ SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h
+ SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h
+ SUMSUB_ABCD v20.8h, v22.8h, v21.8h, v23.8h, v4.8h, v6.8h, v5.8h, v7.8h
+
+ abs v0.8h, v16.8h
+ abs v4.8h, v20.8h
+ abs v1.8h, v17.8h
+ abs v5.8h, v21.8h
+ abs v2.8h, v18.8h
+ abs v6.8h, v22.8h
+ abs v3.8h, v19.8h
+ abs v7.8h, v23.8h
+
+ add v0.8h, v0.8h, v4.8h
+ add v1.8h, v1.8h, v5.8h
+ and v0.16b, v0.16b, v30.16b
+ add v2.8h, v2.8h, v6.8h
+ add v3.8h, v3.8h, v7.8h
+ add v0.8h, v0.8h, v2.8h
+ add v1.8h, v1.8h, v3.8h
+ uadalp v28.4s, v0.8h
+ uadalp v28.4s, v1.8h
+
+ SUMSUB_AB v6.8h, v7.8h, v23.8h, v19.8h
+ SUMSUB_AB v4.8h, v5.8h, v22.8h, v18.8h
+ SUMSUB_AB v2.8h, v3.8h, v21.8h, v17.8h
+ SUMSUB_AB v1.8h, v0.8h, v16.8h, v20.8h
+
+ transpose v16.2d, v17.2d, v6.2d, v7.2d
+ transpose v18.2d, v19.2d, v4.2d, v5.2d
+ transpose v20.2d, v21.2d, v2.2d, v3.2d
+
+ abs v16.8h, v16.8h
+ abs v17.8h, v17.8h
+ abs v18.8h, v18.8h
+ abs v19.8h, v19.8h
+ abs v20.8h, v20.8h
+ abs v21.8h, v21.8h
+
+ transpose v7.2d, v6.2d, v1.2d, v0.2d
+
+ umax v3.8h, v16.8h, v17.8h
+ umax v2.8h, v18.8h, v19.8h
+ umax v1.8h, v20.8h, v21.8h
+
+ SUMSUB_AB v4.8h, v5.8h, v7.8h, v6.8h
+
+ add v2.8h, v2.8h, v3.8h
+ add v2.8h, v2.8h, v1.8h
+ and v4.16b, v4.16b, v31.16b
+ add v2.8h, v2.8h, v2.8h
+ abs v5.8h, v5.8h
+ abs v4.8h, v4.8h
+ add v2.8h, v2.8h, v5.8h
+ add v2.8h, v2.8h, v4.8h
+ uadalp v29.4s, v2.8h
+ ret
+endfunc
+
+SSD_FUNC_SVE 4, 4
+SSD_FUNC_SVE 4, 8
+SSD_FUNC_SVE 4, 16
+SSD_FUNC_SVE 8, 4
+SSD_FUNC_SVE 8, 8
+
+pixel_var_sve_8 8
+pixel_var_sve_8 16
+
+sa8d_satd_sve_8x8
+
+HADAMARD_AC_SVE 8, 8
+HADAMARD_AC_SVE 8, 16
+HADAMARD_AC_SVE 16, 8
+HADAMARD_AC_SVE 16, 16
+
+#else /* BIT_DEPTH == 10 */
+
+.macro SSD_START_SVE_4
+ ptrue p0.s, vl4
+ ld1h {z16.s}, p0/z, [x0]
+ ld1h {z17.s}, p0/z, [x2]
+ add x0, x0, x1, lsl #1
+ add x2, x2, x3, lsl #1
+ sub v2.4s, v16.4s, v17.4s
+ ld1h {z16.s}, p0/z, [x0]
+ ld1h {z17.s}, p0/z, [x2]
+ add x0, x0, x1, lsl #1
+ add x2, x2, x3, lsl #1
+ mul v0.4s, v2.4s, v2.4s
+.endm
+
+.macro SSD_SVE_4
+ sub v2.4s, v16.4s, v17.4s
+ ld1h {z16.s}, p0/z, [x0]
+ ld1h {z17.s}, p0/z, [x2]
+ add x0, x0, x1, lsl #1
+ add x2, x2, x3, lsl #1
+ mla v0.4s, v2.4s, v2.4s
+.endm
+
+.macro SSD_END_SVE_4
+ sub v2.4s, v16.4s, v17.4s
+ mla v0.4s, v2.4s, v2.4s
+.endm
+
+.macro SSD_FUNC_SVE w h
+function pixel_ssd_\w\()x\h\()_sve, export=1
+ SSD_START_SVE_\w
+.rept \h-2
+ SSD_SVE_\w
+.endr
+ SSD_END_SVE_\w
+
+ addv s0, v0.4s
+ fmov w0, s0
+ ret
+endfunc
+.endm
+
+SSD_FUNC_SVE 4, 4
+SSD_FUNC_SVE 4, 8
+SSD_FUNC_SVE 4, 16
+
+#endif /* BIT_DEPTH == 8 */
=====================================
common/aarch64/pixel-a.S
=====================================
@@ -25,6 +25,7 @@
*****************************************************************************/
#include "asm.S"
+#include "pixel-a-common.S"
const mask
.rept 16
@@ -35,26 +36,11 @@ const mask
.endr
endconst
-const mask_ac_4_8
-.short 0, -1, -1, -1, 0, -1, -1, -1
-.short 0, -1, -1, -1, -1, -1, -1, -1
-endconst
-
.macro SUMSUBL_AB sum, sub, a, b
uaddl \sum, \a, \b
usubl \sub, \a, \b
.endm
-.macro SUMSUB_ABCD s1, d1, s2, d2, a, b, c, d
- SUMSUB_AB \s1, \d1, \a, \b
- SUMSUB_AB \s2, \d2, \c, \d
-.endm
-
-.macro HADAMARD4_V r1, r2, r3, r4, t1, t2, t3, t4
- SUMSUB_ABCD \t1, \t2, \t3, \t4, \r1, \r2, \r3, \r4
- SUMSUB_ABCD \r1, \r3, \r2, \r4, \t1, \t3, \t2, \t4
-.endm
-
#if BIT_DEPTH == 8
.macro SAD_START_4
=====================================
common/aarch64/pixel.h
=====================================
@@ -65,6 +65,11 @@
#define x264_pixel_ssd_8x16_neon x264_template(pixel_ssd_8x16_neon)
#define x264_pixel_ssd_8x4_neon x264_template(pixel_ssd_8x4_neon)
#define x264_pixel_ssd_8x8_neon x264_template(pixel_ssd_8x8_neon)
+#define x264_pixel_ssd_4x16_sve x264_template(pixel_ssd_4x16_sve)
+#define x264_pixel_ssd_4x4_sve x264_template(pixel_ssd_4x4_sve)
+#define x264_pixel_ssd_4x8_sve x264_template(pixel_ssd_4x8_sve)
+#define x264_pixel_ssd_8x4_sve x264_template(pixel_ssd_8x4_sve)
+#define x264_pixel_ssd_8x8_sve x264_template(pixel_ssd_8x8_sve)
#define DECL_PIXELS( ret, name, suffix, args ) \
ret x264_pixel_##name##_16x16_##suffix args;\
ret x264_pixel_##name##_16x8_##suffix args;\
@@ -73,10 +78,18 @@
ret x264_pixel_##name##_8x4_##suffix args;\
ret x264_pixel_##name##_4x16_##suffix args;\
ret x264_pixel_##name##_4x8_##suffix args;\
- ret x264_pixel_##name##_4x4_##suffix args;\
+ ret x264_pixel_##name##_4x4_##suffix args;
+#define DECL_PIXELS_SSD_SVE( ret, args ) \
+ ret x264_pixel_ssd_8x8_sve args;\
+ ret x264_pixel_ssd_8x4_sve args;\
+ ret x264_pixel_ssd_4x16_sve args;\
+ ret x264_pixel_ssd_4x8_sve args;\
+ ret x264_pixel_ssd_4x4_sve args;
#define DECL_X1( name, suffix ) \
DECL_PIXELS( int, name, suffix, ( pixel *, intptr_t, pixel *, intptr_t ) )
+#define DECL_X1_SSD_SVE( ) \
+ DECL_PIXELS_SSD_SVE( int, ( pixel *, intptr_t, pixel *, intptr_t ) )
#define DECL_X4( name, suffix ) \
DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )\
@@ -86,6 +99,7 @@ DECL_X1( sad, neon )
DECL_X4( sad, neon )
DECL_X1( satd, neon )
DECL_X1( ssd, neon )
+DECL_X1_SSD_SVE( )
#define x264_pixel_ssd_nv12_core_neon x264_template(pixel_ssd_nv12_core_neon)
@@ -100,6 +114,8 @@ int x264_pixel_sa8d_8x8_neon ( pixel *, intptr_t, pixel *, intptr_t );
int x264_pixel_sa8d_16x16_neon( pixel *, intptr_t, pixel *, intptr_t );
#define x264_pixel_sa8d_satd_16x16_neon x264_template(pixel_sa8d_satd_16x16_neon)
uint64_t x264_pixel_sa8d_satd_16x16_neon( pixel *, intptr_t, pixel *, intptr_t );
+#define x264_pixel_sa8d_8x8_sve x264_template(pixel_sa8d_8x8_sve)
+int x264_pixel_sa8d_8x8_sve ( pixel *, intptr_t, pixel *, intptr_t );
#define x264_pixel_var_8x8_neon x264_template(pixel_var_8x8_neon)
uint64_t x264_pixel_var_8x8_neon ( pixel *, intptr_t );
@@ -111,6 +127,11 @@ uint64_t x264_pixel_var_16x16_neon( pixel *, intptr_t );
int x264_pixel_var2_8x8_neon ( pixel *, pixel *, int * );
#define x264_pixel_var2_8x16_neon x264_template(pixel_var2_8x16_neon)
int x264_pixel_var2_8x16_neon( pixel *, pixel *, int * );
+#define x264_pixel_var_8x8_sve x264_template(pixel_var_8x8_sve)
+uint64_t x264_pixel_var_8x8_sve ( pixel *, intptr_t );
+#define x264_pixel_var_8x16_sve x264_template(pixel_var_8x16_sve)
+uint64_t x264_pixel_var_8x16_sve ( pixel *, intptr_t );
+
#define x264_pixel_hadamard_ac_8x8_neon x264_template(pixel_hadamard_ac_8x8_neon)
uint64_t x264_pixel_hadamard_ac_8x8_neon ( pixel *, intptr_t );
@@ -120,6 +141,15 @@ uint64_t x264_pixel_hadamard_ac_8x16_neon ( pixel *, intptr_t );
uint64_t x264_pixel_hadamard_ac_16x8_neon ( pixel *, intptr_t );
#define x264_pixel_hadamard_ac_16x16_neon x264_template(pixel_hadamard_ac_16x16_neon)
uint64_t x264_pixel_hadamard_ac_16x16_neon( pixel *, intptr_t );
+#define x264_pixel_hadamard_ac_8x8_sve x264_template(pixel_hadamard_ac_8x8_sve)
+uint64_t x264_pixel_hadamard_ac_8x8_sve ( pixel *, intptr_t );
+#define x264_pixel_hadamard_ac_8x16_sve x264_template(pixel_hadamard_ac_8x16_sve)
+uint64_t x264_pixel_hadamard_ac_8x16_sve ( pixel *, intptr_t );
+#define x264_pixel_hadamard_ac_16x8_sve x264_template(pixel_hadamard_ac_16x8_sve)
+uint64_t x264_pixel_hadamard_ac_16x8_sve ( pixel *, intptr_t );
+#define x264_pixel_hadamard_ac_16x16_sve x264_template(pixel_hadamard_ac_16x16_sve)
+uint64_t x264_pixel_hadamard_ac_16x16_sve( pixel *, intptr_t );
+
#define x264_pixel_ssim_4x4x2_core_neon x264_template(pixel_ssim_4x4x2_core_neon)
void x264_pixel_ssim_4x4x2_core_neon( const pixel *, intptr_t,
=====================================
common/dct.c
=====================================
@@ -707,6 +707,18 @@ void x264_dct_init( uint32_t cpu, x264_dct_function_t *dctf )
dctf->add16x16_idct8= x264_add16x16_idct8_neon;
dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_neon;
}
+#if HAVE_SVE
+ if ( cpu&X264_CPU_SVE )
+ {
+ dctf->sub4x4_dct = x264_sub4x4_dct_sve;
+ }
+#endif
+#if HAVE_SVE2
+ if ( cpu&X264_CPU_SVE2 )
+ {
+ dctf->add4x4_idct = x264_add4x4_idct_sve2;
+ }
+#endif
#endif
#if HAVE_MSA
@@ -1105,6 +1117,12 @@ void x264_zigzag_init( uint32_t cpu, x264_zigzag_function_t *pf_progressive, x26
pf_interlaced->interleave_8x8_cavlc =
pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_neon;
}
+#if HAVE_SVE
+ if( cpu&X264_CPU_SVE )
+ {
+ pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sve;
+ }
+#endif
#endif // HAVE_AARCH64
#if HAVE_ALTIVEC
=====================================
common/deblock.c
=====================================
@@ -803,6 +803,12 @@ void x264_deblock_init( uint32_t cpu, x264_deblock_function_t *pf, int b_mbaff )
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_neon;
pf->deblock_strength = x264_deblock_strength_neon;
}
+#if HAVE_SVE
+ if ( cpu&X264_CPU_SVE )
+ {
+ pf->deblock_chroma[1] = x264_deblock_v_chroma_sve;
+ }
+#endif
#endif
#if HAVE_MSA
=====================================
common/pixel.c
=====================================
@@ -829,12 +829,32 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
#define INIT8_NAME( name1, name2, cpu ) \
INIT7_NAME( name1, name2, cpu ) \
pixf->name1[PIXEL_4x16] = x264_pixel_##name2##_4x16##cpu;
+#if HAVE_SVE
+#define INIT7_NAME_SVE_SSD_10BIT( ) \
+ pixf->ssd[PIXEL_4x4] = x264_pixel_ssd_4x4_sve; \
+ pixf->ssd[PIXEL_4x8] = x264_pixel_ssd_4x8_sve;
+#endif
+#if HAVE_SVE
+#define INIT8_NAME_SVE_SSD( ) \
+ pixf->ssd[PIXEL_8x8] = x264_pixel_ssd_8x8_sve; \
+ pixf->ssd[PIXEL_8x4] = x264_pixel_ssd_8x4_sve; \
+ pixf->ssd[PIXEL_4x8] = x264_pixel_ssd_4x8_sve; \
+ pixf->ssd[PIXEL_4x4] = x264_pixel_ssd_4x4_sve; \
+ pixf->ssd[PIXEL_4x16] = x264_pixel_ssd_4x16_sve;
+#define INIT8_NAME_SVE_SSD_10BIT() \
+ INIT7_NAME_SVE_SSD_10BIT() \
+ pixf->ssd[PIXEL_4x16] = x264_pixel_ssd_4x16_sve;
+#endif
#define INIT2( name, cpu ) INIT2_NAME( name, name, cpu )
#define INIT4( name, cpu ) INIT4_NAME( name, name, cpu )
#define INIT5( name, cpu ) INIT5_NAME( name, name, cpu )
#define INIT6( name, cpu ) INIT6_NAME( name, name, cpu )
#define INIT7( name, cpu ) INIT7_NAME( name, name, cpu )
#define INIT8( name, cpu ) INIT8_NAME( name, name, cpu )
+#if HAVE_SVE
+#define INIT8_SVE_SSD( ) INIT8_NAME_SVE_SSD( )
+#define INIT8_SVE_SSD_10BIT( ) INIT8_NAME_SVE_SSD_10BIT( )
+#endif
#define INIT_ADS( cpu ) \
pixf->ads[PIXEL_16x16] = x264_pixel_ads4##cpu;\
@@ -1086,6 +1106,12 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon;
pixf->ssim_end4 = x264_pixel_ssim_end4_neon;
}
+#if HAVE_SVE
+ if( cpu&X264_CPU_SVE )
+ {
+ INIT8_SVE_SSD_10BIT();
+ }
+#endif
#endif // HAVE_AARCH64
#else // !HIGH_BIT_DEPTH
@@ -1499,6 +1525,18 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon;
pixf->ssim_end4 = x264_pixel_ssim_end4_neon;
}
+#if HAVE_SVE
+ if( cpu&X264_CPU_SVE )
+ {
+ INIT8_SVE_SSD( );
+ INIT4( hadamard_ac, _sve );
+
+ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sve;
+
+ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sve;
+ pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_sve;
+ }
+#endif
#endif // HAVE_AARCH64
#if HAVE_MSA
View it on GitLab: https://code.videolan.org/videolan/x264/-/compare/c196240409e4d7c01b47448d93b1f9683aaa7cf7...c1c9931dc87289b8aeba78150467f17bdb97d019