[x264-devel] [Git][videolan/x264][master] Provide implementations for functions using the SDOT/UDOT instructions from the Armv8 DotProd extension
Martin Storsjö (@mstorsjo)
gitlab at videolan.org
Wed Mar 19 07:50:49 UTC 2025
Martin Storsjö pushed to branch master at VideoLAN / x264
Commits:
fe9e4a7f by Konstantinos Margaritis at 2025-03-12T12:35:10+00:00
Provide implementations for functions using the SDOT/UDOT instructions from the Armv8 DotProd extension.
Functions implemented:
sad_16x8, sad_16x16,
sad_x3_16x8, sad_x3_16x16,
sad_x4_16x8, sad_x4_16x16,
ssd_8x4, ssd_8x8, ssd_8x16, ssd_16x8, ssd_16x16,
vsad
Performance improvement over the plain Neon implementations ranges from 5% to 188%.
Below is the output of ./checkasm8 --bench, run on a Graviton4 system:
sad_16x8_c: 1323
sad_16x8_neon: 224
sad_16x8_dotprod: 211
sad_16x16_c: 2619
sad_16x16_neon: 365
sad_16x16_dotprod: 320
sad_x3_16x8_c: 3836
sad_x3_16x8_neon: 403
sad_x3_16x8_dotprod: 317
sad_x3_16x16_c: 7725
sad_x3_16x16_neon: 714
sad_x3_16x16_dotprod: 532
sad_x4_16x8_c: 5080
sad_x4_16x8_neon: 438
sad_x4_16x8_dotprod: 375
sad_x4_16x16_c: 10260
sad_x4_16x16_neon: 794
sad_x4_16x16_dotprod: 655
ssd_8x4_c: 381
ssd_8x4_neon: 157
ssd_8x4_dotprod: 115
ssd_8x4_sve: 150
ssd_8x8_c: 695
ssd_8x8_neon: 238
ssd_8x8_dotprod: 161
ssd_8x8_sve: 228
ssd_8x16_c: 1335
ssd_8x16_neon: 388
ssd_8x16_dotprod: 267
ssd_16x8_c: 1342
ssd_16x8_neon: 285
ssd_16x8_dotprod: 166
ssd_16x16_c: 2623
ssd_16x16_neon: 503
ssd_16x16_dotprod: 277
vsad_c: 2786
vsad_neon: 311
vsad_dotprod: 235
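For orientation, the core trick behind these numbers: UABD produces per-byte absolute differences, and UDOT against a vector of ones then sums groups of four bytes directly into 32-bit accumulator lanes, replacing the UABDL/UABAL widening chains of the plain Neon path. A minimal C sketch of the same idea using ACLE Neon intrinsics (the function name and loop shape here are illustrative, not the committed code, which is hand-written assembly):

    #include <arm_neon.h>
    #include <stdint.h>

    /* Illustrative 16-wide SAD via the DotProd extension; compile with
     * -march=armv8.2-a+dotprod or similar. vabdq_u8 gives |a-b| per byte,
     * and vdotq_u32 against all-ones sums four bytes per 32-bit lane. */
    static uint32_t sad_16xh_dotprod( const uint8_t *pix1, intptr_t stride1,
                                      const uint8_t *pix2, intptr_t stride2,
                                      int height )
    {
        uint32x4_t acc = vdupq_n_u32( 0 );
        const uint8x16_t ones = vdupq_n_u8( 1 );
        for( int y = 0; y < height; y++ )
        {
            uint8x16_t a = vld1q_u8( pix1 + y * stride1 );
            uint8x16_t b = vld1q_u8( pix2 + y * stride2 );
            acc = vdotq_u32( acc, vabdq_u8( a, b ), ones ); /* udot */
        }
        return vaddvq_u32( acc ); /* addv s0, vN.4s */
    }

The sad_x3/sad_x4 variants apply the same uabd+udot pairing to three or four candidate blocks at once, keeping one 32-bit accumulator vector per candidate, and vsad applies it to consecutive rows of a single plane.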
- - - - -
3 changed files:
- common/aarch64/pixel-a.S
- common/aarch64/pixel.h
- common/pixel.c
Changes:
=====================================
common/aarch64/pixel-a.S
=====================================
@@ -77,26 +77,42 @@ endconst
uabal v17.8h, v2.8b, v3.8b
.endm
-.macro SAD_START_16
+.macro SAD_START_16, dotprod=0
ld1 {v1.16b}, [x2], x3
ld1 {v0.16b}, [x0], x1
ld1 {v3.16b}, [x2], x3
ld1 {v2.16b}, [x0], x1
+.if \dotprod == 0
uabdl v16.8h, v0.8b, v1.8b
uabdl2 v17.8h, v0.16b, v1.16b
uabal v16.8h, v2.8b, v3.8b
uabal2 v17.8h, v2.16b, v3.16b
+.else
+ movi v18.4s, #0x0
+ movi v19.16b, #0x1
+ uabd v16.16b, v0.16b, v1.16b
+ uabd v17.16b, v2.16b, v3.16b
+ udot v18.4s, v16.16b, v19.16b
+ udot v18.4s, v17.16b, v19.16b
+.endif
.endm
-.macro SAD_16
+.macro SAD_16, dotprod=0
ld1 {v1.16b}, [x2], x3
ld1 {v0.16b}, [x0], x1
ld1 {v3.16b}, [x2], x3
ld1 {v2.16b}, [x0], x1
+.if \dotprod == 0
uabal v16.8h, v0.8b, v1.8b
uabal2 v17.8h, v0.16b, v1.16b
uabal v16.8h, v2.8b, v3.8b
uabal2 v17.8h, v2.16b, v3.16b
+.else
+ uabd v16.16b, v0.16b, v1.16b
+ uabd v17.16b, v2.16b, v3.16b
+ udot v18.4s, v16.16b, v19.16b
+ udot v18.4s, v17.16b, v19.16b
+.endif
.endm
.macro SAD_FUNC w, h, name
@@ -115,6 +131,19 @@ function pixel_sad\name\()_\w\()x\h\()_neon, export=1
endfunc
.endm
+.macro SAD_FUNC_DOTPROD w, h, name
+function pixel_sad\name\()_\w\()x\h\()_neon_dotprod, export=1
+ SAD_START_\w 1
+
+.rept \h / 2 - 1
+ SAD_\w 1
+.endr
+ addv s0, v18.4s
+ fmov w0, s0
+ ret
+endfunc
+.endm
+
.macro SAD_X_4 x, first=uabal
ld1 {v0.s}[0], [x0], x7
ld1 {v1.s}[0], [x1], x5
@@ -232,6 +261,75 @@ function pixel_sad_x\x\()_\w\()x\h\()_neon, export=1
endfunc
.endm
+.macro SAD_X_DOTPROD_16 x
+ ld1 {v0.16b}, [x0], x7
+ ld1 {v1.16b}, [x1], x5
+ ld1 {v2.16b}, [x2], x5
+ uabd v20.16b, v1.16b, v0.16b
+ uabd v22.16b, v2.16b, v0.16b
+ ld1 {v5.16b}, [x0], x7
+ udot v16.4s, v20.16b, v28.16b
+ udot v17.4s, v22.16b, v28.16b
+ ld1 {v3.16b}, [x3], x5
+ ld1 {v1.16b}, [x1], x5
+ uabd v24.16b, v3.16b, v0.16b
+ uabd v21.16b, v1.16b, v5.16b
+ ld1 {v2.16b}, [x2], x5
+ ld1 {v3.16b}, [x3], x5
+ udot v18.4s, v24.16b, v28.16b
+ udot v16.4s, v21.16b, v28.16b
+ uabd v23.16b, v2.16b, v5.16b
+ uabd v25.16b, v3.16b, v5.16b
+ udot v17.4s, v23.16b, v28.16b
+ udot v18.4s, v25.16b, v28.16b
+.if \x == 4
+ ld1 {v4.16b}, [x4], x5
+ ld1 {v1.16b}, [x4], x5
+ uabd v26.16b, v4.16b, v0.16b
+ uabd v27.16b, v1.16b, v5.16b
+ udot v19.4s, v26.16b, v28.16b
+ udot v19.4s, v27.16b, v28.16b
+.endif
+.endm
+
+.macro SAD_X_DOTPROD_FUNC x, w, h
+function pixel_sad_x\x\()_\w\()x\h\()_neon_dotprod, export=1
+ movi v16.4s, #0x0
+ movi v17.4s, #0x0
+ movi v18.4s, #0x0
+.if \x == 4
+ movi v19.4s, #0x0
+.endif
+ movi v28.16b, #0x1
+
+.if \x == 3
+ mov x6, x5
+ mov x5, x4
+.endif
+ mov x7, #FENC_STRIDE
+
+ SAD_X_DOTPROD_\w \x
+
+.rept \h / 2 - 1
+ SAD_X_DOTPROD_\w \x
+.endr
+
+ addv s0, v16.4s
+ addv s1, v17.4s
+ addv s2, v18.4s
+.if \x == 4
+ addv s3, v19.4s
+.endif
+ stp s0, s1, [x6], #8
+.if \x == 3
+ str s2, [x6]
+.else
+ stp s2, s3, [x6]
+.endif
+ ret
+endfunc
+.endm
+
function pixel_vsad_neon, export=1
subs w2, w2, #2
ld1 {v0.16b}, [x0], x1
@@ -256,6 +354,35 @@ function pixel_vsad_neon, export=1
ret
endfunc
+#if HAVE_DOTPROD
+ENABLE_DOTPROD
+function pixel_vsad_neon_dotprod, export=1
+ ld1 {v0.16b}, [x0], x1
+ ld1 {v1.16b}, [x0], x1
+ subs w2, w2, #2
+ movi v3.16b, #0x1
+ movi v6.4s, #0x0
+ uabd v5.16b, v0.16b, v1.16b
+ udot v6.4s, v5.16b, v3.16b
+ b.le 2f
+1:
+ ld1 {v0.16b}, [x0], x1
+ subs w2, w2, #2
+ uabd v5.16b, v0.16b, v1.16b
+ ld1 {v1.16b}, [x0], x1
+ udot v6.4s, v5.16b, v3.16b
+ b.lt 2f
+ uabd v5.16b, v0.16b, v1.16b
+ udot v6.4s, v5.16b, v3.16b
+ b.gt 1b
+2:
+ addv s0, v6.4s
+ fmov w0, s0
+ ret
+endfunc
+DISABLE_DOTPROD
+#endif // HAVE_DOTPROD
+
function pixel_asd8_neon, export=1
sub w4, w4, #2
ld1 {v0.8b}, [x0], x1
@@ -375,6 +502,45 @@ function pixel_ssd_\w\()x\h\()_neon, export=1
endfunc
.endm
+.macro SSD_DOTPROD_8
+ ld1 {v16.8b}, [x0], x1
+ ld1 {v17.8b}, [x2], x3
+ ld1 {v18.8b}, [x0], x1
+ uabd v20.8b, v16.8b, v17.8b
+ ld1 {v19.8b}, [x2], x3
+ uabd v21.8b, v18.8b, v19.8b
+ udot v22.2s, v20.8b, v20.8b
+ udot v22.2s, v21.8b, v21.8b
+.endm
+
+.macro SSD_DOTPROD_16
+ ld1 {v16.16b}, [x0], x1
+ ld1 {v17.16b}, [x2], x3
+ ld1 {v18.16b}, [x0], x1
+ uabd v20.16b, v16.16b, v17.16b
+ ld1 {v19.16b}, [x2], x3
+ uabd v21.16b, v18.16b, v19.16b
+ udot v22.4s, v20.16b, v20.16b
+ udot v22.4s, v21.16b, v21.16b
+.endm
+
+.macro SSD_DOTPROD_FUNC w h
+function pixel_ssd_\w\()x\h\()_neon_dotprod, export=1
+ movi v22.4s, #0x0
+
+.rept \h/2
+ SSD_DOTPROD_\w
+.endr
+.if \w > 8
+ addv s0, v22.4s
+.else
+ addp v0.2s, v22.2s, v22.2s
+.endif
+ mov w0, v0.s[0]
+ ret
+endfunc
+.endm
+
function pixel_satd_4x4_neon, export=1
ld1 {v1.s}[0], [x2], x3
ld1 {v0.s}[0], [x0], x1
@@ -2855,3 +3021,20 @@ HADAMARD_AC 8, 8
HADAMARD_AC 8, 16
HADAMARD_AC 16, 8
HADAMARD_AC 16, 16
+
+#if BIT_DEPTH == 8 && HAVE_DOTPROD
+ENABLE_DOTPROD
+SAD_FUNC_DOTPROD 16, 8
+SAD_FUNC_DOTPROD 16, 16
+SAD_X_DOTPROD_FUNC 3, 16, 8
+SAD_X_DOTPROD_FUNC 3, 16, 16
+SAD_X_DOTPROD_FUNC 4, 16, 8
+SAD_X_DOTPROD_FUNC 4, 16, 16
+
+SSD_DOTPROD_FUNC 8, 4
+SSD_DOTPROD_FUNC 8, 8
+SSD_DOTPROD_FUNC 8, 16
+SSD_DOTPROD_FUNC 16, 8
+SSD_DOTPROD_FUNC 16, 16
+DISABLE_DOTPROD
+#endif // BIT_DEPTH == 8 && HAVE_DOTPROD
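A note on the SSD macros above: they reuse UDOT differently. Since the instruction multiplies byte pairs and accumulates into 32-bit lanes, dotting the absolute-difference vector with itself yields the sum of squared differences without any widening multiplies. A hedged intrinsics sketch of that step (function name illustrative):

    #include <arm_neon.h>
    #include <stdint.h>

    /* Illustrative 16-wide SSD: |a-b| fits in a byte, its square fits in
     * 16 bits, and udot sums four such squares into each 32-bit lane, so
     * the accumulator cannot overflow for these block sizes. */
    static uint32_t ssd_16xh_dotprod( const uint8_t *pix1, intptr_t stride1,
                                      const uint8_t *pix2, intptr_t stride2,
                                      int height )
    {
        uint32x4_t acc = vdupq_n_u32( 0 );
        for( int y = 0; y < height; y++ )
        {
            uint8x16_t d = vabdq_u8( vld1q_u8( pix1 + y * stride1 ),
                                     vld1q_u8( pix2 + y * stride2 ) );
            acc = vdotq_u32( acc, d, d ); /* udot v22.4s, v20.16b, v20.16b */
        }
        return vaddvq_u32( acc );
    }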
=====================================
common/aarch64/pixel.h
=====================================
@@ -35,6 +35,7 @@
#define x264_pixel_sad_8x16_neon x264_template(pixel_sad_8x16_neon)
#define x264_pixel_sad_8x4_neon x264_template(pixel_sad_8x4_neon)
#define x264_pixel_sad_8x8_neon x264_template(pixel_sad_8x8_neon)
+
#define x264_pixel_sad_x3_16x16_neon x264_template(pixel_sad_x3_16x16_neon)
#define x264_pixel_sad_x3_16x8_neon x264_template(pixel_sad_x3_16x8_neon)
#define x264_pixel_sad_x3_4x4_neon x264_template(pixel_sad_x3_4x4_neon)
@@ -65,6 +66,22 @@
#define x264_pixel_ssd_8x16_neon x264_template(pixel_ssd_8x16_neon)
#define x264_pixel_ssd_8x4_neon x264_template(pixel_ssd_8x4_neon)
#define x264_pixel_ssd_8x8_neon x264_template(pixel_ssd_8x8_neon)
+
+#if HAVE_DOTPROD
+#define x264_pixel_sad_16x8_neon_dotprod x264_template(pixel_sad_16x8_neon_dotprod)
+#define x264_pixel_sad_16x16_neon_dotprod x264_template(pixel_sad_16x16_neon_dotprod)
+#define x264_pixel_sad_x3_16x16_neon_dotprod x264_template(pixel_sad_x3_16x16_neon_dotprod)
+#define x264_pixel_sad_x3_16x8_neon_dotprod x264_template(pixel_sad_x3_16x8_neon_dotprod)
+#define x264_pixel_sad_x4_16x16_neon_dotprod x264_template(pixel_sad_x4_16x16_neon_dotprod)
+#define x264_pixel_sad_x4_16x8_neon_dotprod x264_template(pixel_sad_x4_16x8_neon_dotprod)
+
+#define x264_pixel_ssd_16x16_neon_dotprod x264_template(pixel_ssd_16x16_neon_dotprod)
+#define x264_pixel_ssd_16x8_neon_dotprod x264_template(pixel_ssd_16x8_neon_dotprod)
+#define x264_pixel_ssd_8x16_neon_dotprod x264_template(pixel_ssd_8x16_neon_dotprod)
+#define x264_pixel_ssd_8x4_neon_dotprod x264_template(pixel_ssd_8x4_neon_dotprod)
+#define x264_pixel_ssd_8x8_neon_dotprod x264_template(pixel_ssd_8x8_neon_dotprod)
+#endif // HAVE_DOTPROD
+
#define x264_pixel_ssd_4x16_sve x264_template(pixel_ssd_4x16_sve)
#define x264_pixel_ssd_4x4_sve x264_template(pixel_ssd_4x4_sve)
#define x264_pixel_ssd_4x8_sve x264_template(pixel_ssd_4x8_sve)
@@ -101,6 +118,11 @@ DECL_X1( satd, neon )
DECL_X1( ssd, neon )
DECL_X1_SSD_SVE( )
+#if HAVE_DOTPROD
+DECL_X1( sad, neon_dotprod )
+DECL_X4( sad, neon_dotprod )
+DECL_X1( ssd, neon_dotprod )
+#endif // HAVE_DOTPROD
#define x264_pixel_ssd_nv12_core_neon x264_template(pixel_ssd_nv12_core_neon)
void x264_pixel_ssd_nv12_core_neon( pixel *, intptr_t, pixel *, intptr_t, int, int, uint64_t *, uint64_t * );
@@ -108,6 +130,11 @@ void x264_pixel_ssd_nv12_core_neon( pixel *, intptr_t, pixel *, intptr_t, int, i
#define x264_pixel_vsad_neon x264_template(pixel_vsad_neon)
int x264_pixel_vsad_neon( pixel *, intptr_t, int );
+#if HAVE_DOTPROD
+#define x264_pixel_vsad_neon_dotprod x264_template(pixel_vsad_neon_dotprod)
+int x264_pixel_vsad_neon_dotprod( pixel *, intptr_t, int );
+#endif // HAVE_DOTPROD
+
#define x264_pixel_sa8d_8x8_neon x264_template(pixel_sa8d_8x8_neon)
int x264_pixel_sa8d_8x8_neon ( pixel *, intptr_t, pixel *, intptr_t );
#define x264_pixel_sa8d_16x16_neon x264_template(pixel_sa8d_16x16_neon)
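The x264_template wrapper used in these declarations mangles each symbol with the bit depth, which is why the new dotprod names follow the exact pattern of the existing Neon ones even though the assembly is only built for BIT_DEPTH == 8. Roughly, simplified from x264's base headers (shown for orientation, not copied from the commit):

    /* Simplified sketch of x264's per-bit-depth name mangling; with
     * BIT_DEPTH == 8, x264_template(pixel_sad_16x16_neon_dotprod)
     * expands to x264_8_pixel_sad_16x16_neon_dotprod. */
    #define x264_glue3_expand( x, y, z ) x ## _ ## y ## _ ## z
    #define x264_glue3( x, y, z ) x264_glue3_expand( x, y, z )
    #define x264_template( w ) x264_glue3( x264, BIT_DEPTH, w )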
=====================================
common/pixel.c
=====================================
@@ -1525,6 +1525,23 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon;
pixf->ssim_end4 = x264_pixel_ssim_end4_neon;
}
+#if HAVE_DOTPROD
+ if( cpu&X264_CPU_DOTPROD ) {
+ pixf->sad[PIXEL_16x8] = x264_pixel_sad_16x8_neon_dotprod;
+ pixf->sad[PIXEL_16x16] = x264_pixel_sad_16x16_neon_dotprod;
+ pixf->sad_x3[PIXEL_16x8] = x264_pixel_sad_x3_16x8_neon_dotprod;
+ pixf->sad_x3[PIXEL_16x16] = x264_pixel_sad_x3_16x16_neon_dotprod;
+ pixf->sad_x4[PIXEL_16x8] = x264_pixel_sad_x4_16x8_neon_dotprod;
+ pixf->sad_x4[PIXEL_16x16] = x264_pixel_sad_x4_16x16_neon_dotprod;
+ pixf->ssd[PIXEL_8x4] = x264_pixel_ssd_8x4_neon_dotprod;
+ pixf->ssd[PIXEL_8x8] = x264_pixel_ssd_8x8_neon_dotprod;
+ pixf->ssd[PIXEL_8x16] = x264_pixel_ssd_8x16_neon_dotprod;
+ pixf->ssd[PIXEL_16x16] = x264_pixel_ssd_16x16_neon_dotprod;
+ pixf->ssd[PIXEL_16x8] = x264_pixel_ssd_16x8_neon_dotprod;
+ pixf->vsad = x264_pixel_vsad_neon_dotprod;
+ }
+#endif // HAVE_DOTPROD
+
#if HAVE_SVE
if( cpu&X264_CPU_SVE )
{
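The dispatch above only runs when CPU detection has set X264_CPU_DOTPROD. As a point of reference, the conventional runtime probe for the DotProd extension on AArch64 Linux is the hwcap bit shown below; this is an illustrative sketch, not x264's own detection code:

    #if defined(__linux__) && defined(__aarch64__)
    #include <sys/auxv.h>   /* getauxval */
    #include <asm/hwcap.h>  /* HWCAP_ASIMDDP */
    #endif

    /* Returns nonzero if SDOT/UDOT are available at runtime. */
    static int cpu_has_dotprod( void )
    {
    #if defined(__linux__) && defined(__aarch64__)
        return ( getauxval( AT_HWCAP ) & HWCAP_ASIMDDP ) != 0;
    #else
        return 0; /* other OSes need their own probe, e.g. sysctl on macOS */
    #endif
    }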
View it on GitLab: https://code.videolan.org/videolan/x264/-/commit/fe9e4a7f39d7a9fd208e4592238f9809d5936439