[x264-devel] [Git][videolan/x264][master] aarch64: pixel: add 10bits sad functions
Anton Mitrofanov (@BugMaster)
gitlab at videolan.org
Sun Dec 4 18:04:28 UTC 2022
Anton Mitrofanov pushed to branch master at VideoLAN / x264
Commits:
416e3eb2 by Hubert Mazur at 2022-10-28T07:11:57+00:00
aarch64: pixel: add 10bits sad functions
Provide NEON routines for the SAD functions at high bit depth, i.e. 10 bits.
Benchmarks were run on AWS Graviton2 instances.
sad_4x4_c: 583
sad_4x4_neon: 273
sad_4x8_c: 1179
sad_4x8_neon: 366
sad_4x16_c: 2121
sad_4x16_neon: 550
sad_8x4_c: 924
sad_8x4_neon: 213
sad_8x8_c: 1711
sad_8x8_neon: 316
sad_8x16_c: 3505
sad_8x16_neon: 497
sad_16x8_c: 3070
sad_16x8_neon: 635
sad_16x16_c: 6113
sad_16x16_neon: 1118
Signed-off-by: Hubert Mazur <hum at semihalf.com>
Signed-off-by: Grzegorz Bernacki <gjb at semihalf.com>
- - - - -
3 changed files:
- common/aarch64/pixel-a.S
- common/aarch64/pixel.h
- common/pixel.c
Changes:
=====================================
common/aarch64/pixel-a.S
=====================================
@@ -40,6 +40,7 @@ const mask_ac_4_8
.short 0, -1, -1, -1, -1, -1, -1, -1
endconst
+#if BIT_DEPTH == 8
.macro SAD_START_4
ld1 {v1.s}[0], [x2], x3
ld1 {v0.s}[0], [x0], x1
@@ -112,6 +113,110 @@ function pixel_sad\name\()_\w\()x\h\()_neon, export=1
endfunc
.endm
+#else /* BIT_DEPTH == 8 */
+
+// SAD_START_4: first step of a 4-wide SAD on 16-bit samples.
+// Reads two rows of four halfwords through x0 and through x2 and
+// initialises the 32-bit accumulators v16 and v18.
+//   x0, x2 : addresses, post-incremented by x1 and x3 respectively
+//   x1, x3 : doubled in place before the first load
+//            (presumably strides arrive in pixel units and are turned into
+//            byte strides for 2-byte pixels - TODO confirm against callers)
+// Writes v0, v1, v16, v18; modifies x0-x3.
+.macro SAD_START_4
+ lsl x1, x1, #1
+ lsl x3, x3, #1
+ // Pack two 64-bit rows into the low and high halves of v1 (from x2)
+ // and v0 (from x0).
+ ld1 {v1.d}[0], [x2], x3
+ ld1 {v0.d}[0], [x0], x1
+ ld1 {v1.d}[1], [x2], x3
+ ld1 {v0.d}[1], [x0], x1
+ // uabdl/uabdl2 overwrite the destination, so no prior zeroing is needed:
+ // v16 = |row 0 differences|, v18 = |row 1 differences|, widened to 32 bits.
+ uabdl v16.4s, v0.4h, v1.4h
+ uabdl2 v18.4s, v0.8h, v1.8h
+.endm
+
+// SAD_4: continuation step of a 4-wide SAD on 16-bit samples.
+// Reads two more rows of four halfwords through x0 and through x2 and
+// accumulates their absolute differences into v16 and v18.
+// Unlike SAD_START_4 it does not scale x1/x3, so it uses them as they are;
+// SAD_FUNC expands SAD_START_4 before any SAD_4.
+// Writes v0, v1; accumulates into v16, v18; advances x0 and x2.
+.macro SAD_4
+ // Two 64-bit rows per source, packed into the halves of v1 (x2) and v0 (x0).
+ ld1 {v1.d}[0], [x2], x3
+ ld1 {v0.d}[0], [x0], x1
+ ld1 {v1.d}[1], [x2], x3
+ ld1 {v0.d}[1], [x0], x1
+ // uabal/uabal2 add the widened absolute differences to the running sums.
+ uabal v16.4s, v0.4h, v1.4h
+ uabal2 v18.4s, v0.8h, v1.8h
+.endm
+
+// SAD_START_8: first step of an 8-wide SAD on 16-bit samples.
+// Reads two rows of eight halfwords through x0 and through x2 and
+// initialises the 32-bit accumulators v16-v19.
+//   x0, x2 : addresses, post-incremented by x1 and x3 respectively
+//   x1, x3 : doubled in place before the first load
+//            (presumably pixel strides converted to byte strides for
+//            2-byte pixels - TODO confirm against callers)
+// Writes v0-v3, v16-v19; modifies x0-x3.
+.macro SAD_START_8
+ lsl x1, x1, #1
+ lsl x3, x3, #1
+ // Row 0: v1 (from x2) and v0 (from x0); row 1: v3 (from x2) and v2 (from x0).
+ ld1 {v1.8h}, [x2], x3
+ ld1 {v0.8h}, [x0], x1
+ ld1 {v3.8h}, [x2], x3
+ ld1 {v2.8h}, [x0], x1
+ // v16/v17 = low/high half of row 0, v18/v19 = low/high half of row 1,
+ // each holding absolute differences widened to 32 bits.
+ uabdl v16.4s, v0.4h, v1.4h
+ uabdl2 v17.4s, v0.8h, v1.8h
+ uabdl v18.4s, v2.4h, v3.4h
+ uabdl2 v19.4s, v2.8h, v3.8h
+.endm
+
+// SAD_8: continuation step of an 8-wide SAD on 16-bit samples.
+// Reads two more rows of eight halfwords through x0 and through x2 and
+// accumulates their absolute differences into v16-v19.
+// Does not scale x1/x3; SAD_FUNC expands SAD_START_8 before any SAD_8.
+// Writes v0-v3; accumulates into v16-v19; advances x0 and x2.
+.macro SAD_8
+ // Row N: v1 (from x2) and v0 (from x0); row N+1: v3 (from x2) and v2 (from x0).
+ ld1 {v1.8h}, [x2], x3
+ ld1 {v0.8h}, [x0], x1
+ ld1 {v3.8h}, [x2], x3
+ ld1 {v2.8h}, [x0], x1
+ // Same accumulator layout as SAD_START_8: v16/v17 take the first row of
+ // the pair (low/high half), v18/v19 take the second row.
+ uabal v16.4s, v0.4h, v1.4h
+ uabal2 v17.4s, v0.8h, v1.8h
+ uabal v18.4s, v2.4h, v3.4h
+ uabal2 v19.4s, v2.8h, v3.8h
+.endm
+
+// SAD_START_16: first step of a 16-wide SAD on 16-bit samples.
+// Reads two rows of sixteen halfwords through x0 and through x2 and
+// initialises the 32-bit accumulators v16-v23.
+//   x0, x2 : addresses, post-incremented by x1 and x3 respectively
+//   x1, x3 : doubled in place before the first load
+//            (presumably pixel strides converted to byte strides for
+//            2-byte pixels - TODO confirm against callers)
+// Writes v0-v7, v16-v23; modifies x0-x3.
+.macro SAD_START_16
+ lsl x1, x1, #1
+ lsl x3, x3, #1
+ // ld2 de-interleaves each 32-byte row: even-indexed halfwords go to the
+ // first register, odd-indexed ones to the second. Both sources are split
+ // the same way, so matching elements still line up for the subtraction.
+ ld2 {v0.8h, v1.8h}, [x2], x3
+ ld2 {v2.8h, v3.8h}, [x0], x1
+ ld2 {v4.8h, v5.8h}, [x2], x3
+ ld2 {v6.8h, v7.8h}, [x0], x1
+ // Row 0: v16/v17 = even elements (low/high), v20/v21 = odd elements.
+ uabdl v16.4s, v0.4h, v2.4h
+ uabdl2 v17.4s, v0.8h, v2.8h
+ uabdl v20.4s, v1.4h, v3.4h
+ uabdl2 v21.4s, v1.8h, v3.8h
+ // Row 1: v18/v19 = even elements (low/high), v22/v23 = odd elements.
+ uabdl v18.4s, v4.4h, v6.4h
+ uabdl2 v19.4s, v4.8h, v6.8h
+ uabdl v22.4s, v5.4h, v7.4h
+ uabdl2 v23.4s, v5.8h, v7.8h
+.endm
+
+// SAD_16: continuation step of a 16-wide SAD on 16-bit samples.
+// Reads two more rows of sixteen halfwords through x0 and through x2 and
+// accumulates their absolute differences into v16-v23.
+// Does not scale x1/x3; SAD_FUNC expands SAD_START_16 before any SAD_16.
+// Writes v0-v7; accumulates into v16-v23; advances x0 and x2.
+.macro SAD_16
+ // De-interleaving loads: even-indexed halfwords in the first register of
+ // each pair, odd-indexed ones in the second, identically for both sources.
+ ld2 {v0.8h, v1.8h}, [x2], x3
+ ld2 {v2.8h, v3.8h}, [x0], x1
+ ld2 {v4.8h, v5.8h}, [x2], x3
+ ld2 {v6.8h, v7.8h}, [x0], x1
+ // First row of the pair: v16/v17 (even elements), v20/v21 (odd elements).
+ uabal v16.4s, v0.4h, v2.4h
+ uabal2 v17.4s, v0.8h, v2.8h
+ uabal v20.4s, v1.4h, v3.4h
+ uabal2 v21.4s, v1.8h, v3.8h
+ // Second row of the pair: v18/v19 (even elements), v22/v23 (odd elements).
+ uabal v18.4s, v4.4h, v6.4h
+ uabal2 v19.4s, v4.8h, v6.8h
+ uabal v22.4s, v5.4h, v7.4h
+ uabal2 v23.4s, v5.8h, v7.8h
+.endm
+
+// SAD_FUNC w, h, name
+// Emits the exported function pixel_sad<name>_<w>x<h>_neon for the
+// BIT_DEPTH != 8 build. The body is fully unrolled: one SAD_START_<w>
+// followed by h/2 - 1 expansions of SAD_<w>, each of which consumes two rows.
+//   w    : block width, selects the SAD_START_<w> / SAD_<w> macro pair
+//   h    : block height, must be even because rows are processed in pairs
+//   name : optional infix inserted after "pixel_sad" in the symbol name
+// The emitted function uses x0/x2 as addresses and x1/x3 as their
+// post-increment amounts (the matching prototype in pixel.h is
+// int f( pixel *, intptr_t, pixel *, intptr_t )) and returns the sum in w0.
+// It touches only x0-x3, v0-v7 and v16-v23.
+.macro SAD_FUNC w, h, name
+function pixel_sad\name\()_\w\()x\h\()_neon, export=1
+ SAD_START_\w
+
+.rept \h / 2 - 1
+ SAD_\w
+.endr
+ // Fold the per-width accumulators down to v16 and v18.
+ // Widths above 8 additionally use v20-v23.
+.if \w > 8
+ add v20.4s, v20.4s, v21.4s
+ add v16.4s, v16.4s, v20.4s
+ add v22.4s, v22.4s, v23.4s
+ add v18.4s, v18.4s, v22.4s
+.endif
+ // Widths above 4 additionally use v17 and v19.
+.if \w > 4
+ add v16.4s, v16.4s, v17.4s
+ add v18.4s, v18.4s, v19.4s
+.endif
+ add v16.4s, v16.4s, v18.4s
+ // Horizontal sum using the same .4s arrangement as the accumulators.
+ // A widening reduction over .8h would split every 32-bit lane into two
+ // halfwords and is only right while each lane stays below 2^16, which
+ // depends on the bit depth and block size rather than on the code.
+ addv s0, v16.4s
+ fmov w0, s0
+ ret
+endfunc
+.endm
+
+#endif /* BIT_DEPTH == 8 */
+
SAD_FUNC 4, 4
SAD_FUNC 4, 8
SAD_FUNC 4, 16
=====================================
common/aarch64/pixel.h
=====================================
@@ -76,11 +76,11 @@
ret x264_pixel_##name##_4x4_##suffix args;\
#define DECL_X1( name, suffix ) \
- DECL_PIXELS( int, name, suffix, ( uint8_t *, intptr_t, uint8_t *, intptr_t ) )
+ DECL_PIXELS( int, name, suffix, ( pixel *, intptr_t, pixel *, intptr_t ) )
#define DECL_X4( name, suffix ) \
- DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\
- DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )
+ DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )\
+ DECL_PIXELS( void, name##_x4, suffix, ( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )
DECL_X1( sad, neon )
DECL_X4( sad, neon )
=====================================
common/pixel.c
=====================================
@@ -1054,6 +1054,13 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx512;
}
#endif // HAVE_MMX
+#if HAVE_AARCH64
+ if( cpu&X264_CPU_NEON )
+ {
+ INIT8( sad, _neon );
+ }
+#endif // HAVE_AARCH64
+
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
if( cpu&X264_CPU_MMX )
View it on GitLab: https://code.videolan.org/videolan/x264/-/commit/416e3eb2b52abb0a67e57599aba4f8be3003b36d
--
View it on GitLab: https://code.videolan.org/videolan/x264/-/commit/416e3eb2b52abb0a67e57599aba4f8be3003b36d
You're receiving this email because of your account on code.videolan.org.
VideoLAN code repository instance
More information about the x264-devel
mailing list