[x264-devel] [Git][videolan/x264][master] aarch64: pixel: add 10bits sad functions

Anton Mitrofanov (@BugMaster) gitlab at videolan.org
Sun Dec 4 18:04:28 UTC 2022



Anton Mitrofanov pushed to branch master at VideoLAN / x264


Commits:
416e3eb2 by Hubert Mazur at 2022-10-28T07:11:57+00:00
aarch64: pixel: add 10bits sad functions

Provide routines for sad functions for high bit depth, i.e. 10 bits.
Benchmarks were run on AWS Graviton 2 instances.

sad_4x4_c: 583
sad_4x4_neon: 273
sad_4x8_c: 1179
sad_4x8_neon: 366
sad_4x16_c: 2121
sad_4x16_neon: 550
sad_8x4_c: 924
sad_8x4_neon: 213
sad_8x8_c: 1711
sad_8x8_neon: 316
sad_8x16_c: 3505
sad_8x16_neon: 497
sad_16x8_c: 3070
sad_16x8_neon: 635
sad_16x16_c: 6113
sad_16x16_neon: 1118

Signed-off-by: Hubert Mazur <hum at semihalf.com>
Signed-off-by: Grzegorz Bernacki <gjb at semihalf.com>

- - - - -


3 changed files:

- common/aarch64/pixel-a.S
- common/aarch64/pixel.h
- common/pixel.c


Changes:

=====================================
common/aarch64/pixel-a.S
=====================================
@@ -40,6 +40,7 @@ const mask_ac_4_8
 .short 0, -1, -1, -1, -1, -1, -1, -1
 endconst
 
+#if BIT_DEPTH == 8
 .macro SAD_START_4
     ld1        {v1.s}[0], [x2], x3
     ld1        {v0.s}[0], [x0], x1
@@ -112,6 +113,110 @@ function pixel_sad\name\()_\w\()x\h\()_neon, export=1
 endfunc
 .endm
 
+#else /* BIT_DEPTH == 8 */
+
+.macro SAD_START_4                          // first two 4-pixel rows: scale strides and initialise v16/v18
+    lsl        x1, x1, #1                   // x1 *= 2; presumably stride in 16-bit pixels to bytes (confirm with caller)
+    lsl        x3, x3, #1                   // x3 *= 2, same scaling for the stride paired with x2
+    ld1        {v1.d}[0], [x2], x3          // v1 low half  = 8 bytes (4 halfwords) at x2, then x2 += x3
+    ld1        {v0.d}[0], [x0], x1          // v0 low half  = 8 bytes (4 halfwords) at x0, then x0 += x1
+    ld1        {v1.d}[1], [x2], x3          // v1 high half = following row from x2
+    ld1        {v0.d}[1], [x0], x1          // v0 high half = following row from x0
+    uabdl       v16.4s,  v0.4h,  v1.4h      // v16 = abs(v0 - v1) on the low 4 halfwords, widened to 32 bit
+    uabdl2      v18.4s,  v0.8h,  v1.8h      // v18 = abs(v0 - v1) on the high 4 halfwords, widened to 32 bit
+.endm
+
+.macro SAD_4                                // two more 4-pixel rows accumulated into v16/v18; no stride scaling here
+    ld1        {v1.d}[0], [x2], x3          // v1 low half  = 8 bytes at x2, then x2 += x3
+    ld1        {v0.d}[0], [x0], x1          // v0 low half  = 8 bytes at x0, then x0 += x1
+    ld1        {v1.d}[1], [x2], x3          // v1 high half = following row from x2
+    ld1        {v0.d}[1], [x0], x1          // v0 high half = following row from x0
+    uabal       v16.4s,  v0.4h,  v1.4h      // v16 += abs(v0 - v1) on the low 4 halfwords (widening)
+    uabal2      v18.4s,  v0.8h,  v1.8h      // v18 += abs(v0 - v1) on the high 4 halfwords (widening)
+.endm
+
+.macro SAD_START_8                          // first two 8-pixel rows: scale strides and initialise v16-v19
+    lsl         x1, x1, #1                  // x1 *= 2; presumably stride in 16-bit pixels to bytes (confirm with caller)
+    lsl         x3, x3, #1                  // x3 *= 2, same scaling for the stride paired with x2
+    ld1         {v1.8h}, [x2], x3           // v1 = 8 halfwords at x2, then x2 += x3
+    ld1         {v0.8h}, [x0], x1           // v0 = 8 halfwords at x0, then x0 += x1
+    ld1         {v3.8h}, [x2], x3           // v3 = following row from x2
+    ld1         {v2.8h}, [x0], x1           // v2 = following row from x0
+    uabdl       v16.4s,  v0.4h,  v1.4h      // v16 = abs diff, first row, low 4 halfwords, widened to 32 bit
+    uabdl2      v17.4s,  v0.8h,  v1.8h      // v17 = abs diff, first row, high 4 halfwords
+    uabdl       v18.4s,  v2.4h,  v3.4h      // v18 = abs diff, second row, low 4 halfwords
+    uabdl2      v19.4s,  v2.8h,  v3.8h      // v19 = abs diff, second row, high 4 halfwords
+.endm
+
+.macro SAD_8                                // two more 8-pixel rows accumulated into v16-v19; no stride scaling here
+    ld1         {v1.8h}, [x2], x3           // v1 = 8 halfwords at x2, then x2 += x3
+    ld1         {v0.8h}, [x0], x1           // v0 = 8 halfwords at x0, then x0 += x1
+    ld1         {v3.8h}, [x2], x3           // v3 = following row from x2
+    ld1         {v2.8h}, [x0], x1           // v2 = following row from x0
+    uabal       v16.4s,  v0.4h,  v1.4h      // v16 += abs diff, first row, low 4 halfwords (widening)
+    uabal2      v17.4s,  v0.8h,  v1.8h      // v17 += abs diff, first row, high 4 halfwords
+    uabal       v18.4s,  v2.4h,  v3.4h      // v18 += abs diff, second row, low 4 halfwords
+    uabal2      v19.4s,  v2.8h,  v3.8h      // v19 += abs diff, second row, high 4 halfwords
+.endm
+
+.macro SAD_START_16                         // first two 16-pixel rows: scale strides and initialise v16-v23
+    lsl         x1, x1, #1                  // x1 *= 2; presumably stride in 16-bit pixels to bytes (confirm with caller)
+    lsl         x3, x3, #1                  // x3 *= 2, same scaling for the stride paired with x2
+    ld2         {v0.8h, v1.8h}, [x2], x3    // 16 halfwords at x2, de-interleaved: even elements to v0, odd to v1
+    ld2         {v2.8h, v3.8h}, [x0], x1    // 16 halfwords at x0, split the same way so elements still pair up
+    ld2         {v4.8h, v5.8h}, [x2], x3    // following row from x2: even to v4, odd to v5
+    ld2         {v6.8h, v7.8h}, [x0], x1    // following row from x0: even to v6, odd to v7
+    uabdl       v16.4s,  v0.4h,  v2.4h      // first row, even elements, low half
+    uabdl2      v17.4s,  v0.8h,  v2.8h      // first row, even elements, high half
+    uabdl       v20.4s,  v1.4h,  v3.4h      // first row, odd elements, low half
+    uabdl2      v21.4s,  v1.8h,  v3.8h      // first row, odd elements, high half
+    uabdl       v18.4s,  v4.4h,  v6.4h      // second row, even elements, low half
+    uabdl2      v19.4s,  v4.8h,  v6.8h      // second row, even elements, high half
+    uabdl       v22.4s,  v5.4h,  v7.4h      // second row, odd elements, low half
+    uabdl2      v23.4s,  v5.8h,  v7.8h      // second row, odd elements, high half
+.endm
+
+.macro SAD_16                               // two more 16-pixel rows accumulated into v16-v23; no stride scaling here
+    ld2         {v0.8h, v1.8h}, [x2], x3    // 16 halfwords at x2, de-interleaved: even elements to v0, odd to v1
+    ld2         {v2.8h, v3.8h}, [x0], x1    // 16 halfwords at x0, split the same way so elements still pair up
+    ld2         {v4.8h, v5.8h}, [x2], x3    // following row from x2: even to v4, odd to v5
+    ld2         {v6.8h, v7.8h}, [x0], x1    // following row from x0: even to v6, odd to v7
+    uabal       v16.4s,  v0.4h,  v2.4h      // v16 += first row, even elements, low half
+    uabal2      v17.4s,  v0.8h,  v2.8h      // v17 += first row, even elements, high half
+    uabal       v20.4s,  v1.4h,  v3.4h      // v20 += first row, odd elements, low half
+    uabal2      v21.4s,  v1.8h,  v3.8h      // v21 += first row, odd elements, high half
+    uabal       v18.4s,  v4.4h,  v6.4h      // v18 += second row, even elements, low half
+    uabal2      v19.4s,  v4.8h,  v6.8h      // v19 += second row, even elements, high half
+    uabal       v22.4s,  v5.4h,  v7.4h      // v22 += second row, odd elements, low half
+    uabal2      v23.4s,  v5.8h,  v7.8h      // v23 += second row, odd elements, high half
+.endm
+
+.macro SAD_FUNC w, h, name                  // emits one exported SAD routine per width/height pair; result in w0
+function pixel_sad\name\()_\w\()x\h\()_neon, export=1
+    SAD_START_\w                            // rows 0-1: doubles x1/x3 and initialises the 32-bit accumulators
+
+.rept \h / 2 - 1
+    SAD_\w                                  // each expansion accumulates two further rows
+.endr
+.if \w > 8
+    add         v20.4s,  v20.4s,  v21.4s    // width 16 only: fold the odd-element accumulators ...
+    add         v16.4s,  v16.4s,  v20.4s    // ... of the first row of each pair into v16
+    add         v22.4s,  v22.4s,  v23.4s    // likewise for the second row of each pair ...
+    add         v18.4s,  v18.4s,  v22.4s    // ... into v18
+.endif
+.if \w > 4
+    add         v16.4s,  v16.4s,  v17.4s    // widths 8 and 16: fold high-half accumulator v17 into v16
+    add         v18.4s,  v18.4s,  v19.4s    // and v19 into v18
+.endif
+    add         v16.4s,  v16.4s,  v18.4s    // combine both row accumulators
+    addv        s0,  v16.4s                 // full-width horizontal sum; does not require a lane to fit in 16 bits
+    fmov        w0,  s0                     // move the scalar total into the integer return register
+    ret
+endfunc
+.endm
+
+#endif /* BIT_DEPTH == 8 */
+
 SAD_FUNC  4,  4
 SAD_FUNC  4,  8
 SAD_FUNC  4,  16


=====================================
common/aarch64/pixel.h
=====================================
@@ -76,11 +76,11 @@
     ret x264_pixel_##name##_4x4_##suffix args;\
 
 #define DECL_X1( name, suffix ) \
-    DECL_PIXELS( int, name, suffix, ( uint8_t *, intptr_t, uint8_t *, intptr_t ) )
+    DECL_PIXELS( int, name, suffix, ( pixel *, intptr_t, pixel *, intptr_t ) )
 
 #define DECL_X4( name, suffix ) \
-    DECL_PIXELS( void, name##_x3, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )\
-    DECL_PIXELS( void, name##_x4, suffix, ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, int * ) )
+    DECL_PIXELS( void, name##_x3, suffix, ( pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )\
+    DECL_PIXELS( void, name##_x4, suffix, ( pixel *, pixel *, pixel *, pixel *, pixel *, intptr_t, int * ) )
 
 DECL_X1( sad, neon )
 DECL_X4( sad, neon )


=====================================
common/pixel.c
=====================================
@@ -1054,6 +1054,13 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
         pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx512;
     }
 #endif // HAVE_MMX
+#if HAVE_AARCH64
+    if( cpu&X264_CPU_NEON )
+    {
+        INIT8( sad, _neon );
+    }
+#endif // HAVE_AARCH64
+
 #else // !HIGH_BIT_DEPTH
 #if HAVE_MMX
     if( cpu&X264_CPU_MMX )



View it on GitLab: https://code.videolan.org/videolan/x264/-/commit/416e3eb2b52abb0a67e57599aba4f8be3003b36d

-- 
View it on GitLab: https://code.videolan.org/videolan/x264/-/commit/416e3eb2b52abb0a67e57599aba4f8be3003b36d
You're receiving this email because of your account on code.videolan.org.


VideoLAN code repository instance


More information about the x264-devel mailing list