[x264-devel] [PATCH 02/23] aarch64: implement x264_pixel_sa8d_satd_16x16_neon
Janne Grunau
janne-x264 at jannau.net
Thu Nov 27 08:56:30 CET 2014
~20% faster than calling pixel_sa8d_16x16 and pixel_satd_16x16
separately.
---
common/aarch64/pixel-a.S | 103 ++++++++++++++++++++++++++++++++++++++++-------
common/aarch64/pixel.h | 1 +
common/pixel.c | 1 +
3 files changed, 91 insertions(+), 14 deletions(-)
diff --git a/common/aarch64/pixel-a.S b/common/aarch64/pixel-a.S
index 07e9a61..8c7b927 100644
--- a/common/aarch64/pixel-a.S
+++ b/common/aarch64/pixel-a.S
@@ -803,7 +803,7 @@ endfunc
function x264_pixel_sa8d_8x8_neon, export=1
mov x4, x30
- bl x264_sa8d_8x8_neon
+ bl pixel_sa8d_8x8_neon
add v0.8h, v0.8h, v1.8h
uaddlv s0, v0.8h
mov w0, v0.s[0]
@@ -814,20 +814,20 @@ endfunc
function x264_pixel_sa8d_16x16_neon, export=1
mov x4, x30
- bl x264_sa8d_8x8_neon
+ bl pixel_sa8d_8x8_neon
uaddlp v30.4s, v0.8h
uaddlp v31.4s, v1.8h
- bl x264_sa8d_8x8_neon
+ bl pixel_sa8d_8x8_neon
uadalp v30.4s, v0.8h
uadalp v31.4s, v1.8h
sub x0, x0, x1, lsl #4
sub x2, x2, x3, lsl #4
add x0, x0, #8
add x2, x2, #8
- bl x264_sa8d_8x8_neon
+ bl pixel_sa8d_8x8_neon
uadalp v30.4s, v0.8h
uadalp v31.4s, v1.8h
- bl x264_sa8d_8x8_neon
+ bl pixel_sa8d_8x8_neon
uadalp v30.4s, v0.8h
uadalp v31.4s, v1.8h
add v0.4s, v30.4s, v31.4s
@@ -838,13 +838,48 @@ function x264_pixel_sa8d_16x16_neon, export=1
ret x4
endfunc
-function x264_sa8d_8x8_neon
+.macro sa8d_satd_8x8 satd=
+function pixel_sa8d_\satd\()8x8_neon
load_diff_fly_8x8
SUMSUB_AB v16.8h, v18.8h, v0.8h, v2.8h
SUMSUB_AB v17.8h, v19.8h, v1.8h, v3.8h
HADAMARD4_V v20.8h, v21.8h, v22.8h, v23.8h, v0.8h, v1.8h, v2.8h, v3.8h
+.ifc \satd, satd_
+ transpose v0.8h, v1.8h, v16.8h, v17.8h
+ transpose v2.8h, v3.8h, v18.8h, v19.8h
+ transpose v4.8h, v5.8h, v20.8h, v21.8h
+ transpose v6.8h, v7.8h, v22.8h, v23.8h
+
+ SUMSUB_AB v24.8h, v25.8h, v0.8h, v1.8h
+ SUMSUB_AB v26.8h, v27.8h, v2.8h, v3.8h
+ SUMSUB_AB v0.8h, v1.8h, v4.8h, v5.8h
+ SUMSUB_AB v2.8h, v3.8h, v6.8h, v7.8h
+
+ transpose v4.4s, v6.4s, v24.4s, v26.4s
+ transpose v5.4s, v7.4s, v25.4s, v27.4s
+ transpose v24.4s, v26.4s, v0.4s, v2.4s
+ transpose v25.4s, v27.4s, v1.4s, v3.4s
+
+ abs v0.8h, v4.8h
+ abs v1.8h, v5.8h
+ abs v2.8h, v6.8h
+ abs v3.8h, v7.8h
+ abs v4.8h, v24.8h
+ abs v5.8h, v25.8h
+ abs v6.8h, v26.8h
+ abs v7.8h, v27.8h
+
+ umax v0.8h, v0.8h, v2.8h
+ umax v1.8h, v1.8h, v3.8h
+ umax v2.8h, v4.8h, v6.8h
+ umax v3.8h, v5.8h, v7.8h
+
+ add v26.8h, v0.8h, v1.8h
+ add v27.8h, v2.8h, v3.8h
+.endif
+
SUMSUB_AB v0.8h, v16.8h, v16.8h, v20.8h
SUMSUB_AB v1.8h, v17.8h, v17.8h, v21.8h
SUMSUB_AB v2.8h, v18.8h, v18.8h, v22.8h
@@ -855,20 +890,20 @@ function x264_sa8d_8x8_neon
transpose v22.8h, v23.8h, v18.8h, v19.8h
transpose v6.8h, v7.8h, v2.8h, v3.8h
- SUMSUB_AB v28.8h, v29.8h, v20.8h, v21.8h
+ SUMSUB_AB v2.8h, v3.8h, v20.8h, v21.8h
SUMSUB_AB v24.8h, v25.8h, v4.8h, v5.8h
SUMSUB_AB v0.8h, v1.8h, v22.8h, v23.8h
- SUMSUB_AB v26.8h, v27.8h, v6.8h, v7.8h
+ SUMSUB_AB v4.8h, v5.8h, v6.8h, v7.8h
- transpose v20.4s, v22.4s, v28.4s, v0.4s
- transpose v21.4s, v23.4s, v29.4s, v1.4s
- transpose v16.4s, v18.4s, v24.4s, v26.4s
- transpose v17.4s, v19.4s, v25.4s, v27.4s
+ transpose v20.4s, v22.4s, v2.4s, v0.4s
+ transpose v21.4s, v23.4s, v3.4s, v1.4s
+ transpose v16.4s, v18.4s, v24.4s, v4.4s
+ transpose v17.4s, v19.4s, v25.4s, v5.4s
SUMSUB_AB v0.8h, v2.8h, v20.8h, v22.8h
SUMSUB_AB v1.8h, v3.8h, v21.8h, v23.8h
- SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h
- SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h
+ SUMSUB_AB v4.8h, v6.8h, v16.8h, v18.8h
+ SUMSUB_AB v5.8h, v7.8h, v17.8h, v19.8h
transpose v16.2d, v20.2d, v0.2d, v4.2d
transpose v17.2d, v21.2d, v1.2d, v5.2d
@@ -894,7 +929,47 @@ function x264_sa8d_8x8_neon
ret
endfunc
+.endm
+
+sa8d_satd_8x8
+sa8d_satd_8x8 satd_
+function x264_pixel_sa8d_satd_16x16_neon, export=1
+ mov x4, x30 // keep the return address in x4: each bl below overwrites x30
+ bl pixel_sa8d_satd_8x8_neon // 1st 8x8 block; helper leaves partial sums in v0/v1 and v26/v27
+ uaddlp v30.4s, v0.8h // start 32-bit accumulators v30/v31 from the 16-bit lanes of v0/v1
+ uaddlp v31.4s, v1.8h
+ uaddlp v28.4s, v26.8h // start 32-bit accumulators v28/v29 from the 16-bit lanes of v26/v27
+ uaddlp v29.4s, v27.8h
+ bl pixel_sa8d_satd_8x8_neon // 2nd 8x8 block; presumably 8 rows further down (helper's pointer update not visible here)
+ uadalp v30.4s, v0.8h // widen pairwise and add into the running totals
+ uadalp v31.4s, v1.8h
+ uadalp v28.4s, v26.8h
+ uadalp v29.4s, v27.8h
+ sub x0, x0, x1, lsl #4 // x0 -= 16 * x1; assumes x1 is the stride of the first buffer and x0 moved down 16 rows
+ sub x2, x2, x3, lsl #4 // x2 -= 16 * x3; same rewind for the second buffer
+ add x0, x0, #8 // step 8 bytes to the right
+ add x2, x2, #8
+ bl pixel_sa8d_satd_8x8_neon // 3rd 8x8 block
+ uadalp v30.4s, v0.8h
+ uadalp v31.4s, v1.8h
+ uadalp v28.4s, v26.8h
+ uadalp v29.4s, v27.8h
+ bl pixel_sa8d_satd_8x8_neon // 4th 8x8 block
+ uadalp v30.4s, v0.8h
+ uadalp v31.4s, v1.8h
+ uadalp v28.4s, v26.8h
+ uadalp v29.4s, v27.8h
+ add v0.4s, v30.4s, v31.4s // sa8d: merge the two accumulators fed from v0/v1
+ add v1.4s, v28.4s, v29.4s // satd: merge the two accumulators fed from v26/v27
+ addv s0, v0.4s // horizontal sum of the four sa8d lanes
+ addv s1, v1.4s // horizontal sum of the four satd lanes
+ urshr v0.4s, v0.4s, #1 // sa8d = (sum + 1) >> 1; only lane 0 is read below
+ fmov w0, s0 // w0 = sa8d (writing w0 clears the upper half of x0)
+ fmov w1, s1 // w1 = satd (writing w1 clears the upper half of x1)
+ add x0, x0, x1, lsl #32 // return value: sa8d in bits 0-31, satd in bits 32-63
+ ret x4 // return through the address saved on entry
+endfunc
.macro HADAMARD_AC w h
function x264_pixel_hadamard_ac_\w\()x\h\()_neon, export=1
diff --git a/common/aarch64/pixel.h b/common/aarch64/pixel.h
index 9c7768c..99ab16d 100644
--- a/common/aarch64/pixel.h
+++ b/common/aarch64/pixel.h
@@ -49,6 +49,7 @@ DECL_X1( ssd, neon )
int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t );
int x264_pixel_sa8d_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
+uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t );
uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
diff --git a/common/pixel.c b/common/pixel.c
index de79152..421a67c 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1422,6 +1422,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_neon;
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon;
+ pixf->sa8d_satd[PIXEL_16x16] = x264_pixel_sa8d_satd_16x16_neon;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_neon;
--
2.1.3
More information about the x264-devel
mailing list