[x264-devel] aarch64: implement x264_pixel_ssd_nv12_core_neon
Janne Grunau
git at videolan.org
Sat Dec 20 21:10:44 CET 2014
x264 | branch: master | Janne Grunau <janne-x264 at jannau.net> | Wed Jul 30 15:48:25 2014 +0100| [35b91f2410dcf4fc5191dd85ccda7a42eb01eae8] | committer: Anton Mitrofanov
aarch64: implement x264_pixel_ssd_nv12_core_neon
13 times faster than C.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=35b91f2410dcf4fc5191dd85ccda7a42eb01eae8
---
common/aarch64/pixel-a.S | 71 ++++++++++++++++++++++++++++++++++++++++++++++
common/aarch64/pixel.h | 3 ++
common/pixel.c | 1 +
3 files changed, 75 insertions(+)
diff --git a/common/aarch64/pixel-a.S b/common/aarch64/pixel-a.S
index efa708a..d2c3de6 100644
--- a/common/aarch64/pixel-a.S
+++ b/common/aarch64/pixel-a.S
@@ -373,6 +373,77 @@ SSD_FUNC 8, 16
SSD_FUNC 16, 8
SSD_FUNC 16, 16
+
+function x264_pixel_ssd_nv12_core_neon, export=1  // sums squared byte differences of two 2-channel interleaved buffers; one 64-bit total per channel
+ sxtw x8, w4  // w4 (5th arg, int): interleaved byte pairs per row (presumably the width - confirm with caller); sign-extend to 64 bits
+ add x8, x8, #8  // together with the next line: x8 = (w4 + 8) & ~15
+ and x8, x8, #~15
+ movi v6.2d, #0  // v6: 64-bit running totals for channel 0 (even bytes)
+ movi v7.2d, #0  // v7: 64-bit running totals for channel 1 (odd bytes)
+ sub x1, x1, x8, lsl #1  // x1 (2nd arg): row step for x0, reduced by 2*x8 bytes to offset the post-incremented loads; exact only if w4 is a multiple of 8 - NOTE(review): confirm callers guarantee this
+ sub x3, x3, x8, lsl #1  // x3 (4th arg): same adjustment for the buffer at x2
+1:  // ---- per-row loop ----
+ subs w8, w4, #16  // w8 = pairs left after the first 16; flags are consumed by b.lt / b.eq below
+ ld2 {v0.8b,v1.8b}, [x0], #16  // x0 (1st arg): de-interleave 8 pairs, even bytes -> v0, odd bytes -> v1
+ ld2 {v2.8b,v3.8b}, [x2], #16  // x2 (3rd arg): same split for the second buffer
+ ld2 {v24.8b,v25.8b}, [x0], #16  // second group of 8 pairs is loaded unconditionally, even when w4 < 16
+ ld2 {v26.8b,v27.8b}, [x2], #16
+  // v20 / v21 hold this row's squared differences (4 x 32-bit lanes) for channel 0 / channel 1
+ usubl v16.8h, v0.8b, v2.8b  // widen to 16-bit differences; the signed multiplies below square them
+ usubl v17.8h, v1.8b, v3.8b
+ smull v20.4s, v16.4h, v16.4h  // smull (not smlal) restarts the row sums, discarding the previous row's v20/v21
+ smull v21.4s, v17.4h, v17.4h
+ usubl v18.8h, v24.8b, v26.8b  // differences for the second group, accumulated only if the row has >= 16 pairs
+ usubl v19.8h, v25.8b, v27.8b
+ smlal2 v20.4s, v16.8h, v16.8h  // add squares of the upper four lanes
+ smlal2 v21.4s, v17.8h, v17.8h
+  // flags still come from the subs at the top of the row; the NEON and load instructions in between do not modify them
+ b.lt 4f  // w4 < 16: only the first group counts, row is done
+ b.eq 3f  // w4 == 16: accumulate the second group and finish the row
+2:  // ---- inner loop: 16 pairs per iteration, next loads interleaved with accumulation of the previous group ----
+ smlal v20.4s, v18.4h, v18.4h  // accumulate the group whose differences are already in v18/v19
+ smlal v21.4s, v19.4h, v19.4h
+ ld2 {v0.8b,v1.8b}, [x0], #16
+ ld2 {v2.8b,v3.8b}, [x2], #16
+ smlal2 v20.4s, v18.8h, v18.8h
+ smlal2 v21.4s, v19.8h, v19.8h
+  // w8 goes negative when the 8 pairs loaded into v24-v27 below lie past the end of the row
+ subs w8, w8, #16  // flags consumed by b.lt and b.gt further down
+ usubl v16.8h, v0.8b, v2.8b
+ usubl v17.8h, v1.8b, v3.8b
+ smlal v20.4s, v16.4h, v16.4h
+ smlal v21.4s, v17.4h, v17.4h
+ ld2 {v24.8b,v25.8b}, [x0], #16  // loaded before the b.lt check, so these bytes may be read without being used
+ ld2 {v26.8b,v27.8b}, [x2], #16
+ smlal2 v20.4s, v16.8h, v16.8h
+ smlal2 v21.4s, v17.8h, v17.8h
+ b.lt 4f  // row ended with the v16/v17 group; skip the group just loaded
+  // otherwise the group just loaded is part of the row: compute its differences
+ usubl v18.8h, v24.8b, v26.8b
+ usubl v19.8h, v25.8b, v27.8b
+ b.gt 2b  // more pairs remain; v18/v19 are accumulated at the top of the loop
+3:  // tail: accumulate the final group held in v18/v19
+ smlal v20.4s, v18.4h, v18.4h
+ smlal v21.4s, v19.4h, v19.4h
+ smlal2 v20.4s, v18.8h, v18.8h
+ smlal2 v21.4s, v19.8h, v19.8h
+4:  // ---- end of row ----
+ subs w5, w5, #1  // w5 (6th arg, int): rows remaining (presumably the height - confirm); flags used by b.gt 1b below
+ uaddw v6.2d, v6.2d, v20.2s  // widen the 32-bit row sums to 64 bits and add them to the running totals
+ uaddw v7.2d, v7.2d, v21.2s
+ add x0, x0, x1  // advance both pointers by the adjusted row steps computed in the prologue
+ add x2, x2, x3
+ uaddw2 v6.2d, v6.2d, v20.4s  // upper two lanes of the row sums
+ uaddw2 v7.2d, v7.2d, v21.4s
+ b.gt 1b  // loop while rows remain
+  // horizontal reduction: v6 becomes { v6[0]+v6[1], v7[0]+v7[1] }
+ addp v6.2d, v6.2d, v7.2d
+ st1 {v6.d}[0], [x6]  // x6 (7th arg, uint64_t *): receives the channel-0 total
+ st1 {v6.d}[1], [x7]  // x7 (8th arg, uint64_t *): receives the channel-1 total
+
+ ret
+endfunc
+
.macro pixel_var_8 h
function x264_pixel_var_8x\h\()_neon, export=1
ld1 {v16.8b}, [x0], x1
diff --git a/common/aarch64/pixel.h b/common/aarch64/pixel.h
index c7cc6c9..c7c386a 100644
--- a/common/aarch64/pixel.h
+++ b/common/aarch64/pixel.h
@@ -48,6 +48,9 @@ DECL_X4( sad, neon )
DECL_X1( satd, neon )
DECL_X1( ssd, neon )
+
+void x264_pixel_ssd_nv12_core_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, uint64_t *, uint64_t * );
+
int x264_pixel_vsad_neon( uint8_t *, intptr_t, int );
int x264_pixel_sa8d_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t );
diff --git a/common/pixel.c b/common/pixel.c
index d467151..bb1894a 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1442,6 +1442,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_neon;
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_neon;
+ pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_neon;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon;
pixf->ssim_end4 = x264_pixel_ssim_end4_neon;
}
More information about the x264-devel
mailing list