[x264-devel] aarch64: implement x264_sub8x16_dct_dc_neon
Janne Grunau
git at videolan.org
Sat Dec 20 21:10:45 CET 2014
x264 | branch: master | Janne Grunau <janne-x264 at jannau.net> | Fri Aug 8 11:19:35 2014 +0100| [45e1ebf88a1c3bf37e1326ce621a9b735d155885] | committer: Anton Mitrofanov
aarch64: implement x264_sub8x16_dct_dc_neon
4 times faster than C.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=45e1ebf88a1c3bf37e1326ce621a9b735d155885
---
common/aarch64/dct-a.S | 88 ++++++++++++++++++++++++++++--------------------
common/aarch64/dct.h | 1 +
common/dct.c | 3 ++
3 files changed, 55 insertions(+), 37 deletions(-)
diff --git a/common/aarch64/dct-a.S b/common/aarch64/dct-a.S
index aa12118..14aa867 100644
--- a/common/aarch64/dct-a.S
+++ b/common/aarch64/dct-a.S
@@ -622,56 +622,70 @@ function x264_add16x16_idct_dc_neon, export=1
ret
endfunc
+.macro sub4x4x2_dct_dc, dst, t0, t1, t2, t3, t4, t5, t6, t7 // dst.8h = sum over 4 rows of (8 bytes at x1 - 8 bytes at x2); overwrites t0-t7, advances x1 by 4*x3 and x2 by 4*x4
+ ld1 {\t0\().8b}, [x1], x3 // row 0 from x1, then x1 += x3
+ ld1 {\t1\().8b}, [x2], x4 // row 0 from x2, then x2 += x4
+ ld1 {\t2\().8b}, [x1], x3 // row 1 from x1
+ ld1 {\t3\().8b}, [x2], x4 // row 1 from x2
+ usubl \t0\().8h, \t0\().8b, \t1\().8b // row 0 difference, widened to 16-bit lanes
+ ld1 {\t4\().8b}, [x1], x3 // row 2 from x1
+ ld1 {\t5\().8b}, [x2], x4 // row 2 from x2
+ usubl \t1\().8h, \t2\().8b, \t3\().8b // row 1 difference (t1 is reused as a 16-bit temporary)
+ ld1 {\t6\().8b}, [x1], x3 // row 3 from x1
+ ld1 {\t7\().8b}, [x2], x4 // row 3 from x2
+ add \dst\().8h, \t0\().8h, \t1\().8h // dst = row 0 + row 1 differences
+ usubl \t2\().8h, \t4\().8b, \t5\().8b // row 2 difference
+ usubl \t3\().8h, \t6\().8b, \t7\().8b // row 3 difference
+ add \dst\().8h, \dst\().8h, \t2\().8h // accumulate row 2
+ add \dst\().8h, \dst\().8h, \t3\().8h // accumulate row 3: dst now holds the 8 per-column sums
+.endm
+
function x264_sub8x8_dct_dc_neon, export=1
mov x3, #FENC_STRIDE
mov x4, #FDEC_STRIDE
- ld1 {v16.8b}, [x1], x3
- ld1 {v17.8b}, [x2], x4
- usubl v16.8h, v16.8b, v17.8b
- ld1 {v18.8b}, [x1], x3
- ld1 {v19.8b}, [x2], x4
- usubl v17.8h, v18.8b, v19.8b
- ld1 {v20.8b}, [x1], x3
- ld1 {v21.8b}, [x2], x4
- usubl v18.8h, v20.8b, v21.8b
- ld1 {v22.8b}, [x1], x3
- add v0.8h, v16.8h, v17.8h
- ld1 {v23.8b}, [x2], x4
- usubl v19.8h, v22.8b, v23.8b
- ld1 {v24.8b}, [x1], x3
- add v0.8h, v0.8h, v18.8h
- ld1 {v25.8b}, [x2], x4
- usubl v20.8h, v24.8b, v25.8b
- ld1 {v26.8b}, [x1], x3
- add v0.8h, v0.8h, v19.8h
- ld1 {v27.8b}, [x2], x4
- usubl v21.8h, v26.8b, v27.8b
- ld1 {v28.8b}, [x1], x3
- ld1 {v29.8b}, [x2], x4
- usubl v22.8h, v28.8b, v29.8b
- ld1 {v30.8b}, [x1], x3
- add v1.8h, v20.8h, v21.8h
- ld1 {v31.8b}, [x2], x4
- usubl v23.8h, v30.8b, v31.8b
- add v1.8h, v1.8h, v22.8h
- add v1.8h, v1.8h, v23.8h
+ sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23 // v0 = per-column difference sums of rows 0-3
+ sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31 // v1 = per-column difference sums of rows 4-7 (x1/x2 were advanced by the first call)
+
+ transpose v2.2d, v3.2d, v0.2d, v1.2d // transpose is defined outside this hunk; presumably a trn1/trn2 pair on 64-bit halves - confirm
+ SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h // SUMSUB_AB is defined outside this hunk; presumably v0 = v2 + v3, v1 = v2 - v3 - confirm
+ transpose v2.2d, v3.2d, v0.2d, v1.2d // regroup halves for the second butterfly stage
+ SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h // second butterfly stage
transpose v2.2d, v3.2d, v0.2d, v1.2d
- add v0.8h, v2.8h, v3.8h
- sub v1.8h, v2.8h, v3.8h
+ addp v0.8h, v2.8h, v3.8h // pairwise-add adjacent 16-bit lanes: v2 pairs fill the low half, v3 pairs the high half
+ addp v0.8h, v0.8h, v0.8h // second pairwise add; only the low 4 lanes are stored below
- transpose v2.2d, v3.2d, v0.2d, v1.2d
+ st1 {v0.4h}, [x0] // write 4 16-bit results to the buffer at x0
+ ret
+endfunc
+
+function x264_sub8x16_dct_dc_neon, export=1 // x0 = output (8 16-bit values are stored), x1 and x2 = the two pixel blocks that are subtracted
+ mov x3, #FENC_STRIDE // x3 = row stride used for the x1 loads
+ mov x4, #FDEC_STRIDE // x4 = row stride used for the x2 loads
+ sub4x4x2_dct_dc v0, v16, v17, v18, v19, v20, v21, v22, v23 // v0 = per-column difference sums of rows 0-3
+ sub4x4x2_dct_dc v1, v24, v25, v26, v27, v28, v29, v30, v31 // v1 = rows 4-7
+ sub4x4x2_dct_dc v2, v16, v17, v18, v19, v20, v21, v22, v23 // v2 = rows 8-11 (temporaries v16-v23 are reused)
+ sub4x4x2_dct_dc v3, v24, v25, v26, v27, v28, v29, v30, v31 // v3 = rows 12-15 (temporaries v24-v31 are reused)
+
+ addp v4.8h, v0.8h, v2.8h // pairwise-add adjacent columns: rows 0-3 in the low half, rows 8-11 in the high half
+ addp v5.8h, v1.8h, v3.8h // same for rows 4-7 (low half) and rows 12-15 (high half)
- add v0.8h, v2.8h, v3.8h
- sub v1.8h, v2.8h, v3.8h
+ transpose v2.4s, v3.4s, v4.4s, v5.4s // transpose is defined outside this hunk; presumably trn1/trn2 on 32-bit lanes - confirm
+ SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h // SUMSUB_AB is defined outside this hunk; presumably v0 = v2 + v3, v1 = v2 - v3 - confirm
+
+ transpose v2.4s, v3.4s, v0.4s, v1.4s // regroup 32-bit lanes for the next butterfly stage
+ SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h // second butterfly stage
transpose v2.2d, v3.2d, v0.2d, v1.2d
+ SUMSUB_AB v0.8h, v1.8h, v2.8h, v3.8h // third butterfly stage, on the 64-bit-regrouped operands
+
+ trn1 v2.2d, v0.2d, v1.2d // v2 = { low half of v0, low half of v1 }
+ trn2 v3.2d, v1.2d, v0.2d // v3 = { high half of v1, high half of v0 }; note the swapped source order
addp v0.8h, v2.8h, v3.8h
- addp v0.8h, v0.8h, v0.8h
- st1 {v0.4h}, [x0]
+ st1 {v0.8h}, [x0] // write all 8 16-bit results to the buffer at x0
ret
endfunc
diff --git a/common/aarch64/dct.h b/common/aarch64/dct.h
index 4af311c..f00ef62 100644
--- a/common/aarch64/dct.h
+++ b/common/aarch64/dct.h
@@ -41,6 +41,7 @@ void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );
void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] );
void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );
void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x16_dct_dc_neon( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
diff --git a/common/dct.c b/common/dct.c
index e1fb42a..f3e297f 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -747,6 +747,9 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add8x8_idct8 = x264_add8x8_idct8_neon;
dctf->add16x16_idct8= x264_add16x16_idct8_neon;
+#if ARCH_AARCH64
+ dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_neon;
+#endif
}
#endif
#endif // HIGH_BIT_DEPTH
More information about the x264-devel
mailing list