[x264-devel] [PATCH 10/24] arm: Implement x264_sub8x16_dct_dc_neon
Martin Storsjö
martin at martin.st
Thu Aug 13 22:59:31 CEST 2015
checkasm timing Cortex-A7 A8 A9
sub8x16_dct_dc_c 6386 3901 4080
sub8x16_dct_dc_neon 1491 698 917
---
common/arm/dct-a.S | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++++
common/arm/dct.h | 1 +
common/dct.c | 2 --
3 files changed, 95 insertions(+), 2 deletions(-)
diff --git a/common/arm/dct-a.S b/common/arm/dct-a.S
index a8fee79..58af364 100644
--- a/common/arm/dct-a.S
+++ b/common/arm/dct-a.S
@@ -4,6 +4,7 @@
* Copyright (C) 2009-2015 x264 project
*
* Authors: David Conrad <lessen42 at gmail.com>
+ * Martin Storsjo <martin at martin.st>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -658,6 +659,99 @@ function x264_sub8x8_dct_dc_neon
bx lr
endfunc
+function x264_sub8x16_dct_dc_neon @ in: r0 = dct (8 x int16 out), r1 = pix1, r2 = pix2; reads 16 rows of 8 bytes from each; uses r3, ip, q0-q3, q8-q15
+ mov r3, #FENC_STRIDE @ r3 = post-increment step for the pix1 pointer (r1)
+ mov ip, #FDEC_STRIDE @ ip = post-increment step for the pix2 pointer (r2)
+ vld1.64 {d16}, [r1,:64], r3 @ row 0 of pix1 (8 bytes; :64 asserts an 8-byte aligned address), then r1 += r3
+ vld1.64 {d17}, [r2,:64], ip @ row 0 of pix2, then r2 += ip
+ vsubl.u8 q8, d16, d17 @ q8 = row 0 differences pix1 - pix2, widened to 8 x 16 bit
+ vld1.64 {d18}, [r1,:64], r3 @ row 1 of pix1
+ vld1.64 {d19}, [r2,:64], ip @ row 1 of pix2
+ vsubl.u8 q9, d18, d19 @ q9 = row 1 differences
+ vld1.64 {d20}, [r1,:64], r3 @ row 2 of pix1
+ vld1.64 {d21}, [r2,:64], ip @ row 2 of pix2
+ vsubl.u8 q10, d20, d21 @ q10 = row 2 differences
+ vld1.64 {d22}, [r1,:64], r3 @ row 3 of pix1
+ vadd.s16 q0, q8, q9 @ q0 = rows 0 + 1 (per column); adds are interleaved with loads, presumably for scheduling
+ vld1.64 {d23}, [r2,:64], ip @ row 3 of pix2
+ vsubl.u8 q11, d22, d23 @ q11 = row 3 differences
+ vld1.64 {d24}, [r1,:64], r3 @ row 4 of pix1
+ vadd.s16 q0, q0, q10 @ q0 += row 2
+ vld1.64 {d25}, [r2,:64], ip @ row 4 of pix2
+ vsubl.u8 q12, d24, d25 @ q12 = row 4 differences
+ vld1.64 {d26}, [r1,:64], r3 @ row 5 of pix1
+ vadd.s16 q0, q0, q11 @ q0 += row 3; q0 = per-column sums of rows 0-3
+ vld1.64 {d27}, [r2,:64], ip @ row 5 of pix2
+ vsubl.u8 q13, d26, d27 @ q13 = row 5 differences
+ vld1.64 {d28}, [r1,:64], r3 @ row 6 of pix1
+ vld1.64 {d29}, [r2,:64], ip @ row 6 of pix2
+ vsubl.u8 q14, d28, d29 @ q14 = row 6 differences
+ vld1.64 {d30}, [r1,:64], r3 @ row 7 of pix1
+ vadd.s16 q1, q12, q13 @ q1 = rows 4 + 5
+ vld1.64 {d31}, [r2,:64], ip @ row 7 of pix2
+ vsubl.u8 q15, d30, d31 @ q15 = row 7 differences
+ @ rows 8-15 reuse q8-q15 (already consumed above); q1 is finished between the first loads
+ vld1.64 {d16}, [r1,:64], r3 @ row 8 of pix1 (q8 was last read when q0 was formed)
+ vadd.s16 q1, q1, q14 @ q1 += row 6
+ vld1.64 {d17}, [r2,:64], ip @ row 8 of pix2
+ vadd.s16 q1, q1, q15 @ q1 += row 7; q1 = per-column sums of rows 4-7
+ vld1.64 {d18}, [r1,:64], r3 @ row 9 of pix1
+ vsubl.u8 q8, d16, d17 @ q8 = row 8 differences
+ vld1.64 {d19}, [r2,:64], ip @ row 9 of pix2
+ vsubl.u8 q9, d18, d19 @ q9 = row 9 differences
+ vld1.64 {d20}, [r1,:64], r3 @ row 10 of pix1
+ vld1.64 {d21}, [r2,:64], ip @ row 10 of pix2
+ vsubl.u8 q10, d20, d21 @ q10 = row 10 differences
+ vld1.64 {d22}, [r1,:64], r3 @ row 11 of pix1
+ vadd.s16 q2, q8, q9 @ q2 = rows 8 + 9
+ vld1.64 {d23}, [r2,:64], ip @ row 11 of pix2
+ vsubl.u8 q11, d22, d23 @ q11 = row 11 differences
+ vld1.64 {d24}, [r1,:64], r3 @ row 12 of pix1
+ vadd.s16 q2, q2, q10 @ q2 += row 10
+ vld1.64 {d25}, [r2,:64], ip @ row 12 of pix2
+ vsubl.u8 q12, d24, d25 @ q12 = row 12 differences
+ vld1.64 {d26}, [r1,:64], r3 @ row 13 of pix1
+ vadd.s16 q2, q2, q11 @ q2 += row 11; q2 = per-column sums of rows 8-11
+ vld1.64 {d27}, [r2,:64], ip @ row 13 of pix2
+ vsubl.u8 q13, d26, d27 @ q13 = row 13 differences
+ vld1.64 {d28}, [r1,:64], r3 @ row 14 of pix1
+ vld1.64 {d29}, [r2,:64], ip @ row 14 of pix2
+ vsubl.u8 q14, d28, d29 @ q14 = row 14 differences
+ vld1.64 {d30}, [r1,:64], r3 @ row 15 of pix1
+ vadd.s16 q3, q12, q13 @ q3 = rows 12 + 13
+ vld1.64 {d31}, [r2,:64], ip @ row 15 of pix2
+ vsubl.u8 q15, d30, d31 @ q15 = row 15 differences
+ vadd.s16 q3, q3, q14 @ q3 += row 14
+ @ butterflies: each q0-q3 holds columns 0-3 in its low d register and columns 4-7 in its high one; all values stay 4-lane vectors until the vpadd steps
+ vadd.s16 d16, d0, d1 @ b0 = rows 0-3, left half + right half
+ vadd.s16 q3, q3, q15 @ q3 += row 15; q3 = per-column sums of rows 12-15 (needed from b3 on)
+ vsub.s16 d17, d0, d1 @ b4 = rows 0-3, left half - right half
+ vadd.s16 d18, d2, d3 @ b1 = rows 4-7, left + right
+ vsub.s16 d19, d2, d3 @ b5 = rows 4-7, left - right
+ vadd.s16 d20, d4, d5 @ b2 = rows 8-11, left + right
+ vsub.s16 d21, d4, d5 @ b6 = rows 8-11, left - right
+ vadd.s16 d22, d6, d7 @ b3 = rows 12-15, left + right
+ vsub.s16 d23, d6, d7 @ b7 = rows 12-15, left - right
+ vadd.s16 q0, q8, q9 @ b0 + b1, b4 + b5; a0, a2
+ vsub.s16 q1, q8, q9 @ b0 - b1, b4 - b5; a4, a6
+ vadd.s16 q2, q10, q11 @ b2 + b3, b6 + b7; a1, a3
+ vsub.s16 q3, q10, q11 @ b2 - b3, b6 - b7; a5, a7
+ @ final combination; the d16-d23 results below are already in output order dct[0]..dct[7]
+ vadd.s16 q8, q0, q2 @ a0 + a1, a2 + a3 -> d16, d17
+ vsub.s16 q9, q0, q2 @ a0 - a1, a2 - a3 -> d18, d19
+ vsub.s16 q10, q1, q3 @ a4 - a5, a6 - a7 -> d20, d21
+ vadd.s16 q11, q1, q3 @ a4 + a5, a6 + a7 -> d22, d23
+ @ horizontal reduction: two rounds of pairwise adds collapse the 4 lanes of each of d16-d23 into one value
+ vpadd.s16 d0, d16, d17 @ d0 = pair sums of d16 (lanes 0-1) and d17 (lanes 2-3)
+ vpadd.s16 d1, d18, d19 @ d1 = pair sums of d18, d19
+ vpadd.s16 d2, d20, d21 @ d2 = pair sums of d20, d21
+ vpadd.s16 d3, d22, d23 @ d3 = pair sums of d22, d23
+ vpadd.s16 d0, d0, d1 @ d0 = full lane sums of d16, d17, d18, d19
+ vpadd.s16 d1, d2, d3 @ d1 = full lane sums of d20, d21, d22, d23
+ vst1.64 {q0}, [r0,:64] @ store the 8 int16 results to dct (8-byte aligned address asserted)
+ bx lr @ return; r1 and r2 have been advanced by 16 strides
+endfunc
+
function x264_zigzag_scan_4x4_frame_neon
movrel r2, scan4x4_frame
diff --git a/common/arm/dct.h b/common/arm/dct.h
index e252e7e..77063d8 100644
--- a/common/arm/dct.h
+++ b/common/arm/dct.h
@@ -40,6 +40,7 @@ void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );
void x264_add8x8_idct_dc_neon( uint8_t *p_dst, int16_t dct[4] );
void x264_add16x16_idct_dc_neon( uint8_t *p_dst, int16_t dct[16] );
void x264_sub8x8_dct_dc_neon( int16_t dct[4], uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x16_dct_dc_neon( int16_t dct[8], uint8_t *pix1, uint8_t *pix2 );
void x264_sub8x8_dct8_neon( int16_t dct[64], uint8_t *pix1, uint8_t *pix2 );
void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 );
diff --git a/common/dct.c b/common/dct.c
index e80d64b..aafd9fb 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -750,9 +750,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
dctf->add8x8_idct8 = x264_add8x8_idct8_neon;
dctf->add16x16_idct8= x264_add16x16_idct8_neon;
-#if ARCH_AARCH64
dctf->sub8x16_dct_dc= x264_sub8x16_dct_dc_neon;
-#endif
}
#endif
--
1.7.10.4
More information about the x264-devel
mailing list