[x264-devel] [PATCH 11/24] arm: Implement neon 8x16c intra predict functions
Martin Storsjö
martin at martin.st
Thu Aug 13 22:59:32 CEST 2015
This implements the same set of functions that already exist for 8x8c on arm,
and for 8x16c on aarch64.
Some of the simpler ones actually turn out to be slower than the
plain C version, at least on some CPUs.
checkasm timing Cortex-A7 A8 A9
intra_predict_8x16c_dc_c 1347 910 1017
intra_predict_8x16c_dc_neon 1271 1366 1247
intra_predict_8x16c_dcl_c 859 677 692
intra_predict_8x16c_dcl_neon 1006 1209 1065
intra_predict_8x16c_dct_c 871 540 590
intra_predict_8x16c_dct_neon 672 511 657
intra_predict_8x16c_h_c 937 712 719
intra_predict_8x16c_h_neon 722 682 672
intra_predict_8x16c_p_c 10184 9967 8652
intra_predict_8x16c_p_neon 2617 1973 1983
intra_predict_8x16c_v_c 610 380 429
intra_predict_8x16c_v_neon 570 513 507
---
common/arm/predict-a.S | 158 ++++++++++++++++++++++++++++++++++++++++++++++++
common/arm/predict-c.c | 15 +++++
common/arm/predict.h | 8 +++
common/predict.c | 4 ++
4 files changed, 185 insertions(+)
diff --git a/common/arm/predict-a.S b/common/arm/predict-a.S
index 7e5d9d3..228fd2e 100644
--- a/common/arm/predict-a.S
+++ b/common/arm/predict-a.S
@@ -5,6 +5,7 @@
*
* Authors: David Conrad <lessen42 at gmail.com>
* Mans Rullgard <mans at mansr.com>
+ * Martin Storsjo <martin at martin.st>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -552,6 +553,163 @@ function x264_predict_8x8c_p_neon
endfunc
+function x264_predict_8x16c_dc_top_neon @ 8x16 chroma DC prediction from the row above only; r0 = src
+ sub r2, r0, #FDEC_STRIDE @ r2 = row directly above the block
+ mov r1, #FDEC_STRIDE @ r1 = row stride, consumed by the shared store tail
+ vld1.8 {d0}, [r2,:64] @ d0 = the 8 top neighbour pixels
+ vpaddl.u8 d0, d0 @ widen to u16 while adding adjacent pairs
+ vpadd.u16 d0, d0, d0 @ lane 0 = sum of top[0..3], lane 1 = sum of top[4..7]
+ vrshrn.u16 d0, q0, #2 @ rounded divide by 4, narrowed to bytes (only bytes 0 and 1 are used)
+ vdup.8 d1, d0[1] @ d1 = DC of the right 4 columns in every byte
+ vdup.8 d0, d0[0] @ d0 = DC of the left 4 columns in every byte
+ vtrn.32 d0, d1 @ d0 = d1 = left DC x4 followed by right DC x4
+ vmov q1, q0 @ d2/d3 carry the same pattern for rows 8-15
+ b pred8x16_dc_end @ shared 16-row store tail inside x264_predict_8x16c_dc_neon
+endfunc
+
+function x264_predict_8x16c_dc_left_neon @ 8x16 chroma DC prediction from the left column only; r0 = src
+ mov r1, #FDEC_STRIDE @ r1 = row stride
+ sub r2, r0, #1 @ r2 = pixel to the left of row 0
+ ldcol.8 d0, r2, r1 @ ldcol.8 is a macro defined outside this hunk; presumably loads the left bytes of rows 0-7 and advances r2 (TODO confirm)
+ ldcol.8 d2, r2, r1 @ presumably the left bytes of rows 8-15
+ vpaddl.u8 d0, d0 @ widen to u16 while adding adjacent pairs
+ vpaddl.u8 d2, d2
+ vpadd.u16 d0, d0, d0 @ lane 0 / lane 1 = sum of the first / second group of 4 bytes
+ vpadd.u16 d2, d2, d2
+ vrshrn.u16 d0, q0, #2 @ rounded divide by 4, narrowed to bytes
+ vrshrn.u16 d2, q1, #2
+ vdup.8 d1, d0[1] @ d1 is written to rows 4-7 by the store tail
+ vdup.8 d0, d0[0] @ d0 is written to rows 0-3
+ vdup.8 d3, d2[1] @ d3 is written to rows 12-15
+ vdup.8 d2, d2[0] @ d2 is written to rows 8-11
+ b pred8x16_dc_end @ shared 16-row store tail inside x264_predict_8x16c_dc_neon
+endfunc
+
+function x264_predict_8x16c_dc_neon @ 8x16 chroma DC prediction from top row and left column; r0 = src
+ sub r2, r0, #FDEC_STRIDE @ r2 = row directly above the block
+ mov r1, #FDEC_STRIDE @ r1 = row stride
+ vld1.8 {d0}, [r2,:64] @ d0 = top[0..7]
+ sub r2, r0, #1 @ r2 = pixel to the left of row 0
+ ldcol.8 d1, r2, r1 @ ldcol.8 is a macro defined outside this hunk; presumably loads the left bytes of rows 0-7 and advances r2 (TODO confirm)
+ vdup.32 d2, d0[1] @ d2 = top[4..7] in both words
+ ldcol.8 d3, r2, r1 @ presumably the left bytes of rows 8-15
+ vtrn.32 d0, d1 @ d0 = top[0..3] : first 4 bytes of d1, d1 = top[4..7] : last 4 bytes of d1
+ vtrn.32 d2, d3 @ d2 = top[4..7] : first 4 bytes of d3, d3 = top[4..7] : last 4 bytes of d3
+ vpaddl.u8 q0, q0 @ widen to u16 while adding adjacent pairs
+ vpaddl.u8 q1, q1
+ vpadd.u16 d0, d0, d1 @ d0 = T0, L0, T1, L1 (T = sum of 4 top pixels, L = sum of 4 column bytes)
+ vpadd.u16 d2, d2, d3 @ d2 = T1, L2, T1, L3
+ vpadd.u16 d1, d0, d0 @ d1 = T0+L0, T1+L1 (repeated)
+ vpadd.u16 d3, d2, d2 @ d3 = T1+L2, T1+L3 (repeated)
+ vrshrn.u16 d4, q0, #3 @ rounded /8; bytes 4 and 5 hold the two combined top+left DCs
+ vrshrn.u16 d5, q0, #2 @ rounded /4; bytes 2 and 3 hold the T1-only and L1-only DCs
+ vrshrn.u16 d6, q1, #3 @ rounded /8; bytes 4 and 5 = DC(T1+L2), DC(T1+L3)
+ vrshrn.u16 d7, q1, #2 @ rounded /4; bytes 1 and 3 = DC(L2), DC(L3)
+ vdup.8 d0, d4[4] @ rows 0-3, left half: DC(T0+L0)
+ vdup.8 d1, d5[3] @ rows 4-7, left half: DC(L1)
+ vdup.8 d16, d5[2] @ rows 0-3, right half: DC(T1)
+ vdup.8 d17, d4[5] @ rows 4-7, right half: DC(T1+L1)
+ vtrn.32 q0, q8 @ merge: d0 = left|right halves for rows 0-3, d1 = same for rows 4-7
+ vdup.8 d2, d7[1] @ rows 8-11, left half: DC(L2)
+ vdup.8 d3, d7[3] @ rows 12-15, left half: DC(L3)
+ vdup.8 d16, d6[4] @ rows 8-11, right half: DC(T1+L2)
+ vdup.8 d17, d6[5] @ rows 12-15, right half: DC(T1+L3)
+ vtrn.32 q1, q8 @ merge: d2 = rows 8-11, d3 = rows 12-15
+pred8x16_dc_end: @ shared tail, also entered from the dc_top and dc_left variants; expects r0 = src, r1 = stride, d0-d3 = row patterns
+ add r2, r0, r1, lsl #2 @ r2 = row 4
+.rept 4
+ vst1.8 {d0}, [r0,:64], r1 @ rows 0-3
+ vst1.8 {d1}, [r2,:64], r1 @ rows 4-7
+.endr
+ add r2, r2, r1, lsl #2 @ r2: row 8 -> row 12
+ add r0, r0, r1, lsl #2 @ r0: row 4 -> row 8
+.rept 4
+ vst1.8 {d2}, [r0,:64], r1 @ rows 8-11
+ vst1.8 {d3}, [r2,:64], r1 @ rows 12-15
+.endr
+ bx lr
+endfunc
+
+function x264_predict_8x16c_h_neon @ 8x16 chroma horizontal prediction: each row is filled with its left neighbour; r0 = src
+ sub r1, r0, #1 @ r1 = pixel to the left of row 0
+ mov ip, #FDEC_STRIDE @ ip = row stride
+.rept 8
+ vld1.8 {d0[]}, [r1], ip @ broadcast the left pixel of this row to all 8 bytes, step to the next row
+ vld1.8 {d2[]}, [r1], ip @ same for the following row
+ vst1.64 {d0}, [r0,:64], ip @ two rows per iteration, 16 rows in total
+ vst1.64 {d2}, [r0,:64], ip
+.endr
+ bx lr
+endfunc
+
+function x264_predict_8x16c_v_neon @ 8x16 chroma vertical prediction: the row above is copied into all 16 rows; r0 = src
+ sub r0, r0, #FDEC_STRIDE @ r0 = row directly above the block
+ mov ip, #FDEC_STRIDE @ ip = row stride
+ vld1.64 {d0}, [r0,:64], ip @ d0 = top[0..7]; post-increment puts r0 back on row 0
+.rept 16
+ vst1.64 {d0}, [r0,:64], ip @ one store per row
+.endr
+ bx lr
+endfunc
+
+function x264_predict_8x16c_p_neon @ 8x16 chroma plane prediction; r0 = src
+ sub r3, r0, #FDEC_STRIDE @ r3 = row directly above the block
+ mov r1, #FDEC_STRIDE @ r1 = row stride
+ add r2, r3, #4 @ r2 = address of top[4]
+ sub r3, r3, #1 @ r3 = address of top[-1] (top-left corner)
+ vld1.32 {d0[0]}, [r3] @ d0 low word = top[-1..2]
+ vld1.32 {d2[0]}, [r2,:32], r1 @ d2 low word = top[4..7]
+ ldcol.8 d1, r3, r1 @ ldcol.8 is a macro defined outside this hunk; presumably loads 8 column bytes starting at the corner and advances r3 (TODO confirm)
+ add r3, r3, r1 @ skip one row between the two column loads
+ ldcol.8 d3, r3, r1 @ presumably the left bytes of rows 8-15
+ vrev64.32 d16, d3 @ swap words so the low word holds the last 4 column bytes
+ vaddl.u8 q8, d2, d16 @ d16 lane i = top[4+i] + that column byte; lane 3 feeds the a term below
+ vrev32.8 d0, d0 @ d0 low word = top[2], top[1], top[0], top[-1]
+ vsubl.u8 q2, d2, d0 @ d4 lane i = top[4+i] - top[2-i]
+ vrev64.8 d1, d1 @ reverse the first column vector
+ vsubl.u8 q3, d3, d1 @ q3 lane i = second column[i] - first column[7-i]
+ movrel r3, p16weight @ p16weight is a table defined outside this hunk
+ vld1.16 {q0}, [r3,:128] @ q0 = 8 s16 weights (presumably 1..8, TODO confirm)
+ vmul.s16 d4, d4, d0 @ weight the 4 horizontal differences
+ vmul.s16 q3, q3, q0 @ weight the 8 vertical differences
+ vpadd.i16 d4, d4, d5 @ start the horizontal reduction (only the low lanes matter)
+ vpadd.i16 d6, d6, d7 @ start the vertical reduction
+ vpaddl.s16 d4, d4 @ d4[0] = H
+ vpaddl.s16 d6, d6
+ vpadd.s32 d6, d6 @ d6[0] = V
+ vshl.i32 d5, d4, #4 @ d5[0] = 16*H
+ vadd.s32 d4, d4, d5 @ d4[0] = 17*H
+ vshl.i32 d7, d6, #2 @ d7[0] = 4*V
+ vrshrn.s32 d4, q2, #5 @ d4[0] = b
+ vadd.s32 d6, d6, d7 @ d6[0] = 5*V
+ vrshrn.s32 d6, q3, #6 @ d6[0] = c
+ mov r3, #0 @ zero, inserted into weight lane 0 further down
+ vshl.i16 d3, d4, #2 @ d3[0] = 4 * b
+ vsub.i16 d3, d3, d4 @ d3[0] = 3 * b
+ vshl.i16 d2, d6, #3 @ d2[0] = 8 * c
+ vadd.i16 d3, d3, d2 @ d3[0] = 3 * b + 8 * c
+ vsub.i16 d3, d3, d6 @ d3[0] = 3 * b + 7 * c
+ vrev64.16 d16, d16 @ bring the lane 3 sum down to lane 0
+ vadd.i16 d16, d16, d0 @ d16[0] = src[]+src[] + 1 (relies on the first weight being 1)
+ vshl.i16 d2, d16, #4 @ d2[0] = a + 16
+ vsub.i16 d2, d2, d3 @ i00
+ vext.16 q0, q0, q0, #7 @ rotate the weights up by one lane
+ vmov.16 d0[0], r3 @ lane 0 = 0, giving per-column factors (presumably 0..7)
+ vmul.i16 q0, q0, d4[0] @ q0 lane x = factor[x] * b
+ vdup.16 q1, d2[0] @ q1 = i00 in every lane
+ vdup.16 q3, d6[0] @ q3 = c in every lane (per-row increment)
+ vadd.i16 q1, q1, q0 @ q1 = row 0 values before the final shift
+ mov r3, #16 @ r3 = remaining rows
+1:
+ vqshrun.s16 d0, q1, #5 @ shift right by 5 and saturate to unsigned bytes
+ vadd.i16 q1, q1, q3 @ advance to the next row
+ vst1.8 {d0}, [r0,:64], r1
+ subs r3, r3, #1
+ bne 1b
+ bx lr
+endfunc
+
+
function x264_predict_16x16_dc_top_neon
sub r2, r0, #FDEC_STRIDE
mov r1, #FDEC_STRIDE
diff --git a/common/arm/predict-c.c b/common/arm/predict-c.c
index e0ba0da..b0aedfc 100644
--- a/common/arm/predict-c.c
+++ b/common/arm/predict-c.c
@@ -61,6 +61,21 @@ void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] )
#endif // !HIGH_BIT_DEPTH
}
+void x264_predict_8x16c_init_arm( int cpu, x264_predict_t pf[7] ) // install the ARM 8x16 chroma intra predictors into pf according to the cpu flags
+{
+ if (!(cpu&X264_CPU_NEON)) // everything installed below is a NEON routine, so leave pf untouched without NEON
+ return;
+
+#if !HIGH_BIT_DEPTH
+ pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_neon; // entries are only overridden when HIGH_BIT_DEPTH is off
+ pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x16c_dc_top_neon;
+ pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x16c_dc_left_neon;
+ pf[I_PRED_CHROMA_H] = x264_predict_8x16c_h_neon;
+ pf[I_PRED_CHROMA_V] = x264_predict_8x16c_v_neon;
+ pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_neon;
+#endif // !HIGH_BIT_DEPTH
+}
+
void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
{
if (!(cpu&X264_CPU_NEON))
diff --git a/common/arm/predict.h b/common/arm/predict.h
index 242043d..2aa902a 100644
--- a/common/arm/predict.h
+++ b/common/arm/predict.h
@@ -40,6 +40,13 @@ void x264_predict_8x8c_h_neon( uint8_t *src );
void x264_predict_8x8c_v_neon( uint8_t *src );
void x264_predict_8x8c_p_neon( uint8_t *src );
+void x264_predict_8x16c_v_neon( uint8_t *src );
+void x264_predict_8x16c_h_neon( uint8_t *src );
+void x264_predict_8x16c_dc_neon( uint8_t *src );
+void x264_predict_8x16c_dc_left_neon( uint8_t *src );
+void x264_predict_8x16c_dc_top_neon( uint8_t *src );
+void x264_predict_8x16c_p_neon( uint8_t *src );
+
void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
@@ -60,6 +67,7 @@ void x264_predict_16x16_p_neon( uint8_t *src );
void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] );
void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] );
+void x264_predict_8x16c_init_arm( int cpu, x264_predict_t pf[7] );
void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] );
#endif
diff --git a/common/predict.c b/common/predict.c
index c0f2a0b..f7080f0 100644
--- a/common/predict.c
+++ b/common/predict.c
@@ -977,6 +977,10 @@ void x264_predict_8x16c_init( int cpu, x264_predict_t pf[7] )
x264_predict_8x16c_init_mmx( cpu, pf );
#endif
+#if HAVE_ARMV6
+ x264_predict_8x16c_init_arm( cpu, pf );
+#endif
+
#if ARCH_AARCH64
x264_predict_8x16c_init_aarch64( cpu, pf );
#endif
--
1.7.10.4
More information about the x264-devel
mailing list