[x264-devel] More ARM NEON assembly functions
George Stephanos
git at videolan.org
Mon Jan 16 02:11:56 CET 2012
x264 | branch: master | George Stephanos <gaf.stephanos at gmail.com> | Thu Dec 1 16:53:45 2011 -0800| [027b05e0a22421e477847506a205a49b151ae5bf] | committer: Jason Garrett-Glaser
More ARM NEON assembly functions
predict_8x8_v, predict_4x4_dc_top, predict_8x8_ddl, predict_8x8_ddr, predict_8x8_vl, predict_8x8_vr, predict_8x8_hd, predict_8x8_hu.
From Google Code-In.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=027b05e0a22421e477847506a205a49b151ae5bf
---
common/arm/predict-a.S | 213 +++++++++++++++++++++++++++++++++++++++++++++++-
common/arm/predict-c.c | 16 ++++
2 files changed, 228 insertions(+), 1 deletions(-)
diff --git a/common/arm/predict-a.S b/common/arm/predict-a.S
index 574653e..3e9ed61 100644
--- a/common/arm/predict-a.S
+++ b/common/arm/predict-a.S
@@ -102,6 +102,21 @@ function x264_predict_4x4_dc_armv6
bx lr
.endfunc
+function x264_predict_4x4_dc_top_neon
+ mov r12, #FDEC_STRIDE
+ sub r1, r0, #FDEC_STRIDE
+ vld1.32 d1[], [r1,:32]
+ vpaddl.u8 d1, d1
+ vpadd.u16 d1, d1, d1
+ vrshr.u16 d1, d1, #2
+ vdup.8 d1, d1[0]
+ vst1.32 d1[0], [r0,:32], r12
+ vst1.32 d1[0], [r0,:32], r12
+ vst1.32 d1[0], [r0,:32], r12
+ vst1.32 d1[0], [r0,:32], r12
+ bx lr
+.endfunc
+
// return a1 = (a1+2*b1+c1+2)>>2 a2 = (a2+2*b2+c2+2)>>2
.macro PRED4x4_LOWPASS a1 b1 c1 a2 b2 c2 pb_1
uhadd8 \a1, \a1, \c1
@@ -211,6 +226,202 @@ function x264_predict_8x8_h_neon
bx lr
.endfunc
+function x264_predict_8x8_v_neon
+ add r1, r1, #16
+ mov r12, #FDEC_STRIDE
+ vld1.8 {d0}, [r1,:64]
+.rept 8
+ vst1.8 {d0}, [r0,:64], r12
+.endr
+ bx lr
+.endfunc
+
+function x264_predict_8x8_ddl_neon
+ add r1, #16
+ vld1.8 {d0, d1}, [r1,:128]
+ vmov.i8 q3, #0
+ vrev64.8 d2, d1
+ vext.8 q8, q3, q0, #15
+ vext.8 q2, q0, q1, #1
+ vhadd.u8 q8, q2
+ mov r12, #FDEC_STRIDE
+ vrhadd.u8 q0, q8
+ vext.8 d2, d0, d1, #1
+ vext.8 d3, d0, d1, #2
+ vst1.8 d2, [r0,:64], r12
+ vext.8 d2, d0, d1, #3
+ vst1.8 d3, [r0,:64], r12
+ vext.8 d3, d0, d1, #4
+ vst1.8 d2, [r0,:64], r12
+ vext.8 d2, d0, d1, #5
+ vst1.8 d3, [r0,:64], r12
+ vext.8 d3, d0, d1, #6
+ vst1.8 d2, [r0,:64], r12
+ vext.8 d2, d0, d1, #7
+ vst1.8 d3, [r0,:64], r12
+ vst1.8 d2, [r0,:64], r12
+ vst1.8 d1, [r0,:64], r12
+ bx lr
+.endfunc
+
+function x264_predict_8x8_ddr_neon
+ vld1.8 {d0-d3}, [r1,:128]
+ vext.8 q2, q0, q1, #7
+ vext.8 q3, q0, q1, #9
+
+ vhadd.u8 q2, q2, q3
+ vrhadd.u8 d0, d1, d4
+ vrhadd.u8 d1, d2, d5
+
+ add r0, #7*FDEC_STRIDE
+ mov r12, #-1*FDEC_STRIDE
+
+ vext.8 d2, d0, d1, #1
+ vst1.8 {d0}, [r0,:64], r12
+ vext.8 d4, d0, d1, #2
+ vst1.8 {d2}, [r0,:64], r12
+ vext.8 d5, d0, d1, #3
+ vst1.8 {d4}, [r0,:64], r12
+ vext.8 d4, d0, d1, #4
+ vst1.8 {d5}, [r0,:64], r12
+ vext.8 d5, d0, d1, #5
+ vst1.8 {d4}, [r0,:64], r12
+ vext.8 d4, d0, d1, #6
+ vst1.8 {d5}, [r0,:64], r12
+ vext.8 d5, d0, d1, #7
+ vst1.8 {d4}, [r0,:64], r12
+ vst1.8 {d5}, [r0,:64], r12
+ bx lr
+.endfunc
+
+function x264_predict_8x8_vl_neon
+ add r1, #16
+ mov r12, #FDEC_STRIDE
+
+ vld1.8 {d0, d1}, [r1,:128]
+ vext.8 q1, q1, q0, #15
+ vext.8 q2, q0, q2, #1
+
+ vrhadd.u8 q3, q0, q2
+
+ vhadd.u8 q1, q1, q2
+ vrhadd.u8 q0, q0, q1
+
+ vext.8 d2, d0, d1, #1
+ vst1.8 {d6}, [r0,:64], r12
+ vext.8 d3, d6, d7, #1
+ vst1.8 {d2}, [r0,:64], r12
+ vext.8 d2, d0, d1, #2
+ vst1.8 {d3}, [r0,:64], r12
+ vext.8 d3, d6, d7, #2
+ vst1.8 {d2}, [r0,:64], r12
+ vext.8 d2, d0, d1, #3
+ vst1.8 {d3}, [r0,:64], r12
+ vext.8 d3, d6, d7, #3
+ vst1.8 {d2}, [r0,:64], r12
+ vext.8 d2, d0, d1, #4
+ vst1.8 {d3}, [r0,:64], r12
+ vst1.8 {d2}, [r0,:64], r12
+ bx lr
+.endfunc
+
+function x264_predict_8x8_vr_neon
+ add r1, #8
+ mov r12, #FDEC_STRIDE
+ vld1.8 {d4,d5}, [r1,:64]
+
+ vext.8 q1, q2, q2, #14
+ vext.8 q0, q2, q2, #15
+
+ vhadd.u8 q3, q2, q1
+ vrhadd.u8 q2, q2, q0
+ vrhadd.u8 q0, q0, q3
+
+ vmov d2, d0
+
+ vst1.8 {d5}, [r0,:64], r12
+ vuzp.8 d2, d0
+ vst1.8 {d1}, [r0,:64], r12
+ vext.8 d6, d0, d5, #7
+ vext.8 d3, d2, d1, #7
+ vst1.8 {d6}, [r0,:64], r12
+ vst1.8 {d3}, [r0,:64], r12
+ vext.8 d6, d0, d5, #6
+ vext.8 d3, d2, d1, #6
+ vst1.8 {d6}, [r0,:64], r12
+ vst1.8 {d3}, [r0,:64], r12
+ vext.8 d6, d0, d5, #5
+ vext.8 d3, d2, d1, #5
+ vst1.8 {d6}, [r0,:64], r12
+ vst1.8 {d3}, [r0,:64], r12
+ bx lr
+.endfunc
+
+function x264_predict_8x8_hd_neon
+ mov r12, #FDEC_STRIDE
+ add r1, #7
+
+ vld1.8 {d2,d3}, [r1]
+ vext.8 q3, q1, q1, #1
+ vext.8 q2, q1, q1, #2
+
+ vrhadd.u8 q8, q1, q3
+
+ vhadd.u8 q1, q2
+ vrhadd.u8 q0, q1, q3
+
+ vzip.8 d16, d0
+
+ vext.8 d2, d0, d1, #6
+ vext.8 d3, d0, d1, #4
+ vst1.8 {d2}, [r0,:64], r12
+ vext.8 d2, d0, d1, #2
+ vst1.8 {d3}, [r0,:64], r12
+ vst1.8 {d2}, [r0,:64], r12
+ vext.8 d2, d16, d0, #6
+ vst1.8 {d0}, [r0,:64], r12
+ vext.8 d3, d16, d0, #4
+ vst1.8 {d2}, [r0,:64], r12
+ vext.8 d2, d16, d0, #2
+ vst1.8 {d3}, [r0,:64], r12
+ vst1.8 {d2}, [r0,:64], r12
+ vst1.8 {d16}, [r0,:64], r12
+
+ bx lr
+.endfunc
+
+function x264_predict_8x8_hu_neon
+ mov r12, #FDEC_STRIDE
+ add r1, #7
+ vld1.8 {d7}, [r1]
+ vdup.8 d6, d7[0]
+ vrev64.8 d7, d7
+
+ vext.8 d4, d7, d6, #2
+ vext.8 d2, d7, d6, #1
+
+ vhadd.u8 d16, d7, d4
+ vrhadd.u8 d0, d2, d7
+ vrhadd.u8 d1, d16, d2
+
+ vzip.8 d0, d1
+
+ vdup.16 q1, d1[3]
+
+ vext.8 q2, q0, q1, #2
+ vext.8 q3, q0, q1, #4
+ vext.8 q8, q0, q1, #6
+ vst1.8 {d0}, [r0,:64], r12
+ vst1.8 {d4}, [r0,:64], r12
+ vst1.8 {d6}, [r0,:64], r12
+ vst1.8 {d16}, [r0,:64], r12
+
+ vst1.8 {d1}, [r0,:64], r12
+ vst1.8 {d5}, [r0,:64], r12
+ vst1.8 {d7}, [r0,:64], r12
+ vst1.8 {d17}, [r0,:64]
+ bx lr
+.endfunc
function x264_predict_8x8c_dc_top_neon
sub r2, r0, #FDEC_STRIDE
@@ -223,7 +434,7 @@ function x264_predict_8x8c_dc_top_neon
vdup.8 d0, d0[0]
vtrn.32 d0, d1
b pred8x8_dc_end
- .endfunc
+.endfunc
function x264_predict_8x8c_dc_left_neon
mov r1, #FDEC_STRIDE
diff --git a/common/arm/predict-c.c b/common/arm/predict-c.c
index bccdc50..bf8fd38 100644
--- a/common/arm/predict-c.c
+++ b/common/arm/predict-c.c
@@ -28,6 +28,7 @@
#include "pixel.h"
void x264_predict_4x4_dc_armv6( uint8_t *src );
+void x264_predict_4x4_dc_top_neon( uint8_t *src );
void x264_predict_4x4_h_armv6( uint8_t *src );
void x264_predict_4x4_ddr_armv6( uint8_t *src );
void x264_predict_4x4_ddl_neon( uint8_t *src );
@@ -40,7 +41,14 @@ void x264_predict_8x8c_v_neon( uint8_t *src );
void x264_predict_8x8c_p_neon( uint8_t *src );
void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
void x264_predict_16x16_dc_neon( uint8_t *src );
void x264_predict_16x16_dc_top_neon( uint8_t *src );
@@ -62,6 +70,7 @@ void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] )
if (!(cpu&X264_CPU_NEON))
return;
+ pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_neon;
pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_neon;
#endif // !HIGH_BIT_DEPTH
}
@@ -87,8 +96,15 @@ void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_
return;
#if !HIGH_BIT_DEPTH
+ pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_neon;
+ pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_neon;
+ pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_neon;
+ pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_neon;
pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_neon;
pf[I_PRED_8x8_H] = x264_predict_8x8_h_neon;
+ pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_neon;
+ pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_neon;
+ pf[I_PRED_8x8_V] = x264_predict_8x8_v_neon;
#endif // !HIGH_BIT_DEPTH
}
More information about the x264-devel
mailing list