[x264-devel] [PATCH] x264_plane_copy_(de)interleave_neon
George Stephanos
gaf.stephanos at gmail.com
Thu Feb 9 00:58:08 CET 2012
---
common/arm/mc-a.S | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++
common/arm/mc-c.c | 22 ++++++++++++++++++
2 files changed, 86 insertions(+), 0 deletions(-)
diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
index 08daa4a..a714e9b 100644
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
@@ -1355,3 +1355,67 @@ lowres_xloop_end:
vpop {d8-d15}
pop {r4-r10,pc}
.endfunc
+// void x264_plane_copy_interleave_core_neon( pixel *dst, int i_dst,
+ // pixel *srcu, int i_srcu,
+ // pixel *srcv, int i_srcv, int w, int h )
+// zips srcu/srcv bytes into dst (u0 v0 u1 v1 ...), 16 pairs per step; assumes i_dst and w are multiples of 16, and i_dst>2*w
+function x264_plane_copy_interleave_core_neon
+ push {r4-r11, lr}             // save 9 registers (36 bytes) so r4-r11 can be used as scratch
+ ldrd r4, [sp, #36]            // r4 = srcv, r5 = i_srcv (first two stack arguments, above the 36 pushed bytes)
+ ldrd r6, [sp, #44]            // r6 = w, r7 = h
+ sub r9, r3, r6                // r9  = i_srcu - w: step from the end of one srcu row to the start of the next
+ sub r10, r5, r6               // r10 = i_srcv - w: same for srcv
+ sub r11, r1, r6, lsl #1       // r11 = i_dst - 2*w: same for dst, which receives 2*w bytes per row
+1:                             // row loop
+ mov r8, r6                    // r8 = source bytes per plane still to process in this row
+2:                             // 16-pixel column loop
+ vld1.64 {d0-d1}, [r2]!        // q0 = next 16 bytes of srcu, pointer post-incremented
+ vld1.64 {d2-d3}, [r4]!        // q1 = next 16 bytes of srcv, pointer post-incremented
+ vzip.8 q0, q1                 // interleave bytewise: q0:q1 = u0 v0 u1 v1 ... u15 v15
+ vst1.64 {d0-d3}, [r0]!        // store the 32 interleaved bytes, dst post-incremented
+ subs r8, #16                  // 16 source pixels consumed
+ bgt 2b                        // continue while pixels remain in this row
+
+ add r2, r9                    // srcu -> start of next row
+ add r4, r10                   // srcv -> start of next row
+ add r0, r11                   // dst  -> start of next row
+
+ subs r7, #1                   // one row done
+ bgt 1b                        // continue while rows remain
+
+ pop {r4-r11, pc}              // restore registers and return
+.endfunc
+// void x264_plane_copy_deinterleave_neon( pixel *dstu, int i_dstu,
+// pixel *dstv, int i_dstv,
+// pixel *src, int i_src, int w, int h ) -- splits interleaved src bytes (u0 v0 u1 v1 ...) into dstu and dstv
+function x264_plane_copy_deinterleave_neon
+ push {r4-r11, lr}             // save 9 registers (36 bytes) so r4-r11 can be used as scratch
+ ldrd r4, [sp, #36]            // r4 = src, r5 = i_src (first two stack arguments, above the 36 pushed bytes)
+ ldrd r6, [sp, #44]            // r6 = w, r7 = h
+ mov r9, r0                    // r9  = start of current dstu row
+ mov r10, r2                   // r10 = start of current dstv row
+ mov r11, r4                   // r11 = start of current src row
+1:                             // row loop
+ mov r8, r6                    // r8 = output pixels per plane still to produce in this row
+2:                             // 16-pixel column loop; w is consumed in steps of 16, so a w that is not a multiple of 16 is effectively rounded up
+ vld1.64 {d0-d3}, [r4]!        // q0:q1 = next 32 interleaved bytes of src, pointer post-incremented
+ vuzp.8 q0, q1                 // de-interleave bytewise: q0 = even bytes (u), q1 = odd bytes (v)
+ vst1.64 {d0-d1}, [r0]!        // store 16 bytes to dstu
+ vst1.64 {d2-d3}, [r2]!        // store 16 bytes to dstv
+ subs r8, #16                  // 16 pixels per plane produced
+ bgt 2b                        // continue while pixels remain in this row
+
+ add r9, r1                    // advance dstu row start by i_dstu
+ mov r0, r9                    // and reset the working dstu pointer to it
+
+ add r10, r3                   // advance dstv row start by i_dstv
+ mov r2, r10                   // and reset the working dstv pointer to it
+
+ add r11, r5                   // advance src row start by i_src
+ mov r4, r11                   // and reset the working src pointer to it
+
+ subs r7, #1                   // one row done
+ bgt 1b                        // continue while rows remain
+
+ pop {r4-r11, pc}              // restore registers and return
+.endfunc
diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
index c1fc05c..c61d29d 100644
--- a/common/arm/mc-c.c
+++ b/common/arm/mc-c.c
@@ -79,6 +79,26 @@ void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, int, int );
void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );
+void x264_plane_copy_interleave_core_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int, int ); // asm in mc-a.S: (dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h). NOTE(review): uint8_t here vs pixel in the next prototype -- confirm which is intended
+void x264_plane_copy_deinterleave_neon( pixel *, int, pixel *, int, pixel *, int, int, int ); // asm in mc-a.S: (dstu, i_dstu, dstv, i_dstv, src, i_src, w, h)
+
+static void x264_plane_copy_interleave_neon( pixel *dst, int i_dst,
+                                             pixel *srcu, int i_srcu,
+                                             pixel *srcv, int i_srcv, int w, int h )
+{   // Interleave srcu/srcv into dst: pick the NEON core, the C fallback, or a split of both, depending on w and the source strides.
+    if( !(w&15) ) // w is a multiple of 16: the NEON core handles every row as-is
+        x264_plane_copy_interleave_core_neon( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );
+    else if( w < 16 || (i_srcu ^ i_srcv) ) // narrower than one 16-pixel step, or i_srcu != i_srcv: C fallback for the whole plane
+        x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );
+    else if( i_srcu > 0 ) { // positive stride: NEON with w rounded up to 16 for the first h-1 rows, exact-width C for the last row
+        x264_plane_copy_interleave_core_neon( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+15)&~15, h-1 );
+        x264_plane_copy_interleave_c( dst+i_dst*(h-1), 0, srcu+i_srcu*(h-1), 0, srcv+i_srcv*(h-1), 0, w, 1 );
+    } else { // stride <= 0: exact-width C for the first row, NEON with rounded-up w for the remaining h-1 rows
+        x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );
+        x264_plane_copy_interleave_core_neon( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+15)&~15, h-1 );
+    } // NOTE(review): the one exact-width C row is presumably the row that lies last in memory, so the rounded-up NEON width never runs past the buffers -- confirm
+}
+
#if !HIGH_BIT_DEPTH
static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
{
@@ -243,6 +263,8 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
pf->get_ref = get_ref_neon;
pf->hpel_filter = hpel_filter_neon;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
+ pf->plane_copy_interleave = x264_plane_copy_interleave_neon;
+ pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon;
#endif // !HIGH_BIT_DEPTH
// Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
--
1.7.4.1
More information about the x264-devel
mailing list