[x264-devel] [PATCH] x264_plane_copy_(de)interleave_neon

George Stephanos gaf.stephanos at gmail.com
Thu Feb 9 00:58:08 CET 2012


---
 common/arm/mc-a.S |   64 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 common/arm/mc-c.c |   22 ++++++++++++++++++
 2 files changed, 86 insertions(+), 0 deletions(-)

diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
index 08daa4a..a714e9b 100644
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
@@ -1355,3 +1355,67 @@ lowres_xloop_end:
     vpop            {d8-d15}
     pop             {r4-r10,pc}
 .endfunc
+// void x264_plane_copy_interleave_core_neon( uint8_t *dst, int i_dst, uint8_t *srcu, int i_srcu,
+//                                            uint8_t *srcv, int i_srcv, int w, int h )
+// Interleaves srcu/srcv bytes into dst (u0 v0 u1 v1 ...), 16 bytes of each source per inner iteration.
+// assumes i_dst and w are multiples of 16, and i_dst>2*w; both loops test at the bottom, so each runs at least once
+function x264_plane_copy_interleave_core_neon
+    push            {r4-r11, lr}              // 9 registers = 36 bytes pushed, so the stack args start at [sp, #36]
+    ldrd            r4, [sp, #36]             // r4 = srcv, r5 = i_srcv
+    ldrd            r6, [sp, #44]             // r6 = w,    r7 = h
+    sub             r9, r3, r6                // r9  = i_srcu - w: srcu step from end of row to start of next row
+    sub             r10, r5, r6               // r10 = i_srcv - w: same for srcv
+    sub             r11, r1, r6, lsl #1       // r11 = i_dst - 2*w: same for dst, which advances 2 bytes per pixel
+1:
+    mov             r8, r6                    // r8 = source bytes left in this row
+2:
+        vld1.64         {d0-d1}, [r2]!        // q0 = next 16 bytes of srcu
+        vld1.64         {d2-d3}, [r4]!        // q1 = next 16 bytes of srcv
+        vzip.8          q0, q1                // q0:q1 = u0 v0 u1 v1 ... u15 v15
+        vst1.64         {d0-d3}, [r0]!        // store the 32 interleaved bytes to dst
+        subs            r8, #16
+        bgt             2b                    // loop while bytes remain in the row
+
+    add             r2, r9                    // move all three pointers to the start of their next row
+    add             r4, r10
+    add             r0, r11
+
+    subs            r7, #1
+    bgt             1b                        // loop while rows remain
+
+    pop             {r4-r11, pc}              // restore callee-saved registers and return
+.endfunc
+// void x264_plane_copy_deinterleave_neon( pixel *dstu, int i_dstu, pixel *dstv, int i_dstv,
+//                                         pixel *src, int i_src, int w, int h )
+// Splits src (u0 v0 u1 v1 ...) into dstu/dstv in whole 16-byte output steps: a w that is not a multiple of 16 over-reads src and over-writes dstu/dstv.
+function x264_plane_copy_deinterleave_neon
+    push            {r4-r11, lr}              // 9 registers = 36 bytes pushed, so the stack args start at [sp, #36]
+    ldrd            r4, [sp, #36]             // r4 = src, r5 = i_src
+    ldrd            r6, [sp, #44]             // r6 = w,   r7 = h
+    mov             r9, r0                    // r9/r10/r11 remember the row-start pointers of dstu/dstv/src
+    mov             r10, r2
+    mov             r11, r4
+1:
+    mov             r8, r6                    // r8 = output bytes left in this row
+2:
+    vld1.64         {d0-d3}, [r4]!            // load 32 interleaved bytes from src
+    vuzp.8          q0, q1                    // q0 = even bytes (u), q1 = odd bytes (v)
+    vst1.64         {d0-d1}, [r0]!            // 16 bytes to dstu
+    vst1.64         {d2-d3}, [r2]!            // 16 bytes to dstv
+    subs            r8, #16
+    bgt             2b                        // tested at the bottom: the body runs at least once per row
+
+    add             r9, r1                    // next dstu row: row start += i_dstu, then reset the working pointer
+    mov             r0, r9
+
+    add             r10, r3                   // next dstv row: row start += i_dstv
+    mov             r2, r10
+
+    add             r11, r5                   // next src row: row start += i_src
+    mov             r4, r11
+
+    subs            r7, #1
+    bgt             1b                        // tested at the bottom: one row is processed even if h <= 0
+
+    pop             {r4-r11, pc}              // restore callee-saved registers and return
+.endfunc
diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
index c1fc05c..c61d29d 100644
--- a/common/arm/mc-c.c
+++ b/common/arm/mc-c.c
@@ -79,6 +79,26 @@ void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, int, int );
 void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
 void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );
 
+void x264_plane_copy_interleave_core_neon( uint8_t *, int, uint8_t *, int, uint8_t *, int, int, int ); // dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h
+void x264_plane_copy_deinterleave_neon( pixel *, int, pixel *, int, pixel *, int, int, int ); // dstu, i_dstu, dstv, i_dstv, src, i_src, w, h
+
+// Interleave srcu/srcv into dst. The NEON core copies whole 16-pixel groups and always runs >= 1 row, so unsafe cases/rows go through C.
+static void x264_plane_copy_interleave_neon( pixel *dst, int i_dst, pixel *srcu, int i_srcu,
+                                              pixel *srcv, int i_srcv, int w, int h )
+{
+    if( !(w&15) && w > 0 && h > 0 ) //non-empty multiple of 16: the core is exact
+        x264_plane_copy_interleave_core_neon( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );
+    else if( w < 16 || h < 2 || (i_srcu ^ i_srcv) ) //i_srcu != i_srcv; h < 2 would hand the core h-1 <= 0 rows, yet it still processes one
+        x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );
+    else if( i_srcu > 0 ) { //padded width for every row but the final one, which is copied at exact width
+        x264_plane_copy_interleave_core_neon( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+15)&~15, h-1 );
+        x264_plane_copy_interleave_c( dst+i_dst*(h-1), 0, srcu+i_srcu*(h-1), 0, srcv+i_srcv*(h-1), 0, w, 1 );
+    } else { //non-positive stride: row 0 gets the exact-width copy, the remaining h-1 rows use the padded core
+        x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );
+        x264_plane_copy_interleave_core_neon( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+15)&~15, h-1 );
+    }
+}
+
 #if !HIGH_BIT_DEPTH
 static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
 {
@@ -243,6 +263,8 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
     pf->get_ref = get_ref_neon;
     pf->hpel_filter = hpel_filter_neon;
     pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
+    pf->plane_copy_interleave = x264_plane_copy_interleave_neon;
+    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon;
 #endif // !HIGH_BIT_DEPTH
 
 // Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
-- 
1.7.4.1



More information about the x264-devel mailing list