[x265] [PATCH] asm_arm: rewrite filterPixelToShort_4x4, improve by removing unnecessary and slow instructions

Min Chen chenm003 at 163.com
Wed May 25 17:19:59 CEST 2016


# HG changeset patch
# User Min Chen <min.chen at multicorewareinc.com>
# Date 1464189591 18000
# Node ID 45ea67275abd8d43d77fa2e48e1d9768c09e05b6
# Parent  5abead62ce63ec2a472a2424d54d40f015146995
asm_arm: rewrite filterPixelToShort_4x4, improve by removing unnecessary and slow instructions
---
 source/common/arm/ipfilter8.S |   29 ++++++++++++++++-------------
 1 files changed, 16 insertions(+), 13 deletions(-)

diff -r 5abead62ce63 -r 45ea67275abd source/common/arm/ipfilter8.S
--- a/source/common/arm/ipfilter8.S	Wed May 25 10:18:47 2016 -0500
+++ b/source/common/arm/ipfilter8.S	Wed May 25 10:19:51 2016 -0500
@@ -48,22 +48,25 @@
 
 // filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
 function x265_filterPixelToShort_4x4_neon
+    vld1.u32    {d0[]}, [r0], r1
+    vld1.u32    {d0[1]}, [r0], r1
+    vld1.u32    {d1[]}, [r0], r1
+    vld1.u32    {d1[1]}, [r0], r1
+
+    // avoid load pipeline stall
+    vmov.i16    q1, #0xE000
+
+    vshll.u8    q2, d0, #6
+    vshll.u8    q3, d1, #6
+    vadd.i16    q2, q1
+    vadd.i16    q3, q1
+
     add         r3, r3
-    vmov.u16    q8, #64
-    vmov.u16    q9, #8192
-    vneg.s16    q9, q9
-.rept 2
-    vld1.u8     {d0}, [r0], r1
-    vld1.u8     {d2}, [r0], r1
-    vmovl.u8    q0, d0
-    vmovl.u8    q1, d2
-    vmov        q2, q9
-    vmov        q3, q9
-    vmla.s16    q2, q0, q8
-    vmla.s16    q3, q1, q8
     vst1.16     {d4}, [r2], r3
+    vst1.16     {d5}, [r2], r3
     vst1.16     {d6}, [r2], r3
-.endr
+    vst1.16     {d7}, [r2], r3
+
     bx          lr
 endfunc
 



More information about the x265-devel mailing list