[x265] [PATCH] asm_arm: rewrite filterPixelToShort_4x4, improve by removing unnecessary and slow instructions
Min Chen
chenm003 at 163.com
Wed May 25 17:19:59 CEST 2016
# HG changeset patch
# User Min Chen <min.chen at multicorewareinc.com>
# Date 1464189591 18000
# Node ID 45ea67275abd8d43d77fa2e48e1d9768c09e05b6
# Parent 5abead62ce63ec2a472a2424d54d40f015146995
asm_arm: rewrite filterPixelToShort_4x4, improve by removing unnecessary and slow instructions
---
source/common/arm/ipfilter8.S | 29 ++++++++++++++++-------------
1 files changed, 16 insertions(+), 13 deletions(-)
diff -r 5abead62ce63 -r 45ea67275abd source/common/arm/ipfilter8.S
--- a/source/common/arm/ipfilter8.S Wed May 25 10:18:47 2016 -0500
+++ b/source/common/arm/ipfilter8.S Wed May 25 10:19:51 2016 -0500
@@ -48,22 +48,25 @@
// filterPixelToShort(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride)
function x265_filterPixelToShort_4x4_neon
+ vld1.u32 {d0[]}, [r0], r1
+ vld1.u32 {d0[1]}, [r0], r1
+ vld1.u32 {d1[]}, [r0], r1
+ vld1.u32 {d1[1]}, [r0], r1
+
+ // avoid load pipeline stall
+ vmov.i16 q1, #0xE000
+
+ vshll.u8 q2, d0, #6
+ vshll.u8 q3, d1, #6
+ vadd.i16 q2, q1
+ vadd.i16 q3, q1
+
add r3, r3
- vmov.u16 q8, #64
- vmov.u16 q9, #8192
- vneg.s16 q9, q9
-.rept 2
- vld1.u8 {d0}, [r0], r1
- vld1.u8 {d2}, [r0], r1
- vmovl.u8 q0, d0
- vmovl.u8 q1, d2
- vmov q2, q9
- vmov q3, q9
- vmla.s16 q2, q0, q8
- vmla.s16 q3, q1, q8
vst1.16 {d4}, [r2], r3
+ vst1.16 {d5}, [r2], r3
vst1.16 {d6}, [r2], r3
-.endr
+ vst1.16 {d7}, [r2], r3
+
bx lr
endfunc
More information about the x265-devel
mailing list