[x265] [PATCH 2 of 2] asm_arm: rewrite NEON version of dequant_normal
Min Chen
chenm003 at 163.com
Thu Jun 9 20:35:09 CEST 2016
# HG changeset patch
# User Min Chen <min.chen at multicorewareinc.com>
# Date 1465497295 18000
# Node ID a22130631abb598b10f3f0beecf92af223d778fe
# Parent 7dce8656504fdf8d25f67c7a97b781a031bbdf8a
asm_arm: rewrite NEON version of dequant_normal
OLD:
dequant_normal 9.87x 199.80 1971.87
NEW:
dequant_normal 16.16x 122.04 1971.56
---
source/common/arm/pixel-util.S | 60 ++++++++++++++++++----------------------
1 files changed, 27 insertions(+), 33 deletions(-)
diff -r 7dce8656504f -r a22130631abb source/common/arm/pixel-util.S
--- a/source/common/arm/pixel-util.S Thu Jun 09 13:34:52 2016 -0500
+++ b/source/common/arm/pixel-util.S Thu Jun 09 13:34:55 2016 -0500
@@ -2293,44 +2293,38 @@
// void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
function x265_dequant_normal_neon
- push {r4, r5, r6}
- ldr r4, [sp, #12] // shift
-#if HIGH_BIT_DEPTH
- cmp r3, #32767
- jle .skip
- shr r3, (BIT_DEPTH - 8)
- sub r4, (BIT_DEPTH - 8)
-.skip:
+ ldr r12, [sp] // r12 = shift (fifth argument, read from the stack)
+#if HIGH_BIT_DEPTH // NEVER TESTED path
+ cmp r3, #32768 // does scale exceed 32767 (same test as the removed cmp/jle .skip)?
+ lsrge r3, #(BIT_DEPTH - 8) // only then: pre-shift scale down before it is narrowed by vdup.16
+ subge r12, #(BIT_DEPTH - 8) // ...and reduce shift by the same amount to compensate
#endif
- mov r12, #1
- sub r5, r4, #1
- lsr r2, #3 // num / 8
- lsl r5, r12, r5 // 1 << shift - 1
+ lsr r2, #4 // loop count = num / 16 (16 coefficients per iteration)
- neg r6, r4
- vdup.32 q0, r3
- vdup.32 q1, r6
- vdup.32 q2, r5
+ neg r12, r12 // r12 = -shift: a negative vrshl count is a rounding right shift
+ vdup.16 q0, r3 // broadcast scale into 16-bit lanes
+ vdup.32 q1, r12 // broadcast -shift into 32-bit lanes
-dqn_loop1:
- vld1.16 {q3}, [r0]!
- vmovl.s16 q8, d6
- vmovl.s16 q9, d7
+.dqn_loop1:
+ vld1.16 {d4-d7}, [r0]! // load 16 x int16 from quantCoef, post-increment r0
- vmul.s32 q8, q0
- vmul.s32 q9, q0
- vadd.s32 q8, q2
- vadd.s32 q9, q2
+ vmull.s16 q8, d4, d0 // widening multiply: quantCoef * scale -> 32-bit lanes
+ vmull.s16 q9, d5, d0
+ vmull.s16 q10, d6, d0
+ vmull.s16 q11, d7, d0
- vshl.s32 q8, q1
- vshl.s32 q9, q1
+ vrshl.s32 q8, q1 // rounding shift replaces the removed vadd (1 << (shift - 1)) + vshl pair
+ vrshl.s32 q9, q1
+ vrshl.s32 q10, q1
+ vrshl.s32 q11, q1
vqmovn.s32 d16, q8
vqmovn.s32 d17, q9
+ vqmovn.s32 d18, q10 // saturating narrow back to int16
+ vqmovn.s32 d19, q11
subs r2, #1
- vst1.16 {q8}, [r1]!
- bne dqn_loop1
- pop {r4, r5, r6}
+ vst1.16 {d16-d19}, [r1]! // store 16 x int16 to coef, post-increment r1 (flags from subs untouched)
+ bgt .dqn_loop1 // repeat while the count decremented by subs is still positive
bx lr
endfunc
More information about the x265-devel
mailing list