[x265] [PATCH 2 of 2] asm_arm: rewrite NEON version of dequant_normal

Min Chen chenm003 at 163.com
Thu Jun 9 20:35:09 CEST 2016


# HG changeset patch
# User Min Chen <min.chen at multicorewareinc.com>
# Date 1465497295 18000
# Node ID a22130631abb598b10f3f0beecf92af223d778fe
# Parent  7dce8656504fdf8d25f67c7a97b781a031bbdf8a
asm_arm: rewrite NEON version of dequant_normal
OLD:
dequant_normal          9.87x    199.80          1971.87

NEW:
dequant_normal          16.16x   122.04          1971.56
---
 source/common/arm/pixel-util.S |   60 ++++++++++++++++++----------------------
 1 files changed, 27 insertions(+), 33 deletions(-)

diff -r 7dce8656504f -r a22130631abb source/common/arm/pixel-util.S
--- a/source/common/arm/pixel-util.S	Thu Jun 09 13:34:52 2016 -0500
+++ b/source/common/arm/pixel-util.S	Thu Jun 09 13:34:55 2016 -0500
@@ -2293,44 +2293,38 @@
 
 // void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
 function x265_dequant_normal_neon
-    push            {r4, r5, r6}
-    ldr             r4, [sp, #12]       // shift
-#if HIGH_BIT_DEPTH
-    cmp             r3, #32767
-    jle             .skip
-    shr             r3, (BIT_DEPTH - 8)
-    sub             r4, (BIT_DEPTH - 8)
-.skip:
+    ldr             r12, [sp]            // r12 = shift (5th argument, passed on the stack)
+#if HIGH_BIT_DEPTH  // untested path: rescale only when scale >= 32768 (would not fit in s16)
+    cmp             r3, #32768
+    lsrge           r3, #(BIT_DEPTH - 8)     // scale >>= (BIT_DEPTH - 8)
+    subge           r12, #(BIT_DEPTH - 8)    // shift -= (BIT_DEPTH - 8) to compensate
 #endif
-    mov             r12, #1
-    sub             r5, r4, #1
-    lsr             r2, #3              // num / 8
-    lsl             r5, r12, r5         // 1 << shift - 1
+    lsr             r2, #4              // num / 16 (assumes num is a positive multiple of 16 - TODO confirm)
 
-    neg             r6, r4
-    vdup.32         q0, r3
-    vdup.32         q1, r6
-    vdup.32         q2, r5
+    neg             r12, r12            // negative count makes vrshl shift right
+    vdup.16         q0, r3              // scale in every 16-bit lane
+    vdup.32         q1, r12             // -shift in every 32-bit lane
 
-dqn_loop1:
-    vld1.16         {q3}, [r0]!
-    vmovl.s16       q8, d6
-    vmovl.s16       q9, d7
+.dqn_loop1:
+    vld1.16         {d4-d7}, [r0]!      // load 16 quantized coefficients
 
-    vmul.s32        q8, q0
-    vmul.s32        q9, q0
-    vadd.s32        q8, q2
-    vadd.s32        q9, q2
+    vmull.s16       q8, d4, d0          // widening multiply: coef * scale -> 32-bit
+    vmull.s16       q9, d5, d0
+    vmull.s16       q10, d6, d0
+    vmull.s16       q11, d7, d0
 
-    vshl.s32        q8, q1
-    vshl.s32        q9, q1
+    vrshl.s32       q8, q1              // rounding shift right by 'shift' (replaces add + shift)
+    vrshl.s32       q9, q1
+    vrshl.s32       q10, q1
+    vrshl.s32       q11, q1
     vqmovn.s32      d16, q8
     vqmovn.s32      d17, q9
+    vqmovn.s32      d18, q10            // saturating narrow back to 16-bit
+    vqmovn.s32      d19, q11
 
     subs            r2, #1
-    vst1.16         {q8}, [r1]!
-    bne             dqn_loop1
-    pop             {r4, r5, r6}
+    vst1.16         {d16-d19}, [r1]!    // store 16 dequantized coefficients
+    bgt             .dqn_loop1
     bx              lr
 endfunc
 



More information about the x265-devel mailing list