[x265] [PATCH] arm: Implement dequant_normal ARM NEON

Wed Apr 20 18:05:33 CEST 2016

At 2016-04-20 19:14:39,radhakrishnan at multicorewareinc.com wrote:
># HG changeset patch
># User Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
># Date 1460718872 -19800
>#      Fri Apr 15 16:44:32 2016 +0530
># Node ID 534b8e2845b8156010b3c79bfa88c81c7b0b9295
># Parent  c1bee15b165dd29e524501ba969973f24ea29007
>arm: Implement dequant_normal ARM NEON
>
>diff -r c1bee15b165d -r 534b8e2845b8 source/common/arm/asm-primitives.cpp
>--- a/source/common/arm/asm-primitives.cpp	Fri Apr 15 11:39:39 2016 +0530
>+++ b/source/common/arm/asm-primitives.cpp	Fri Apr 15 16:44:32 2016 +0530
>@@ -45,6 +45,8 @@
>     {
>         // dequant_scaling
>          p.dequant_scaling = PFX(dequant_scaling_neon);
>+         p.dequant_normal  = PFX(dequant_normal_neon);
>+
>         // luma satd
>          p.pu[LUMA_4x4].satd   = PFX(pixel_satd_4x4_neon);
>          p.pu[LUMA_4x8].satd   = PFX(pixel_satd_4x8_neon);
>diff -r c1bee15b165d -r 534b8e2845b8 source/common/arm/pixel-util.S
>--- a/source/common/arm/pixel-util.S	Fri Apr 15 11:39:39 2016 +0530
>+++ b/source/common/arm/pixel-util.S	Fri Apr 15 16:44:32 2016 +0530
>@@ -2027,3 +2027,46 @@
>     pop             {r4, r5, r6, r7}
>     bx              lr
> endfunc
>+
>+// void dequant_normal_c(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift)
>+function x265_dequant_normal_neon
>+    push            {r4, r5, r6}
>+    ldr             r4, [sp, #12]       // shift
>+#if HIGH_BIT_DEPTH
>+    cmp             r3, #32767
>+    jle             .skip
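ARM supports conditional execution of instructions, so this branch can be avoided by making the two instructions below conditional (e.g. `lsrgt` / `subgt` after the `cmp`).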

>+    shr             r3, (BIT_DEPTH - 8)
>+    sub             r4, (BIT_DEPTH - 8)
>+.skip:
>+#endif
>+    mov             r12, #1
>+    sub             r5, r4, #1
>+    lsr             r2, #3              // num / 8
>+    lsl             r5, r12, r5         // 1 << shift - 1
>+
>+    neg             r6, r4
>+    vdup.32         q0, r3
>+    vdup.32         q1, r6
>+    vdup.32         q2, r5
>+
>+dqn_loop1:
>+    vld1.16         {q3}, [r0]!
>+    vmovl.s16       q8, d6
>+    vmovl.s16       q9, d7
>+
>+    vmul.s32        q8, q0
>+    vmul.s32        q9, q0
>+    vadd.s32        q8, q2
>+    vadd.s32        q9, q2

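The vmovl + vmul + vadd sequence can be replaced by a single vmlal (widening multiply-accumulate), with the accumulator preloaded with the rounding offset.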


>+
>+    vshl.s32        q8, q1
>+    vshl.s32        q9, q1

The ARM assembly reference does not say that a negative shift count may be used here; I only see the following note:
imm  --- "1 to (size(datatype) – 1) for VSHL, VQSHL, or VQSHLU"


>+    vqmovn.s32      d16, q8
>+    vqmovn.s32      d17, q9
The shift and narrowing instructions above could be combined and replaced by vqshrun.

>+
>+    subs            r2, #1
>+    vst1.16         {q8}, [r1]!
>+    bne             dqn_loop1
>+    pop             {r4, r5, r6}
>+    bx              lr
>+endfunc

-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20160421/d84fc853/attachment-0001.html>