[x265] [PATCH] arm: Implement dequant_scaling ARM NEON

Fri Apr 15 09:13:30 CEST 2016

# HG changeset patch
# User Radhakrishnan VR <radhakrishnan at multicorewareinc.com>
# Date 1460700579 -19800
#      Fri Apr 15 11:39:39 2016 +0530
# Node ID 5db26e0f305c09cffa1df1557029f7021ae5d7bd
# Parent  54f6bc901448039b2f5523531bda1f2351b6103f
arm: Implement dequant_scaling ARM NEON

diff -r 54f6bc901448 -r 5db26e0f305c source/common/arm/asm-primitives.cpp

--- a/source/common/arm/asm-primitives.cpp	Wed Mar 30 17:29:13 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp	Fri Apr 15 11:39:39 2016 +0530
@@ -43,6 +43,8 @@
 {
     if (cpuMask & X265_CPU_NEON)
     {
+        // dequant_scaling
+         p.dequant_scaling = PFX(dequant_scaling_neon);
         // luma satd
          p.pu[LUMA_4x4].satd   = PFX(pixel_satd_4x4_neon);
          p.pu[LUMA_4x8].satd   = PFX(pixel_satd_4x8_neon);
diff -r 54f6bc901448 -r 5db26e0f305c source/common/arm/pixel-util.S
--- a/source/common/arm/pixel-util.S	Wed Mar 30 17:29:13 2016 +0530
+++ b/source/common/arm/pixel-util.S	Fri Apr 15 11:39:39 2016 +0530
@@ -1962,3 +1962,68 @@
     bx              lr
 endfunc
 
+/***** dequant_scaling: coef[n] = sat16(quantCoef[n] * deQuantCoef[n], scaled by per and shift + 4), 8 coefficients per loop iteration *****/
+// C reference: void dequant_scaling_c(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift); r0 = quantCoef, r1 = deQuantCoef, r2 = coef, r3 = num, per and shift are read from the stack
+function x265_dequant_scaling_neon
+    push            {r4, r5, r6, r7}    // 16 bytes pushed, so the stack arguments now sit at sp + 16 and sp + 20
+    ldr             r4, [sp, #16]       // r4 = per
+    ldr             r5, [sp, #20]       // r5 = shift
+    add             r5, #4              // r5 = shift + 4 (this adjusted value is what "shift" means in the comments below)
+    lsr             r3, #3              // r3 = num / 8 loop iterations; no tail handling, num assumed to be a non-zero multiple of 8 -- TODO confirm against callers
+    cmp             r5, r4
+    blt             skip                // shift < per: take the left-shift path; otherwise fall through to round-and-shift-right
+
+    mov             r12, #1
+    sub             r6, r5, r4          // shift - per
+    sub             r6, #1              // shift - per - 1
+    lsl             r6, r12, r6         // add = 1 << (shift - per - 1); NOTE(review): the count is -1 when shift == per, confirm add becomes 0 in that case
+    vdup.32         q0, r6              // q0 = rounding add in every 32-bit lane
+    sub             r7, r4, r5          // per - shift (<= 0 on this path)
+    vdup.32         q3, r7              // q3 = per-lane count for vshl below; a negative count makes vshl shift right
+
+dequant_loop1:
+    vld1.16         {q9}, [r0]!          // q9 = next 8 int16 quantCoef, r0 post-incremented
+    vld1.32         {q2}, [r1]!          // q2 = deQuantCoef for the low four coefficients
+    vld1.32         {q10}, [r1]!         // q10 = deQuantCoef for the high four coefficients
+    vmovl.s16       q1, d18              // sign-extend the low four quantCoef to 32 bits
+    vmovl.s16       q9, d19              // sign-extend the high four quantCoef (q9 is overwritten with its own widened upper half)
+
+    vmul.s32        q1, q2              // quantCoef * deQuantCoef
+    vmul.s32        q9, q10
+    vadd.s32        q1, q0              // quantCoef * deQuantCoef + add
+    vadd.s32        q9, q0
+
+    vshl.s32        q1, q3              // shift by (per - shift), i.e. signed shift right by (shift - per)
+    vshl.s32        q9, q3
+    vqmovn.s32      d16, q1             // saturating narrow to int16 (the x265_clip3 of the C code)
+    vqmovn.s32      d17, q9
+    subs            r3, #1              // decrement the iteration counter and set the flags tested by bne below
+    vst1.16         {q8}, [r2]!         // store 8 int16 results to coef, r2 post-incremented
+    bne             dequant_loop1
+    b               1f                  // done: jump over the left-shift path to the shared epilogue
+
+skip:
+    sub             r6, r4, r5          // per - shift (> 0 on this path)
+    vdup.16         q0, r6              // q0 = left-shift count in every 16-bit lane
+
+dequant_loop2:
+    vld1.16         {q9}, [r0]!          // q9 = next 8 int16 quantCoef, r0 post-incremented
+    vld1.32         {q2}, [r1]!          // q2 = deQuantCoef for the low four coefficients
+    vld1.32         {q10}, [r1]!         // q10 = deQuantCoef for the high four coefficients
+    vmovl.s16       q1, d18              // sign-extend the low four quantCoef to 32 bits
+    vmovl.s16       q9, d19              // sign-extend the high four quantCoef (q9 is overwritten with its own widened upper half)
+
+    vmul.s32        q1, q2              // quantCoef * deQuantCoef
+    vmul.s32        q9, q10
+    vqmovn.s32      d16, q1             // saturating narrow to int16 (first x265_clip3 of the C code)
+    vqmovn.s32      d17, q9
+
+    vqshl.s16       d16, d0             // coefQ << (per - shift), saturating to int16
+    vqshl.s16       d17, d0
+    subs            r3, #1              // decrement the iteration counter and set the flags tested by bne below
+    vst1.16         {q8}, [r2]!         // store 8 int16 results to coef, r2 post-incremented
+    bne             dequant_loop2
+1:
+    pop             {r4, r5, r6, r7}    // restore the registers saved on entry
+    bx              lr
+endfunc
diff -r 54f6bc901448 -r 5db26e0f305c source/common/arm/pixel-util.h
--- a/source/common/arm/pixel-util.h	Wed Mar 30 17:29:13 2016 +0530
+++ b/source/common/arm/pixel-util.h	Fri Apr 15 11:39:39 2016 +0530
@@ -78,4 +78,6 @@
 int x265_pixel_sa8d_32x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
 int x265_pixel_sa8d_32x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
 int x265_pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
+
+void x265_dequant_scaling_neon(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift);
 #endif // ifndef X265_PIXEL_UTIL_ARM_H