[x265] [PATCH] arm: Implement nquant

Mon Apr 25 13:58:16 CEST 2016

Hi Min, I hope I have understood your comments correctly and addressed them as you intended.

# HG changeset patch
# User Ramya Sriraman<ramya at multicorewareinc.com>
# Date 1461321845 -19800
#      Fri Apr 22 16:14:05 2016 +0530
# Node ID cb78ce0ce13f564e0766887b9c1e575c416acc7b
# Parent  e21b86fb24567ad92be78eaadd4278fd34a7d161
arm: Implement nquant

diff -r e21b86fb2456 -r cb78ce0ce13f source/common/arm/asm-primitives.cpp

--- a/source/common/arm/asm-primitives.cpp    Wed Apr 20 18:44:13 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp    Fri Apr 22 16:14:05 2016 +0530
@@ -822,6 +822,7 @@

         // quant
         p.quant = PFX(quant_neon);
+        p.nquant = PFX(nquant_neon);
     }
     if (cpuMask & X265_CPU_ARMV6)
     {
diff -r e21b86fb2456 -r cb78ce0ce13f source/common/arm/pixel-util.S
--- a/source/common/arm/pixel-util.S    Wed Apr 20 18:44:13 2016 +0530
+++ b/source/common/arm/pixel-util.S    Fri Apr 22 16:14:05 2016 +0530
@@ -2022,3 +2022,57 @@
     bx              lr
 endfunc

+// uint32_t x265_nquant_neon(const int16_t* coef, const int32_t* quantCoeff,
+//                           int16_t* qCoef, int qBits, int add, int numCoeff)
+//
+// Per coefficient: level = (abs(coef) * quantCoeff + add) >> qBits, the sign
+// of coef is re-applied, the value is saturated to 16 bits and its absolute
+// value is stored to qCoef. Returns the number of non-zero levels.
+//
+// In:  r0 = coef, r1 = quantCoeff, r2 = qCoef, r3 = qBits,
+//      [sp] = add, [sp, #4] = numCoeff
+// Out: r0 = count of non-zero levels
+// Four coefficients are handled per iteration and the count is tested only
+// after the first pass, so numCoeff is assumed to be a non-zero multiple of 4
+// (TODO confirm against callers).
+// Uses only r12 and q0-q3, q8-q10 as scratch; nothing has to be saved.
+function x265_nquant_neon
+    rsb             r3, r3, #0
+    vdup.s32        q8, r3                  // q8 = -qBits (vshl by a negative count shifts right)
+    ldr             r12, [sp]               // add
+    vdup.s32        q9, r12                 // q9 = add
+    ldr             r3, [sp, #4]            // numCoeff
+    vmov.i32        q10, #0                 // q10 = per-lane count of zero levels
+    lsr             r3, r3, #2              // r3 = iterations (4 coefficients each)
+    lsl             r12, r3, #2             // r12 = coefficients processed
+
+.loop_nquant:
+    vld1.s16        d0, [r0]!
+    vmovl.s16       q1, d0                  // q1 = coef[blockpos]
+    vclt.s32        q3, q1, #0              // q3 = sign mask (-1 where coef < 0)
+    vabs.s32        q1, q1                  // q1 = abs(coef[blockpos])
+    vld1.s32        {q0}, [r1]!             // q0 = quantCoeff[blockpos]
+    vmul.s32        q0, q0, q1              // q0 = tmplevel = abs(coef) * quantCoeff
+    vadd.s32        q1, q0, q9              // q1 = tmplevel + add
+    vshl.s32        q1, q1, q8              // q1 = level = (tmplevel + add) >> qBits
+
+    // clz(level) is 32 only for level == 0, so (clz >> 5) is 1 for zero lanes
+    vclz.s32        q2, q1
+    vshr.u32        q2, #5
+    vadd.u32        q10, q10, q2
+
+    veor.s32        q2, q1, q3              // (level ^ mask) - mask restores the sign
+    vsub.s32        q2, q2, q3
+    vqmovn.s32      d0, q2                  // saturate to int16
+    vabs.s16        d1, d0
+    vst1.s16        d1, [r2]!               // qCoef[blockpos]
+
+    subs            r3, #1
+    bne             .loop_nquant
+
+    vadd.u32        d20, d20, d21           // fold the four lane counters into one
+    vpadd.u32       d20, d20, d20
+    vmov.32         r0, d20[0]              // r0 = number of zero levels
+    sub             r0, r12, r0             // numSig = processed - zeros
+    bx              lr
+endfunc
diff -r e21b86fb2456 -r cb78ce0ce13f source/common/arm/pixel-util.h
--- a/source/common/arm/pixel-util.h    Wed Apr 20 18:44:13 2016 +0530
+++ b/source/common/arm/pixel-util.h    Fri Apr 22 16:14:05 2016 +0530
@@ -80,4 +80,5 @@
 int x265_pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);

 uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
+uint32_t x265_nquant_neon(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
 #endif // ifndef X265_PIXEL_UTIL_ARM_H


Thank you
Regards
Ramya

On Fri, Apr 22, 2016 at 9:38 PM, chen <chenm003 at 163.com> wrote:

>
> At 2016-04-22 18:51:43,ramya at multicorewareinc.com wrote:
> ># HG changeset patch
> ># User Ramya Sriraman<ramya at multicorewareinc.com>
> ># Date 1461321845 -19800
> >#      Fri Apr 22 16:14:05 2016 +0530
> ># Node ID 750a20cdf7dcb381f9008f51473fbd74d45b7f5e
> ># Parent  e21b86fb24567ad92be78eaadd4278fd34a7d161
> >arm: Implement nquant
> >
> >diff -r e21b86fb2456 -r 750a20cdf7dc source/common/arm/asm-primitives.cpp
> >--- a/source/common/arm/asm-primitives.cpp	Wed Apr 20 18:44:13 2016 +0530
> >+++ b/source/common/arm/asm-primitives.cpp	Fri Apr 22 16:14:05 2016 +0530
> >@@ -822,6 +822,7 @@
> >
> >         // quant
> >         p.quant = PFX(quant_neon);
> >+        p.nquant = PFX(nquant_neon);
> >     }
> >     if (cpuMask & X265_CPU_ARMV6)
> >     {
> >diff -r e21b86fb2456 -r 750a20cdf7dc source/common/arm/pixel-util.S
> >--- a/source/common/arm/pixel-util.S	Wed Apr 20 18:44:13 2016 +0530
> >+++ b/source/common/arm/pixel-util.S	Fri Apr 22 16:14:05 2016 +0530
> >@@ -2022,3 +2022,54 @@
> >     bx              lr
> > endfunc
> >
> >+function x265_nquant_neon
> >+    push            {r4-r7}
> may reduce lots of registers after improve algorithm
>
> >+    vdup.s32        q8, r3                  // qbits
> >+    vneg.s32        q8, q8                 // -qbits
> vneg has a throughput of 1 and a latency of 4, so negating R3 before the vdup may save some cycles
> ------ R3 is free now
>
>
> >+    ldr             r4, [sp, #4* 4]         // add
> >+    vdup.s32        q9, r4
> >+    ldr             r5, [sp, #4* 4 + 4]     // numcoeff
> R4 is temporary only, move this instruction may reduce one register
>
> >+    mov             r4, #1
> >+    vdup.s32        q10, r4
> may remove, see below algorithm modify
>
> >+
> >+    lsr             r5, r5 ,#2
> >+    eor             r6, r6
> >+
> >+.loop_nquant:
> >+
> >+    vld1.s16        d0, [r0]!
> >+    vmovl.s16       q1, d0                  // coef[blockpos]
> >+
> >+    vclt.s32        q4, q1, #0
> >+    vorr.s32        q4, q4, q10             // q4= sign
> save sign, we may remove OR operator in here, see below
>
> >+
> >+
> >+    vabs.s32        q1, q1                  // q1=level=abs(coef[blockpos])
> >+    vld1.s32        {q0}, [r1]!             // quantCoeff[blockpos]
> >+    vmul.s32        q0, q0, q1              // q0=tmplevel = abs(level) * quantCoeff[blockpos];
> >+
> >+    vadd.s32        q1, q0, q9              // q1= tmplevel+add
> >+    vshl.s32        q1, q1, q8              // q1= level =(tmplevel+add) >> qbits
> >+
> >+    // numsig
> >+    vclz.s32        q2, q1
> >+    vshr.u32        q2, #5
> >+    vadd.u32        d4, d5
> >+    vpadd.u32       d4, d4
> >+    vmov.32         r12, d4[0]
> >+    mov             r7, #4
> >+    sub             r7, r7, r12
> >+    add             r6, r7
> Why calculate the exact numsig in every iteration? We can do it once after the loop.
>
> >+
> >+    vmul.s32        q2, q1, q4
> This only needs to restore the sign bits, so we can use Q2 = (Q1 ^ Q4) - Q4 instead
>
> >+    vqmovn.s32      d0, q2
> >+    vabs.s16        d1, d0
> >+    vst1.s16        d1, [r2]!
> >+
> >+    subs            r5, #1
> >+    bne             .loop_nquant
> >+
> >+    mov             r0, r6
> >+    pop             {r4-r7}
> >+    bx              lr
> >+endfunc
> >diff -r e21b86fb2456 -r 750a20cdf7dc source/common/arm/pixel-util.h
> >--- a/source/common/arm/pixel-util.h	Wed Apr 20 18:44:13 2016 +0530
> >+++ b/source/common/arm/pixel-util.h	Fri Apr 22 16:14:05 2016 +0530
> >@@ -80,4 +80,5 @@
> > int x265_pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
> >
> > uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
> >+uint32_t x265_nquant_neon(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
> > #endif // ifndef X265_PIXEL_UTIL_ARM_H
> >_______________________________________________
> >x265-devel mailing list
> >x265-devel at videolan.org
> >https://mailman.videolan.org/listinfo/x265-devel
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20160425/7df61ebb/attachment.html>