[x265] [PATCH] arm: Implement nquant

Fri Apr 22 18:08:21 CEST 2016

At 2016-04-22 18:51:43,ramya at multicorewareinc.com wrote:
># HG changeset patch
># User Ramya Sriraman<ramya at multicorewareinc.com>
># Date 1461321845 -19800
>#      Fri Apr 22 16:14:05 2016 +0530
># Node ID 750a20cdf7dcb381f9008f51473fbd74d45b7f5e
># Parent  e21b86fb24567ad92be78eaadd4278fd34a7d161
>arm: Implement nquant
>
>diff -r e21b86fb2456 -r 750a20cdf7dc source/common/arm/asm-primitives.cpp
>--- a/source/common/arm/asm-primitives.cpp	Wed Apr 20 18:44:13 2016 +0530
>+++ b/source/common/arm/asm-primitives.cpp	Fri Apr 22 16:14:05 2016 +0530
>@@ -822,6 +822,7 @@
> 
>         // quant
>         p.quant = PFX(quant_neon);
>+        p.nquant = PFX(nquant_neon);
>     }
>     if (cpuMask & X265_CPU_ARMV6)
>     {
>diff -r e21b86fb2456 -r 750a20cdf7dc source/common/arm/pixel-util.S
>--- a/source/common/arm/pixel-util.S	Wed Apr 20 18:44:13 2016 +0530
>+++ b/source/common/arm/pixel-util.S	Fri Apr 22 16:14:05 2016 +0530
>@@ -2022,3 +2022,54 @@
>     bx              lr
> endfunc
> 
>+function x265_nquant_neon
>+    push            {r4-r7}
The number of registers used (and saved here) can be reduced considerably once the algorithm is improved; see the comments below.

>+    vdup.s32        q8, r3                  // qbits
>+    vneg.s32        q8, q8                 // -qbits
vneg has a throughput of 1 and a latency of 4, so reordering (negating the value in r3 before the vdup) may save some cycles.
------ r3 is free from this point on

>+    ldr             r4, [sp, #4* 4]         // add
>+    vdup.s32        q9, r4
>+    ldr             r5, [sp, #4* 4 + 4]     // numcoeff
r4 is only a temporary here; moving this instruction may save one register.

>+    mov             r4, #1
>+    vdup.s32        q10, r4
These two instructions can be removed; see the algorithm modification suggested below.

>+
>+    lsr             r5, r5 ,#2
>+    eor             r6, r6
>+
>+.loop_nquant:
>+
>+    vld1.s16        d0, [r0]!
>+    vmovl.s16       q1, d0                  // coef[blockpos]
>+
>+    vclt.s32        q4, q1, #0
>+    vorr.s32        q4, q4, q10             // q4= sign
This saves the sign; the OR operation here can be removed, see below.

>+
>+
>+    vabs.s32        q1, q1                  // q1=level=abs(coef[blockpos])
>+    vld1.s32        {q0}, [r1]!             // quantCoeff[blockpos]
>+    vmul.s32        q0, q0, q1              // q0=tmplevel = abs(level) * quantCoeff[blockpos];
>+
>+    vadd.s32        q1, q0, q9              // q1= tmplevel+add
>+    vshl.s32        q1, q1, q8              // q1= level =(tmplevel+add) >> qbits
>+
>+    // numsig
>+    vclz.s32        q2, q1
>+    vshr.u32        q2, #5
>+    vadd.u32        d4, d5
>+    vpadd.u32       d4, d4
>+    vmov.32         r12, d4[0]
>+    mov             r7, #4
>+    sub             r7, r7, r12
>+    add             r6, r7
Why calculate the exact numsig in every iteration? We could do it once after the loop.

>+
>+    vmul.s32        q2, q1, q4
This multiply is only there to restore the sign bits; we can use Q2 = (Q1 ^ Q4) - Q4 instead.

>+    vqmovn.s32      d0, q2
>+    vabs.s16        d1, d0
>+    vst1.s16        d1, [r2]!
>+
>+    subs            r5, #1
>+    bne             .loop_nquant
>+
>+    mov             r0, r6
>+    pop             {r4-r7}
>+    bx              lr
>+endfunc
>diff -r e21b86fb2456 -r 750a20cdf7dc source/common/arm/pixel-util.h
>--- a/source/common/arm/pixel-util.h	Wed Apr 20 18:44:13 2016 +0530
>+++ b/source/common/arm/pixel-util.h	Fri Apr 22 16:14:05 2016 +0530
>@@ -80,4 +80,5 @@
> int x265_pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
> 
> uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
>+uint32_t x265_nquant_neon(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
> #endif // ifndef X265_PIXEL_UTIL_ARM_H
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20160423/66a44af9/attachment.html>