<div dir="ltr">Hey min, I hope I have understood and responded to what you intended. <br><br># HG changeset patch<br># User Ramya Sriraman<<a href="mailto:ramya@multicorewareinc.com">ramya@multicorewareinc.com</a>><br># Date 1461321845 -19800<br>#      Fri Apr 22 16:14:05 2016 +0530<br># Node ID cb78ce0ce13f564e0766887b9c1e575c416acc7b<br># Parent  e21b86fb24567ad92be78eaadd4278fd34a7d161<br>arm: Implement nquant<br><br>diff -r e21b86fb2456 -r cb78ce0ce13f source/common/arm/asm-primitives.cpp<br>--- a/source/common/arm/asm-primitives.cpp    Wed Apr 20 18:44:13 2016 +0530<br>+++ b/source/common/arm/asm-primitives.cpp    Fri Apr 22 16:14:05 2016 +0530<br>@@ -822,6 +822,7 @@<br> <br>         // quant<br>         p.quant = PFX(quant_neon);<br>+        p.nquant = PFX(nquant_neon);<br>     }<br>     if (cpuMask & X265_CPU_ARMV6)<br>     {<br>diff -r e21b86fb2456 -r cb78ce0ce13f source/common/arm/pixel-util.S<br>--- a/source/common/arm/pixel-util.S    Wed Apr 20 18:44:13 2016 +0530<br>+++ b/source/common/arm/pixel-util.S    Fri Apr 22 16:14:05 2016 +0530<br>@@ -2022,3 +2022,53 @@<br>     bx              lr<br> endfunc<br> <br>+function x265_nquant_neon<br>+    push            {r4-r5}<br>+    rsb             r3, r3 , #0<br>+    vdup.s32        q8, r3                  // qbits<br>+    ldr             r3, [sp, #2* 4]         // add<br>+    vdup.s32        q9, r3<br>+    ldr             r3, [sp, #2* 4 + 4]     // numcoeff<br>+    mov             r4, #1<br>+    vdup.s32        q10, r4<br>+<br>+    lsr             r3, r3 ,#2<br>+    eor             r4, r4<br>+    eor             r5, r5<br>+<br>+.loop_nquant:<br>+<br>+    vld1.s16        d0, [r0]!<br>+    vmovl.s16       q1, d0                  // coef[blockpos]<br>+<br>+    vclt.s32        q4, q1, #0<br>+<br>+    vabs.s32        q1, q1                  // q1=level=abs(coef[blockpos])<br>+    vld1.s32        {q0}, [r1]!             
// quantCoeff[blockpos]<br>+    vmul.s32        q0, q0, q1              // q0=tmplevel = abs(level) * quantCoeff[blockpos];<br>+<br>+    vadd.s32        q1, q0, q9              // q1= tmplevel+add<br>+    vshl.s32        q1, q1, q8              // q1= level =(tmplevel+add) >> qbits<br>+<br>+    // numsig<br>+    vclz.s32        q2, q1<br>+    vshr.u32        q2, #5<br>+    vadd.u32        d4, d5<br>+    vpadd.u32       d4, d4<br>+    vmov.32         r12, d4[0]<br>+    add             r4, r12<br>+    add             r5, #4<br>+<br>+    veor.s32        q2, q1, q4<br>+    vsub.s32        q2, q2, q4<br>+    vqmovn.s32      d0, q2<br>+    vabs.s16        d1, d0<br>+    vst1.s16        d1, [r2]!<br>+<br>+    subs            r3, #1<br>+    bne             .loop_nquant<br>+<br>+    sub             r0, r5, r4<br>+    pop             {r4-r5}<br>+    bx              lr<br>+endfunc<br>diff -r e21b86fb2456 -r cb78ce0ce13f source/common/arm/pixel-util.h<br>--- a/source/common/arm/pixel-util.h    Wed Apr 20 18:44:13 2016 +0530<br>+++ b/source/common/arm/pixel-util.h    Fri Apr 22 16:14:05 2016 +0530<br>@@ -80,4 +80,5 @@<br> int x265_pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);<br> <br> uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);<br>+uint32_t x265_nquant_neon(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);<br> #endif // ifndef X265_PIXEL_UTIL_ARM_H<br></div><div class="gmail_extra"><br clear="all"><div><div class="gmail_signature"><div dir="ltr"><div><div dir="ltr"><div><div><span style="color:rgb(56,118,29)"><br></span></div><div><span style="color:rgb(56,118,29)">Thank you<br></span></div><span style="color:rgb(56,118,29)">Regards<br></span></div><span style="color:rgb(56,118,29)">Ramya</span><br></div></div></div></div></div>
<br><div class="gmail_quote">On Fri, Apr 22, 2016 at 9:38 PM, chen <span dir="ltr"><<a href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><pre><span class=""><br>At 2016-04-22 18:51:43,<a href="mailto:ramya@multicorewareinc.com" target="_blank">ramya@multicorewareinc.com</a> wrote:
># HG changeset patch
># User Ramya Sriraman<<a href="mailto:ramya@multicorewareinc.com" target="_blank">ramya@multicorewareinc.com</a>>
># Date 1461321845 -19800
>#      Fri Apr 22 16:14:05 2016 +0530
># Node ID 750a20cdf7dcb381f9008f51473fbd74d45b7f5e
># Parent  e21b86fb24567ad92be78eaadd4278fd34a7d161
>arm: Implement nquant
>
>diff -r e21b86fb2456 -r 750a20cdf7dc source/common/arm/asm-primitives.cpp
>--- a/source/common/arm/asm-primitives.cpp Wed Apr 20 18:44:13 2016 +0530
>+++ b/source/common/arm/asm-primitives.cpp Fri Apr 22 16:14:05 2016 +0530
>@@ -822,6 +822,7 @@

>         // quant
>         p.quant = PFX(quant_neon);
>+        p.nquant = PFX(nquant_neon);
>     }
>     if (cpuMask & X265_CPU_ARMV6)
>     {
>diff -r e21b86fb2456 -r 750a20cdf7dc source/common/arm/pixel-util.S
>--- a/source/common/arm/pixel-util.S       Wed Apr 20 18:44:13 2016 +0530
>+++ b/source/common/arm/</span>pixel-util.S  Fri Apr 22 16:14:05 2016 +0530
>@@ -2022,3 +2022,54 @@
>     bx              lr
> endfunc

>+function x265_nquant_neon
>+    push            {r4-r7}
The number of registers used may be reduced considerably after improving the algorithm.

>+    vdup.s32        q8, r3                  // qbits
>+    vneg.s32        q8, q8                 // -qbits
vneg has throughput 1 and latency 4, so reordering (negating R3 before the vdup) may save some cycles
------ R3 is free now</pre><pre><span class="">
>+    ldr             r4, [sp, #4* 4]         // add
>+    vdup.s32        q9, r4
>+    ldr             r5, [sp,</span> #4* 4 + 4]     // numcoeff
R4 is only a temporary here; moving this instruction may save one register.

>+    mov             r4, #1
>+    vdup.s32        q10, r4
This may be removed; see the algorithm modification below.

>+
>+    lsr             r5, r5 ,#2
>+    eor             r6, r6
>+
>+.loop_nquant:
>+
>+    vld1.s16        d0, [r0]!
>+    vmovl.s16       q1, d0                  // coef[blockpos]
>+
>+    vclt.s32        q4, q1, #0
>+    vorr.s32        q4, q4, q10             // q4= sign
This saves the sign; we may remove the OR operation here, see below.

>+
>+
>+    vabs.s32        q1, q1                  // q1=level=abs(coef[blockpos])
>+    vld1.s32        {q0}, [r1]!             // quantCoeff[blockpos]
>+    vmul.s32        q0, q0, q1              // q0=tmplevel = abs(level) * quantCoeff[blockpos];
>+
>+    vadd.s32        q1, q0, q9              // q1= tmplevel+add
>+    vshl.s32        q1, q1, q8              // q1= level =(tmplevel+add) >> qbits
>+
>+    // numsig
>+    vclz.s32        q2, q1
>+    vshr.u32        q2, #5
>+    vadd.u32        d4, d5
>+    vpadd.u32       d4, d4
>+    vmov.32         r12, d4[0]
>+    mov             r7, #4
>+    sub             r7, r7, r12
>+    add             r6, r7
Why calculate the exact numsig in every iteration? We can do it after the loop.

>+
>+    vmul.s32        q2, q1, q4
This only needs to restore the sign bits; we may use the algorithm Q2 = (Q1 ^ Q4) - Q4 instead.

>+    vqmovn.s32      d0, q2
>+    vabs.s16        d1, d0
>+    vst1.s16        d1, [r2]!
>+
>+    subs            r5, #1
>+    bne             .loop_nquant
>+
>+    mov             r0, r6
>+    pop             {r4-r7}
>+    bx              lr
>+endfunc
>diff -r e21b86fb2456 -r 750a20cdf7dc source/common/arm/pixel-util.h
>--- a/source/common/arm/pixel-util.h       Wed Apr 20 18:44:13 2016 +0530
>+++ b/source/common/arm/pixel-util.h       Fri Apr 22 16:14:05 2016 +0530
>@@ -80,4 +80,5 @@
> int x265_pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);

> uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
>+uint32_t x265_nquant_neon(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
> #endif // ifndef X265_PIXEL_UTIL_ARM_H
>_______________________________________________
>x265-devel mailing list
><a href="mailto:x265-devel@videolan.org" target="_blank">x265-devel@videolan.org</a>
><a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a>
</pre></div><br>_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" rel="noreferrer" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
<br></blockquote></div><br></div>