<div dir="ltr">Hey min, I hope I have understood and responded to what you intended. <br><br># HG changeset patch<br># User Ramya Sriraman<<a href="mailto:ramya@multicorewareinc.com">ramya@multicorewareinc.com</a>><br># Date 1461321845 -19800<br>#      Fri Apr 22 16:14:05 2016 +0530<br># Node ID cb78ce0ce13f564e0766887b9c1e575c416acc7b<br># Parent  e21b86fb24567ad92be78eaadd4278fd34a7d161<br>arm: Implement nquant<br><br>diff -r e21b86fb2456 -r cb78ce0ce13f source/common/arm/asm-primitives.cpp<br>--- a/source/common/arm/asm-primitives.cpp    Wed Apr 20 18:44:13 2016 +0530<br>+++ b/source/common/arm/asm-primitives.cpp    Fri Apr 22 16:14:05 2016 +0530<br>@@ -822,6 +822,7 @@<br> <br>         // quant<br>         p.quant = PFX(quant_neon);<br>+        p.nquant = PFX(nquant_neon);<br>     }<br>     if (cpuMask & X265_CPU_ARMV6)<br>     {<br>diff -r e21b86fb2456 -r cb78ce0ce13f source/common/arm/pixel-util.S<br>--- a/source/common/arm/pixel-util.S    Wed Apr 20 18:44:13 2016 +0530<br>+++ b/source/common/arm/pixel-util.S    Fri Apr 22 16:14:05 2016 +0530<br>@@ -2022,3 +2022,53 @@<br>     bx              lr<br> endfunc<br> <br>+function x265_nquant_neon<br>+    push            {r4-r5}<br>+    rsb             r3, r3 , #0<br>+    vdup.s32        q8, r3                  // qbits<br>+    ldr             r3, [sp, #2* 4]         // add<br>+    vdup.s32        q9, r3<br>+    ldr             r3, [sp, #2* 4 + 4]     // numcoeff<br>+    mov             r4, #1<br>+    vdup.s32        q10, r4<br>+<br>+    lsr             r3, r3 ,#2<br>+    eor             r4, r4<br>+    eor             r5, r5<br>+<br>+.loop_nquant:<br>+<br>+    vld1.s16        d0, [r0]!<br>+    vmovl.s16       q1, d0                  // coef[blockpos]<br>+<br>+    vclt.s32        q4, q1, #0<br>+<br>+    vabs.s32        q1, q1                  // q1=level=abs(coef[blockpos])<br>+    vld1.s32        {q0}, [r1]!             
// quantCoeff[blockpos]<br>+    vmul.s32        q0, q0, q1              // q0=tmplevel = abs(level) * quantCoeff[blockpos];<br>+<br>+    vadd.s32        q1, q0, q9              // q1= tmplevel+add<br>+    vshl.s32        q1, q1, q8              // q1= level =(tmplevel+add) >> qbits<br>+<br>+    // numsig<br>+    vclz.s32        q2, q1<br>+    vshr.u32        q2, #5<br>+    vadd.u32        d4, d5<br>+    vpadd.u32       d4, d4<br>+    vmov.32         r12, d4[0]<br>+    add             r4, r12<br>+    add             r5, #4<br>+<br>+    veor.s32        q2, q1, q4<br>+    vsub.s32        q2, q2, q4<br>+    vqmovn.s32      d0, q2<br>+    vabs.s16        d1, d0<br>+    vst1.s16        d1, [r2]!<br>+<br>+    subs            r3, #1<br>+    bne             .loop_nquant<br>+<br>+    sub             r0, r5, r4<br>+    pop             {r4-r5}<br>+    bx              lr<br>+endfunc<br>diff -r e21b86fb2456 -r cb78ce0ce13f source/common/arm/pixel-util.h<br>--- a/source/common/arm/pixel-util.h    Wed Apr 20 18:44:13 2016 +0530<br>+++ b/source/common/arm/pixel-util.h    Fri Apr 22 16:14:05 2016 +0530<br>@@ -80,4 +80,5 @@<br> int x265_pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);<br> <br> uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);<br>+uint32_t x265_nquant_neon(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);<br> #endif // ifndef X265_PIXEL_UTIL_ARM_H<br></div><div class="gmail_extra"><br clear="all"><div><div class="gmail_signature"><div dir="ltr"><div><div dir="ltr"><div><div><span style="color:rgb(56,118,29)"><br></span></div><div><span style="color:rgb(56,118,29)">Thank you<br></span></div><span style="color:rgb(56,118,29)">Regards<br></span></div><span style="color:rgb(56,118,29)">Ramya</span><br></div></div></div></div></div>


<br><div class="gmail_quote">On Fri, Apr 22, 2016 at 9:38 PM, chen <span dir="ltr"><<a href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><pre><span class=""><br>At 2016-04-22 18:51:43,<a href="mailto:ramya@multicorewareinc.com" target="_blank">ramya@multicorewareinc.com</a> wrote:


># HG changeset patch


># User Ramya Sriraman<<a href="mailto:ramya@multicorewareinc.com" target="_blank">ramya@multicorewareinc.com</a>>


># Date 1461321845 -19800


>#      Fri Apr 22 16:14:05 2016 +0530


># Node ID 750a20cdf7dcb381f9008f51473fbd74d45b7f5e


># Parent  e21b86fb24567ad92be78eaadd4278fd34a7d161


>arm: Implement nquant


>


>diff -r e21b86fb2456 -r 750a20cdf7dc source/common/arm/asm-primitives.cpp


>--- a/source/common/arm/asm-primitives.cpp Wed Apr 20 18:44:13 2016 +0530


>+++ b/source/common/arm/asm-primitives.cpp Fri Apr 22 16:14:05 2016 +0530


>@@ -822,6 +822,7 @@


> 


>         // quant


>         p.quant = PFX(quant_neon);


>+        p.nquant = PFX(nquant_neon);


>     }


>     if (cpuMask & X265_CPU_ARMV6)


>     {


>diff -r e21b86fb2456 -r 750a20cdf7dc source/common/arm/pixel-util.S


>--- a/source/common/arm/pixel-util.S       Wed Apr 20 18:44:13 2016 +0530


>+++ b/source/common/arm/</span>pixel-util.S  Fri Apr 22 16:14:05 2016 +0530


>@@ -2022,3 +2022,54 @@


>     bx              lr


> endfunc


> 


>+function x265_nquant_neon


>+    push            {r4-r7}


The number of registers used can be reduced considerably after improving the algorithm.


>+    vdup.s32        q8, r3                  // qbits


>+    vneg.s32        q8, q8                 // -qbits


vneg has throughput 1 and latency 4, so reordering (negating R3 before the vdup instead) may save some cycles.


------ R3 is free now</pre><pre><span class="">


>+    ldr             r4, [sp, #4* 4]         // add


>+    vdup.s32        q9, r4


>+    ldr             r5, [sp,</span> #4* 4 + 4]     // numcoeff


R4 is only a temporary here; moving this instruction may save one register.


>+    mov             r4, #1


>+    vdup.s32        q10, r4


This may be removed; see the algorithm modification below.


>+


>+    lsr             r5, r5 ,#2


>+    eor             r6, r6


>+


>+.loop_nquant:


>+


>+    vld1.s16        d0, [r0]!


>+    vmovl.s16       q1, d0                  // coef[blockpos]


>+


>+    vclt.s32        q4, q1, #0


>+    vorr.s32        q4, q4, q10             // q4= sign


This saves the sign; the OR operation can be removed here, see below.


>+


>+


>+    vabs.s32        q1, q1                  // q1=level=abs(coef[blockpos])


>+    vld1.s32        {q0}, [r1]!             // quantCoeff[blockpos]


>+    vmul.s32        q0, q0, q1              // q0=tmplevel = abs(level) * quantCoeff[blockpos];


>+


>+    vadd.s32        q1, q0, q9              // q1= tmplevel+add


>+    vshl.s32        q1, q1, q8              // q1= level =(tmplevel+add) >> qbits


>+


>+    // numsig


>+    vclz.s32        q2, q1


>+    vshr.u32        q2, #5


>+    vadd.u32        d4, d5


>+    vpadd.u32       d4, d4


>+    vmov.32         r12, d4[0]


>+    mov             r7, #4


>+    sub             r7, r7, r12


>+    add             r6, r7


Why calculate the exact numsig in every iteration? We can do it once after the loop.


>+


>+    vmul.s32        q2, q1, q4


This only needs to restore the sign bits; we can use the algorithm Q2 = (Q1 ^ Q4) - Q4 instead.


>+    vqmovn.s32      d0, q2


>+    vabs.s16        d1, d0


>+    vst1.s16        d1, [r2]!


>+


>+    subs            r5, #1


>+    bne             .loop_nquant


>+


>+    mov             r0, r6


>+    pop             {r4-r7}


>+    bx              lr


>+endfunc


>diff -r e21b86fb2456 -r 750a20cdf7dc source/common/arm/pixel-util.h


>--- a/source/common/arm/pixel-util.h       Wed Apr 20 18:44:13 2016 +0530


>+++ b/source/common/arm/pixel-util.h       Fri Apr 22 16:14:05 2016 +0530


>@@ -80,4 +80,5 @@


> int x265_pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);


> 


> uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);


>+uint32_t x265_nquant_neon(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);


> #endif // ifndef X265_PIXEL_UTIL_ARM_H


>_______________________________________________


>x265-devel mailing list


><a href="mailto:x265-devel@videolan.org" target="_blank">x265-devel@videolan.org</a>


><a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a>


</pre></div><br>_______________________________________________<br>


x265-devel mailing list<br>


<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>


<a href="https://mailman.videolan.org/listinfo/x265-devel" rel="noreferrer" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>


<br></blockquote></div><br></div>