<div dir="ltr">Thanks for the improvements min. Modified patch below.<br><br># HG changeset patch<br># User Ramya Sriraman<<a href="mailto:ramya@multicorewareinc.com">ramya@multicorewareinc.com</a>><br># Date 1461158053 -19800<br># Wed Apr 20 18:44:13 2016 +0530<br># Node ID e21b86fb24567ad92be78eaadd4278fd34a7d161<br># Parent 4f83d465d11b3baa46e6089f73b0929266d4b722<br>arm: Implement quant<br><br>diff -r 4f83d465d11b -r e21b86fb2456 source/common/arm/asm-primitives.cpp<br>--- a/source/common/arm/asm-primitives.cpp Wed Mar 30 17:29:13 2016 +0530<br>+++ b/source/common/arm/asm-primitives.cpp Wed Apr 20 18:44:13 2016 +0530<br>@@ -820,6 +820,8 @@<br> p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vsp = PFX(interp_4tap_vert_sp_24x32_neon);<br> p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vsp = PFX(interp_4tap_vert_sp_48x64_neon);<br> <br>+ // quant<br>+ p.quant = PFX(quant_neon);<br> }<br> if (cpuMask & X265_CPU_ARMV6)<br> {<br>diff -r 4f83d465d11b -r e21b86fb2456 source/common/arm/pixel-util.S<br>--- a/source/common/arm/pixel-util.S Wed Mar 30 17:29:13 2016 +0530<br>+++ b/source/common/arm/pixel-util.S Wed Apr 20 18:44:13 2016 +0530<br>@@ -1962,3 +1962,63 @@<br> bx lr<br> endfunc<br> <br>+function x265_quant_neon<br>+ push {r4-r8}<br>+ ldr r4, [sp, #4* 5] //qbits<br>+ vdup.s32 q8, r4<br>+ ldr r5, [sp, #4* 5 + 4] // add<br>+ vdup.s32 q9, r5<br>+ ldr r6, [sp, #4* 5 + 8] // numcoeff<br>+ mov r5, #8<br>+ sub r5, r4, r5<br>+ vdup.s32 q10, r5<br>+ vneg.s32 q10, q10 // -(qbits- 8)<br>+ mov r5, #1<br>+ vdup.u32 q11, r5<br>+<br>+ lsr r6, r6 ,#2<br>+ eor r7, r7<br>+<br>+.loop_quant:<br>+<br>+ vld1.s16 d0, [r0]!<br>+ vmovl.s16 q1, d0 // coef[blockpos]<br>+<br>+ vclt.s32 q4, q1, #0<br>+ vorr.s32 q4, q4, q11 // q4= sign<br>+<br>+ vabs.s32 q1, q1 // q1=level=abs(coef[blockpos])<br>+ vld1.s32 {q0}, [r1]! 
// quantCoeff[blockpos]<br>+ vmul.i32 q0, q0, q1 // q0=tmplevel = abs(level) * quantCoeff[blockpos];<br>+<br>+ vadd.s32 q1, q0, q9 // q1= tmplevel+add<br>+ vneg.s32 q12, q8<br>+ vshl.s32 q1, q1, q12 // q1= level =(tmplevel+add) >> qbits<br>+<br>+ vshl.s32 q3, q1, q8 // q3 = level << qBits<br>+ vsub.s32 q13, q0, q3 // q8= tmplevel - (level << qBits)<br>+ vshl.s32 q13, q13, q10 // q3= ((tmplevel - (level << qBits)) >> qBits8)<br>+ vst1.s32 {q13}, [r2]! // store deltaU<br>+<br>+ // numsig<br>+ vclz.s32 q2, q1<br>+ vshr.u32 q2, #5<br>+ vadd.u32 d4, d5<br>+ vpadd.u32 d4, d4<br>+ vmov.32 r12, d4[0]<br>+ mov r8, #4<br>+ sub r8, r8, r12<br>+ add r7, r8<br>+<br>+ vmul.s32 q2, q1, q4<br>+ vqmovn.s32 d0, q2<br>+ vst1.s16 d0, [r3]!<br>+<br>+ subs r6, #1<br>+ bne .loop_quant<br>+<br>+ mov r0, r7<br>+ pop {r4-r8}<br>+ bx lr<br>+endfunc<br>+<br>diff -r 4f83d465d11b -r e21b86fb2456 source/common/arm/pixel-util.h<br>--- a/source/common/arm/pixel-util.h Wed Mar 30 17:29:13 2016 +0530<br>+++ b/source/common/arm/pixel-util.h Wed Apr 20 18:44:13 2016 +0530<br>@@ -78,4 +78,6 @@<br> int x265_pixel_sa8d_32x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);<br> int x265_pixel_sa8d_32x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);<br> int x265_pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);<br>+<br>+uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);<br> #endif // ifndef X265_PIXEL_UTIL_ARM_H<br><br></div><div class="gmail_extra"><br clear="all"><div><div class="gmail_signature"><div dir="ltr"><div><div dir="ltr"><div><div><span style="color:rgb(56,118,29)"><br></span></div><div><span style="color:rgb(56,118,29)">Thank you<br></span></div><span style="color:rgb(56,118,29)">Regards<br></span></div><span style="color:rgb(56,118,29)">Ramya</span><br></div></div></div></div></div>
<br><div class="gmail_quote">On Wed, Apr 20, 2016 at 10:05 PM, chen <span dir="ltr"><<a href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><pre><div><div class="h5"><br>At 2016-04-20 21:24:03,<a href="mailto:ramya@multicorewareinc.com" target="_blank">ramya@multicorewareinc.com</a> wrote:
># HG changeset patch
># User Ramya Sriraman<<a href="mailto:ramya@multicorewareinc.com" target="_blank">ramya@multicorewareinc.com</a>>
># Date 1461158053 -19800
># Wed Apr 20 18:44:13 2016 +0530
># Node ID 72ae446412d6e25aa3d2aa8ecb657f9815fdb635
># Parent 4f83d465d11b3baa46e6089f73b0929266d4b722
>arm: Implement quant
>
>diff -r 4f83d465d11b -r 72ae446412d6 source/common/arm/asm-primitives.cpp
>--- a/source/common/arm/asm-primitives.cpp Wed Mar 30 17:29:13 2016 +0530
>+++ b/source/common/arm/asm-primitives.cpp Wed Apr 20 18:44:13 2016 +0530
>@@ -820,6 +820,8 @@
> p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vsp = PFX(interp_4tap_vert_sp_24x32_neon);
> p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vsp = PFX(interp_4tap_vert_sp_48x64_neon);
>
>+ // quant
>+ p.quant = PFX(quant_neon);
> }
> if (cpuMask & X265_CPU_ARMV6)
> {
>diff -r 4f83d465d11b -r 72ae446412d6 source/common/arm/pixel-util.S
>--- a/source/common/arm/pixel-util.S Wed Mar 30 17:29:13 2016 +0530
>+++ b/source/common/arm/pixel-util.S Wed Apr 20 18:44:13 2016 +0530
>@@ -1962,3 +1962,63 @@
> bx lr
> endfunc
>
>+function x265_quant_neon
>+ push {r4-r9}
>+ ldr r4, [sp, #4* 6] //qbits
>+ ldr r5, [sp, #4* 6 + 4] // add
>+ ldr r6, [sp, #4* 6 + 8] // numcoeff
>+ mov r7, #8
>+ sub r7, r7 , r4 //-(qbits- 8)
>+
>+ lsr r6, r6 ,#2
>+ mov r8, #0
>+
>+.loop_quant:
>+
>+ vld1.s16 d0, [r0]!
>+ vmovl.s16 q1, d0 // coef[blockpos]
>+
>+ vclt.s32 q4, q1, #0
>+ mov r9, #1
>+ vdup.s32 q2, r9
>+ vorr.s32 q4, q4, q2 // q4= sign
>+
>+ vabs.s32 q1, q1 // q1=level=abs(coef[blockpos])
>+ vld1.s32 {q0}, [r1]! // quantCoeff[blockpos]
>+ vmul.i32 q0, q0, q1 // q0=tmplevel = abs(level) * </div></div>quantCoeff[blockpos];
>+
>+ vdup.s32 q2, r5
r5=add; it is constant in the loop, and NEON has 16 Q registers, so there is no need to reload it every iteration
>+ vadd.s32 q1, q0, q2 // q1= tmplevel+add
>+ vdup.s32 q2, r4
>+ vneg.s32 q2, q2
>+ vshl.s32 q1, q1, q2 // q1= level =(tmplevel+add) >> qbits
how about vqshrun?
>+
>+ vdup.s32 q2, r4
>+ vshl.s32 q3, q1, q2 // q3 = level << qBits
>+ vsub.s32 q8, q0, q3 // q8= tmplevel - (level << qBits)
>+ vdup.s32 q2, r7
>+ vshl.s32 q8, q8, q2 // q8= ((tmplevel - (level << qBits)) >> qBits8)
>+ vst1.s32 {q8}, [r2]! // store deltaU
>+
>+ // numsig
>+ vclz.s32 q2, q1
>+ vshr.u32 q2, #5
>+ vadd.u32 d4, d5
>+ vpadd.u32 d4, d4
>+ vmov.32 r12, d4[0]
>+ mov r9, #4
>+ sub r9, r9, r12
>+ add r8, r9
>+
>+ vmul.s32 q2, q1, q4
>+ vqmovn.s32 d0, q2
>+ vst1.s16 d0, [r3]!
>+
>+ subs r6, #1
>+ bne .loop_quant
>+
>+ mov r0, r8
>+ pop {r4-r9}
>+ bx lr
>+endfunc
</pre></div><br>_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" rel="noreferrer" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
<br></blockquote></div><br></div>