[x265] [PATCH] arm: Implement quant

Ramya Sriraman ramya at multicorewareinc.com
Mon Apr 25 14:03:59 CEST 2016


Please ignore the previous patch; the modified version is below.

# HG changeset patch
# User Ramya Sriraman<ramya at multicorewareinc.com>
# Date 1461158053 -19800
#      Wed Apr 20 18:44:13 2016 +0530
# Node ID c26f9a4dc9173b0cbfb609a984c57607d129f011
# Parent  4f83d465d11b3baa46e6089f73b0929266d4b722
arm: Implement quant

diff -r 4f83d465d11b -r c26f9a4dc917 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp    Wed Mar 30 17:29:13 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp    Wed Apr 20 18:44:13 2016 +0530
@@ -820,6 +820,8 @@
         p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vsp =
PFX(interp_4tap_vert_sp_24x32_neon);
         p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vsp =
PFX(interp_4tap_vert_sp_48x64_neon);

+        // quant
+        p.quant = PFX(quant_neon);
     }
     if (cpuMask & X265_CPU_ARMV6)
     {
diff -r 4f83d465d11b -r c26f9a4dc917 source/common/arm/pixel-util.S
--- a/source/common/arm/pixel-util.S    Wed Mar 30 17:29:13 2016 +0530
+++ b/source/common/arm/pixel-util.S    Wed Apr 20 18:44:13 2016 +0530
@@ -1962,3 +1962,72 @@
     bx              lr
 endfunc

+// uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU,
+//                          int16_t* qCoef, int qBits, int add, int numCoeff)
+//
+// Processes four coefficients per iteration (numCoeff >> 2 iterations):
+//   tmplevel   = abs(coef[i]) * quantCoeff[i]
+//   level      = (tmplevel + add) >> qBits
+//   deltaU[i]  = (tmplevel - (level << qBits)) >> (qBits - 8)
+//   qCoef[i]   = saturate_s16(coef[i] < 0 ? -level : level)
+// Returns the number of lanes whose level is non-zero.
+//
+// In:  r0 = coef, r1 = quantCoeff, r2 = deltaU, r3 = qCoef,
+//      [sp] = qBits, [sp + 4] = add, [sp + 8] = numCoeff (at entry)
+// Only q0-q3 and q8-q13 are used; the AAPCS callee-saved d8-d15 (q4-q7) are left untouched.
+// The loop tests its counter after the body, so numCoeff is assumed to be a non-zero multiple of 4.
+function x265_quant_neon
+    push            {r4-r6}
+    ldr             r4, [sp, #3* 4]         // qBits (first stack argument, above the 3 saved registers)
+    vdup.s32        q8, r4                  // q8  = qBits
+    vneg.s32        q12, q8                 // q12 = -qBits (vshl by a negative count shifts right)
+    rsb             r12, r4, #8
+    vdup.s32        q10, r12                // q10 = 8 - qBits = -(qBits - 8)
+    ldr             r4, [sp, #3* 4 + 4]     // add
+    vdup.s32        q9, r4                  // q9  = add
+    ldr             r4, [sp, #3* 4 + 8]     // numCoeff
+
+    lsr             r4, r4, #2              // r4 = iteration count (4 coefficients each)
+    mov             r5, #0                  // r5 = number of zero levels seen
+    mov             r6, #0                  // r6 = number of coefficients processed
+
+.loop_quant:
+    vld1.s16        d0, [r0]!
+    vmovl.s16       q1, d0                  // q1 = coef[blockpos], widened to 32 bits
+
+    vclt.s32        q11, q1, #0             // q11 = sign mask (all ones where coef < 0)
+
+    vabs.s32        q1, q1                  // q1 = abs(coef[blockpos])
+    vld1.s32        {q0}, [r1]!             // q0 = quantCoeff[blockpos]
+    vmul.i32        q0, q0, q1              // q0 = tmplevel = abs(coef) * quantCoeff
+
+    vadd.s32        q1, q0, q9              // q1 = tmplevel + add
+    vshl.s32        q1, q1, q12             // q1 = level = (tmplevel + add) >> qBits
+
+    vshl.s32        q3, q1, q8              // q3 = level << qBits
+    vsub.s32        q13, q0, q3             // q13 = tmplevel - (level << qBits)
+    vshl.s32        q13, q13, q10           // q13 = q13 >> (qBits - 8)
+    vst1.s32        {q13}, [r2]!            // store deltaU
+
+    // numSig: clz(level) == 32 only for level == 0, so (clz >> 5) is 1 per zero lane
+    vclz.s32        q2, q1
+    vshr.u32        q2, #5
+    vadd.u32        d4, d5
+    vpadd.u32       d4, d4                  // d4[0] = zero lanes in this group of four
+    vmov.32         r12, d4[0]
+    add             r5, r12
+    add             r6, #4
+
+    veor.s32        q2, q1, q11             // conditional negate: (level ^ mask) - mask
+    vsub.s32        q2, q2, q11
+    vqmovn.s32      d0, q2                  // saturating narrow to int16
+    vst1.s16        d0, [r3]!               // store qCoef
+
+    subs            r4, #1
+    bne             .loop_quant
+
+    sub             r0, r6, r5              // numSig = processed - zero levels
+    pop             {r4-r6}
+    bx              lr
+endfunc
+
diff -r 4f83d465d11b -r c26f9a4dc917 source/common/arm/pixel-util.h
--- a/source/common/arm/pixel-util.h    Wed Mar 30 17:29:13 2016 +0530
+++ b/source/common/arm/pixel-util.h    Wed Apr 20 18:44:13 2016 +0530
@@ -78,4 +78,6 @@
 int x265_pixel_sa8d_32x32_neon(const pixel* pix1, intptr_t i_pix1, const
pixel* pix2, intptr_t i_pix2);
 int x265_pixel_sa8d_32x64_neon(const pixel* pix1, intptr_t i_pix1, const
pixel* pix2, intptr_t i_pix2);
 int x265_pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const
pixel* pix2, intptr_t i_pix2);
+
+uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff,
int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
 #endif // ifndef X265_PIXEL_UTIL_ARM_H



Thank you
Regards
Ramya

On Fri, Apr 22, 2016 at 1:55 PM, Ramya Sriraman <ramya at multicorewareinc.com>
wrote:

> Thanks for the improvements min. Modified patch below.
>
> # HG changeset patch
> # User Ramya Sriraman<ramya at multicorewareinc.com>
> # Date 1461158053 -19800
> #      Wed Apr 20 18:44:13 2016 +0530
> # Node ID e21b86fb24567ad92be78eaadd4278fd34a7d161
> # Parent  4f83d465d11b3baa46e6089f73b0929266d4b722
> arm: Implement quant
>
> diff -r 4f83d465d11b -r e21b86fb2456 source/common/arm/asm-primitives.cpp
> --- a/source/common/arm/asm-primitives.cpp    Wed Mar 30 17:29:13 2016
> +0530
> +++ b/source/common/arm/asm-primitives.cpp    Wed Apr 20 18:44:13 2016
> +0530
> @@ -820,6 +820,8 @@
>          p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vsp =
> PFX(interp_4tap_vert_sp_24x32_neon);
>          p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vsp =
> PFX(interp_4tap_vert_sp_48x64_neon);
>
> +        // quant
> +        p.quant = PFX(quant_neon);
>      }
>      if (cpuMask & X265_CPU_ARMV6)
>      {
> diff -r 4f83d465d11b -r e21b86fb2456 source/common/arm/pixel-util.S
> --- a/source/common/arm/pixel-util.S    Wed Mar 30 17:29:13 2016 +0530
> +++ b/source/common/arm/pixel-util.S    Wed Apr 20 18:44:13 2016 +0530
> @@ -1962,3 +1962,63 @@
>      bx              lr
>  endfunc
>
> +function x265_quant_neon
> +    push            {r4-r8}                 // five core registers saved; stack arguments start at [sp, #4* 5]
> +    ldr             r4, [sp, #4* 5]         //qbits
> +    vdup.s32        q8, r4                  // q8 = qbits in every lane
> +    ldr             r5, [sp, #4* 5 + 4]     // add
> +    vdup.s32        q9, r5                  // q9 = add in every lane
> +    ldr             r6, [sp, #4* 5 + 8]     // numcoeff
> +    mov             r5, #8
> +    sub             r5, r4, r5              // r5 = qbits - 8
> +    vdup.s32        q10, r5
> +    vneg.s32        q10, q10                // -(qbits- 8)
> +    mov             r5, #1
> +    vdup.u32        q11, r5                 // q11 = 1 in every lane
> +
> +    lsr             r6, r6 ,#2              // r6 = numcoeff / 4 loop iterations
> +    eor             r7, r7                  // r7 = 0, running count of non-zero levels
> +
> +.loop_quant:
> +
> +    vld1.s16        d0, [r0]!               // load four int16 coefficients, advance r0
> +    vmovl.s16       q1, d0                  // coef[blockpos]
> +
> +    vclt.s32        q4, q1, #0              // all ones where coef < 0; NOTE(review): q4 (d8-d9) is callee-saved under AAPCS and is not preserved here
> +    vorr.s32        q4, q4, q11             // q4= sign (-1 for negative coef, +1 otherwise)
> +
> +    vabs.s32        q1, q1                  //
> q1=level=abs(coef[blockpos])
> +    vld1.s32        {q0}, [r1]!             // quantCoeff[blockpos]
> +    vmul.i32        q0, q0, q1              // q0=tmplevel = abs(level) *
> quantCoeff[blockpos];
> +
> +    vadd.s32        q1, q0, q9              // q1= tmplevel+add
> +    vneg.s32        q12, q8                 // q12 = -qbits; loop-invariant, recomputed every iteration
> +    vshl.s32        q1, q1, q12             // q1= level =(tmplevel+add)
> >> qbits
> +
> +    vshl.s32        q3, q1, q8              // q3 = level << qBits
> +    vsub.s32        q13, q0, q3             // q13 = tmplevel - (level <<
> qBits)
> +    vshl.s32        q13, q13, q10           // q13 = ((tmplevel - (level <<
> qBits)) >> qBits8)
> +    vst1.s32        {q13}, [r2]!            // store deltaU
> +
> +    // numsig
> +    vclz.s32        q2, q1                  // 32 only in lanes where level == 0
> +    vshr.u32        q2, #5                  // 1 per zero lane, 0 otherwise
> +    vadd.u32        d4, d5
> +    vpadd.u32       d4, d4                  // d4[0] = number of zero lanes
> +    vmov.32         r12, d4[0]
> +    mov             r8, #4
> +    sub             r8, r8, r12             // r8 = 4 - zero lanes = non-zero levels in this group
> +    add             r7, r8
> +
> +    vmul.s32        q2, q1, q4              // apply sign to level
> +    vqmovn.s32      d0, q2                  // saturating narrow to int16
> +    vst1.s16        d0, [r3]!               // store qCoef
> +
> +    subs            r6, #1
> +    bne             .loop_quant
> +
> +    mov             r0, r7                  // return the non-zero count
> +    pop             {r4-r8}
> +    bx              lr
> +endfunc
> +
> diff -r 4f83d465d11b -r e21b86fb2456 source/common/arm/pixel-util.h
> --- a/source/common/arm/pixel-util.h    Wed Mar 30 17:29:13 2016 +0530
> +++ b/source/common/arm/pixel-util.h    Wed Apr 20 18:44:13 2016 +0530
> @@ -78,4 +78,6 @@
>  int x265_pixel_sa8d_32x32_neon(const pixel* pix1, intptr_t i_pix1, const
> pixel* pix2, intptr_t i_pix2);
>  int x265_pixel_sa8d_32x64_neon(const pixel* pix1, intptr_t i_pix1, const
> pixel* pix2, intptr_t i_pix2);
>  int x265_pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const
> pixel* pix2, intptr_t i_pix2);
> +
> +uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff,
> int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
>  #endif // ifndef X265_PIXEL_UTIL_ARM_H
>
>
>
> Thank you
> Regards
> Ramya
>
> On Wed, Apr 20, 2016 at 10:05 PM, chen <chenm003 at 163.com> wrote:
>
>>
>> At 2016-04-20 21:24:03,ramya at multicorewareinc.com wrote:
>> ># HG changeset patch
>> ># User Ramya Sriraman<ramya at multicorewareinc.com>
>> ># Date 1461158053 -19800
>> >#      Wed Apr 20 18:44:13 2016 +0530
>> ># Node ID 72ae446412d6e25aa3d2aa8ecb657f9815fdb635
>> ># Parent  4f83d465d11b3baa46e6089f73b0929266d4b722
>> >arm: Implement quant
>> >
>> >diff -r 4f83d465d11b -r 72ae446412d6 source/common/arm/asm-primitives.cpp
>> >--- a/source/common/arm/asm-primitives.cpp	Wed Mar 30 17:29:13 2016 +0530
>> >+++ b/source/common/arm/asm-primitives.cpp	Wed Apr 20 18:44:13 2016 +0530
>> >@@ -820,6 +820,8 @@
>> >         p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vsp = PFX(interp_4tap_vert_sp_24x32_neon);
>> >         p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vsp = PFX(interp_4tap_vert_sp_48x64_neon);
>> >
>> >+        // quant
>> >+        p.quant = PFX(quant_neon);
>> >     }
>> >     if (cpuMask & X265_CPU_ARMV6)
>> >     {
>> >diff -r 4f83d465d11b -r 72ae446412d6 source/common/arm/pixel-util.S
>> >--- a/source/common/arm/pixel-util.S	Wed Mar 30 17:29:13 2016 +0530
>> >+++ b/source/common/arm/pixel-util.S	Wed Apr 20 18:44:13 2016 +0530
>> >@@ -1962,3 +1962,63 @@
>> >     bx              lr
>> > endfunc
>> >
>> >+function x265_quant_neon
>> >+    push            {r4-r9}
>> >+    ldr             r4, [sp, #4* 6]         //qbits
>> >+    ldr             r5, [sp, #4* 6 + 4]     // add
>> >+    ldr             r6, [sp, #4* 6 + 8]     // numcoeff
>> >+    mov             r7, #8
>> >+    sub             r7, r7 , r4             //-(qbits- 8)
>> >+
>> >+    lsr             r6, r6 ,#2
>> >+    mov             r8, #0
>> >+
>> >+.loop_quant:
>> >+
>> >+    vld1.s16        d0, [r0]!
>> >+    vmovl.s16       q1, d0                  // coef[blockpos]
>> >+
>> >+    vclt.s32        q4, q1, #0
>> >+    mov             r9, #1
>> >+    vdup.s32        q2, r9
>> >+    vorr.s32        q4, q4, q2              // q4= sign
>> >+
>> >+    vabs.s32        q1, q1                  // q1=level=abs(coef[blockpos])
>> >+    vld1.s32        {q0}, [r1]!             // quantCoeff[blockpos]
>> >+    vmul.i32        q0, q0, q1              // q0=tmplevel = abs(level) *
>> quantCoeff[blockpos];
>> >+
>> >+    vdup.s32        q2, r5
>> r5=add (qbits is in r4); it is constant in the loop, and NEON has 16 Q registers, so there is no need to reload it on every iteration
>>
>> >+    vadd.s32        q1, q0, q2             // q1= tmplevel+add
>> >+    vdup.s32        q2, r4
>> >+    vneg.s32        q2, q2
>> >+    vshl.s32        q1, q1, q2              // q1= level =tmplevel+add >> qbits
>> how about vqshrun?
>>
>> >+
>> >+    vdup.s32        q2, r4
>> >+    vshl.s32        q3, q1, q2              // q3 = level << qBits
>> >+    vsub.s32        q8, q0, q3              // q8= tmplevel - (level << qBits)
>> >+    vdup.s32        q2, r7
>> >+    vshl.s32        q8, q8, q2              // q3= ((tmplevel - (level << qBits)) >> qBits8)
>> >+    vst1.s32        {q8}, [r2]!              // store deltaU
>> >+
>> >+    // numsig
>> >+    vclz.s32        q2, q1
>> >+    vshr.u32        q2, #5
>> >+    vadd.u32        d4, d5
>> >+    vpadd.u32       d4, d4
>> >+    vmov.32         r12, d4[0]
>> >+    mov             r9, #4
>> >+    sub             r9, r9, r12
>> >+    add             r8, r9
>> >+
>> >+    vmul.s32        q2, q1, q4
>> >+    vqmovn.s32      d0, q2
>> >+    vst1.s16        d0, [r3]!
>> >+
>> >+    subs            r6, #1
>> >+    bne             .loop_quant
>> >+
>> >+    mov             r0, r8
>> >+    pop             {r4-r9}
>> >+    bx              lr
>> >+endfunc
>>
>>
>>
>> _______________________________________________
>> x265-devel mailing list
>> x265-devel at videolan.org
>> https://mailman.videolan.org/listinfo/x265-devel
>>
>>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20160425/f217b11e/attachment-0001.html>


More information about the x265-devel mailing list