[x265] [PATCH] arm: Implement quant
Ramya Sriraman
ramya at multicorewareinc.com
Fri Apr 22 10:25:32 CEST 2016
Thanks for the improvements, Min. The modified patch is below.
# HG changeset patch
# User Ramya Sriraman<ramya at multicorewareinc.com>
# Date 1461158053 -19800
# Wed Apr 20 18:44:13 2016 +0530
# Node ID e21b86fb24567ad92be78eaadd4278fd34a7d161
# Parent 4f83d465d11b3baa46e6089f73b0929266d4b722
arm: Implement quant
diff -r 4f83d465d11b -r e21b86fb2456 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp Wed Mar 30 17:29:13 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp Wed Apr 20 18:44:13 2016 +0530
@@ -820,6 +820,8 @@
p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vsp =
PFX(interp_4tap_vert_sp_24x32_neon);
p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vsp =
PFX(interp_4tap_vert_sp_48x64_neon);
+ // quant
+ p.quant = PFX(quant_neon);
}
if (cpuMask & X265_CPU_ARMV6)
{
diff -r 4f83d465d11b -r e21b86fb2456 source/common/arm/pixel-util.S
--- a/source/common/arm/pixel-util.S Wed Mar 30 17:29:13 2016 +0530
+++ b/source/common/arm/pixel-util.S Wed Apr 20 18:44:13 2016 +0530
@@ -1962,3 +1962,63 @@
bx lr
endfunc
+// uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff,
+//     int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
+// r0 = coef, r1 = quantCoeff, r2 = deltaU, r3 = qCoef; qBits, add and numCoeff
+// are read from the stack. Returns the number of non-zero levels in r0.
+// Four coefficients per iteration; the body runs before the count is tested,
+// so numCoeff is assumed >= 4. q4-q7 (callee-saved d8-d15) are left untouched.
+function x265_quant_neon
+    push            {r4-r8}
+    ldr             r4, [sp, #4* 5]         // qbits (5 registers pushed above)
+    vdup.s32        q8, r4                  // q8 = qbits
+    vneg.s32        q12, q8                 // q12 = -qbits, loop invariant
+    ldr             r5, [sp, #4* 5 + 4]     // add
+    vdup.s32        q9, r5                  // q9 = add
+    ldr             r6, [sp, #4* 5 + 8]     // numcoeff
+    mov             r5, #8
+    sub             r5, r4, r5
+    vdup.s32        q10, r5
+    vneg.s32        q10, q10                // q10 = -(qbits - 8)
+    mov             r5, #1
+    vdup.u32        q11, r5                 // q11 = 1 in every lane
+    lsr             r6, r6 ,#2              // r6 = numcoeff / 4 iterations
+    eor             r7, r7                  // r7 = numsig = 0
+
+.loop_quant:
+    vld1.s16        d0, [r0]!
+    vmovl.s16       q1, d0                  // q1 = coef[blockpos], widened
+    vclt.s32        q14, q1, #0             // all ones (-1) where coef < 0
+    vorr.s32        q14, q14, q11           // q14 = sign (-1 or +1)
+    vabs.s32        q1, q1                  // q1 = abs(coef[blockpos])
+    vld1.s32        {q0}, [r1]!             // quantCoeff[blockpos]
+    vmul.i32        q0, q0, q1              // q0 = tmplevel = abs * quantCoeff
+
+    vadd.s32        q1, q0, q9              // q1 = tmplevel + add
+    vshl.s32        q1, q1, q12             // q1 = level = (tmplevel + add) >> qbits
+    vshl.s32        q3, q1, q8              // q3 = level << qbits
+    vsub.s32        q13, q0, q3             // q13 = tmplevel - (level << qbits)
+    vshl.s32        q13, q13, q10           // q13 = q13 >> (qbits - 8)
+    vst1.s32        {q13}, [r2]!            // store deltaU
+
+    // numsig += 4 - (number of lanes whose level is zero)
+    vclz.s32        q2, q1                  // clz == 32 only when level == 0
+    vshr.u32        q2, #5                  // 1 where level == 0, else 0
+    vadd.u32        d4, d5
+    vpadd.u32       d4, d4                  // d4[0] = zero levels in this group
+    vmov.32         r12, d4[0]
+    mov             r8, #4
+    sub             r8, r8, r12             // r8 = non-zero levels in this group
+    add             r7, r8
+
+    vmul.s32        q2, q1, q14             // level *= sign
+    vqmovn.s32      d0, q2                  // saturate to int16
+    vst1.s16        d0, [r3]!               // store qCoef
+    subs            r6, #1
+    bne             .loop_quant
+
+    mov             r0, r7                  // return numsig
+    pop             {r4-r8}
+    bx              lr
+endfunc
+
diff -r 4f83d465d11b -r e21b86fb2456 source/common/arm/pixel-util.h
--- a/source/common/arm/pixel-util.h Wed Mar 30 17:29:13 2016 +0530
+++ b/source/common/arm/pixel-util.h Wed Apr 20 18:44:13 2016 +0530
@@ -78,4 +78,6 @@
int x265_pixel_sa8d_32x32_neon(const pixel* pix1, intptr_t i_pix1, const
pixel* pix2, intptr_t i_pix2);
int x265_pixel_sa8d_32x64_neon(const pixel* pix1, intptr_t i_pix1, const
pixel* pix2, intptr_t i_pix2);
int x265_pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const
pixel* pix2, intptr_t i_pix2);
+
+uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff,
int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
#endif // ifndef X265_PIXEL_UTIL_ARM_H
Thank you
Regards
Ramya
On Wed, Apr 20, 2016 at 10:05 PM, chen <chenm003 at 163.com> wrote:
>
> At 2016-04-20 21:24:03,ramya at multicorewareinc.com wrote:
> ># HG changeset patch
> ># User Ramya Sriraman<ramya at multicorewareinc.com>
> ># Date 1461158053 -19800
> ># Wed Apr 20 18:44:13 2016 +0530
> ># Node ID 72ae446412d6e25aa3d2aa8ecb657f9815fdb635
> ># Parent 4f83d465d11b3baa46e6089f73b0929266d4b722
> >arm: Implement quant
> >
> >diff -r 4f83d465d11b -r 72ae446412d6 source/common/arm/asm-primitives.cpp
> >--- a/source/common/arm/asm-primitives.cpp Wed Mar 30 17:29:13 2016 +0530
> >+++ b/source/common/arm/asm-primitives.cpp Wed Apr 20 18:44:13 2016 +0530
> >@@ -820,6 +820,8 @@
> > p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vsp = PFX(interp_4tap_vert_sp_24x32_neon);
> > p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vsp = PFX(interp_4tap_vert_sp_48x64_neon);
> >
> >+ // quant
> >+ p.quant = PFX(quant_neon);
> > }
> > if (cpuMask & X265_CPU_ARMV6)
> > {
> >diff -r 4f83d465d11b -r 72ae446412d6 source/common/arm/pixel-util.S
> >--- a/source/common/arm/pixel-util.S Wed Mar 30 17:29:13 2016 +0530
> >+++ b/source/common/arm/pixel-util.S Wed Apr 20 18:44:13 2016 +0530
> >@@ -1962,3 +1962,63 @@
> > bx lr
> > endfunc
> >
> >+function x265_quant_neon
> >+ push {r4-r9}
> >+ ldr r4, [sp, #4* 6] //qbits
> >+ ldr r5, [sp, #4* 6 + 4] // add
> >+ ldr r6, [sp, #4* 6 + 8] // numcoeff
> >+ mov r7, #8
> >+ sub r7, r7 , r4 //-(qbits- 8)
> >+
> >+ lsr r6, r6 ,#2
> >+ mov r8, #0
> >+
> >+.loop_quant:
> >+
> >+ vld1.s16 d0, [r0]!
> >+ vmovl.s16 q1, d0 // coef[blockpos]
> >+
> >+ vclt.s32 q4, q1, #0
> >+ mov r9, #1
> >+ vdup.s32 q2, r9
> >+ vorr.s32 q4, q4, q2 // q4= sign
> >+
> >+ vabs.s32 q1, q1 // q1=level=abs(coef[blockpos])
> >+ vld1.s32 {q0}, [r1]! // quantCoeff[blockpos]
> >+ vmul.i32 q0, q0, q1 // q0=tmplevel = abs(level) *
> quantCoeff[blockpos];
> >+
> >+ vdup.s32 q2, r5
> r5=add, it is constant in the loop, and NEON has 16 quad registers, so there is no need to reload it every iteration
>
> >+ vadd.s32 q1, q0, q2 // q1= tmplevel+add
> >+ vdup.s32 q2, r4
> >+ vneg.s32 q2, q2
> >+ vshl.s32 q1, q1, q2 // q1= level =tmplevel+add >> qbits
> how about vqshrun?
>
> >+
> >+ vdup.s32 q2, r4
> >+ vshl.s32 q3, q1, q2 // q3 = level << qBits
> >+ vsub.s32 q8, q0, q3 // q8= tmplevel - (level << qBits)
> >+ vdup.s32 q2, r7
> >+ vshl.s32 q8, q8, q2 // q3= ((tmplevel - (level << qBits)) >> qBits8)
> >+ vst1.s32 {q8}, [r2]! // store deltaU
> >+
> >+ // numsig
> >+ vclz.s32 q2, q1
> >+ vshr.u32 q2, #5
> >+ vadd.u32 d4, d5
> >+ vpadd.u32 d4, d4
> >+ vmov.32 r12, d4[0]
> >+ mov r9, #4
> >+ sub r9, r9, r12
> >+ add r8, r9
> >+
> >+ vmul.s32 q2, q1, q4
> >+ vqmovn.s32 d0, q2
> >+ vst1.s16 d0, [r3]!
> >+
> >+ subs r6, #1
> >+ bne .loop_quant
> >+
> >+ mov r0, r8
> >+ pop {r4-r9}
> >+ bx lr
> >+endfunc
>
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20160422/b620ec36/attachment.html>
More information about the x265-devel
mailing list