[x265] [PATCH] arm: Implement nquant
Ramya Sriraman
ramya at multicorewareinc.com
Mon Apr 25 13:58:16 CEST 2016
Hey Min, I hope I have understood and addressed what you intended.
# HG changeset patch
# User Ramya Sriraman<ramya at multicorewareinc.com>
# Date 1461321845 -19800
# Fri Apr 22 16:14:05 2016 +0530
# Node ID cb78ce0ce13f564e0766887b9c1e575c416acc7b
# Parent e21b86fb24567ad92be78eaadd4278fd34a7d161
arm: Implement nquant
diff -r e21b86fb2456 -r cb78ce0ce13f source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp Wed Apr 20 18:44:13 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp Fri Apr 22 16:14:05 2016 +0530
@@ -822,6 +822,7 @@
// quant
p.quant = PFX(quant_neon);
+ p.nquant = PFX(nquant_neon);
}
if (cpuMask & X265_CPU_ARMV6)
{
diff -r e21b86fb2456 -r cb78ce0ce13f source/common/arm/pixel-util.S
--- a/source/common/arm/pixel-util.S Wed Apr 20 18:44:13 2016 +0530
+++ b/source/common/arm/pixel-util.S Fri Apr 22 16:14:05 2016 +0530
@@ -2022,3 +2022,53 @@
bx lr
endfunc
+// uint32_t x265_nquant_neon(const int16_t* coef, const int32_t* quantCoeff,
+//                           int16_t* qCoef, int qBits, int add, int numCoeff)
+// In:  r0 = coef, r1 = quantCoeff, r2 = qCoef, r3 = qBits,
+//      [sp] = add, [sp, #4] = numCoeff
+// Out: r0 = number of non-zero quantized levels
+// Four coefficients are handled per pass, so numCoeff must be a non-zero
+// multiple of 4. Only caller-saved registers are used (r0-r3, r12, q0-q3,
+// q8-q10), so nothing has to be pushed.
+function x265_nquant_neon
+    rsb             r12, r3, #0             // r12 = -qBits: vshl by a negative count shifts right
+    vdup.s32        q8, r12                 // q8 = -qBits in every lane
+    ldr             r3, [sp]                // add
+    vdup.s32        q9, r3                  // q9 = add in every lane
+    ldr             r3, [sp, #4]            // numCoeff
+    lsr             r3, r3, #2              // r3 = pass count (4 coefficients per pass)
+    lsl             r12, r3, #2             // r12 = coefficients actually processed
+    vmov.i32        q10, #0                 // q10 = per-lane count of zero levels
+
+.loop_nquant:
+    vld1.s16        {d0}, [r0]!
+    vmovl.s16       q1, d0                  // q1 = coef[blockpos]
+    vclt.s32        q3, q1, #0              // q3 = sign mask, all ones where coef < 0
+    vabs.s32        q1, q1                  // q1 = abs(coef[blockpos])
+    vld1.s32        {q0}, [r1]!             // q0 = quantCoeff[blockpos]
+    vmul.s32        q0, q0, q1              // q0 = tmplevel = abs(level) * quantCoeff[blockpos]
+    vadd.s32        q1, q0, q9              // q1 = tmplevel + add
+    vshl.s32        q1, q1, q8              // q1 = level = (tmplevel + add) >> qBits
+
+    // numSig: vclz yields 32 only for a zero level, so >> 5 leaves 1 per zero lane
+    vclz.s32        q2, q1
+    vshr.u32        q2, q2, #5
+    vadd.u32        q10, q10, q2
+
+    // reapply the sign as (level ^ mask) - mask, then store abs(level saturated to int16)
+    veor.s32        q2, q1, q3
+    vsub.s32        q2, q2, q3
+    vqmovn.s32      d0, q2
+    vabs.s16        d1, d0
+    vst1.s16        {d1}, [r2]!
+
+    subs            r3, #1
+    bne             .loop_nquant
+
+    // reduce the four lane counters once, outside the loop
+    vadd.u32        d20, d20, d21
+    vpadd.u32       d20, d20, d20
+    vmov.32         r0, d20[0]              // r0 = total number of zero levels
+    rsb             r0, r0, r12             // numSig = processed - zero levels
+    bx              lr
+endfunc
diff -r e21b86fb2456 -r cb78ce0ce13f source/common/arm/pixel-util.h
--- a/source/common/arm/pixel-util.h Wed Apr 20 18:44:13 2016 +0530
+++ b/source/common/arm/pixel-util.h Fri Apr 22 16:14:05 2016 +0530
@@ -80,4 +80,5 @@
int x265_pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
+uint32_t x265_nquant_neon(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
#endif // ifndef X265_PIXEL_UTIL_ARM_H
Thank you
Regards
Ramya
On Fri, Apr 22, 2016 at 9:38 PM, chen <chenm003 at 163.com> wrote:
>
> At 2016-04-22 18:51:43,ramya at multicorewareinc.com wrote:
> ># HG changeset patch
> ># User Ramya Sriraman<ramya at multicorewareinc.com>
> ># Date 1461321845 -19800
> ># Fri Apr 22 16:14:05 2016 +0530
> ># Node ID 750a20cdf7dcb381f9008f51473fbd74d45b7f5e
> ># Parent e21b86fb24567ad92be78eaadd4278fd34a7d161
> >arm: Implement nquant
> >
> >diff -r e21b86fb2456 -r 750a20cdf7dc source/common/arm/asm-primitives.cpp
> >--- a/source/common/arm/asm-primitives.cpp Wed Apr 20 18:44:13 2016 +0530
> >+++ b/source/common/arm/asm-primitives.cpp Fri Apr 22 16:14:05 2016 +0530
> >@@ -822,6 +822,7 @@
> >
> > // quant
> > p.quant = PFX(quant_neon);
> >+ p.nquant = PFX(nquant_neon);
> > }
> > if (cpuMask & X265_CPU_ARMV6)
> > {
> >diff -r e21b86fb2456 -r 750a20cdf7dc source/common/arm/pixel-util.S
> >--- a/source/common/arm/pixel-util.S Wed Apr 20 18:44:13 2016 +0530
> >+++ b/source/common/arm/pixel-util.S Fri Apr 22 16:14:05 2016 +0530
> >@@ -2022,3 +2022,54 @@
> > bx lr
> > endfunc
> >
> >+function x265_nquant_neon
> >+ push {r4-r7}
> This may free up several registers once the algorithm is improved.
>
> >+ vdup.s32 q8, r3 // qbits
> >+ vneg.s32 q8, q8 // -qbits
> vneg has throughput 1 and latency 4, so reordering (negating R3 before the vdup) may save some cycles
> ------ R3 is free now
>
>
> >+ ldr r4, [sp, #4* 4] // add
> >+ vdup.s32 q9, r4
> >+ ldr r5, [sp, #4* 4 + 4] // numcoeff
> R4 is only a temporary here; moving this instruction may save one register
>
> >+ mov r4, #1
> >+ vdup.s32 q10, r4
> This may be removed; see the algorithm change suggested below
>
> >+
> >+ lsr r5, r5 ,#2
> >+ eor r6, r6
> >+
> >+.loop_nquant:
> >+
> >+ vld1.s16 d0, [r0]!
> >+ vmovl.s16 q1, d0 // coef[blockpos]
> >+
> >+ vclt.s32 q4, q1, #0
> >+ vorr.s32 q4, q4, q10 // q4= sign
> save sign, we may remove OR operator in here, see below
>
> >+
> >+
> >+ vabs.s32 q1, q1 // q1=level=abs(coef[blockpos])
> >+ vld1.s32 {q0}, [r1]! // quantCoeff[blockpos]
> >+ vmul.s32 q0, q0, q1 // q0=tmplevel = abs(level) * quantCoeff[blockpos];
> >+
> >+ vadd.s32 q1, q0, q9 // q1= tmplevel+add
> >+ vshl.s32 q1, q1, q8 // q1= level =(tmplevel+add) >> qbits
> >+
> >+ // numsig
> >+ vclz.s32 q2, q1
> >+ vshr.u32 q2, #5
> >+ vadd.u32 d4, d5
> >+ vpadd.u32 d4, d4
> >+ vmov.32 r12, d4[0]
> >+ mov r7, #4
> >+ sub r7, r7, r12
> >+ add r6, r7
> Why calculate the exact numsig in every iteration? We may do it once after the loop.
>
> >+
> >+ vmul.s32 q2, q1, q4
> This only needs to restore the sign bits; we may use Q2 = (Q1 ^ Q4) - Q4 instead
>
> >+ vqmovn.s32 d0, q2
> >+ vabs.s16 d1, d0
> >+ vst1.s16 d1, [r2]!
> >+
> >+ subs r5, #1
> >+ bne .loop_nquant
> >+
> >+ mov r0, r6
> >+ pop {r4-r7}
> >+ bx lr
> >+endfunc
> >diff -r e21b86fb2456 -r 750a20cdf7dc source/common/arm/pixel-util.h
> >--- a/source/common/arm/pixel-util.h Wed Apr 20 18:44:13 2016 +0530
> >+++ b/source/common/arm/pixel-util.h Fri Apr 22 16:14:05 2016 +0530
> >@@ -80,4 +80,5 @@
> > int x265_pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
> >
> > uint32_t x265_quant_neon(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
> >+uint32_t x265_nquant_neon(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
> > #endif // ifndef X265_PIXEL_UTIL_ARM_H
> >_______________________________________________
> >x265-devel mailing list
> >x265-devel at videolan.org
> >https://mailman.videolan.org/listinfo/x265-devel
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20160425/7df61ebb/attachment.html>
More information about the x265-devel
mailing list