[x265] [PATCH 2 of 3] asm: optimize nquant by PSIGND, improve 13k cycles -> 11k cycles
Deepthi Nandakumar
deepthi at multicorewareinc.com
Thu Sep 4 06:47:24 CEST 2014
Min,
Praveen has sent a number of patches that change the entire quant
interface so that the coefficients are now 16-bit instead of 32-bit. Do
your patches still assume they are 32-bit?
Can you review all his patches (8-10 patches) and see if we're moving in
the right direction?
Thanks,
Deepthi
On Thu, Sep 4, 2014 at 5:07 AM, Min Chen <chenm003 at 163.com> wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1409787419 25200
> # Node ID 4ca9e972f48cb4530ca7181ad7cec351568a99b3
> # Parent 94bd00d1af5d8c5f6f26f97c50a727588a860714
> asm: optimize nquant by PSIGND, improve 13k cycles -> 11k cycles
>
> diff -r 94bd00d1af5d -r 4ca9e972f48c source/common/dct.cpp
> --- a/source/common/dct.cpp Wed Sep 03 16:36:44 2014 -0700
> +++ b/source/common/dct.cpp Wed Sep 03 16:36:59 2014 -0700
> @@ -801,6 +801,10 @@
> {
> uint32_t numSig = 0;
>
> + X265_CHECK((numCoeff % 16) == 0, "number of quant coeff is not multiple of 4x4\n");
> + X265_CHECK((uint32_t)add < ((uint32_t)1 << qBits), "2 ^ qBits less than add\n");
> + X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quantCoeff buffer not aligned\n");
> +
> for (int blockpos = 0; blockpos < numCoeff; blockpos++)
> {
> int level = coef[blockpos];
> diff -r 94bd00d1af5d -r 4ca9e972f48c source/common/x86/pixel-util8.asm
> --- a/source/common/x86/pixel-util8.asm Wed Sep 03 16:36:44 2014 -0700
> +++ b/source/common/x86/pixel-util8.asm Wed Sep 03 16:36:59 2014 -0700
> @@ -941,55 +941,47 @@
> ; uint32_t nquant(int32_t *coef, int32_t *quantCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
>
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
> -cglobal nquant, 4,5,8
> +cglobal nquant, 3,5,8
> movd m6, r4m
> mov r4d, r5m
> pxor m7, m7 ; m7 = numZero
> - movd m5, r3d ; m5 = qbits
> + movd m5, r3m ; m5 = qbits
> pshufd m6, m6, 0 ; m6 = add
> mov r3d, r4d ; r3 = numCoeff
> shr r4d, 3
> +
> .loop:
> movu m0, [r0] ; m0 = level
> movu m1, [r0 + 16] ; m1 = level
> - movu m2, [r1] ; m2 = qcoeff
> - movu m3, [r1 + 16] ; m3 = qcoeff
> +
> + pabsd m2, m0
> + pmulld m2, [r1] ; m4 = tmpLevel1
> + paddd m2, m6
> + psrad m2, m5 ; m4 = level1
> + psignd m2, m0 ; restore sign
> +
> + pabsd m3, m1
> + pmulld m3, [r1 + 16] ; m4 = tmpLevel1
> + paddd m3, m6
> + psrad m3, m5 ; m4 = level1
> + psignd m3, m1 ; restore sign
> add r0, 32
> add r1, 32
>
> - pxor m4, m4
> - pcmpgtd m4, m0 ; m4 = sign
> - pabsd m0, m0
> - pmulld m0, m2 ; m0 = tmpLevel1
> - paddd m0, m6
> - psrad m0, m5 ; m0 = level1
> - pxor m0, m4
> - psubd m0, m4
> -
> - pxor m4, m4
> - pcmpgtd m4, m1 ; m4 = sign
> - pabsd m1, m1
> - pmulld m1, m3 ; m1 = tmpLevel1
> - paddd m1, m6
> - psrad m1, m5 ; m1 = level1
> - pxor m1, m4
> - psubd m1, m4
> -
> - packssdw m0, m0
> - packssdw m1, m1
> - pmovsxwd m0, m0
> + packssdw m2, m3
> + pmovsxwd m0, m2
> + movhlps m1, m2
> pmovsxwd m1, m1
>
> - movu [r2], m0
> + movu [r2 ], m0
> movu [r2 + 16], m1
> add r2, 32
> +
> + pxor m4, m4
> + pcmpeqw m2, m4
> + psubw m7, m2
> +
> dec r4d
> -
> - packssdw m0, m1
> - pxor m4, m4
> - pcmpeqw m0, m4
> - psubw m7, m0
> -
> jnz .loop
>
> packuswb m7, m7
> @@ -997,10 +989,8 @@
> mov eax, r3d
> movd r4d, m7
> sub eax, r4d ; numSig
> -
> RET
>
> -
>
> ;-----------------------------------------------------------------------------
> ; void dequant_normal(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift)
>
> ;-----------------------------------------------------------------------------
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140904/886f59fe/attachment.html>
More information about the x265-devel
mailing list