[x265] [PATCH 1 of 3] asm: general calSign to accelerate sao
Steve Borho
steve at borho.org
Fri Apr 3 17:48:37 CEST 2015
On 04/03, Min Chen wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1428059407 -28800
> # Node ID abf20efa2234fb7cd6a474d4dac6e3051a94b30c
> # Parent 9a5fa67583feb6ffb7668f82632f7e93e5ec9415
> asm: general calSign to accelerate sao
Queued, with some adjustments to account for Dnyaneshwar's cleanup of const-a.asm.
> ---
> source/common/x86/const-a.asm | 3 ++
> source/common/x86/loopfilter.asm | 69 ++++++++++++++++++++++++++-----------
> source/encoder/sao.cpp | 14 ++------
> source/test/pixelharness.cpp | 8 ++--
> 4 files changed, 58 insertions(+), 36 deletions(-)
>
> diff -r 9a5fa67583fe -r abf20efa2234 source/common/x86/const-a.asm
> --- a/source/common/x86/const-a.asm Thu Apr 02 13:21:32 2015 -0500
> +++ b/source/common/x86/const-a.asm Fri Apr 03 19:10:07 2015 +0800
> @@ -65,6 +65,9 @@
> const pb_32, times 32 db 32
> const pb_128, times 16 db 128
> const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
> +const pb_movemask, times 16 db 0x00
> + times 16 db 0xFF
> +
>
> const pw_0_15, times 2 dw 0, 1, 2, 3, 4, 5, 6, 7
> const pw_2, times 8 dw 2
> diff -r 9a5fa67583fe -r abf20efa2234 source/common/x86/loopfilter.asm
> --- a/source/common/x86/loopfilter.asm Thu Apr 02 13:21:32 2015 -0500
> +++ b/source/common/x86/loopfilter.asm Fri Apr 03 19:10:07 2015 +0800
> @@ -36,6 +36,7 @@
> cextern pb_128
> cextern pb_2
> cextern pw_2
> +cextern pb_movemask
>
>
> ;============================================================================================================
> @@ -321,29 +322,55 @@
> RET
>
> ;============================================================================================================
> -; void calSign(int8_t *dst, const Pixel *src1, const Pixel *src2, const int endX)
> +; void calSign(int8_t *dst, const Pixel *src1, const Pixel *src2, const int width)
> ;============================================================================================================
> INIT_XMM sse4
> -cglobal calSign, 4, 4, 6
> - mova m1, [pb_128]
> - mova m0, [pb_1]
> - shr r3d, 4
> -.loop
> - movu m2, [r1] ; m2 = pRec[x]
> - movu m3, [r2] ; m3 = pTmpU[x]
> +cglobal calSign, 4,5,6
> + mova m0, [pb_128]
> + mova m1, [pb_1]
>
> - pxor m4, m2, m1
> - pxor m3, m1
> - pcmpgtb m5, m4, m3
> - pcmpgtb m3, m4
> - pand m5, m0
> - por m5, m3
> + sub r1, r0
> + sub r2, r0
>
> - movu [r0], m5
> + mov r4d, r3d
> + shr r3d, 4
> + jz .next
> +.loop:
> + movu m2, [r0 + r1] ; m2 = pRec[x]
> + movu m3, [r0 + r2] ; m3 = pTmpU[x]
> + pxor m4, m2, m0
> + pxor m3, m0
> + pcmpgtb m5, m4, m3
> + pcmpgtb m3, m4
> + pand m5, m1
> + por m5, m3
> + movu [r0], m5
>
> - add r0, 16
> - add r1, 16
> - add r2, 16
> + add r0, 16
> dec r3d
> jnz .loop
> +
> + ; process partial
> +.next:
> + and r4d, 15
> + jz .end
> +
> + movu m2, [r0 + r1] ; m2 = pRec[x]
> + movu m3, [r0 + r2] ; m3 = pTmpU[x]
> + pxor m4, m2, m0
> + pxor m3, m0
> + pcmpgtb m5, m4, m3
> + pcmpgtb m3, m4
> + pand m5, m1
> + por m5, m3
> +
> + lea r3, [pb_movemask + 16]
> + sub r3, r4
> + movu xmm0, [r3]
> + movu m3, [r0]
> + pblendvb m5, m5, m3, xmm0
> + movu [r0], m5
> +
> +.end:
> RET
> +
> diff -r 9a5fa67583fe -r abf20efa2234 source/encoder/sao.cpp
> --- a/source/encoder/sao.cpp Thu Apr 02 13:21:32 2015 -0500
> +++ b/source/encoder/sao.cpp Fri Apr 03 19:10:07 2015 +0800
> @@ -783,13 +783,7 @@
> rec += stride;
> }
>
> - if (!(ctuWidth & 15))
> - primitives.sign(upBuff1, rec, &rec[- stride], ctuWidth);
> - else
> - {
> - for (x = 0; x < ctuWidth; x++)
> - upBuff1[x] = signOf(rec[x] - rec[x - stride]);
> - }
> + primitives.sign(upBuff1, rec, &rec[- stride], ctuWidth);
>
> for (y = startY; y < endY; y++)
> {
> @@ -832,8 +826,7 @@
> rec += stride;
> }
>
> - for (x = startX; x < endX; x++)
> - upBuff1[x] = signOf(rec[x] - rec[x - stride - 1]);
> + primitives.sign(&upBuff1[startX], &rec[startX], &rec[startX - stride - 1], (endX - startX));
>
> for (y = startY; y < endY; y++)
> {
> @@ -879,8 +872,7 @@
> rec += stride;
> }
>
> - for (x = startX - 1; x < endX; x++)
> - upBuff1[x] = signOf(rec[x] - rec[x - stride + 1]);
> + primitives.sign(&upBuff1[startX - 1], &rec[startX - 1], &rec[startX - 1 - stride + 1], (endX - startX + 1));
>
> for (y = startY; y < endY; y++)
> {
> diff -r 9a5fa67583fe -r abf20efa2234 source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp Thu Apr 02 13:21:32 2015 -0500
> +++ b/source/test/pixelharness.cpp Fri Apr 03 19:10:07 2015 +0800
> @@ -870,8 +870,8 @@
>
> bool PixelHarness::check_calSign(sign_t ref, sign_t opt)
> {
> - ALIGN_VAR_16(int8_t, ref_dest[64 * 64]);
> - ALIGN_VAR_16(int8_t, opt_dest[64 * 64]);
> + ALIGN_VAR_16(int8_t, ref_dest[64 * 2]);
> + ALIGN_VAR_16(int8_t, opt_dest[64 * 2]);
>
> memset(ref_dest, 0xCD, sizeof(ref_dest));
> memset(opt_dest, 0xCD, sizeof(opt_dest));
> @@ -880,12 +880,12 @@
>
> for (int i = 0; i < ITERS; i++)
> {
> - int width = 16 * (rand() % 4 + 1);
> + int width = (rand() % 64) + 1;
>
> ref(ref_dest, pbuf2 + j, pbuf3 + j, width);
> checked(opt, opt_dest, pbuf2 + j, pbuf3 + j, width);
>
> - if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int8_t)))
> + if (memcmp(ref_dest, opt_dest, sizeof(ref_dest)))
> return false;
>
> reportfail();
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel mailing list