[x265] [PATCH] asm: rewrite 16bpp partial pixels process code on upShift and downShift (Issue #223)
Deepthi Nandakumar
deepthi at multicorewareinc.com
Thu Dec 31 06:41:43 CET 2015
Min,
The testbench reports a failure here:
** testbench failure reported for vc11_64_main12::
Testing primitives: AVX2
Testing primitives: BMI2
planecopy_sp_shl failed
x265: asm primitive has failed. Go and fix that Right Now!
return code -1
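
For context, the harness checks the asm kernel against the plain C reference, which (roughly - this is a from-memory sketch, not a verbatim quote of source/common/pixel.cpp; the name, signature and cast here are assumptions) just up-shifts and masks every pixel, with a 16-bit pixel type at main12:

    #include <stdint.h>

    // Rough sketch of the planecopy_sp_shl C reference the testbench compares
    // against; names and exact casts are assumptions, not quoted from x265.
    static void planecopy_sp_shl_ref(const uint16_t* src, intptr_t srcStride,
                                     uint16_t* dst, intptr_t dstStride,
                                     int width, int height, int shift, uint16_t mask)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dst[x] = (uint16_t)((src[x] << shift) & mask);  // up-shift, then clip with the mask

            src += srcStride;
            dst += dstStride;
        }
    }

The asm has to match this for every width/height pair the harness generates, including rows whose width is not a multiple of 16 - exactly the tail case this patch rewrites.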
On Thu, Dec 31, 2015 at 5:33 AM, Min Chen <chenm003 at 163.com> wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1451520182 21600
> # Node ID 717cb31ed9931513bb0851f0e6c68af868b5ad45
> # Parent 75d1c62d8f0c517dda37ac89f401faa308d60f24
> asm: rewrite 16bpp partial pixels process code on upShift and downShift (Issue #223)
> ---
> source/common/x86/pixel-a.asm | 327 ++++++++++-------------------------------
> source/test/pixelharness.cpp | 25 +++-
> 2 files changed, 103 insertions(+), 249 deletions(-)
>
> diff -r 75d1c62d8f0c -r 717cb31ed993 source/common/x86/pixel-a.asm
> --- a/source/common/x86/pixel-a.asm Thu Dec 24 13:58:32 2015 +0530
> +++ b/source/common/x86/pixel-a.asm Wed Dec 30 18:03:02 2015 -0600
> @@ -8154,92 +8154,57 @@
> ;void planecopy_sc(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
>
> ;------------------------------------------------------------------------------------------------------------------------
> INIT_XMM sse2
> -cglobal downShift_16, 7,7,3
> - movd m0, r6d ; m0 = shift
> +cglobal downShift_16, 4,7,3
> + mov r4d, r4m
> + mov r5d, r5m
> + movd m0, r6m ; m0 = shift
> add r1, r1
> +
> dec r5d
> .loopH:
> xor r6, r6
> +
> .loopW:
> movu m1, [r0 + r6 * 2]
> - movu m2, [r0 + r6 * 2 + 16]
> + movu m2, [r0 + r6 * 2 + mmsize]
> psrlw m1, m0
> psrlw m2, m0
> packuswb m1, m2
> movu [r2 + r6], m1
>
> - add r6, 16
> + add r6, mmsize
> cmp r6d, r4d
> - jl .loopW
> + jl .loopW
>
> ; move to next row
> add r0, r1
> add r2, r3
> dec r5d
> - jnz .loopH
> -
> -;processing last row of every frame [To handle width which not a multiple of 16]
> -
> + jnz .loopH
> +
> + ;processing last row of every frame [To handle width which not a multiple of 16]
> + ; r4d must be more than or equal to 16(mmsize)
> .loop16:
> + movu m1, [r0 + (r4 - mmsize) * 2]
> + movu m2, [r0 + (r4 - mmsize) * 2 + mmsize]
> + psrlw m1, m0
> + psrlw m2, m0
> + packuswb m1, m2
> + movu [r2 + r4 - mmsize], m1
> +
> + sub r4d, mmsize
> + jz .end
> + cmp r4d, mmsize
> + jge .loop16
> +
> + ; process partial pixels
> movu m1, [r0]
> - movu m2, [r0 + 16]
> + movu m2, [r0 + mmsize]
> psrlw m1, m0
> psrlw m2, m0
> packuswb m1, m2
> movu [r2], m1
>
> - add r0, 2 * mmsize
> - add r2, mmsize
> - sub r4d, 16
> - jz .end
> - cmp r4d, 15
> - jg .loop16
> -
> - cmp r4d, 8
> - jl .process4
> - movu m1, [r0]
> - psrlw m1, m0
> - packuswb m1, m1
> - movh [r2], m1
> -
> - add r0, mmsize
> - add r2, 8
> - sub r4d, 8
> - jz .end
> -
> -.process4:
> - cmp r4d, 4
> - jl .process2
> - movh m1,[r0]
> - psrlw m1, m0
> - packuswb m1, m1
> - movd [r2], m1
> -
> - add r0, 8
> - add r2, 4
> - sub r4d, 4
> - jz .end
> -
> -.process2:
> - cmp r4d, 2
> - jl .process1
> - movd m1, [r0]
> - psrlw m1, m0
> - packuswb m1, m1
> - movd r6, m1
> - mov [r2], r6w
> -
> - add r0, 4
> - add r2, 2
> - sub r4d, 2
> - jz .end
> -
> -.process1:
> - movd m1, [r0]
> - psrlw m1, m0
> - packuswb m1, m1
> - movd r3, m1
> - mov [r2], r3b
> .end:
> RET
>
> @@ -8248,12 +8213,16 @@
> ;void planecopy_sp(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
>
> ;-------------------------------------------------------------------------------------------------------------------------------------
> INIT_YMM avx2
> -cglobal downShift_16, 6,7,3
> +cglobal downShift_16, 4,7,3
> + mov r4d, r4m
> + mov r5d, r5m
> movd xm0, r6m ; m0 = shift
> add r1d, r1d
> +
> dec r5d
> .loopH:
> xor r6, r6
> +
> .loopW:
> movu m1, [r0 + r6 * 2 + 0]
> movu m2, [r0 + r6 * 2 + 32]
> @@ -8265,92 +8234,39 @@
>
> add r6d, mmsize
> cmp r6d, r4d
> - jl .loopW
> + jl .loopW
>
> ; move to next row
> add r0, r1
> add r2, r3
> dec r5d
> - jnz .loopH
> -
> -; processing last row of every frame [To handle width which not a multiple of 32]
> - mov r6d, r4d
> - and r4d, 31
> - shr r6d, 5
> + jnz .loopH
> +
> + ; processing last row of every frame [To handle width which not a multiple of 32]
>
> .loop32:
> - movu m1, [r0]
> - movu m2, [r0 + 32]
> + movu m1, [r0 + (r4 - mmsize) * 2]
> + movu m2, [r0 + (r4 - mmsize) * 2 + mmsize]
> psrlw m1, xm0
> psrlw m2, xm0
> packuswb m1, m2
> - vpermq m1, m1, 11011000b
> + vpermq m1, m1, q3120
> + movu [r2 + r4 - mmsize], m1
> +
> + sub r4d, mmsize
> + jz .end
> + cmp r4d, mmsize
> + jge .loop32
> +
> + ; process partial pixels
> + movu m1, [r0]
> + movu m2, [r0 + mmsize]
> + psrlw m1, xm0
> + psrlw m2, xm0
> + packuswb m1, m2
> + vpermq m1, m1, q3120
> movu [r2], m1
>
> - add r0, 2*mmsize
> - add r2, mmsize
> - dec r6d
> - jnz .loop32
> -
> - cmp r4d, 16
> - jl .process8
> - movu m1, [r0]
> - psrlw m1, xm0
> - packuswb m1, m1
> - vpermq m1, m1, 10001000b
> - movu [r2], xm1
> -
> - add r0, mmsize
> - add r2, 16
> - sub r4d, 16
> - jz .end
> -
> -.process8:
> - cmp r4d, 8
> - jl .process4
> - movu m1, [r0]
> - psrlw m1, xm0
> - packuswb m1, m1
> - movq [r2], xm1
> -
> - add r0, 16
> - add r2, 8
> - sub r4d, 8
> - jz .end
> -
> -.process4:
> - cmp r4d, 4
> - jl .process2
> - movq xm1,[r0]
> - psrlw m1, xm0
> - packuswb m1, m1
> - movd [r2], xm1
> -
> - add r0, 8
> - add r2, 4
> - sub r4d, 4
> - jz .end
> -
> -.process2:
> - cmp r4d, 2
> - jl .process1
> - movd xm1, [r0]
> - psrlw m1, xm0
> - packuswb m1, m1
> - movd r6d, xm1
> - mov [r2], r6w
> -
> - add r0, 4
> - add r2, 2
> - sub r4d, 2
> - jz .end
> -
> -.process1:
> - movd xm1, [r0]
> - psrlw m1, xm0
> - packuswb m1, m1
> - movd r3d, xm1
> - mov [r2], r3b
> .end:
> RET
>
> @@ -8487,7 +8403,9 @@
> ;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
>
> ;------------------------------------------------------------------------------------------------------------------------
> INIT_XMM sse2
> -cglobal upShift_16, 6,7,4
> +cglobal upShift_16, 4,7,4
> + mov r4d, r4m
> + mov r5d, r5m
> movd m0, r6m ; m0 = shift
> mova m3, [pw_pixel_max]
> FIX_STRIDES r1d, r3d
> @@ -8515,9 +8433,25 @@
> dec r5d
> jnz .loopH
>
> -;processing last row of every frame [To handle width which not a multiple of 16]
> -
> + ;processing last row of every frame [To handle width which not a multiple of 16]
> +
> + ; WARNING: width(r4d) MUST BE more than or equal to 16(mmsize) in here
> .loop16:
> + movu m1, [r0 + (r4 - mmsize) * 2]
> + movu m2, [r0 + (r4 - mmsize) * 2 + mmsize]
> + psllw m1, m0
> + psllw m2, m0
> + pand m1, m3
> + pand m2, m3
> + movu [r2 + (r4 - mmsize) * 2], m1
> + movu [r2 + (r4 - mmsize) * 2 + mmsize], m2
> +
> + sub r4d, mmsize
> + jz .end
> + cmp r4d, mmsize
> + jge .loop16
> +
> + ; process partial pixels
> movu m1, [r0]
> movu m2, [r0 + mmsize]
> psllw m1, m0
> @@ -8527,56 +8461,6 @@
> movu [r2], m1
> movu [r2 + mmsize], m2
>
> - add r0, 2 * mmsize
> - add r2, 2 * mmsize
> - sub r4d, 16
> - jz .end
> - jg .loop16
> -
> - cmp r4d, 8
> - jl .process4
> - movu m1, [r0]
> - psrlw m1, m0
> - pand m1, m3
> - movu [r2], m1
> -
> - add r0, mmsize
> - add r2, mmsize
> - sub r4d, 8
> - jz .end
> -
> -.process4:
> - cmp r4d, 4
> - jl .process2
> - movh m1,[r0]
> - psllw m1, m0
> - pand m1, m3
> - movh [r2], m1
> -
> - add r0, 8
> - add r2, 8
> - sub r4d, 4
> - jz .end
> -
> -.process2:
> - cmp r4d, 2
> - jl .process1
> - movd m1, [r0]
> - psllw m1, m0
> - pand m1, m3
> - movd [r2], m1
> -
> - add r0, 4
> - add r2, 4
> - sub r4d, 2
> - jz .end
> -
> -.process1:
> - movd m1, [r0]
> - psllw m1, m0
> - pand m1, m3
> - movd r3, m1
> - mov [r2], r3w
> .end:
> RET
>
> @@ -8584,9 +8468,10 @@
>
> ;-------------------------------------------------------------------------------------------------------------------------------------
> ;void planecopy_sp_shl(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
>
> ;-------------------------------------------------------------------------------------------------------------------------------------
> -; TODO: NO TEST CODE!
> INIT_YMM avx2
> -cglobal upShift_16, 6,7,4
> +cglobal upShift_16, 4,7,4
> + mov r4d, r4m
> + mov r5d, r5m
> movd xm0, r6m ; m0 = shift
> vbroadcasti128 m3, [pw_pixel_max]
> FIX_STRIDES r1d, r3d
> @@ -8613,83 +8498,33 @@
> dec r5d
> jnz .loopH
>
> -; processing last row of every frame [To handle width which not a multiple of 32]
> - mov r6d, r4d
> - and r4d, 31
> - shr r6d, 5
> + ; processing last row of every frame [To handle width which not a multiple of 32]
>
> .loop32:
> + movu m1, [r0 + (r4 - mmsize) * 2]
> + movu m2, [r0 + (r4 - mmsize) * 2 + mmsize]
> + psllw m1, xm0
> + psllw m2, xm0
> + pand m1, m3
> + pand m2, m3
> + movu [r2 + (r4 - mmsize) * 2], m1
> + movu [r2 + (r4 - mmsize) * 2 + mmsize], m2
> +
> + sub r4d, mmsize
> + jz .end
> + cmp r4d, mmsize
> + jge .loop32
> +
> + ; process partial pixels
> movu m1, [r0]
> - movu m2, [r0 + mmsize]
> + movu m2, [r0]
> psllw m1, xm0
> psllw m2, xm0
> pand m1, m3
> pand m2, m3
> movu [r2], m1
> - movu [r2 + mmsize], m2
> -
> - add r0, 2*mmsize
> - add r2, 2*mmsize
> - dec r6d
> - jnz .loop32
> -
> - cmp r4d, 16
> - jl .process8
> - movu m1, [r0]
> - psllw m1, xm0
> - pand m1, m3
> - movu [r2], m1
> -
> - add r0, mmsize
> - add r2, mmsize
> - sub r4d, 16
> - jz .end
> -
> -.process8:
> - cmp r4d, 8
> - jl .process4
> - movu xm1, [r0]
> - psllw xm1, xm0
> - pand xm1, xm3
> - movu [r2], xm1
> -
> - add r0, 16
> - add r2, 16
> - sub r4d, 8
> - jz .end
> -
> -.process4:
> - cmp r4d, 4
> - jl .process2
> - movq xm1,[r0]
> - psllw xm1, xm0
> - pand xm1, xm3
> - movq [r2], xm1
> -
> - add r0, 8
> - add r2, 8
> - sub r4d, 4
> - jz .end
> -
> -.process2:
> - cmp r4d, 2
> - jl .process1
> - movd xm1, [r0]
> - psllw xm1, xm0
> - pand xm1, xm3
> - movd [r2], xm1
> -
> - add r0, 4
> - add r2, 4
> - sub r4d, 2
> - jz .end
> -
> -.process1:
> - movd xm1, [r0]
> - psllw xm1, xm0
> - pand xm1, xm3
> - movd r3d, xm1
> - mov [r2], r3w
> + movu [r2], m2
> +
> .end:
> RET
>
> diff -r 75d1c62d8f0c -r 717cb31ed993 source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp Thu Dec 24 13:58:32 2015 +0530
> +++ b/source/test/pixelharness.cpp Wed Dec 30 18:03:02 2015 -0600
> @@ -1299,8 +1299,8 @@
>
> memset(ref_dest, 0xCD, sizeof(ref_dest));
> memset(opt_dest, 0xCD, sizeof(opt_dest));
> - int width = 32 + rand() % 32;
> - int height = 32 + rand() % 32;
> + int width = 32 + (rand() % 32);
> + int height = 32 + (rand() % 32);
> intptr_t srcStride = 64;
> intptr_t dstStride = width;
> int j = 0;
> @@ -1308,11 +1308,23 @@
> for (int i = 0; i < ITERS; i++)
> {
> int index = i % TEST_CASES;
> +
> checked(opt, ushort_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
> ref(ushort_test_buff[index] + j, srcStride, ref_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
>
> - if (memcmp(ref_dest, opt_dest, width * height * sizeof(pixel)))
> + if (memcmp(ref_dest, opt_dest, dstStride * height * sizeof(pixel)))
> + {
> + memcpy(opt_dest, ref_dest, sizeof(ref_dest));
> + opt(ushort_test_buff[index] + j, srcStride, opt_dest, dstStride, width, height, (int)8, (uint16_t)((1 << X265_DEPTH) - 1));
> return false;
> + }
> +
> + // check tail memory area
> + for(int x = width; x < dstStride; x++)
> + {
> + if (opt_dest[(height - 1 * dstStride) + x] != 0xCD)
> + return false;
> + }
>
> reportfail();
> j += INCR;
> @@ -1344,6 +1356,13 @@
> if (memcmp(ref_dest, opt_dest, sizeof(ref_dest)))
> return false;
>
> + // check tail memory area
> + for(int x = width; x < dstStride; x++)
> + {
> + if (opt_dest[(height - 1 * dstStride) + x] != 0xCD)
> + return false;
> + }
> +
> reportfail();
> j += INCR;
> }
>
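
A note on the approach for anyone following along: as I read the rewritten tails (this is a scalar model of my understanding, not the actual code), the last row is now walked back-to-front in full mmsize chunks, and the leftover (< mmsize) pixels are covered by one extra chunk re-anchored at x = 0, which overlaps pixels that were already written rather than running past the row:

    #include <stdint.h>

    // packuswb-style unsigned saturation to 8 bits
    static uint8_t sat_u8(int v) { return (uint8_t)(v > 255 ? 255 : v); }

    // Scalar model of the new last-row handling in the SSE2 downShift path
    // (my reading of the asm, not the code itself; assumes width >= 16, per
    // the "r4d must be more than or equal to 16(mmsize)" comment).
    static void downshift_last_row_model(const uint16_t* src, uint8_t* dst,
                                         int width, int shift)
    {
        const int CHUNK = 16;               // mmsize: bytes stored per iteration
        int w = width;
        while (w >= CHUNK)
        {
            for (int i = 0; i < CHUNK; i++) // one movu/psrlw/psrlw/packuswb/movu group
                dst[w - CHUNK + i] = sat_u8(src[w - CHUNK + i] >> shift);
            w -= CHUNK;
            if (w == 0)
                return;                     // width was a multiple of 16
        }
        // partial pixels: one overlapping chunk re-anchored at the start of the row
        for (int i = 0; i < CHUNK; i++)
            dst[i] = sat_u8(src[i] >> shift);
    }

Because the partial chunk overlaps instead of extending past the end, nothing beyond dst[width - 1] should ever be written, and that is precisely what the new "check tail memory area" loops in pixelharness.cpp guard with the 0xCD fill. The planecopy_sp_shl failure above suggests one of the rewritten upShift paths still gets this wrong somewhere.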
--
Deepthi Nandakumar
Engineering Manager, x265
Multicoreware, Inc