[x265] [PATCH] asm: avx2 10bit code for planecopy_cp(10660.20 -> 5685.80)
chen
chenm003 at 163.com
Fri Jul 17 02:23:42 CEST 2015
The primitive reads beyond the array bound; I have modified the algorithm to avoid that.
My local smoke-test system failed, so I will upload the patch once I have verified it.
At 2015-07-17 08:19:35,"Deepthi Nandakumar" <deepthi at multicorewareinc.com> wrote:
Thanks, Min. Do we know what exactly caused this on some Macs - unaligned access? The test farm Darwin machine has reported no smoke test failures.
On Fri, Jul 17, 2015 at 2:51 AM, Steve Borho <steve at borho.org> wrote:
On 06/26, rajesh at multicorewareinc.com wrote:
> # HG changeset patch
> # User Rajesh Paulraj<rajesh at multicorewareinc.com>
> # Date 1435311677 -19800
> # Fri Jun 26 15:11:17 2015 +0530
> # Node ID 818b70b015513a01993af0c48e4714cf4fd8dc84
> # Parent 956401f1a679f1e71181b704d64e4acdb6f1a93f
> asm: avx2 10bit code for planecopy_cp(10660.20 -> 5685.80)
>
> avx2:
> planecopy_cp 19.36x 5685.80 110052.08
>
> sse4:
> planecopy_cp 9.65x 10660.20 102850.27
FYI: this primitive has introduced a SIGBUS on some of the main10 smoke
tests on Mac. Min is working on a rewrite of the primitive which should
hopefully resolve this problem.
> diff -r 956401f1a679 -r 818b70b01551 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Fri Jun 26 15:01:16 2015 +0530
> +++ b/source/common/x86/asm-primitives.cpp Fri Jun 26 15:11:17 2015 +0530
> @@ -1522,6 +1522,7 @@
> p.scale2D_64to32 = PFX(scale2D_64to32_avx2);
> p.weight_pp = PFX(weight_pp_avx2);
> p.sign = PFX(calSign_avx2);
> + p.planecopy_cp = PFX(upShift_8_avx2);
>
> p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_avx2);
> p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx2);
> diff -r 956401f1a679 -r 818b70b01551 source/common/x86/pixel-a.asm
> --- a/source/common/x86/pixel-a.asm Fri Jun 26 15:01:16 2015 +0530
> +++ b/source/common/x86/pixel-a.asm Fri Jun 26 15:11:17 2015 +0530
> @@ -7388,6 +7388,96 @@
> .end:
> RET
>
> +;---------------------------------------------------------------------------------------------------------------------
> +;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
> +;---------------------------------------------------------------------------------------------------------------------
> +%if ARCH_X86_64
> +INIT_YMM avx2
> +cglobal upShift_8, 7,8,3
> + movd xm2, r6d
> + add r3, r3
> +
> +.loopH:
> + xor r7, r7
> + mov r6d, r4d
> +.loopW:
> + pmovzxbw m0,[r0 + r7]
> + pmovzxbw m1,[r0 + r7 + 16]
> + psllw m0, xm2
> + psllw m1, xm2
> + movu [r2 + r7 * 2], m0
> + movu [r2 + r7 * 2 + 32], m1
> +
> + add r7d, 32
> + sub r6d, 32
> + jg .loopW
> +
> + ; move to next row
> + add r0, r1
> + add r2, r3
> + dec r5d
> + jnz .loopH
> +
> +;processing last row of every frame [To handle width which not a multiple of 16]
> +
> +.loop16:
> + pmovzxbw m0,[r0]
> + psllw m0, xm2
> + movu [r2], m0
> +
> + add r0, mmsize
> + add r2, 2 * mmsize
> + sub r4d, 16
> + jg .loop16
> + jz .end
> +
> + cmp r4d, 8
> + jl .process4
> + pmovzxbw m0,[r0]
> + psllw m0, xm2
> + movu [r2], m0
> +
> + add r0, 8
> + add r2, mmsize
> + sub r4d, 8
> + jz .end
> +
> +.process4:
> + cmp r4d, 4
> + jl .process2
> + movq xm0,[r0]
> + pmovzxbw m0,xm0
> + psllw xm0, xm2
> + movq [r2], xm0
> +
> + add r0, 4
> + add r2, 8
> + sub r4d, 4
> + jz .end
> +
> +.process2:
> + cmp r4d, 2
> + jl .process1
> + movzx r3d, byte [r0]
> + shl r3d, 2
> + mov [r2], r3w
> + movzx r3d, byte [r0 + 1]
> + shl r3d, 2
> + mov [r2 + 2], r3w
> +
> + add r0, 2
> + add r2, 4
> + sub r4d, 2
> + jz .end
> +
> +.process1:
> + movzx r3d, byte [r0]
> + shl r3d, 2
> + mov [r2], r3w
> +.end:
> + RET
> +%endif
> +
> %macro ABSD2 6 ; dst1, dst2, src1, src2, tmp, tmp
> %if cpuflag(ssse3)
> pabsd %1, %3
> diff -r 956401f1a679 -r 818b70b01551 source/common/x86/pixel.h
> --- a/source/common/x86/pixel.h Fri Jun 26 15:01:16 2015 +0530
> +++ b/source/common/x86/pixel.h Fri Jun 26 15:11:17 2015 +0530
> @@ -31,6 +31,7 @@
> void PFX(downShift_16_sse2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
> void PFX(downShift_16_avx2)(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
> void PFX(upShift_8_sse4)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
> +void PFX(upShift_8_avx2)(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
>
> #define DECL_PIXELS(cpu) \
> FUNCDEF_PU(int, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
_______________________________________________
x265-devel mailing list
x265-devel at videolan.org
https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150717/09cc8639/attachment-0001.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-asm-rewrite-partial-process-code-in-upShift_8_avx2-t.patch
Type: application/octet-stream
Size: 1918 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150717/09cc8639/attachment-0001.obj>
More information about the x265-devel mailing list