[x265] [PATCH 2 of 2] asm: rewrite and fix bug in weight_sp_sse4 on HIGH_BIT_DEPTH mode
dave
dtyx265 at gmail.com
Mon Jan 19 18:33:45 CET 2015
On 01/19/2015 02:22 AM, Min Chen wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1421662910 -28800
> # Node ID b2f64dbe26392dd6bea2badaccf2869bec883392
> # Parent a0bb3bb1b076d2ef559ab94bfe81052142d302c3
> asm: rewrite and fix bug in weight_sp_sse4 on HIGH_BIT_DEPTH mode
> ---
> source/common/pixel.cpp | 7 ++
> source/common/x86/asm-primitives.cpp | 2 +-
> source/common/x86/const-a.asm | 1 +
> source/common/x86/pixel-util8.asm | 134 +++++++++++++++++-----------------
> source/test/pixelharness.cpp | 10 ++-
> 5 files changed, 81 insertions(+), 73 deletions(-)
>
> diff -r a0bb3bb1b076 -r b2f64dbe2639 source/common/pixel.cpp
> --- a/source/common/pixel.cpp Mon Jan 19 18:21:45 2015 +0800
> +++ b/source/common/pixel.cpp Mon Jan 19 18:21:50 2015 +0800
> @@ -520,6 +520,13 @@
> {
> int x, y;
>
> + const int correction = (IF_INTERNAL_PREC - X265_DEPTH);
I am not sure why, but gcc thinks correction is an unused variable here:
source/common/pixel.cpp:523:15: warning: unused variable ‘correction’
[-Wunused-variable]
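Presumably X265_CHECK expands to nothing outside checked/debug builds, so
correction ends up referenced only by the elided checks. A minimal sketch of
one way to keep the warning quiet, assuming that is indeed the cause:

    const int correction = (IF_INTERNAL_PREC - X265_DEPTH);
    (void)correction; // referenced only by X265_CHECK, which may compile away

Marking the variable used this way is just one option; guarding the declaration
with the same condition X265_CHECK uses would work as well.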
> +
> + X265_CHECK(!((w0 << 6) > 32767), "w0 using more than 16 bits, asm output will mismatch\n");
> + X265_CHECK(!(round > 32767), "round using more than 16 bits, asm output will mismatch\n");
> + X265_CHECK((shift >= correction), "shift must include the factor correction, please update ASM ABI\n");
> + X265_CHECK(!(round & ((1 << correction) - 1)), "round must include the factor correction, please update ASM ABI\n");
> +
> for (y = 0; y <= height - 1; y++)
> {
> for (x = 0; x <= width - 1; )
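The 32767 ceilings in the checks added above mirror the rewritten asm further
down, which packs round into the high half and w0 into the low half of a single
dword and feeds the pair to pmaddwd, where both halves are treated as signed
16-bit operands. A rough illustration, with hypothetical values:

    // hypothetical values, only to illustrate the dword packing the asm relies on
    int w0 = 64, round = 512;
    uint32_t packed = ((uint32_t)round << 16) | (uint16_t)w0;  // becomes m3 = [round w0]
    // pmaddwd against interleaved [src 1] word pairs then yields src*w0 + round per lane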
> diff -r a0bb3bb1b076 -r b2f64dbe2639 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Mon Jan 19 18:21:45 2015 +0800
> +++ b/source/common/x86/asm-primitives.cpp Mon Jan 19 18:21:50 2015 +0800
> @@ -925,7 +925,7 @@
> p.planecopy_cp = x265_upShift_8_sse4;
> // these fail unit tests
> p.weight_pp = x265_weight_pp_sse4;
> - // p.weight_sp = x265_weight_sp_sse4;
> + p.weight_sp = x265_weight_sp_sse4;
>
> p.cu[BLOCK_4x4].psy_cost_pp = x265_psyCost_pp_4x4_sse4;
> #if X86_64
> diff -r a0bb3bb1b076 -r b2f64dbe2639 source/common/x86/const-a.asm
> --- a/source/common/x86/const-a.asm Mon Jan 19 18:21:45 2015 +0800
> +++ b/source/common/x86/const-a.asm Mon Jan 19 18:21:50 2015 +0800
> @@ -63,6 +63,7 @@
> const pb_128, times 16 db 128
> const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
>
> +const pw_0_15, times 2 dw 0, 1, 2, 3, 4, 5, 6, 7
> const pw_2, times 8 dw 2
> const pw_m2, times 8 dw -2
> const pw_4, times 8 dw 4
> diff -r a0bb3bb1b076 -r b2f64dbe2639 source/common/x86/pixel-util8.asm
> --- a/source/common/x86/pixel-util8.asm Mon Jan 19 18:21:45 2015 +0800
> +++ b/source/common/x86/pixel-util8.asm Mon Jan 19 18:21:50 2015 +0800
> @@ -53,6 +53,7 @@
> SECTION .text
>
> cextern pw_1
> +cextern pw_0_15
> cextern pb_1
> cextern pw_00ff
> cextern pw_1023
> @@ -63,7 +64,6 @@
> cextern pd_32767
> cextern pd_n32768
>
> -
> ;-----------------------------------------------------------------------------
> ; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
> ;-----------------------------------------------------------------------------
> @@ -986,84 +986,82 @@
> ;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
> ;-------------------------------------------------------------------------------------------------------------------------------------------------
> INIT_XMM sse4
> -%if ARCH_X86_64
> -cglobal weight_sp, 6, 7+2, 7
> - %define tmp_r0 r7
> - %define tmp_r1 r8
> -%else ; ARCH_X86_64 = 0
> -cglobal weight_sp, 6, 7, 7, 0-(2*4)
> - %define tmp_r0 [(rsp + 0 * 4)]
> - %define tmp_r1 [(rsp + 1 * 4)]
> -%endif ; ARCH_X86_64
> -
> - movd m0, r6m ; m0 = [w0]
> -
> - movd m1, r7m ; m1 = [round]
> - punpcklwd m0, m1
> - pshufd m0, m0, 0 ; m0 = [w0 round]
> -
> - movd m1, r8m ; m1 = [shift]
> -
> - movd m2, r9m
> - pshufd m2, m2, 0 ; m2 =[offset]
> -
> - mova m3, [pw_1]
> - mova m4, [pw_2000]
> -
> +cglobal weight_sp, 6,7,8
> +%if BIT_DEPTH == 10
> + mova m1, [pw_1023]
> +%elif BIT_DEPTH == 12
> + mova m1, [pw_3fff]
> +%else
> + %error Unsupported BIT_DEPTH!
> +%endif
> + mova m2, [pw_1]
> + mov r6d, r7m
> + shl r6d, 16
> + or r6d, r6m ; assumes both w0 and round fit in 16 bits each.
> + movd m3, r6d
> + pshufd m3, m3, 0 ; m3 = [round w0]
> +
> + movd m4, r8m ; m4 = [shift]
> + movd m5, r9m
> + pshufd m5, m5, 0 ; m5 = [offset]
> +
> + ; correct row stride
> + add r3d, r3d
> add r2d, r2d
> + mov r6d, r4d
> + and r6d, ~(mmsize / SIZEOF_PIXEL - 1)
> + sub r3d, r6d
> + sub r3d, r6d
> + sub r2d, r6d
> + sub r2d, r6d
> +
> + ; generate partial width mask (MUST BE IN XMM0)
> + mov r6d, r4d
> + and r6d, (mmsize / SIZEOF_PIXEL - 1)
> + movd m0, r6d
> + pshuflw m0, m0, 0
> + punpcklqdq m0, m0
> + pcmpgtw m0, [pw_0_15]
>
> .loopH:
> mov r6d, r4d
>
> - ; save old src and dst
> - mov tmp_r0, r0
> - mov tmp_r1, r1
> .loopW:
> - movu m5, [r0]
> - paddw m5, m4
> -
> - punpcklwd m6,m5, m3
> - pmaddwd m6, m0
> - psrad m6, m1
> - paddd m6, m2
> -
> - punpckhwd m5, m3
> - pmaddwd m5, m0
> - psrad m5, m1
> - paddd m5, m2
> -
> - packssdw m6, m5
> - packuswb m6, m6
> -
> - sub r6d, 8
> - jl .width4
> - movh [r1], m6
> - je .nextH
> - add r0, 16
> - add r1, 8
> -
> - jmp .loopW
> -
> -.width4:
> - cmp r6d, -4
> - jl .width2
> - movd [r1], m6
> - je .nextH
> - add r1, 4
> - pshufd m6, m6, 1
> -
> -.width2:
> - pextrw [r1], m6, 0
> + movu m6, [r0]
> + paddw m6, [pw_2000]
> +
> + punpcklwd m7, m6, m2
> + pmaddwd m7, m3
> + psrad m7, m4
> + paddd m7, m5
> +
> + punpckhwd m6, m2
> + pmaddwd m6, m3
> + psrad m6, m4
> + paddd m6, m5
> +
> + packusdw m7, m6
> + pminuw m7, m1
> +
> + sub r6d, (mmsize / SIZEOF_PIXEL)
> + jl .widthLess8
> + movu [r1], m7
> + lea r0, [r0 + mmsize]
> + lea r1, [r1 + mmsize]
> + je .nextH
> + jmp .loopW
> +
> +.widthLess8:
> + movu m6, [r1]
> + pblendvb m6, m6, m7, m0
> + movu [r1], m6
>
> .nextH:
> - mov r0, tmp_r0
> - mov r1, tmp_r1
> - lea r0, [r0 + r2]
> - lea r1, [r1 + r3]
> + add r0, r2
> + add r1, r3
>
> dec r5d
> jnz .loopH
> -
> RET
>
> ;-----------------------------------------------------------------
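For readers following the rewrite: per sample, the new loop computes roughly the
following, as can be read off the paddw/pmaddwd/psrad/paddd/packusdw/pminuw
sequence above (pw_2000 is IF_INTERNAL_OFFS). This is an illustrative scalar
sketch with a hypothetical helper name, not the exact C reference in pixel.cpp:

    #include <stdint.h>

    // illustrative scalar model of one sample in the SSE4 loop above; maxVal is
    // the clamp constant loaded into m1 (0x3ff for 10-bit, 0x3fff for 12-bit)
    static inline uint16_t weight_sp_one_sample(int16_t s, int w0, int round,
                                                int shift, int offset, int maxVal)
    {
        int t = s + 8192;                           // paddw m6, [pw_2000] (IF_INTERNAL_OFFS)
        t = ((w0 * t + round) >> shift) + offset;   // pmaddwd / psrad / paddd
        return (uint16_t)(t < 0 ? 0 : (t > maxVal ? maxVal : t)); // packusdw + pminuw
    }

Two details worth noting: the partial-width mask built against pw_0_15 has to
live in xmm0 because the non-AVX pblendvb encoding uses xmm0 as its implicit
selector, and both row strides are pre-decremented by the bytes consumed by the
full-vector iterations so .nextH can advance r0/r1 with a plain add.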
> diff -r a0bb3bb1b076 -r b2f64dbe2639 source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp Mon Jan 19 18:21:45 2015 +0800
> +++ b/source/test/pixelharness.cpp Mon Jan 19 18:21:50 2015 +0800
> @@ -222,8 +222,8 @@
>
> bool PixelHarness::check_weightp(weightp_sp_t ref, weightp_sp_t opt)
> {
> - ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
> - ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
> + ALIGN_VAR_16(pixel, ref_dest[64 * (64 + 1)]);
> + ALIGN_VAR_16(pixel, opt_dest[64 * (64 + 1)]);
>
> memset(ref_dest, 0, 64 * 64 * sizeof(pixel));
> memset(opt_dest, 0, 64 * 64 * sizeof(pixel));
> @@ -236,11 +236,12 @@
> int offset = (rand() % 256) - 128;
> intptr_t stride = 64;
> const int correction = (IF_INTERNAL_PREC - X265_DEPTH);
> +
> for (int i = 0; i < ITERS; i++)
> {
> int index = i % TEST_CASES;
> - checked(opt, short_test_buff[index] + j, opt_dest, stride, stride, width, height, w0, round << correction, shift + correction, offset);
> - ref(short_test_buff[index] + j, ref_dest, stride, stride, width, height, w0, round << correction, shift + correction, offset);
> + checked(opt, short_test_buff[index] + j, opt_dest, stride, stride + 1, width, height, w0, round << correction, shift + correction, offset);
> + ref(short_test_buff[index] + j, ref_dest, stride, stride + 1, width, height, w0, round << correction, shift + correction, offset);
>
> if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
> {
> @@ -264,6 +265,7 @@
> printf("\n");
> }
> printf("\n");
> + opt(short_test_buff[index] + j, opt_dest, stride, stride + 1, width, height, w0, round << correction, shift + correction, offset);
> return false;
> }
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel