[x265] [PATCH 1 of 2] asm: rewrite and fix bug in weight_pp_sse4 on HIGH_BIT_DEPTH mode

dave dtyx265 at gmail.com
Mon Jan 19 18:19:27 CET 2015


On 01/19/2015 02:22 AM, Min Chen wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1421662905 -28800
> # Node ID a0bb3bb1b076d2ef559ab94bfe81052142d302c3
> # Parent  bbc333bd4a6207c72c682b3ea88794c67996aa83
> asm: rewrite and fix bug in weight_pp_sse4 on HIGH_BIT_DEPTH mode
> ---
>   source/common/x86/asm-primitives.cpp |    2 +-
>   source/common/x86/pixel-util8.asm    |   55 +++++++++++++++++++++-------------
>   source/test/pixelharness.cpp         |   45 +++++++++++++++++++++++++++
>   3 files changed, 80 insertions(+), 22 deletions(-)
>
> diff -r bbc333bd4a62 -r a0bb3bb1b076 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp	Mon Jan 19 09:59:33 2015 +0530
> +++ b/source/common/x86/asm-primitives.cpp	Mon Jan 19 18:21:45 2015 +0800
> @@ -924,7 +924,7 @@
>   
>           p.planecopy_cp = x265_upShift_8_sse4;
>           // these fail unit tests
> -        // p.weight_pp = x265_weight_pp_sse4;
> +        p.weight_pp = x265_weight_pp_sse4;
>           // p.weight_sp = x265_weight_sp_sse4;
>   
>           p.cu[BLOCK_4x4].psy_cost_pp = x265_psyCost_pp_4x4_sse4;
> diff -r bbc333bd4a62 -r a0bb3bb1b076 source/common/x86/pixel-util8.asm
> --- a/source/common/x86/pixel-util8.asm	Mon Jan 19 09:59:33 2015 +0530
> +++ b/source/common/x86/pixel-util8.asm	Mon Jan 19 18:21:45 2015 +0800
> @@ -55,6 +55,8 @@
>   cextern pw_1
>   cextern pb_1
>   cextern pw_00ff
> +cextern pw_1023
> +cextern pw_3fff
>   cextern pw_2000
>   cextern pw_pixel_max
>   cextern pd_1
> @@ -856,26 +858,52 @@
>   ;void weight_pp(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
>   ;-----------------------------------------------------------------------------------------------------------------------------------------------
>   INIT_XMM sse4
> -cglobal weight_pp, 6, 7, 6
> -
> -    shl         r5d, 6      ; m0 = [w0<<6]
> +cglobal weight_pp, 4,7,7
> +%define correction      (14 - BIT_DEPTH)
> +%if BIT_DEPTH == 10
> +    mova        m6, [pw_1023]
> +%elif BIT_DEPTH == 12
> +    mova        m6, [pw_3fff]
> +%else
> +  %error Unsupported BIT_DEPTH!
> +%endif
The "%error Unsupported BIT_DEPTH!" branch above is triggered when building for 8-bit depth.
Output from the gcc build:

source/common/x86/pixel-util8.asm:868: warning: Unsupported 8!

>       mov         r6d, r6m
> -    shl         r6d, 16
> -    or          r6d, r5d    ; assuming both (w0<<6) and round are using maximum of 16 bits each.
> +    mov         r4d, r4m
> +    mov         r5d, r5m
> +    shl         r6d, 16 - correction
> +    or          r6d, r5d    ; assuming both (w0) and round are using maximum of 16 bits each.
>       movd        m0, r6d
> -    pshufd      m0, m0, 0   ; m0 = [w0<<6, round]
> -    movd        m1, r7m
> +    pshufd      m0, m0, 0   ; m0 = [w0, round]
> +    mov         r5d, r7m
> +    sub         r5d, correction
> +    movd        m1, r5d
>       movd        m2, r8m
>       pshufd      m2, m2, 0
>       mova        m5, [pw_1]
>       sub         r2d, r3d
> +    add         r2d, r2d
>       shr         r3d, 4
>   
>   .loopH:
>       mov         r5d, r3d
>   
>   .loopW:
> -    pmovzxbw    m4, [r0]
> +    movu        m4, [r0]
> +    punpcklwd   m3, m4, m5
> +    pmaddwd     m3, m0
> +    psrad       m3, m1
> +    paddd       m3, m2      ; TODO: we can put Offset into Round, but we have to analyze Dynamic Range before that.
> +
> +    punpckhwd   m4, m5
> +    pmaddwd     m4, m0
> +    psrad       m4, m1
> +    paddd       m4, m2
> +
> +    packusdw    m3, m4
> +    pminuw      m3, m6
> +    movu        [r1], m3
> +
> +    movu        m4, [r0 + mmsize]
>       punpcklwd   m3, m4, m5
>       pmaddwd     m3, m0
>       psrad       m3, m1
> @@ -886,33 +914,18 @@
>       psrad       m4, m1
>       paddd       m4, m2
>   
> -    packssdw    m3, m4
> -    packuswb    m3, m3
> -    movh        [r1], m3
> -
> -    pmovzxbw    m4, [r0 + 8]
> -    punpcklwd   m3, m4, m5
> -    pmaddwd     m3, m0
> -    psrad       m3, m1
> -    paddd       m3, m2
> -
> -    punpckhwd   m4, m5
> -    pmaddwd     m4, m0
> -    psrad       m4, m1
> -    paddd       m4, m2
> -
> -    packssdw    m3, m4
> -    packuswb    m3, m3
> -    movh        [r1 + 8], m3
> -
> -    add         r0, 16
> -    add         r1, 16
> +    packusdw    m3, m4
> +    pminuw      m3, m6
> +    movu        [r1 + mmsize], m3
> +
> +    add         r0, 2 * mmsize
> +    add         r1, 2 * mmsize
>   
>       dec         r5d
> -    jnz         .loopW
> -
> -    lea         r0, [r0 + r2]
> -    lea         r1, [r1 + r2]
> +    jnz        .loopW
> +
> +    add         r0, r2
> +    add         r1, r2
>   
>       dec         r4d
>       jnz         .loopH
> diff -r bbc333bd4a62 -r a0bb3bb1b076 source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp	Mon Jan 19 09:59:33 2015 +0530
> +++ b/source/test/pixelharness.cpp	Mon Jan 19 18:21:45 2015 +0800
> @@ -243,7 +243,29 @@
>           ref(short_test_buff[index] + j, ref_dest, stride, stride, width, height, w0, round << correction, shift + correction, offset);
>   
>           if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
> +        {
> +            printf("--- Ref ---\n");
> +            for(int y = 0; y < 16; y++)
> +            {
> +                for(int x = 0; x < 16; x++)
> +                {
> +                    printf("%04X, ", ref_dest[y * stride + x] & 0xFFFF);
> +                }
> +                printf("\n");
> +            }
> +            printf("\n");
> +            printf("--- Opt ---\n");
> +            for(int y = 0; y < 16; y++)
> +            {
> +                for(int x = 0; x < 16; x++)
> +                {
> +                    printf("%04X, ", opt_dest[y * stride + x] & 0xFFFF);
> +                }
> +                printf("\n");
> +            }
> +            printf("\n");
>               return false;
> +        }
>   
>           reportfail();
>           j += INCR;
> @@ -275,7 +297,30 @@
>           ref(pixel_test_buff[index] + j, ref_dest, stride, width, height, w0, round << correction, shift + correction, offset);
>   
>           if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
> +        {
> +            printf("--- Ref ---\n");
> +            for(int y = 0; y < 16; y++)
> +            {
> +                for(int x = 0; x < 16; x++)
> +                {
> +                    printf("%04X, ", ref_dest[y * stride + x] & 0xFFFF);
> +                }
> +                printf("\n");
> +            }
> +            printf("\n");
> +            printf("--- Opt ---\n");
> +            for(int y = 0; y < 16; y++)
> +            {
> +                for(int x = 0; x < 16; x++)
> +                {
> +                    printf("%04X, ", opt_dest[y * stride + x] & 0xFFFF);
> +                }
> +                printf("\n");
> +            }
> +            printf("\n");
> +            checked(opt, pixel_test_buff[index] + j, opt_dest, stride, width, height, w0, round << correction, shift + correction, offset);
>               return false;
> +        }
>   
>           reportfail();
>           j += INCR;
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel



More information about the x265-devel mailing list