[x265] [PATCH] asm: added 16bpp support for dct[4x4, 8x8], idct4x4, dst4x4 and idst4x4 primitives

Steve Borho steve at borho.org
Fri Feb 14 19:39:38 CET 2014


On Fri, Feb 14, 2014 at 4:41 AM, <dnyaneshwar at multicorewareinc.com> wrote:

> # HG changeset patch
> # User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
> # Date 1392374441 -19800
> #      Fri Feb 14 16:10:41 2014 +0530
> # Node ID 831536babdc08f1553a10754bf2a4f4af6aa1695
> # Parent  ed310b17ff6681f191c85341cf6efe7a50770143
> asm: added 16bpp support for dct[4x4, 8x8], idct4x4, dst4x4 and idst4x4
> primitives
>

With this patch applied, and after fixing up the "%else if" lines (which
need to be "%elif"; see inline below), I get occasional dequant test
failures in the 8bpp build on Mac.

steve at zeppelin> ./test/TestBench

Using random seed 52FE6216 8bpp

Testing primitives: SSE2

Testing primitives: SSE3

Testing primitives: SSSE3

Testing primitives: SSE4

dequant: Failed!


>
> diff -r ed310b17ff66 -r 831536babdc0 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp      Fri Feb 14 02:30:52 2014
> -0600
> +++ b/source/common/x86/asm-primitives.cpp      Fri Feb 14 16:10:41 2014
> +0530
> @@ -726,6 +726,10 @@
>          p.calcrecon[BLOCK_8x8] = x265_calcRecons8_sse2;
>          p.calcrecon[BLOCK_16x16] = x265_calcRecons16_sse2;
>          p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse2;
> +
> +        p.dct[DCT_4x4] = x265_dct4_sse2;
> +        p.idct[IDCT_4x4] = x265_idct4_sse2;
> +        p.idct[IDST_4x4] = x265_idst4_sse2;
>      }
>      if (cpuMask & X265_CPU_SSSE3)
>      {
> @@ -740,9 +744,12 @@
>
>          SETUP_INTRA_ANG32(2, 2, ssse3);
>          SETUP_INTRA_ANG32(34, 2, ssse3);
> +
> +        p.dct[DST_4x4] = x265_dst4_ssse3;
>      }
>      if (cpuMask & X265_CPU_SSE4)
>      {
> +        p.dct[DCT_8x8] = x265_dct8_sse4;
>          p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
>
>          p.intra_pred[BLOCK_4x4][0] = x265_intra_pred_planar4_sse4;
> diff -r ed310b17ff66 -r 831536babdc0 source/common/x86/const-a.asm
> --- a/source/common/x86/const-a.asm     Fri Feb 14 02:30:52 2014 -0600
> +++ b/source/common/x86/const-a.asm     Fri Feb 14 16:10:41 2014 +0530
> @@ -72,6 +72,8 @@
>
>  const pd_1,        times 4 dd 1
>  const pd_2,        times 4 dd 2
> +const pd_4,        times 4 dd 4
> +const pd_8,        times 4 dd 8
>  const pd_16,       times 4 dd 16
>  const pd_32,       times 4 dd 32
>  const pd_64,       times 4 dd 64
> diff -r ed310b17ff66 -r 831536babdc0 source/common/x86/dct8.asm
> --- a/source/common/x86/dct8.asm        Fri Feb 14 02:30:52 2014 -0600
> +++ b/source/common/x86/dct8.asm        Fri Feb 14 16:10:41 2014 +0530
> @@ -67,6 +67,10 @@
>
>  cextern pd_1
>  cextern pd_2
> +cextern pd_4
> +cextern pd_8
> +cextern pd_16
> +cextern pd_32
>  cextern pd_64
>  cextern pd_128
>  cextern pd_256
> @@ -79,6 +83,15 @@
>  ;------------------------------------------------------
>  INIT_XMM sse2
>  cglobal dct4, 3, 4, 8
> +%if BIT_DEPTH == 10
> +  %define       DCT_SHIFT 3
> +  mova          m7, [pd_4]
> +%else if BIT_DEPTH == 8
>

The line above should be: %elif BIT_DEPTH == 8


> +  %define       DCT_SHIFT 1
> +  mova          m7, [pd_1]
> +%else
> +  %error Unsupported BIT_DEPTH!
> +%endif
>
>      add         r2d, r2d
>      lea         r3, [tab_dct4]
> @@ -87,8 +100,6 @@
>      mova        m5, [r3 + 1 * 16]
>      mova        m6, [r3 + 2 * 16]
>
> -    mova        m7, [pd_1]
> -
>      movh        m0, [r0 + 0 * r2]
>      movh        m1, [r0 + 1 * r2]
>      punpcklqdq  m0, m1
> @@ -110,11 +121,11 @@
>
>      pmaddwd     m0, m1, m4
>      paddd       m0, m7
> -    psrad       m0, 1
> +    psrad       m0, DCT_SHIFT
>
>      pmaddwd     m3, m2, m5
>      paddd       m3, m7
> -    psrad       m3, 1
> +    psrad       m3, DCT_SHIFT
>
>      packssdw    m0, m3
>      pshufd      m0, m0, 0xD8
> @@ -122,11 +133,11 @@
>
>      pmaddwd     m1, m6
>      paddd       m1, m7
> -    psrad       m1, 1
> +    psrad       m1, DCT_SHIFT
>
>      pmaddwd     m2, [r3 + 3 * 16]
>      paddd       m2, m7
> -    psrad       m2, 1
> +    psrad       m2, DCT_SHIFT
>
>      packssdw    m1, m2
>      pshufd      m1, m1, 0xD8
> @@ -179,7 +190,7 @@
>    %define IDCT4_OFFSET  [pd_512]
>    %define IDCT4_SHIFT   10
>  %else
> -  %error Unsupport BIT_DEPTH!
> +  %error Unsupported BIT_DEPTH!
>  %endif
>      add         r2d, r2d
>      lea         r3, [tab_dct4]
> @@ -268,25 +279,28 @@
>  INIT_XMM ssse3
>  %if ARCH_X86_64
>  cglobal dst4, 3, 4, 8+2
> +  %define       coef2   m8
> +  %define       coef3   m9
>  %else ; ARCH_X86_64 = 0
>  cglobal dst4, 3, 4, 8
> +  %define       coef2   [r3 + 2 * 16]
> +  %define       coef3   [r3 + 3 * 16]
>  %endif ; ARCH_X86_64
>
> -    %define coef0   m6
> -    %define coef1   m7
> -%if ARCH_X86_64
> -    %define coef2   m8
> -    %define coef3   m9
> -%else
> -    %define coef2   [r3 + 2 * 16]
> -    %define coef3   [r3 + 3 * 16]
> -%endif
> +%define         coef0   m6
> +%define         coef1   m7
> +
> +%if BIT_DEPTH == 8
> +  %define       DST_SHIFT 1
> +  mova          m5, [pd_1]
> +%else if BIT_DEPTH == 10
>

The line above should be: %elif BIT_DEPTH == 10. There's one more of these below.


> +  %define       DST_SHIFT 3
> +  mova          m5, [pd_4]
> +%endif
>
>      add         r2d, r2d
>      lea         r3, [tab_dst4]
>
> -    mova        m5, [pd_1]
> -
>      mova        coef0, [r3 + 0 * 16]
>      mova        coef1, [r3 + 1 * 16]
>  %if ARCH_X86_64
> @@ -294,7 +308,7 @@
>      mova        coef3, [r3 + 3 * 16]
>  %endif
>
> -    movh        m0, [r0 + 0 * r2]            ;load
> +    movh        m0, [r0 + 0 * r2]            ; load
>      movh        m1, [r0 + 1 * r2]
>      punpcklqdq  m0, m1
>
> @@ -303,30 +317,30 @@
>      movh        m2, [r0 + r2]
>      punpcklqdq  m1, m2
>
> -    pmaddwd     m2, m0, coef0                ;DST1
> +    pmaddwd     m2, m0, coef0                ; DST1
>      pmaddwd     m3, m1, coef0
>      phaddd      m2, m3
>      paddd       m2, m5
> -    psrad       m2, 1
> +    psrad       m2, DST_SHIFT
>
>      pmaddwd     m3, m0, coef1
>      pmaddwd     m4, m1, coef1
>      phaddd      m3, m4
>      paddd       m3, m5
> -    psrad       m3, 1
> +    psrad       m3, DST_SHIFT
>      packssdw    m2, m3                       ; m2 = T70
>
>      pmaddwd     m3, m0, coef2
>      pmaddwd     m4, m1, coef2
>      phaddd      m3, m4
>      paddd       m3, m5
> -    psrad       m3, 1
> +    psrad       m3, DST_SHIFT
>
>      pmaddwd     m0, coef3
>      pmaddwd     m1, coef3
>      phaddd      m0, m1
>      paddd       m0, m5
> -    psrad       m0, 1
> +    psrad       m0, DST_SHIFT
>      packssdw    m3, m0                       ; m3 = T71
>
>      mova        m5, [pd_128]
> @@ -365,8 +379,16 @@
>  ;void idst4(int32_t *src, int16_t *dst, intptr_t stride)
>  ;-------------------------------------------------------
>  INIT_XMM sse2
> -cglobal idst4, 3, 4, 6
> -
> +cglobal idst4, 3, 4, 7
> +%if BIT_DEPTH == 8
> +  %define m6  [pd_2048]
> +  %define IDCT4_SHIFT 12
> +%elif BIT_DEPTH == 10
> +  %define m6  [pd_512]
> +  %define IDCT4_SHIFT 10
> +%else
> +  %error Unsupported BIT_DEPTH!
> +%endif
>      add         r2d, r2d
>      lea         r3, [tab_idst4]
>
> @@ -415,35 +437,33 @@
>      punpcklwd   m2, m0, m1
>      punpckhwd   m0, m1
>
> -    mova        m5, [pd_2048]
> -
>      punpcklwd   m1, m2, m0
>      punpckhwd   m2, m0
>
>      pmaddwd     m0, m1, [r3 + 0 * 16]
>      pmaddwd     m3, m2, [r3 + 1 * 16]
>      paddd       m0, m3
> -    paddd       m0, m5
> -    psrad       m0, 12                      ; m1 = S0
> +    paddd       m0, m6
> +    psrad       m0, IDCT4_SHIFT             ; m0 = S0
>
>      pmaddwd     m3, m1, [r3 + 2 * 16]
>      pmaddwd     m4, m2, [r3 + 3 * 16]
>      paddd       m3, m4
> -    paddd       m3, m5
> -    psrad       m3, 12                      ; m3 = S8
> +    paddd       m3, m6
> +    psrad       m3, IDCT4_SHIFT             ; m3 = S8
>      packssdw    m0, m3                      ; m0 = m128iA
>
>      pmaddwd     m3, m1, [r3 + 4 * 16]
>      pmaddwd     m4, m2, [r3 + 5 * 16]
>      paddd       m3, m4
> -    paddd       m3, m5
> -    psrad       m3, 12                      ; m3 = S0
> +    paddd       m3, m6
> +    psrad       m3, IDCT4_SHIFT             ; m3 = S0
>
>      pmaddwd     m1, [r3 + 6 * 16]
>      pmaddwd     m2, [r3 + 7 * 16]
>      paddd       m1, m2
> -    paddd       m1, m5
> -    psrad       m1, 12                      ; m1 = S8
> +    paddd       m1, m6
> +    psrad       m1, IDCT4_SHIFT             ; m1 = S8
>      packssdw    m3, m1                      ; m3 = m128iD
>
>      punpcklwd   m1, m0, m3
> @@ -476,11 +496,20 @@
>      ; Row6[4-7] Row7[4-7]
>      ;------------------------
>
> +%if BIT_DEPTH == 10
> +  %define       DCT_SHIFT 4
> +  mova          m6, [pd_8]
> +%else if BIT_DEPTH == 8
> +  %define       DCT_SHIFT 2
> +  mova          m6, [pd_2]
> +%else
> +  %error Unsupported BIT_DEPTH!
> +%endif
> +
>      add         r2, r2
>      lea         r3, [r2 * 3]
>      mov         r5, rsp
>
> -    mova        m6, [pd_2]
>  %assign x 0
>  %rep 2
>      movu        m0, [r0]
> @@ -518,7 +547,7 @@
>      pmaddwd     m5, m0, [r4 + 0*16]
>      phaddd      m1, m5
>      paddd       m1, m6
> -    psrad       m1, 2
> +    psrad       m1, DCT_SHIFT
>    %if x == 1
>      pshufd      m1, m1, 0x1B
>    %endif
> @@ -528,7 +557,7 @@
>      pmaddwd     m5, m0, [r4 + 1*16]
>      phaddd      m1, m5
>      paddd       m1, m6
> -    psrad       m1, 2
> +    psrad       m1, DCT_SHIFT
>    %if x == 1
>      pshufd      m1, m1, 0x1B
>    %endif
> @@ -538,7 +567,7 @@
>      pmaddwd     m5, m0, [r4 + 2*16]
>      phaddd      m1, m5
>      paddd       m1, m6
> -    psrad       m1, 2
> +    psrad       m1, DCT_SHIFT
>    %if x == 1
>      pshufd      m1, m1, 0x1B
>    %endif
> @@ -548,7 +577,7 @@
>      pmaddwd     m0, [r4 + 3*16]
>      phaddd      m4, m0
>      paddd       m4, m6
> -    psrad       m4, 2
> +    psrad       m4, DCT_SHIFT
>    %if x == 1
>      pshufd      m4, m4, 0x1B
>    %endif
> @@ -564,7 +593,7 @@
>
>      pmaddwd     m3, m0, [r4 + 0*16]
>      paddd       m3, m6
> -    psrad       m3, 2
> +    psrad       m3, DCT_SHIFT
>    %if x == 1
>      pshufd      m3, m3, 0x1B
>    %endif
> @@ -572,7 +601,7 @@
>
>      pmaddwd     m0, [r4 + 2*16]
>      paddd       m0, m6
> -    psrad       m0, 2
> +    psrad       m0, DCT_SHIFT
>    %if x == 1
>      pshufd      m0, m0, 0x1B
>    %endif
> @@ -580,7 +609,7 @@
>
>      pmaddwd     m3, m2, [r4 + 1*16]
>      paddd       m3, m6
> -    psrad       m3, 2
> +    psrad       m3, DCT_SHIFT
>    %if x == 1
>      pshufd      m3, m3, 0x1B
>    %endif
> @@ -588,7 +617,7 @@
>
>      pmaddwd     m2, [r4 + 3*16]
>      paddd       m2, m6
> -    psrad       m2, 2
> +    psrad       m2, DCT_SHIFT
>    %if x == 1
>      pshufd      m2, m2, 0x1B
>    %endif
> diff -r ed310b17ff66 -r 831536babdc0 source/test/mbdstharness.cpp
> --- a/source/test/mbdstharness.cpp      Fri Feb 14 02:30:52 2014 -0600
> +++ b/source/test/mbdstharness.cpp      Fri Feb 14 16:10:41 2014 +0530
> @@ -173,6 +173,10 @@
>
>  bool MBDstHarness::check_dct_primitive(dct_t ref, dct_t opt, int width)
>  {
> +#if HIGH_BIT_DEPTH
> +    int old_depth = X265_DEPTH;
> +    X265_DEPTH = 10;
> +#endif
>      int j = 0;
>      int cmp_size = sizeof(int) * width * width;
>
> @@ -189,6 +193,11 @@
>              ref(short_test_buff[index] + j, mintbuf3, width);
>              opt(short_test_buff[index] + j, mintbuf4, width);
>  #endif
> +
> +#if HIGH_BIT_DEPTH
> +    X265_DEPTH = old_depth;
> +#endif
> +
>              return false;
>          }
>
> @@ -199,11 +208,20 @@
>  #endif
>      }
>
> +#if HIGH_BIT_DEPTH
> +    X265_DEPTH = old_depth;
> +#endif
> +
>      return true;
>  }
>
>  bool MBDstHarness::check_idct_primitive(idct_t ref, idct_t opt, int width)
>  {
> +#if HIGH_BIT_DEPTH
> +    int old_depth = X265_DEPTH;
> +    X265_DEPTH = 10;
> +#endif
> +
>      int j = 0;
>      int cmp_size = sizeof(int16_t) * width * width;
>
> @@ -220,6 +238,11 @@
>              ref(int_test_buff[index] + j, mbuf2, width);
>              opt(int_test_buff[index] + j, mbuf3, width);
>  #endif
> +
> +#if HIGH_BIT_DEPTH
> +    X265_DEPTH = old_depth;
> +#endif
> +
>              return false;
>          }
>
> @@ -230,6 +253,9 @@
>  #endif
>      }
>
> +#if HIGH_BIT_DEPTH
> +    X265_DEPTH = old_depth;
> +#endif
>      return true;
>  }
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>



-- 
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140214/d835fdee/attachment-0001.html>


More information about the x265-devel mailing list