[x265] [PATCH] asm: added 16bpp support for dct[4x4, 8x8], idct4x4, dst4x4 and idst4x4 primitives
Steve Borho
steve at borho.org
Wed Feb 19 08:46:19 CET 2014
On Wed, Feb 19, 2014 at 1:04 AM, <dnyaneshwar at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
> # Date 1392792673 -19800
> # Wed Feb 19 12:21:13 2014 +0530
> # Node ID 6150985c3d535f0ea7a1dc0b8f3c69e65e30d25b
> # Parent 1a0d5b456b19e8f187290c662425080cfc870492
> asm: added 16bpp support for dct[4x4, 8x8], idct4x4, dst4x4 and idst4x4 primitives
>
pushed
>
> diff -r 1a0d5b456b19 -r 6150985c3d53 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Tue Feb 18 14:46:51 2014 -0600
> +++ b/source/common/x86/asm-primitives.cpp Wed Feb 19 12:21:13 2014 +0530
> @@ -808,6 +808,10 @@
> p.calcrecon[BLOCK_8x8] = x265_calcRecons8_sse2;
> p.calcrecon[BLOCK_16x16] = x265_calcRecons16_sse2;
> p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse2;
> +
> + p.dct[DCT_4x4] = x265_dct4_sse2;
> + p.idct[IDCT_4x4] = x265_idct4_sse2;
> + p.idct[IDST_4x4] = x265_idst4_sse2;
> }
> if (cpuMask & X265_CPU_SSSE3)
> {
> @@ -822,10 +826,12 @@
>
> SETUP_INTRA_ANG32(2, 2, ssse3);
> SETUP_INTRA_ANG32(34, 2, ssse3);
> +
> + p.dct[DST_4x4] = x265_dst4_ssse3;
> }
> if (cpuMask & X265_CPU_SSE4)
> {
> -
> + p.dct[DCT_8x8] = x265_dct8_sse4;
> p.quant = x265_quant_sse4;
> p.dequant_normal = x265_dequant_normal_sse4;
> p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
> diff -r 1a0d5b456b19 -r 6150985c3d53 source/common/x86/const-a.asm
> --- a/source/common/x86/const-a.asm Tue Feb 18 14:46:51 2014 -0600
> +++ b/source/common/x86/const-a.asm Wed Feb 19 12:21:13 2014 +0530
> @@ -69,9 +69,10 @@
> const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1
> const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1
> const pw_pmmpzzzz, dw 1,-1,-1,1,0,0,0,0
> -
> const pd_1, times 4 dd 1
> const pd_2, times 4 dd 2
> +const pd_4, times 4 dd 4
> +const pd_8, times 4 dd 8
> const pd_16, times 4 dd 16
> const pd_32, times 4 dd 32
> const pd_64, times 4 dd 64
> diff -r 1a0d5b456b19 -r 6150985c3d53 source/common/x86/dct8.asm
> --- a/source/common/x86/dct8.asm Tue Feb 18 14:46:51 2014 -0600
> +++ b/source/common/x86/dct8.asm Wed Feb 19 12:21:13 2014 +0530
> @@ -64,9 +64,12 @@
> pb_unpackhlw1: db 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15
>
> SECTION .text
> -
> cextern pd_1
> cextern pd_2
> +cextern pd_4
> +cextern pd_8
> +cextern pd_16
> +cextern pd_32
> cextern pd_64
> cextern pd_128
> cextern pd_256
> @@ -79,16 +82,21 @@
> ;------------------------------------------------------
> INIT_XMM sse2
> cglobal dct4, 3, 4, 8
> -
> +%if BIT_DEPTH == 10
> + %define DCT_SHIFT 3
> + mova m7, [pd_4]
> +%elif BIT_DEPTH == 8
> + %define DCT_SHIFT 1
> + mova m7, [pd_1]
> +%else
> + %error Unsupported BIT_DEPTH!
> +%endif
> add r2d, r2d
> lea r3, [tab_dct4]
>
> mova m4, [r3 + 0 * 16]
> mova m5, [r3 + 1 * 16]
> mova m6, [r3 + 2 * 16]
> -
> - mova m7, [pd_1]
> -
> movh m0, [r0 + 0 * r2]
> movh m1, [r0 + 1 * r2]
> punpcklqdq m0, m1
> @@ -107,27 +115,21 @@
>
> paddw m1, m2, m0
> psubw m2, m0
> -
> pmaddwd m0, m1, m4
> paddd m0, m7
> - psrad m0, 1
> -
> + psrad m0, DCT_SHIFT
> pmaddwd m3, m2, m5
> paddd m3, m7
> - psrad m3, 1
> -
> + psrad m3, DCT_SHIFT
> packssdw m0, m3
> pshufd m0, m0, 0xD8
> pshufhw m0, m0, 0xB1
> -
> pmaddwd m1, m6
> paddd m1, m7
> - psrad m1, 1
> -
> + psrad m1, DCT_SHIFT
> pmaddwd m2, [r3 + 3 * 16]
> paddd m2, m7
> - psrad m2, 1
> -
> + psrad m2, DCT_SHIFT
> packssdw m1, m2
> pshufd m1, m1, 0xD8
> pshufhw m1, m1, 0xB1
> @@ -179,7 +181,7 @@
> %define IDCT4_OFFSET [pd_512]
> %define IDCT4_SHIFT 10
> %else
> - %error Unsupport BIT_DEPTH!
> + %error Unsupported BIT_DEPTH!
> %endif
> add r2d, r2d
> lea r3, [tab_dct4]
> @@ -268,67 +270,60 @@
> INIT_XMM ssse3
> %if ARCH_X86_64
> cglobal dst4, 3, 4, 8+2
> + %define coef2 m8
> + %define coef3 m9
> %else ; ARCH_X86_64 = 0
> cglobal dst4, 3, 4, 8
> + %define coef2 [r3 + 2 * 16]
> + %define coef3 [r3 + 3 * 16]
> %endif ; ARCH_X86_64
> +%define coef0 m6
> +%define coef1 m7
>
> - %define coef0 m6
> - %define coef1 m7
> -%if ARCH_X86_64
> - %define coef2 m8
> - %define coef3 m9
> -%else
> - %define coef2 [r3 + 2 * 16]
> - %define coef3 [r3 + 3 * 16]
> +%if BIT_DEPTH == 8
> + %define DST_SHIFT 1
> + mova m5, [pd_1]
> +%elif BIT_DEPTH == 10
> + %define DST_SHIFT 3
> + mova m5, [pd_4]
> %endif
> -
> add r2d, r2d
> lea r3, [tab_dst4]
> -
> - mova m5, [pd_1]
> -
> mova coef0, [r3 + 0 * 16]
> mova coef1, [r3 + 1 * 16]
> %if ARCH_X86_64
> mova coef2, [r3 + 2 * 16]
> mova coef3, [r3 + 3 * 16]
> %endif
> -
> - movh m0, [r0 + 0 * r2] ;load
> + movh m0, [r0 + 0 * r2] ; load
> movh m1, [r0 + 1 * r2]
> punpcklqdq m0, m1
> -
> lea r0, [r0 + 2 * r2]
> movh m1, [r0]
> movh m2, [r0 + r2]
> punpcklqdq m1, m2
> -
> - pmaddwd m2, m0, coef0 ;DST1
> + pmaddwd m2, m0, coef0 ; DST1
> pmaddwd m3, m1, coef0
> phaddd m2, m3
> paddd m2, m5
> - psrad m2, 1
> -
> + psrad m2, DST_SHIFT
> pmaddwd m3, m0, coef1
> pmaddwd m4, m1, coef1
> phaddd m3, m4
> paddd m3, m5
> - psrad m3, 1
> + psrad m3, DST_SHIFT
> packssdw m2, m3 ; m2 = T70
> -
> pmaddwd m3, m0, coef2
> pmaddwd m4, m1, coef2
> phaddd m3, m4
> paddd m3, m5
> - psrad m3, 1
> -
> + psrad m3, DST_SHIFT
> pmaddwd m0, coef3
> pmaddwd m1, coef3
> phaddd m0, m1
> paddd m0, m5
> - psrad m0, 1
> + psrad m0, DST_SHIFT
> packssdw m3, m0 ; m3 = T71
> -
> mova m5, [pd_128]
>
> pmaddwd m0, m2, coef0 ; DST2
> @@ -365,11 +360,18 @@
> ;void idst4(int32_t *src, int16_t *dst, intptr_t stride)
> ;-------------------------------------------------------
> INIT_XMM sse2
> -cglobal idst4, 3, 4, 6
> -
> +cglobal idst4, 3, 4, 7
> +%if BIT_DEPTH == 8
> + %define m6 [pd_2048]
> + %define IDCT4_SHIFT 12
> +%elif BIT_DEPTH == 10
> + %define m6 [pd_512]
> + %define IDCT4_SHIFT 10
> +%else
> + %error Unsupported BIT_DEPTH!
> +%endif
> add r2d, r2d
> lea r3, [tab_idst4]
> -
> mova m5, [pd_64]
>
> movu m0, [r0 + 0 * 16]
> @@ -414,38 +416,30 @@
>
> punpcklwd m2, m0, m1
> punpckhwd m0, m1
> -
> - mova m5, [pd_2048]
> -
> punpcklwd m1, m2, m0
> punpckhwd m2, m0
> -
> pmaddwd m0, m1, [r3 + 0 * 16]
> pmaddwd m3, m2, [r3 + 1 * 16]
> paddd m0, m3
> - paddd m0, m5
> - psrad m0, 12 ; m1 = S0
> -
> + paddd m0, m6
> + psrad m0, IDCT4_SHIFT ; m0 = S0
> pmaddwd m3, m1, [r3 + 2 * 16]
> pmaddwd m4, m2, [r3 + 3 * 16]
> paddd m3, m4
> - paddd m3, m5
> - psrad m3, 12 ; m3 = S8
> + paddd m3, m6
> + psrad m3, IDCT4_SHIFT ; m3 = S8
> packssdw m0, m3 ; m0 = m128iA
> -
> pmaddwd m3, m1, [r3 + 4 * 16]
> pmaddwd m4, m2, [r3 + 5 * 16]
> paddd m3, m4
> - paddd m3, m5
> - psrad m3, 12 ; m3 = S0
> -
> + paddd m3, m6
> + psrad m3, IDCT4_SHIFT ; m3 = S0
> pmaddwd m1, [r3 + 6 * 16]
> pmaddwd m2, [r3 + 7 * 16]
> paddd m1, m2
> - paddd m1, m5
> - psrad m1, 12 ; m1 = S8
> + paddd m1, m6
> + psrad m1, IDCT4_SHIFT ; m1 = S8
> packssdw m3, m1 ; m3 = m128iD
> -
> punpcklwd m1, m0, m3
> punpckhwd m0, m3
>
> @@ -475,12 +469,19 @@
> ; ...
> ; Row6[4-7] Row7[4-7]
> ;------------------------
> +%if BIT_DEPTH == 10
> + %define DCT_SHIFT 4
> + mova m6, [pd_8]
> +%elif BIT_DEPTH == 8
> + %define DCT_SHIFT 2
> + mova m6, [pd_2]
> +%else
> + %error Unsupported BIT_DEPTH!
> +%endif
>
> add r2, r2
> lea r3, [r2 * 3]
> mov r5, rsp
> -
> - mova m6, [pd_2]
> %assign x 0
> %rep 2
> movu m0, [r0]
> @@ -518,7 +519,7 @@
> pmaddwd m5, m0, [r4 + 0*16]
> phaddd m1, m5
> paddd m1, m6
> - psrad m1, 2
> + psrad m1, DCT_SHIFT
> %if x == 1
> pshufd m1, m1, 0x1B
> %endif
> @@ -528,7 +529,7 @@
> pmaddwd m5, m0, [r4 + 1*16]
> phaddd m1, m5
> paddd m1, m6
> - psrad m1, 2
> + psrad m1, DCT_SHIFT
> %if x == 1
> pshufd m1, m1, 0x1B
> %endif
> @@ -538,7 +539,7 @@
> pmaddwd m5, m0, [r4 + 2*16]
> phaddd m1, m5
> paddd m1, m6
> - psrad m1, 2
> + psrad m1, DCT_SHIFT
> %if x == 1
> pshufd m1, m1, 0x1B
> %endif
> @@ -548,7 +549,7 @@
> pmaddwd m0, [r4 + 3*16]
> phaddd m4, m0
> paddd m4, m6
> - psrad m4, 2
> + psrad m4, DCT_SHIFT
> %if x == 1
> pshufd m4, m4, 0x1B
> %endif
> @@ -561,34 +562,30 @@
> psubw m2, m3 ; m2 = [EO1 EO0]
> psignw m2, [pw_ppppmmmm]
> pshufb m2, [pb_unpackhlw1]
> -
> pmaddwd m3, m0, [r4 + 0*16]
> paddd m3, m6
> - psrad m3, 2
> + psrad m3, DCT_SHIFT
> %if x == 1
> pshufd m3, m3, 0x1B
> %endif
> mova [r5 + 0*2*mmsize], m3 ; Row 0
> -
> pmaddwd m0, [r4 + 2*16]
> paddd m0, m6
> - psrad m0, 2
> + psrad m0, DCT_SHIFT
> %if x == 1
> pshufd m0, m0, 0x1B
> %endif
> mova [r5 + 4*2*mmsize], m0 ; Row 4
> -
> pmaddwd m3, m2, [r4 + 1*16]
> paddd m3, m6
> - psrad m3, 2
> + psrad m3, DCT_SHIFT
> %if x == 1
> pshufd m3, m3, 0x1B
> %endif
> mova [r5 + 2*2*mmsize], m3 ; Row 2
> -
> pmaddwd m2, [r4 + 3*16]
> paddd m2, m6
> - psrad m2, 2
> + psrad m2, DCT_SHIFT
> %if x == 1
> pshufd m2, m2, 0x1B
> %endif
> diff -r 1a0d5b456b19 -r 6150985c3d53 source/test/mbdstharness.cpp
> --- a/source/test/mbdstharness.cpp Tue Feb 18 14:46:51 2014 -0600
> +++ b/source/test/mbdstharness.cpp Wed Feb 19 12:21:13 2014 +0530
> @@ -169,12 +169,14 @@
> X265_FREE(int_test_buff);
> X265_FREE(int_idct_test_buff);
> }
> -
> bool MBDstHarness::check_dct_primitive(dct_t ref, dct_t opt, int width)
> {
> +#if HIGH_BIT_DEPTH
> + int old_depth = X265_DEPTH;
> + X265_DEPTH = 10;
> +#endif
> int j = 0;
> int cmp_size = sizeof(int) * width * width;
> -
> for (int i = 0; i <= 100; i++)
> {
> int index = rand() % TEST_CASES;
> @@ -188,24 +190,34 @@
> ref(short_test_buff[index] + j, mintbuf3, width);
> opt(short_test_buff[index] + j, mintbuf4, width);
> #endif
> +
> +#if HIGH_BIT_DEPTH
> + X265_DEPTH = old_depth;
> +#endif
> +
> return false;
> }
> -
> j += 16;
> #if _DEBUG
> memset(mbuf2, 0xCD, mem_cmp_size);
> memset(mbuf3, 0xCD, mem_cmp_size);
> #endif
> }
> +#if HIGH_BIT_DEPTH
> + X265_DEPTH = old_depth;
> +#endif
>
> return true;
> }
> -
> bool MBDstHarness::check_idct_primitive(idct_t ref, idct_t opt, int width)
> {
> +#if HIGH_BIT_DEPTH
> + int old_depth = X265_DEPTH;
> + X265_DEPTH = 10;
> +#endif
> +
> int j = 0;
> int cmp_size = sizeof(int16_t) * width * width;
> -
> for (int i = 0; i <= 100; i++)
> {
> int index = rand() % TEST_CASES;
> @@ -218,16 +230,22 @@
> ref(int_idct_test_buff[index] + j, mbuf2, width);
> opt(int_idct_test_buff[index] + j, mbuf3, width);
> #endif
> +
> +#if HIGH_BIT_DEPTH
> + X265_DEPTH = old_depth;
> +#endif
> +
> return false;
> }
> -
> j += 16;
> #if _DEBUG
> memset(mbuf2, 0xCD, mem_cmp_size);
> memset(mbuf3, 0xCD, mem_cmp_size);
> #endif
> }
> -
> +#if HIGH_BIT_DEPTH
> + X265_DEPTH = old_depth;
> +#endif
> return true;
> }
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
>
--
Steve Borho
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140219/ab534d51/attachment-0001.html>
More information about the x265-devel mailing list