[x265] [PATCH] asm: added 16bpp support for dct[4x4, 8x8], idct4x4, dst4x4 and idst4x4 primitives
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Wed Feb 19 08:04:20 CET 2014
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1392792673 -19800
# Wed Feb 19 12:21:13 2014 +0530
# Node ID 6150985c3d535f0ea7a1dc0b8f3c69e65e30d25b
# Parent 1a0d5b456b19e8f187290c662425080cfc870492
asm: added 16bpp support for dct[4x4, 8x8], idct4x4, dst4x4 and idst4x4 primitives
diff -r 1a0d5b456b19 -r 6150985c3d53 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Feb 18 14:46:51 2014 -0600
+++ b/source/common/x86/asm-primitives.cpp Wed Feb 19 12:21:13 2014 +0530
@@ -808,6 +808,10 @@
p.calcrecon[BLOCK_8x8] = x265_calcRecons8_sse2;
p.calcrecon[BLOCK_16x16] = x265_calcRecons16_sse2;
p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse2;
+
+ p.dct[DCT_4x4] = x265_dct4_sse2;
+ p.idct[IDCT_4x4] = x265_idct4_sse2;
+ p.idct[IDST_4x4] = x265_idst4_sse2;
}
if (cpuMask & X265_CPU_SSSE3)
{
@@ -822,10 +826,12 @@
SETUP_INTRA_ANG32(2, 2, ssse3);
SETUP_INTRA_ANG32(34, 2, ssse3);
+
+ p.dct[DST_4x4] = x265_dst4_ssse3;
}
if (cpuMask & X265_CPU_SSE4)
{
-
+ p.dct[DCT_8x8] = x265_dct8_sse4;
p.quant = x265_quant_sse4;
p.dequant_normal = x265_dequant_normal_sse4;
p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
diff -r 1a0d5b456b19 -r 6150985c3d53 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Tue Feb 18 14:46:51 2014 -0600
+++ b/source/common/x86/const-a.asm Wed Feb 19 12:21:13 2014 +0530
@@ -69,9 +69,10 @@
const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1
const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1
const pw_pmmpzzzz, dw 1,-1,-1,1,0,0,0,0
-
const pd_1, times 4 dd 1
const pd_2, times 4 dd 2
+const pd_4, times 4 dd 4
+const pd_8, times 4 dd 8
const pd_16, times 4 dd 16
const pd_32, times 4 dd 32
const pd_64, times 4 dd 64
diff -r 1a0d5b456b19 -r 6150985c3d53 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Tue Feb 18 14:46:51 2014 -0600
+++ b/source/common/x86/dct8.asm Wed Feb 19 12:21:13 2014 +0530
@@ -64,9 +64,12 @@
pb_unpackhlw1: db 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15
SECTION .text
-
cextern pd_1
cextern pd_2
+cextern pd_4
+cextern pd_8
+cextern pd_16
+cextern pd_32
cextern pd_64
cextern pd_128
cextern pd_256
@@ -79,16 +82,21 @@
;------------------------------------------------------
INIT_XMM sse2
cglobal dct4, 3, 4, 8
-
+%if BIT_DEPTH == 10
+ %define DCT_SHIFT 3
+ mova m7, [pd_4]
+%elif BIT_DEPTH == 8
+ %define DCT_SHIFT 1
+ mova m7, [pd_1]
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
add r2d, r2d
lea r3, [tab_dct4]
mova m4, [r3 + 0 * 16]
mova m5, [r3 + 1 * 16]
mova m6, [r3 + 2 * 16]
-
- mova m7, [pd_1]
-
movh m0, [r0 + 0 * r2]
movh m1, [r0 + 1 * r2]
punpcklqdq m0, m1
@@ -107,27 +115,21 @@
paddw m1, m2, m0
psubw m2, m0
-
pmaddwd m0, m1, m4
paddd m0, m7
- psrad m0, 1
-
+ psrad m0, DCT_SHIFT
pmaddwd m3, m2, m5
paddd m3, m7
- psrad m3, 1
-
+ psrad m3, DCT_SHIFT
packssdw m0, m3
pshufd m0, m0, 0xD8
pshufhw m0, m0, 0xB1
-
pmaddwd m1, m6
paddd m1, m7
- psrad m1, 1
-
+ psrad m1, DCT_SHIFT
pmaddwd m2, [r3 + 3 * 16]
paddd m2, m7
- psrad m2, 1
-
+ psrad m2, DCT_SHIFT
packssdw m1, m2
pshufd m1, m1, 0xD8
pshufhw m1, m1, 0xB1
@@ -179,7 +181,7 @@
%define IDCT4_OFFSET [pd_512]
%define IDCT4_SHIFT 10
%else
- %error Unsupport BIT_DEPTH!
+ %error Unsupported BIT_DEPTH!
%endif
add r2d, r2d
lea r3, [tab_dct4]
@@ -268,67 +270,60 @@
INIT_XMM ssse3
%if ARCH_X86_64
cglobal dst4, 3, 4, 8+2
+ %define coef2 m8
+ %define coef3 m9
%else ; ARCH_X86_64 = 0
cglobal dst4, 3, 4, 8
+ %define coef2 [r3 + 2 * 16]
+ %define coef3 [r3 + 3 * 16]
%endif ; ARCH_X86_64
+%define coef0 m6
+%define coef1 m7
- %define coef0 m6
- %define coef1 m7
-%if ARCH_X86_64
- %define coef2 m8
- %define coef3 m9
-%else
- %define coef2 [r3 + 2 * 16]
- %define coef3 [r3 + 3 * 16]
+%if BIT_DEPTH == 8
+ %define DST_SHIFT 1
+ mova m5, [pd_1]
+%elif BIT_DEPTH == 10
+ %define DST_SHIFT 3
+ mova m5, [pd_4]
%endif
-
add r2d, r2d
lea r3, [tab_dst4]
-
- mova m5, [pd_1]
-
mova coef0, [r3 + 0 * 16]
mova coef1, [r3 + 1 * 16]
%if ARCH_X86_64
mova coef2, [r3 + 2 * 16]
mova coef3, [r3 + 3 * 16]
%endif
-
- movh m0, [r0 + 0 * r2] ;load
+ movh m0, [r0 + 0 * r2] ; load
movh m1, [r0 + 1 * r2]
punpcklqdq m0, m1
-
lea r0, [r0 + 2 * r2]
movh m1, [r0]
movh m2, [r0 + r2]
punpcklqdq m1, m2
-
- pmaddwd m2, m0, coef0 ;DST1
+ pmaddwd m2, m0, coef0 ; DST1
pmaddwd m3, m1, coef0
phaddd m2, m3
paddd m2, m5
- psrad m2, 1
-
+ psrad m2, DST_SHIFT
pmaddwd m3, m0, coef1
pmaddwd m4, m1, coef1
phaddd m3, m4
paddd m3, m5
- psrad m3, 1
+ psrad m3, DST_SHIFT
packssdw m2, m3 ; m2 = T70
-
pmaddwd m3, m0, coef2
pmaddwd m4, m1, coef2
phaddd m3, m4
paddd m3, m5
- psrad m3, 1
-
+ psrad m3, DST_SHIFT
pmaddwd m0, coef3
pmaddwd m1, coef3
phaddd m0, m1
paddd m0, m5
- psrad m0, 1
+ psrad m0, DST_SHIFT
packssdw m3, m0 ; m3 = T71
-
mova m5, [pd_128]
pmaddwd m0, m2, coef0 ; DST2
@@ -365,11 +360,18 @@
;void idst4(int32_t *src, int16_t *dst, intptr_t stride)
;-------------------------------------------------------
INIT_XMM sse2
-cglobal idst4, 3, 4, 6
-
+cglobal idst4, 3, 4, 7
+%if BIT_DEPTH == 8
+ %define m6 [pd_2048]
+ %define IDCT4_SHIFT 12
+%elif BIT_DEPTH == 10
+ %define m6 [pd_512]
+ %define IDCT4_SHIFT 10
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
add r2d, r2d
lea r3, [tab_idst4]
-
mova m5, [pd_64]
movu m0, [r0 + 0 * 16]
@@ -414,38 +416,30 @@
punpcklwd m2, m0, m1
punpckhwd m0, m1
-
- mova m5, [pd_2048]
-
punpcklwd m1, m2, m0
punpckhwd m2, m0
-
pmaddwd m0, m1, [r3 + 0 * 16]
pmaddwd m3, m2, [r3 + 1 * 16]
paddd m0, m3
- paddd m0, m5
- psrad m0, 12 ; m1 = S0
-
+ paddd m0, m6
+ psrad m0, IDCT4_SHIFT ; m0 = S0
pmaddwd m3, m1, [r3 + 2 * 16]
pmaddwd m4, m2, [r3 + 3 * 16]
paddd m3, m4
- paddd m3, m5
- psrad m3, 12 ; m3 = S8
+ paddd m3, m6
+ psrad m3, IDCT4_SHIFT ; m3 = S8
packssdw m0, m3 ; m0 = m128iA
-
pmaddwd m3, m1, [r3 + 4 * 16]
pmaddwd m4, m2, [r3 + 5 * 16]
paddd m3, m4
- paddd m3, m5
- psrad m3, 12 ; m3 = S0
-
+ paddd m3, m6
+ psrad m3, IDCT4_SHIFT ; m3 = S0
pmaddwd m1, [r3 + 6 * 16]
pmaddwd m2, [r3 + 7 * 16]
paddd m1, m2
- paddd m1, m5
- psrad m1, 12 ; m1 = S8
+ paddd m1, m6
+ psrad m1, IDCT4_SHIFT ; m1 = S8
packssdw m3, m1 ; m3 = m128iD
-
punpcklwd m1, m0, m3
punpckhwd m0, m3
@@ -475,12 +469,19 @@
; ...
; Row6[4-7] Row7[4-7]
;------------------------
+%if BIT_DEPTH == 10
+ %define DCT_SHIFT 4
+ mova m6, [pd_8]
+%elif BIT_DEPTH == 8
+ %define DCT_SHIFT 2
+ mova m6, [pd_2]
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
add r2, r2
lea r3, [r2 * 3]
mov r5, rsp
-
- mova m6, [pd_2]
%assign x 0
%rep 2
movu m0, [r0]
@@ -518,7 +519,7 @@
pmaddwd m5, m0, [r4 + 0*16]
phaddd m1, m5
paddd m1, m6
- psrad m1, 2
+ psrad m1, DCT_SHIFT
%if x == 1
pshufd m1, m1, 0x1B
%endif
@@ -528,7 +529,7 @@
pmaddwd m5, m0, [r4 + 1*16]
phaddd m1, m5
paddd m1, m6
- psrad m1, 2
+ psrad m1, DCT_SHIFT
%if x == 1
pshufd m1, m1, 0x1B
%endif
@@ -538,7 +539,7 @@
pmaddwd m5, m0, [r4 + 2*16]
phaddd m1, m5
paddd m1, m6
- psrad m1, 2
+ psrad m1, DCT_SHIFT
%if x == 1
pshufd m1, m1, 0x1B
%endif
@@ -548,7 +549,7 @@
pmaddwd m0, [r4 + 3*16]
phaddd m4, m0
paddd m4, m6
- psrad m4, 2
+ psrad m4, DCT_SHIFT
%if x == 1
pshufd m4, m4, 0x1B
%endif
@@ -561,34 +562,30 @@
psubw m2, m3 ; m2 = [EO1 EO0]
psignw m2, [pw_ppppmmmm]
pshufb m2, [pb_unpackhlw1]
-
pmaddwd m3, m0, [r4 + 0*16]
paddd m3, m6
- psrad m3, 2
+ psrad m3, DCT_SHIFT
%if x == 1
pshufd m3, m3, 0x1B
%endif
mova [r5 + 0*2*mmsize], m3 ; Row 0
-
pmaddwd m0, [r4 + 2*16]
paddd m0, m6
- psrad m0, 2
+ psrad m0, DCT_SHIFT
%if x == 1
pshufd m0, m0, 0x1B
%endif
mova [r5 + 4*2*mmsize], m0 ; Row 4
-
pmaddwd m3, m2, [r4 + 1*16]
paddd m3, m6
- psrad m3, 2
+ psrad m3, DCT_SHIFT
%if x == 1
pshufd m3, m3, 0x1B
%endif
mova [r5 + 2*2*mmsize], m3 ; Row 2
-
pmaddwd m2, [r4 + 3*16]
paddd m2, m6
- psrad m2, 2
+ psrad m2, DCT_SHIFT
%if x == 1
pshufd m2, m2, 0x1B
%endif
diff -r 1a0d5b456b19 -r 6150985c3d53 source/test/mbdstharness.cpp
--- a/source/test/mbdstharness.cpp Tue Feb 18 14:46:51 2014 -0600
+++ b/source/test/mbdstharness.cpp Wed Feb 19 12:21:13 2014 +0530
@@ -169,12 +169,14 @@
X265_FREE(int_test_buff);
X265_FREE(int_idct_test_buff);
}
-
bool MBDstHarness::check_dct_primitive(dct_t ref, dct_t opt, int width)
{
+#if HIGH_BIT_DEPTH
+ int old_depth = X265_DEPTH;
+ X265_DEPTH = 10;
+#endif
int j = 0;
int cmp_size = sizeof(int) * width * width;
-
for (int i = 0; i <= 100; i++)
{
int index = rand() % TEST_CASES;
@@ -188,24 +190,34 @@
ref(short_test_buff[index] + j, mintbuf3, width);
opt(short_test_buff[index] + j, mintbuf4, width);
#endif
+
+#if HIGH_BIT_DEPTH
+ X265_DEPTH = old_depth;
+#endif
+
return false;
}
-
j += 16;
#if _DEBUG
memset(mbuf2, 0xCD, mem_cmp_size);
memset(mbuf3, 0xCD, mem_cmp_size);
#endif
}
+#if HIGH_BIT_DEPTH
+ X265_DEPTH = old_depth;
+#endif
return true;
}
-
bool MBDstHarness::check_idct_primitive(idct_t ref, idct_t opt, int width)
{
+#if HIGH_BIT_DEPTH
+ int old_depth = X265_DEPTH;
+ X265_DEPTH = 10;
+#endif
+
int j = 0;
int cmp_size = sizeof(int16_t) * width * width;
-
for (int i = 0; i <= 100; i++)
{
int index = rand() % TEST_CASES;
@@ -218,16 +230,22 @@
ref(int_idct_test_buff[index] + j, mbuf2, width);
opt(int_idct_test_buff[index] + j, mbuf3, width);
#endif
+
+#if HIGH_BIT_DEPTH
+ X265_DEPTH = old_depth;
+#endif
+
return false;
}
-
j += 16;
#if _DEBUG
memset(mbuf2, 0xCD, mem_cmp_size);
memset(mbuf3, 0xCD, mem_cmp_size);
#endif
}
-
+#if HIGH_BIT_DEPTH
+ X265_DEPTH = old_depth;
+#endif
return true;
}
More information about the x265-devel mailing list