<div dir="ltr"><br><div class="gmail_extra"><br><br><div class="gmail_quote">On Fri, Feb 14, 2014 at 4:41 AM, <span dir="ltr"><<a href="mailto:dnyaneshwar@multicorewareinc.com" target="_blank">dnyaneshwar@multicorewareinc.com</a>></span> wrote:<br>
<blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex"># HG changeset patch<br>
# User Dnyaneshwar G <<a href="mailto:dnyaneshwar@multicorewareinc.com">dnyaneshwar@multicorewareinc.com</a>><br>
# Date 1392374441 -19800<br>
# Fri Feb 14 16:10:41 2014 +0530<br>
# Node ID 831536babdc08f1553a10754bf2a4f4af6aa1695<br>
# Parent ed310b17ff6681f191c85341cf6efe7a50770143<br>
asm: added 16bpp support for dct[4x4, 8x8], idct4x4, dst4x4 and idst4x4 primitives<br></blockquote><div><br></div><div>With this patch applied, and after fixing up the "%else if" lines to "%elif" (noted inline below), I get occasional dequant test failures on 8bpp Mac.</div>
<div><br></div><div>
<p class="">steve@<span class="">zeppelin</span>> ./test/TestBench</p><p class="">Using random seed 52FE6216 8bpp</p>
<p class="">Testing primitives: SSE2</p>
<p class="">Testing primitives: SSE3</p>
<p class="">Testing primitives: SSSE3</p>
<p class="">Testing primitives: SSE4</p>
<p class="">dequant: Failed!</p></div><div> </div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex">
<br>
diff -r ed310b17ff66 -r 831536babdc0 source/common/x86/asm-primitives.cpp<br>
--- a/source/common/x86/asm-primitives.cpp Fri Feb 14 02:30:52 2014 -0600<br>
+++ b/source/common/x86/asm-primitives.cpp Fri Feb 14 16:10:41 2014 +0530<br>
@@ -726,6 +726,10 @@<br>
p.calcrecon[BLOCK_8x8] = x265_calcRecons8_sse2;<br>
p.calcrecon[BLOCK_16x16] = x265_calcRecons16_sse2;<br>
p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse2;<br>
+<br>
+ p.dct[DCT_4x4] = x265_dct4_sse2;<br>
+ p.idct[IDCT_4x4] = x265_idct4_sse2;<br>
+ p.idct[IDST_4x4] = x265_idst4_sse2;<br>
}<br>
if (cpuMask & X265_CPU_SSSE3)<br>
{<br>
@@ -740,9 +744,12 @@<br>
<br>
SETUP_INTRA_ANG32(2, 2, ssse3);<br>
SETUP_INTRA_ANG32(34, 2, ssse3);<br>
+<br>
+ p.dct[DST_4x4] = x265_dst4_ssse3;<br>
}<br>
if (cpuMask & X265_CPU_SSE4)<br>
{<br>
+ p.dct[DCT_8x8] = x265_dct8_sse4;<br>
p.cvt16to32_shl = x265_cvt16to32_shl_sse4;<br>
<br>
p.intra_pred[BLOCK_4x4][0] = x265_intra_pred_planar4_sse4;<br>
diff -r ed310b17ff66 -r 831536babdc0 source/common/x86/const-a.asm<br>
--- a/source/common/x86/const-a.asm Fri Feb 14 02:30:52 2014 -0600<br>
+++ b/source/common/x86/const-a.asm Fri Feb 14 16:10:41 2014 +0530<br>
@@ -72,6 +72,8 @@<br>
<br>
const pd_1, times 4 dd 1<br>
const pd_2, times 4 dd 2<br>
+const pd_4, times 4 dd 4<br>
+const pd_8, times 4 dd 8<br>
const pd_16, times 4 dd 16<br>
const pd_32, times 4 dd 32<br>
const pd_64, times 4 dd 64<br>
diff -r ed310b17ff66 -r 831536babdc0 source/common/x86/dct8.asm<br>
--- a/source/common/x86/dct8.asm Fri Feb 14 02:30:52 2014 -0600<br>
+++ b/source/common/x86/dct8.asm Fri Feb 14 16:10:41 2014 +0530<br>
@@ -67,6 +67,10 @@<br>
<br>
cextern pd_1<br>
cextern pd_2<br>
+cextern pd_4<br>
+cextern pd_8<br>
+cextern pd_16<br>
+cextern pd_32<br>
cextern pd_64<br>
cextern pd_128<br>
cextern pd_256<br>
@@ -79,6 +83,15 @@<br>
;------------------------------------------------------<br>
INIT_XMM sse2<br>
cglobal dct4, 3, 4, 8<br>
+%if BIT_DEPTH == 10<br>
+ %define DCT_SHIFT 3<br>
+ mova m7, [pd_4]<br>
+%else if BIT_DEPTH == 8<br></blockquote><div><br></div><div>This should be: %elif BIT_DEPTH == 8</div><div> </div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex">
+ %define DCT_SHIFT 1<br>
+ mova m7, [pd_1]<br>
+%else<br>
+ %error Unsupported BIT_DEPTH!<br>
+%endif<br>
<br>
add r2d, r2d<br>
lea r3, [tab_dct4]<br>
@@ -87,8 +100,6 @@<br>
mova m5, [r3 + 1 * 16]<br>
mova m6, [r3 + 2 * 16]<br>
<br>
- mova m7, [pd_1]<br>
-<br>
movh m0, [r0 + 0 * r2]<br>
movh m1, [r0 + 1 * r2]<br>
punpcklqdq m0, m1<br>
@@ -110,11 +121,11 @@<br>
<br>
pmaddwd m0, m1, m4<br>
paddd m0, m7<br>
- psrad m0, 1<br>
+ psrad m0, DCT_SHIFT<br>
<br>
pmaddwd m3, m2, m5<br>
paddd m3, m7<br>
- psrad m3, 1<br>
+ psrad m3, DCT_SHIFT<br>
<br>
packssdw m0, m3<br>
pshufd m0, m0, 0xD8<br>
@@ -122,11 +133,11 @@<br>
<br>
pmaddwd m1, m6<br>
paddd m1, m7<br>
- psrad m1, 1<br>
+ psrad m1, DCT_SHIFT<br>
<br>
pmaddwd m2, [r3 + 3 * 16]<br>
paddd m2, m7<br>
- psrad m2, 1<br>
+ psrad m2, DCT_SHIFT<br>
<br>
packssdw m1, m2<br>
pshufd m1, m1, 0xD8<br>
@@ -179,7 +190,7 @@<br>
%define IDCT4_OFFSET [pd_512]<br>
%define IDCT4_SHIFT 10<br>
%else<br>
- %error Unsupport BIT_DEPTH!<br>
+ %error Unsupported BIT_DEPTH!<br>
%endif<br>
add r2d, r2d<br>
lea r3, [tab_dct4]<br>
@@ -268,25 +279,28 @@<br>
INIT_XMM ssse3<br>
%if ARCH_X86_64<br>
cglobal dst4, 3, 4, 8+2<br>
+ %define coef2 m8<br>
+ %define coef3 m9<br>
%else ; ARCH_X86_64 = 0<br>
cglobal dst4, 3, 4, 8<br>
+ %define coef2 [r3 + 2 * 16]<br>
+ %define coef3 [r3 + 3 * 16]<br>
%endif ; ARCH_X86_64<br>
<br>
- %define coef0 m6<br>
- %define coef1 m7<br>
-%if ARCH_X86_64<br>
- %define coef2 m8<br>
- %define coef3 m9<br>
-%else<br>
- %define coef2 [r3 + 2 * 16]<br>
- %define coef3 [r3 + 3 * 16]<br>
-%endif<br>
+%define coef0 m6<br>
+%define coef1 m7<br>
+<br>
+%if BIT_DEPTH == 8<br>
+ %define DST_SHIFT 1<br>
+ mova m5, [pd_1]<br>
+%else if BIT_DEPTH == 10<br></blockquote><div><br></div><div>This should be: %elif BIT_DEPTH == 10. There is one more "%else if" like this further below.</div><div> </div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left-width:1px;border-left-color:rgb(204,204,204);border-left-style:solid;padding-left:1ex">
+ %define DST_SHIFT 3<br>
+ mova m5, [pd_4]<br>
+%endif<br>
<br>
add r2d, r2d<br>
lea r3, [tab_dst4]<br>
<br>
- mova m5, [pd_1]<br>
-<br>
mova coef0, [r3 + 0 * 16]<br>
mova coef1, [r3 + 1 * 16]<br>
%if ARCH_X86_64<br>
@@ -294,7 +308,7 @@<br>
mova coef3, [r3 + 3 * 16]<br>
%endif<br>
<br>
- movh m0, [r0 + 0 * r2] ;load<br>
+ movh m0, [r0 + 0 * r2] ; load<br>
movh m1, [r0 + 1 * r2]<br>
punpcklqdq m0, m1<br>
<br>
@@ -303,30 +317,30 @@<br>
movh m2, [r0 + r2]<br>
punpcklqdq m1, m2<br>
<br>
- pmaddwd m2, m0, coef0 ;DST1<br>
+ pmaddwd m2, m0, coef0 ; DST1<br>
pmaddwd m3, m1, coef0<br>
phaddd m2, m3<br>
paddd m2, m5<br>
- psrad m2, 1<br>
+ psrad m2, DST_SHIFT<br>
<br>
pmaddwd m3, m0, coef1<br>
pmaddwd m4, m1, coef1<br>
phaddd m3, m4<br>
paddd m3, m5<br>
- psrad m3, 1<br>
+ psrad m3, DST_SHIFT<br>
packssdw m2, m3 ; m2 = T70<br>
<br>
pmaddwd m3, m0, coef2<br>
pmaddwd m4, m1, coef2<br>
phaddd m3, m4<br>
paddd m3, m5<br>
- psrad m3, 1<br>
+ psrad m3, DST_SHIFT<br>
<br>
pmaddwd m0, coef3<br>
pmaddwd m1, coef3<br>
phaddd m0, m1<br>
paddd m0, m5<br>
- psrad m0, 1<br>
+ psrad m0, DST_SHIFT<br>
packssdw m3, m0 ; m3 = T71<br>
<br>
mova m5, [pd_128]<br>
@@ -365,8 +379,16 @@<br>
;void idst4(int32_t *src, int16_t *dst, intptr_t stride)<br>
;-------------------------------------------------------<br>
INIT_XMM sse2<br>
-cglobal idst4, 3, 4, 6<br>
-<br>
+cglobal idst4, 3, 4, 7<br>
+%if BIT_DEPTH == 8<br>
+ %define m6 [pd_2048]<br>
+ %define IDCT4_SHIFT 12<br>
+%elif BIT_DEPTH == 10<br>
+ %define m6 [pd_512]<br>
+ %define IDCT4_SHIFT 10<br>
+%else<br>
+ %error Unsupported BIT_DEPTH!<br>
+%endif<br>
add r2d, r2d<br>
lea r3, [tab_idst4]<br>
<br>
@@ -415,35 +437,33 @@<br>
punpcklwd m2, m0, m1<br>
punpckhwd m0, m1<br>
<br>
- mova m5, [pd_2048]<br>
-<br>
punpcklwd m1, m2, m0<br>
punpckhwd m2, m0<br>
<br>
pmaddwd m0, m1, [r3 + 0 * 16]<br>
pmaddwd m3, m2, [r3 + 1 * 16]<br>
paddd m0, m3<br>
- paddd m0, m5<br>
- psrad m0, 12 ; m1 = S0<br>
+ paddd m0, m6<br>
+ psrad m0, IDCT4_SHIFT ; m0 = S0<br>
<br>
pmaddwd m3, m1, [r3 + 2 * 16]<br>
pmaddwd m4, m2, [r3 + 3 * 16]<br>
paddd m3, m4<br>
- paddd m3, m5<br>
- psrad m3, 12 ; m3 = S8<br>
+ paddd m3, m6<br>
+ psrad m3, IDCT4_SHIFT ; m3 = S8<br>
packssdw m0, m3 ; m0 = m128iA<br>
<br>
pmaddwd m3, m1, [r3 + 4 * 16]<br>
pmaddwd m4, m2, [r3 + 5 * 16]<br>
paddd m3, m4<br>
- paddd m3, m5<br>
- psrad m3, 12 ; m3 = S0<br>
+ paddd m3, m6<br>
+ psrad m3, IDCT4_SHIFT ; m3 = S0<br>
<br>
pmaddwd m1, [r3 + 6 * 16]<br>
pmaddwd m2, [r3 + 7 * 16]<br>
paddd m1, m2<br>
- paddd m1, m5<br>
- psrad m1, 12 ; m1 = S8<br>
+ paddd m1, m6<br>
+ psrad m1, IDCT4_SHIFT ; m1 = S8<br>
packssdw m3, m1 ; m3 = m128iD<br>
<br>
punpcklwd m1, m0, m3<br>
@@ -476,11 +496,20 @@<br>
; Row6[4-7] Row7[4-7]<br>
;------------------------<br>
<br>
+%if BIT_DEPTH == 10<br>
+ %define DCT_SHIFT 4<br>
+ mova m6, [pd_8]<br>
+%else if BIT_DEPTH == 8<br>
+ %define DCT_SHIFT 2<br>
+ mova m6, [pd_2]<br>
+%else<br>
+ %error Unsupported BIT_DEPTH!<br>
+%endif<br>
+<br>
add r2, r2<br>
lea r3, [r2 * 3]<br>
mov r5, rsp<br>
<br>
- mova m6, [pd_2]<br>
%assign x 0<br>
%rep 2<br>
movu m0, [r0]<br>
@@ -518,7 +547,7 @@<br>
pmaddwd m5, m0, [r4 + 0*16]<br>
phaddd m1, m5<br>
paddd m1, m6<br>
- psrad m1, 2<br>
+ psrad m1, DCT_SHIFT<br>
%if x == 1<br>
pshufd m1, m1, 0x1B<br>
%endif<br>
@@ -528,7 +557,7 @@<br>
pmaddwd m5, m0, [r4 + 1*16]<br>
phaddd m1, m5<br>
paddd m1, m6<br>
- psrad m1, 2<br>
+ psrad m1, DCT_SHIFT<br>
%if x == 1<br>
pshufd m1, m1, 0x1B<br>
%endif<br>
@@ -538,7 +567,7 @@<br>
pmaddwd m5, m0, [r4 + 2*16]<br>
phaddd m1, m5<br>
paddd m1, m6<br>
- psrad m1, 2<br>
+ psrad m1, DCT_SHIFT<br>
%if x == 1<br>
pshufd m1, m1, 0x1B<br>
%endif<br>
@@ -548,7 +577,7 @@<br>
pmaddwd m0, [r4 + 3*16]<br>
phaddd m4, m0<br>
paddd m4, m6<br>
- psrad m4, 2<br>
+ psrad m4, DCT_SHIFT<br>
%if x == 1<br>
pshufd m4, m4, 0x1B<br>
%endif<br>
@@ -564,7 +593,7 @@<br>
<br>
pmaddwd m3, m0, [r4 + 0*16]<br>
paddd m3, m6<br>
- psrad m3, 2<br>
+ psrad m3, DCT_SHIFT<br>
%if x == 1<br>
pshufd m3, m3, 0x1B<br>
%endif<br>
@@ -572,7 +601,7 @@<br>
<br>
pmaddwd m0, [r4 + 2*16]<br>
paddd m0, m6<br>
- psrad m0, 2<br>
+ psrad m0, DCT_SHIFT<br>
%if x == 1<br>
pshufd m0, m0, 0x1B<br>
%endif<br>
@@ -580,7 +609,7 @@<br>
<br>
pmaddwd m3, m2, [r4 + 1*16]<br>
paddd m3, m6<br>
- psrad m3, 2<br>
+ psrad m3, DCT_SHIFT<br>
%if x == 1<br>
pshufd m3, m3, 0x1B<br>
%endif<br>
@@ -588,7 +617,7 @@<br>
<br>
pmaddwd m2, [r4 + 3*16]<br>
paddd m2, m6<br>
- psrad m2, 2<br>
+ psrad m2, DCT_SHIFT<br>
%if x == 1<br>
pshufd m2, m2, 0x1B<br>
%endif<br>
diff -r ed310b17ff66 -r 831536babdc0 source/test/mbdstharness.cpp<br>
--- a/source/test/mbdstharness.cpp Fri Feb 14 02:30:52 2014 -0600<br>
+++ b/source/test/mbdstharness.cpp Fri Feb 14 16:10:41 2014 +0530<br>
@@ -173,6 +173,10 @@<br>
<br>
bool MBDstHarness::check_dct_primitive(dct_t ref, dct_t opt, int width)<br>
{<br>
+#if HIGH_BIT_DEPTH<br>
+ int old_depth = X265_DEPTH;<br>
+ X265_DEPTH = 10;<br>
+#endif<br>
int j = 0;<br>
int cmp_size = sizeof(int) * width * width;<br>
<br>
@@ -189,6 +193,11 @@<br>
ref(short_test_buff[index] + j, mintbuf3, width);<br>
opt(short_test_buff[index] + j, mintbuf4, width);<br>
#endif<br>
+<br>
+#if HIGH_BIT_DEPTH<br>
+ X265_DEPTH = old_depth;<br>
+#endif<br>
+<br>
return false;<br>
}<br>
<br>
@@ -199,11 +208,20 @@<br>
#endif<br>
}<br>
<br>
+#if HIGH_BIT_DEPTH<br>
+ X265_DEPTH = old_depth;<br>
+#endif<br>
+<br>
return true;<br>
}<br>
<br>
bool MBDstHarness::check_idct_primitive(idct_t ref, idct_t opt, int width)<br>
{<br>
+#if HIGH_BIT_DEPTH<br>
+ int old_depth = X265_DEPTH;<br>
+ X265_DEPTH = 10;<br>
+#endif<br>
+<br>
int j = 0;<br>
int cmp_size = sizeof(int16_t) * width * width;<br>
<br>
@@ -220,6 +238,11 @@<br>
ref(int_test_buff[index] + j, mbuf2, width);<br>
opt(int_test_buff[index] + j, mbuf3, width);<br>
#endif<br>
+<br>
+#if HIGH_BIT_DEPTH<br>
+ X265_DEPTH = old_depth;<br>
+#endif<br>
+<br>
return false;<br>
}<br>
<br>
@@ -230,6 +253,9 @@<br>
#endif<br>
}<br>
<br>
+#if HIGH_BIT_DEPTH<br>
+ X265_DEPTH = old_depth;<br>
+#endif<br>
return true;<br>
}<br>
<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div><br><br clear="all"><div><br></div>-- <br>Steve Borho
</div></div>