[x265] [PATCH] asm: added 16bpp support for dct[4x4, 8x8], idct4x4, dst4x4 and idst4x4 primitives

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Fri Feb 14 11:41:34 CET 2014


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1392374441 -19800
#      Fri Feb 14 16:10:41 2014 +0530
# Node ID 831536babdc08f1553a10754bf2a4f4af6aa1695
# Parent  ed310b17ff6681f191c85341cf6efe7a50770143
asm: added 16bpp support for dct[4x4, 8x8], idct4x4, dst4x4 and idst4x4 primitives

diff -r ed310b17ff66 -r 831536babdc0 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Feb 14 02:30:52 2014 -0600
+++ b/source/common/x86/asm-primitives.cpp	Fri Feb 14 16:10:41 2014 +0530
@@ -726,6 +726,10 @@
         p.calcrecon[BLOCK_8x8] = x265_calcRecons8_sse2;
         p.calcrecon[BLOCK_16x16] = x265_calcRecons16_sse2;
         p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse2;
+
+        p.dct[DCT_4x4] = x265_dct4_sse2;
+        p.idct[IDCT_4x4] = x265_idct4_sse2;
+        p.idct[IDST_4x4] = x265_idst4_sse2;
     }
     if (cpuMask & X265_CPU_SSSE3)
     {
@@ -740,9 +744,12 @@
 
         SETUP_INTRA_ANG32(2, 2, ssse3);
         SETUP_INTRA_ANG32(34, 2, ssse3);
+
+        p.dct[DST_4x4] = x265_dst4_ssse3;
     }
     if (cpuMask & X265_CPU_SSE4)
     {
+        p.dct[DCT_8x8] = x265_dct8_sse4;
         p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
 
         p.intra_pred[BLOCK_4x4][0] = x265_intra_pred_planar4_sse4;
diff -r ed310b17ff66 -r 831536babdc0 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Fri Feb 14 02:30:52 2014 -0600
+++ b/source/common/x86/const-a.asm	Fri Feb 14 16:10:41 2014 +0530
@@ -72,6 +72,8 @@
 
 const pd_1,        times 4 dd 1
 const pd_2,        times 4 dd 2
+const pd_4,        times 4 dd 4
+const pd_8,        times 4 dd 8
 const pd_16,       times 4 dd 16
 const pd_32,       times 4 dd 32
 const pd_64,       times 4 dd 64
diff -r ed310b17ff66 -r 831536babdc0 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Fri Feb 14 02:30:52 2014 -0600
+++ b/source/common/x86/dct8.asm	Fri Feb 14 16:10:41 2014 +0530
@@ -67,6 +67,10 @@
 
 cextern pd_1
 cextern pd_2
+cextern pd_4
+cextern pd_8
+cextern pd_16
+cextern pd_32
 cextern pd_64
 cextern pd_128
 cextern pd_256
@@ -79,6 +83,15 @@
 ;------------------------------------------------------
 INIT_XMM sse2
 cglobal dct4, 3, 4, 8
+%if BIT_DEPTH == 10
+  %define       DCT_SHIFT 3
+  mova          m7, [pd_4]      ; rounding offset = 1 << (DCT_SHIFT - 1), added before each psrad
+%elif BIT_DEPTH == 8
+  %define       DCT_SHIFT 1
+  mova          m7, [pd_1]      ; rounding offset = 1 << (DCT_SHIFT - 1), added before each psrad
+%else ; reached only for depths other than 8/10 ("%else if" would act as a bare %else)
+  %error Unsupported BIT_DEPTH!
+%endif
 
     add         r2d, r2d
     lea         r3, [tab_dct4]
@@ -87,8 +100,6 @@
     mova        m5, [r3 + 1 * 16]
     mova        m6, [r3 + 2 * 16]
 
-    mova        m7, [pd_1]
-
     movh        m0, [r0 + 0 * r2]
     movh        m1, [r0 + 1 * r2]
     punpcklqdq  m0, m1
@@ -110,11 +121,11 @@
 
     pmaddwd     m0, m1, m4
     paddd       m0, m7
-    psrad       m0, 1
+    psrad       m0, DCT_SHIFT
 
     pmaddwd     m3, m2, m5
     paddd       m3, m7
-    psrad       m3, 1
+    psrad       m3, DCT_SHIFT
 
     packssdw    m0, m3
     pshufd      m0, m0, 0xD8
@@ -122,11 +133,11 @@
 
     pmaddwd     m1, m6
     paddd       m1, m7
-    psrad       m1, 1
+    psrad       m1, DCT_SHIFT
 
     pmaddwd     m2, [r3 + 3 * 16]
     paddd       m2, m7
-    psrad       m2, 1
+    psrad       m2, DCT_SHIFT
 
     packssdw    m1, m2
     pshufd      m1, m1, 0xD8
@@ -179,7 +190,7 @@
   %define IDCT4_OFFSET  [pd_512]
   %define IDCT4_SHIFT   10
 %else
-  %error Unsupport BIT_DEPTH!
+  %error Unsupported BIT_DEPTH!
 %endif
     add         r2d, r2d
     lea         r3, [tab_dct4]
@@ -268,25 +279,28 @@
 INIT_XMM ssse3
 %if ARCH_X86_64
 cglobal dst4, 3, 4, 8+2
+  %define       coef2   m8
+  %define       coef3   m9
 %else ; ARCH_X86_64 = 0
 cglobal dst4, 3, 4, 8
+  %define       coef2   [r3 + 2 * 16]
+  %define       coef3   [r3 + 3 * 16]
 %endif ; ARCH_X86_64
 
-    %define coef0   m6
-    %define coef1   m7
-%if ARCH_X86_64
-    %define coef2   m8
-    %define coef3   m9
-%else
-    %define coef2   [r3 + 2 * 16]
-    %define coef3   [r3 + 3 * 16]
-%endif
+%define         coef0   m6
+%define         coef1   m7
+
+%if BIT_DEPTH == 8
+  %define       DST_SHIFT 1
+  mova          m5, [pd_1]      ; rounding offset = 1 << (DST_SHIFT - 1), added before each psrad
+%elif BIT_DEPTH == 10
+  %define       DST_SHIFT 3
+  mova          m5, [pd_4]      ; rounding offset = 1 << (DST_SHIFT - 1), added before each psrad
+%endif ; NOTE(review): no %else/%error guard for other bit depths here - consider adding one
 
     add         r2d, r2d
     lea         r3, [tab_dst4]
 
-    mova        m5, [pd_1]
-
     mova        coef0, [r3 + 0 * 16]
     mova        coef1, [r3 + 1 * 16]
 %if ARCH_X86_64
@@ -294,7 +308,7 @@
     mova        coef3, [r3 + 3 * 16]
 %endif
 
-    movh        m0, [r0 + 0 * r2]            ;load
+    movh        m0, [r0 + 0 * r2]            ; load
     movh        m1, [r0 + 1 * r2]
     punpcklqdq  m0, m1
 
@@ -303,30 +317,30 @@
     movh        m2, [r0 + r2]
     punpcklqdq  m1, m2
 
-    pmaddwd     m2, m0, coef0                ;DST1
+    pmaddwd     m2, m0, coef0                ; DST1
     pmaddwd     m3, m1, coef0
     phaddd      m2, m3
     paddd       m2, m5
-    psrad       m2, 1
+    psrad       m2, DST_SHIFT
 
     pmaddwd     m3, m0, coef1
     pmaddwd     m4, m1, coef1
     phaddd      m3, m4
     paddd       m3, m5
-    psrad       m3, 1
+    psrad       m3, DST_SHIFT
     packssdw    m2, m3                       ; m2 = T70
 
     pmaddwd     m3, m0, coef2
     pmaddwd     m4, m1, coef2
     phaddd      m3, m4
     paddd       m3, m5
-    psrad       m3, 1
+    psrad       m3, DST_SHIFT
 
     pmaddwd     m0, coef3
     pmaddwd     m1, coef3
     phaddd      m0, m1
     paddd       m0, m5
-    psrad       m0, 1
+    psrad       m0, DST_SHIFT
     packssdw    m3, m0                       ; m3 = T71
 
     mova        m5, [pd_128]
@@ -365,8 +379,16 @@
 ;void idst4(int32_t *src, int16_t *dst, intptr_t stride)
 ;-------------------------------------------------------
 INIT_XMM sse2
-cglobal idst4, 3, 4, 6
-
+cglobal idst4, 3, 4, 7
+%if BIT_DEPTH == 8
+  %define m6  [pd_2048]
+  %define IDCT4_SHIFT 12
+%elif BIT_DEPTH == 10
+  %define m6  [pd_512]
+  %define IDCT4_SHIFT 10
+%else ; neither 8- nor 10-bit: stop assembling with an error
+  %error Unsupported BIT_DEPTH!
+%endif ; NOTE(review): m6 now expands to a memory operand holding 1 << (IDCT4_SHIFT - 1), not a register - confirm that shadowing the m6 name is intended
     add         r2d, r2d
     lea         r3, [tab_idst4]
 
@@ -415,35 +437,33 @@
     punpcklwd   m2, m0, m1
     punpckhwd   m0, m1
 
-    mova        m5, [pd_2048]
-
     punpcklwd   m1, m2, m0
     punpckhwd   m2, m0
 
     pmaddwd     m0, m1, [r3 + 0 * 16]
     pmaddwd     m3, m2, [r3 + 1 * 16]
     paddd       m0, m3
-    paddd       m0, m5
-    psrad       m0, 12                      ; m1 = S0
+    paddd       m0, m6
+    psrad       m0, IDCT4_SHIFT             ; m0 = S0
 
     pmaddwd     m3, m1, [r3 + 2 * 16]
     pmaddwd     m4, m2, [r3 + 3 * 16]
     paddd       m3, m4
-    paddd       m3, m5
-    psrad       m3, 12                      ; m3 = S8
+    paddd       m3, m6
+    psrad       m3, IDCT4_SHIFT             ; m3 = S8
     packssdw    m0, m3                      ; m0 = m128iA
 
     pmaddwd     m3, m1, [r3 + 4 * 16]
     pmaddwd     m4, m2, [r3 + 5 * 16]
     paddd       m3, m4
-    paddd       m3, m5
-    psrad       m3, 12                      ; m3 = S0
+    paddd       m3, m6
+    psrad       m3, IDCT4_SHIFT             ; m3 = S0
 
     pmaddwd     m1, [r3 + 6 * 16]
     pmaddwd     m2, [r3 + 7 * 16]
     paddd       m1, m2
-    paddd       m1, m5
-    psrad       m1, 12                      ; m1 = S8
+    paddd       m1, m6
+    psrad       m1, IDCT4_SHIFT             ; m1 = S8
     packssdw    m3, m1                      ; m3 = m128iD
 
     punpcklwd   m1, m0, m3
@@ -476,11 +496,20 @@
     ; Row6[4-7] Row7[4-7]
     ;------------------------
 
+%if BIT_DEPTH == 10
+  %define       DCT_SHIFT 4
+  mova          m6, [pd_8]      ; rounding offset = 1 << (DCT_SHIFT - 1), added before each psrad
+%elif BIT_DEPTH == 8
+  %define       DCT_SHIFT 2
+  mova          m6, [pd_2]      ; rounding offset = 1 << (DCT_SHIFT - 1), added before each psrad
+%else ; reached only for depths other than 8/10 ("%else if" would act as a bare %else)
+  %error Unsupported BIT_DEPTH!
+%endif
+
     add         r2, r2
     lea         r3, [r2 * 3]
     mov         r5, rsp
 
-    mova        m6, [pd_2]
 %assign x 0
 %rep 2
     movu        m0, [r0]
@@ -518,7 +547,7 @@
     pmaddwd     m5, m0, [r4 + 0*16]
     phaddd      m1, m5
     paddd       m1, m6
-    psrad       m1, 2
+    psrad       m1, DCT_SHIFT
   %if x == 1
     pshufd      m1, m1, 0x1B
   %endif
@@ -528,7 +557,7 @@
     pmaddwd     m5, m0, [r4 + 1*16]
     phaddd      m1, m5
     paddd       m1, m6
-    psrad       m1, 2
+    psrad       m1, DCT_SHIFT
   %if x == 1
     pshufd      m1, m1, 0x1B
   %endif
@@ -538,7 +567,7 @@
     pmaddwd     m5, m0, [r4 + 2*16]
     phaddd      m1, m5
     paddd       m1, m6
-    psrad       m1, 2
+    psrad       m1, DCT_SHIFT
   %if x == 1
     pshufd      m1, m1, 0x1B
   %endif
@@ -548,7 +577,7 @@
     pmaddwd     m0, [r4 + 3*16]
     phaddd      m4, m0
     paddd       m4, m6
-    psrad       m4, 2
+    psrad       m4, DCT_SHIFT
   %if x == 1
     pshufd      m4, m4, 0x1B
   %endif
@@ -564,7 +593,7 @@
 
     pmaddwd     m3, m0, [r4 + 0*16]
     paddd       m3, m6
-    psrad       m3, 2
+    psrad       m3, DCT_SHIFT
   %if x == 1
     pshufd      m3, m3, 0x1B
   %endif
@@ -572,7 +601,7 @@
 
     pmaddwd     m0, [r4 + 2*16]
     paddd       m0, m6
-    psrad       m0, 2
+    psrad       m0, DCT_SHIFT
   %if x == 1
     pshufd      m0, m0, 0x1B
   %endif
@@ -580,7 +609,7 @@
 
     pmaddwd     m3, m2, [r4 + 1*16]
     paddd       m3, m6
-    psrad       m3, 2
+    psrad       m3, DCT_SHIFT
   %if x == 1
     pshufd      m3, m3, 0x1B
   %endif
@@ -588,7 +617,7 @@
 
     pmaddwd     m2, [r4 + 3*16]
     paddd       m2, m6
-    psrad       m2, 2
+    psrad       m2, DCT_SHIFT
   %if x == 1
     pshufd      m2, m2, 0x1B
   %endif
diff -r ed310b17ff66 -r 831536babdc0 source/test/mbdstharness.cpp
--- a/source/test/mbdstharness.cpp	Fri Feb 14 02:30:52 2014 -0600
+++ b/source/test/mbdstharness.cpp	Fri Feb 14 16:10:41 2014 +0530
@@ -173,6 +173,10 @@
 
 bool MBDstHarness::check_dct_primitive(dct_t ref, dct_t opt, int width)
 {
+#if HIGH_BIT_DEPTH
+    int old_depth = X265_DEPTH;
+    X265_DEPTH = 10;
+#endif
     int j = 0;
     int cmp_size = sizeof(int) * width * width;
 
@@ -189,6 +193,11 @@
             ref(short_test_buff[index] + j, mintbuf3, width);
             opt(short_test_buff[index] + j, mintbuf4, width);
 #endif
+
+#if HIGH_BIT_DEPTH
+    X265_DEPTH = old_depth;
+#endif
+
             return false;
         }
 
@@ -199,11 +208,20 @@
 #endif
     }
 
+#if HIGH_BIT_DEPTH
+    X265_DEPTH = old_depth;
+#endif
+
     return true;
 }
 
 bool MBDstHarness::check_idct_primitive(idct_t ref, idct_t opt, int width)
 {
+#if HIGH_BIT_DEPTH
+    int old_depth = X265_DEPTH;
+    X265_DEPTH = 10;
+#endif
+
     int j = 0;
     int cmp_size = sizeof(int16_t) * width * width;
 
@@ -220,6 +238,11 @@
             ref(int_test_buff[index] + j, mbuf2, width);
             opt(int_test_buff[index] + j, mbuf3, width);
 #endif
+
+#if HIGH_BIT_DEPTH
+    X265_DEPTH = old_depth;
+#endif
+
             return false;
         }
 
@@ -230,6 +253,9 @@
 #endif
     }
 
+#if HIGH_BIT_DEPTH
+    X265_DEPTH = old_depth;
+#endif
     return true;
 }
 


More information about the x265-devel mailing list