[x265-commits] [x265] main12: pass correct bit-depth to assembler

Fri Jul 3 04:25:29 CEST 2015

details:   http://hg.videolan.org/x265/rev/9b9b3145e7d2
branches:  
changeset: 10754:9b9b3145e7d2
user:      Min Chen <chenm003 at 163.com>
date:      Thu Jul 02 14:25:03 2015 -0700
description:
main12: pass correct bit-depth to assembler
Subject: [x265] asm: main12 dynamic range fixes for DCT

details:   http://hg.videolan.org/x265/rev/984afcf2bb3e
branches:  
changeset: 10755:984afcf2bb3e
user:      Min Chen <chenm003 at 163.com>
date:      Thu Jul 02 17:12:42 2015 -0500
description:
asm: main12 dynamic range fixes for DCT
Subject: [x265] asm: fix Main12 ssim_end_4

details:   http://hg.videolan.org/x265/rev/22d1d86b8cab
branches:  
changeset: 10756:22d1d86b8cab
user:      Min Chen <chenm003 at 163.com>
date:      Thu Jul 02 14:33:32 2015 -0700
description:
asm: fix Main12 ssim_end_4
Subject: [x265] asm: fix Main12 luma_hps_sse2

details:   http://hg.videolan.org/x265/rev/de61709ccf56
branches:  
changeset: 10757:de61709ccf56
user:      Min Chen <chenm003 at 163.com>
date:      Thu Jul 02 16:49:08 2015 -0700
description:
asm: fix Main12 luma_hps_sse2

diffstat:

 source/cmake/CMakeASM_YASMInformation.cmake |    6 +-
 source/common/pixel.cpp                     |    2 +-
 source/common/x86/const-a.asm               |    1 +
 source/common/x86/dct8.asm                  |  118 ++++++++++++++++++++-------
 source/common/x86/ipfilter16.asm            |   16 +++-
 source/common/x86/pixel-util8.asm           |    9 +-
 6 files changed, 115 insertions(+), 37 deletions(-)

diffs (truncated from 383 to 300 lines):

diff -r a81a57cfa495 -r de61709ccf56 source/cmake/CMakeASM_YASMInformation.cmake

--- a/source/cmake/CMakeASM_YASMInformation.cmake	Thu Jul 02 16:11:46 2015 -0500
+++ b/source/cmake/CMakeASM_YASMInformation.cmake	Thu Jul 02 16:49:08 2015 -0700
@@ -31,7 +31,11 @@ else()
 endif()
 
 if(HIGH_BIT_DEPTH)
-    list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=10 -DX265_NS=${X265_NS})
+    if(MAIN12)
+        list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=12 -DX265_NS=${X265_NS})
+    else()
+        list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=10 -DX265_NS=${X265_NS})
+    endif()
 else()
     list(APPEND ASM_FLAGS -DHIGH_BIT_DEPTH=0 -DBIT_DEPTH=8 -DX265_NS=${X265_NS})
 endif()
diff -r a81a57cfa495 -r de61709ccf56 source/common/pixel.cpp
--- a/source/common/pixel.cpp	Thu Jul 02 16:11:46 2015 -0500
+++ b/source/common/pixel.cpp	Thu Jul 02 16:49:08 2015 -0700
@@ -690,7 +690,7 @@ static float ssim_end_1(int s1, int s2, 
 
 #define PIXEL_MAX ((1 << X265_DEPTH) - 1)
 #if HIGH_BIT_DEPTH
-    X265_CHECK(X265_DEPTH == 10, "ssim invalid depth\n");
+    X265_CHECK((X265_DEPTH == 10) || (X265_DEPTH == 12), "ssim invalid depth\n");
 #define type float
     static const float ssim_c1 = (float)(.01 * .01 * PIXEL_MAX * PIXEL_MAX * 64);
     static const float ssim_c2 = (float)(.03 * .03 * PIXEL_MAX * PIXEL_MAX * 64 * 63);
diff -r a81a57cfa495 -r de61709ccf56 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Thu Jul 02 16:11:46 2015 -0500
+++ b/source/common/x86/const-a.asm	Thu Jul 02 16:49:08 2015 -0700
@@ -126,6 +126,7 @@ const pd_2048,              times  4 dd 
 const pd_ffff,              times  4 dd 0xffff
 const pd_32767,             times  4 dd 32767
 const pd_n32768,            times  4 dd 0xffff8000
+const pd_n131072,           times  4 dd 0xfffe0000
 
 const trans8_shuf,          times  1 dd   0,   4,   1,   5,   2,   6,   3,   7
 
diff -r a81a57cfa495 -r de61709ccf56 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Thu Jul 02 16:11:46 2015 -0500
+++ b/source/common/x86/dct8.asm	Thu Jul 02 16:49:08 2015 -0700
@@ -337,7 +337,10 @@ cextern trans8_shuf
 ;------------------------------------------------------
 INIT_XMM sse2
 cglobal dct4, 3, 4, 8
-%if BIT_DEPTH == 10
+%if BIT_DEPTH == 12
+  %define       DCT_SHIFT 5
+  mova          m7, [pd_16]
+%elif BIT_DEPTH == 10
   %define       DCT_SHIFT 3
   mova          m7, [pd_4]
 %elif BIT_DEPTH == 8
@@ -431,7 +434,10 @@ cglobal dct4, 3, 4, 8
 ; - r2:     source stride
 INIT_YMM avx2
 cglobal dct4, 3, 4, 8, src, dst, srcStride
-%if BIT_DEPTH == 10
+%if BIT_DEPTH == 12
+    %define DCT_SHIFT 5
+    vbroadcasti128 m7, [pd_16]
+%elif BIT_DEPTH == 10
     %define DCT_SHIFT 3
     vbroadcasti128 m7, [pd_4]
 %elif BIT_DEPTH == 8
@@ -494,12 +500,15 @@ cglobal dct4, 3, 4, 8, src, dst, srcStri
 ;-------------------------------------------------------
 INIT_XMM sse2
 cglobal idct4, 3, 4, 7
-%if BIT_DEPTH == 8
-  %define IDCT4_OFFSET  [pd_2048]
-  %define IDCT4_SHIFT   12
+%if BIT_DEPTH == 12
+  %define IDCT4_OFFSET  [pd_128]
+  %define IDCT4_SHIFT   8
 %elif BIT_DEPTH == 10
   %define IDCT4_OFFSET  [pd_512]
   %define IDCT4_SHIFT   10
+%elif BIT_DEPTH == 8
+  %define IDCT4_OFFSET  [pd_2048]
+  %define IDCT4_SHIFT   12
 %else
   %error Unsupported BIT_DEPTH!
 %endif
@@ -597,12 +606,17 @@ cglobal dst4, 3, 4, 8
   %define       coef3   [r3 + 3 * 16]
 %endif ; ARCH_X86_64
 
-%if BIT_DEPTH == 8
-  %define       DST_SHIFT 1
-  mova          m5, [pd_1]
+%if BIT_DEPTH == 12
+    %define       DST_SHIFT 5
+    mova          m5, [pd_16]
 %elif BIT_DEPTH == 10
-  %define       DST_SHIFT 3
-  mova          m5, [pd_4]
+    %define       DST_SHIFT 3
+    mova          m5, [pd_4]
+%elif BIT_DEPTH == 8
+    %define       DST_SHIFT 1
+    mova          m5, [pd_1]
+%else
+    %error Unsupported BIT_DEPTH!
 %endif
     add         r2d, r2d
     lea         r3, [tab_dst4]
@@ -869,12 +883,15 @@ cglobal dst4, 3, 4, 6
 ;-------------------------------------------------------
 INIT_XMM sse2
 cglobal idst4, 3, 4, 7
-%if BIT_DEPTH == 8
-  mova m6, [pd_2048]
-  %define IDCT4_SHIFT 12
+%if BIT_DEPTH == 12
+    mova m6,            [pd_128]
+  %define IDCT4_SHIFT   8
 %elif BIT_DEPTH == 10
-  mova m6, [pd_512]
-  %define IDCT4_SHIFT 10
+    mova m6,            [pd_512]
+  %define IDCT4_SHIFT   10
+%elif BIT_DEPTH == 8
+    mova m6,            [pd_2048]
+  %define IDCT4_SHIFT   12
 %else
   %error Unsupported BIT_DEPTH!
 %endif
@@ -961,12 +978,15 @@ cglobal idst4, 3, 4, 7
 ;-----------------------------------------------------------------
 INIT_YMM avx2
 cglobal idst4, 3, 4, 6
-%if BIT_DEPTH == 8
-  vpbroadcastd  m4, [pd_2048]
-  %define       IDCT4_SHIFT 12
+%if BIT_DEPTH == 12
+    vpbroadcastd    m4,     [pd_256]
+    %define IDCT4_SHIFT     8
 %elif BIT_DEPTH == 10
-  vpbroadcastd  m4, [pd_512]
-  %define       IDCT4_SHIFT 10
+    vpbroadcastd    m4,     [pd_512]
+    %define IDCT4_SHIFT     10
+%elif BIT_DEPTH == 8
+    vpbroadcastd    m4,     [pd_2048]
+    %define IDCT4_SHIFT     12
 %else
   %error Unsupported BIT_DEPTH!
 %endif
@@ -1046,7 +1066,10 @@ cglobal dct8, 3,6,8,0-16*mmsize
     ; ...
     ; Row6[4-7] Row7[4-7]
     ;------------------------
-%if BIT_DEPTH == 10
+%if BIT_DEPTH == 12
+  %define       DCT_SHIFT1 6
+  %define       DCT_ADD1 [pd_32]
+%elif BIT_DEPTH == 10
   %define       DCT_SHIFT1 4
   %define       DCT_ADD1 [pd_8]
 %elif BIT_DEPTH == 8
@@ -1409,7 +1432,10 @@ cglobal dct8, 3,6,7,0-16*mmsize
     ; ...
     ; Row6[4-7] Row7[4-7]
     ;------------------------
-%if BIT_DEPTH == 10
+%if BIT_DEPTH == 12
+  %define       DCT_SHIFT 6
+  mova          m6, [pd_16]
+%elif BIT_DEPTH == 10
   %define       DCT_SHIFT 4
   mova          m6, [pd_8]
 %elif BIT_DEPTH == 8
@@ -1623,7 +1649,10 @@ cglobal dct8, 3,6,7,0-16*mmsize
 ;-------------------------------------------------------
 %if ARCH_X86_64
 INIT_XMM sse2
-%if BIT_DEPTH == 10
+%if BIT_DEPTH == 12
+    %define     IDCT_SHIFT 8
+    %define     IDCT_ADD pd_128
+%elif BIT_DEPTH == 10
     %define     IDCT_SHIFT 10
     %define     IDCT_ADD pd_512
 %elif BIT_DEPTH == 8
@@ -2090,7 +2119,9 @@ cglobal patial_butterfly_inverse_interna
     ret
 
 %macro PARTIAL_BUTTERFLY_PROCESS_ROW 1
-%if BIT_DEPTH == 10
+%if BIT_DEPTH == 12
+    %define     IDCT_SHIFT 8
+%elif BIT_DEPTH == 10
     %define     IDCT_SHIFT 10
 %elif BIT_DEPTH == 8
     %define     IDCT_SHIFT 12
@@ -2159,7 +2190,9 @@ cglobal idct8, 3,7,8 ;,0-16*mmsize
 
     call        patial_butterfly_inverse_internal_pass1
 
-%if BIT_DEPTH == 10
+%if BIT_DEPTH == 12
+    mova        m6, [pd_256]
+%elif BIT_DEPTH == 10
     mova        m6, [pd_512]
 %elif BIT_DEPTH == 8
     mova        m6, [pd_2048]
@@ -2290,7 +2323,10 @@ cglobal denoise_dct, 4, 4, 6
 
 INIT_YMM avx2
 cglobal dct8, 3, 7, 11, 0-8*16
-%if BIT_DEPTH == 10
+%if BIT_DEPTH == 12
+    %define         DCT_SHIFT          6
+    vbroadcasti128  m5,                [pd_16]
+%elif BIT_DEPTH == 10
     %define         DCT_SHIFT          4
     vbroadcasti128  m5,                [pd_8]
 %elif BIT_DEPTH == 8
@@ -2456,7 +2492,10 @@ cglobal dct8, 3, 7, 11, 0-8*16
 %endmacro
 INIT_YMM avx2
 cglobal dct16, 3, 9, 16, 0-16*mmsize
-%if BIT_DEPTH == 10
+%if BIT_DEPTH == 12
+    %define         DCT_SHIFT          7
+    vbroadcasti128  m9,                [pd_64]
+%elif BIT_DEPTH == 10
     %define         DCT_SHIFT          5
     vbroadcasti128  m9,                [pd_16]
 %elif BIT_DEPTH == 8
@@ -2679,7 +2718,10 @@ cglobal dct16, 3, 9, 16, 0-16*mmsize
 
 INIT_YMM avx2
 cglobal dct32, 3, 9, 16, 0-64*mmsize
-%if BIT_DEPTH == 10
+%if BIT_DEPTH == 12
+    %define         DCT_SHIFT          8
+    vpbroadcastq    m9,                [pd_128]
+%elif BIT_DEPTH == 10
     %define         DCT_SHIFT          6
     vpbroadcastq    m9,                [pd_32]
 %elif BIT_DEPTH == 8
@@ -2973,7 +3015,10 @@ cglobal dct32, 3, 9, 16, 0-64*mmsize
 
 INIT_YMM avx2
 cglobal idct8, 3, 7, 13, 0-8*16
-%if BIT_DEPTH == 10
+%if BIT_DEPTH == 12
+    %define         IDCT_SHIFT2        8
+    vpbroadcastd    m12,                [pd_256]
+%elif BIT_DEPTH == 10
     %define         IDCT_SHIFT2        10
     vpbroadcastd    m12,                [pd_512]
 %elif BIT_DEPTH == 8
@@ -3131,7 +3176,10 @@ cglobal idct8, 3, 7, 13, 0-8*16
 ;-------------------------------------------------------
 INIT_YMM avx2
 cglobal idct16, 3, 7, 16, 0-16*mmsize
-%if BIT_DEPTH == 10
+%if BIT_DEPTH == 12
+    %define         IDCT_SHIFT2        8
+    vpbroadcastd    m15,                [pd_256]
+%elif BIT_DEPTH == 10
     %define         IDCT_SHIFT2        10
     vpbroadcastd    m15,                [pd_512]
 %elif BIT_DEPTH == 8
@@ -3550,7 +3598,10 @@ cglobal idct32, 3, 6, 16, 0-32*64
     dec             r5d
     jnz             .pass1
 
-%if BIT_DEPTH == 10
+%if BIT_DEPTH == 12
+    %define         IDCT_SHIFT2        8
+    vpbroadcastd    m15,                [pd_256]
+%elif BIT_DEPTH == 10
     %define         IDCT_SHIFT2        10
     vpbroadcastd    m15,                [pd_512]
 %elif BIT_DEPTH == 8
@@ -3711,7 +3762,10 @@ INIT_YMM avx2
 cglobal idct4, 3, 4, 6
 
 %define             IDCT_SHIFT1         7
-%if BIT_DEPTH == 10
+%if BIT_DEPTH == 12
+    %define         IDCT_SHIFT2        8
+    vpbroadcastd    m5,                [pd_256]
+%elif BIT_DEPTH == 10
     %define         IDCT_SHIFT2        10
     vpbroadcastd    m5,                [pd_512]
 %elif BIT_DEPTH == 8
diff -r a81a57cfa495 -r de61709ccf56 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm	Thu Jul 02 16:11:46 2015 -0500
+++ b/source/common/x86/ipfilter16.asm	Thu Jul 02 16:49:08 2015 -0700
@@ -3,6 +3,7 @@
 ;*
 ;* Authors: Nabajit Deka <nabajit at multicorewareinc.com>
 ;*          Murugan Vairavel <murugan at multicorewareinc.com>
+;*          Min Chen <chenm003 at 163.com>