[x265-commits] [x265] asm: fix Main12 assembly up to SSSE3
Min Chen
chenm003 at 163.com
Sat Jul 11 19:52:43 CEST 2015
details: http://hg.videolan.org/x265/rev/b89e593b9a07
branches:
changeset: 10793:b89e593b9a07
user: Min Chen <chenm003 at 163.com>
date: Fri Jul 10 15:57:44 2015 -0700
description:
asm: fix Main12 assembly up to SSSE3
Subject: [x265] Main12: comment on constant SAO_BIT_INC
details: http://hg.videolan.org/x265/rev/c296e281c75e
branches:
changeset: 10794:c296e281c75e
user: Min Chen <chenm003 at 163.com>
date: Fri Jul 10 16:25:51 2015 -0700
description:
Main12: comment on constant SAO_BIT_INC
Subject: [x265] Main12: fix up sample fault on 10b -> 12b input convert
details: http://hg.videolan.org/x265/rev/cebf086ceac0
branches:
changeset: 10795:cebf086ceac0
user: Min Chen <chenm003 at 163.com>
date: Fri Jul 10 19:20:56 2015 -0700
description:
Main12: fix up sample fault on 10b -> 12b input convert
Subject: [x265] fix pixelcmp testbench on 12bpp
details: http://hg.videolan.org/x265/rev/88f9d8752054
branches:
changeset: 10796:88f9d8752054
user: Min Chen <chenm003 at 163.com>
date: Fri Jul 10 19:21:02 2015 -0700
description:
fix pixelcmp testbench on 12bpp
Subject: [x265] test: support test-bench builds with EXPORT_C_API=OFF (special namespaces)
details: http://hg.videolan.org/x265/rev/133ee202f6db
branches:
changeset: 10797:133ee202f6db
user: Steve Borho <steve at borho.org>
date: Sat Jul 11 12:29:09 2015 -0500
description:
test: support test-bench builds with EXPORT_C_API=OFF (special namespaces)
Going forward a lot of developers will probably use a multilib setup, so this
makes it possible for them to enable the test-bench for all bit-depth builds
Subject: [x265] test: correctly report 12bit builds
details: http://hg.videolan.org/x265/rev/79f4906e9cb8
branches:
changeset: 10798:79f4906e9cb8
user: Steve Borho <steve at borho.org>
date: Sat Jul 11 12:31:21 2015 -0500
description:
test: correctly report 12bit builds
diffstat:
source/common/picyuv.cpp | 105 +++--
source/common/pixel.cpp | 13 +
source/common/primitives.h | 1 +
source/common/vec/dct-sse3.cpp | 9 +-
source/common/vec/dct-ssse3.cpp | 130 +++----
source/common/x86/const-a.asm | 1 +
source/common/x86/dct8.asm | 351 ++++++------------
source/common/x86/intrapred16.asm | 30 +-
source/common/x86/ipfilter16.asm | 682 +++++++++++++++++++------------------
source/encoder/sao.h | 2 +-
source/test/mbdstharness.cpp | 2 +-
source/test/testbench.cpp | 5 +-
source/test/testharness.h | 29 +-
13 files changed, 626 insertions(+), 734 deletions(-)
diffs (truncated from 3024 to 300 lines):
diff -r d62f941f3f88 -r 79f4906e9cb8 source/common/picyuv.cpp
--- a/source/common/picyuv.cpp Fri Jul 10 11:52:31 2015 -0500
+++ b/source/common/picyuv.cpp Sat Jul 11 12:31:21 2015 -0500
@@ -148,52 +148,62 @@ void PicYuv::copyFromPicture(const x265_
padx++;
pady++;
- if (pic.bitDepth < X265_DEPTH)
+ X265_CHECK(pic.bitDepth >= 8, "pic.bitDepth check failure");
+
+ if (pic.bitDepth == 8)
{
- pixel *yPixel = m_picOrg[0];
- pixel *uPixel = m_picOrg[1];
- pixel *vPixel = m_picOrg[2];
+#if (X265_DEPTH > 8)
+ {
+ pixel *yPixel = m_picOrg[0];
+ pixel *uPixel = m_picOrg[1];
+ pixel *vPixel = m_picOrg[2];
- uint8_t *yChar = (uint8_t*)pic.planes[0];
- uint8_t *uChar = (uint8_t*)pic.planes[1];
- uint8_t *vChar = (uint8_t*)pic.planes[2];
- int shift = X265_MAX(0, X265_DEPTH - pic.bitDepth);
+ uint8_t *yChar = (uint8_t*)pic.planes[0];
+ uint8_t *uChar = (uint8_t*)pic.planes[1];
+ uint8_t *vChar = (uint8_t*)pic.planes[2];
+ int shift = (X265_DEPTH - 8);
- primitives.planecopy_cp(yChar, pic.stride[0] / sizeof(*yChar), yPixel, m_stride, width, height, shift);
- primitives.planecopy_cp(uChar, pic.stride[1] / sizeof(*uChar), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
- primitives.planecopy_cp(vChar, pic.stride[2] / sizeof(*vChar), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
- }
- else if (pic.bitDepth == 8)
- {
- pixel *yPixel = m_picOrg[0];
- pixel *uPixel = m_picOrg[1];
- pixel *vPixel = m_picOrg[2];
+ primitives.planecopy_cp(yChar, pic.stride[0] / sizeof(*yChar), yPixel, m_stride, width, height, shift);
+ primitives.planecopy_cp(uChar, pic.stride[1] / sizeof(*uChar), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
+ primitives.planecopy_cp(vChar, pic.stride[2] / sizeof(*vChar), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift);
+ }
+#else /* Case for (X265_DEPTH == 8) */
+ // TODO: Do we need this path? It may be merged into the one above in the future
+ {
+ pixel *yPixel = m_picOrg[0];
+ pixel *uPixel = m_picOrg[1];
+ pixel *vPixel = m_picOrg[2];
- uint8_t *yChar = (uint8_t*)pic.planes[0];
- uint8_t *uChar = (uint8_t*)pic.planes[1];
- uint8_t *vChar = (uint8_t*)pic.planes[2];
+ uint8_t *yChar = (uint8_t*)pic.planes[0];
+ uint8_t *uChar = (uint8_t*)pic.planes[1];
+ uint8_t *vChar = (uint8_t*)pic.planes[2];
- for (int r = 0; r < height; r++)
- {
- memcpy(yPixel, yChar, width * sizeof(pixel));
+ for (int r = 0; r < height; r++)
+ {
+ memcpy(yPixel, yChar, width * sizeof(pixel));
- yPixel += m_stride;
- yChar += pic.stride[0] / sizeof(*yChar);
+ yPixel += m_stride;
+ yChar += pic.stride[0] / sizeof(*yChar);
+ }
+
+ for (int r = 0; r < height >> m_vChromaShift; r++)
+ {
+ memcpy(uPixel, uChar, (width >> m_hChromaShift) * sizeof(pixel));
+ memcpy(vPixel, vChar, (width >> m_hChromaShift) * sizeof(pixel));
+
+ uPixel += m_strideC;
+ vPixel += m_strideC;
+ uChar += pic.stride[1] / sizeof(*uChar);
+ vChar += pic.stride[2] / sizeof(*vChar);
+ }
}
-
- for (int r = 0; r < height >> m_vChromaShift; r++)
- {
- memcpy(uPixel, uChar, (width >> m_hChromaShift) * sizeof(pixel));
- memcpy(vPixel, vChar, (width >> m_hChromaShift) * sizeof(pixel));
-
- uPixel += m_strideC;
- vPixel += m_strideC;
- uChar += pic.stride[1] / sizeof(*uChar);
- vChar += pic.stride[2] / sizeof(*vChar);
- }
+#endif /* (X265_DEPTH > 8) */
}
else /* pic.bitDepth > 8 */
{
+ /* defensive programming, mask off bits that are supposed to be zero */
+ uint16_t mask = (1 << X265_DEPTH) - 1;
+ int shift = abs(pic.bitDepth - X265_DEPTH);
pixel *yPixel = m_picOrg[0];
pixel *uPixel = m_picOrg[1];
pixel *vPixel = m_picOrg[2];
@@ -202,15 +212,20 @@ void PicYuv::copyFromPicture(const x265_
uint16_t *uShort = (uint16_t*)pic.planes[1];
uint16_t *vShort = (uint16_t*)pic.planes[2];
- /* defensive programming, mask off bits that are supposed to be zero */
- uint16_t mask = (1 << X265_DEPTH) - 1;
- int shift = X265_MAX(0, pic.bitDepth - X265_DEPTH);
-
- /* shift and mask pixels to final size */
-
- primitives.planecopy_sp(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
- primitives.planecopy_sp(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
- primitives.planecopy_sp(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
+ if (pic.bitDepth > X265_DEPTH)
+ {
+ /* shift right and mask pixels to final size */
+ primitives.planecopy_sp(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
+ primitives.planecopy_sp(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
+ primitives.planecopy_sp(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
+ }
+ else /* Case for (pic.bitDepth <= X265_DEPTH) */
+ {
+ /* shift left and mask pixels to final size */
+ primitives.planecopy_sp_shl(yShort, pic.stride[0] / sizeof(*yShort), yPixel, m_stride, width, height, shift, mask);
+ primitives.planecopy_sp_shl(uShort, pic.stride[1] / sizeof(*uShort), uPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
+ primitives.planecopy_sp_shl(vShort, pic.stride[2] / sizeof(*vShort), vPixel, m_strideC, width >> m_hChromaShift, height >> m_vChromaShift, shift, mask);
+ }
}
/* extend the right edge if width was not multiple of the minimum CU size */
diff -r d62f941f3f88 -r 79f4906e9cb8 source/common/pixel.cpp
--- a/source/common/pixel.cpp Fri Jul 10 11:52:31 2015 -0500
+++ b/source/common/pixel.cpp Sat Jul 11 12:31:21 2015 -0500
@@ -945,6 +945,18 @@ static void planecopy_sp_c(const uint16_
}
}
+static void planecopy_sp_shl_c(const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask)
+{
+ for (int r = 0; r < height; r++)
+ {
+ for (int c = 0; c < width; c++)
+ dst[c] = (pixel)((src[c] << shift) & mask);
+
+ dst += dstStride;
+ src += srcStride;
+ }
+}
+
/* Estimate the total amount of influence on future quality that could be had if we
* were to improve the reference samples used to inter predict any given CU. */
static void estimateCUPropagateCost(int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts,
@@ -1245,6 +1257,7 @@ void setupPixelPrimitives_c(EncoderPrimi
p.planecopy_cp = planecopy_cp_c;
p.planecopy_sp = planecopy_sp_c;
+ p.planecopy_sp_shl = planecopy_sp_shl_c;
p.propagateCost = estimateCUPropagateCost;
}
}
diff -r d62f941f3f88 -r 79f4906e9cb8 source/common/primitives.h
--- a/source/common/primitives.h Fri Jul 10 11:52:31 2015 -0500
+++ b/source/common/primitives.h Sat Jul 11 12:31:21 2015 -0500
@@ -312,6 +312,7 @@ struct EncoderPrimitives
extendCURowBorder_t extendRowBorder;
planecopy_cp_t planecopy_cp;
planecopy_sp_t planecopy_sp;
+ planecopy_sp_t planecopy_sp_shl;
weightp_sp_t weight_sp;
weightp_pp_t weight_pp;
diff -r d62f941f3f88 -r 79f4906e9cb8 source/common/vec/dct-sse3.cpp
--- a/source/common/vec/dct-sse3.cpp Fri Jul 10 11:52:31 2015 -0500
+++ b/source/common/vec/dct-sse3.cpp Sat Jul 11 12:31:21 2015 -0500
@@ -38,13 +38,8 @@ using namespace X265_NS;
#define SHIFT1 7
#define ADD1 64
-#if HIGH_BIT_DEPTH
-#define SHIFT2 10
-#define ADD2 512
-#else
-#define SHIFT2 12
-#define ADD2 2048
-#endif
+#define SHIFT2 (12 - (X265_DEPTH - 8))
+#define ADD2 (1 << ((SHIFT2) - 1))
ALIGN_VAR_32(static const int16_t, tab_idct_8x8[12][8]) =
{
diff -r d62f941f3f88 -r 79f4906e9cb8 source/common/vec/dct-ssse3.cpp
--- a/source/common/vec/dct-ssse3.cpp Fri Jul 10 11:52:31 2015 -0500
+++ b/source/common/vec/dct-ssse3.cpp Sat Jul 11 12:31:21 2015 -0500
@@ -34,6 +34,18 @@
#include <pmmintrin.h> // SSE3
#include <tmmintrin.h> // SSSE3
+#define DCT16_SHIFT1 (3 + X265_DEPTH - 8)
+#define DCT16_ADD1 (1 << ((DCT16_SHIFT1) - 1))
+
+#define DCT16_SHIFT2 10
+#define DCT16_ADD2 (1 << ((DCT16_SHIFT2) - 1))
+
+#define DCT32_SHIFT1 (DCT16_SHIFT1 + 1)
+#define DCT32_ADD1 (1 << ((DCT32_SHIFT1) - 1))
+
+#define DCT32_SHIFT2 (DCT16_SHIFT2 + 1)
+#define DCT32_ADD2 (1 << ((DCT32_SHIFT2) - 1))
+
using namespace X265_NS;
ALIGN_VAR_32(static const int16_t, tab_dct_8[][8]) =
@@ -100,20 +112,9 @@ ALIGN_VAR_32(static const int16_t, tab_d
static void dct16(const int16_t *src, int16_t *dst, intptr_t stride)
{
-#if HIGH_BIT_DEPTH
-#define SHIFT1 5
-#define ADD1 16
-#else
-#define SHIFT1 3
-#define ADD1 4
-#endif
-
-#define SHIFT2 10
-#define ADD2 512
-
// Const
- __m128i c_4 = _mm_set1_epi32(ADD1);
- __m128i c_512 = _mm_set1_epi32(ADD2);
+ __m128i c_4 = _mm_set1_epi32(DCT16_ADD1);
+ __m128i c_512 = _mm_set1_epi32(DCT16_ADD2);
int i;
@@ -201,29 +202,29 @@ static void dct16(const int16_t *src, in
T60 = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[1]));
T61 = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[1]));
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[0 * 16 + i], T70);
T60 = _mm_madd_epi16(T50, _mm_load_si128((__m128i*)tab_dct_8[2]));
T61 = _mm_madd_epi16(T51, _mm_load_si128((__m128i*)tab_dct_8[2]));
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[8 * 16 + i], T70);
T60 = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[3]));
T61 = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[3]));
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[4 * 16 + i], T70);
T60 = _mm_madd_epi16(T52, _mm_load_si128((__m128i*)tab_dct_8[4]));
T61 = _mm_madd_epi16(T53, _mm_load_si128((__m128i*)tab_dct_8[4]));
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[12 * 16 + i], T70);
@@ -233,8 +234,8 @@ static void dct16(const int16_t *src, in
T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[5]));
T60 = _mm_hadd_epi32(T60, T61);
T61 = _mm_hadd_epi32(T62, T63);
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[2 * 16 + i], T70);
@@ -244,8 +245,8 @@ static void dct16(const int16_t *src, in
T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[6]));
T60 = _mm_hadd_epi32(T60, T61);
T61 = _mm_hadd_epi32(T62, T63);
- T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), SHIFT1);
- T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), SHIFT1);
+ T60 = _mm_srai_epi32(_mm_add_epi32(T60, c_4), DCT16_SHIFT1);
+ T61 = _mm_srai_epi32(_mm_add_epi32(T61, c_4), DCT16_SHIFT1);
T70 = _mm_packs_epi32(T60, T61);
_mm_store_si128((__m128i*)&tmp[6 * 16 + i], T70);
@@ -255,8 +256,8 @@ static void dct16(const int16_t *src, in
T63 = _mm_madd_epi16(T47, _mm_load_si128((__m128i*)tab_dct_8[7]));
T60 = _mm_hadd_epi32(T60, T61);
More information about the x265-commits
mailing list