[x265] [PATCH] sse: fix overflow in sse_ss for 10 bit

Wed Sep 30 12:56:51 CEST 2015

# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1443607299 -19800
#      Wed Sep 30 15:31:39 2015 +0530
# Node ID aadec6615a3d5f33b4fdb00079e236b019ef1e95
# Parent  6e7761bdfe23addb862483f8407b388800de7d92
sse: fix overflow in sse_ss for 10 bit

diff -r 6e7761bdfe23 -r aadec6615a3d source/common/common.h

--- a/source/common/common.h	Wed Sep 30 14:57:15 2015 +0530
+++ b/source/common/common.h	Wed Sep 30 15:31:39 2015 +0530
@@ -135,7 +135,7 @@
 typedef int32_t  ssum2_t; // Signed sum
 #endif // if HIGH_BIT_DEPTH
 
-#if X265_DEPTH <= 10
+#if X265_DEPTH < 10
 typedef uint32_t sse_ret_t;
 #else
 typedef uint64_t sse_ret_t;
diff -r 6e7761bdfe23 -r aadec6615a3d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Sep 30 14:57:15 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Sep 30 15:31:39 2015 +0530
@@ -1006,10 +1006,11 @@
         p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_4x8_mmx2);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_8x16_sse2);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_16x32_sse2);
-#if X265_DEPTH <= 10
-        p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
-        ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
-#endif
+
+        // sse_ss primitives need to be fixed for 10 and 12 bit
+        //p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
+        //ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
+
         p.cu[BLOCK_4x4].dct = PFX(dct4_sse2);
         p.cu[BLOCK_8x8].dct = PFX(dct8_sse2);
         p.cu[BLOCK_4x4].idct = PFX(idct4_sse2);
@@ -1535,11 +1536,12 @@
         p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx2);
         p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx2);
 
+        // sse_ss primitives need to be fixed for 10 and 12 bit
+        //p.cu[BLOCK_16x16].sse_ss = PFX(pixel_ssd_ss_16x16_avx2);
+        //p.cu[BLOCK_32x32].sse_ss = PFX(pixel_ssd_ss_32x32_avx2);
+        //p.cu[BLOCK_64x64].sse_ss = PFX(pixel_ssd_ss_64x64_avx2);
+
 #if X265_DEPTH <= 10
-        p.cu[BLOCK_16x16].sse_ss = PFX(pixel_ssd_ss_16x16_avx2);
-        p.cu[BLOCK_32x32].sse_ss = PFX(pixel_ssd_ss_32x32_avx2);
-        p.cu[BLOCK_64x64].sse_ss = PFX(pixel_ssd_ss_64x64_avx2);
-
         p.cu[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_avx2);
         p.cu[BLOCK_32x32].sse_pp = PFX(pixel_ssd_32x32_avx2);
         p.cu[BLOCK_64x64].sse_pp = PFX(pixel_ssd_64x64_avx2);
diff -r 6e7761bdfe23 -r aadec6615a3d source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Wed Sep 30 14:57:15 2015 +0530
+++ b/source/common/x86/pixel.h	Wed Sep 30 15:31:39 2015 +0530
@@ -39,7 +39,7 @@
 pixel PFX(planeClipAndMax_avx2)(pixel *src, intptr_t stride, int width, int height, uint64_t *outsum, const pixel minPix, const pixel maxPix);
 
 #define DECL_PIXELS(cpu) \
-    FUNCDEF_PU(uint32_t, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
+    FUNCDEF_PU(sse_ret_t, pixel_ssd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
     FUNCDEF_PU(int, pixel_sa8d, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
     FUNCDEF_PU(void, pixel_sad_x3, cpu, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
     FUNCDEF_PU(void, pixel_sad_x4, cpu, const pixel*, const pixel*, const pixel*, const pixel*, const pixel*, intptr_t, int32_t*); \
@@ -48,7 +48,7 @@
     FUNCDEF_PU(void, pixel_sub_ps, cpu, int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1); \
     FUNCDEF_CHROMA_PU(int, pixel_satd, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
     FUNCDEF_CHROMA_PU(int, pixel_sad, cpu, const pixel*, intptr_t, const pixel*, intptr_t); \
-    FUNCDEF_CHROMA_PU(uint32_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
+    FUNCDEF_CHROMA_PU(sse_ret_t, pixel_ssd_ss, cpu, const int16_t*, intptr_t, const int16_t*, intptr_t); \
     FUNCDEF_CHROMA_PU(void, addAvg, cpu, const int16_t*, const int16_t*, pixel*, intptr_t, intptr_t, intptr_t); \
     FUNCDEF_CHROMA_PU(int, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
     FUNCDEF_TU_S(int, pixel_ssd_s, cpu, const int16_t*, intptr_t); \
diff -r 6e7761bdfe23 -r aadec6615a3d source/encoder/rdcost.h
--- a/source/encoder/rdcost.h	Wed Sep 30 14:57:15 2015 +0530
+++ b/source/encoder/rdcost.h	Wed Sep 30 15:31:39 2015 +0530
@@ -91,7 +91,7 @@
 
     inline uint64_t calcRdCost(sse_ret_t distortion, uint32_t bits) const
     {
-#if X265_DEPTH <= 10
+#if X265_DEPTH < 10
         X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda2,
                    "calcRdCost wrap detected dist: %u, bits %u, lambda: " X265_LL "\n",
                    distortion, bits, m_lambda2);
@@ -130,7 +130,7 @@
 
     inline sse_ret_t scaleChromaDist(uint32_t plane, sse_ret_t dist) const
     {
-#if X265_DEPTH <= 10
+#if X265_DEPTH < 10
         X265_CHECK(dist <= (UINT64_MAX - 128) / m_chromaDistWeight[plane - 1],
                    "scaleChromaDist wrap detected dist: %u, lambda: %u\n",
                    dist, m_chromaDistWeight[plane - 1]);